# Check GPU Availability

In [1]:
import torch

# If there's a GPU available...
if torch.cuda.is_available():

    # Tell PyTorch to use the GPU.
    device = torch.device("cuda")

    print('There are %d GPU(s) available.' % torch.cuda.device_count())

    print('We will use the GPU:', torch.cuda.get_device_name(0))
    !nvidia-smi

# If not...
else:
    print('No GPU available, using the CPU instead.')
    device = torch.device("cpu")

No GPU available, using the CPU instead.


# Import Required Modules

In [2]:
import pandas as pd
from preprocess import ArabicTextPreprocessor
from tokenization_transformation import ArabicTextTokenizer
from prediction import DataFramePredictor

# Load Dataset For Testing

In [3]:
test = pd.read_csv('testing_data.csv')

In [4]:
test.head()

Unnamed: 0,text,domain,category,FinalOff,FinalCB
0,حياك اله اخي عبد العزيز وعظم اله اجر الامه في...,News,Racial,no,no
1,رحم اله البطل هواري بومدين والذي قال نحن مع ف...,News,Racial,no,no
2,يلي قتل ضباط و عناصر الجيش ارهابي ولازم ينعدم...,News,Racial,no,no
3,عبد الرحمن رافع العمري جندي سعودي استشهد عام ...,News,Racial,no,no
4,تزايد حالات الاصابه بفيروس يدفع الحكومه البري...,News,Racial,no,no


# Data Preprocessing

In [5]:
preprocessor = ArabicTextPreprocessor(data_column='text')
test_cleaned = preprocessor.fit_transform(test)

In [6]:
test_cleaned.head()

Unnamed: 0,text,domain,category,FinalOff,FinalCB
0,حياك اله اخي عبد العزيز وعظم اله اجر الامه في ...,News,Racial,no,no
1,رحم اله البطل هواري بومدين والذي قال نحن مع فل...,News,Racial,no,no
2,يلي قتل ضباط و عناصر الجيش ارهابي ولازم ينعدم ...,News,Racial,no,no
3,عبد الرحمن رافع العمري جندي سعودي استشهد عام و...,News,Racial,no,no
4,تزايد حالات الاصابه بفيروس يدفع الحكومه البريط...,News,Racial,no,no


# Input Tokenization and Transformation

In [7]:
model_name = 'hugsanaa/CyberAraBERT'
tokenizer = ArabicTextTokenizer(model_name=model_name,
                                data_column='text',
                                max_len=256)

test_cleaned = tokenizer.fit_transform(test_cleaned)

print(test_cleaned['tokenized_column'].iloc[0])

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


{'input_ids': tensor([[    2, 11323,   209,   581, 46691,   502,  2689,   560,   939,   581,
         24283,   633,   195,   305,  5972,   195,  1028,  1028,   325,  3698,
         20019,   595, 28214,   581,   581, 45650,   595, 54019, 20804, 22618,
             3,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,  

# Text Classification

In [8]:
labels = ["Cyberbullying", "Not Cyberbullying"]

predictor = DataFramePredictor(
    model_name="hugsanaa/CyberAraBERT",
    tokenized_column="tokenized_column",
    labels=labels
)

df = predictor.predict(test_cleaned)
print(df[["predicted_label", "confidence"]])



       predicted_label  confidence
0    Not Cyberbullying    0.977596
1    Not Cyberbullying    0.973961
2        Cyberbullying    0.511319
3    Not Cyberbullying    0.987448
4    Not Cyberbullying    0.995874
..                 ...         ...
896  Not Cyberbullying    0.989939
897      Cyberbullying    0.778517
898      Cyberbullying    0.924472
899  Not Cyberbullying    0.969396
900      Cyberbullying    0.905304

[900 rows x 2 columns]
