# DATA

In [None]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)


# Load the labelled tweets; the first CSV row holds the column names.
df = pd.read_csv("/content/total.csv", header=0)

# Drop incomplete rows *before* casting: astype(str) would otherwise turn a
# missing tweet into the literal text "nan" (invisible to later isnull()
# checks), and astype(int) raises on a missing label.
df = df.dropna(subset=['Tweet', 'Class']).reset_index(drop=True)

df['Tweet'] = df['Tweet'].astype(str)
df['Class'] = df['Class'].astype(int)


In [None]:
# Number of tweets carrying each label value.
class_column = df["Class"]
class_column.value_counts()

0    22789
1    10951
Name: Class, dtype: int64

In [None]:
# Missing-value check. 'Tweet' has already been cast with astype(str), which
# turns NaN into the text "nan", so isnull() alone cannot see missing tweets;
# blank tweets and "nan" placeholders are therefore counted as well.
df.isnull().sum().sum() + df['Tweet'].str.strip().isin(['', 'nan']).sum()

0

In [None]:
# Preview the first rows only instead of rendering the whole frame.
df.head()

# PREPROCESSING

In [None]:
%pip install pyarabic

import re
import pyarabic.araby as araby

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting pyarabic
  Downloading PyArabic-0.6.15-py3-none-any.whl (126 kB)
[K     |████████████████████████████████| 126 kB 13.9 MB/s 
Installing collected packages: pyarabic
Successfully installed pyarabic-0.6.15


In [None]:
pip  install -U farasapy

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting farasapy
  Downloading farasapy-0.0.14-py3-none-any.whl (11 kB)
Installing collected packages: farasapy
Successfully installed farasapy-0.0.14


In [None]:

import re
from farasa.stemmer import FarasaStemmer


stopwords = {"،","آض","آمينَ","آه","آهاً","آي","أ","أب","أجل","أجمع","أخ","أخذ","أصبح","أضحى","أقبل","أقل","أكثر","ألا","أم","أما","أمامك","أمامكَ","أمسى","أمّا","أن","أنا","أنت","أنتم","أنتما","أنتن","أنتِ","أنشأ","أنّى","أو","أوشك","أولئك","أولئكم","أولاء","أولالك","أوّهْ","أي","أيا","أين","أينما","أيّ","أَنَّ","أََيُّ","أُفٍّ","إذ","إذا","إذاً","إذما","إذن","إلى","إليكم","إليكما","إليكنّ","إليكَ","إلَيْكَ","إلّا","إمّا","إن","إنّما","إي","إياك","إياكم","إياكما","إياكن","إيانا","إياه","إياها","إياهم","إياهما","إياهن","إياي","إيهٍ","إِنَّ","ا","ابتدأ","اثر","اجل","احد","اخرى","اخلولق","اذا","اربعة","ارتدّ","استحال","اطار","اعادة","اعلنت","اف","اكثر","اكد","الألاء","الألى","الا","الاخيرة","الان","الاول","الاولى","التى","التي","الثاني","الثانية","الذاتي","الذى","الذي","الذين","السابق","الف","اللائي","اللاتي","اللتان","اللتيا","اللتين","اللذان","اللذين","اللواتي","الماضي","المقبل","الوقت","الى","اليوم","اما","امام","امس","ان","انبرى","انقلب","انه","انها","او","اول","اي","ايار","ايام","ايضا","ب","بات","باسم","بان","بخٍ","برس","بسبب","بسّ","بشكل","بضع","بطآن","بعد","بعض","بك","بكم","بكما","بكن","بل","بلى","بما","بماذا","بمن","بن","بنا","به","بها","بي","بيد","بين","بَسْ","بَلْهَ","بِئْسَ","تانِ","تانِك","تبدّل","تجاه","تحوّل","تلقاء","تلك","تلكم","تلكما","تم","تينك","تَيْنِ","تِه","تِي","ثلاثة","ثم","ثمّ","ثمّة","ثُمَّ","جعل","جلل","جميع","جير","حار","حاشا","حاليا","حاي","حتى","حرى","حسب","حم","حوالى","حول","حيث","حيثما","حين","حيَّ","حَبَّذَا","حَتَّى","حَذارِ","خلا","خلال","دون","دونك","ذا","ذات","ذاك","ذانك","ذانِ","ذلك","ذلكم","ذلكما","ذلكن","ذو","ذوا","ذواتا","ذواتي","ذيت","ذينك","ذَيْنِ","ذِه","ذِي","راح","رجع","رويدك","ريث","رُبَّ","زيارة","سبحان","سرعان","سنة","سنوات","سوف","سوى","سَاءَ","سَاءَمَا","شبه","شخصا","شرع","شَتَّانَ","صار","صباح","صفر","صهٍ","صهْ","ضد","ضمن","طاق","طالما","طفق","طَق","ظلّ","عاد","عام","عاما","عامة","عدا","عدة","عدد","عدم","عسى","عشر","عشرة","علق","على","عليك","عليه","عليها","علًّ","عن","عند","عندما","عوض","عين","عَدَسْ","عَمَّا","غدا",
"غير","ـ","ف","فان","فلان","فو","فى","في","فيم","فيما","فيه","فيها","قال","قام","قبل","قد","قطّ","قلما","قوة","كأنّما","كأين","كأيّ","كأيّن","كاد","كان","كانت","كذا","كذلك","كرب","كل","كلا","كلاهما","كلتا","كلم","كليكما","كليهما","كلّما","كلَّا","كم","كما","كي","كيت","كيف","كيفما","كَأَنَّ","كِخ","لئن","لا","لات","لاسيما","لدن","لدى","لعمر","لقاء","لك","لكم","لكما","لكن","لكنَّما","لكي","لكيلا","للامم","لم","لما","لمّا","لن","لنا","له","لها","لو","لوكالة","لولا","لوما","لي","لَسْتَ","لَسْتُ","لَسْتُم","لَسْتُمَا","لَسْتُنَّ","لَسْتِ","لَسْنَ","لَعَلَّ","لَكِنَّ","لَيْتَ","لَيْسَ","لَيْسَا","لَيْسَتَا","لَيْسَتْ","لَيْسُوا","لَِسْنَا","ما","ماانفك","مابرح","مادام","ماذا","مازال","مافتئ","مايو","متى","مثل","مذ","مساء","مع","معاذ","مقابل","مكانكم","مكانكما","مكانكنّ","مكانَك","مليار","مليون","مما","ممن","من","منذ","منها","مه","مهما","مَنْ","مِن","نحن","نحو","نعم","نفس","نفسه","نهاية","نَخْ","نِعِمّا","نِعْمَ","ها","هاؤم","هاكَ","هاهنا","هبّ","هذا","هذه","هكذا","هل","هلمَّ","هلّا","هم","هما","هن","هنا","هناك","هنالك","هو","هي","هيا","هيت","هيّا","هَؤلاء","هَاتانِ","هَاتَيْنِ","هَاتِه","هَاتِي","هَجْ","هَذا","هَذانِ","هَذَيْنِ","هَذِه","هَذِي","هَيْهَاتَ","و","و6","وا","واحد","واضاف","واضافت","واكد","وان","واهاً","واوضح","وراءَك","وفي","وقال","وقالت","وقد","وقف","وكان","وكانت","ولا","ولم","ومن","مَن","وهو","وهي","ويكأنّ","وَيْ","وُشْكَانََ","يكون","يمكن","يوم","ّأيّان"}

def remove_tashkeel(text):
    """Return `text` after passing it through pyarabic's ``araby.strip_tashkeel``."""
    stripped = araby.strip_tashkeel(text)
    return stripped

def remove_tatweel(text):
    """Return `text` after passing it through pyarabic's ``araby.strip_tatweel``."""
    stripped = araby.strip_tatweel(text)
    return stripped

def normalization(text):
    """Collapse common Arabic spelling variants onto one letter each.

    Three substitutions are applied in order: every character of the alef
    class in the first pattern becomes a bare alef, then each "ي" becomes
    "ى", then each "ة" becomes "ه". The modified string is returned.
    """
    substitutions = (
        ("[إأٱآا]", "ا"),
        ("ي", "ى"),
        ("ة", "ه"),
    )
    for pattern, replacement in substitutions:
        text = re.sub(pattern, replacement, text)
    return text

def normalize_hamza(text):
    """Return `text` after passing it through pyarabic's ``araby.normalize_hamza``
    (called with its default arguments)."""
    normalized = araby.normalize_hamza(text)
    return normalized

def remove_usernames(text):
    """Delete every @mention from `text`.

    A mention is an "@" together with the (possibly empty) run of word
    characters that directly follows it. The surrounding text, including
    whitespace, is left untouched.
    """
    # Raw string: "\w" inside a plain literal is an invalid escape sequence
    # that newer Python versions warn about.
    return re.sub(r"@\w*", '', text)

def remove_hashtags(text):
    """Delete every #hashtag from `text`.

    A hashtag is a "#" together with the (possibly empty) run of word
    characters that directly follows it, so the tag's words are removed too.
    The surrounding text, including whitespace, is left untouched.
    """
    # Raw string: "\w" inside a plain literal is an invalid escape sequence
    # that newer Python versions warn about.
    return re.sub(r"#\w*", '', text)

def clean_text(text):
    """Strip URLs, retweet markers, punctuation and digits from `text`.

    Steps, in order:
      1. remove URLs (anything starting with "http" up to the next whitespace);
      2. remove tokens of the form "#<digits>K<digits>" (e.g. "#2K19");
      3. remove the stand-alone markers "RT" and "cc";
      4. replace the listed punctuation characters with spaces;
      5. remove ASCII digits and Arabic-Indic digits (U+0660-U+0669);
      6. collapse whitespace runs to one space and trim both ends.

    Returns the cleaned string.
    """
    # URLs and "#..K.." tokens must go first: once punctuation such as
    # ":", "/", "." and "#" has been blanked out these patterns can no longer
    # match and their fragments would stay in the text.
    text = re.sub(r'http\S+\s*', ' ', text)  # remove URLs
    text = re.sub(r'#\d+K\d+', ' ', text)  # tokens like #2K19
    # Only drop RT / cc when they are not part of a longer Latin word
    # (a bare 'RT|cc' would turn "success" into "su ess").
    text = re.sub(r'(?<![A-Za-z])(?:RT|cc)(?![A-Za-z])', ' ', text)
    text = re.sub(r'[.,\"،\/#!$%\^&\*;:{}=\-_`~()?؟﴾﴿]', ' ', text)  # remove punctuation (includes "_")
    text = re.sub(r'[0-9]+', ' ', text)  # remove numbers
    text = re.sub(r'[\u0660-\u0669]+', ' ', text)  # remove arabic numbers
    text = re.sub(r'\s+', ' ', text).strip()  # remove additional whitespaces
    return text

def remove_non_arabic_letters(text):
    """Delete runs of ASCII letters from `text`.

    A run is removed only when it ends at a word boundary and does not end
    with one of the placeholder words "urllink" or "mention". Despite the
    function's name, nothing other than the letters a-z / A-Z is removed.
    """
    latin_run = re.compile(r'[a-zA-Z]+\b(?<!urllink|mention)')
    return latin_run.sub("", text)

def remove_stopwords(text, stopwords):
    """Drop every whitespace-separated token of `text` found in `stopwords`.

    Parameters
    ----------
    text : str
        The text to filter.
    stopwords : container of str
        Tokens to remove; membership is tested with ``in``.

    Returns
    -------
    str
        The surviving tokens joined by single spaces.
    """
    # split() with no argument breaks on any whitespace run. Splitting on a
    # literal ' ' left tab- or newline-separated tokens glued together, so a
    # stop word next to a line break was never recognised.
    return ' '.join(word for word in text.split() if word not in stopwords)

def remove_imoji(text):
    """Strip emoji and other pictographic symbols from `text`.

    Every run of characters that falls inside one of the code-point ranges
    listed below is deleted; all other characters are returned unchanged.
    """
    symbol_ranges = (
        u"\U0001F600-\U0001F64F"  # emoticons
        u"\U0001F300-\U0001F5FF"  # symbols & pictographs
        u"\U0001F680-\U0001F6FF"  # transport & map symbols
        u"\U0001F1E0-\U0001F1FF"  # flags (iOS)
        u"\U00002702-\U000027B0"
        u"\U000024C2-\U0001F251"
    )
    matcher = re.compile("[" + symbol_ranges + "]+", flags=re.UNICODE)
    return matcher.sub('', text)

#stemmer = FarasaStemmer()



def clean_tweets(df):
    """Run the full cleaning pipeline over an iterable of raw tweet strings.

    Parameters
    ----------
    df : iterable of str
        The raw tweets. Despite the parameter name, the notebook passes the
        ``df["Tweet"]`` column rather than a whole DataFrame.

    Returns
    -------
    list of str
        One cleaned string per input tweet, in input order.
    """
    # The module-level stop-word list is written in ordinary spelling
    # (diacritics, "ي", "ة", hamza forms) whereas tweets are already normalised
    # by the time they are filtered. Push each stop word through the same
    # normalisation steps so both sides are compared in the same form; the
    # original spellings are kept as well.
    normalized_stopwords = set(stopwords)
    for word in stopwords:
        word = remove_tashkeel(word)
        word = remove_tatweel(word)
        word = normalization(word)
        word = normalize_hamza(word)
        normalized_stopwords.add(word)
    # A stop word made only of strippable characters normalises to "".
    normalized_stopwords.discard('')

    cleaned = []
    for line in df:
        tweet = remove_tashkeel(line)
        tweet = remove_tatweel(tweet)
        tweet = normalization(tweet)
        tweet = normalize_hamza(tweet)
        tweet = remove_usernames(tweet)
        tweet = remove_hashtags(tweet)
        tweet = remove_non_arabic_letters(tweet)
        tweet = remove_imoji(tweet)
        # Punctuation/whitespace cleanup has to run before stop-word
        # filtering: a stop word still attached to punctuation or an emoji
        # is a different token and would never match the list.
        tweet = clean_text(tweet)
        tweet = remove_stopwords(tweet, normalized_stopwords)
        #tweet = stemmer.stem(tweet)

        cleaned.append(tweet)
    return cleaned


In [None]:
# clean data

clean = clean_tweets(df["Tweet"])

# Assign the cleaned list straight back to the column (positional, one value
# per row). Wrapping it in a DataFrame first makes pandas align on the index,
# which yields NaN - and then the text "nan" after astype(str) - for every
# row whose index label is not in 0..n-1.
df["Tweet"] = clean
df['Tweet'] = df['Tweet'].astype(str)

In [None]:
# Rename both columns in one call, modifying the existing frame in place.
df.rename(columns={'Tweet': 'texts', 'Class': 'data_labels'}, inplace=True)

# DATA PARTITION

In [None]:
from sklearn.model_selection import train_test_split

# Inputs are the `texts` column, targets the `data_labels` column.
X = df["texts"].values
Y = df["data_labels"].values

# Hold out 25% of the rows for validation; the fixed seed keeps the split
# identical from run to run.
X_train, X_val, Y_train, Y_val = train_test_split(
    X,
    Y,
    test_size=0.25,
    random_state=200,
)

# BEST ML ALGORITHM

In [None]:
import sklearn 
from sklearn import metrics
from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import f1_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import SGDClassifier
from sklearn.naive_bayes import MultinomialNB, BernoulliNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.pipeline import Pipeline
from sklearn.svm import LinearSVC
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
import random
import joblib
import pickle

In [None]:
# Lower-case aliases for the train/validation arrays.
x_train, y_train = X_train, Y_train
x_test, y_test = X_val, Y_val


def detect_hate(my_classifier, name, x_train, y_train, x_test, y_test):
    """Fit a TF-IDF + classifier pipeline and score it on held-out data.

    Parameters
    ----------
    my_classifier : estimator
        Unfitted classifier used as the final pipeline step.
    name : str
        Label returned unchanged as the first element of the result.
    x_train, y_train
        Training texts and their labels.
    x_test, y_test
        Evaluation texts and their labels.

    Returns
    -------
    tuple
        ``(name, accuracy, precision, recall, f1)``. Precision, recall and F1
        are computed with ``average='weighted'``.

    Side effects
    ------------
    Prints the classifier name, the classification report, the confusion
    matrix, the vocabulary size and a random sample of vocabulary entries.
    """
    print('parameters')
    print('classifier:', my_classifier.__class__.__name__)
    print('------------------------------------')

    pipeline = Pipeline([
        ('vect', TfidfVectorizer(min_df=0.0001, max_df=0.95,
                                 analyzer='word', lowercase=False,)),
        ('clf', my_classifier),
    ])

    pipeline.fit(x_train, y_train)

    # get_feature_names() was removed in scikit-learn 1.2; prefer its
    # replacement and fall back only on releases that predate it.
    vectorizer = pipeline.named_steps['vect']
    if hasattr(vectorizer, 'get_feature_names_out'):
        feature_names = list(vectorizer.get_feature_names_out())
    else:
        feature_names = list(vectorizer.get_feature_names())

    y_predicted = pipeline.predict(x_test)

    # Print the classification report
    print(metrics.classification_report(y_test, y_predicted))

    # Print the confusion matrix
    cm = metrics.confusion_matrix(y_test, y_predicted)
    print(cm)
    print('# of features:', len(feature_names))
    # random.sample raises ValueError when asked for more items than exist,
    # so cap the sample size for small vocabularies.
    sample_size = min(40, len(feature_names))
    print('sample of features:', random.sample(feature_names, sample_size))

    accuracy = accuracy_score(y_test, y_predicted)
    precision = precision_score(y_test, y_predicted, average='weighted')
    recall = recall_score(y_test, y_predicted, average='weighted')
    f1 = f1_score(y_test, y_predicted, average='weighted')

    return name, accuracy, precision, recall, f1


In [None]:

# Evaluate each candidate classifier and keep one result tuple per model.
classifiers = [LinearSVC()]
results = []

for alg in classifiers:
    alg_name = type(alg).__name__
    results.append(
        detect_hate(alg, alg_name, x_train, y_train, x_test, y_test)
    )



parameters
classifier: LinearSVC
------------------------------------




              precision    recall  f1-score   support

           0       0.90      0.94      0.92      5662
           1       0.87      0.78      0.82      2773

    accuracy                           0.89      8435
   macro avg       0.88      0.86      0.87      8435
weighted avg       0.89      0.89      0.89      8435

[[5337  325]
 [ 609 2164]]
# of features: 19124
sample of features: ['الاستعانه', 'ومعاه', 'فمك', 'فتلقىان', 'بكم', 'عقىد', 'افتخرت', 'خود', 'توزع', 'الزلمه', 'بىوتهم', 'وشتم', 'حقد', 'علىها', 'المدخلى', 'ىحكمها', 'المستفىد', 'بدماء', 'احذروه', 'واشف', 'ىاولاد', 'وتهرىبهم', 'الكافىه', 'طلبا', 'تتجاوز', 'رءاسه', 'بحكام', 'الصىدلىه', 'تفتخر', 'وجوب', 'تخلفهم', 'الدوام', 'هجوم', 'احمر', 'وبىت', 'وقد', 'احسب', 'مشهور', 'دمو', 'تخرب']


In [None]:
# Summary table: a header line, a rule, then one row per evaluated classifier.
header = '{0:25}{1:>10}{2:>10}{3:>10}{4:>10}'.format('algorithm', 'accuracy', 'precision', 'recall', 'F1-score')
print(header)
print('---------------------------------------------------------------------------')
row_template = '{0:25}{1:10.2f}{2:10.2f}{3:10.2f}{4:10.2f}'
for r in results:
    # Each result is (name, accuracy, precision, recall, f1).
    print(row_template.format(*r[:5]))

algorithm                  accuracy precision    recall  F1-score
---------------------------------------------------------------------------
LinearSVC                      0.89      0.89      0.89      0.89


# SAVE MODEL

In [None]:
import pickle

# `pipeline` is only ever created as a local variable inside detect_hate(),
# so on a fresh kernel there is no module-level `pipeline` to pickle here.
# Rebuild the same TF-IDF + LinearSVC pipeline, fit it on the training split,
# and save that fitted object.
pipeline = Pipeline([
    ('vect', TfidfVectorizer(min_df=0.0001, max_df=0.95,
                             analyzer='word', lowercase=False)),
    ('clf', LinearSVC()),
])
pipeline.fit(x_train, y_train)

with open("model", "wb") as f:
    pickle.dump(pipeline, f)
