# Importing Libraries

In [1]:
import pandas as pd
import numpy as np
import re

from textblob import TextBlob
import pickle
import pyarabic.araby as araby
import tashaphyne.arabic_const as arabconst
from tashaphyne.stemming import ArabicLightStemmer

import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem.isri import ISRIStemmer
from nltk.tokenize import TweetTokenizer

from sklearn import preprocessing
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.model_selection import train_test_split
from sklearn import preprocessing
from sklearn.pipeline import Pipeline
from sklearn.svm import SVC
from sklearn.metrics import classification_report, confusion_matrix

from imblearn.over_sampling import SMOTE
from imblearn.pipeline import Pipeline as imbPipeline

# Loading the Dataset

In [2]:
url = 'https://raw.githubusercontent.com/Hala-Mulki/L-HSAB-First-Arabic-Levantine-HateSpeech-Dataset/master/Dataset/L-HSAB'
data = pd.read_csv(url, sep='\t')

# Data Cleaning and Preprocessing

### Text Normalization

In [3]:
def normalizeArabic(text):
    text = text.strip()
    text = re.sub("[إأٱآا]", "ا", str(text))
    #text = re.sub("ى", "ي", text)
    #text = re.sub("ؤ", "ء", text)
    #text = re.sub("ئ", "ء", text)
    #text = re.sub("ة", "ه", text)
    noise = re.compile(""" ّ    | # Tashdid
                             َ    | # Fatha
                             ً    | # Tanwin Fath
                             ُ    | # Damma
                             ٌ    | # Tanwin Damm
                             ِ    | # Kasra
                             ٍ    | # Tanwin Kasr
                             ْ    | # Sukun
                             ـ     # Tatwil/Kashida
                         """, re.VERBOSE)
    text = re.sub(noise, '', text)
    text = re.sub(r'(.)\1+', r"\1\1", text)
    return araby.strip_tashkeel(text)

In [4]:
data['Tweet'] = data['Tweet'].apply(lambda x:normalizeArabic(x))

### Removing Stopwords

In [5]:
stops = set(stopwords.words("arabic"))

stop_word_comp = {"،","آض","آمينَ","آه","آهاً","آي","أ","أب","أجل","أجمع","أخ","أخذ","أصبح","أضحى","أقبل","أقل","أكثر","ألا","أم","أما","أمامك","أمامكَ","أمسى","أمّا","أن","أنا","أنت","أنتم","أنتما","أنتن","أنتِ","أنشأ","أنّى","أو","أوشك","أولئك","أولئكم","أولاء","أولالك","أوّهْ","أي","أيا","أين","أينما","أيّ","أَنَّ","أََيُّ","أُفٍّ","إذ","إذا","إذاً","إذما","إذن","إلى","إليكم","إليكما","إليكنّ","إليكَ","إلَيْكَ","إلّا","إمّا","إن","إنّما","إي","إياك","إياكم","إياكما","إياكن","إيانا","إياه","إياها","إياهم","إياهما","إياهن","إياي","إيهٍ","إِنَّ","ا","ابتدأ","اثر","اجل","احد","اخرى","اخلولق","اذا","اربعة","ارتدّ","استحال","اطار","اعادة","اعلنت","اف","اكثر","اكد","الألاء","الألى","الا","الاخيرة","الان","الاول","الاولى","التى","التي","الثاني","الثانية","الذاتي","الذى","الذي","الذين","السابق","الف","اللائي","اللاتي","اللتان","اللتيا","اللتين","اللذان","اللذين","اللواتي","الماضي","المقبل","الوقت","الى","اليوم","اما","امام","امس","ان","انبرى","انقلب","انه","انها","او","اول","اي","ايار","ايام","ايضا","ب","بات","باسم","بان","بخٍ","برس","بسبب","بسّ","بشكل","بضع","بطآن","بعد","بعض","بك","بكم","بكما","بكن","بل","بلى","بما","بماذا","بمن","بن","بنا","به","بها","بي","بيد","بين","بَسْ","بَلْهَ","بِئْسَ","تانِ","تانِك","تبدّل","تجاه","تحوّل","تلقاء","تلك","تلكم","تلكما","تم","تينك","تَيْنِ","تِه","تِي","ثلاثة","ثم","ثمّ","ثمّة","ثُمَّ","جعل","جلل","جميع","جير","حار","حاشا","حاليا","حاي","حتى","حرى","حسب","حم","حوالى","حول","حيث","حيثما","حين","حيَّ","حَبَّذَا","حَتَّى","حَذارِ","خلا","خلال","دون","دونك","ذا","ذات","ذاك","ذانك","ذانِ","ذلك","ذلكم","ذلكما","ذلكن","ذو","ذوا","ذواتا","ذواتي","ذيت","ذينك","ذَيْنِ","ذِه","ذِي","راح","رجع","رويدك","ريث","رُبَّ","زيارة","سبحان","سرعان","سنة","سنوات","سوف","سوى","سَاءَ","سَاءَمَا","شبه","شخصا","شرع","شَتَّانَ","صار","صباح","صفر","صهٍ","صهْ","ضد","ضمن","طاق","طالما","طفق","طَق","ظلّ","عاد","عام","عاما","عامة","عدا","عدة","عدد","عدم","عسى","عشر","عشرة","علق","على","عليك","عليه","عليها","علًّ","عن","عند","عندما","عوض","عين","عَدَسْ","عَمَّا","غدا","غير","ـ","ف","فان","فلان","فو","فى","في","فيم","فيما","فيه","فيها","قال","قام","قبل","قد","قطّ","قلما","قوة","كأنّما","كأين","كأيّ","كأيّن","كاد","كان","كانت","كذا","كذلك","كرب","كل","كلا","كلاهما","كلتا","كلم","كليكما","كليهما","كلّما","كلَّا","كم","كما","كي","كيت","كيف","كيفما","كَأَنَّ","كِخ","لئن","لا","لات","لاسيما","لدن","لدى","لعمر","لقاء","لك","لكم","لكما","لكن","لكنَّما","لكي","لكيلا","للامم","لم","لما","لمّا","لن","لنا","له","لها","لو","لوكالة","لولا","لوما","لي","لَسْتَ","لَسْتُ","لَسْتُم","لَسْتُمَا","لَسْتُنَّ","لَسْتِ","لَسْنَ","لَعَلَّ","لَكِنَّ","لَيْتَ","لَيْسَ","لَيْسَا","لَيْسَتَا","لَيْسَتْ","لَيْسُوا","لَِسْنَا","ما","ماانفك","مابرح","مادام","ماذا","مازال","مافتئ","مايو","متى","مثل","مذ","مساء","مع","معاذ","مقابل","مكانكم","مكانكما","مكانكنّ","مكانَك","مليار","مليون","مما","ممن","من","منذ","منها","مه","مهما","مَنْ","مِن","نحن","نحو","نعم","نفس","نفسه","نهاية","نَخْ","نِعِمّا","نِعْمَ","ها","هاؤم","هاكَ","هاهنا","هبّ","هذا","هذه","هكذا","هل","هلمَّ","هلّا","هم","هما","هن","هنا","هناك","هنالك","هو","هي","هيا","هيت","هيّا","هَؤلاء","هَاتانِ","هَاتَيْنِ","هَاتِه","هَاتِي","هَجْ","هَذا","هَذانِ","هَذَيْنِ","هَذِه","هَذِي","هَيْهَاتَ","و","و6","وا","واحد","واضاف","واضافت","واكد","وان","واهاً","واوضح","وراءَك","وفي","وقال","وقالت","وقد","وقف","وكان","وكانت","ولا","ولم","ومن","مَن","وهو","وهي","ويكأنّ","وَيْ","وُشْكَانََ","يكون","يمكن","يوم","ّأيّان"}

In [6]:
def tokenize_text(text):
    tokens = word_tokenize(text) 
    tokens = [token.strip() for token in tokens]
    return tokens

def remove_stopwords(text):
    tokens = tokenize_text(text)
    filtered_tokens = [token for token in tokens if token not in stops and token not in stop_word_comp]
    filtered_text = ' '.join(filtered_tokens)    
    return filtered_text

In [7]:
data['Tweet'] = data['Tweet'].apply(lambda x:remove_stopwords(x))

### Removing Emojis

In [8]:
def remove_emoji(text):
    emoji_pattern = re.compile("["
                                   u"\U0001F600-\U0001F64F"  # emoticons
                                   u"\U0001F300-\U0001F5FF"  # symbols & pictographs
                                   u"\U0001F680-\U0001F6FF"  # transport & map symbols
                                   u"\U0001F1E0-\U0001F1FF"  # flags (iOS)
                                   u"\U00002702-\U000027B0"
                                   u"\U000024C2-\U0001F251"
                                   "]+", flags=re.UNICODE)
    text = emoji_pattern.sub(r'', text)
    return text

In [9]:
data['Tweet'] = data['Tweet'].apply(lambda x:remove_emoji(x))

### Text Cleaning

In [10]:
def clean_text(text):
    text = re.sub('[%s]' % re.escape("""!"#$%&'()*+,،-./:;<=>؟?@[\]^_`{|}~"""), ' ', text)  # remove punctuation
    text = re.sub('\s+', ' ', text)  
    text = re.sub("\d+", " ", text)
    text = re.sub('\W+', ' ', text)
    text = re.sub('[A-Za-z]+',' ',text)
    text = re.sub(r'\\u[A-Za-z0-9\\]+',' ',text)
    text = re.sub('\s+', ' ', text)  
    return text

In [11]:
data['Tweet'] = data['Tweet'].apply(lambda x:clean_text(x))

# Splitting into Training and Testing Datasets

In [12]:
le = preprocessing.LabelEncoder()
le.fit(data.Class)
y_encode = le.transform(data.Class)

In [13]:
data_df = pd.DataFrame({'Target Label': y_encode, 'Target Name': data.Class, 'Tweet': data.Tweet})

In [14]:
train_corpus, test_corpus, train_label_nums, test_label_nums, train_label_names, test_label_names = train_test_split(np.array(data_df['Tweet']),
                                                                                                                         np.array(data_df['Target Label']),
                                                                                                                         np.array(data_df['Target Name']),
                                                                                                                         stratify=data_df['Target Label'],
                                                                                                                         test_size=0.2, random_state=42)

In [15]:
print('The Train Corpus has: ', len(train_corpus), ' rows')
print('The Test Corpus has: ', len(test_corpus), ' rows' )

The Train Corpus has:  4676  rows
The Test Corpus has:  1170  rows


# SVC with SMOTE 

In [20]:
pipeline_svc_smote = imbPipeline([('vect', CountVectorizer(min_df=0.0, max_df=0.5, ngram_range=(1,2))), 
                         ('tfidf', TfidfTransformer(use_idf=True, norm='l2')), 
                         ('smote', SMOTE(random_state=12)),
                         ('svc', SVC(C=15, kernel='linear'))])

pipeline_svc_smote.fit(train_corpus, train_label_names)

Pipeline(steps=[('vect',
                 CountVectorizer(max_df=0.5, min_df=0.0, ngram_range=(1, 2))),
                ('tfidf', TfidfTransformer()),
                ('smote', SMOTE(random_state=12)),
                ('svc', SVC(C=15, kernel='linear'))])

In [21]:
print("SMOTE SVC Train Accuracy:{}".format(pipeline_svc_smote.score(train_corpus, train_label_names)))
print("SMOTE SVC Test Accuracy:{}".format(pipeline_svc_smote.score(test_corpus, test_label_names)))

SMOTE SVC Train Accuracy:0.9993584260051326
SMOTE SVC Test Accuracy:0.7837606837606838


In [22]:
y_pred_smote = pipeline_svc_smote.predict(test_corpus)

print('Confusion Matrix')
print(confusion_matrix(test_label_names, y_pred_smote))

Confusion Matrix
[[220  13 113]
 [ 35  29  30]
 [ 58   4 668]]


In [23]:
print(classification_report(test_label_names, y_pred))

              precision    recall  f1-score   support

     abusive       0.70      0.64      0.67       346
        hate       0.63      0.31      0.41        94
      normal       0.82      0.92      0.87       730

    accuracy                           0.78      1170
   macro avg       0.72      0.62      0.65      1170
weighted avg       0.77      0.78      0.77      1170

