## **Experiment 1: Up-sampling & No Preprocessing**

### **Get Data**

In [None]:
pip install qalsadi

Collecting qalsadi
  Downloading qalsadi-0.4.4-py3-none-any.whl (257 kB)
[K     |████████████████████████████████| 257 kB 7.6 MB/s 
[?25hCollecting pyarabic>=0.6.7
  Downloading PyArabic-0.6.14-py3-none-any.whl (126 kB)
[K     |████████████████████████████████| 126 kB 51.2 MB/s 
[?25hCollecting Arabic-Stopwords>=0.3
  Downloading Arabic_Stopwords-0.3-py3-none-any.whl (353 kB)
[K     |████████████████████████████████| 353 kB 56.2 MB/s 
[?25hCollecting naftawayh>=0.3
  Downloading Naftawayh-0.4-py3-none-any.whl (332 kB)
[K     |████████████████████████████████| 332 kB 71.7 MB/s 
[?25hCollecting arramooz-pysqlite>=0.3
  Downloading arramooz_pysqlite-0.3-py3-none-any.whl (9.2 MB)
[K     |████████████████████████████████| 9.2 MB 21.2 MB/s 
Collecting pickledb>=0.9.2
  Downloading pickleDB-0.9.2.tar.gz (3.7 kB)
Collecting tashaphyne>=0.3.4.1
  Downloading Tashaphyne-0.3.4.1-py3-none-any.whl (244 kB)
[K     |████████████████████████████████| 244 kB 53.3 MB/s 
[?25hCollecting alyahm

In [None]:
import pandas as pd
# Load the spam and ham tweet workbooks (first sheet of each file).
df_Spam = pd.read_excel("SpamTweets_S.xlsx", sheet_name=0)
df_Ham = pd.read_excel("HamTweets_S.xlsx", sheet_name=0)

In [None]:
# Row counts of the two sheets as loaded, before any filtering.
print('Number of Spam Tweets :', df_Spam.shape[0])
print('Number of Ham Tweets :', df_Ham.shape[0])

Number of Spam Tweets : 10339
Number of Ham Tweets : 11299


In [None]:
# Display the column labels of the spam sheet.
df_Spam.columns

Index(['OSN', 'SN', 'Tweet ID', 'Label', 'Date', 'Time', 'Date Time', 'URL',
       'Tweet Text', 'Cleaned Text', 'User Name', 'Location',
       'Replied Tweet ID', 'Replied Tweet User ID ', 'Replied Tweet User Name',
       'Coordinates', 'Retweet Count', 'Favorite Count', 'Favorited'],
      dtype='object')

In [None]:
# Display the column labels of the ham sheet.
df_Ham.columns

Index(['SN', 'OSN', 'Tweet ID', 'Label', 'Date', 'Time', 'Date Time', 'URL',
       'Tweet Text', 'Cleaned Text', 'User Name', 'Location',
       'Replied Tweet ID ', 'Replied Tweet User ID', 'Replied Tweet User name',
       'Coordinates', 'Retweet Count', 'Favorite Count', 'Favorited'],
      dtype='object')

In [None]:
# Keep only the text and label columns of the spam sheet.
df_Spam = df_Spam.loc[:, ['Cleaned Text', 'Label']]

In [None]:
# Keep only the text and label columns of the ham sheet.
df_Ham = df_Ham.loc[:, ['Cleaned Text', 'Label']]

In [None]:
# Within each class, keep only the first row for every distinct cleaned text.
df_Spam = df_Spam.drop_duplicates(subset=["Cleaned Text"], keep="first")
df_Ham = df_Ham.drop_duplicates(subset=["Cleaned Text"], keep="first")

In [None]:
# Row counts per class after de-duplication.
print('Number of Spam Tweets :', df_Spam.shape[0])
print('Number of Ham Tweets :', df_Ham.shape[0])

Number of Spam Tweets : 2230
Number of Ham Tweets : 11034


In [None]:
# Stack spam rows on top of ham rows; each frame keeps its own row labels.
df = pd.concat([df_Spam, df_Ham], axis=0)

In [None]:
# Size of the combined spam + ham frame.
print('Number of ALL Tweets :', df.shape[0])

Number of ALL Tweets : 13264


### **Data Preprocessing**

In [None]:
import qalsadi.lemmatizer
# One shared lemmatizer instance, reused for every tweet.
lemmer = qalsadi.lemmatizer.Lemmatizer()


def preprocess_txt(txt):
    """Lemmatize `txt` with qalsadi and return the lemmas joined by single spaces."""
    lemmas = lemmer.lemmatize_text(txt)
    return " ".join(lemmas)


# Store the lemmatized text next to the original in a new column.
df['Cleaned Text2'] = df['Cleaned Text'].apply(lambda tweet: preprocess_txt(tweet))

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Documents to vectorize, selected by column name so the result does not
# depend on the position of any other column in df.
# NOTE(review): this reads 'Cleaned Text', not the lemmatized 'Cleaned Text2'
# column — confirm that is what this experiment intends.
text = df["Cleaned Text"].tolist()

# Word-level TF-IDF over unigrams, capped at the 5000 most frequent terms.
# token_pattern r'\w{1,}' also keeps single-character tokens, which
# scikit-learn's default pattern would drop.
vectorizer = TfidfVectorizer(analyzer='word', token_pattern=r'\w{1,}', ngram_range=(1,1), max_features=5000)
# Char-level alternative (2- and 3-grams):
#vectorizer = TfidfVectorizer(analyzer='char', ngram_range=(2,3), max_features=5000)

vectorizer.fit(text)

# Show the vocabulary size and a small sample instead of dumping every
# term -> column-index pair into the notebook output.
print('Vocabulary size :', len(vectorizer.vocabulary_))
print(dict(list(vectorizer.vocabulary_.items())[:20]))
print(vectorizer.idf_)

{'سلام': 2856, 'على': 3338, 'علاج': 3327, 'قذف': 3600, 'سريع': 2826, 'طبيعة': 3161, 'برود': 1488, 'جنس': 2153, 'نساء': 4617, 'طلب': 3183, 'تواصل': 1986, 'واتساب': 4783, 'من': 4425, 'هنا': 4764, 'صفح': 3084, 'الدكتوره': 833, 'ام': 1191, 'نايف': 4582, 'ضعف': 3136, 'انتصاب': 1242, 'الخاصه': 809, 'قطر': 3630, 'الاثاره': 458, 'اهلا': 1307, 'سهل': 2882, 'بكم': 1528, 'القذ': 996, 'ف': 3440, 'الامريكي': 619, 'الاخصائيه': 483, 'الافضل': 577, 'اصدقائي': 359, 'حي': 2340, 'صحة': 3051, 'سعيد': 2833, 'انتظار': 1244, 'ان': 1230, 'شاء': 2924, 'لها': 3902, 'تساب': 1753, 'عسل': 3294, 'منتج': 4440, 'السعاده': 871, 'حب': 2220, 'ال': 450, 'لقا': 3876, 'هنالا': 4766, 'توتر': 1991, 'بعد': 1512, 'يوم': 4982, 'حل': 2305, 'عروض': 3285, 'لسن': 3865, 'الجديده': 762, 'لكم': 3887, 'لا': 3784, 'تردي': 1730, 'زوج': 2743, 'جد': 2094, 'شباب': 2945, 'قوى': 3661, 'نشاط': 4626, 'حيوي': 2352, 'قلق': 3640, 'الاقوي': 592, 'حصر': 2273, 'راح': 2597, 'مضمون': 4283, 'بتع': 1447, 'زياد': 2749, 'وزن': 4894, 'كرش': 3724, 'او': 1311

In [None]:
# Preview rows at positions 2 through 14 of the combined frame.
df.iloc[2:15]

Unnamed: 0,Cleaned Text,Label,Cleaned Text2
33,نتمني لكم يوم سعيد هنا الافضل لا لترد ...,1,نتمني لكم يوم سعيد هنا الافضل لا تردي اهلا سهل...
34,منتج السعادهالزوجيه الان الان السعاده هنا الق...,1,منتج السعادهالزوجيه ال ال السعاده هنا لقا هنال...
36,هنا راحتك العلاج الافضل اهلا وسهلا بكم ا...,1,هنا راح علاج الافضل اهلا سهل بكم اصدقائي حي صح...
37,بتعاني من زياده الوزن والكرش او الترهلات عنا م...,1,بتع من زياد وزن كرش او ترهل عن منتج طبيعة مضمو...
38,قطره الاثاره النسائيه تعمل علي زياده الرغبه و...,1,قطر الاثاره النسائيه عمل علي زياد الرغبه لمح ع...
39,منتج تكبير وتطويل وتضخيم القضيب يعمل علي قوه و...,1,منتج تكبير تطويل تضخيم قضيب عمل علي قوى شد انت...
51,علكه الاثاره النسائيه تعمل علي زياده الرغبه و...,1,علك الاثاره النسائيه عمل علي زياد الرغبه لمح ع...
3167,باذن اله المنتجان تعجبكم تخسيس و ازاله الكرش ...,1,باذن لها منتج تعجب تخسيس و ازاله كرش و تسمي شع...
3168,السلام عليكم عندي منتجات طبيعيه مثل تخسيس و ا...,1,سلام على عند منتج طبيعة مثل تخسيس و ازاله كرش ...
3184,السلام عليكم عندي منتجات طبيعيه مثل تخسيس و ا...,1,سلام على عند منتج طبيعة مثل تخسيس و ازاله كرش ...


In [None]:
# Persist the combined frame (row labels included) as UTF-8 CSV.
df.to_csv(path_or_buf='df.csv', encoding='utf-8')

In [None]:

# Convert the documents in `text` into TF-IDF feature rows using the fitted vectorizer.
vector_list = vectorizer.transform(text)

In [None]:
from imblearn.over_sampling import SMOTE
from collections import Counter
# Balance the classes by synthesizing minority-class samples with SMOTE.
# The target is selected by name: 'Cleaned Text2' is appended to df earlier,
# so the last column (df.iloc[:, -1]) is no longer 'Label'.
# random_state makes the synthetic samples reproducible across runs.
# NOTE(review): resampling is done on the full dataset before the train/test
# split, so synthetic training points may be interpolated from rows that end
# up in the test set — consider resampling the training split only.
oversample = SMOTE(random_state=14)
X, y = oversample.fit_resample(vector_list, pd.Categorical(df["Label"]))




### **Train Test Split**

In [None]:
import numpy as np
# Count how many samples each class has after resampling.
unique, counts = np.unique(y, return_counts=True)
{label: count for label, count in zip(unique, counts)}

{0: 11034, 1: 11034}

In [None]:
from sklearn.model_selection import train_test_split
# Hold out 20% of the resampled data for testing; the seed fixes the shuffle.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.20,
    random_state=14,
)

In [None]:
# Number of samples in each split.
print('Train size', len(y_train))
print('Test Size', len(y_test))

Train size 17654
Test Size 4414


### **Feature Extraction**

### **Build The Models**

In [None]:
from sklearn import model_selection, naive_bayes, svm,neural_network,linear_model
from sklearn.metrics import accuracy_score, recall_score, f1_score, precision_score
# Classifier - Algorithm - Multinomial Naive Bayes
# Fit on the training split.
Naive = naive_bayes.MultinomialNB()
Naive.fit(X_train,y_train)
# Predict labels for the held-out test split.
predictions_NB = Naive.predict(X_test)
# scikit-learn metrics take (y_true, y_pred) in that order; passing the
# predictions first swaps the reported precision and recall.
print("Naive Bayes Precision Score -> ",precision_score(y_test, predictions_NB)*100)
print("Naive Bayes Recall Score -> ",recall_score(y_test, predictions_NB)*100)
print("Naive Bayes F1 Score -> ",f1_score(y_test, predictions_NB)*100)


Naive Bayes Precision Score ->  99.40202391904323
Naive Bayes Recall Score ->  99.44776806258628
Naive Bayes F1 Score ->  99.42489072923854


In [None]:
# Classifier - Algorithm - SVM
# Fit a linear-kernel SVM on the training split. `degree` and `gamma` are
# omitted because they only apply to non-linear kernels and are ignored here.
SVM = svm.SVC(C=1.0, kernel='linear')
SVM.fit(X_train,y_train)
# Predict labels for the held-out test split.
predictions_SVM = SVM.predict(X_test)
# scikit-learn metrics take (y_true, y_pred) in that order; passing the
# predictions first swaps the reported precision and recall.
print("SVM Precision Score -> ",precision_score(y_test, predictions_SVM)*100)
print("SVM Recall Score -> ",recall_score(y_test, predictions_SVM)*100)
print("SVM F1 Score -> ",f1_score(y_test, predictions_SVM)*100)

SVM Precision Score ->  99.77000919963201
SVM Recall Score ->  99.67830882352942
SVM F1 Score ->  99.72413793103449


In [None]:
# Classifier - Algorithm - NN
# Fit a small MLP (three hidden layers of 8 units) on the training split.
# random_state fixes weight initialization and batch shuffling so the scores
# are reproducible from run to run.
NN = neural_network.MLPClassifier(hidden_layer_sizes=(8,8,8), activation='relu', solver='adam', max_iter=100, random_state=14)
NN.fit(X_train,y_train)
# Predict labels for the held-out test split.
predictions_NN = NN.predict(X_test)
# scikit-learn metrics take (y_true, y_pred) in that order; passing the
# predictions first swaps the reported precision and recall.
print("NN Precision Score -> ",precision_score(y_test, predictions_NN)*100)
print("NN Recall Score -> ",recall_score(y_test, predictions_NN)*100)
print("NN F1 Score -> ",f1_score(y_test, predictions_NN)*100)

NN Precision Score ->  99.90800367985281
NN Recall Score ->  99.40503432494279
NN F1 Score ->  99.65588437715071




In [None]:
# Classifier - Algorithm - LR
# Fit logistic regression (library defaults) on the training split.
LR = linear_model.LogisticRegression()
LR.fit(X_train,y_train)
# Predict labels for the held-out test split.
predictions_LR = LR.predict(X_test)
# scikit-learn metrics take (y_true, y_pred) in that order; passing the
# predictions first swaps the reported precision and recall.
print("LR Precision Score -> ",precision_score(y_test, predictions_LR)*100)
print("LR Recall Score -> ",recall_score(y_test, predictions_LR)*100)
print("LR F1 Score -> ",f1_score(y_test, predictions_LR)*100)

LR Precision Score ->  97.28610855565776
LR Recall Score ->  99.71711456859971
LR F1 Score ->  98.48661233993015
