## **Experiment 1: No up-sampling & No Preprocessing**

### **Get Data**

In [None]:
import pandas as pd
df_Spam = pd.read_excel("SpamTweets_S.xlsx")
df_Ham = pd.read_excel("HamTweets_S.xlsx")

In [None]:
print('Number of Spam Tweets :' ,len(df_Spam))
print('Number of Ham Tweets :' ,len(df_Ham))

Number of Spam Tweets : 10339
Number of Ham Tweets : 11299


In [None]:
df_Spam.columns

Index(['OSN', 'SN', 'Tweet ID', 'Label', 'Date', 'Time', 'Date Time', 'URL',
       'Tweet Text', 'Cleaned Text', 'User Name', 'Location',
       'Replied Tweet ID', 'Replied Tweet User ID ', 'Replied Tweet User Name',
       'Coordinates', 'Retweet Count', 'Favorite Count', 'Favorited'],
      dtype='object')

In [None]:
df_Ham.columns

Index(['SN', 'OSN', 'Tweet ID', 'Label', 'Date', 'Time', 'Date Time', 'URL',
       'Tweet Text', 'Cleaned Text', 'User Name', 'Location',
       'Replied Tweet ID ', 'Replied Tweet User ID', 'Replied Tweet User name',
       'Coordinates', 'Retweet Count', 'Favorite Count', 'Favorited'],
      dtype='object')

In [None]:
df_Spam = df_Spam[['Cleaned Text','Label']]

In [None]:
df_Ham = df_Ham[['Cleaned Text','Label']]

In [None]:
df_Spam = df_Spam.drop_duplicates(subset="Cleaned Text")
df_Ham = df_Ham.drop_duplicates(subset="Cleaned Text")

In [None]:
print('Number of Spam Tweets :' ,len(df_Spam))
print('Number of Ham Tweets :' ,len(df_Ham))

Number of Spam Tweets : 2230
Number of Ham Tweets : 11034


In [None]:
df = pd.concat([df_Spam, df_Ham])

In [None]:
print('Number of ALL Tweets :' ,len(df))

Number of ALL Tweets : 13264


### **Data Preprocessing**

### **Train Test Split**

In [None]:
from sklearn.model_selection import train_test_split
train , test = train_test_split(df,
    test_size = 0.20,
    random_state = 14,
    stratify = df.Label.values)

In [None]:
from sklearn.model_selection import train_test_split
Train_X, Test_X, Train_Y, Test_Y  = train_test_split(
        df['Cleaned Text'] , df['Label'], test_size=0.2, random_state=42, stratify=df.Label.values)

In [None]:
print('Train size',len(Train_X))
print('Test Size' , len(Test_X))

Train size 10611
Test Size 2653


###**Feature Extraction**

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
#Word-Level
#Tfidf_vect_ngram = TfidfVectorizer(analyzer='word', token_pattern=r'\w{1,}', ngram_range=(1,2), max_features=5000)

#Char-Level
Tfidf_vect_ngram = TfidfVectorizer(analyzer='char', ngram_range=(2,3), max_features=5000)

Tfidf_vect_ngram.fit(df['Cleaned Text'])
Train_X_Tfidf_ngram =  Tfidf_vect_ngram.transform(Train_X)
Test_X_Tfidf_ngram =  Tfidf_vect_ngram.transform(Test_X)


In [None]:
print(Tfidf_vect_ngram.vocabulary_)

{'ال': 845, 'لس': 3671, 'سل': 2438, 'لا': 3516, 'ام': 875, 'م ': 3898, ' ع': 240, 'عل': 2976, 'لي': 3877, 'يك': 4923, 'كم': 3452, 'اج': 632, 'ج ': 1503, ' ا': 0, 'لق': 3765, 'قذ': 3300, 'ذف': 2015, 'ف ': 3078, 'سر': 2403, 'ري': 2230, 'يع': 4897, 'ع ': 2864, 'لع': 3727, 'لط': 3717, 'طب': 2780, 'بي': 1143, 'عي': 3013, 'ي ': 4736, 'لب': 3545, 'بر': 1033, 'رو': 2212, 'ود': 4573, 'د ': 1862, 'لج': 3592, 'جن': 1581, 'نس': 4268, 'سي': 2474, ' ل': 307, 'لن': 3831, 'سا': 2343, 'اء': 572, 'ء ': 460, 'طل': 2803, 'ب ': 958, 'لت': 3560, 'تو': 1423, 'وا': 4496, 'اص': 748, 'صل': 2661, 'ل ': 3490, ' و': 398, 'ات': 607, 'تس': 1285, 'اب': 593, ' م': 335, 'من': 4105, 'ن ': 4164, ' ه': 387, 'هن': 4451, 'نا': 4192, 'ا ': 546, ' ص': 208, 'صف': 2654, 'فح': 3135, 'حه': 1751, 'ه ': 4361, 'لد': 3634, 'دك': 1955, 'كت': 3411, 'ور': 4583, 'ره': 2208, ' ن': 366, 'اي': 938, 'يف': 4909, 'الس': 858, 'لسل': 3682, 'سلا': 2440, 'لام': 3540, 'ام ': 876, 'م ع': 3913, ' عل': 251, 'علي': 2984, 'ليك': 3891, 'يكم': 4929, 'كم '

###**Build The Models**

In [None]:
from sklearn import model_selection, naive_bayes, svm,neural_network,linear_model
from sklearn.metrics import accuracy_score, recall_score, f1_score, precision_score
# fit the training dataset on the NB classifier
Naive = naive_bayes.MultinomialNB()
Naive.fit(Train_X_Tfidf_ngram,Train_Y)
# predict the labels on validation dataset
predictions_NB = Naive.predict(Test_X_Tfidf_ngram)
# Use accuracy_score function to get the accuracy
#print("Naive Bayes Accuracy Score -> ",accuracy_score(predictions_NB, Test_Y)*100)
print("Naive Bayes Precision Score -> ",precision_score(predictions_NB, Test_Y)*100)
print("Naive Bayes Recall Score -> ",recall_score(predictions_NB, Test_Y)*100)
print("Naive Bayes F1 Score -> ",f1_score(predictions_NB, Test_Y)*100)


Naive Bayes Precision Score ->  90.35874439461884
Naive Bayes Recall Score ->  99.75247524752476
Naive Bayes F1 Score ->  94.82352941176471


In [None]:
import inspect
hyperparams = inspect.signature(Naive.__init__)
print(hyperparams)


(alpha=1.0, fit_prior=True, class_prior=None)


In [None]:
# Classifier - Algorithm - SVM
# fit the training dataset on the classifier
SVM = svm.SVC(C=1.0, kernel='linear', degree=3, gamma='auto')
SVM.fit(Train_X_Tfidf_ngram,Train_Y)
# predict the labels on validation dataset
predictions_SVM = SVM.predict(Test_X_Tfidf_ngram)
# Use accuracy_score function to get the accuracy
#print("SVM Accuracy Score -> ",accuracy_score(predictions_SVM, Test_Y)*100)
print("SVM Precision Score -> ",precision_score(predictions_SVM, Test_Y)*100)
print("SVM Recall Score -> ",recall_score(predictions_SVM, Test_Y)*100)
print("SVM F1 Score -> ",f1_score(predictions_SVM, Test_Y)*100)

SVM Precision Score ->  97.08520179372198
SVM Recall Score ->  100.0
SVM F1 Score ->  98.52104664391354


In [None]:
import inspect
hyperparams = inspect.signature(SVM.__init__)
print(hyperparams)


(C=1.0, kernel='rbf', degree=3, gamma='scale', coef0=0.0, shrinking=True, probability=False, tol=0.001, cache_size=200, class_weight=None, verbose=False, max_iter=-1, decision_function_shape='ovr', break_ties=False, random_state=None)


In [None]:
# Classifier - Algorithm - NN
# fit the training dataset on the classifier
NN = neural_network.MLPClassifier(hidden_layer_sizes=(8,8,8), activation='relu', solver='adam', max_iter=100)
NN.fit(Train_X_Tfidf_ngram,Train_Y)
# predict the labels on validation dataset
predictions_NN = NN.predict(Test_X_Tfidf_ngram)
# Use accuracy_score function to get the accuracy
#print("NN Accuracy Score -> ",accuracy_score(predictions_NN, Test_Y)*100)
print("NN Precision Score -> ",precision_score(predictions_NN, Test_Y)*100)
print("NN Recall Score -> ",recall_score(predictions_NN, Test_Y)*100)
print("NN F1 Score -> ",f1_score(predictions_NN, Test_Y)*100)

NN Precision Score ->  97.30941704035875
NN Recall Score ->  99.31350114416476
NN F1 Score ->  98.30124575311437


In [None]:
import inspect
hyperparams = inspect.signature(NN.__init__)
print(hyperparams)


(hidden_layer_sizes=(100,), activation='relu', solver='adam', alpha=0.0001, batch_size='auto', learning_rate='constant', learning_rate_init=0.001, power_t=0.5, max_iter=200, shuffle=True, random_state=None, tol=0.0001, verbose=False, warm_start=False, momentum=0.9, nesterovs_momentum=True, early_stopping=False, validation_fraction=0.1, beta_1=0.9, beta_2=0.999, epsilon=1e-08, n_iter_no_change=10, max_fun=15000)


In [None]:
# Classifier - Algorithm - LR
# fit the training dataset on the classifier
LR = linear_model.LogisticRegression()
LR.fit(Train_X_Tfidf_ngram,Train_Y)
# predict the labels on validation dataset
predictions_LR = LR.predict(Test_X_Tfidf_ngram)
# Use accuracy_score function to get the accuracy
#print("LR Accuracy Score -> ",accuracy_score(predictions_LR, Test_Y)*100)
print("LR Precision Score -> ",precision_score(predictions_LR, Test_Y)*100)
print("LR Recall Score -> ",recall_score(predictions_LR, Test_Y)*100)
print("LR F1 Score -> ",f1_score(predictions_LR, Test_Y)*100)

LR Precision Score ->  91.4798206278027
LR Recall Score ->  100.0
LR F1 Score ->  95.5503512880562


In [None]:
import inspect
hyperparams = inspect.signature(LR.__init__)
print(hyperparams)


(penalty='l2', dual=False, tol=0.0001, C=1.0, fit_intercept=True, intercept_scaling=1, class_weight=None, random_state=None, solver='lbfgs', max_iter=100, multi_class='auto', verbose=0, warm_start=False, n_jobs=None, l1_ratio=None)
