In [1]:
import pandas as pd
import numpy as np
import json
import re
import string
from sklearn.model_selection import train_test_split
from sklearn.utils import resample
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.metrics import accuracy_score, f1_score
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, GradientBoostingClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.ensemble import VotingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB, GaussianNB
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import StratifiedShuffleSplit
from tqdm.notebook import tqdm
import matplotlib.pyplot as plt
import sys

import pickle


%matplotlib inline
rand_seed = 0  # random state for reproducibility
np.random.seed(rand_seed)


In [2]:
data = pd.read_excel('ALL_data.xlsx')
data = data.dropna()
data.head()


Unnamed: 0,comment,label
0,نفسي يوم تكتبو السعر بدون مانسال,سلبي
1,طيب ما تشرحو طريقه الاشتراك في الباقه دي,محايد
2,لو قللتو الرسائل دي واديتونا ليها ميقات يكون ...,سلبي
3,رمز الاشتراك شنو,محايد
4,واو,ايجابي


In [3]:
data = data[data['label'] != 'محايد']
data

Unnamed: 0,comment,label
0,نفسي يوم تكتبو السعر بدون مانسال,سلبي
2,لو قللتو الرسائل دي واديتونا ليها ميقات يكون ...,سلبي
4,واو,ايجابي
8,شكرا التوضيح مفيد اكرر الشكر سوداني الابداع وا...,ايجابي
13,سوداني جميل,ايجابي
...,...,...
3068,خليك سوداني,ايجابي
3069,سوداني,ايجابي
3070,سوداني الاقوي والافضل,ايجابي
3071,خليك سوداني,ايجابي


In [4]:
def random_split(data, features, output, fraction, seed=0):
    X_train, X_test, y_train, y_test = train_test_split(data[features],
                                                        data[output],
                                                        stratify = data[output],
                                                        random_state=seed,
                                                        train_size=fraction
                                                       )
    train_data = pd.DataFrame(data=X_train, columns=features)
    train_data[output] = y_train
    test_data = pd.DataFrame(data=X_test, columns=features)
    test_data[output] = y_test
    
    return train_data, test_data

In [5]:
train_fraction = .80 # use this to split data into training (80%), and tmp (20%)
val_fraction = .50   # use this to split the tmp data into validation (50%), and 
                     # testing (50%) which means that the validation will be 10% of the original data as well as the

output = 'label' # output label column
features = data.columns.tolist() # the features columns
features.remove(output)
print('output:', output)
print('features:', features)

train_data, tmp = random_split(data, features, output, train_fraction, rand_seed)
val_data, test_data = random_split(tmp, features, output, val_fraction, rand_seed)

print(len(train_data))
print(len(val_data))
print(len(test_data))
print(len(train_data)+len(val_data)+len(test_data))
print(len(data))

output: label
features: ['comment']
1899
237
238
2374
2374


In [6]:
#TF IDF
vectorizer = TfidfVectorizer(ngram_range=(1, 2), sublinear_tf=True, max_df=0.5, stop_words=None, use_idf=True)
train_data_features = vectorizer.fit_transform(train_data['comment'].values.astype('U'))
val_data_features = vectorizer.transform(val_data['comment'].values.astype('U'))
test_data_features = vectorizer.transform(test_data['comment'].values.astype('U'))

In [7]:
train_data_features.shape, val_data_features.shape, test_data_features.shape

((1899, 14373), (237, 14373), (238, 14373))

In [8]:
def train_n_test_classifier(clf, train_features, train_labels, test_features, test_labels):
    clf.fit(train_features, train_labels) # please learn patterns from the data

    print("score on training data:")
    print(clf.score(train_features, train_labels))
    print('_'*100)

    print("score on testing data:")
    
    pred_y = clf.predict(test_features)
    print('accuracy_score: ')
    print(accuracy_score(test_labels, pred_y))
    
    print('f1_score: ')
    print(f1_score(test_labels, pred_y, average='macro'))


In [9]:
logistic_reg = LogisticRegression(random_state=rand_seed)

train_n_test_classifier(logistic_reg, train_data_features, train_data[output],
                        val_data_features, val_data[output])

score on training data:
0.8836229594523434
____________________________________________________________________________________________________
score on testing data:
accuracy_score: 
0.890295358649789
f1_score: 
0.8647497805092187


In [10]:
mnb = MultinomialNB()

train_n_test_classifier(mnb, train_data_features, train_data[output],
                        val_data_features, val_data[output])

score on training data:
0.9694576092680358
____________________________________________________________________________________________________
score on testing data:
accuracy_score: 
0.9071729957805907
f1_score: 
0.8875517598343685


In [11]:
svm = SVC(kernel='linear', probability=True, random_state=rand_seed)

train_n_test_classifier(svm, train_data_features, train_data[output],
                        val_data_features, val_data[output])

score on training data:
0.9963138493944181
____________________________________________________________________________________________________
score on testing data:
accuracy_score: 
0.9156118143459916
f1_score: 
0.9002525252525253


In [12]:
rf = RandomForestClassifier(n_estimators=100, random_state=rand_seed)

train_n_test_classifier(rf, train_data_features, train_data[output],
                        val_data_features, val_data[output])

score on training data:
0.9994734070563455
____________________________________________________________________________________________________
score on testing data:
accuracy_score: 
0.9071729957805907
f1_score: 
0.8911209488807217


In [14]:
mlp = MLPClassifier(hidden_layer_sizes=(20,20,20,20), verbose=True, tol=0.001, random_state=rand_seed)
train_n_test_classifier(mlp, train_data_features, train_data[output],
                        val_data_features, val_data[output])


Iteration 1, loss = 0.67799087
Iteration 2, loss = 0.65454975
Iteration 3, loss = 0.63064843
Iteration 4, loss = 0.60176048
Iteration 5, loss = 0.56629748
Iteration 6, loss = 0.51329382
Iteration 7, loss = 0.43338528
Iteration 8, loss = 0.34852978
Iteration 9, loss = 0.26967602
Iteration 10, loss = 0.20255696
Iteration 11, loss = 0.15397687
Iteration 12, loss = 0.11927121
Iteration 13, loss = 0.09180937
Iteration 14, loss = 0.06936450
Iteration 15, loss = 0.05015160
Iteration 16, loss = 0.03498275
Iteration 17, loss = 0.02393390
Iteration 18, loss = 0.01634913
Iteration 19, loss = 0.01162851
Iteration 20, loss = 0.00850964
Iteration 21, loss = 0.00656197
Iteration 22, loss = 0.00535723
Iteration 23, loss = 0.00443225
Iteration 24, loss = 0.00368428
Iteration 25, loss = 0.00333799
Iteration 26, loss = 0.00307049
Iteration 27, loss = 0.00280107
Iteration 28, loss = 0.00241295
Iteration 29, loss = 0.00222765
Iteration 30, loss = 0.00209399
Iteration 31, loss = 0.00219830
Iteration 32, los

In [16]:
# reading our prepared data
data = pd.read_excel('ALL_data.xlsx')
data = data.dropna()
data.head()

Unnamed: 0,comment,label
0,نفسي يوم تكتبو السعر بدون مانسال,سلبي
1,طيب ما تشرحو طريقه الاشتراك في الباقه دي,محايد
2,لو قللتو الرسائل دي واديتونا ليها ميقات يكون ...,سلبي
3,رمز الاشتراك شنو,محايد
4,واو,ايجابي


In [17]:
data.groupby('label').count()

Unnamed: 0_level_0,comment
label,Unnamed: 1_level_1
ايجابي,773
سلبي,1601
محايد,699


In [18]:
positive_data = data[data['label'] == 'ايجابي'].dropna()
negative_data = data[data['label'] == 'سلبي'].dropna()
neutral_data = data[data['label'] == 'محايد'].dropna()
len(positive_data), len(negative_data), len(neutral_data)

(773, 1601, 699)

In [19]:
non_neutral_data = positive_data.append(negative_data).sample(frac=1).reset_index(drop=True)
non_neutral_data['label'] = 'غير محايد'

  non_neutral_data = positive_data.append(negative_data).sample(frac=1).reset_index(drop=True)


In [20]:
neu_data = neutral_data.append(non_neutral_data).dropna().sample(frac=1).reset_index(drop=True)
neu_data

  neu_data = neutral_data.append(non_neutral_data).dropna().sample(frac=1).reset_index(drop=True)


Unnamed: 0,comment,label
0,الناس تتفق تقفل يومين بس لانت لامكالمات والله ...,غير محايد
1,انتو يا جماعه شركه كنداكه ده وين اختفي كده ولا...,محايد
2,وين السعر وانا اقول العلم التجار حركه عدم الاس...,غير محايد
3,والشبكه زفت الزفت,غير محايد
4,بالغتو والله طلعتو زيتنا تسقطو بس,غير محايد
...,...,...
3068,والله حرام عليكم زيادة فظيعة حتى التي قبلها لم...,غير محايد
3069,1قيقا,محايد
3070,عمل انساني شنو المناقل ماشايفنها,محايد
3071,كل يومين تلاتة زايدين اسعاركم,غير محايد


In [21]:
train_fraction = .80 # use this to split data into training (80%), and tmp (20%)
val_fraction = .50   # use this to split the tmp data into validation (50%), and 
                     # testing (50%) which means that the validation will be 10% of the original data as well as the

# seed = 0  # random state for reproducibility
output = 'label' # output label column
features = neu_data.columns.tolist() # the features columns
features.remove(output)
print('output:', output)
print('features:', features)

neu_train_data, neu_tmp = random_split(neu_data, features, output, train_fraction, rand_seed)
neu_val_data, neu_test_data = random_split(neu_tmp, features, output, val_fraction, rand_seed)

print(len(neu_train_data))
print(len(neu_val_data))
print(len(neu_test_data))
print(len(neu_train_data)+len(neu_val_data)+len(neu_test_data))
print(len(neu_data))

output: label
features: ['comment']
2458
307
308
3073
3073


In [23]:
# TF IDF
neu_vectorizer = TfidfVectorizer(ngram_range=(1, 2), sublinear_tf=True, max_df=0.5, stop_words=None, use_idf=True)
neu_train_data_features = neu_vectorizer.fit_transform(neu_train_data['comment'].values.astype('U'))
neu_val_data_features = neu_vectorizer.transform(neu_val_data['comment'].values.astype('U'))
neu_test_data_features = neu_vectorizer.transform(neu_test_data['comment'].values.astype('U'))


In [24]:
neu_logistic_reg = LogisticRegression(random_state=rand_seed)

train_n_test_classifier(neu_logistic_reg, neu_train_data_features, neu_train_data[output],
                        neu_val_data_features, neu_val_data[output])

score on training data:
0.8112286411716843
____________________________________________________________________________________________________
score on testing data:
accuracy_score: 
0.7882736156351792
f1_score: 
0.5278599313853071


In [25]:
neu_mlp = MLPClassifier(hidden_layer_sizes=(100,100), verbose=True, tol=0.001, random_state=rand_seed)
train_n_test_classifier(neu_mlp, neu_train_data_features, neu_train_data[output],
                        neu_val_data_features, neu_val_data[output])

Iteration 1, loss = 0.61273256
Iteration 2, loss = 0.52792750
Iteration 3, loss = 0.43478158
Iteration 4, loss = 0.33809975
Iteration 5, loss = 0.22833281
Iteration 6, loss = 0.13797808
Iteration 7, loss = 0.07985339
Iteration 8, loss = 0.04659375
Iteration 9, loss = 0.03015936
Iteration 10, loss = 0.02231269
Iteration 11, loss = 0.01905966
Iteration 12, loss = 0.01628922
Iteration 13, loss = 0.01606904
Iteration 14, loss = 0.01487527
Iteration 15, loss = 0.01411753
Iteration 16, loss = 0.01350253
Iteration 17, loss = 0.01380622
Iteration 18, loss = 0.01318912
Iteration 19, loss = 0.01359710
Iteration 20, loss = 0.01288595
Iteration 21, loss = 0.01268367
Iteration 22, loss = 0.01240760
Iteration 23, loss = 0.01214126
Iteration 24, loss = 0.01244790
Iteration 25, loss = 0.01197988
Training loss did not improve more than tol=0.001000 for 10 consecutive epochs. Stopping.
score on training data:
0.9926769731489016
____________________________________________________________________________

In [26]:
neu_mnb = MultinomialNB()
train_n_test_classifier(neu_mnb, neu_train_data_features, neu_train_data[output],
                        neu_val_data_features, neu_val_data[output])

score on training data:
0.8563873067534581
____________________________________________________________________________________________________
score on testing data:
accuracy_score: 
0.7882736156351792
f1_score: 
0.5174006626520593


In [27]:
neu_svm = SVC(kernel='linear', probability=True, random_state=rand_seed)

train_n_test_classifier(neu_svm, neu_train_data_features, neu_train_data[output],
                        neu_val_data_features, neu_val_data[output])

score on training data:
0.9759967453213995
____________________________________________________________________________________________________
score on testing data:
accuracy_score: 
0.8338762214983714
f1_score: 
0.7070446253157452


In [40]:
neu_rf = RandomForestClassifier(n_estimators=100, random_state=rand_seed)

train_n_test_classifier(neu_rf, neu_train_data_features, neu_train_data[output],
                        neu_val_data_features, neu_val_data[output])

score on training data:
0.9922701383238405
____________________________________________________________________________________________________
score on testing data:
accuracy_score: 
0.8143322475570033
f1_score: 
0.6622662266226623


In [41]:
pickle.dump(vectorizer, open(f'vectorizer.pkl', 'wb'))
pickle.dump(logistic_reg, open(f'logistic_reg.pkl', 'wb'))
pickle.dump(mnb, open(f'mnb.pkl', 'wb'))
pickle.dump(svm, open(f'svm.pkl', 'wb'))
pickle.dump(rf, open(f'rf.pkl', 'wb'))
pickle.dump(mlp, open(f'mlp.pkl', 'wb'))
pickle.dump(neu_vectorizer, open(f'neu_vectorizer.pkl', 'wb'))
pickle.dump(neu_logistic_reg, open(f'neu_logistic_reg.pkl', 'wb'))
pickle.dump(neu_mnb, open(f'neu_mnb.pkl', 'wb'))
pickle.dump(neu_svm, open(f'neu_svm.pkl', 'wb'))
pickle.dump(neu_rf, open(f'neu_rf.pkl', 'wb'))
pickle.dump(neu_mlp, open(f'neu_mlp.pkl', 'wb'))
vectorizer = pickle.load(open(f'vectorizer.pkl', 'rb'))
logistic_reg = pickle.load(open(f'logistic_reg.pkl', 'rb'))
mnb = pickle.load(open(f'mnb.pkl', 'rb'))
svm = pickle.load(open(f'svm.pkl', 'rb'))
rf = pickle.load(open(f'rf.pkl', 'rb'))
mlp = pickle.load(open(f'mlp.pkl', 'rb'))

neu_vectorizer = pickle.load(open(f'neu_vectorizer.pkl', 'rb'))
neu_logistic_reg = pickle.load(open(f'neu_logistic_reg.pkl', 'rb'))
neu_mnb = pickle.load(open(f'neu_mnb.pkl', 'rb'))
neu_svm = pickle.load(open(f'neu_svm.pkl', 'rb'))
neu_rf = pickle.load(open(f'neu_rf.pkl', 'rb'))
neu_mlp = pickle.load(open(f'neu_mlp.pkl', 'rb'))

In [45]:
def predict_multi_level(X, neu_vectorizer, neu_clf, vectorizer, clf):
    neu_y_pred = neu_clf.predict(neu_vectorizer.transform(X))
    if len(X[neu_y_pred == 'غير محايد']) > 0:
        y_pred = clf.predict(vectorizer.transform(X[neu_y_pred == 'غير محايد'])) # classify non neutral into positive or negative
        neu_y_pred[neu_y_pred == 'غير محايد'] = y_pred
    
    final_y_pred = neu_y_pred
    return final_y_pred




In [46]:

X = test_data.dropna()['comment'].values
y = test_data.dropna()['label'].values
pred_y = predict_multi_level(X, neu_vectorizer, neu_mlp, vectorizer, mnb)



In [47]:
print('accuracy_score: ')
print(accuracy_score(y, pred_y))

print('f1_score: ')
print(f1_score(y, pred_y, average='macro'))

accuracy_score: 
0.8613445378151261
f1_score: 
0.5544733208926919


In [48]:
print("__________________________________________________________________________________________________________")

__________________________________________________________________________________________________________


In [84]:
data = pd.read_excel('ALL_data.xlsx')
data = data.dropna()
data.head()


Unnamed: 0,comment,label
0,نفسي يوم تكتبو السعر بدون مانسال,سلبي
1,طيب ما تشرحو طريقه الاشتراك في الباقه دي,محايد
2,لو قللتو الرسائل دي واديتونا ليها ميقات يكون ...,سلبي
3,رمز الاشتراك شنو,محايد
4,واو,ايجابي


In [85]:
data['clean']=data['comment']

In [86]:
data.head()

Unnamed: 0,comment,label,clean
0,نفسي يوم تكتبو السعر بدون مانسال,سلبي,نفسي يوم تكتبو السعر بدون مانسال
1,طيب ما تشرحو طريقه الاشتراك في الباقه دي,محايد,طيب ما تشرحو طريقه الاشتراك في الباقه دي
2,لو قللتو الرسائل دي واديتونا ليها ميقات يكون ...,سلبي,لو قللتو الرسائل دي واديتونا ليها ميقات يكون ...
3,رمز الاشتراك شنو,محايد,رمز الاشتراك شنو
4,واو,ايجابي,واو


In [87]:

from nltk.corpus import stopwords
from nltk import word_tokenize

arabicStopWords= stopwords.words("arabic")

for i in range(0,len(data)):
    tokenizedRow = word_tokenize(data['clean'][i])
    commentWithNoStopWords= ' '.join([i for i in tokenizedRow if i not in arabicStopWords])
                

    data['clean'][i]=commentWithNoStopWords
    


In [60]:
# RANDOM SPLIT

def random_split(data, features, output, fraction, seed=0):
    X_train, X_test, y_train, y_test = train_test_split(data[features],
                                                        data[output],
                                                        stratify = data[output],
                                                        random_state=seed,
                                                        train_size=fraction
                                                       )
    train_data = pd.DataFrame(data=X_train, columns=features)
    train_data[output] = y_train
    test_data = pd.DataFrame(data=X_test, columns=features)
    test_data[output] = y_test
    
    return train_data, test_data


#LABE4L FEATURE


train_fraction = .80 # use this to split data into training (80%), and tmp (20%)
val_fraction = .50   # use this to split the tmp data into validation (50%), and 
                     # testing (50%) which means that the validation will be 10% of the original data as well as the

output = 'label' # output label column
features = data.columns.tolist() # the features columns
features.remove(output)
print('output:', output)
print('features:', features)

train_data, tmp = random_split(data, features, output, train_fraction, rand_seed)
val_data, test_data = random_split(tmp, features, output, val_fraction, rand_seed)
train_fraction = .80 # use this to split data into training (80%), and tmp (20%)
val_fraction = .50   # use this to split the tmp data into validation (50%), and 
                     # testing (50%) which means that the validation will be 10% of the original data as well as the

output = 'label' # output label column
features = data.columns.tolist() # the features columns
features.remove(output)
features.remove('comment')
print('output:', output)
print('features:', features)

train_data, tmp = random_split(data, features, output, train_fraction, rand_seed)
val_data, test_data = random_split(tmp, features, output, val_fraction, rand_seed)

print("train data = "+str(len(train_data)))
print("val  data = "+str(len(val_data)))
print("test  data = "+str(len(test_data)))

print("all data = "+str(len(data)))

print(len(train_data))
print(len(val_data))
print(len(test_data))
print(len(train_data)+len(val_data)+len(test_data))
print(len(data))



# TF IDF


vectorizer = TfidfVectorizer(ngram_range=(1, 2), sublinear_tf=True, max_df=0.5, stop_words=None, use_idf=True)
train_data_features = vectorizer.fit_transform(train_data['clean'])
val_data_features = vectorizer.transform(val_data['clean'])
test_data_features = vectorizer.transform(test_data['clean'])

#SHAPE

train_data_features.shape, val_data_features.shape, test_data_features.shape

#FUNCTION FOR MODEL TRAIN

def train_n_test_classifier(clf, train_features, train_labels, test_features, test_labels,data):
    clf.fit(train_features, train_labels) # please learn patterns from the data

   
    print('-'*100+str(clf)[0:15])
    print("accuracy_score Score on training data:")
    print(clf.score(train_features, train_labels))
    
    
    print('_'*100)

    print("score on testing data:")
    
    pred_y = clf.predict(test_features)
    
    val_data['predict']=pred_y
 
    count=0
    
    print("accuracy_score Score on test data:")
    print(accuracy_score(test_labels, pred_y))
    
    print("f1_score  on test data:")
    print(f1_score(test_labels, pred_y, average='macro'))
    filename='result_'+str(clf)[0:15]+'.xlsx'
    val_data.to_excel(filename)
    
    

#LOGASTIC REGRESSION



logistic_reg = LogisticRegression(random_state=rand_seed)

train_n_test_classifier(logistic_reg, train_data_features, train_data[output],
                        val_data_features, val_data[output],val_data)



# MultinomialNB

mnb = MultinomialNB()

train_n_test_classifier(mnb, train_data_features, train_data[output],
                        val_data_features, val_data[output],val_data)

# S V M

svm = SVC(kernel='linear', probability=True, random_state=rand_seed)

train_n_test_classifier(svm, train_data_features, train_data[output],
                        val_data_features, val_data[output],val_data)


# RandomForestClassifier

rf = RandomForestClassifier(n_estimators=100, random_state=rand_seed)

train_n_test_classifier(rf, train_data_features, train_data[output],
                        val_data_features, val_data[output],val_data)

# MLPClassifier

mlp = MLPClassifier(hidden_layer_sizes=(20,20,20,20), verbose=True, tol=0.001, random_state=rand_seed)
train_n_test_classifier(mlp, train_data_features, train_data[output],
                        val_data_features, val_data[output],val_data)


output: label
features: ['comment', 'clean']
output: label
features: ['clean']
train data = 2458
val  data = 307
test  data = 308
all data = 3073
2458
307
308
3073
3073
----------------------------------------------------------------------------------------------------LogisticRegress
accuracy_score Score on training data:
0.9121236777868186
____________________________________________________________________________________________________
score on testing data:
accuracy_score Score on test data:
0.745928338762215
f1_score  on test data:
0.6897513062744789
----------------------------------------------------------------------------------------------------MultinomialNB()
accuracy_score Score on training data:
0.8986981285598047
____________________________________________________________________________________________________
score on testing data:
accuracy_score Score on test data:
0.7296416938110749
f1_score  on test data:
0.6123716011184042
------------------------------------------

In [63]:

from aranorm import normalize_arabic_text
for i in range(0,len(data)):

    data['clean'][i]=normalize_arabic_text(data['clean'][i])

In [64]:
# RANDOM SPLIT

def random_split(data, features, output, fraction, seed=0):
    X_train, X_test, y_train, y_test = train_test_split(data[features],
                                                        data[output],
                                                        stratify = data[output],
                                                        random_state=seed,
                                                        train_size=fraction
                                                       )
    train_data = pd.DataFrame(data=X_train, columns=features)
    train_data[output] = y_train
    test_data = pd.DataFrame(data=X_test, columns=features)
    test_data[output] = y_test
    
    return train_data, test_data


#LABE4L FEATURE


train_fraction = .80 # use this to split data into training (80%), and tmp (20%)
val_fraction = .50   # use this to split the tmp data into validation (50%), and 
                     # testing (50%) which means that the validation will be 10% of the original data as well as the

output = 'label' # output label column
features = data.columns.tolist() # the features columns
features.remove(output)
print('output:', output)
print('features:', features)

train_data, tmp = random_split(data, features, output, train_fraction, rand_seed)
val_data, test_data = random_split(tmp, features, output, val_fraction, rand_seed)
train_fraction = .80 # use this to split data into training (80%), and tmp (20%)
val_fraction = .50   # use this to split the tmp data into validation (50%), and 
                     # testing (50%) which means that the validation will be 10% of the original data as well as the

output = 'label' # output label column
features = data.columns.tolist() # the features columns
features.remove(output)
features.remove('comment')
print('output:', output)
print('features:', features)

train_data, tmp = random_split(data, features, output, train_fraction, rand_seed)
val_data, test_data = random_split(tmp, features, output, val_fraction, rand_seed)

print("train data = "+str(len(train_data)))
print("val  data = "+str(len(val_data)))
print("test  data = "+str(len(test_data)))

print("all data = "+str(len(data)))

print(len(train_data))
print(len(val_data))
print(len(test_data))
print(len(train_data)+len(val_data)+len(test_data))
print(len(data))



# TF IDF


vectorizer = TfidfVectorizer(ngram_range=(1, 2), sublinear_tf=True, max_df=0.5, stop_words=None, use_idf=True)
train_data_features = vectorizer.fit_transform(train_data['clean'])
val_data_features = vectorizer.transform(val_data['clean'])
test_data_features = vectorizer.transform(test_data['clean'])

#SHAPE

train_data_features.shape, val_data_features.shape, test_data_features.shape

#FUNCTION FOR MODEL TRAIN

def train_n_test_classifier(clf, train_features, train_labels, test_features, test_labels,data):
    clf.fit(train_features, train_labels) # please learn patterns from the data

   
    print('-'*100+str(clf)[0:15])
    print("accuracy_score Score on training data:")
    print(clf.score(train_features, train_labels))
    
    
    print('_'*100)

    print("score on testing data:")
    
    pred_y = clf.predict(test_features)
    
    val_data['predict']=pred_y
 
    count=0
    
    print("accuracy_score Score on test data:")
    print(accuracy_score(test_labels, pred_y))
    
    print("f1_score  on test data:")
    print(f1_score(test_labels, pred_y, average='macro'))
    filename='result_'+str(clf)[0:15]+'.xlsx'
    val_data.to_excel(filename)
    
    

#LOGASTIC REGRESSION



logistic_reg = LogisticRegression(random_state=rand_seed)

train_n_test_classifier(logistic_reg, train_data_features, train_data[output],
                        val_data_features, val_data[output],val_data)



# MultinomialNB

mnb = MultinomialNB()

train_n_test_classifier(mnb, train_data_features, train_data[output],
                        val_data_features, val_data[output],val_data)

# S V M

svm = SVC(kernel='linear', probability=True, random_state=rand_seed)

train_n_test_classifier(svm, train_data_features, train_data[output],
                        val_data_features, val_data[output],val_data)


# RandomForestClassifier

rf = RandomForestClassifier(n_estimators=100, random_state=rand_seed)

train_n_test_classifier(rf, train_data_features, train_data[output],
                        val_data_features, val_data[output],val_data)

# MLPClassifier

mlp = MLPClassifier(hidden_layer_sizes=(20,20,20,20), verbose=True, tol=0.001, random_state=rand_seed)
train_n_test_classifier(mlp, train_data_features, train_data[output],
                        val_data_features, val_data[output],val_data)


output: label
features: ['comment', 'clean']
output: label
features: ['clean']
train data = 2458
val  data = 307
test  data = 308
all data = 3073
2458
307
308
3073
3073
----------------------------------------------------------------------------------------------------LogisticRegress
accuracy_score Score on training data:
0.9137510170870626
____________________________________________________________________________________________________
score on testing data:
accuracy_score Score on test data:
0.752442996742671
f1_score  on test data:
0.6968363006023971
----------------------------------------------------------------------------------------------------MultinomialNB()
accuracy_score Score on training data:
0.8958502847843776
____________________________________________________________________________________________________
score on testing data:
accuracy_score Score on test data:
0.7231270358306189
f1_score  on test data:
0.6055293884106618
------------------------------------------

In [66]:

import stanza


nlp = stanza.Pipeline(lang='ar', processors='tokenize,lemma')


2023-01-02 20:19:58 INFO: Checking for updates to resources.json in case models have been updated.  Note: this behavior can be turned off with download_method=None or download_method=DownloadMethod.REUSE_RESOURCES


Downloading https://raw.githubusercontent.com/stanfordnlp/stanza-resources/main/resources_1.4.1.json:   0%|   …

2023-01-02 20:19:59 INFO: Loading these models for language: ar (Arabic):
| Processor | Package |
-----------------------
| tokenize  | padt    |
| mwt       | padt    |
| lemma     | padt    |

2023-01-02 20:19:59 INFO: Use device: cpu
2023-01-02 20:19:59 INFO: Loading: tokenize
2023-01-02 20:19:59 INFO: Loading: mwt
2023-01-02 20:19:59 INFO: Loading: lemma
2023-01-02 20:19:59 INFO: Done loading processors!


In [67]:

for i in range(0,len(data)):
    doc = nlp(data['clean'][i])
    lema=''.join(word.lemma+' ' for sent in doc.sentences for word in sent.words)

    data['clean'][i]=lema


In [68]:
# RANDOM SPLIT

def random_split(data, features, output, fraction, seed=0):
    X_train, X_test, y_train, y_test = train_test_split(data[features],
                                                        data[output],
                                                        stratify = data[output],
                                                        random_state=seed,
                                                        train_size=fraction
                                                       )
    train_data = pd.DataFrame(data=X_train, columns=features)
    train_data[output] = y_train
    test_data = pd.DataFrame(data=X_test, columns=features)
    test_data[output] = y_test
    
    return train_data, test_data


#LABE4L FEATURE


train_fraction = .80 # use this to split data into training (80%), and tmp (20%)
val_fraction = .50   # use this to split the tmp data into validation (50%), and 
                     # testing (50%) which means that the validation will be 10% of the original data as well as the

output = 'label' # output label column
features = data.columns.tolist() # the features columns
features.remove(output)
print('output:', output)
print('features:', features)

train_data, tmp = random_split(data, features, output, train_fraction, rand_seed)
val_data, test_data = random_split(tmp, features, output, val_fraction, rand_seed)
train_fraction = .80 # use this to split data into training (80%), and tmp (20%)
val_fraction = .50   # use this to split the tmp data into validation (50%), and 
                     # testing (50%) which means that the validation will be 10% of the original data as well as the

output = 'label' # output label column
features = data.columns.tolist() # the features columns
features.remove(output)
features.remove('comment')
print('output:', output)
print('features:', features)

train_data, tmp = random_split(data, features, output, train_fraction, rand_seed)
val_data, test_data = random_split(tmp, features, output, val_fraction, rand_seed)

print("train data = "+str(len(train_data)))
print("val  data = "+str(len(val_data)))
print("test  data = "+str(len(test_data)))

print("all data = "+str(len(data)))

print(len(train_data))
print(len(val_data))
print(len(test_data))
print(len(train_data)+len(val_data)+len(test_data))
print(len(data))



# TF IDF


vectorizer = TfidfVectorizer(ngram_range=(1, 2), sublinear_tf=True, max_df=0.5, stop_words=None, use_idf=True)
train_data_features = vectorizer.fit_transform(train_data['clean'])
val_data_features = vectorizer.transform(val_data['clean'])
test_data_features = vectorizer.transform(test_data['clean'])

#SHAPE

train_data_features.shape, val_data_features.shape, test_data_features.shape

#FUNCTION FOR MODEL TRAIN

def train_n_test_classifier(clf, train_features, train_labels, test_features, test_labels,data):
    clf.fit(train_features, train_labels) # please learn patterns from the data

   
    print('-'*100+str(clf)[0:15])
    print("accuracy_score Score on training data:")
    print(clf.score(train_features, train_labels))
    
    
    print('_'*100)

    print("score on testing data:")
    
    pred_y = clf.predict(test_features)
    
    val_data['predict']=pred_y
 
    count=0
    
    print("accuracy_score Score on test data:")
    print(accuracy_score(test_labels, pred_y))
    
    print("f1_score  on test data:")
    print(f1_score(test_labels, pred_y, average='macro'))
    filename='result_'+str(clf)[0:15]+'.xlsx'
    val_data.to_excel(filename)
    
    

#LOGASTIC REGRESSION



logistic_reg = LogisticRegression(random_state=rand_seed)

train_n_test_classifier(logistic_reg, train_data_features, train_data[output],
                        val_data_features, val_data[output],val_data)



# MultinomialNB

mnb = MultinomialNB()

train_n_test_classifier(mnb, train_data_features, train_data[output],
                        val_data_features, val_data[output],val_data)

# S V M

svm = SVC(kernel='linear', probability=True, random_state=rand_seed)

train_n_test_classifier(svm, train_data_features, train_data[output],
                        val_data_features, val_data[output],val_data)


# RandomForestClassifier

rf = RandomForestClassifier(n_estimators=100, random_state=rand_seed)

train_n_test_classifier(rf, train_data_features, train_data[output],
                        val_data_features, val_data[output],val_data)

# MLPClassifier

mlp = MLPClassifier(hidden_layer_sizes=(20,20,20,20), verbose=True, tol=0.001, random_state=rand_seed)
train_n_test_classifier(mlp, train_data_features, train_data[output],
                        val_data_features, val_data[output],val_data)


output: label
features: ['comment', 'clean']
output: label
features: ['clean']
train data = 2458
val  data = 307
test  data = 308
all data = 3073
2458
307
308
3073
3073
----------------------------------------------------------------------------------------------------LogisticRegress
accuracy_score Score on training data:
0.8315703824247356
____________________________________________________________________________________________________
score on testing data:
accuracy_score Score on test data:
0.7068403908794788
f1_score  on test data:
0.6375790018174123
----------------------------------------------------------------------------------------------------MultinomialNB()
accuracy_score Score on training data:
0.7424735557363711
____________________________________________________________________________________________________
score on testing data:
accuracy_score Score on test data:
0.6677524429967426
f1_score  on test data:
0.523066200324412
------------------------------------------

In [71]:

for i in range(0,len(data)):  
    data['clean'][i]=normalize_arabic_text(data['clean'][i])
    

In [72]:
# RANDOM SPLIT

def random_split(data, features, output, fraction, seed=0):
    X_train, X_test, y_train, y_test = train_test_split(data[features],
                                                        data[output],
                                                        stratify = data[output],
                                                        random_state=seed,
                                                        train_size=fraction
                                                       )
    train_data = pd.DataFrame(data=X_train, columns=features)
    train_data[output] = y_train
    test_data = pd.DataFrame(data=X_test, columns=features)
    test_data[output] = y_test
    
    return train_data, test_data


#LABE4L FEATURE


train_fraction = .80 # use this to split data into training (80%), and tmp (20%)
val_fraction = .50   # use this to split the tmp data into validation (50%), and 
                     # testing (50%) which means that the validation will be 10% of the original data as well as the

output = 'label' # output label column
features = data.columns.tolist() # the features columns
features.remove(output)
print('output:', output)
print('features:', features)

train_data, tmp = random_split(data, features, output, train_fraction, rand_seed)
val_data, test_data = random_split(tmp, features, output, val_fraction, rand_seed)
train_fraction = .80 # use this to split data into training (80%), and tmp (20%)
val_fraction = .50   # use this to split the tmp data into validation (50%), and 
                     # testing (50%) which means that the validation will be 10% of the original data as well as the

output = 'label' # output label column
features = data.columns.tolist() # the features columns
features.remove(output)
features.remove('comment')
print('output:', output)
print('features:', features)

train_data, tmp = random_split(data, features, output, train_fraction, rand_seed)
val_data, test_data = random_split(tmp, features, output, val_fraction, rand_seed)

print("train data = "+str(len(train_data)))
print("val  data = "+str(len(val_data)))
print("test  data = "+str(len(test_data)))

print("all data = "+str(len(data)))

print(len(train_data))
print(len(val_data))
print(len(test_data))
print(len(train_data)+len(val_data)+len(test_data))
print(len(data))



# TF IDF


vectorizer = TfidfVectorizer(ngram_range=(1, 2), sublinear_tf=True, max_df=0.5, stop_words=None, use_idf=True)
train_data_features = vectorizer.fit_transform(train_data['clean'])
val_data_features = vectorizer.transform(val_data['clean'])
test_data_features = vectorizer.transform(test_data['clean'])

#SHAPE

train_data_features.shape, val_data_features.shape, test_data_features.shape

#FUNCTION FOR MODEL TRAIN

def train_n_test_classifier(clf, train_features, train_labels, test_features, test_labels,data):
    clf.fit(train_features, train_labels) # please learn patterns from the data

   
    print('-'*100+str(clf)[0:15])
    print("accuracy_score Score on training data:")
    print(clf.score(train_features, train_labels))
    
    
    print('_'*100)

    print("score on testing data:")
    
    pred_y = clf.predict(test_features)
    
    val_data['predict']=pred_y
 
    count=0
    
    print("accuracy_score Score on test data:")
    print(accuracy_score(test_labels, pred_y))
    
    print("f1_score  on test data:")
    print(f1_score(test_labels, pred_y, average='macro'))
    filename='result_'+str(clf)[0:15]+'.xlsx'
    val_data.to_excel(filename)
    
    

#LOGASTIC REGRESSION



logistic_reg = LogisticRegression(random_state=rand_seed)

train_n_test_classifier(logistic_reg, train_data_features, train_data[output],
                        val_data_features, val_data[output],val_data)



# MultinomialNB

mnb = MultinomialNB()

train_n_test_classifier(mnb, train_data_features, train_data[output],
                        val_data_features, val_data[output],val_data)

# S V M

svm = SVC(kernel='linear', probability=True, random_state=rand_seed)

train_n_test_classifier(svm, train_data_features, train_data[output],
                        val_data_features, val_data[output],val_data)


# RandomForestClassifier

rf = RandomForestClassifier(n_estimators=100, random_state=rand_seed)

train_n_test_classifier(rf, train_data_features, train_data[output],
                        val_data_features, val_data[output],val_data)

# MLPClassifier

mlp = MLPClassifier(hidden_layer_sizes=(20,20,20,20), verbose=True, tol=0.001, random_state=rand_seed)
train_n_test_classifier(mlp, train_data_features, train_data[output],
                        val_data_features, val_data[output],val_data)


output: label
features: ['comment', 'clean']
output: label
features: ['clean']
train data = 2458
val  data = 307
test  data = 308
all data = 3073
2458
307
308
3073
3073
----------------------------------------------------------------------------------------------------LogisticRegress
accuracy_score Score on training data:
0.9202603742880391
____________________________________________________________________________________________________
score on testing data:
accuracy_score Score on test data:
0.758957654723127
f1_score  on test data:
0.7015155479546422
----------------------------------------------------------------------------------------------------MultinomialNB()
accuracy_score Score on training data:
0.8641171684296176
____________________________________________________________________________________________________
score on testing data:
accuracy_score Score on test data:
0.7296416938110749
f1_score  on test data:
0.6154767325380203
------------------------------------------

In [106]:
Data = pd.read_excel('ALL_data.xlsx')
Data = data.dropna()
Data.head()


Unnamed: 0,comment,label,clean
0,نفسي يوم تكتبو السعر بدون مانسال,سلبي,نفسي يوم تكتبو السعر بدون مانسال
2,لو قللتو الرسائل دي واديتونا ليها ميقات يكون ...,سلبي,لو قللتو الرسائل دي واديتونا ليها ميقات يكون ...
4,واو,ايجابي,واو
8,شكرا التوضيح مفيد اكرر الشكر سوداني الابداع وا...,ايجابي,شكرا التوضيح مفيد اكرر الشكر سوداني الابداع وا...
13,سوداني جميل,ايجابي,سوداني جميل


In [107]:
Data = Data[Data['label'] != 'محايد']
Data['clean']=Data['comment']
print(Data['clean'][0])

data.head()

نفسي يوم تكتبو السعر بدون مانسال


Unnamed: 0,comment,label,clean
0,نفسي يوم تكتبو السعر بدون مانسال,سلبي,نفسي يوم تكتبو السعر بدون مانسال
2,لو قللتو الرسائل دي واديتونا ليها ميقات يكون ...,سلبي,لو قللتو الرسائل دي واديتونا ليها ميقات يكون ...
4,واو,ايجابي,واو
8,شكرا التوضيح مفيد اكرر الشكر سوداني الابداع وا...,ايجابي,شكرا التوضيح مفيد اكرر الشكر سوداني الابداع وا...
13,سوداني جميل,ايجابي,سوداني جميل


In [105]:

from nltk.corpus import stopwords
from nltk import word_tokenize

arabicStopWords= stopwords.words("arabic")

for i in Data['clean']:
    tokenizedRow = word_tokenize(i)
    commentWithNoStopWords= ' '.join([i for i in tokenizedRow if i not in arabicStopWords])
                

    Data['clean'][i]=commentWithNoStopWords

KeyError: 1

KeyError: 1

In [None]:
for i in range(0,len(data)):
    tokenizedRow = word_tokenize(data['clean'][i])
    commentWithNoStopWords= ' '.join([i for i in tokenizedRow if i not in arabicStopWords])
                

    data['clean'][i]=commentWithNoStopWords
    

for i in range(0,len(data)):

    data['clean'][i]=normalize_arabic_text(data['clean'][i])

for i in range(0,len(data)):
    doc = nlp(data['clean'][i])
    lema=''.join(word.lemma+' ' for sent in doc.sentences for word in sent.words)

    data['clean'][i]=lema

for i in range(0,len(data)):  
    data['clean'][i]=normalize_arabic_text(data['clean'][i])
    