In [12]:
import pandas as pd
import numpy as np
import json
import re
import string
from sklearn.model_selection import train_test_split
from sklearn.utils import resample
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.metrics import accuracy_score, f1_score
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, GradientBoostingClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.ensemble import VotingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB, GaussianNB
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import StratifiedShuffleSplit
from tqdm.notebook import tqdm
import matplotlib.pyplot as plt
import sys
from aranorm import normalize_arabic_text
from nltk.corpus import stopwords
from nltk import word_tokenize

import pickle


%matplotlib inline
rand_seed = 0  # random state for reproducibility
np.random.seed(rand_seed)


In [13]:

data = pd.read_excel('ALL_data.xlsx')
data = data.dropna()
data['clean']=data['comment']
data.head()


Unnamed: 0,comment,label,clean
0,نفسي يوم تكتبو السعر بدون مانسال,سلبي,نفسي يوم تكتبو السعر بدون مانسال
1,طيب ما تشرحو طريقه الاشتراك في الباقه دي,محايد,طيب ما تشرحو طريقه الاشتراك في الباقه دي
2,لو قللتو الرسائل دي واديتونا ليها ميقات يكون ...,سلبي,لو قللتو الرسائل دي واديتونا ليها ميقات يكون ...
3,رمز الاشتراك شنو,محايد,رمز الاشتراك شنو
4,واو,ايجابي,واو


In [16]:

import stanza


nlp = stanza.Pipeline(lang='ar', processors='tokenize,lemma')


2023-01-05 04:22:50 INFO: Checking for updates to resources.json in case models have been updated.  Note: this behavior can be turned off with download_method=None or download_method=DownloadMethod.REUSE_RESOURCES


Downloading https://raw.githubusercontent.com/stanfordnlp/stanza-resources/main/resources_1.4.1.json:   0%|   …

2023-01-05 04:22:53 INFO: Loading these models for language: ar (Arabic):
| Processor | Package |
-----------------------
| tokenize  | padt    |
| mwt       | padt    |
| lemma     | padt    |

2023-01-05 04:22:53 INFO: Use device: cpu
2023-01-05 04:22:53 INFO: Loading: tokenize
2023-01-05 04:22:53 INFO: Loading: mwt
2023-01-05 04:22:53 INFO: Loading: lemma
2023-01-05 04:22:54 INFO: Done loading processors!


In [17]:

arabicStopWords= stopwords.words("arabic")

for i in range(0,len(data)):
    tokenizedRow = word_tokenize(data['clean'][i])
    commentWithNoStopWords= ' '.join([i for i in tokenizedRow if i not in arabicStopWords])
                

    data['clean'][i]=commentWithNoStopWords
    

for i in range(0,len(data)):

    data['clean'][i]=normalize_arabic_text(data['clean'][i])

for i in range(0,len(data)):
    doc = nlp(data['clean'][i])
    lema=''.join(word.lemma+' ' for sent in doc.sentences for word in sent.words)

    data['clean'][i]=lema

for i in range(0,len(data)):  
    data['clean'][i]=normalize_arabic_text(data['clean'][i])
    

In [19]:
data = data[data['label'] != 'محايد']
data

Unnamed: 0,comment,label,clean
0,نفسي يوم تكتبو السعر بدون مانسال,سلبي,نفسي يوم تكتب سعر بدون مانسال
2,لو قللتو الرسائل دي واديتونا ليها ميقات يكون ...,سلبي,قلل رساءل دي وادي لي هو ميقات كان اجمل
4,واو,ايجابي,
8,شكرا التوضيح مفيد اكرر الشكر سوداني الابداع وا...,ايجابي,شكر توضيح مفيد اكرر شكر سوداني ابداع تميز
13,سوداني جميل,ايجابي,سوداني جميل
...,...,...,...
3068,خليك سوداني,ايجابي,خليك سوداني
3069,سوداني,ايجابي,سوداني
3070,سوداني الاقوي والافضل,ايجابي,سوداني اقوي وافل
3071,خليك سوداني,ايجابي,خليك سوداني


In [21]:
# RANDOM SPLIT

def random_split(data, features, output, fraction, seed=0):
    X_train, X_test, y_train, y_test = train_test_split(data[features],
                                                        data[output],
                                                        stratify = data[output],
                                                        random_state=seed,
                                                        train_size=fraction
                                                       )
    train_data = pd.DataFrame(data=X_train, columns=features)
    train_data[output] = y_train
    test_data = pd.DataFrame(data=X_test, columns=features)
    test_data[output] = y_test
    
    return train_data, test_data


#LABE4L FEATURE


train_fraction = .80 # use this to split data into training (80%), and tmp (20%)
val_fraction = .50   # use this to split the tmp data into validation (50%), and 
                     # testing (50%) which means that the validation will be 10% of the original data as well as the

output = 'label' # output label column
features = data.columns.tolist() # the features columns
features.remove(output)
print('output:', output)
print('features:', features)

train_data, tmp = random_split(data, features, output, train_fraction, rand_seed)
val_data, test_data = random_split(tmp, features, output, val_fraction, rand_seed)
train_fraction = .80 # use this to split data into training (80%), and tmp (20%)
val_fraction = .50   # use this to split the tmp data into validation (50%), and 
                     # testing (50%) which means that the validation will be 10% of the original data as well as the

output = 'label' # output label column
features = data.columns.tolist() # the features columns
features.remove(output)
features.remove('comment')
print('output:', output)
print('features:', features)

train_data, tmp = random_split(data, features, output, train_fraction, rand_seed)
val_data, test_data = random_split(tmp, features, output, val_fraction, rand_seed)

print("train data = "+str(len(train_data)))
print("val  data = "+str(len(val_data)))
print("test  data = "+str(len(test_data)))

print("all data = "+str(len(data)))

print(len(train_data))
print(len(val_data))
print(len(test_data))
print(len(train_data)+len(val_data)+len(test_data))
print(len(data))



# TF IDF


vectorizer = TfidfVectorizer(ngram_range=(1, 2), sublinear_tf=True, max_df=0.5, stop_words=None, use_idf=True)
train_data_features = vectorizer.fit_transform(train_data['clean'])
val_data_features = vectorizer.transform(val_data['clean'])
test_data_features = vectorizer.transform(test_data['clean'])

#SHAPE

train_data_features.shape, val_data_features.shape, test_data_features.shape

#FUNCTION FOR MODEL TRAIN

def train_n_test_classifier(clf, train_features, train_labels, test_features, test_labels,data):
    clf.fit(train_features, train_labels) # please learn patterns from the data

   
    print('-'*100+str(clf)[0:15])
    print("accuracy_score Score on training data:")
    print(clf.score(train_features, train_labels))
    
    
    print('_'*100)

    print("score on testing data:")
    
    pred_y = clf.predict(test_features)
    
    val_data['predict']=pred_y
 
    count=0
    
    print("accuracy_score Score on test data:")
    print(accuracy_score(test_labels, pred_y))
    
    print("f1_score  on test data:")
    print(f1_score(test_labels, pred_y, average='macro'))
    filename='result_'+str(clf)[0:15]+'.xlsx'
    val_data.to_excel(filename)
    
    

#LOGASTIC REGRESSION



logistic_reg = LogisticRegression(random_state=rand_seed)

train_n_test_classifier(logistic_reg, train_data_features, train_data[output],
                        val_data_features, val_data[output],val_data)



# MultinomialNB

mnb = MultinomialNB()

train_n_test_classifier(mnb, train_data_features, train_data[output],
                        val_data_features, val_data[output],val_data)

# S V M

svm = SVC(kernel='linear', probability=True, random_state=rand_seed)

train_n_test_classifier(svm, train_data_features, train_data[output],
                        val_data_features, val_data[output],val_data)


# RandomForestClassifier

rf = RandomForestClassifier(n_estimators=100, random_state=rand_seed)

train_n_test_classifier(rf, train_data_features, train_data[output],
                        val_data_features, val_data[output],val_data)

# MLPClassifier

mlp = MLPClassifier(hidden_layer_sizes=(20,20,20,20), verbose=True, tol=0.001, random_state=rand_seed)
train_n_test_classifier(mlp, train_data_features, train_data[output],
                        val_data_features, val_data[output],val_data)


output: label
features: ['comment', 'clean']
output: label
features: ['clean']
train data = 1899
val  data = 237
test  data = 238
all data = 2374
1899
237
238
2374
2374
----------------------------------------------------------------------------------------------------LogisticRegress
accuracy_score Score on training data:
0.9062664560294892
____________________________________________________________________________________________________
score on testing data:
accuracy_score Score on test data:
0.9113924050632911
f1_score  on test data:
0.8912487708947887
----------------------------------------------------------------------------------------------------MultinomialNB()
accuracy_score Score on training data:
0.9652448657187994
____________________________________________________________________________________________________
score on testing data:
accuracy_score Score on test data:
0.919831223628692
f1_score  on test data:
0.9024670233273409
------------------------------------------

In [22]:
# غير محايد

In [23]:
# reading our prepared data
data = pd.read_excel('ALL_data.xlsx')
data = data.dropna()
data.head()

Unnamed: 0,comment,label
0,نفسي يوم تكتبو السعر بدون مانسال,سلبي
1,طيب ما تشرحو طريقه الاشتراك في الباقه دي,محايد
2,لو قللتو الرسائل دي واديتونا ليها ميقات يكون ...,سلبي
3,رمز الاشتراك شنو,محايد
4,واو,ايجابي


In [25]:
data.groupby('label').count()
data['clean']=data['comment']
data.head()

Unnamed: 0,comment,label,clean
0,نفسي يوم تكتبو السعر بدون مانسال,سلبي,نفسي يوم تكتبو السعر بدون مانسال
1,طيب ما تشرحو طريقه الاشتراك في الباقه دي,محايد,طيب ما تشرحو طريقه الاشتراك في الباقه دي
2,لو قللتو الرسائل دي واديتونا ليها ميقات يكون ...,سلبي,لو قللتو الرسائل دي واديتونا ليها ميقات يكون ...
3,رمز الاشتراك شنو,محايد,رمز الاشتراك شنو
4,واو,ايجابي,واو


In [26]:

arabicStopWords= stopwords.words("arabic")

for i in range(0,len(data)):
    tokenizedRow = word_tokenize(data['clean'][i])
    commentWithNoStopWords= ' '.join([i for i in tokenizedRow if i not in arabicStopWords])
                

    data['clean'][i]=commentWithNoStopWords
    

for i in range(0,len(data)):

    data['clean'][i]=normalize_arabic_text(data['clean'][i])

for i in range(0,len(data)):
    doc = nlp(data['clean'][i])
    lema=''.join(word.lemma+' ' for sent in doc.sentences for word in sent.words)

    data['clean'][i]=lema

for i in range(0,len(data)):  
    data['clean'][i]=normalize_arabic_text(data['clean'][i])


In [27]:
data.groupby('label').count()

Unnamed: 0_level_0,comment,clean
label,Unnamed: 1_level_1,Unnamed: 2_level_1
ايجابي,773,773
سلبي,1601,1601
محايد,699,699


In [28]:
positive_data = data[data['label'] == 'ايجابي'].dropna()
negative_data = data[data['label'] == 'سلبي'].dropna()
neutral_data = data[data['label'] == 'محايد'].dropna()
len(positive_data), len(negative_data), len(neutral_data)

(773, 1601, 699)

In [29]:
non_neutral_data = positive_data.append(negative_data).sample(frac=1).reset_index(drop=True)
non_neutral_data['label'] = 'غير محايد'

  non_neutral_data = positive_data.append(negative_data).sample(frac=1).reset_index(drop=True)


In [30]:
neu_data = neutral_data.append(non_neutral_data).dropna().sample(frac=1).reset_index(drop=True)
neu_data

  neu_data = neutral_data.append(non_neutral_data).dropna().sample(frac=1).reset_index(drop=True)


Unnamed: 0,comment,label,clean
0,الناس تتفق تقفل يومين بس لانت لامكالمات والله ...,غير محايد,انسان اتفق تقفل يوم لانت ل امكالم و الل ه يقال...
1,انتو يا جماعه شركه كنداكه ده وين اختفي كده ولا...,محايد,انتو جماع هو شرك هو كنداك هو ده وينه اختفي كده...
2,وين السعر وانا اقول العلم التجار حركه عدم الاس...,غير محايد,و ين سعر و هو اقول علم تاجر حرك هو عدم اسعارمي...
3,والشبكه زفت الزفت,غير محايد,و شبكه زفت زفت
4,بالغتو والله طلعتو زيتنا تسقطو بس,غير محايد,ب غتو و الل ه طلع زيه هو تسقط
...,...,...,...
3068,والله حرام عليكم زيادة فظيعة حتى التي قبلها لم...,غير محايد,و الل ه حرام علي هو زيادا هو فظيع هو قبل هو نش...
3069,1قيقا,محايد,قيقا
3070,عمل انساني شنو المناقل ماشايفنها,محايد,عمل انساني شنا مناقل ماشا
3071,كل يومين تلاتة زايدين اسعاركم,غير محايد,يوم تلات هو زايد اسعاركي


In [38]:
# RANDOM SPLIT

def random_split(data, features, output, fraction, seed=0):
    X_train, X_test, y_train, y_test = train_test_split(data[features],
                                                        data[output],
                                                        stratify = data[output],
                                                        random_state=seed,
                                                        train_size=fraction
                                                       )
    train_data = pd.DataFrame(data=X_train, columns=features)
    train_data[output] = y_train
    test_data = pd.DataFrame(data=X_test, columns=features)
    test_data[output] = y_test
    
    return train_data, test_data


#LABE4L FEATURE


train_fraction = .80 # use this to split data into training (80%), and tmp (20%)
val_fraction = .50   # use this to split the tmp data into validation (50%), and 
                     # testing (50%) which means that the validation will be 10% of the original data as well as the

output = 'label' # output label column
features = data.columns.tolist() # the features columns
features.remove(output)
print('output:', output)
print('features:', features)

train_data, tmp = random_split(data, features, output, train_fraction, rand_seed)
val_data, test_data = random_split(tmp, features, output, val_fraction, rand_seed)
train_fraction = .80 # use this to split data into training (80%), and tmp (20%)
val_fraction = .50   # use this to split the tmp data into validation (50%), and 
                     # testing (50%) which means that the validation will be 10% of the original data as well as the

output = 'label' # output label column
features = data.columns.tolist() # the features columns
features.remove(output)
features.remove('comment')
print('output:', output)
print('features:', features)

train_data, tmp = random_split(data, features, output, train_fraction, rand_seed)
val_data, test_data = random_split(tmp, features, output, val_fraction, rand_seed)

print("train data = "+str(len(train_data)))
print("val  data = "+str(len(val_data)))
print("test  data = "+str(len(test_data)))

print("all data = "+str(len(data)))

print(len(train_data))
print(len(val_data))
print(len(test_data))
print(len(train_data)+len(val_data)+len(test_data))
print(len(data))



# TF IDF


neu_vectorizer = TfidfVectorizer(ngram_range=(1, 2), sublinear_tf=True, max_df=0.5, stop_words=None, use_idf=True)
train_data_features = neu_vectorizer.fit_transform(train_data['clean'])
val_data_features = neu_vectorizer.transform(val_data['clean'])
test_data_features = neu_vectorizer.transform(test_data['clean'])

#SHAPE

train_data_features.shape, val_data_features.shape, test_data_features.shape

#FUNCTION FOR MODEL TRAIN

def train_n_test_classifier(clf, train_features, train_labels, test_features, test_labels,data):
    clf.fit(train_features, train_labels) # please learn patterns from the data

   
    print('-'*100+str(clf)[0:15])
    print("accuracy_score Score on training data:")
    print(clf.score(train_features, train_labels))
    
    
    print('_'*100)

    print("score on testing data:")
    
    pred_y = clf.predict(test_features)
    
    val_data['predict']=pred_y
 
    count=0
    
    print("accuracy_score Score on test data:")
    print(accuracy_score(test_labels, pred_y))
    
    print("f1_score  on test data:")
    print(f1_score(test_labels, pred_y, average='macro'))
    filename='result_'+str(clf)[0:15]+'.xlsx'
    val_data.to_excel(filename)
    
    

#LOGASTIC REGRESSION



neu_logistic_reg  = LogisticRegression(random_state=rand_seed)

train_n_test_classifier(neu_logistic_reg , train_data_features, train_data[output],
                        val_data_features, val_data[output],val_data)



# MultinomialNB

neu_mnb = MultinomialNB()

train_n_test_classifier(neu_mnb, train_data_features, train_data[output],
                        val_data_features, val_data[output],val_data)

# S V M

neu_svm = SVC(kernel='linear', probability=True, random_state=rand_seed)

train_n_test_classifier(neu_svm, train_data_features, train_data[output],
                        val_data_features, val_data[output],val_data)


# RandomForestClassifier

neu_rf = RandomForestClassifier(n_estimators=100, random_state=rand_seed)

train_n_test_classifier(neu_rf, train_data_features, train_data[output],
                        val_data_features, val_data[output],val_data)

# MLPClassifier

neu_mlp = MLPClassifier(hidden_layer_sizes=(20,20,20,20), verbose=True, tol=0.001, random_state=rand_seed)
train_n_test_classifier(neu_mlp, train_data_features, train_data[output],
                        val_data_features, val_data[output],val_data)


output: label
features: ['comment', 'clean']
output: label
features: ['clean']
train data = 2458
val  data = 307
test  data = 308
all data = 3073
2458
307
308
3073
3073
----------------------------------------------------------------------------------------------------LogisticRegress
accuracy_score Score on training data:
0.9202603742880391
____________________________________________________________________________________________________
score on testing data:
accuracy_score Score on test data:
0.758957654723127
f1_score  on test data:
0.7015155479546422
----------------------------------------------------------------------------------------------------MultinomialNB()
accuracy_score Score on training data:
0.8641171684296176
____________________________________________________________________________________________________
score on testing data:
accuracy_score Score on test data:
0.7296416938110749
f1_score  on test data:
0.6154767325380203
------------------------------------------

In [39]:
pickle.dump(vectorizer, open(f'vectorizer.pkl', 'wb'))
pickle.dump(logistic_reg, open(f'logistic_reg.pkl', 'wb'))
pickle.dump(mnb, open(f'mnb.pkl', 'wb'))
pickle.dump(svm, open(f'svm.pkl', 'wb'))
pickle.dump(rf, open(f'rf.pkl', 'wb'))
pickle.dump(mlp, open(f'mlp.pkl', 'wb'))
pickle.dump(neu_vectorizer, open(f'neu_vectorizer.pkl', 'wb'))
pickle.dump(neu_logistic_reg, open(f'neu_logistic_reg.pkl', 'wb'))
pickle.dump(neu_mnb, open(f'neu_mnb.pkl', 'wb'))
pickle.dump(neu_svm, open(f'neu_svm.pkl', 'wb'))
pickle.dump(neu_rf, open(f'neu_rf.pkl', 'wb'))
pickle.dump(neu_mlp, open(f'neu_mlp.pkl', 'wb'))
vectorizer = pickle.load(open(f'vectorizer.pkl', 'rb'))
logistic_reg = pickle.load(open(f'logistic_reg.pkl', 'rb'))
mnb = pickle.load(open(f'mnb.pkl', 'rb'))
svm = pickle.load(open(f'svm.pkl', 'rb'))
rf = pickle.load(open(f'rf.pkl', 'rb'))
mlp = pickle.load(open(f'mlp.pkl', 'rb'))

neu_vectorizer = pickle.load(open(f'neu_vectorizer.pkl', 'rb'))
neu_logistic_reg = pickle.load(open(f'neu_logistic_reg.pkl', 'rb'))
neu_mnb = pickle.load(open(f'neu_mnb.pkl', 'rb'))
neu_svm = pickle.load(open(f'neu_svm.pkl', 'rb'))
neu_rf = pickle.load(open(f'neu_rf.pkl', 'rb'))
neu_mlp = pickle.load(open(f'neu_mlp.pkl', 'rb'))


In [40]:
def predict_multi_level(X, neu_vectorizer, neu_clf, vectorizer, clf):
    neu_y_pred = neu_clf.predict(neu_vectorizer.transform(X))
    if len(X[neu_y_pred == 'غير محايد']) > 0:
        y_pred = clf.predict(vectorizer.transform(X[neu_y_pred == 'غير محايد'])) # classify non neutral into positive or negative
        neu_y_pred[neu_y_pred == 'غير محايد'] = y_pred
    
    final_y_pred = neu_y_pred
    return final_y_pred




In [42]:

X = test_data.dropna()['clean'].values
y = test_data.dropna()['label'].values
pred_y = predict_multi_level(X, neu_vectorizer, neu_mlp, vectorizer, mnb)



In [43]:
print('accuracy_score: ')
print(accuracy_score(y, pred_y))

print('f1_score: ')
print(f1_score(y, pred_y, average='macro'))

accuracy_score: 
0.7564935064935064
f1_score: 
0.7299736708483554
