In [1]:
import pandas as pd
import numpy as np
import json
import re
import string
from sklearn.model_selection import train_test_split
from sklearn.utils import resample
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.metrics import accuracy_score, f1_score
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, GradientBoostingClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.ensemble import VotingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB, GaussianNB
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import StratifiedShuffleSplit
from tqdm.notebook import tqdm
import matplotlib.pyplot as plt
import sys

import pickle


%matplotlib inline

In [2]:
rand_seed = 0  # random state for reproducibility

In [4]:
np.random.seed(rand_seed)

In [10]:
# reading our prepared data
data = pd.read_excel('file/ALL_data.xlsx')
data = data.dropna()
data.head()

Unnamed: 0,comment,label
0,نفسي يوم تكتبو السعر بدون مانسال,سلبي
1,طيب ما تشرحو طريقه الاشتراك في الباقه دي,محايد
2,لو قللتو الرسائل دي واديتونا ليها ميقات يكون ...,سلبي
3,رمز الاشتراك شنو,محايد
4,واو,ايجابي


In [12]:
data = data[data['label'] != 'محايد']
data

Unnamed: 0,comment,label
0,نفسي يوم تكتبو السعر بدون مانسال,سلبي
2,لو قللتو الرسائل دي واديتونا ليها ميقات يكون ...,سلبي
4,واو,ايجابي
8,شكرا التوضيح مفيد اكرر الشكر سوداني الابداع وا...,ايجابي
14,سوداني جميل,ايجابي
...,...,...
2483,انا بيع شريحتي دي في زول بشتري,سلبي
2484,اغلي من كده,سلبي
2485,نقلب بس,سلبي
2486,يوم واحد م نشطت سوداني م عارفه بتستغلو بيها كي...,سلبي


In [13]:
def random_split(data, features, output, fraction, seed=0):
    X_train, X_test, y_train, y_test = train_test_split(data[features],
                                                        data[output],
                                                        stratify = data[output],
                                                        random_state=seed,
                                                        train_size=fraction
                                                       )
    train_data = pd.DataFrame(data=X_train, columns=features)
    train_data[output] = y_train
    test_data = pd.DataFrame(data=X_test, columns=features)
    test_data[output] = y_test
    
    return train_data, test_data

In [14]:
train_fraction = .80 # use this to split data into training (80%), and tmp (20%)
val_fraction = .50   # use this to split the tmp data into validation (50%), and 
                     # testing (50%) which means that the validation will be 10% of the original data as well as the

output = 'label' # output label column
features = data.columns.tolist() # the features columns
features.remove(output)
print('output:', output)
print('features:', features)

train_data, tmp = random_split(data, features, output, train_fraction, rand_seed)
val_data, test_data = random_split(tmp, features, output, val_fraction, rand_seed)

print(len(train_data))
print(len(val_data))
print(len(test_data))
print(len(train_data)+len(val_data)+len(test_data))
print(len(data))

output: label
features: ['comment']
1527
191
191
1909
1909


In [17]:
vectorizer = TfidfVectorizer(ngram_range=(1, 2), sublinear_tf=True, max_df=0.5, stop_words=None, use_idf=True)
# vectorizer = CountVectorizer(ngram_range=(1, 2))

train_data_features = vectorizer.fit_transform(train_data['comment'].values.astype('U'))
val_data_features = vectorizer.transform(val_data['comment'].values.astype('U'))
test_data_features = vectorizer.transform(test_data['comment'].values.astype('U'))


In [18]:
train_data_features.shape, val_data_features.shape, test_data_features.shape

((1527, 12363), (191, 12363), (191, 12363))

In [19]:
def train_n_test_classifier(clf, train_features, train_labels, test_features, test_labels):
    clf.fit(train_features, train_labels) # please learn patterns from the data

    print("score on training data:")
    print(clf.score(train_features, train_labels))
    print('_'*100)

    print("score on testing data:")
    
    pred_y = clf.predict(test_features)
    print('accuracy_score: ')
    print(accuracy_score(test_labels, pred_y))
    
    print('f1_score: ')
    print(f1_score(test_labels, pred_y, average='macro'))

In [20]:
logistic_reg = LogisticRegression(random_state=rand_seed)

train_n_test_classifier(logistic_reg, train_data_features, train_data[output],
                        val_data_features, val_data[output])

score on training data:
0.8781925343811395
____________________________________________________________________________________________________
score on testing data:
accuracy_score: 
0.8900523560209425
f1_score: 
0.8604238438250339


In [21]:
mnb = MultinomialNB()

train_n_test_classifier(mnb, train_data_features, train_data[output],
                        val_data_features, val_data[output])

score on training data:
0.9731499672560576
____________________________________________________________________________________________________
score on testing data:
accuracy_score: 
0.900523560209424
f1_score: 
0.8737168110797926


In [22]:
svm = SVC(kernel='linear', probability=True, random_state=rand_seed)

train_n_test_classifier(svm, train_data_features, train_data[output],
                        val_data_features, val_data[output])

score on training data:
0.9973804846103471
____________________________________________________________________________________________________
score on testing data:
accuracy_score: 
0.9214659685863874
f1_score: 
0.904757155679665


In [23]:
rf = RandomForestClassifier(n_estimators=100, random_state=rand_seed)

train_n_test_classifier(rf, train_data_features, train_data[output],
                        val_data_features, val_data[output])

score on training data:
1.0
____________________________________________________________________________________________________
score on testing data:
accuracy_score: 
0.8900523560209425
f1_score: 
0.8718407515097295


In [24]:
mlp = MLPClassifier(hidden_layer_sizes=(20,20,20,20), verbose=True, tol=0.001, random_state=rand_seed)
train_n_test_classifier(mlp, train_data_features, train_data[output],
                        val_data_features, val_data[output])

Iteration 1, loss = 0.72876904
Iteration 2, loss = 0.70050097
Iteration 3, loss = 0.67728836
Iteration 4, loss = 0.65562782
Iteration 5, loss = 0.62985583
Iteration 6, loss = 0.59836234
Iteration 7, loss = 0.56004019
Iteration 8, loss = 0.51118655
Iteration 9, loss = 0.45090170
Iteration 10, loss = 0.38149434
Iteration 11, loss = 0.31377240
Iteration 12, loss = 0.25778517
Iteration 13, loss = 0.21467158
Iteration 14, loss = 0.18140836
Iteration 15, loss = 0.15622048
Iteration 16, loss = 0.13618057
Iteration 17, loss = 0.11924556
Iteration 18, loss = 0.10458006
Iteration 19, loss = 0.09256662
Iteration 20, loss = 0.08235123
Iteration 21, loss = 0.07190770
Iteration 22, loss = 0.06178783
Iteration 23, loss = 0.05109523
Iteration 24, loss = 0.04097320
Iteration 25, loss = 0.03192033
Iteration 26, loss = 0.02409166
Iteration 27, loss = 0.01808993
Iteration 28, loss = 0.01338670
Iteration 29, loss = 0.01003177
Iteration 30, loss = 0.00762824
Iteration 31, loss = 0.00586055
Iteration 32, los

In [25]:
# reading our prepared data
data = pd.read_excel('file/ALL_data.xlsx')
data = data.dropna()
data.head()

Unnamed: 0,comment,label
0,نفسي يوم تكتبو السعر بدون مانسال,سلبي
1,طيب ما تشرحو طريقه الاشتراك في الباقه دي,محايد
2,لو قللتو الرسائل دي واديتونا ليها ميقات يكون ...,سلبي
3,رمز الاشتراك شنو,محايد
4,واو,ايجابي


In [26]:
data.groupby('label').count()

Unnamed: 0_level_0,comment
label,Unnamed: 1_level_1
ايجابي,609
سلبي,1300
محايد,580


In [27]:
positive_data = data[data['label'] == 'ايجابي'].dropna()
negative_data = data[data['label'] == 'سلبي'].dropna()
neutral_data = data[data['label'] == 'محايد'].dropna()
len(positive_data), len(negative_data), len(neutral_data)

(609, 1300, 580)

In [30]:
non_neutral_data = positive_data.append(negative_data).sample(frac=1).reset_index(drop=True)
non_neutral_data['label'] = 'غير محايد'

  non_neutral_data = positive_data.append(negative_data).sample(frac=1).reset_index(drop=True)


In [31]:
neu_data = neutral_data.append(non_neutral_data).dropna().sample(frac=1).reset_index(drop=True)
neu_data

  neu_data = neutral_data.append(non_neutral_data).dropna().sample(frac=1).reset_index(drop=True)


Unnamed: 0,comment,label
0,بس حاول اتطور اكتر لانو العالم متقدم وان شاء ا...,غير محايد
1,موفقين ان شاء الله,غير محايد
2,عاد بااااالغتووو,غير محايد
3,الباقات حقت النت مالها غالية كدا تقول باقات بنزين,غير محايد
4,تسلموو,غير محايد
...,...,...
2484,ياخ النت بتاعكم كعب شديد و يجيب الجلطه,غير محايد
2485,الشبكة زي الزفت الاشتراك زاتو كل مرة بتغيرو ال...,غير محايد
2486,نت اليوم زي الزفت الله ينعلكم,غير محايد
2487,النت كعب خساره قروش بس بلد ما فيها شئ شغال كويس,غير محايد


In [32]:
train_fraction = .80 # use this to split data into training (80%), and tmp (20%)
val_fraction = .50   # use this to split the tmp data into validation (50%), and 
                     # testing (50%) which means that the validation will be 10% of the original data as well as the

# seed = 0  # random state for reproducibility
output = 'label' # output label column
features = neu_data.columns.tolist() # the features columns
features.remove(output)
print('output:', output)
print('features:', features)

neu_train_data, neu_tmp = random_split(neu_data, features, output, train_fraction, rand_seed)
neu_val_data, neu_test_data = random_split(neu_tmp, features, output, val_fraction, rand_seed)

print(len(neu_train_data))
print(len(neu_val_data))
print(len(neu_test_data))
print(len(neu_train_data)+len(neu_val_data)+len(neu_test_data))
print(len(neu_data))

output: label
features: ['comment']
1991
249
249
2489
2489


In [33]:
neu_vectorizer = TfidfVectorizer(ngram_range=(1, 2), sublinear_tf=True, max_df=0.5, stop_words=None, use_idf=True)
# neu_vectorizer = CountVectorizer(ngram_range=(1, 2))
neu_train_data_features = neu_vectorizer.fit_transform(neu_train_data['comment'].values.astype('U'))
neu_val_data_features = neu_vectorizer.transform(neu_val_data['comment'].values.astype('U'))
neu_test_data_features = neu_vectorizer.transform(neu_test_data['comment'].values.astype('U'))


In [34]:
neu_logistic_reg = LogisticRegression(random_state=rand_seed)

train_n_test_classifier(neu_logistic_reg, neu_train_data_features, neu_train_data[output],
                        neu_val_data_features, neu_val_data[output])

score on training data:
0.7995981918633852
____________________________________________________________________________________________________
score on testing data:
accuracy_score: 
0.7911646586345381
f1_score: 
0.5458754208754208


In [35]:
neu_mlp = MLPClassifier(hidden_layer_sizes=(100,100), verbose=True, tol=0.001, random_state=rand_seed)
train_n_test_classifier(neu_mlp, neu_train_data_features, neu_train_data[output],
                        neu_val_data_features, neu_val_data[output])

Iteration 1, loss = 0.76431680
Iteration 2, loss = 0.67594404
Iteration 3, loss = 0.56817987
Iteration 4, loss = 0.46405916
Iteration 5, loss = 0.36479671
Iteration 6, loss = 0.26897376
Iteration 7, loss = 0.17940872
Iteration 8, loss = 0.10812300
Iteration 9, loss = 0.06223096
Iteration 10, loss = 0.03651689
Iteration 11, loss = 0.02382360
Iteration 12, loss = 0.01789159
Iteration 13, loss = 0.01461815
Iteration 14, loss = 0.01282045
Iteration 15, loss = 0.01179202
Iteration 16, loss = 0.01070044
Iteration 17, loss = 0.01059728
Iteration 18, loss = 0.01003944
Iteration 19, loss = 0.00980005
Iteration 20, loss = 0.00909180
Iteration 21, loss = 0.00922396
Iteration 22, loss = 0.00905796
Iteration 23, loss = 0.00872916
Iteration 24, loss = 0.00851724
Iteration 25, loss = 0.00846515
Iteration 26, loss = 0.00821178
Iteration 27, loss = 0.00840891
Training loss did not improve more than tol=0.001000 for 10 consecutive epochs. Stopping.
score on training data:
0.9959819186338523
____________

In [36]:
neu_mnb = MultinomialNB()
train_n_test_classifier(neu_mnb, neu_train_data_features, neu_train_data[output],
                        neu_val_data_features, neu_val_data[output])

score on training data:
0.8538422903063787
____________________________________________________________________________________________________
score on testing data:
accuracy_score: 
0.7991967871485943
f1_score: 
0.5633417508417509


In [37]:
neu_svm = SVC(kernel='linear', probability=True, random_state=rand_seed)

train_n_test_classifier(neu_svm, neu_train_data_features, neu_train_data[output],
                        neu_val_data_features, neu_val_data[output])

score on training data:
0.9834254143646409
____________________________________________________________________________________________________
score on testing data:
accuracy_score: 
0.8192771084337349
f1_score: 
0.6543480272696425


In [73]:
neu_rf = RandomForestClassifier(n_estimators=100, random_state=rand_seed)

train_n_test_classifier(neu_rf, neu_train_data_features, neu_train_data[output],
                        neu_val_data_features, neu_val_data[output])

score on training data:
0.9959819186338523
____________________________________________________________________________________________________
score on testing data:
accuracy_score: 
0.7269076305220884
f1_score: 
0.6267195767195767


In [74]:
pickle.dump(vectorizer, open(f'file/vectorizer.pkl', 'wb'))
pickle.dump(logistic_reg, open(f'file/logistic_reg.pkl', 'wb'))
pickle.dump(mnb, open(f'file/mnb.pkl', 'wb'))
pickle.dump(svm, open(f'file/svm.pkl', 'wb'))
pickle.dump(rf, open(f'file/rf.pkl', 'wb'))
pickle.dump(mlp, open(f'file/mlp.pkl', 'wb'))
pickle.dump(neu_vectorizer, open(f'file/neu_vectorizer.pkl', 'wb'))
pickle.dump(neu_logistic_reg, open(f'file/neu_logistic_reg.pkl', 'wb'))
pickle.dump(neu_mnb, open(f'file/neu_mnb.pkl', 'wb'))
pickle.dump(neu_svm, open(f'file/neu_svm.pkl', 'wb'))
pickle.dump(neu_rf, open(f'file/neu_rf.pkl', 'wb'))
pickle.dump(neu_mlp, open(f'file/neu_mlp.pkl', 'wb'))

In [75]:
vectorizer = pickle.load(open(f'file/vectorizer.pkl', 'rb'))
logistic_reg = pickle.load(open(f'file/logistic_reg.pkl', 'rb'))
mnb = pickle.load(open(f'file/mnb.pkl', 'rb'))
svm = pickle.load(open(f'file/svm.pkl', 'rb'))
rf = pickle.load(open(f'file/rf.pkl', 'rb'))
mlp = pickle.load(open(f'file/mlp.pkl', 'rb'))

neu_vectorizer = pickle.load(open(f'file/neu_vectorizer.pkl', 'rb'))
neu_logistic_reg = pickle.load(open(f'file/neu_logistic_reg.pkl', 'rb'))
neu_mnb = pickle.load(open(f'file/neu_mnb.pkl', 'rb'))
neu_svm = pickle.load(open(f'file/neu_svm.pkl', 'rb'))
neu_rf = pickle.load(open(f'file/neu_rf.pkl', 'rb'))
neu_mlp = pickle.load(open(f'file/neu_mlp.pkl', 'rb'))

In [89]:
def predict_multi_level(X, neu_vectorizer, neu_clf, vectorizer, clf):
       neu_y_pred = neu_clf.predict(neu_vectorizer.transform(X))
    if len(X[neu_y_pred == 'عير محايد']) > 0:
        y_pred = clf.predict(vectorizer.transform(X[neu_y_pred == 'عير محايد'])) # classify non neutral into positive or negative
        neu_y_pred[neu_y_pred == 'عير محايد'] = y_pred
    
    final_y_pred = neu_y_pred
    return final_y_pred


X = test_data.dropna()['comment'].values
y = test_data.dropna()['label'].values
pred_y = predict_multi_level(X, neu_vectorizer, neu_mlp, vectorizer, mnb)

IndentationError: unindent does not match any outer indentation level (<tokenize>, line 3)

In [83]:
neu_data = neutral_data.append(non_neutral_data).dropna().sample(frac=1).reset_index(drop=True)
neu_data

  neu_data = neutral_data.append(non_neutral_data).dropna().sample(frac=1).reset_index(drop=True)


Unnamed: 0,comment,label
0,انشاء الله يكون بالجد اصلي ماعندي شريحه سوداني,محايد
1,مقاطعه بس,غير محايد
2,الحمدلله اي حد شقال علي كيفو,غير محايد
3,بالتوفيق,غير محايد
4,بدايه الالفينات,محايد
...,...,...
2484,موفقين,غير محايد
2485,فشله,غير محايد
2486,ربنا يوفقكم يارب,غير محايد
2487,سوداني لكل سوداني,غير محايد


In [90]:
print('accuracy_score: ')
print(accuracy_score(y, pred_y))

print('f1_score: ')
print(f1_score(y, pred_y, average='macro'))

accuracy_score: 


NameError: name 'pred_y' is not defined