# Import required modules

In [6]:
import pandas as pd
import nltk
import re
from qalsadi import lemmatizer,analex
import time
print("Modules Imported !! ")



Modules Imported !! 



# Data Preparing (Load-Clean)

In [2]:
data=pd.read_excel('data.xlsx') # data loading 


In [3]:
data.head() # show dataframe

Unnamed: 0,sentiment,txt
0,1.0,انشاء الله هنعمل حاجه
1,0.0,اقسم باللله ان العرب اكثر الشعوب تخلفاا
2,0.0,﻿هات ناس تفهم .. و المثل بحكي اسأل مجرب و لا ت...
3,0.0,صرماتي براس اهلك
4,0.0,حرام السخرية من الناس


In [4]:
data['sentiment'].value_counts() #count the values of sentiment 

1.0    10098
0.0    10023
Name: sentiment, dtype: int64

In [5]:
data = data.dropna() # drop and remove nan (null) value 


In [6]:
# method to remove emoji's

def remove_emoji(text):
    non_arabic_char = re.compile('[^\s\\u0600-\u06FF]')
    text_with_no_spaces = re.sub(non_arabic_char, "", text)
    text_with_single_spaces = " ".join(re.split("\s+", text_with_no_spaces))
    
    return text_with_single_spaces

In [7]:
data.txt[2] #show data before emoji's removal

'\ufeffهات ناس تفهم .. و المثل بحكي اسأل مجرب و لا تسأل خبير'

In [8]:
data["txt"]=data["txt"].map(remove_emoji) #map each row with remove_emoji's function

In [9]:
data.txt[2] # show data sample after apply remove_emoji's

'هات ناس تفهم و المثل بحكي اسأل مجرب و لا تسأل خبير'

In [10]:
data.head()

Unnamed: 0,sentiment,txt
0,1.0,انشاء الله هنعمل حاجه
1,0.0,اقسم باللله ان العرب اكثر الشعوب تخلفاا
2,0.0,هات ناس تفهم و المثل بحكي اسأل مجرب و لا تسأل ...
3,0.0,صرماتي براس اهلك
4,0.0,حرام السخرية من الناس


In [11]:
data.dropna(axis=1, how='all')


Unnamed: 0,sentiment,txt
0,1.0,انشاء الله هنعمل حاجه
1,0.0,اقسم باللله ان العرب اكثر الشعوب تخلفاا
2,0.0,هات ناس تفهم و المثل بحكي اسأل مجرب و لا تسأل ...
3,0.0,صرماتي براس اهلك
4,0.0,حرام السخرية من الناس
...,...,...
20196,0.0,المثل يقول ان أكرمت اللئيم تمردا وهذا بالضبط ...
20197,1.0,إلي سهران ريتويت بنسولف عالخاص
20198,0.0,لا تهتم بشخص زياااده ،، تراك بزمن يسمون المهتم...
20199,0.0,مكى عامل ايه انا دورت عليك كتير امبارح واتصلت...


# Data Preprocessing (Tokenize - Stop word remove - stemming or lemmatize) 

# Tokenizing data


In [12]:
def tokenize_text(inp):
    return nltk.tokenize.wordpunct_tokenize(inp)

In [13]:

        
data.txt = data.txt.apply(lambda sentence: nltk.tokenize.wordpunct_tokenize(sentence))


In [14]:
data.head()

Unnamed: 0,sentiment,txt
0,1.0,"[انشاء, الله, هنعمل, حاجه]"
1,0.0,"[اقسم, باللله, ان, العرب, اكثر, الشعوب, تخلفاا]"
2,0.0,"[هات, ناس, تفهم, و, المثل, بحكي, اسأل, مجرب, و..."
3,0.0,"[صرماتي, براس, اهلك]"
4,0.0,"[حرام, السخرية, من, الناس]"


# Stop word removal 

In [15]:
def stopword_removal(inp):
    arb_stopwords = set(nltk.corpus.stopwords.words("arabic"))
    for i in inp:
        if i in arb_stopwords:
            inp.remove(i)

    return inp

In [16]:
data.txt=data["txt"].map(stopword_removal)

In [17]:
data.head()

Unnamed: 0,sentiment,txt
0,1.0,"[انشاء, الله, هنعمل, حاجه]"
1,0.0,"[اقسم, باللله, ان, العرب, اكثر, الشعوب, تخلفاا]"
2,0.0,"[هات, ناس, تفهم, المثل, بحكي, اسأل, مجرب, لا, ..."
3,0.0,"[صرماتي, براس, اهلك]"
4,0.0,"[حرام, السخرية, الناس]"


# Stemming data

In [18]:
stemmer=nltk.ISRIStemmer()

In [19]:

def stem(text):
    out=[]
    for i in text:
        out.append(stemmer.stem(i))
            
    return out

In [20]:
start_time=time.time()
#data.txt.map(stem)

data.txt=data.txt.map(stem)
print("-------- ",(time.time() - start_time),' Secounds --------')

--------  3.910546064376831  Secounds --------


In [21]:
#data.head()

# Lemmatizing data

Lemmatizing Take more time than stemming

In [22]:
lemmatizer = lemmatizer.Lemmatizer()

In [23]:

def lemmatize(text):
    out=[]
    for i in text:
        out.append(lemmatizer.lemmatize(i))

    return out

In [24]:
start_time=time.time()

#data.txt.map(lemmatize).head
#data.txt=data.txt.map(lemmatize)

print("-------- ",(time.time() - start_time),' Secounds --------')

--------  0.0001652240753173828  Secounds --------


In [25]:
data.head()

Unnamed: 0,sentiment,txt
0,1.0,"[شاء, الل, هنعمل, حجه]"
1,0.0,"[قسم, لله, ان, عرب, كثر, شعب, خلف]"
2,0.0,"[هات, ناس, فهم, مثل, بحك, سأل, جرب, لا, سأل, خبر]"
3,0.0,"[صرم, برس, اهل]"
4,0.0,"[حرم, سخر, ناس]"


# Words Joining

In [26]:
def join_text(txt):
    
    return " ".join(txt)

In [27]:
data.txt=data.txt.map(join_text)

In [28]:
data.head()

Unnamed: 0,sentiment,txt
0,1.0,شاء الل هنعمل حجه
1,0.0,قسم لله ان عرب كثر شعب خلف
2,0.0,هات ناس فهم مثل بحك سأل جرب لا سأل خبر
3,0.0,صرم برس اهل
4,0.0,حرم سخر ناس


In [29]:
# convert class labels to  Bad and  Good values

def decoder(arr):
    out=list()
    binary_list=list(arr)
    for item in binary_list:
        if item == 0:
            out.append('bad')
        else:
            out.append('good')
    return out
    
    

# Feature Extraction & Model Training

In [30]:
import time
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression, SGDClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import SVC
from sklearn.metrics import classification_report, accuracy_score, confusion_matrix,f1_score
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer

print("Modules Imported")

Modules Imported


## Feature Extraction

In [31]:
# Feature extaction using Counter

bag_of_words_vectorizer=CountVectorizer(binary=False) 
bag_of_words_count = bag_of_words_vectorizer.fit_transform(data["txt"])

In [32]:
x_train_count, x_test_count, y_train_count, y_test_count = train_test_split(bag_of_words_count, data['sentiment'], random_state=42, test_size=0.25)

In [33]:
#Feature extraction using binary victor

bag_of_words_vectorizer_binary=CountVectorizer(binary=True) 
bag_of_words_binary = bag_of_words_vectorizer_binary.fit_transform(data["txt"])

In [34]:
x_train_bin, x_test_bin, y_train_bin, y_test_bin = train_test_split(bag_of_words_binary, data['sentiment'], random_state=42, test_size=0.25)

In [35]:
#Feature extraction using TF-IDF

vectorizer = TfidfVectorizer()
bag_of_words_tfidf=vectorizer.fit_transform(data["txt"])


In [36]:
x_train_tfidf, x_test_tfidf, y_train_tfidf, y_test_tfidf = train_test_split(bag_of_words_tfidf, data['sentiment'], random_state=42, test_size=0.25)


## Model Training

In [37]:
def train_models(X_train,Y_train,X_test,Y_test):
    
    print('---------------------Start Training-------------------------------')
    
    start_time = time.time()
    
    # Define models to train
    
    names = ["K Nearest Neighbors", "Decision Tree", "Random Forest", "Logistic Regression", "SGD Classifier",
             "Naive Bayes", "SVM Linear"]

    classifiers = [
        KNeighborsClassifier(),
        DecisionTreeClassifier(),
        RandomForestClassifier(),
        LogisticRegression(solver='lbfgs', max_iter=1000),
        SGDClassifier(max_iter = 100),
        MultinomialNB(),
        SVC(kernel = 'linear')
    ]

    models = zip(names, classifiers)
    
    scored_models=dict()

    for name, model in models:
        nltk_model = model
        nltk_model.fit(X_train,Y_train)
        pred = model.predict(X_test)
        scored_models[name]=[model,pred]
        score=f1_score(Y_test, pred)
        accuracy = accuracy_score(Y_test,pred) 
        print(name,"had Trained and it's Accuracy: ", accuracy," and it's Score: ",score )

    print('---------------------End of Training-------------------------------')
    
    print("-------- ",(time.time() - start_time),' Secounds --------')
    
    return scored_models

## Model Testing

In [38]:
def test_model(model,X_test):
    #test a specific model
    test=model[0].predict(X_test[1])
    print("binary values :",test[:10])
    print('''
    -------------------------------------
    ''')
    result=decoder(test)
    print('type of test :',type(X_test))
    print('''
    -------------------------------------
    ''')
    print('matrix : ',X_test[1])
    print('''
    -------------------------------------
    ''')
    print('test matrix shape :',X_test.shape)
    print('''
    -------------------------------------
    ''')
    print(" actual labels :",result[:10])
    print('''
    -------------------------------------
    ''')
    print('bad Tweets = ',result.count('bad'),'good tweets = ', result.count('good'))
    print('''
    -------------------------------------
    ''')

## Confusion Matrix

In [39]:
def con_mat(model,Y_test):
    # print a confusion matrix
    return pd.DataFrame(
        confusion_matrix(Y_test, model[1]),
        index = [['actual', 'actual'], ['bad', 'good']],
        columns = [['predicted', 'predicted'], ['bad', 'good']])

### Counting

In [40]:
counting_models=train_models(X_train=x_train_count,Y_train=y_train_count,X_test=x_test_count,Y_test=y_test_count)

---------------------Start Training-------------------------------
K Nearest Neighbors had Trained and it's Accuracy:  0.6344663088849135  and it's Score:  0.681668686169292
Decision Tree had Trained and it's Accuracy:  0.6779964221824687  and it's Score:  0.677547770700637
Random Forest had Trained and it's Accuracy:  0.7849334128403895  and it's Score:  0.7848906560636184
Logistic Regression had Trained and it's Accuracy:  0.7990459153249851  and it's Score:  0.8003160181710448
SGD Classifier had Trained and it's Accuracy:  0.793082886106142  and it's Score:  0.7964013299432818
Naive Bayes had Trained and it's Accuracy:  0.8075929238719937  and it's Score:  0.8048387096774194
SVM Linear had Trained and it's Accuracy:  0.7871198568872988  and it's Score:  0.7887990534411358
---------------------End of Training-------------------------------
--------  143.9493658542633  Secounds --------


In [41]:
test_model(counting_models['Naive Bayes'],x_test_count)

binary values : [1.]

    -------------------------------------
    
type of test : <class 'scipy.sparse.csr.csr_matrix'>

    -------------------------------------
    
matrix :    (0, 1722)	1
  (0, 2837)	1
  (0, 2982)	1
  (0, 3267)	1
  (0, 3923)	1
  (0, 4683)	1
  (0, 7348)	1
  (0, 8049)	1
  (0, 9234)	1
  (0, 10108)	2
  (0, 10979)	1
  (0, 11452)	1
  (0, 11645)	1
  (0, 13787)	1
  (0, 16400)	1
  (0, 16706)	1
  (0, 19032)	1
  (0, 21864)	1
  (0, 22978)	1
  (0, 24510)	1
  (0, 25743)	1

    -------------------------------------
    
test matrix shape : (5031, 26364)

    -------------------------------------
    
 actual labels : ['good']

    -------------------------------------
    
bad Tweets =  0 good tweets =  1

    -------------------------------------
    


In [42]:
con_mat(counting_models['Naive Bayes'],y_test_count)

Unnamed: 0_level_0,Unnamed: 1_level_0,predicted,predicted
Unnamed: 0_level_1,Unnamed: 1_level_1,bad,good
actual,bad,2067,454
actual,good,514,1996


### Binary

In [43]:
binary_models=train_models(X_train=x_train_bin,Y_train=y_train_bin,X_test=x_test_bin,Y_test=y_test_bin)

---------------------Start Training-------------------------------
K Nearest Neighbors had Trained and it's Accuracy:  0.6535480023852117  and it's Score:  0.7014899811611577
Decision Tree had Trained and it's Accuracy:  0.6853508248857086  and it's Score:  0.6839688560590936
Random Forest had Trained and it's Accuracy:  0.7821506658715961  and it's Score:  0.7828843106180665
Logistic Regression had Trained and it's Accuracy:  0.7962631683561916  and it's Score:  0.7969090548840895
SGD Classifier had Trained and it's Accuracy:  0.7871198568872988  and it's Score:  0.7865258122383895
Naive Bayes had Trained and it's Accuracy:  0.808189226793878  and it's Score:  0.8068068068068067
SVM Linear had Trained and it's Accuracy:  0.7827469687934804  and it's Score:  0.7865651239992189
---------------------End of Training-------------------------------
--------  124.3071813583374  Secounds --------


In [44]:
test_model(binary_models['Naive Bayes'],x_test_bin)

binary values : [1.]

    -------------------------------------
    
type of test : <class 'scipy.sparse.csr.csr_matrix'>

    -------------------------------------
    
matrix :    (0, 1722)	1
  (0, 2837)	1
  (0, 2982)	1
  (0, 3267)	1
  (0, 3923)	1
  (0, 4683)	1
  (0, 7348)	1
  (0, 8049)	1
  (0, 9234)	1
  (0, 10108)	1
  (0, 10979)	1
  (0, 11452)	1
  (0, 11645)	1
  (0, 13787)	1
  (0, 16400)	1
  (0, 16706)	1
  (0, 19032)	1
  (0, 21864)	1
  (0, 22978)	1
  (0, 24510)	1
  (0, 25743)	1

    -------------------------------------
    
test matrix shape : (5031, 26364)

    -------------------------------------
    
 actual labels : ['good']

    -------------------------------------
    
bad Tweets =  0 good tweets =  1

    -------------------------------------
    


In [45]:
con_mat(binary_models['Naive Bayes'],y_test_bin)

Unnamed: 0_level_0,Unnamed: 1_level_0,predicted,predicted
Unnamed: 0_level_1,Unnamed: 1_level_1,bad,good
actual,bad,2051,470
actual,good,495,2015


### TF-IDF

In [46]:
tfidf_models=train_models(X_train=x_train_tfidf,Y_train=y_train_tfidf,X_test=x_test_tfidf,Y_test=y_test_tfidf)

---------------------Start Training-------------------------------
K Nearest Neighbors had Trained and it's Accuracy:  0.516994633273703  and it's Score:  0.5246478873239437
Decision Tree had Trained and it's Accuracy:  0.6795865633074936  and it's Score:  0.6761751707513057
Random Forest had Trained and it's Accuracy:  0.7817531305903399  and it's Score:  0.7779126213592233
Logistic Regression had Trained and it's Accuracy:  0.8073941562313656  and it's Score:  0.8047551883941164
SGD Classifier had Trained and it's Accuracy:  0.8109719737626714  and it's Score:  0.8062741902627826
Naive Bayes had Trained and it's Accuracy:  0.8207115881534486  and it's Score:  0.8218799368088467
SVM Linear had Trained and it's Accuracy:  0.804611409262572  and it's Score:  0.8027292795504716
---------------------End of Training-------------------------------
--------  76.32120943069458  Secounds --------


In [47]:
test_model(tfidf_models['Naive Bayes'],x_test_tfidf)

binary values : [1.]

    -------------------------------------
    
type of test : <class 'scipy.sparse.csr.csr_matrix'>

    -------------------------------------
    
matrix :    (0, 1722)	0.20717938458819832
  (0, 2837)	0.29545162715649076
  (0, 2982)	0.1995371340950331
  (0, 3267)	0.1714275913345165
  (0, 3923)	0.16563322745814968
  (0, 4683)	0.22921770841588138
  (0, 7348)	0.1435546184138506
  (0, 8049)	0.1817641666610479
  (0, 9234)	0.20654670334082717
  (0, 10108)	0.20227709592114218
  (0, 10979)	0.12689189405816081
  (0, 11452)	0.1294064019785209
  (0, 11645)	0.13294388185263173
  (0, 13787)	0.13052605908760084
  (0, 16400)	0.11623471159652991
  (0, 16706)	0.31597375486081364
  (0, 19032)	0.3244629770211133
  (0, 21864)	0.28112241497938967
  (0, 22978)	0.25208811485484334
  (0, 24510)	0.26076130724574786
  (0, 25743)	0.2943198782626292

    -------------------------------------
    
test matrix shape : (5031, 26364)

    -------------------------------------
    
 actual label

In [48]:
con_mat(tfidf_models['Naive Bayes'],y_test_tfidf)

Unnamed: 0_level_0,Unnamed: 1_level_0,predicted,predicted
Unnamed: 0_level_1,Unnamed: 1_level_1,bad,good
actual,bad,2048,473
actual,good,429,2081


# ____________________________________________________________

In [49]:
# save the model
#import pickle

#filename = 'Naive Bayes model.sav'
#pickle.dump(scored_models_tfidf['Naive Bayes'][0], open(filename, 'wb'))
#pickle.dump(vectorizer, open("vectorizer.pickle", "wb")) 
#load model from disk 
#loaded_model = pickle.load(open(filename, 'rb'))
#result = loaded_model.score(x_test_tfidf, y_test_tfidf)
#print(result)

# Deploy Model Locally

In [53]:
import pickle

def predict_txt(inp):
    victorizer=pickle.load(open("models/vectorizer.pickle", 'rb'))
    file_name = open('models/Naive_Bayes_model.sav', 'rb')
    loaded_model = pickle.load(file_name)
    result = loaded_model.predict(victorizer.transform([inp]))
    print(victorizer.transform([inp]))
    print('---------------------------------')
    print(decoder(result))




In [54]:
predict_txt('فتي احمق')

  (0, 15539)	1.0
---------------------------------
['bad']


In [55]:
predict_txt('فتى طيب')

  (0, 15538)	0.8039580431345329
  (0, 13168)	0.594686022098462
---------------------------------
['good']


In [None]:
#  Conclusion

In [57]:
print("----------------------------------------- Counting Victorizer Report -----------------------------")

print(classification_report(y_test_count, counting_models['Naive Bayes'][1]))


print("----------------------------------------- Binary Victorizer Report -----------------------------")

print(classification_report(y_test_bin, binary_models['Naive Bayes'][1]))


print("----------------------------------------- TF-IDF Victorizer Report -----------------------------")

print(classification_report(y_test_tfidf, tfidf_models['Naive Bayes'][1]))



----------------------------------------- Counting Victorizer Report -----------------------------
              precision    recall  f1-score   support

         0.0       0.80      0.82      0.81      2521
         1.0       0.81      0.80      0.80      2510

    accuracy                           0.81      5031
   macro avg       0.81      0.81      0.81      5031
weighted avg       0.81      0.81      0.81      5031

----------------------------------------- Binary Victorizer Report -----------------------------
              precision    recall  f1-score   support

         0.0       0.81      0.81      0.81      2521
         1.0       0.81      0.80      0.81      2510

    accuracy                           0.81      5031
   macro avg       0.81      0.81      0.81      5031
weighted avg       0.81      0.81      0.81      5031

----------------------------------------- TF-IDF Victorizer Report -----------------------------
              precision    recall  f1-score   support

### As we can see in the previous cell execution output 

- the best Victorizer for our dataset is TF-IDF 
- the best model performance for our classification task was Naive Bayes


# Morphological Analysis

In [None]:
def morph_analysis_txt( inp):
    
    """
    This Function is used to morph the text
    :param inp: the text that will be morph (string)
    :used by: main.py
    :uses: tokenize_text()
    :return: the morph text (list)
    """
    text = inp
    print(text)

    analyzer = analex.Analex()
    analyzer.set_debug(False)
    result = analyzer.check_text(text)
    word_list=[]
    for word in result:
       for description in word:
           desc_list= [description.word, description.vocalized, description.type, description.affix, description.root,
                       description.tags, description.stem, description.affix_key, description.type,
                       description.tag_original_number, description.tag_original_gender, description.word]
           word_list.append(desc_list)

    return pd.DataFrame(word_list)

In [107]:
def morph_analysis_txt( inp):
    
    """
    This Function is used to morph the text
    :param inp: the text that will be morph (string)
    :used by: main.py
    :uses: tokenize_text()
    :return: the morph text (list)
    """
    text = inp
    print(text)

    analyzer = analex.Analex()
    analyzer.set_debug(False)
    result = analyzer.check_text(text)
    word_list=[]
    for word in result:
       for description in word:
           desc_list= [description.word, description.vocalized, description.type, description.affix, description.root,
                       description.tags, description.stem, description.affix_key, description.type,
                       description.tag_original_number, description.tag_original_gender, description.word]
           word_list.append(desc_list)

    return pd.DataFrame(word_list)

In [108]:
morph_analysis=morph_analysis_txt("السلام عليكم و رحمة الله و بركاته")

السلام عليكم و رحمة الله و بركاته


In [109]:
morph_analysis.shape

(100, 12)

In [110]:
morph_analysis.head(100)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11
0,السلام,السُّلَّامَ,Noun:اسم فاعل,ال--َ-,سلم,تعريف:منصوب:متحرك:ينون::,سلام,ال--َ-|السلام,Noun:اسم فاعل,جمع تكسير,,السلام
1,السلام,السُّلَّامُ,Noun:اسم فاعل,ال--ُ-,سلم,تعريف:مرفوع:متحرك:ينون::,سلام,ال--ُ-|السلام,Noun:اسم فاعل,جمع تكسير,,السلام
2,السلام,السُّلَّامِ,Noun:اسم فاعل,ال--ِ-,سلم,تعريف:مجرور:متحرك:ينون::,سلام,ال--ِ-|السلام,Noun:اسم فاعل,جمع تكسير,,السلام
3,السلام,السِّلَامَ,Noun:جامد,ال--َ-,سلم,تعريف:منصوب:متحرك:ينون::,سلام,ال--َ-|السلام,Noun:جامد,جمع تكسير,مؤنث,السلام
4,السلام,السِّلَامُ,Noun:جامد,ال--ُ-,سلم,تعريف:مرفوع:متحرك:ينون::,سلام,ال--ُ-|السلام,Noun:جامد,جمع تكسير,مؤنث,السلام
...,...,...,...,...,...,...,...,...,...,...,...,...
95,بركاته,بِرْكَاتِهِ,Noun:جامد,--َاتِ-هُ,بركة,مضاف:جمع مؤنث سالم:متحرك:ينون:جمع:مؤنث:مجرور::,بركة,--َاتِ-هُ|بركاته,Noun:جامد,مفرد,مؤنث,بركاته
96,بركاته,بِرْكَاتِهِ,Noun:جامد,--َاتِ-هُ,بركة,مضاف:جمع مؤنث سالم:متحرك:ينون:جمع:مؤنث:منصوب::,بركة,--َاتِ-هُ|بركاته,Noun:جامد,مفرد,مؤنث,بركاته
97,بركاته,بِرْكَاتُهِ,Noun:جامد,--َاتُ-هِ,بركة,مضاف:جمع مؤنث سالم:مرفوع:متحرك:ينون:جمع:مؤنث::,بركة,--َاتُ-هِ|بركاته,Noun:جامد,مفرد,مؤنث,بركاته
98,بركاته,بِرْكَاتِهِ,Noun:جامد,--َاتِ-هِ,بركة,مضاف:جمع مؤنث سالم:متحرك:ينون:جمع:مؤنث:مجرور::,بركة,--َاتِ-هِ|بركاته,Noun:جامد,مفرد,مؤنث,بركاته
