# Import required modules

In [None]:
import pandas as pd
import nltk
import numpy as np
import re
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
import qalsadi.lemmatizer
import time


# Confirmation message, reached only if every import above succeeded.
print("Modules Imported !! ")




# Data Preparation (Load - Clean)

In [None]:
# Read the spreadsheet 'data.xlsx' (path relative to the working directory)
# into a DataFrame.
data=pd.read_excel('data.xlsx')


In [None]:
data.head()  # preview the first rows of the freshly loaded DataFrame

In [None]:
data['sentiment'].value_counts()  # number of rows per distinct 'sentiment' value

In [None]:
# Drop every row that has a missing value in any column. The index is not
# reset, so the surviving rows keep their original labels (gaps are possible).
data = data.dropna()


In [None]:
# Helper that strips emoji (and every other non-Arabic character) from a text.

# Raw strings keep ``\s`` from being parsed as an invalid Python string escape;
# the ``\uXXXX`` escapes are resolved by the ``re`` module itself.
_NON_ARABIC_CHAR_RE = re.compile(r'[^\s\u0600-\u06FF]')
_WHITESPACE_RUN_RE = re.compile(r'\s+')


def remove_emoji(text):
    """Keep only whitespace and characters in U+0600-U+06FF, then squeeze spaces.

    Despite the name, this removes more than emoji: any character that is
    neither whitespace nor inside the U+0600-U+06FF range (Latin letters,
    ASCII digits, ASCII punctuation, emoji, ...) is deleted.

    Args:
        text: The string to clean.

    Returns:
        The cleaned string, with every run of whitespace replaced by a single
        space. A leading or trailing run is reduced to one space, not
        stripped.
    """
    arabic_only = _NON_ARABIC_CHAR_RE.sub("", text)
    return _WHITESPACE_RUN_RE.sub(" ", arabic_only)

In [None]:
# Show one sample text before cleaning.
# NOTE(review): presumably the index is the default integer index, in which
# case this is a label lookup and raises KeyError if the row labelled 2 was
# removed by an earlier dropna() — confirm.
data.txt[2]

In [None]:
# Clean every text in the 'txt' column with remove_emoji and store the result
# back in the same column.
data["txt"]=data["txt"].map(remove_emoji)

In [None]:
data.txt[2]  # the same sample text, shown again after remove_emoji was applied

In [None]:
data.head()  # preview the first rows after cleaning the 'txt' column

In [None]:
# Display a copy of the frame without the columns that are entirely NaN.
# NOTE(review): the result is not assigned, so `data` itself is unchanged —
# confirm whether `data = data.dropna(axis=1, how='all')` was intended.
data.dropna(axis=1, how='all')


# Data Preprocessing (Tokenization - Stop-word removal - Stemming or lemmatization)

# Tokenizing data


In [None]:
def tokenize_text(inp):
    """Tokenize *inp* with NLTK's ``wordpunct_tokenize``.

    Args:
        inp: The text to split.

    Returns:
        Whatever ``nltk.tokenize.wordpunct_tokenize`` returns for *inp*.
    """
    tokens = nltk.tokenize.wordpunct_tokenize(inp)
    return tokens

In [None]:

        
# Replace each text in the 'txt' column by its token list; the tokenizer is
# passed directly instead of being wrapped in a lambda.
data["txt"] = data["txt"].apply(nltk.tokenize.wordpunct_tokenize)


In [None]:
data.head()  # preview the first rows after tokenizing the 'txt' column

# Stop word removal 

In [None]:
from functools import lru_cache


@lru_cache(maxsize=1)
def _arabic_stopwords():
    """Load NLTK's Arabic stop-word list once and reuse it for every call."""
    return frozenset(nltk.corpus.stopwords.words("arabic"))


def stopword_removal(inp, stopwords=None):
    """Return the tokens of *inp* that are not stop words.

    A new list is built instead of removing items from *inp* while iterating
    over it: in-place removal skips the element that follows each removed
    one, so consecutive stop words were left behind, and it also modified the
    caller's list.

    Args:
        inp: Iterable of tokens.
        stopwords: Optional container of words to drop. When omitted, NLTK's
            Arabic stop-word list is used (loaded once and cached).

    Returns:
        A new list with the remaining tokens in their original order.
    """
    if stopwords is None:
        stopwords = _arabic_stopwords()
    return [token for token in inp if token not in stopwords]

In [None]:
# Run stopword_removal on every token list in the 'txt' column and store the
# result back in the same column.
data.txt=data["txt"].map(stopword_removal)

In [None]:
data.head()  # preview the first rows after stop-word removal

# Stemming data

In [None]:
# Single shared ISRI stemmer instance; the stem() helper calls its .stem method.
stemmer=nltk.ISRIStemmer()

In [None]:

def stem(text):
    """Stem every token in *text* with the module-level ``stemmer``.

    Args:
        text: Iterable of tokens.

    Returns:
        A new list holding ``stemmer.stem(token)`` for each token, in order.
    """
    return [stemmer.stem(token) for token in text]

In [None]:
# Stem every token list in the 'txt' column, store the result back in that
# column, and print the elapsed wall-clock time in seconds.
start_time=time.time()

data.txt=data.txt.map(stem)
print("-------- ",(time.time() - start_time),' Secounds --------')

In [None]:
#data.head()

# Lemmatizing data

Lemmatizing takes more time than stemming.

In [None]:
# Single shared qalsadi lemmatizer instance; the lemmatize() helper calls its
# .lemmatize method.
lemmatizer = qalsadi.lemmatizer.Lemmatizer()

In [None]:

def lemmatize(text):
    """Lemmatize every token in *text* with the module-level ``lemmatizer``.

    Args:
        text: Iterable of tokens.

    Returns:
        A new list holding ``lemmatizer.lemmatize(token)`` for each token,
        in order.
    """
    return [lemmatizer.lemmatize(token) for token in text]

In [None]:
# Lemmatization is currently disabled: both calls below are commented out, so
# the 'txt' column is left untouched and the timer measures an empty interval.
# Uncomment the assignment to lemmatize the column.
start_time=time.time()

#data.txt.map(lemmatize).head
#data.txt=data.txt.map(lemmatize)

print("-------- ",(time.time() - start_time),' Secounds --------')

In [None]:
data.head()  # preview the first rows after the stemming / lemmatizing steps

# Words Joining

In [None]:
def join_text(txt):
    """Concatenate the strings in *txt*, separated by single spaces.

    Args:
        txt: Iterable of strings.

    Returns:
        One string with the items of *txt* joined by ``" "``.
    """
    separator = " "
    return separator.join(txt)

In [None]:
# Turn each token list in the 'txt' column back into one space-separated
# string; the vectorizers below are fitted on this column.
data.txt=data.txt.map(join_text)

In [None]:
data.head()  # preview the first rows after re-joining the tokens

In [None]:
# Translate numeric class labels into the strings 'bad' and 'good'.

def decoder(arr):
    """Map each label in *arr* to ``'bad'`` or ``'good'``.

    Args:
        arr: Iterable of labels.

    Returns:
        A list with ``'bad'`` for every label equal to 0 and ``'good'`` for
        every other label, in the original order.
    """
    return ['bad' if label == 0 else 'good' for label in list(arr)]
    
    

# Feature Extraction & Model Training

In [None]:
# Feature extraction using raw term counts (CountVectorizer).
# NOTE(review): the vectorizer is fitted on the whole 'txt' column, i.e. before
# the train/test split, so its vocabulary also covers the test rows — confirm
# this is acceptable for the evaluation.

bag_of_words_vectorizer=CountVectorizer() 
bag_of_words_count = bag_of_words_vectorizer.fit_transform(data["txt"])

In [None]:
from sklearn.model_selection import train_test_split
# Hold out 25% of the rows for testing; random_state=42 makes the split
# repeatable. The target is the 'sentiment' column.
x_train_count, x_test_count, y_train_count, y_test_count = train_test_split(bag_of_words_count, data['sentiment'], random_state=42, test_size=0.25)

In [None]:
# Train every candidate classifier on the raw-count features and report its
# accuracy and F1 score on the held-out split.
import time

from nltk.classify.scikitlearn import SklearnClassifier 
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression, SGDClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import SVC
from sklearn.metrics import classification_report, accuracy_score, confusion_matrix,f1_score
from sklearn.metrics import PrecisionRecallDisplay
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler

# Start the clock after the imports so the reported time covers model
# construction, training and scoring only.
start_time = time.time()

# Seed the estimators that draw random numbers so re-running the cell
# reproduces the same metrics; 42 matches the seed used for the split.
RANDOM_STATE = 42

# Define models to train
names = ["K Nearest Neighbors", "Decision Tree", "Random Forest", "Logistic Regression", "SGD Classifier",
         "Naive Bayes", "SVM Linear"]

classifiers = [
    KNeighborsClassifier(),
    DecisionTreeClassifier(random_state=RANDOM_STATE),
    RandomForestClassifier(random_state=RANDOM_STATE),
    LogisticRegression(solver='lbfgs', max_iter=100),
    SGDClassifier(max_iter=100, random_state=RANDOM_STATE),
    MultinomialNB(),
    SVC(kernel='linear'),
]

# Materialise the pairs so `models` can still be iterated after the loop.
models = list(zip(names, classifiers))
# model name -> [fitted estimator, predictions on the test split]
scored_models_count = {}

for name, model in models:
    model.fit(x_train_count, y_train_count)
    pred = model.predict(x_test_count)
    scored_models_count[name] = [model, pred]
    # f1_score is called with its defaults — assumes 'sentiment' is a binary
    # 0/1 label; TODO confirm against the dataset.
    score = f1_score(y_test_count, pred)
    accuracy = accuracy_score(y_test_count, pred)
    print(name," Accuracy: ", accuracy," Score: ",score )

print("-------- ",(time.time() - start_time),' Secounds --------')

In [None]:
# Sanity-check one fitted model (Random Forest, count features) on a single
# held-out sample: the row at index 1 of the test matrix.
separator = '''
-------------------------------------
'''

test = scored_models_count['Random Forest'][0].predict(x_test_count[1])
print("binary values :",test[:10])
print(separator)
result = decoder(test)
print('type of test :',type(x_test_count))
print(separator)
print('matrix : ',x_test_count[1])
print(separator)
print('test matrix shape :',x_test_count.shape)
print(separator)
# `result` holds the model's decoded predictions, not ground-truth labels, so
# the caption says "predicted" rather than "actual".
print(" predicted labels :",result[:10])
print(separator)
print('bad Tweets = ',result.count('bad'),'good tweets = ', result.count('good'))
print(separator)

In [None]:
# Print the classification report for the Naive Bayes predictions on the
# count features, then display the confusion matrix as a labelled table.
print(classification_report(y_test_count, scored_models_count['Naive Bayes'][1]))

# The two-level row/column labels assume exactly two classes, ordered 'bad'
# then 'good' — TODO confirm against the label encoding.
pd.DataFrame(
    confusion_matrix(y_test_count, scored_models_count['Naive Bayes'][1]),
    index = [['actual', 'actual'], ['bad', 'good']],
    columns = [['predicted', 'predicted'], ['bad', 'good']])

# ____________________________________________________________

In [None]:
# Feature extraction using a binary vector: binary=True is passed to
# CountVectorizer instead of using its default counting behaviour.
# NOTE(review): the vectorizer is fitted on the whole 'txt' column, i.e. before
# the train/test split, so its vocabulary also covers the test rows — confirm
# this is acceptable for the evaluation.

bag_of_words_vectorizer_binary=CountVectorizer(binary=True) 
bag_of_words_binary = bag_of_words_vectorizer_binary.fit_transform(data["txt"])

In [None]:
# Hold out 25% of the rows for testing; random_state=42 makes the split
# repeatable. The target is the 'sentiment' column.
x_train_bin, x_test_bin, y_train_bin, y_test_bin = train_test_split(bag_of_words_binary, data['sentiment'], random_state=42, test_size=0.25)

In [None]:
# Train every candidate classifier on the binary bag-of-words features and
# report its accuracy and F1 score on the held-out split.
import time

from nltk.classify.scikitlearn import SklearnClassifier 
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression, SGDClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import SVC
from sklearn.metrics import classification_report, accuracy_score, confusion_matrix,f1_score

# Start the clock after the imports so the reported time covers model
# construction, training and scoring only.
start_time = time.time()

# Seed the estimators that draw random numbers so re-running the cell
# reproduces the same metrics; 42 matches the seed used for the split.
RANDOM_STATE = 42

# Define models to train
names = ["K Nearest Neighbors", "Decision Tree", "Random Forest", "Logistic Regression", "SGD Classifier",
         "Naive Bayes", "SVM Linear"]

classifiers = [
    KNeighborsClassifier(),
    DecisionTreeClassifier(random_state=RANDOM_STATE),
    RandomForestClassifier(random_state=RANDOM_STATE),
    LogisticRegression(solver='lbfgs', max_iter=100),
    SGDClassifier(max_iter=100, random_state=RANDOM_STATE),
    MultinomialNB(),
    SVC(kernel='linear'),
]

# Materialise the pairs so `models` can still be iterated after the loop.
models = list(zip(names, classifiers))
# model name -> [fitted estimator, predictions on the test split]
scored_models_bin = {}

for name, model in models:
    model.fit(x_train_bin, y_train_bin)
    pred = model.predict(x_test_bin)
    scored_models_bin[name] = [model, pred]
    # f1_score is called with its defaults — assumes 'sentiment' is a binary
    # 0/1 label; TODO confirm against the dataset.
    score = f1_score(y_test_bin, pred)
    accuracy = accuracy_score(y_test_bin, pred)
    print(name," Accuracy: ", accuracy," Score: ",score )

print("-------- ",(time.time() - start_time),' Secounds --------')

In [None]:
# Sanity-check one fitted model (Random Forest, binary features) on a single
# held-out sample: the row at index 1 of the test matrix.
separator = '''
-------------------------------------
'''

test = scored_models_bin['Random Forest'][0].predict(x_test_bin[1])
print("binary values :",test[:])
print(separator)
result = decoder(test)
print('type of test :',type(x_test_bin))
print(separator)
print('matrix : ',x_test_bin[1])
print(separator)
print('test matrix shape :',x_test_bin.shape)
print(separator)
# `result` holds the model's decoded predictions, not ground-truth labels, so
# the caption says "predicted" rather than "actual".
print(" predicted labels :",result[:10])
print(separator)
print('bad Tweets = ',result.count('bad'),'good tweets = ', result.count('good'))
print(separator)

In [None]:
# Print the classification report for the Naive Bayes predictions on the
# binary features, then display the confusion matrix as a labelled table.
print(classification_report(y_test_bin, scored_models_bin['Naive Bayes'][1]))

# The two-level row/column labels assume exactly two classes, ordered 'bad'
# then 'good' — TODO confirm against the label encoding.
pd.DataFrame(
    confusion_matrix(y_test_bin, scored_models_bin['Naive Bayes'][1]),
    index = [['actual', 'actual'], ['bad', 'good']],
    columns = [['predicted', 'predicted'], ['bad', 'good']])

In [None]:
# Feature extraction using TF-IDF weights (TfidfVectorizer).
# NOTE(review): the vectorizer is fitted on the whole 'txt' column, i.e. before
# the train/test split, so the test rows take part in fitting it — consider
# fitting on the training rows only; confirm.

vectorizer = TfidfVectorizer()
bag_of_words_tfidf=vectorizer.fit_transform(data["txt"])

In [None]:
# Hold out 25% of the rows for testing; random_state=42 makes the split
# repeatable. The target is the 'sentiment' column.
x_train_tfidf, x_test_tfidf, y_train_tfidf, y_test_tfidf = train_test_split(bag_of_words_tfidf, data['sentiment'], random_state=42, test_size=0.25)

In [None]:
# Train every candidate classifier on the TF-IDF features and report its
# accuracy and F1 score on the held-out split.
import time

from nltk.classify.scikitlearn import SklearnClassifier 
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression, SGDClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import SVC
from sklearn.metrics import classification_report, accuracy_score, confusion_matrix,f1_score
from sklearn.metrics import PrecisionRecallDisplay

# Start the clock after the imports so the reported time covers model
# construction, training and scoring only.
start_time = time.time()

# Seed the estimators that draw random numbers so re-running the cell
# reproduces the same metrics; 42 matches the seed used for the split.
RANDOM_STATE = 42

# Define models to train
names = ["K Nearest Neighbors", "Decision Tree", "Random Forest", "Logistic Regression", "SGD Classifier",
         "Naive Bayes", "SVM Linear"]

classifiers = [
    KNeighborsClassifier(),
    DecisionTreeClassifier(random_state=RANDOM_STATE),
    RandomForestClassifier(random_state=RANDOM_STATE),
    LogisticRegression(solver='lbfgs', max_iter=100),
    SGDClassifier(max_iter=100, random_state=RANDOM_STATE),
    MultinomialNB(),
    SVC(kernel='linear'),
]

# Materialise the pairs so `models` can still be iterated after the loop.
models = list(zip(names, classifiers))
# model name -> [fitted estimator, predictions on the test split]
scored_models_tfidf = {}

for name, model in models:
    model.fit(x_train_tfidf, y_train_tfidf)
    pred = model.predict(x_test_tfidf)
    scored_models_tfidf[name] = [model, pred]
    # f1_score is called with its defaults — assumes 'sentiment' is a binary
    # 0/1 label; TODO confirm against the dataset.
    score = f1_score(y_test_tfidf, pred)
    accuracy = accuracy_score(y_test_tfidf, pred)
    print(name," Accuracy: ", accuracy," Score: ",score )

print("-------- ",(time.time() - start_time),' Secounds --------')

In [None]:
# Sanity-check one fitted model (Random Forest, TF-IDF features) on a single
# held-out sample: the row at index 1 of the test matrix.
separator = '''
-------------------------------------
'''

test = scored_models_tfidf['Random Forest'][0].predict(x_test_tfidf[1])
print("binary values :",test[:10])
print(separator)
result = decoder(test)
print('type of test :',type(x_test_tfidf))
print(separator)
print('matrix : ',x_test_tfidf[1])
print(separator)
print('test matrix shape :',x_test_tfidf.shape)
print(separator)
# `result` holds the model's decoded predictions, not ground-truth labels, so
# the caption says "predicted" rather than "actual".
print(" predicted labels :",result[:10])
print(separator)
print('bad Tweets = ',result.count('bad'),'good tweets = ', result.count('good'))
print(separator)

In [None]:
# Print the classification report for the Naive Bayes predictions on the
# TF-IDF features, then display the confusion matrix as a labelled table.
print(classification_report(y_test_tfidf, scored_models_tfidf['Naive Bayes'][1]))

# The two-level row/column labels assume exactly two classes, ordered 'bad'
# then 'good' — TODO confirm against the label encoding.
pd.DataFrame(
    confusion_matrix(y_test_tfidf, scored_models_tfidf['Naive Bayes'][1]),
    index = [['actual', 'actual'], ['bad', 'good']],
    columns = [['predicted', 'predicted'], ['bad', 'good']])

In [None]:
# save the model
#import pickle

#filename = 'Naive Bayes model.sav'
#pickle.dump(scored_models_tfidf['Naive Bayes'][0], open(filename, 'wb'))
 
#load model from disk 
#loaded_model = pickle.load(open(filename, 'rb'))
#result = loaded_model.score(x_test_tfidf, y_test_tfidf)
#print(result)

In [None]:
# Display the count-feature results: model name -> [fitted estimator, test predictions].
scored_models_count

In [None]:
# Display the binary-feature results: model name -> [fitted estimator, test predictions].
scored_models_bin

In [None]:
# Display the TF-IDF results: model name -> [fitted estimator, test predictions].
scored_models_tfidf