# Import required modules

In [1]:
import pandas as pd
import nltk
import numpy as np
import re
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
import qalsadi.lemmatizer
import time


print("Modules Imported !! ")



Modules Imported !! 



# Data Preparing (Load-Clean)

In [2]:
data=pd.read_excel('data.xlsx') # data loading 


In [3]:
data.head() # show dataframe

Unnamed: 0,sentiment,txt
0,1.0,انشاء الله هنعمل حاجه
1,0.0,اقسم باللله ان العرب اكثر الشعوب تخلفاا
2,0.0,﻿هات ناس تفهم .. و المثل بحكي اسأل مجرب و لا ت...
3,0.0,صرماتي براس اهلك
4,0.0,حرام السخرية من الناس


In [4]:
data['sentiment'].value_counts() #count the values of sentiment 

1.0    10098
0.0    10023
Name: sentiment, dtype: int64

In [5]:
data = data.dropna() # drop and remove nan (null) value 


In [6]:
# method to remove emoji's

def remove_emoji(text):
    non_arabic_char = re.compile('[^\s\\u0600-\u06FF]')
    text_with_no_spaces = re.sub(non_arabic_char, "", text)
    text_with_single_spaces = " ".join(re.split("\s+", text_with_no_spaces))
    
    return text_with_single_spaces

In [7]:
data.txt[2] #show data before emoji's removal

'\ufeffهات ناس تفهم .. و المثل بحكي اسأل مجرب و لا تسأل خبير'

In [8]:
data["txt"]=data["txt"].map(remove_emoji) #map each row with remove_emoji's function

In [9]:
data.txt[2] # show data sample after apply remove_emoji's

'هات ناس تفهم و المثل بحكي اسأل مجرب و لا تسأل خبير'

In [10]:
data.head()

Unnamed: 0,sentiment,txt
0,1.0,انشاء الله هنعمل حاجه
1,0.0,اقسم باللله ان العرب اكثر الشعوب تخلفاا
2,0.0,هات ناس تفهم و المثل بحكي اسأل مجرب و لا تسأل ...
3,0.0,صرماتي براس اهلك
4,0.0,حرام السخرية من الناس


In [11]:
data.dropna(axis=1, how='all')


Unnamed: 0,sentiment,txt
0,1.0,انشاء الله هنعمل حاجه
1,0.0,اقسم باللله ان العرب اكثر الشعوب تخلفاا
2,0.0,هات ناس تفهم و المثل بحكي اسأل مجرب و لا تسأل ...
3,0.0,صرماتي براس اهلك
4,0.0,حرام السخرية من الناس
...,...,...
20196,0.0,المثل يقول ان أكرمت اللئيم تمردا وهذا بالضبط ...
20197,1.0,إلي سهران ريتويت بنسولف عالخاص
20198,0.0,لا تهتم بشخص زياااده ،، تراك بزمن يسمون المهتم...
20199,0.0,مكى عامل ايه انا دورت عليك كتير امبارح واتصلت...


# Data Preprocessing (Tokenize - Stop word remove - stemming or lemmatize) 

# Tokenizing data


In [12]:
def tokenize_text(inp):
    return nltk.tokenize.wordpunct_tokenize(inp)

In [13]:

        
data.txt = data.txt.apply(lambda sentence: nltk.tokenize.wordpunct_tokenize(sentence))


In [14]:
data.head()

Unnamed: 0,sentiment,txt
0,1.0,"[انشاء, الله, هنعمل, حاجه]"
1,0.0,"[اقسم, باللله, ان, العرب, اكثر, الشعوب, تخلفاا]"
2,0.0,"[هات, ناس, تفهم, و, المثل, بحكي, اسأل, مجرب, و..."
3,0.0,"[صرماتي, براس, اهلك]"
4,0.0,"[حرام, السخرية, من, الناس]"


# Stop word removal 

In [15]:
def stopword_removal(inp):
    arb_stopwords = set(nltk.corpus.stopwords.words("arabic"))
    for i in inp:
        if i in arb_stopwords:
            inp.remove(i)

    return inp

In [16]:
data.txt=data["txt"].map(stopword_removal)

In [17]:
data.head()

Unnamed: 0,sentiment,txt
0,1.0,"[انشاء, الله, هنعمل, حاجه]"
1,0.0,"[اقسم, باللله, ان, العرب, اكثر, الشعوب, تخلفاا]"
2,0.0,"[هات, ناس, تفهم, المثل, بحكي, اسأل, مجرب, لا, ..."
3,0.0,"[صرماتي, براس, اهلك]"
4,0.0,"[حرام, السخرية, الناس]"


# Stemming data

In [18]:
stemmer=nltk.ISRIStemmer()

In [19]:

def stem(text):
    out=[]
    for i in text:
        out.append(stemmer.stem(i))
            
    return out

In [20]:
start_time=time.time()
#data.txt.map(stem)

data.txt=data.txt.map(stem)
print("-------- ",(time.time() - start_time),' Secounds --------')

--------  4.122730731964111  Secounds --------


In [21]:
#data.head()

# Lemmatizing data

Lemmatizing Take more time than stemming

In [22]:
lemmatizer = qalsadi.lemmatizer.Lemmatizer()

In [23]:

def lemmatize(text):
    out=[]
    for i in text:
        out.append(lemmatizer.lemmatize(i))

    return out

In [24]:
start_time=time.time()

#data.txt.map(lemmatize).head
#data.txt=data.txt.map(lemmatize)

print("-------- ",(time.time() - start_time),' Secounds --------')

--------  0.000141143798828125  Secounds --------


In [25]:
data.head()

Unnamed: 0,sentiment,txt
0,1.0,"[شاء, الل, هنعمل, حجه]"
1,0.0,"[قسم, لله, ان, عرب, كثر, شعب, خلف]"
2,0.0,"[هات, ناس, فهم, مثل, بحك, سأل, جرب, لا, سأل, خبر]"
3,0.0,"[صرم, برس, اهل]"
4,0.0,"[حرم, سخر, ناس]"


# Words Joining

In [26]:
def join_text(txt):
    
    return " ".join(txt)

In [27]:
data.txt=data.txt.map(join_text)

In [28]:
data.head()

Unnamed: 0,sentiment,txt
0,1.0,شاء الل هنعمل حجه
1,0.0,قسم لله ان عرب كثر شعب خلف
2,0.0,هات ناس فهم مثل بحك سأل جرب لا سأل خبر
3,0.0,صرم برس اهل
4,0.0,حرم سخر ناس


In [29]:
# convert class labels to  Bad and  Good values

def decoder(arr):
    out=list()
    binary_list=list(arr)
    for item in binary_list:
        if item == 0:
            out.append('bad')
        else:
            out.append('good')
    return out
    
    

# Feature Extraction & Model Training

In [30]:
# Feature extaction using Counter

bag_of_words_vectorizer=CountVectorizer() 
bag_of_words_count = bag_of_words_vectorizer.fit_transform(data["txt"])

In [31]:
from sklearn.model_selection import train_test_split
x_train_count, x_test_count, y_train_count, y_test_count = train_test_split(bag_of_words_count, data['sentiment'], random_state=42, test_size=0.25)

In [32]:
import time
start_time = time.time()

from nltk.classify.scikitlearn import SklearnClassifier 
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression, SGDClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import SVC
from sklearn.metrics import classification_report, accuracy_score, confusion_matrix,f1_score
from sklearn.metrics import PrecisionRecallDisplay
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler


# Define models to train
names = ["K Nearest Neighbors", "Decision Tree", "Random Forest", "Logistic Regression", "SGD Classifier",
         "Naive Bayes", "SVM Linear"]

classifiers = [
    KNeighborsClassifier(),
    DecisionTreeClassifier(),
    RandomForestClassifier(),
    LogisticRegression(solver='lbfgs', max_iter=100),
    SGDClassifier(max_iter = 100),
    MultinomialNB(),
    SVC(kernel = 'linear')
]

models = zip(names, classifiers)
scored_models_count=dict()

for name, model in models:
    nltk_model = model
    nltk_model.fit(x_train_count,y_train_count)
    pred = model.predict(x_test_count)
    scored_models_count[name]=[model,pred]
    score=f1_score(y_test_count, pred)
    accuracy = accuracy_score(y_test_count,pred) 
    print(name," Accuracy: ", accuracy," Score: ",score )
    


    
print("-------- ",(time.time() - start_time),' Secounds --------')

K Nearest Neighbors  Accuracy:  0.6344663088849135  Score:  0.681668686169292
Decision Tree  Accuracy:  0.6781951898230968  Score:  0.6802291131740075
Random Forest  Accuracy:  0.7885112303716955  Score:  0.7880478087649403


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


Logistic Regression  Accuracy:  0.7990459153249851  Score:  0.8003160181710448
SGD Classifier  Accuracy:  0.7934804213873982  Score:  0.7948667324777888
Naive Bayes  Accuracy:  0.8075929238719937  Score:  0.8048387096774194
SVM Linear  Accuracy:  0.7871198568872988  Score:  0.7887990534411358
--------  168.0800642967224  Secounds --------


In [33]:
#test a specific model
test=scored_models_count['Random Forest'][0].predict(x_test_count[1])
print("binary values :",test[:10])
print('''
-------------------------------------
''')
result=decoder(test)
print('type of test :',type(x_test_count))
print('''
-------------------------------------
''')
print('matrix : ',x_test_count[1])
print('''
-------------------------------------
''')
print('test matrix shape :',x_test_count.shape)
print('''
-------------------------------------
''')
print(" actual labels :",result[:10])
print('''
-------------------------------------
''')
print('bad Tweets = ',result.count('bad'),'good tweets = ', result.count('good'))
print('''
-------------------------------------
''')

binary values : [1.]

-------------------------------------

type of test : <class 'scipy.sparse.csr.csr_matrix'>

-------------------------------------

matrix :    (0, 1722)	1
  (0, 2837)	1
  (0, 2982)	1
  (0, 3267)	1
  (0, 3923)	1
  (0, 4683)	1
  (0, 7348)	1
  (0, 8049)	1
  (0, 9234)	1
  (0, 10108)	2
  (0, 10979)	1
  (0, 11452)	1
  (0, 11645)	1
  (0, 13787)	1
  (0, 16400)	1
  (0, 16706)	1
  (0, 19032)	1
  (0, 21864)	1
  (0, 22978)	1
  (0, 24510)	1
  (0, 25743)	1

-------------------------------------

test matrix shape : (5031, 26364)

-------------------------------------

 actual labels : ['good']

-------------------------------------

bad Tweets =  0 good tweets =  1

-------------------------------------



In [34]:
# print a confusion matrix and a classification report
print(classification_report(y_test_count, scored_models_count['Naive Bayes'][1]))

pd.DataFrame(
    confusion_matrix(y_test_count, scored_models_count['Naive Bayes'][1]),
    index = [['actual', 'actual'], ['bad', 'good']],
    columns = [['predicted', 'predicted'], ['bad', 'good']])

              precision    recall  f1-score   support

         0.0       0.80      0.82      0.81      2521
         1.0       0.81      0.80      0.80      2510

    accuracy                           0.81      5031
   macro avg       0.81      0.81      0.81      5031
weighted avg       0.81      0.81      0.81      5031



Unnamed: 0_level_0,Unnamed: 1_level_0,predicted,predicted
Unnamed: 0_level_1,Unnamed: 1_level_1,bad,good
actual,bad,2067,454
actual,good,514,1996


# ____________________________________________________________

In [35]:
#Feature extraction using binary victor

bag_of_words_vectorizer_binary=CountVectorizer(binary=True) 
bag_of_words_binary = bag_of_words_vectorizer_binary.fit_transform(data["txt"])

In [36]:
x_train_bin, x_test_bin, y_train_bin, y_test_bin = train_test_split(bag_of_words_binary, data['sentiment'], random_state=42, test_size=0.25)

In [37]:
import time
start_time = time.time()

from nltk.classify.scikitlearn import SklearnClassifier 
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression, SGDClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import SVC
from sklearn.metrics import classification_report, accuracy_score, confusion_matrix,f1_score

# Define models to train
names = ["K Nearest Neighbors", "Decision Tree", "Random Forest", "Logistic Regression", "SGD Classifier",
         "Naive Bayes", "SVM Linear"]

classifiers = [
    KNeighborsClassifier(),
    DecisionTreeClassifier(),
    RandomForestClassifier(),
    LogisticRegression(solver='lbfgs', max_iter=100),
    SGDClassifier(max_iter = 100),
    MultinomialNB(),
    SVC(kernel = 'linear')
]

models = zip(names, classifiers)
scored_models_bin=dict()

for name, model in models:
    nltk_model = model
    nltk_model.fit(x_train_bin,y_train_bin)
    pred = model.predict(x_test_bin)
    scored_models_bin[name]=[model,pred]
    score=f1_score(y_test_bin, pred)
    accuracy = accuracy_score(y_test_bin,pred) 
    print(name," Accuracy: ", accuracy," Score: ",score )
    
print("-------- ",(time.time() - start_time),' Secounds --------')

K Nearest Neighbors  Accuracy:  0.6535480023852117  Score:  0.7014899811611577
Decision Tree  Accuracy:  0.693500298151461  Score:  0.6948951325682627
Random Forest  Accuracy:  0.774597495527728  Score:  0.7774725274725274


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


Logistic Regression  Accuracy:  0.7962631683561916  Score:  0.7969090548840895
SGD Classifier  Accuracy:  0.7869210892466706  Score:  0.7908700741318768
Naive Bayes  Accuracy:  0.808189226793878  Score:  0.8068068068068067
SVM Linear  Accuracy:  0.7827469687934804  Score:  0.7865651239992189
--------  159.35423183441162  Secounds --------


In [38]:
#test a specific model
test=scored_models_bin['Random Forest'][0].predict(x_test_bin[1])
print("binary values :",test[:])
print('''
-------------------------------------
''')
result=decoder(test)
print('type of test :',type(x_test_bin))
print('''
-------------------------------------
''')
print('matrix : ',x_test_bin[1])
print('''
-------------------------------------
''')
print('test matrix shape :',x_test_bin.shape)
print('''
-------------------------------------
''')
print(" actual labels :",result[:10])
print('''
-------------------------------------
''')
print('bad Tweets = ',result.count('bad'),'good tweets = ', result.count('good'))
print('''
-------------------------------------
''')

binary values : [1.]

-------------------------------------

type of test : <class 'scipy.sparse.csr.csr_matrix'>

-------------------------------------

matrix :    (0, 1722)	1
  (0, 2837)	1
  (0, 2982)	1
  (0, 3267)	1
  (0, 3923)	1
  (0, 4683)	1
  (0, 7348)	1
  (0, 8049)	1
  (0, 9234)	1
  (0, 10108)	1
  (0, 10979)	1
  (0, 11452)	1
  (0, 11645)	1
  (0, 13787)	1
  (0, 16400)	1
  (0, 16706)	1
  (0, 19032)	1
  (0, 21864)	1
  (0, 22978)	1
  (0, 24510)	1
  (0, 25743)	1

-------------------------------------

test matrix shape : (5031, 26364)

-------------------------------------

 actual labels : ['good']

-------------------------------------

bad Tweets =  0 good tweets =  1

-------------------------------------



In [39]:
# print a confusion matrix and a classification report
print(classification_report(y_test_bin, scored_models_bin['Naive Bayes'][1]))

pd.DataFrame(
    confusion_matrix(y_test_bin, scored_models_bin['Naive Bayes'][1]),
    index = [['actual', 'actual'], ['bad', 'good']],
    columns = [['predicted', 'predicted'], ['bad', 'good']])

              precision    recall  f1-score   support

         0.0       0.81      0.81      0.81      2521
         1.0       0.81      0.80      0.81      2510

    accuracy                           0.81      5031
   macro avg       0.81      0.81      0.81      5031
weighted avg       0.81      0.81      0.81      5031



Unnamed: 0_level_0,Unnamed: 1_level_0,predicted,predicted
Unnamed: 0_level_1,Unnamed: 1_level_1,bad,good
actual,bad,2051,470
actual,good,495,2015


In [40]:
#Feature extraction using TF-IDF

vectorizer = TfidfVectorizer()
bag_of_words_tfidf=vectorizer.fit_transform(data["txt"])

In [41]:
x_train_tfidf, x_test_tfidf, y_train_tfidf, y_test_tfidf = train_test_split(bag_of_words_tfidf, data['sentiment'], random_state=42, test_size=0.25)

In [42]:
import time
start_time = time.time()

from nltk.classify.scikitlearn import SklearnClassifier 
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression, SGDClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import SVC
from sklearn.metrics import classification_report, accuracy_score, confusion_matrix,f1_score
from sklearn.metrics import PrecisionRecallDisplay

# Define models to train
names = ["K Nearest Neighbors", "Decision Tree", "Random Forest", "Logistic Regression", "SGD Classifier",
         "Naive Bayes", "SVM Linear"]

classifiers = [
    KNeighborsClassifier(),
    DecisionTreeClassifier(),
    RandomForestClassifier(),
    LogisticRegression(solver='lbfgs', max_iter=100),
    SGDClassifier(max_iter = 100),
    MultinomialNB(),
    SVC(kernel = 'linear')
]

models = zip(names, classifiers)
scored_models_tfidf=dict()

for name, model in models:
    nltk_model = model
    nltk_model.fit(x_train_tfidf,y_train_tfidf)
    pred = model.predict(x_test_tfidf)
    scored_models_tfidf[name]=[model,pred]
    score=f1_score(y_test_tfidf, pred)
    accuracy = accuracy_score(y_test_tfidf,pred) 
    print(name," Accuracy: ", accuracy," Score: ",score )
    
    
print("-------- ",(time.time() - start_time),' Secounds --------')

K Nearest Neighbors  Accuracy:  0.516994633273703  Score:  0.5246478873239437
Decision Tree  Accuracy:  0.6813754720731465  Score:  0.6784353059177533
Random Forest  Accuracy:  0.7948717948717948  Score:  0.7905844155844155
Logistic Regression  Accuracy:  0.8073941562313656  Score:  0.8047551883941164
SGD Classifier  Accuracy:  0.810176903200159  Score:  0.8066410204494836
Naive Bayes  Accuracy:  0.8207115881534486  Score:  0.8218799368088467
SVM Linear  Accuracy:  0.804611409262572  Score:  0.8027292795504716
--------  102.88230919837952  Secounds --------


In [43]:
#test a specific model
test=scored_models_tfidf['Random Forest'][0].predict(x_test_tfidf[1])
print("binary values :",test[:10])
print('''
-------------------------------------
''')
result=decoder(test)
print('type of test :',type(x_test_tfidf))
print('''
-------------------------------------
''')
print('matrix : ',x_test_tfidf[1])
print('''
-------------------------------------
''')
print('test matrix shape :',x_test_tfidf.shape)
print('''
-------------------------------------
''')
print(" actual labels :",result[:10])
print('''
-------------------------------------
''')
print('bad Tweets = ',result.count('bad'),'good tweets = ', result.count('good'))
print('''
-------------------------------------
''')

binary values : [1.]

-------------------------------------

type of test : <class 'scipy.sparse.csr.csr_matrix'>

-------------------------------------

matrix :    (0, 1722)	0.20717938458819832
  (0, 2837)	0.29545162715649076
  (0, 2982)	0.1995371340950331
  (0, 3267)	0.1714275913345165
  (0, 3923)	0.16563322745814968
  (0, 4683)	0.22921770841588138
  (0, 7348)	0.1435546184138506
  (0, 8049)	0.1817641666610479
  (0, 9234)	0.20654670334082717
  (0, 10108)	0.20227709592114218
  (0, 10979)	0.12689189405816081
  (0, 11452)	0.1294064019785209
  (0, 11645)	0.13294388185263173
  (0, 13787)	0.13052605908760084
  (0, 16400)	0.11623471159652991
  (0, 16706)	0.31597375486081364
  (0, 19032)	0.3244629770211133
  (0, 21864)	0.28112241497938967
  (0, 22978)	0.25208811485484334
  (0, 24510)	0.26076130724574786
  (0, 25743)	0.2943198782626292

-------------------------------------

test matrix shape : (5031, 26364)

-------------------------------------

 actual labels : ['good']

------------------

In [44]:
# print a confusion matrix and a classification report
print(classification_report(y_test_tfidf, scored_models_tfidf['Naive Bayes'][1]))

pd.DataFrame(
    confusion_matrix(y_test_tfidf, scored_models_tfidf['Naive Bayes'][1]),
    index = [['actual', 'actual'], ['bad', 'good']],
    columns = [['predicted', 'predicted'], ['bad', 'good']])

              precision    recall  f1-score   support

         0.0       0.83      0.81      0.82      2521
         1.0       0.81      0.83      0.82      2510

    accuracy                           0.82      5031
   macro avg       0.82      0.82      0.82      5031
weighted avg       0.82      0.82      0.82      5031



Unnamed: 0_level_0,Unnamed: 1_level_0,predicted,predicted
Unnamed: 0_level_1,Unnamed: 1_level_1,bad,good
actual,bad,2048,473
actual,good,429,2081


In [45]:
# save the model
#import pickle

#filename = 'Naive Bayes model.sav'
#pickle.dump(scored_models_tfidf['Naive Bayes'][0], open(filename, 'wb'))
 
#load model from disk 
#loaded_model = pickle.load(open(filename, 'rb'))
#result = loaded_model.score(x_test_tfidf, y_test_tfidf)
#print(result)

In [46]:
scored_models_count

{'K Nearest Neighbors': [KNeighborsClassifier(),
  array([1., 1., 1., ..., 0., 0., 0.])],
 'Decision Tree': [DecisionTreeClassifier(),
  array([1., 0., 1., ..., 0., 1., 0.])],
 'Random Forest': [RandomForestClassifier(),
  array([1., 1., 1., ..., 0., 1., 0.])],
 'Logistic Regression': [LogisticRegression(),
  array([1., 1., 0., ..., 1., 1., 0.])],
 'SGD Classifier': [SGDClassifier(max_iter=100),
  array([1., 1., 1., ..., 0., 1., 0.])],
 'Naive Bayes': [MultinomialNB(), array([1., 1., 0., ..., 0., 1., 0.])],
 'SVM Linear': [SVC(kernel='linear'), array([1., 1., 0., ..., 1., 1., 0.])]}

In [47]:
scored_models_bin

{'K Nearest Neighbors': [KNeighborsClassifier(),
  array([1., 1., 1., ..., 1., 1., 1.])],
 'Decision Tree': [DecisionTreeClassifier(),
  array([1., 1., 0., ..., 1., 1., 0.])],
 'Random Forest': [RandomForestClassifier(),
  array([1., 1., 0., ..., 0., 0., 0.])],
 'Logistic Regression': [LogisticRegression(),
  array([1., 1., 1., ..., 0., 1., 0.])],
 'SGD Classifier': [SGDClassifier(max_iter=100),
  array([1., 1., 0., ..., 1., 1., 0.])],
 'Naive Bayes': [MultinomialNB(), array([1., 1., 0., ..., 0., 1., 0.])],
 'SVM Linear': [SVC(kernel='linear'), array([1., 1., 1., ..., 1., 1., 0.])]}

In [48]:
scored_models_tfidf

{'K Nearest Neighbors': [KNeighborsClassifier(),
  array([0., 1., 0., ..., 1., 1., 1.])],
 'Decision Tree': [DecisionTreeClassifier(),
  array([1., 0., 1., ..., 0., 1., 0.])],
 'Random Forest': [RandomForestClassifier(),
  array([1., 1., 1., ..., 0., 1., 0.])],
 'Logistic Regression': [LogisticRegression(),
  array([1., 1., 1., ..., 0., 1., 0.])],
 'SGD Classifier': [SGDClassifier(max_iter=100),
  array([1., 1., 1., ..., 0., 1., 0.])],
 'Naive Bayes': [MultinomialNB(), array([1., 1., 0., ..., 0., 1., 0.])],
 'SVM Linear': [SVC(kernel='linear'), array([1., 1., 1., ..., 0., 1., 0.])]}