In [None]:
import pandas as pd
import numpy as np
import json
import re
import string
from sklearn.model_selection import train_test_split
from sklearn.utils import resample
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.metrics import accuracy_score, f1_score
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, GradientBoostingClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.ensemble import VotingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import StratifiedShuffleSplit
from sklearn.metrics import classification_report
from tqdm.notebook import tqdm
import matplotlib.pyplot as plt
import sys
from sklearn.neighbors import KNeighborsClassifier

import pickle


%matplotlib inline
rand_seed = 0  # random state for reproducibility
np.random.seed(rand_seed)


In [None]:
data = pd.read_excel('data/datasix.xlsx')
data = data.dropna()
data.head()
data.shape




In [None]:
data = data[data['label'] != 'محايد']
data

In [None]:
pP=data[data['label'] == 'ايجابي']
pNg=data[data['label'] == 'سلبي']
print(len(data))
print(len(pP),len(pNg))

# posative negative on comment

In [None]:
# RANDOM SPLIT

def random_split(data, features, output, fraction, seed=0):
    X_train, X_test, y_train, y_test = train_test_split(data[features],
                                                        data[output],
                                                        stratify = data[output],
                                                        random_state=seed,
                                                        train_size=fraction
                                                       )
    train_data = pd.DataFrame(data=X_train, columns=features)
    train_data[output] = y_train
    test_data = pd.DataFrame(data=X_test, columns=features)
    test_data[output] = y_test
    
    return train_data, test_data



#LABE4L FEATURE


train_fraction = .80 # use this to split data into training (80%), and tmp (20%)
  # use this to split the tmp data into validation (50%), and 
                     # testing (50%) which means that the validation will be 10% of the original data as well as the


output = 'label' # output label column
features = data.columns.tolist() # the features columns
features.remove(output)
print('output:', output)
print('features:', features)

train_data, test_data = random_split(data, features, output, train_fraction, rand_seed)

print(len(train_data))
print(len(test_data))


print(len(train_data)+len(test_data))
print(len(data))


In [None]:

testNE=test_data[test_data['label'] == 'سلبي']
testNg=test_data[test_data['label'] == 'ايجابي']

print(len(test_data))
print(len(testNE),len(testNg))
print("_______________")



trainNE=train_data[train_data['label'] == 'سلبي']
trainNg=train_data[train_data['label'] == 'ايجابي']
print(len(train_data))
print(len(trainNE),len(trainNg))

In [None]:


trainNE=train_data[train_data['label'] == 'سلبي']
trainNg=train_data[train_data['label'] == 'ايجابي']
print(len(train_data))
print(len(trainNE),len(trainNg))

In [None]:


# TF IDF


vectorizer = TfidfVectorizer(ngram_range=(1, 2), sublinear_tf=True, max_df=0.5, stop_words=None, use_idf=True)
train_data_features = vectorizer.fit_transform(train_data['comment'])
test_data_features = vectorizer.transform(test_data['comment'])

#SHAPE

train_data_features.shape, test_data_features.shape

#FUNCTION FOR MODEL TRAIN



#SHAPE

train_data_features.shape, test_data_features.shape

#FUNCTION FOR MODEL TRAIN


def train_n_test_classifier(clf, train_features, train_labels, test_features, test_labels,data):
    clf.fit(train_features, train_labels) # please learn patterns from the data

   
    print('-'*100+str(clf)[0:15])
    print("accuracy_score Score on training data:")
    print(clf.score(train_features, train_labels))
    
    
    print('_'*100)

    print("score on testing data:")
    
    pred_y = clf.predict(test_features)
    
    data['predict']=pred_y
 
    count=0
    
    print("accuracy_score Score on test data:")
    print(accuracy_score(test_labels, pred_y))
    
    print("f1_score  on test data:")
    print(f1_score(test_labels, pred_y, average='macro'))
    filename='result_'+str(clf)[0:15]+'.xlsx'
    data.to_excel(filename)
    
    
    sentiment_fit=clf.fit(train_features,train_labels)
    y_pred=sentiment_fit.predict(test_features)
    print(classification_report(test_labels,y_pred,target_names=('posative','negative')))
    
    

#LOGASTIC REGRESSION



logistic_reg = LogisticRegression(random_state=rand_seed)

train_n_test_classifier(logistic_reg, train_data_features, train_data[output],
                        test_data_features, test_data[output],test_data)



# MultinomialNB

mnb = MultinomialNB()

train_n_test_classifier(mnb,  train_data_features, train_data[output],
                        test_data_features, test_data[output],test_data)
# S V M

svm = SVC(kernel='linear', probability=True, random_state=rand_seed)

train_n_test_classifier(svm,  train_data_features, train_data[output],
                        test_data_features, test_data[output],test_data)


# MLPClassifier

mlp = MLPClassifier(hidden_layer_sizes=(20,20,20,20), verbose=True, tol=0.001, random_state=rand_seed)
train_n_test_classifier(mlp,  train_data_features, train_data[output],
                        test_data_features, test_data[output],test_data)

# KNN

knn = KNeighborsClassifier(n_neighbors=3)

train_n_test_classifier(knn, train_data_features, train_data[output],
                        test_data_features, test_data[output],test_data)

#TREE
dtree = DecisionTreeClassifier(random_state=0)
train_n_test_classifier(dtree, train_data_features, train_data[output],
                        test_data_features, test_data[output],test_data)


estimator = []


estimator.append(('LR', LogisticRegression(solver ='lbfgs',  multi_class ='multinomial',  max_iter = 200)))
estimator.append(('SVC', SVC(gamma ='auto', probability = True)))
estimator.append(('DTC', DecisionTreeClassifier()))
estimator.append(('mnb',MultinomialNB()))
estimator.append(('mlp',MLPClassifier(hidden_layer_sizes=(20,20,20,20), verbose=True, tol=0.001, random_state=rand_seed)))
estimator.append(('knn',KNeighborsClassifier(n_neighbors=3)))     


vot_hard = VotingClassifier(estimators = estimator, voting ='hard')

train_n_test_classifier(vot_hard, train_data_features, train_data[output],
                        test_data_features, test_data[output],test_data)


vot_soft = VotingClassifier(estimators = estimator, voting ='soft')

train_n_test_classifier(vot_soft, train_data_features, train_data[output],
                        test_data_features, test_data[output],test_data)


Gradientclf = GradientBoostingClassifier(n_estimators=100,learning_rate=1.0,max_depth=1, random_state=0)

train_n_test_classifier(Gradientclf,train_data_features, train_data[output],
                        test_data_features, test_data[output],test_data)

adaClf = AdaBoostClassifier(n_estimators=100, random_state=0)

train_n_test_classifier(adaClf, train_data_features, train_data[output],
                        test_data_features, test_data[output],test_data)

rf = RandomForestClassifier(n_estimators=100, random_state=rand_seed)

train_n_test_classifier(rf, train_data_features, train_data[output],
                        test_data_features, test_data[output],test_data)


In [None]:
# RANDOM SPLIT

def random_split(data, features, output, fraction, seed=0):
    X_train, X_test, y_train, y_test = train_test_split(data[features],
                                                        data[output],
                                                        stratify = data[output],
                                                        random_state=seed,
                                                        train_size=fraction
                                                       )
    train_data = pd.DataFrame(data=X_train, columns=features)
    train_data[output] = y_train
    test_data = pd.DataFrame(data=X_test, columns=features)
    test_data[output] = y_test
    
    return train_data, test_data



#LABE4L FEATURE


train_fraction = .80 # use this to split data into training (80%), and tmp (20%)
  # use this to split the tmp data into validation (50%), and 
                     # testing (50%) which means that the validation will be 10% of the original data as well as the


output = 'label' # output label column
features = data.columns.tolist() # the features columns
features.remove(output)
print('output:', output)
print('features:', features)

train_data, test_data = random_split(data, features, output, train_fraction, rand_seed)

print(len(train_data))
print(len(test_data))


print(len(train_data)+len(test_data))
print(len(data))


# BOW


vectorizer = CountVectorizer(ngram_range=(1, 2))
train_data_features = vectorizer.fit_transform(train_data['comment'])

test_data_features = vectorizer.transform(test_data['comment'])




#SHAPE

train_data_features.shape, test_data_features.shape

#FUNCTION FOR MODEL TRAIN


def train_n_test_classifier(clf, train_features, train_labels, test_features, test_labels,data):
    clf.fit(train_features, train_labels) # please learn patterns from the data

   
    print('-'*100+str(clf)[0:15])
    print("accuracy_score Score on training data:")
    print(clf.score(train_features, train_labels))
    
    
    print('_'*100)

    print("score on testing data:")
    
    pred_y = clf.predict(test_features)
    
    data['predict']=pred_y
 
    count=0
    
    print("accuracy_score Score on test data:")
    print(accuracy_score(test_labels, pred_y))
    
    print("f1_score  on test data:")
    print(f1_score(test_labels, pred_y, average='macro'))
    filename='result_'+str(clf)[0:15]+'.xlsx'
    data.to_excel(filename)
    
    
    sentiment_fit=clf.fit(train_features,train_labels)
    y_pred=sentiment_fit.predict(test_features)
    print(classification_report(test_labels,y_pred,target_names=('posative','negative')))
    
    

#LOGASTIC REGRESSION



logistic_reg = LogisticRegression(random_state=rand_seed)

train_n_test_classifier(logistic_reg, train_data_features, train_data[output],
                        test_data_features, test_data[output],test_data)



# MultinomialNB

mnb = MultinomialNB()

train_n_test_classifier(mnb,  train_data_features, train_data[output],
                        test_data_features, test_data[output],test_data)
# S V M

svm = SVC(kernel='linear', probability=True, random_state=rand_seed)

train_n_test_classifier(svm,  train_data_features, train_data[output],
                        test_data_features, test_data[output],test_data)


# MLPClassifier

mlp = MLPClassifier(hidden_layer_sizes=(20,20,20,20), verbose=True, tol=0.001, random_state=rand_seed)
train_n_test_classifier(mlp,  train_data_features, train_data[output],
                        test_data_features, test_data[output],test_data)

# KNN

knn = KNeighborsClassifier(n_neighbors=3)

train_n_test_classifier(knn, train_data_features, train_data[output],
                        test_data_features, test_data[output],test_data)

#TREE
dtree = DecisionTreeClassifier(random_state=0)
train_n_test_classifier(dtree, train_data_features, train_data[output],
                        test_data_features, test_data[output],test_data)


estimator = []


estimator.append(('LR', LogisticRegression(solver ='lbfgs',  multi_class ='multinomial',  max_iter = 200)))
estimator.append(('SVC', SVC(gamma ='auto', probability = True)))
estimator.append(('DTC', DecisionTreeClassifier()))
estimator.append(('mnb',MultinomialNB()))
estimator.append(('mlp',MLPClassifier(hidden_layer_sizes=(20,20,20,20), verbose=True, tol=0.001, random_state=rand_seed)))
estimator.append(('knn',KNeighborsClassifier(n_neighbors=3)))     


vot_hard = VotingClassifier(estimators = estimator, voting ='hard')

train_n_test_classifier(vot_hard, train_data_features, train_data[output],
                        test_data_features, test_data[output],test_data)


vot_soft = VotingClassifier(estimators = estimator, voting ='soft')

train_n_test_classifier(vot_soft, train_data_features, train_data[output],
                        test_data_features, test_data[output],test_data)


Gradientclf = GradientBoostingClassifier(n_estimators=100,learning_rate=1.0,max_depth=1, random_state=0)

train_n_test_classifier(Gradientclf,train_data_features, train_data[output],
                        test_data_features, test_data[output],test_data)

adaClf = AdaBoostClassifier(n_estimators=100, random_state=0)

train_n_test_classifier(adaClf, train_data_features, train_data[output],
                        test_data_features, test_data[output],test_data)

rf = RandomForestClassifier(n_estimators=100, random_state=rand_seed)

train_n_test_classifier(rf, train_data_features, train_data[output],
                        test_data_features, test_data[output],test_data)


# # posative negative on lemma

In [None]:
# RANDOM SPLIT

def random_split(data, features, output, fraction, seed=0):
    X_train, X_test, y_train, y_test = train_test_split(data[features],
                                                        data[output],
                                                        stratify = data[output],
                                                        random_state=seed,
                                                        train_size=fraction
                                                       )
    train_data = pd.DataFrame(data=X_train, columns=features)
    train_data[output] = y_train
    test_data = pd.DataFrame(data=X_test, columns=features)
    test_data[output] = y_test
    
    return train_data, test_data



#LABE4L FEATURE


train_fraction = .80 # use this to split data into training (80%), and tmp (20%)
  # use this to split the tmp data into validation (50%), and 
                     # testing (50%) which means that the validation will be 10% of the original data as well as the


output = 'label' # output label column
features = data.columns.tolist() # the features columns
features.remove(output)
print('output:', output)
print('features:', features)

train_data, test_data = random_split(data, features, output, train_fraction, rand_seed)

print(len(train_data))
print(len(test_data))


print(len(train_data)+len(test_data))
print(len(data))


# TF IDF


vectorizer = TfidfVectorizer(ngram_range=(1, 2), sublinear_tf=True, max_df=0.5, stop_words=None, use_idf=True)
train_data_features = vectorizer.fit_transform(train_data['lemma'])
test_data_features = vectorizer.transform(test_data['lemma'])

#SHAPE

train_data_features.shape, test_data_features.shape

#FUNCTION FOR MODEL TRAIN


def train_n_test_classifier(clf, train_features, train_labels, test_features, test_labels,data):
    clf.fit(train_features, train_labels) # please learn patterns from the data

   
    print('-'*100+str(clf)[0:15])
    print("accuracy_score Score on training data:")
    print(clf.score(train_features, train_labels))
    
    
    print('_'*100)

    print("score on testing data:")
    
    pred_y = clf.predict(test_features)
    
    data['predict']=pred_y
 
    count=0
    
    print("accuracy_score Score on test data:")
    print(accuracy_score(test_labels, pred_y))
    
    print("f1_score  on test data:")
    print(f1_score(test_labels, pred_y, average='macro'))
    filename='result_'+str(clf)[0:15]+'.xlsx'
    data.to_excel(filename)
    
    
    sentiment_fit=clf.fit(train_features,train_labels)
    y_pred=sentiment_fit.predict(test_features)
    print(classification_report(test_labels,y_pred,target_names=('posative','negative')))
    
    

#LOGASTIC REGRESSION



logistic_reg = LogisticRegression(random_state=rand_seed)

train_n_test_classifier(logistic_reg, train_data_features, train_data[output],
                        test_data_features, test_data[output],test_data)



# MultinomialNB

mnb = MultinomialNB()

train_n_test_classifier(mnb,  train_data_features, train_data[output],
                        test_data_features, test_data[output],test_data)
# S V M

svm = SVC(kernel='linear', probability=True, random_state=rand_seed)

train_n_test_classifier(svm,  train_data_features, train_data[output],
                        test_data_features, test_data[output],test_data)


# MLPClassifier

mlp = MLPClassifier(hidden_layer_sizes=(20,20,20,20), verbose=True, tol=0.001, random_state=rand_seed)
train_n_test_classifier(mlp,  train_data_features, train_data[output],
                        test_data_features, test_data[output],test_data)

# KNN

knn = KNeighborsClassifier(n_neighbors=3)

train_n_test_classifier(knn, train_data_features, train_data[output],
                        test_data_features, test_data[output],test_data)

#TREE
dtree = DecisionTreeClassifier(random_state=0)
train_n_test_classifier(dtree, train_data_features, train_data[output],
                        test_data_features, test_data[output],test_data)


estimator = []


estimator.append(('LR', LogisticRegression(solver ='lbfgs',  multi_class ='multinomial',  max_iter = 200)))
estimator.append(('SVC', SVC(gamma ='auto', probability = True)))
estimator.append(('DTC', DecisionTreeClassifier()))
estimator.append(('mnb',MultinomialNB()))
estimator.append(('mlp',MLPClassifier(hidden_layer_sizes=(20,20,20,20), verbose=True, tol=0.001, random_state=rand_seed)))
estimator.append(('knn',KNeighborsClassifier(n_neighbors=3)))     


vot_hard = VotingClassifier(estimators = estimator, voting ='hard')

train_n_test_classifier(vot_hard, train_data_features, train_data[output],
                        test_data_features, test_data[output],test_data)


vot_soft = VotingClassifier(estimators = estimator, voting ='soft')

train_n_test_classifier(vot_soft, train_data_features, train_data[output],
                        test_data_features, test_data[output],test_data)


Gradientclf = GradientBoostingClassifier(n_estimators=100,learning_rate=1.0,max_depth=1, random_state=0)

train_n_test_classifier(Gradientclf,train_data_features, train_data[output],
                        test_data_features, test_data[output],test_data)

adaClf = AdaBoostClassifier(n_estimators=100, random_state=0)

train_n_test_classifier(adaClf, train_data_features, train_data[output],
                        test_data_features, test_data[output],test_data)

rf = RandomForestClassifier(n_estimators=100, random_state=rand_seed)

train_n_test_classifier(rf, train_data_features, train_data[output],
                        test_data_features, test_data[output],test_data)


In [None]:
# RANDOM SPLIT

def random_split(data, features, output, fraction, seed=0):
    X_train, X_test, y_train, y_test = train_test_split(data[features],
                                                        data[output],
                                                        stratify = data[output],
                                                        random_state=seed,
                                                        train_size=fraction
                                                       )
    train_data = pd.DataFrame(data=X_train, columns=features)
    train_data[output] = y_train
    test_data = pd.DataFrame(data=X_test, columns=features)
    test_data[output] = y_test
    
    return train_data, test_data



#LABE4L FEATURE


train_fraction = .80 # use this to split data into training (80%), and tmp (20%)
  # use this to split the tmp data into validation (50%), and 
                     # testing (50%) which means that the validation will be 10% of the original data as well as the


output = 'label' # output label column
features = data.columns.tolist() # the features columns
features.remove(output)
print('output:', output)
print('features:', features)

train_data, test_data = random_split(data, features, output, train_fraction, rand_seed)

print(len(train_data))
print(len(test_data))


print(len(train_data)+len(test_data))
print(len(data))


# BOW


vectorizer = CountVectorizer(ngram_range=(1, 2))
train_data_features = vectorizer.fit_transform(train_data['lemma'])

test_data_features = vectorizer.transform(test_data['lemma'])




#SHAPE

train_data_features.shape, test_data_features.shape

#FUNCTION FOR MODEL TRAIN


def train_n_test_classifier(clf, train_features, train_labels, test_features, test_labels,data):
    clf.fit(train_features, train_labels) # please learn patterns from the data

   
    print('-'*100+str(clf)[0:15])
    print("accuracy_score Score on training data:")
    print(clf.score(train_features, train_labels))
    
    
    print('_'*100)

    print("score on testing data:")
    
    pred_y = clf.predict(test_features)
    
    data['predict']=pred_y
 
    count=0
    
    print("accuracy_score Score on test data:")
    print(accuracy_score(test_labels, pred_y))
    
    print("f1_score  on test data:")
    print(f1_score(test_labels, pred_y, average='macro'))
    filename='result_'+str(clf)[0:15]+'.xlsx'
    data.to_excel(filename)
    
    
    sentiment_fit=clf.fit(train_features,train_labels)
    y_pred=sentiment_fit.predict(test_features)
    print(classification_report(test_labels,y_pred,target_names=('posative','negative')))
    
    

#LOGASTIC REGRESSION



logistic_reg = LogisticRegression(random_state=rand_seed)

train_n_test_classifier(logistic_reg, train_data_features, train_data[output],
                        test_data_features, test_data[output],test_data)



# MultinomialNB

mnb = MultinomialNB()

train_n_test_classifier(mnb,  train_data_features, train_data[output],
                        test_data_features, test_data[output],test_data)
# S V M

svm = SVC(kernel='linear', probability=True, random_state=rand_seed)

train_n_test_classifier(svm,  train_data_features, train_data[output],
                        test_data_features, test_data[output],test_data)


# MLPClassifier

mlp = MLPClassifier(hidden_layer_sizes=(20,20,20,20), verbose=True, tol=0.001, random_state=rand_seed)
train_n_test_classifier(mlp,  train_data_features, train_data[output],
                        test_data_features, test_data[output],test_data)

# KNN

knn = KNeighborsClassifier(n_neighbors=3)

train_n_test_classifier(knn, train_data_features, train_data[output],
                        test_data_features, test_data[output],test_data)

#TREE
dtree = DecisionTreeClassifier(random_state=0)
train_n_test_classifier(dtree, train_data_features, train_data[output],
                        test_data_features, test_data[output],test_data)


estimator = []


estimator.append(('LR', LogisticRegression(solver ='lbfgs',  multi_class ='multinomial',  max_iter = 200)))
estimator.append(('SVC', SVC(gamma ='auto', probability = True)))
estimator.append(('DTC', DecisionTreeClassifier()))
estimator.append(('mnb',MultinomialNB()))
estimator.append(('mlp',MLPClassifier(hidden_layer_sizes=(20,20,20,20), verbose=True, tol=0.001, random_state=rand_seed)))
estimator.append(('knn',KNeighborsClassifier(n_neighbors=3)))     


vot_hard = VotingClassifier(estimators = estimator, voting ='hard')

train_n_test_classifier(vot_hard, train_data_features, train_data[output],
                        test_data_features, test_data[output],test_data)


vot_soft = VotingClassifier(estimators = estimator, voting ='soft')

train_n_test_classifier(vot_soft, train_data_features, train_data[output],
                        test_data_features, test_data[output],test_data)


Gradientclf = GradientBoostingClassifier(n_estimators=100,learning_rate=1.0,max_depth=1, random_state=0)

train_n_test_classifier(Gradientclf,train_data_features, train_data[output],
                        test_data_features, test_data[output],test_data)

adaClf = AdaBoostClassifier(n_estimators=100, random_state=0)

train_n_test_classifier(adaClf, train_data_features, train_data[output],
                        test_data_features, test_data[output],test_data)

rf = RandomForestClassifier(n_estimators=100, random_state=rand_seed)

train_n_test_classifier(rf, train_data_features, train_data[output],
                        test_data_features, test_data[output],test_data)


# posative negative on clean

In [None]:
# RANDOM SPLIT

def random_split(data, features, output, fraction, seed=0):
    X_train, X_test, y_train, y_test = train_test_split(data[features],
                                                        data[output],
                                                        stratify = data[output],
                                                        random_state=seed,
                                                        train_size=fraction
                                                       )
    train_data = pd.DataFrame(data=X_train, columns=features)
    train_data[output] = y_train
    test_data = pd.DataFrame(data=X_test, columns=features)
    test_data[output] = y_test
    
    return train_data, test_data



#LABE4L FEATURE


train_fraction = .80 # use this to split data into training (80%), and tmp (20%)
  # use this to split the tmp data into validation (50%), and 
                     # testing (50%) which means that the validation will be 10% of the original data as well as the


output = 'label' # output label column
features = data.columns.tolist() # the features columns
features.remove(output)
print('output:', output)
print('features:', features)

train_data, test_data = random_split(data, features, output, train_fraction, rand_seed)

print(len(train_data))
print(len(test_data))


print(len(train_data)+len(test_data))
print(len(data))



# TFIDF


vectorizer = TfidfVectorizer(ngram_range=(1, 2), sublinear_tf=True, max_df=0.5, stop_words=None, use_idf=True)
train_data_features = vectorizer.fit_transform(train_data['clean'])
test_data_features = vectorizer.transform(test_data['clean'])

#SHAPE

train_data_features.shape,  test_data_features.shape

#FUNCTION FOR MODEL TRAIN


def train_n_test_classifier(clf, train_features, train_labels, test_features, test_labels,data):
    clf.fit(train_features, train_labels) # please learn patterns from the data

   
    print('-'*100+str(clf)[0:15])
    print("accuracy_score Score on training data:")
    print(clf.score(train_features, train_labels))
    
    
    print('_'*100)

    print("score on testing data:")
    
    pred_y = clf.predict(test_features)
    
    data['predict']=pred_y
 
    count=0
    
    print("accuracy_score Score on test data:")
    print(accuracy_score(test_labels, pred_y))
    
    print("f1_score  on test data:")
    print(f1_score(test_labels, pred_y, average='macro'))
    filename='result_'+str(clf)[0:15]+'.xlsx'
    data.to_excel(filename)
    
    
    sentiment_fit=clf.fit(train_features,train_labels)
    y_pred=sentiment_fit.predict(test_features)
    print(classification_report(test_labels,y_pred,target_names=('posative','negative')))
    
    

#LOGASTIC REGRESSION



logistic_reg = LogisticRegression(random_state=rand_seed)

train_n_test_classifier(logistic_reg, train_data_features, train_data[output],
                        test_data_features, test_data[output],test_data)



# MultinomialNB

mnb = MultinomialNB()

train_n_test_classifier(mnb,  train_data_features, train_data[output],
                        test_data_features, test_data[output],test_data)
# S V M

svm = SVC(kernel='linear', probability=True, random_state=rand_seed)

train_n_test_classifier(svm,  train_data_features, train_data[output],
                        test_data_features, test_data[output],test_data)


# MLPClassifier

mlp = MLPClassifier(hidden_layer_sizes=(20,20,20,20), verbose=True, tol=0.001, random_state=rand_seed)
train_n_test_classifier(mlp,  train_data_features, train_data[output],
                        test_data_features, test_data[output],test_data)

# KNN

knn = KNeighborsClassifier(n_neighbors=3)

train_n_test_classifier(knn, train_data_features, train_data[output],
                        test_data_features, test_data[output],test_data)

#TREE
dtree = DecisionTreeClassifier(random_state=0)
train_n_test_classifier(dtree, train_data_features, train_data[output],
                        test_data_features, test_data[output],test_data)


estimator = []


estimator.append(('LR', LogisticRegression(solver ='lbfgs',  multi_class ='multinomial',  max_iter = 200)))
estimator.append(('SVC', SVC(gamma ='auto', probability = True)))
estimator.append(('DTC', DecisionTreeClassifier()))
estimator.append(('mnb',MultinomialNB()))
estimator.append(('mlp',MLPClassifier(hidden_layer_sizes=(20,20,20,20), verbose=True, tol=0.001, random_state=rand_seed)))
estimator.append(('knn',KNeighborsClassifier(n_neighbors=3)))     


vot_hard = VotingClassifier(estimators = estimator, voting ='hard')

train_n_test_classifier(vot_hard, train_data_features, train_data[output],
                        test_data_features, test_data[output],test_data)


vot_soft = VotingClassifier(estimators = estimator, voting ='soft')

train_n_test_classifier(vot_soft, train_data_features, train_data[output],
                        test_data_features, test_data[output],test_data)


Gradientclf = GradientBoostingClassifier(n_estimators=100,learning_rate=1.0,max_depth=1, random_state=0)

train_n_test_classifier(Gradientclf,train_data_features, train_data[output],
                        test_data_features, test_data[output],test_data)

adaClf = AdaBoostClassifier(n_estimators=100, random_state=0)

train_n_test_classifier(adaClf, train_data_features, train_data[output],
                        test_data_features, test_data[output],test_data)

rf = RandomForestClassifier(n_estimators=100, random_state=rand_seed)

train_n_test_classifier(rf, train_data_features, train_data[output],
                        test_data_features, test_data[output],test_data)


In [None]:
# RANDOM SPLIT

def random_split(data, features, output, fraction, seed=0):
    X_train, X_test, y_train, y_test = train_test_split(data[features],
                                                        data[output],
                                                        stratify = data[output],
                                                        random_state=seed,
                                                        train_size=fraction
                                                       )
    train_data = pd.DataFrame(data=X_train, columns=features)
    train_data[output] = y_train
    test_data = pd.DataFrame(data=X_test, columns=features)
    test_data[output] = y_test
    
    return train_data, test_data



#LABE4L FEATURE


train_fraction = .80 # use this to split data into training (80%), and tmp (20%)
  # use this to split the tmp data into validation (50%), and 
                     # testing (50%) which means that the validation will be 10% of the original data as well as the


output = 'label' # output label column
features = data.columns.tolist() # the features columns
features.remove(output)
print('output:', output)
print('features:', features)

train_data, test_data = random_split(data, features, output, train_fraction, rand_seed)

print(len(train_data))
print(len(test_data))


print(len(train_data)+len(test_data))
print(len(data))



# BOW


vectorizer = CountVectorizer(ngram_range=(1, 2))
train_data_features = vectorizer.fit_transform(train_data['clean'])

test_data_features = vectorizer.transform(test_data['clean'])

#SHAPE

train_data_features.shape,  test_data_features.shape

#FUNCTION FOR MODEL TRAIN


def train_n_test_classifier(clf, train_features, train_labels, test_features, test_labels,data):
    clf.fit(train_features, train_labels) # please learn patterns from the data

   
    print('-'*100+str(clf)[0:15])
    print("accuracy_score Score on training data:")
    print(clf.score(train_features, train_labels))
    
    
    print('_'*100)

    print("score on testing data:")
    
    pred_y = clf.predict(test_features)
    
    data['predict']=pred_y
 
    count=0
    
    print("accuracy_score Score on test data:")
    print(accuracy_score(test_labels, pred_y))
    
    print("f1_score  on test data:")
    print(f1_score(test_labels, pred_y, average='macro'))
    filename='result_'+str(clf)[0:15]+'.xlsx'
    data.to_excel(filename)
    
    
    sentiment_fit=clf.fit(train_features,train_labels)
    y_pred=sentiment_fit.predict(test_features)
    print(classification_report(test_labels,y_pred,target_names=('posative','negative')))
    
    

#LOGASTIC REGRESSION



logistic_reg = LogisticRegression(random_state=rand_seed)

train_n_test_classifier(logistic_reg, train_data_features, train_data[output],
                        test_data_features, test_data[output],test_data)



# MultinomialNB

mnb = MultinomialNB()

train_n_test_classifier(mnb,  train_data_features, train_data[output],
                        test_data_features, test_data[output],test_data)
# S V M

svm = SVC(kernel='linear', probability=True, random_state=rand_seed)

train_n_test_classifier(svm,  train_data_features, train_data[output],
                        test_data_features, test_data[output],test_data)


# MLPClassifier

mlp = MLPClassifier(hidden_layer_sizes=(20,20,20,20), verbose=True, tol=0.001, random_state=rand_seed)
train_n_test_classifier(mlp,  train_data_features, train_data[output],
                        test_data_features, test_data[output],test_data)

# KNN

knn = KNeighborsClassifier(n_neighbors=3)

train_n_test_classifier(knn, train_data_features, train_data[output],
                        test_data_features, test_data[output],test_data)

#TREE
dtree = DecisionTreeClassifier(random_state=0)
train_n_test_classifier(dtree, train_data_features, train_data[output],
                        test_data_features, test_data[output],test_data)


estimator = []


estimator.append(('LR', LogisticRegression(solver ='lbfgs',  multi_class ='multinomial',  max_iter = 200)))
estimator.append(('SVC', SVC(gamma ='auto', probability = True)))
estimator.append(('DTC', DecisionTreeClassifier()))
estimator.append(('mnb',MultinomialNB()))
estimator.append(('mlp',MLPClassifier(hidden_layer_sizes=(20,20,20,20), verbose=True, tol=0.001, random_state=rand_seed)))
estimator.append(('knn',KNeighborsClassifier(n_neighbors=3)))     


vot_hard = VotingClassifier(estimators = estimator, voting ='hard')

train_n_test_classifier(vot_hard, train_data_features, train_data[output],
                        test_data_features, test_data[output],test_data)


vot_soft = VotingClassifier(estimators = estimator, voting ='soft')

train_n_test_classifier(vot_soft, train_data_features, train_data[output],
                        test_data_features, test_data[output],test_data)


Gradientclf = GradientBoostingClassifier(n_estimators=100,learning_rate=1.0,max_depth=1, random_state=0)

train_n_test_classifier(Gradientclf,train_data_features, train_data[output],
                        test_data_features, test_data[output],test_data)

adaClf = AdaBoostClassifier(n_estimators=100, random_state=0)

train_n_test_classifier(adaClf, train_data_features, train_data[output],
                        test_data_features, test_data[output],test_data)

rf = RandomForestClassifier(n_estimators=100, random_state=rand_seed)

train_n_test_classifier(rf, train_data_features, train_data[output],
                        test_data_features, test_data[output],test_data)


# tow classfier(neutral, non-neutral)

In [None]:
# reading our prepared data
data = pd.read_excel('data/datasix.xlsx')
data = data.dropna()
data.head()


In [None]:
positive_data = data[data['label'] == 'ايجابي'].dropna()
negative_data = data[data['label'] == 'سلبي'].dropna()
neutral_data = data[data['label'] == 'محايد'].dropna()
len(positive_data), len(negative_data), len(neutral_data)

non_neutral_data = negative_data.append(positive_data).sample(frac=1).reset_index(drop=True)
non_neutral_data['label'] = 'غير محايد'

In [None]:
data = non_neutral_data.append(neutral_data).dropna().sample(frac=1).reset_index(drop=True)
data

In [None]:
# reading our prepared data

print(data['label'].unique())


# neutral non-neutral  on comment

In [None]:
# RANDOM SPLIT

def random_split(data, features, output, fraction, seed=0):
    X_train, X_test, y_train, y_test = train_test_split(data[features],
                                                        data[output],
                                                        stratify = data[output],
                                                        random_state=seed,
                                                        train_size=fraction
                                                       )
    train_data = pd.DataFrame(data=X_train, columns=features)
    train_data[output] = y_train
    test_data = pd.DataFrame(data=X_test, columns=features)
    test_data[output] = y_test
    
    return train_data, test_data



#LABE4L FEATURE


train_fraction = .80 # use this to split data into training (80%), and tmp (20%)
  # use this to split the tmp data into validation (50%), and 
                     # testing (50%) which means that the validation will be 10% of the original data as well as the


output = 'label' # output label column
features = data.columns.tolist() # the features columns
features.remove(output)
print('output:', output)
print('features:', features)

train_data, test_data = random_split(data, features, output, train_fraction, rand_seed)

print(len(train_data))
print(len(test_data))


print(len(train_data)+len(test_data))
print(len(data))




In [None]:

testNE=test_data[test_data['label'] == 'محايد']
testNg=test_data[test_data['label'] == 'غير محايد']

print(len(test_data))
print(len(testNE),len(testNg))
print("_______________")



trainNE=train_data[train_data['label'] == 'محايد']
trainNg=train_data[train_data['label'] == 'غير محايد']
print(len(train_data))
print(len(trainNE),len(trainNg))

print("__________________")

trainNE=data[data['label'] == 'محايد']
trainNg=data[data['label'] == 'غير محايد']
print(len(data))
print(len(trainNE),len(trainNg))


In [None]:
# RANDOM SPLIT

def random_split(data, features, output, fraction, seed=0):
    X_train, X_test, y_train, y_test = train_test_split(data[features],
                                                        data[output],
                                                        stratify = data[output],
                                                        random_state=seed,
                                                        train_size=fraction
                                                       )
    train_data = pd.DataFrame(data=X_train, columns=features)
    train_data[output] = y_train
    test_data = pd.DataFrame(data=X_test, columns=features)
    test_data[output] = y_test
    
    return train_data, test_data



#LABE4L FEATURE


train_fraction = .80 # use this to split data into training (80%), and tmp (20%)
  # use this to split the tmp data into validation (50%), and 
                     # testing (50%) which means that the validation will be 10% of the original data as well as the


output = 'label' # output label column
features = data.columns.tolist() # the features columns
features.remove(output)
print('output:', output)
print('features:', features)

train_data, test_data = random_split(data, features, output, train_fraction, rand_seed)

print(len(train_data))
print(len(test_data))


print(len(train_data)+len(test_data))
print(len(data))



# TFIDF


vectorizer = TfidfVectorizer(ngram_range=(1, 2), sublinear_tf=True, max_df=0.5, stop_words=None, use_idf=True)
train_data_features = vectorizer.fit_transform(train_data['comment'])
test_data_features = vectorizer.transform(test_data['comment'])

#SHAPE

train_data_features.shape,  test_data_features.shape

#FUNCTION FOR MODEL TRAIN


def train_n_test_classifier(clf, train_features, train_labels, test_features, test_labels,data):
    clf.fit(train_features, train_labels) # please learn patterns from the data

   
    print('-'*100+str(clf)[0:15])
    print("accuracy_score Score on training data:")
    print(clf.score(train_features, train_labels))
    
    
    print('_'*100)

    print("score on testing data:")
    
    pred_y = clf.predict(test_features)
    
    data['predict']=pred_y
 
    count=0
    
    print("accuracy_score Score on test data:")
    print(accuracy_score(test_labels, pred_y))
    
    print("f1_score  on test data:")
    print(f1_score(test_labels, pred_y, average='macro'))
    filename='result_'+str(clf)[0:15]+'.xlsx'
    data.to_excel(filename)
    
    
    sentiment_fit=clf.fit(train_features,train_labels)
    y_pred=sentiment_fit.predict(test_features)
    print(classification_report(test_labels,y_pred,target_names=('neutral','non-neutral')))
    
    

#LOGASTIC REGRESSION



logistic_reg = LogisticRegression(random_state=rand_seed)

train_n_test_classifier(logistic_reg, train_data_features, train_data[output],
                        test_data_features, test_data[output],test_data)



# MultinomialNB

mnb = MultinomialNB()

train_n_test_classifier(mnb,  train_data_features, train_data[output],
                        test_data_features, test_data[output],test_data)
# S V M

svm = SVC(kernel='linear', probability=True, random_state=rand_seed)

train_n_test_classifier(svm,  train_data_features, train_data[output],
                        test_data_features, test_data[output],test_data)


# MLPClassifier

mlp = MLPClassifier(hidden_layer_sizes=(20,20,20,20), verbose=True, tol=0.001, random_state=rand_seed)
train_n_test_classifier(mlp,  train_data_features, train_data[output],
                        test_data_features, test_data[output],test_data)

# KNN

knn = KNeighborsClassifier(n_neighbors=3)

train_n_test_classifier(knn, train_data_features, train_data[output],
                        test_data_features, test_data[output],test_data)

#TREE
dtree = DecisionTreeClassifier(random_state=0)
train_n_test_classifier(dtree, train_data_features, train_data[output],
                        test_data_features, test_data[output],test_data)


estimator = []


estimator.append(('LR', LogisticRegression(solver ='lbfgs',  multi_class ='multinomial',  max_iter = 200)))
estimator.append(('SVC', SVC(gamma ='auto', probability = True)))
estimator.append(('DTC', DecisionTreeClassifier()))
estimator.append(('mnb',MultinomialNB()))
estimator.append(('mlp',MLPClassifier(hidden_layer_sizes=(20,20,20,20), verbose=True, tol=0.001, random_state=rand_seed)))
estimator.append(('knn',KNeighborsClassifier(n_neighbors=3)))     


vot_hard = VotingClassifier(estimators = estimator, voting ='hard')

train_n_test_classifier(vot_hard, train_data_features, train_data[output],
                        test_data_features, test_data[output],test_data)


vot_soft = VotingClassifier(estimators = estimator, voting ='soft')

train_n_test_classifier(vot_soft, train_data_features, train_data[output],
                        test_data_features, test_data[output],test_data)


Gradientclf = GradientBoostingClassifier(n_estimators=100,learning_rate=1.0,max_depth=1, random_state=0)

train_n_test_classifier(Gradientclf,train_data_features, train_data[output],
                        test_data_features, test_data[output],test_data)

adaClf = AdaBoostClassifier(n_estimators=100, random_state=0)

train_n_test_classifier(adaClf, train_data_features, train_data[output],
                        test_data_features, test_data[output],test_data)

rf = RandomForestClassifier(n_estimators=100, random_state=rand_seed)

train_n_test_classifier(rf, train_data_features, train_data[output],
                        test_data_features, test_data[output],test_data)


#  neutral non-neutral  on comment

In [None]:
# RANDOM SPLIT

def random_split(data, features, output, fraction, seed=0):
    X_train, X_test, y_train, y_test = train_test_split(data[features],
                                                        data[output],
                                                        stratify = data[output],
                                                        random_state=seed,
                                                        train_size=fraction
                                                       )
    train_data = pd.DataFrame(data=X_train, columns=features)
    train_data[output] = y_train
    test_data = pd.DataFrame(data=X_test, columns=features)
    test_data[output] = y_test
    
    return train_data, test_data



#LABE4L FEATURE


train_fraction = .80 # use this to split data into training (80%), and tmp (20%)
  # use this to split the tmp data into validation (50%), and 
                     # testing (50%) which means that the validation will be 10% of the original data as well as the


output = 'label' # output label column
features = data.columns.tolist() # the features columns
features.remove(output)
print('output:', output)
print('features:', features)

train_data, test_data = random_split(data, features, output, train_fraction, rand_seed)

print(len(train_data))
print(len(test_data))


print(len(train_data)+len(test_data))
print(len(data))



# BOW


vectorizer = CountVectorizer(ngram_range=(1, 2))
train_data_features = vectorizer.fit_transform(train_data['comment'])
test_data_features = vectorizer.transform(test_data['comment'])

#SHAPE

train_data_features.shape,  test_data_features.shape

#FUNCTION FOR MODEL TRAIN


def train_n_test_classifier(clf, train_features, train_labels, test_features, test_labels,data):
    clf.fit(train_features, train_labels) # please learn patterns from the data

   
    print('-'*100+str(clf)[0:15])
    print("accuracy_score Score on training data:")
    print(clf.score(train_features, train_labels))
    
    
    print('_'*100)

    print("score on testing data:")
    
    pred_y = clf.predict(test_features)
    
    data['predict']=pred_y
 
    count=0
    
    print("accuracy_score Score on test data:")
    print(accuracy_score(test_labels, pred_y))
    
    print("f1_score  on test data:")
    print(f1_score(test_labels, pred_y, average='macro'))
    filename='result_'+str(clf)[0:15]+'.xlsx'
    data.to_excel(filename)
    
    
    sentiment_fit=clf.fit(train_features,train_labels)
    y_pred=sentiment_fit.predict(test_features)
    print(classification_report(test_labels,y_pred,target_names=('neutral','non-neutral')))
    
    

#LOGASTIC REGRESSION



logistic_reg = LogisticRegression(random_state=rand_seed)

train_n_test_classifier(logistic_reg, train_data_features, train_data[output],
                        test_data_features, test_data[output],test_data)



# MultinomialNB

mnb = MultinomialNB()

train_n_test_classifier(mnb,  train_data_features, train_data[output],
                        test_data_features, test_data[output],test_data)
# S V M

svm = SVC(kernel='linear', probability=True, random_state=rand_seed)

train_n_test_classifier(svm,  train_data_features, train_data[output],
                        test_data_features, test_data[output],test_data)


# MLPClassifier

mlp = MLPClassifier(hidden_layer_sizes=(20,20,20,20), verbose=True, tol=0.001, random_state=rand_seed)
train_n_test_classifier(mlp,  train_data_features, train_data[output],
                        test_data_features, test_data[output],test_data)

# KNN

knn = KNeighborsClassifier(n_neighbors=3)

train_n_test_classifier(knn, train_data_features, train_data[output],
                        test_data_features, test_data[output],test_data)

#TREE
dtree = DecisionTreeClassifier(random_state=0)
train_n_test_classifier(dtree, train_data_features, train_data[output],
                        test_data_features, test_data[output],test_data)


estimator = []


estimator.append(('LR', LogisticRegression(solver ='lbfgs',  multi_class ='multinomial',  max_iter = 200)))
estimator.append(('SVC', SVC(gamma ='auto', probability = True)))
estimator.append(('DTC', DecisionTreeClassifier()))
estimator.append(('mnb',MultinomialNB()))
estimator.append(('mlp',MLPClassifier(hidden_layer_sizes=(20,20,20,20), verbose=True, tol=0.001, random_state=rand_seed)))
estimator.append(('knn',KNeighborsClassifier(n_neighbors=3)))     


vot_hard = VotingClassifier(estimators = estimator, voting ='hard')

train_n_test_classifier(vot_hard, train_data_features, train_data[output],
                        test_data_features, test_data[output],test_data)


vot_soft = VotingClassifier(estimators = estimator, voting ='soft')

train_n_test_classifier(vot_soft, train_data_features, train_data[output],
                        test_data_features, test_data[output],test_data)


Gradientclf = GradientBoostingClassifier(n_estimators=100,learning_rate=1.0,max_depth=1, random_state=0)

train_n_test_classifier(Gradientclf,train_data_features, train_data[output],
                        test_data_features, test_data[output],test_data)

adaClf = AdaBoostClassifier(n_estimators=100, random_state=0)

train_n_test_classifier(adaClf, train_data_features, train_data[output],
                        test_data_features, test_data[output],test_data)

rf = RandomForestClassifier(n_estimators=100, random_state=rand_seed)

train_n_test_classifier(rf, train_data_features, train_data[output],
                        test_data_features, test_data[output],test_data)


#  neutral non-neutral  on comment

In [25]:
# RANDOM SPLIT

def random_split(data, features, output, fraction, seed=0):
    X_train, X_test, y_train, y_test = train_test_split(data[features],
                                                        data[output],
                                                        stratify = data[output],
                                                        random_state=seed,
                                                        train_size=fraction
                                                       )
    train_data = pd.DataFrame(data=X_train, columns=features)
    train_data[output] = y_train
    test_data = pd.DataFrame(data=X_test, columns=features)
    test_data[output] = y_test
    
    return train_data, test_data



#LABE4L FEATURE


train_fraction = .80 # use this to split data into training (80%), and tmp (20%)
  # use this to split the tmp data into validation (50%), and 
                     # testing (50%) which means that the validation will be 10% of the original data as well as the


output = 'label' # output label column
features = data.columns.tolist() # the features columns
features.remove(output)
print('output:', output)
print('features:', features)

train_data, test_data = random_split(data, features, output, train_fraction, rand_seed)

print(len(train_data))
print(len(test_data))


print(len(train_data)+len(test_data))
print(len(data))



# TFIDF


vectorizer = TfidfVectorizer(ngram_range=(1, 2), sublinear_tf=True, max_df=0.5, stop_words=None, use_idf=True)
train_data_features = vectorizer.fit_transform(train_data['lemma'])
test_data_features = vectorizer.transform(test_data['lemma'])

#SHAPE

train_data_features.shape,  test_data_features.shape

#FUNCTION FOR MODEL TRAIN


def train_n_test_classifier(clf, train_features, train_labels, test_features, test_labels,data):
    clf.fit(train_features, train_labels) # please learn patterns from the data

   
    print('-'*100+str(clf)[0:15])
    print("accuracy_score Score on training data:")
    print(clf.score(train_features, train_labels))
    
    
    print('_'*100)

    print("score on testing data:")
    
    pred_y = clf.predict(test_features)
    
    data['predict']=pred_y
 
    count=0
    
    print("accuracy_score Score on test data:")
    print(accuracy_score(test_labels, pred_y))
    
    print("f1_score  on test data:")
    print(f1_score(test_labels, pred_y, average='macro'))
    filename='result_'+str(clf)[0:15]+'.xlsx'
    data.to_excel(filename)
    
    
    sentiment_fit=clf.fit(train_features,train_labels)
    y_pred=sentiment_fit.predict(test_features)
    print(classification_report(test_labels,y_pred,target_names=('neutral','non-neutral')))
    
    

#LOGASTIC REGRESSION



logistic_reg = LogisticRegression(random_state=rand_seed)

train_n_test_classifier(logistic_reg, train_data_features, train_data[output],
                        test_data_features, test_data[output],test_data)



# MultinomialNB

mnb = MultinomialNB()

train_n_test_classifier(mnb,  train_data_features, train_data[output],
                        test_data_features, test_data[output],test_data)
# S V M

svm = SVC(kernel='linear', probability=True, random_state=rand_seed)

train_n_test_classifier(svm,  train_data_features, train_data[output],
                        test_data_features, test_data[output],test_data)


# MLPClassifier

mlp = MLPClassifier(hidden_layer_sizes=(20,20,20,20), verbose=True, tol=0.001, random_state=rand_seed)
train_n_test_classifier(mlp,  train_data_features, train_data[output],
                        test_data_features, test_data[output],test_data)

# KNN

knn = KNeighborsClassifier(n_neighbors=3)

train_n_test_classifier(knn, train_data_features, train_data[output],
                        test_data_features, test_data[output],test_data)

#TREE
dtree = DecisionTreeClassifier(random_state=0)
train_n_test_classifier(dtree, train_data_features, train_data[output],
                        test_data_features, test_data[output],test_data)


estimator = []


estimator.append(('LR', LogisticRegression(solver ='lbfgs',  multi_class ='multinomial',  max_iter = 200)))
estimator.append(('SVC', SVC(gamma ='auto', probability = True)))
estimator.append(('DTC', DecisionTreeClassifier()))
estimator.append(('mnb',MultinomialNB()))
estimator.append(('mlp',MLPClassifier(hidden_layer_sizes=(20,20,20,20), verbose=True, tol=0.001, random_state=rand_seed)))
estimator.append(('knn',KNeighborsClassifier(n_neighbors=3)))     


vot_hard = VotingClassifier(estimators = estimator, voting ='hard')

train_n_test_classifier(vot_hard, train_data_features, train_data[output],
                        test_data_features, test_data[output],test_data)


vot_soft = VotingClassifier(estimators = estimator, voting ='soft')

train_n_test_classifier(vot_soft, train_data_features, train_data[output],
                        test_data_features, test_data[output],test_data)


Gradientclf = GradientBoostingClassifier(n_estimators=100,learning_rate=1.0,max_depth=1, random_state=0)

train_n_test_classifier(Gradientclf,train_data_features, train_data[output],
                        test_data_features, test_data[output],test_data)

adaClf = AdaBoostClassifier(n_estimators=100, random_state=0)

train_n_test_classifier(adaClf, train_data_features, train_data[output],
                        test_data_features, test_data[output],test_data)

rf = RandomForestClassifier(n_estimators=100, random_state=rand_seed)

train_n_test_classifier(rf, train_data_features, train_data[output],
                        test_data_features, test_data[output],test_data)


output: label
features: ['comment', 'lemma', 'clean']
4928
1233
6161
6161
----------------------------------------------------------------------------------------------------LogisticRegress
accuracy_score Score on training data:
0.8409090909090909
____________________________________________________________________________________________________
score on testing data:
accuracy_score Score on test data:
0.8085969180859692
f1_score  on test data:
0.5994152046783627
              precision    recall  f1-score   support

     neutral       0.81      0.98      0.89       961
 non-neutral       0.76      0.19      0.31       272

    accuracy                           0.81      1233
   macro avg       0.78      0.59      0.60      1233
weighted avg       0.80      0.81      0.76      1233

----------------------------------------------------------------------------------------------------MultinomialNB()
accuracy_score Score on training data:
0.8352272727272727
______________________________

Iteration 20, loss = 0.02097216
Iteration 21, loss = 0.02230143
Iteration 22, loss = 0.02007001
Iteration 23, loss = 0.02030985
Iteration 24, loss = 0.02010952
Iteration 25, loss = 0.02183377
Iteration 26, loss = 0.02098370
Iteration 27, loss = 0.02059519
Training loss did not improve more than tol=0.001000 for 10 consecutive epochs. Stopping.
              precision    recall  f1-score   support

     neutral       0.81      0.98      0.89       961
 non-neutral       0.73      0.17      0.28       272

    accuracy                           0.80      1233
   macro avg       0.77      0.58      0.58      1233
weighted avg       0.79      0.80      0.75      1233

Iteration 1, loss = 0.80112399
Iteration 2, loss = 0.69365701
Iteration 3, loss = 0.61347923
Iteration 4, loss = 0.52845836
Iteration 5, loss = 0.44147898
Iteration 6, loss = 0.32285280
Iteration 7, loss = 0.18336833
Iteration 8, loss = 0.09630520
Iteration 9, loss = 0.05525567
Iteration 10, loss = 0.03826311
Iteration 11, lo

In [8]:
# RANDOM SPLIT

def random_split(data, features, output, fraction, seed=0):
    X_train, X_test, y_train, y_test = train_test_split(data[features],
                                                        data[output],
                                                        stratify = data[output],
                                                        random_state=seed,
                                                        train_size=fraction
                                                       )
    train_data = pd.DataFrame(data=X_train, columns=features)
    train_data[output] = y_train
    test_data = pd.DataFrame(data=X_test, columns=features)
    test_data[output] = y_test
    
    return train_data, test_data



#LABE4L FEATURE


train_fraction = .80 # use this to split data into training (80%), and tmp (20%)
  # use this to split the tmp data into validation (50%), and 
                     # testing (50%) which means that the validation will be 10% of the original data as well as the


output = 'label' # output label column
features = data.columns.tolist() # the features columns
features.remove(output)
print('output:', output)
print('features:', features)

train_data, test_data = random_split(data, features, output, train_fraction, rand_seed)

print(len(train_data))
print(len(test_data))


print(len(train_data)+len(test_data))
print(len(data))



# BOW


vectorizer = CountVectorizer(ngram_range=(1, 2))
train_data_features = vectorizer.fit_transform(train_data['lemma'])
test_data_features = vectorizer.transform(test_data['lemma'])

#SHAPE

train_data_features.shape,  test_data_features.shape

#FUNCTION FOR MODEL TRAIN


def train_n_test_classifier(clf, train_features, train_labels, test_features, test_labels,data):
    clf.fit(train_features, train_labels) # please learn patterns from the data

   
    print('-'*100+str(clf)[0:15])
    print("accuracy_score Score on training data:")
    print(clf.score(train_features, train_labels))
    
    
    print('_'*100)

    print("score on testing data:")
    
    pred_y = clf.predict(test_features)
    
    data['predict']=pred_y
 
    count=0
    
    print("accuracy_score Score on test data:")
    print(accuracy_score(test_labels, pred_y))
    
    print("f1_score  on test data:")
    print(f1_score(test_labels, pred_y, average='macro'))
    filename='result_'+str(clf)[0:15]+'.xlsx'
    data.to_excel(filename)
    
    
    sentiment_fit=clf.fit(train_features,train_labels)
    y_pred=sentiment_fit.predict(test_features)
    print(classification_report(test_labels,y_pred,target_names=('neutral','non-neutral')))
    
    

#LOGASTIC REGRESSION



logistic_reg = LogisticRegression(random_state=rand_seed)

train_n_test_classifier(logistic_reg, train_data_features, train_data[output],
                        test_data_features, test_data[output],test_data)



# MultinomialNB

mnb = MultinomialNB()

train_n_test_classifier(mnb,  train_data_features, train_data[output],
                        test_data_features, test_data[output],test_data)
# S V M

svm = SVC(kernel='linear', probability=True, random_state=rand_seed)

train_n_test_classifier(svm,  train_data_features, train_data[output],
                        test_data_features, test_data[output],test_data)


# MLPClassifier

mlp = MLPClassifier(hidden_layer_sizes=(20,20,20,20), verbose=True, tol=0.001, random_state=rand_seed)
train_n_test_classifier(mlp,  train_data_features, train_data[output],
                        test_data_features, test_data[output],test_data)

# KNN

knn = KNeighborsClassifier(n_neighbors=3)

train_n_test_classifier(knn, train_data_features, train_data[output],
                        test_data_features, test_data[output],test_data)

#TREE
dtree = DecisionTreeClassifier(random_state=0)
train_n_test_classifier(dtree, train_data_features, train_data[output],
                        test_data_features, test_data[output],test_data)


estimator = []


estimator.append(('LR', LogisticRegression(solver ='lbfgs',  multi_class ='multinomial',  max_iter = 200)))
estimator.append(('SVC', SVC(gamma ='auto', probability = True)))
estimator.append(('DTC', DecisionTreeClassifier()))
estimator.append(('mnb',MultinomialNB()))
estimator.append(('mlp',MLPClassifier(hidden_layer_sizes=(20,20,20,20), verbose=True, tol=0.001, random_state=rand_seed)))
estimator.append(('knn',KNeighborsClassifier(n_neighbors=3)))     


vot_hard = VotingClassifier(estimators = estimator, voting ='hard')

train_n_test_classifier(vot_hard, train_data_features, train_data[output],
                        test_data_features, test_data[output],test_data)


vot_soft = VotingClassifier(estimators = estimator, voting ='soft')

train_n_test_classifier(vot_soft, train_data_features, train_data[output],
                        test_data_features, test_data[output],test_data)


Gradientclf = GradientBoostingClassifier(n_estimators=100,learning_rate=1.0,max_depth=1, random_state=0)

train_n_test_classifier(Gradientclf,train_data_features, train_data[output],
                        test_data_features, test_data[output],test_data)

adaClf = AdaBoostClassifier(n_estimators=100, random_state=0)

train_n_test_classifier(adaClf, train_data_features, train_data[output],
                        test_data_features, test_data[output],test_data)

rf = RandomForestClassifier(n_estimators=100, random_state=rand_seed)

train_n_test_classifier(rf, train_data_features, train_data[output],
                        test_data_features, test_data[output],test_data)


output: label
features: ['comment', 'lemma', 'clean']
4928
1233
6161
6161
----------------------------------------------------------------------------------------------------LogisticRegress
accuracy_score Score on training data:
0.9655032467532467
____________________________________________________________________________________________________
score on testing data:
accuracy_score Score on test data:
0.8248175182481752
f1_score  on test data:
0.6888924192583732
              precision    recall  f1-score   support

     neutral       0.84      0.95      0.89       961
 non-neutral       0.69      0.37      0.48       272

    accuracy                           0.82      1233
   macro avg       0.77      0.66      0.69      1233
weighted avg       0.81      0.82      0.80      1233

----------------------------------------------------------------------------------------------------MultinomialNB()
accuracy_score Score on training data:
0.9500811688311688
______________________________

Iteration 17, loss = 0.02307789
Iteration 18, loss = 0.02369429
Iteration 19, loss = 0.02257307
Iteration 20, loss = 0.02239378
Iteration 21, loss = 0.02148833
Iteration 22, loss = 0.02292281
Iteration 23, loss = 0.02322481
Iteration 24, loss = 0.02263071
Iteration 25, loss = 0.02221918
Iteration 26, loss = 0.02171068
Iteration 27, loss = 0.02273698
Iteration 28, loss = 0.02124820
Training loss did not improve more than tol=0.001000 for 10 consecutive epochs. Stopping.
              precision    recall  f1-score   support

     neutral       0.83      0.97      0.89       961
 non-neutral       0.72      0.31      0.44       272

    accuracy                           0.82      1233
   macro avg       0.78      0.64      0.66      1233
weighted avg       0.81      0.82      0.79      1233

Iteration 1, loss = 0.69762774
Iteration 2, loss = 0.52316287
Iteration 3, loss = 0.40993790
Iteration 4, loss = 0.30986094
Iteration 5, loss = 0.21321527
Iteration 6, loss = 0.12874443
Iteration 7, 

In [9]:
# RANDOM SPLIT

def random_split(data, features, output, fraction, seed=0):
    X_train, X_test, y_train, y_test = train_test_split(data[features],
                                                        data[output],
                                                        stratify = data[output],
                                                        random_state=seed,
                                                        train_size=fraction
                                                       )
    train_data = pd.DataFrame(data=X_train, columns=features)
    train_data[output] = y_train
    test_data = pd.DataFrame(data=X_test, columns=features)
    test_data[output] = y_test
    
    return train_data, test_data



#LABE4L FEATURE


train_fraction = .80 # use this to split data into training (80%), and tmp (20%)
  # use this to split the tmp data into validation (50%), and 
                     # testing (50%) which means that the validation will be 10% of the original data as well as the


output = 'label' # output label column
features = data.columns.tolist() # the features columns
features.remove(output)
print('output:', output)
print('features:', features)

train_data, test_data = random_split(data, features, output, train_fraction, rand_seed)

print(len(train_data))
print(len(test_data))


print(len(train_data)+len(test_data))
print(len(data))



# TFIDF


vectorizer = TfidfVectorizer(ngram_range=(1, 2), sublinear_tf=True, max_df=0.5, stop_words=None, use_idf=True)
train_data_features = vectorizer.fit_transform(train_data['clean'])
test_data_features = vectorizer.transform(test_data['clean'])

#SHAPE

train_data_features.shape,  test_data_features.shape

#FUNCTION FOR MODEL TRAIN


def train_n_test_classifier(clf, train_features, train_labels, test_features, test_labels,data):
    clf.fit(train_features, train_labels) # please learn patterns from the data

   
    print('-'*100+str(clf)[0:15])
    print("accuracy_score Score on training data:")
    print(clf.score(train_features, train_labels))
    
    
    print('_'*100)

    print("score on testing data:")
    
    pred_y = clf.predict(test_features)
    
    data['predict']=pred_y
 
    count=0
    
    print("accuracy_score Score on test data:")
    print(accuracy_score(test_labels, pred_y))
    
    print("f1_score  on test data:")
    print(f1_score(test_labels, pred_y, average='macro'))
    filename='result_'+str(clf)[0:15]+'.xlsx'
    data.to_excel(filename)
    
    
    sentiment_fit=clf.fit(train_features,train_labels)
    y_pred=sentiment_fit.predict(test_features)
    print(classification_report(test_labels,y_pred,target_names=('neutral','non-neutral')))
    
    

#LOGASTIC REGRESSION



logistic_reg = LogisticRegression(random_state=rand_seed)

train_n_test_classifier(logistic_reg, train_data_features, train_data[output],
                        test_data_features, test_data[output],test_data)



# MultinomialNB

mnb = MultinomialNB()

train_n_test_classifier(mnb,  train_data_features, train_data[output],
                        test_data_features, test_data[output],test_data)
# S V M

svm = SVC(kernel='linear', probability=True, random_state=rand_seed)

train_n_test_classifier(svm,  train_data_features, train_data[output],
                        test_data_features, test_data[output],test_data)


# MLPClassifier

mlp = MLPClassifier(hidden_layer_sizes=(20,20,20,20), verbose=True, tol=0.001, random_state=rand_seed)
train_n_test_classifier(mlp,  train_data_features, train_data[output],
                        test_data_features, test_data[output],test_data)

# KNN

knn = KNeighborsClassifier(n_neighbors=3)

train_n_test_classifier(knn, train_data_features, train_data[output],
                        test_data_features, test_data[output],test_data)

#TREE
dtree = DecisionTreeClassifier(random_state=0)
train_n_test_classifier(dtree, train_data_features, train_data[output],
                        test_data_features, test_data[output],test_data)


estimator = []


estimator.append(('LR', LogisticRegression(solver ='lbfgs',  multi_class ='multinomial',  max_iter = 200)))
estimator.append(('SVC', SVC(gamma ='auto', probability = True)))
estimator.append(('DTC', DecisionTreeClassifier()))
estimator.append(('mnb',MultinomialNB()))
estimator.append(('mlp',MLPClassifier(hidden_layer_sizes=(20,20,20,20), verbose=True, tol=0.001, random_state=rand_seed)))
estimator.append(('knn',KNeighborsClassifier(n_neighbors=3)))     


vot_hard = VotingClassifier(estimators = estimator, voting ='hard')

train_n_test_classifier(vot_hard, train_data_features, train_data[output],
                        test_data_features, test_data[output],test_data)


vot_soft = VotingClassifier(estimators = estimator, voting ='soft')

train_n_test_classifier(vot_soft, train_data_features, train_data[output],
                        test_data_features, test_data[output],test_data)


Gradientclf = GradientBoostingClassifier(n_estimators=100,learning_rate=1.0,max_depth=1, random_state=0)

train_n_test_classifier(Gradientclf,train_data_features, train_data[output],
                        test_data_features, test_data[output],test_data)

adaClf = AdaBoostClassifier(n_estimators=100, random_state=0)

train_n_test_classifier(adaClf, train_data_features, train_data[output],
                        test_data_features, test_data[output],test_data)

rf = RandomForestClassifier(n_estimators=100, random_state=rand_seed)

train_n_test_classifier(rf, train_data_features, train_data[output],
                        test_data_features, test_data[output],test_data)


output: label
features: ['comment', 'lemma', 'clean']
4928
1233
6161
6161
----------------------------------------------------------------------------------------------------LogisticRegress
accuracy_score Score on training data:
0.8303571428571429
____________________________________________________________________________________________________
score on testing data:
accuracy_score Score on test data:
0.7996755879967559
f1_score  on test data:
0.5645868986193496
              precision    recall  f1-score   support

     neutral       0.80      0.98      0.88       961
 non-neutral       0.73      0.15      0.24       272

    accuracy                           0.80      1233
   macro avg       0.77      0.57      0.56      1233
weighted avg       0.79      0.80      0.74      1233

----------------------------------------------------------------------------------------------------MultinomialNB()
accuracy_score Score on training data:
0.8492288961038961
______________________________

Iteration 23, loss = 0.02102202
Iteration 24, loss = 0.02088720
Iteration 25, loss = 0.01956856
Iteration 26, loss = 0.02037638
Training loss did not improve more than tol=0.001000 for 10 consecutive epochs. Stopping.
              precision    recall  f1-score   support

     neutral       0.82      0.98      0.89       961
 non-neutral       0.78      0.22      0.34       272

    accuracy                           0.81      1233
   macro avg       0.80      0.60      0.62      1233
weighted avg       0.81      0.81      0.77      1233

Iteration 1, loss = 0.55431815
Iteration 2, loss = 0.49347004
Iteration 3, loss = 0.41471603
Iteration 4, loss = 0.28817173
Iteration 5, loss = 0.19150254
Iteration 6, loss = 0.13756560
Iteration 7, loss = 0.09799809
Iteration 8, loss = 0.06530479
Iteration 9, loss = 0.04579926
Iteration 10, loss = 0.03748447
Iteration 11, loss = 0.03173582
Iteration 12, loss = 0.02909220
Iteration 13, loss = 0.02784381
Iteration 14, loss = 0.02539075
Iteration 15, lo

In [10]:
# RANDOM SPLIT

def random_split(data, features, output, fraction, seed=0):
    X_train, X_test, y_train, y_test = train_test_split(data[features],
                                                        data[output],
                                                        stratify = data[output],
                                                        random_state=seed,
                                                        train_size=fraction
                                                       )
    train_data = pd.DataFrame(data=X_train, columns=features)
    train_data[output] = y_train
    test_data = pd.DataFrame(data=X_test, columns=features)
    test_data[output] = y_test
    
    return train_data, test_data



#LABE4L FEATURE


train_fraction = .80 # use this to split data into training (80%), and tmp (20%)
  # use this to split the tmp data into validation (50%), and 
                     # testing (50%) which means that the validation will be 10% of the original data as well as the


output = 'label' # output label column
features = data.columns.tolist() # the features columns
features.remove(output)
print('output:', output)
print('features:', features)

train_data, test_data = random_split(data, features, output, train_fraction, rand_seed)

print(len(train_data))
print(len(test_data))


print(len(train_data)+len(test_data))
print(len(data))



# BOW


vectorizer = CountVectorizer(ngram_range=(1, 2))
train_data_features = vectorizer.fit_transform(train_data['clean'])
test_data_features = vectorizer.transform(test_data['clean'])

#SHAPE

train_data_features.shape,  test_data_features.shape

#FUNCTION FOR MODEL TRAIN


def train_n_test_classifier(clf, train_features, train_labels, test_features, test_labels,data):
    clf.fit(train_features, train_labels) # please learn patterns from the data

   
    print('-'*100+str(clf)[0:15])
    print("accuracy_score Score on training data:")
    print(clf.score(train_features, train_labels))
    
    
    print('_'*100)

    print("score on testing data:")
    
    pred_y = clf.predict(test_features)
    
    data['predict']=pred_y
 
    count=0
    
    print("accuracy_score Score on test data:")
    print(accuracy_score(test_labels, pred_y))
    
    print("f1_score  on test data:")
    print(f1_score(test_labels, pred_y, average='macro'))
    filename='result_'+str(clf)[0:15]+'.xlsx'
    data.to_excel(filename)
    
    
    sentiment_fit=clf.fit(train_features,train_labels)
    y_pred=sentiment_fit.predict(test_features)
    print(classification_report(test_labels,y_pred,target_names=('neutral','non-neutral')))
    
    

#LOGASTIC REGRESSION



logistic_reg = LogisticRegression(random_state=rand_seed)

train_n_test_classifier(logistic_reg, train_data_features, train_data[output],
                        test_data_features, test_data[output],test_data)



# MultinomialNB

mnb = MultinomialNB()

train_n_test_classifier(mnb,  train_data_features, train_data[output],
                        test_data_features, test_data[output],test_data)
# S V M

svm = SVC(kernel='linear', probability=True, random_state=rand_seed)

train_n_test_classifier(svm,  train_data_features, train_data[output],
                        test_data_features, test_data[output],test_data)


# MLPClassifier

mlp = MLPClassifier(hidden_layer_sizes=(20,20,20,20), verbose=True, tol=0.001, random_state=rand_seed)
train_n_test_classifier(mlp,  train_data_features, train_data[output],
                        test_data_features, test_data[output],test_data)

# KNN

knn = KNeighborsClassifier(n_neighbors=3)

train_n_test_classifier(knn, train_data_features, train_data[output],
                        test_data_features, test_data[output],test_data)

#TREE
dtree = DecisionTreeClassifier(random_state=0)
train_n_test_classifier(dtree, train_data_features, train_data[output],
                        test_data_features, test_data[output],test_data)


estimator = []


estimator.append(('LR', LogisticRegression(solver ='lbfgs',  multi_class ='multinomial',  max_iter = 200)))
estimator.append(('SVC', SVC(gamma ='auto', probability = True)))
estimator.append(('DTC', DecisionTreeClassifier()))
estimator.append(('mnb',MultinomialNB()))
estimator.append(('mlp',MLPClassifier(hidden_layer_sizes=(20,20,20,20), verbose=True, tol=0.001, random_state=rand_seed)))
estimator.append(('knn',KNeighborsClassifier(n_neighbors=3)))     


vot_hard = VotingClassifier(estimators = estimator, voting ='hard')

train_n_test_classifier(vot_hard, train_data_features, train_data[output],
                        test_data_features, test_data[output],test_data)


vot_soft = VotingClassifier(estimators = estimator, voting ='soft')

train_n_test_classifier(vot_soft, train_data_features, train_data[output],
                        test_data_features, test_data[output],test_data)


Gradientclf = GradientBoostingClassifier(n_estimators=100,learning_rate=1.0,max_depth=1, random_state=0)

train_n_test_classifier(Gradientclf,train_data_features, train_data[output],
                        test_data_features, test_data[output],test_data)

adaClf = AdaBoostClassifier(n_estimators=100, random_state=0)

train_n_test_classifier(adaClf, train_data_features, train_data[output],
                        test_data_features, test_data[output],test_data)

rf = RandomForestClassifier(n_estimators=100, random_state=rand_seed)

train_n_test_classifier(rf, train_data_features, train_data[output],
                        test_data_features, test_data[output],test_data)


output: label
features: ['comment', 'lemma', 'clean']
4928
1233
6161
6161
----------------------------------------------------------------------------------------------------LogisticRegress
accuracy_score Score on training data:
0.9675324675324676
____________________________________________________________________________________________________
score on testing data:
accuracy_score Score on test data:
0.8256285482562855
f1_score  on test data:
0.6824087551889589
              precision    recall  f1-score   support

     neutral       0.84      0.96      0.90       961
 non-neutral       0.71      0.35      0.47       272

    accuracy                           0.83      1233
   macro avg       0.78      0.65      0.68      1233
weighted avg       0.81      0.83      0.80      1233

----------------------------------------------------------------------------------------------------MultinomialNB()
accuracy_score Score on training data:
0.9571834415584416
______________________________

Iteration 25, loss = 0.01911363
Training loss did not improve more than tol=0.001000 for 10 consecutive epochs. Stopping.
              precision    recall  f1-score   support

     neutral       0.84      0.96      0.90       961
 non-neutral       0.72      0.36      0.48       272

    accuracy                           0.83      1233
   macro avg       0.78      0.66      0.69      1233
weighted avg       0.81      0.83      0.81      1233

Iteration 1, loss = 0.53881679
Iteration 2, loss = 0.42653447
Iteration 3, loss = 0.30291878
Iteration 4, loss = 0.19369236
Iteration 5, loss = 0.13221944
Iteration 6, loss = 0.08342243
Iteration 7, loss = 0.05269765
Iteration 8, loss = 0.03883665
Iteration 9, loss = 0.03351001
Iteration 10, loss = 0.03303654
Iteration 11, loss = 0.02785031
Iteration 12, loss = 0.02668035
Iteration 13, loss = 0.02475242
Iteration 14, loss = 0.02308797
Iteration 15, loss = 0.02263198
Iteration 16, loss = 0.02173890
Iteration 17, loss = 0.02079299
Iteration 18, lo