In [2]:
import pandas as pd
import numpy as np
import json
import re
import string
from sklearn.model_selection import train_test_split
from sklearn.utils import resample
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.metrics import accuracy_score, f1_score
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, GradientBoostingClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.ensemble import VotingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB, GaussianNB 
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import StratifiedShuffleSplit
from tqdm.notebook import tqdm
import matplotlib.pyplot as plt
import sys

import pickle


%matplotlib inline


In [3]:

rand_seed = 0  # random state for reproducibility
np.random.seed(rand_seed)
# reading our prepared data
data = pd.read_excel('ALL_data.xlsx')
data = data.dropna()
data.head()

clean_data = data

clean_data.shape


clean_data['clean']=clean_data['comment']

In [4]:
def random_split(clean_data, features, output, fraction, seed=0):
    X_train, X_test, y_train, y_test = train_test_split(data[features],
                                                        data[output],
                                                        stratify = data[output],
                                                        random_state=seed,
                                                        train_size=fraction
                                                       )
    train_data = pd.DataFrame(data=X_train, columns=features)
    train_data[output] = y_train
    test_data = pd.DataFrame(data=X_test, columns=features)
    test_data[output] = y_test
    
    return train_data, test_data

In [5]:
train_fraction = .80 # use this to split data into training (80%), and tmp (20%)
val_fraction = .50   # use this to split the tmp data into validation (50%), and 
                     # testing (50%) which means that the validation will be 10% of the original data as well as the

output = 'label' # output label column
features = clean_data.columns.tolist() # the features columns
features.remove(output)
features.remove('comment')
print('output:', output)
print('features:', features)

train_data, tmp = random_split(data, features, output, train_fraction, rand_seed)
val_data, test_data = random_split(tmp, features, output, val_fraction, rand_seed)

print("train data = "+str(len(train_data)))
print("val  data = "+str(len(val_data)))
print("test  data = "+str(len(test_data)))

print("all data = "+str(len(data)))

output: label
features: ['clean']
train data = 2458
val  data = 1536
test  data = 1537
all data = 3073


In [7]:
def train_n_test_classifier(clf, train_features, train_labels, test_features, test_labels,data):
    clf.fit(train_features, train_labels) # please learn patterns from the data

   
    
    print("accuracy_score Score on training data:")
    print(clf.score(train_features, train_labels))
    
    
    print('_'*100)

    print("score on testing data:")
    
    pred_y = clf.predict(test_features)
    
    val_data['predict']=pred_y
 
    count=0
    
    print("accuracy_score Score on test data:")
    print(accuracy_score(test_labels, pred_y))
    
    print("f1_score  on test data:")
    print(f1_score(test_labels, pred_y, average='macro'))
    filename='result_'+str(clf)[0:15]+'.xlsx'
    val_data.to_excel(filename)
    




In [8]:

bow_lemma_vectorizer = CountVectorizer(max_df=0.90, min_df=2, max_features = 1000)
train_data_features_bow_lemma = bow_lemma_vectorizer.fit_transform(train_data['clean'])
val_data_features_bow_lemma= bow_lemma_vectorizer.transform(val_data['clean'])
test_data_features_bow_lemma = bow_lemma_vectorizer.transform(test_data['clean'])
# B O W 

logistic_reg = LogisticRegression(random_state=rand_seed)

train_n_test_classifier(logistic_reg, train_data_features_bow_lemma, train_data[output],
                        val_data_features_bow_lemma, val_data[output],val_data)

# TF IDF 

tfidf_lemma_vectorizer = TfidfVectorizer(ngram_range=(1, 1), sublinear_tf=True, max_df=0.5,  use_idf=True)
# vectorizer = CountVectorizer(ngram_range=(1, 2))

train_data_features_tfidf_lemma = tfidf_lemma_vectorizer.fit_transform(train_data['clean'])
val_data_features_tfidf_lemma= tfidf_lemma_vectorizer.transform(val_data['clean'])
test_data_features_tfidf_lemma = tfidf_lemma_vectorizer.transform(test_data['clean'])

logistic_reg = LogisticRegression(random_state=rand_seed)

train_n_test_classifier(logistic_reg, train_data_features_tfidf_lemma, train_data[output],
                        val_data_features_tfidf_lemma, val_data[output],val_data)

accuracy_score Score on training data:
0.8730675345809601
____________________________________________________________________________________________________
score on testing data:
accuracy_score Score on test data:
0.8678385416666666
f1_score  on test data:
0.850989666103667
accuracy_score Score on training data:
0.9092758340113913
____________________________________________________________________________________________________
score on testing data:
accuracy_score Score on test data:
0.9127604166666666
f1_score  on test data:
0.9040499512815194
