In [50]:
import pandas as pd
import numpy as np
import matplotlib as plt
import sys
import nltk

In [51]:
from embed_glove import load_embedded_glove
# NOTE(review): this binds the loader function itself (there is no call), so
# `train` is a callable rather than loaded data. Presumably
# `load_embedded_glove()` was intended; confirm whether `train` is used
# anywhere before changing it.
train = load_embedded_glove
# Read the preprocessed dataset from disk, decoding the file as latin-1.
df = pd.read_csv("./assets/preprocessed.csv", encoding="latin1")

# Embedding

In [52]:
# The loader is imported again here (an earlier cell already imports it).
from embed_glove import load_embedded_glove
# Tag for the embedding in use; the classifier cells put it in result file names.
embedding = "glove"
# Features returned by the project-local loader.
# NOTE(review): assumes the rows line up one-to-one with `df` — confirm.
data = load_embedded_glove()
# Targets: the "label" column of the preprocessed frame.
label = df["label"]

In [53]:
from sklearn.model_selection import train_test_split
# Hold out 20% of the samples as the test set; random_state=42 fixes the shuffle.
# NOTE(review): no `stratify=` is passed here, while the grid search uses a
# stratified splitter — confirm that an unstratified hold-out is intended.
X_train, X_test, y_train, y_test = train_test_split(data, label, test_size=0.2, random_state=42)

In [55]:
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import StratifiedShuffleSplit, train_test_split

from sklearn.model_selection import cross_val_score
from sklearn.metrics import confusion_matrix, classification_report
from sklearn import metrics
from sklearn.metrics import roc_auc_score, average_precision_score, precision_recall_curve, accuracy_score
from sklearn.feature_extraction.text import CountVectorizer

In [56]:
def create_grid_model(classifier, param_grid, n_splits, test_size, random_state=None, scoring='accuracy'):
    """Build an (unfitted) grid search that uses stratified shuffle-split validation.

    Parameters
    ----------
    classifier : estimator
        Estimator whose hyper-parameters are searched.
    param_grid : dict
        Mapping of parameter name to the list of candidate values.
    n_splits : int
        Number of shuffle-split iterations used for validation.
    test_size : float or int
        Size of the validation part of every split (forwarded to the splitter).
    random_state : int or None, optional
        Seed forwarded to the splitter. The default ``None`` keeps the previous
        (unseeded) behaviour; pass an integer to make the splits repeatable.
    scoring : str, optional
        Metric used to rank candidates. Defaults to ``'accuracy'``.

    Returns
    -------
    GridSearchCV
        The configured search object; the caller is expected to call ``fit``.
    """
    cv = StratifiedShuffleSplit(n_splits=n_splits, test_size=test_size, random_state=random_state)
    # n_jobs=-1 runs the candidate fits in parallel; verbose=1 prints progress.
    grid_model = GridSearchCV(classifier, param_grid=param_grid, cv=cv, n_jobs=-1, verbose=1, scoring=scoring)
    return grid_model

In [57]:
def evaluation(grid_model, predicted_test, test_label, predicted_train, train_label, file_name,
           decision_function, clf_name="Classifier"):
    """Print a summary of a fitted grid search and save it to ``file_name + '.txt'``.

    Parameters
    ----------
    grid_model : fitted search object
        Must expose ``best_params_``, ``best_score_`` and ``cv_results_`` (with
        ``'mean_fit_time'`` and ``'mean_score_time'`` entries).
    predicted_test, test_label : array-like
        Predictions and true labels for the test split.
    predicted_train, train_label : array-like
        Predictions and true labels for the training split.
    file_name : str
        Output path without extension; ``'.txt'`` is appended. Missing parent
        directories are created.
    decision_function : any
        Currently unused; kept in the signature so existing callers keep working.
    clf_name : str, optional
        Name shown in the console header.

    Returns
    -------
    None
    """
    import os  # local import: only needed to create the output directory

    print("Results for ", clf_name, ": ")
    print()

    best_params = grid_model.best_params_
    best_score = grid_model.best_score_
    # One-element tuples keep %-formatting correct even if the value is a tuple.
    print("Best parameters are %s" % (best_params,))
    acc_train = accuracy_score(train_label, predicted_train)
    acc_test = accuracy_score(test_label, predicted_test)
    print("Train Accuracy:  %0.3f" % (acc_train))
    print("Validation Accuracy:   %0.3f" % (best_score))
    print("Test Accuracy   %0.3f" % (acc_test ))


    print("Mean training time: %f" % (np.mean(grid_model.cv_results_['mean_fit_time'], axis=0)) )
    print("Mean test time: %f" % (np.mean(grid_model.cv_results_['mean_score_time'], axis=0)) )

    # Compute each report once and reuse it for the console and the file.
    conf_matrix = confusion_matrix(test_label, predicted_test)
    report = classification_report(test_label, predicted_test)

    # confusion matrix
    print("\nConfusion matrix:")
    print ( conf_matrix )
    print("\nClassification Report:")
    print ( report )

    out_path = file_name+'.txt'
    out_dir = os.path.dirname(out_path)
    if out_dir:
        # Avoid losing the results of a long search because the folder is missing.
        os.makedirs(out_dir, exist_ok=True)

    # The context manager closes the file even if one of the writes raises.
    # NOTE(review): the two section headers are written without a trailing
    # newline, so the matrix/report starts on the header line in the file.
    with open(out_path, 'w') as f:
        f.write("Best parameters are %s\n" % (best_params,))
        f.write("Train Accuracy: %0.3f\n" % (acc_train))
        f.write("Validation Accuracy: %0.3f\n" % (best_score))
        f.write("Test Accuracy: %0.3f\n" % (acc_test ))
        f.write("\nConfusion Matrix:")
        f.write(str(conf_matrix) + "\n")
        f.write("\nClassification Report:")
        f.write(str(report) + "\n\n")
    

# Random Forest

In [None]:
from sklearn.ensemble import RandomForestClassifier
# Grid-search a random forest on the training split, then print and save its scores.
classifier = RandomForestClassifier() # classifier

# classifier parameters (one list of candidate values per hyper-parameter)
max_depth = [10] #[2, 4, 8, 10, 20]
n_estimators = [1000] # [100, 500, 1000]
max_features = [2 ,'sqrt']
# min_samples_split = [None, 10, 50]

# cross validation: number of shuffle-split iterations and validation fraction
folds = 2
test_size = 0.1

# general 
clf_name = "RandomForest"

# parameters to grid search
param_grid = dict(max_depth=max_depth, n_estimators=n_estimators, max_features=max_features)

model = create_grid_model(classifier, param_grid, folds, test_size)
model.fit(X_train, y_train)

predicted_test = model.predict(X_test)
predicted_train = model.predict(X_train)
decision_function = model.predict_proba(X_test)

# Pass the fitted search object `model`; no `rfc_model` is defined in this
# notebook, so referring to it raises NameError after the whole fit has run.
evaluation(model, predicted_test, y_test, predicted_train, y_train, 
       "./results/"+clf_name+"_"+embedding, decision_function, clf_name=clf_name)

Fitting 2 folds for each of 2 candidates, totalling 4 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.


# SVM

In [15]:
from sklearn.svm import SVC
# Grid-search an RBF-kernel SVM on the training split, then print and save its scores.
classifier = SVC()

# classifier parameters (one list of candidate values per hyper-parameter)
kernel = ["rbf"]
# gamma = ["scale"]
C = [1, 10, 100, 1000]

# cross validation: number of shuffle-split iterations and validation fraction
folds = 2
test_size = 0.1

# general 
embedding = "glove"
clf_name = "SVM"

# parameters to grid search
param_grid = dict(kernel=kernel, C=C)

model = create_grid_model(classifier, param_grid, folds, test_size)
model.fit(X_train, y_train)

predicted_test = model.predict(X_test)
predicted_train = model.predict(X_train)
# SVC() is created with the default probability=False, so predict_proba is not
# available; use the SVM's signed decision values instead.
decision_function = model.decision_function(X_test)

# Pass the fitted search object `model`; no `rfc_model` is defined in this
# notebook, so referring to it raises NameError after the whole fit has run.
evaluation(model, predicted_test, y_test, predicted_train, y_train, 
       "./results/"+clf_name+"_"+embedding, decision_function, clf_name=clf_name)

Fitting 2 folds for each of 4 candidates, totalling 8 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done   2 out of   8 | elapsed:  9.3min remaining: 28.0min


KeyboardInterrupt: 

# Naive-Bayes

In [None]:
from sklearn.naive_bayes import GaussianNB
# Fit Gaussian Naive Bayes through the same grid-search wrapper, then print and save its scores.
classifier = GaussianNB()

# classifier parameters

# cross validation: number of shuffle-split iterations and validation fraction
folds = 5
test_size = 0.1

# general 
embedding = "glove"
clf_name = "NaiveBayes"

# parameters to grid search (empty: the classifier is used with its defaults)
param_grid = dict()

model = create_grid_model(classifier, param_grid, folds, test_size)
model.fit(X_train, y_train)

predicted_test = model.predict(X_test)
predicted_train = model.predict(X_train)
decision_function = model.predict_proba(X_test)

# Pass the fitted search object `model`; no `rfc_model` is defined in this
# notebook, so referring to it raises NameError.
evaluation(model, predicted_test, y_test, predicted_train, y_train, 
       "./results/"+clf_name+"_"+embedding, decision_function, clf_name=clf_name)

In [16]:
# NOTE(review): this rebinds `data` (assigned from load_embedded_glove() in an
# earlier cell) to the preprocessed CSV frame, the same file already loaded
# into `df`. The execution count (16) is out of order with the cells above —
# confirm this cell is still needed.
data = pd.read_csv("./assets/preprocessed.csv", encoding="latin1")