In [1]:
import pandas as pd
import numpy as np
import matplotlib as plt
import sys
import nltk

In [2]:
train = pd.read_csv("./assets/Corona_NLP_train_clean.csv", encoding="latin1")
test = pd.read_csv("./assets/Corona_NLP_test_clean.csv", encoding="latin1")

In [3]:
def classes_def(x):
    if x ==  "Extremely Positive":
        return "2"
    elif x == "Extremely Negative":
        return "0"
    elif x == "Negative":
        return "0"
    elif x ==  "Positive":
        return "2"
    else:
        return "1"

In [4]:
train['text'] = train.OriginalTweet
train["text"] = train["text"].astype(str)

test['text'] = test.OriginalTweet
test["text"] = test["text"].astype(str)

In [5]:
train['label']=train['Sentiment'].apply(lambda x:classes_def(x))
test['label']=test['Sentiment'].apply(lambda x:classes_def(x))

# Embedding

In [6]:
import libs.preprocessing
from libs.embedding import vectorize_with_glove

In [7]:
X_train_raw = train["text"].tolist()
y_train = train["label"].tolist()
X_test_raw = test["text"].tolist()
y_test = test["label"].tolist()

In [8]:
X_train = vectorize_with_glove(X_train_raw)
X_test = vectorize_with_glove(X_test_raw)

# Random Forest

In [9]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import StratifiedShuffleSplit, train_test_split

from sklearn.model_selection import cross_val_score
from sklearn.metrics import confusion_matrix, classification_report
from sklearn import metrics
from sklearn.metrics import roc_auc_score, average_precision_score, precision_recall_curve, accuracy_score
from sklearn.feature_extraction.text import CountVectorizer

In [10]:
def create_grid_model(classifier, param_grid, n_splits, test_size):
    cv = StratifiedShuffleSplit(n_splits=n_splits, test_size=test_size)
    grid_model = GridSearchCV(classifier, param_grid=param_grid, cv=cv, n_jobs=-1, verbose=1, scoring='accuracy')
    return grid_model

In [13]:
def evaluation(grid_model, predicted_test, test_label, predicted_train, train_label, file_name, 
           decision_function, clf_name="Classifier"):
    print("Results for ", clf_name, ": ")
    print()
    print("Best parameters are %s" % (grid_model.best_params_))
    acc_train = accuracy_score(train_label, predicted_train)
    acc_test = accuracy_score(test_label, predicted_test)
    print("Train Accuracy:  %0.3f" % (acc_train))
    print("Validation Accuracy:   %0.3f" % (grid_model.best_score_))
    print("Test Accuracy   %0.3f" % (acc_test ))


    print("Mean training time: %f" % (np.mean(grid_model.cv_results_['mean_fit_time'], axis=0)) )
    print("Mean test time: %f" % (np.mean(grid_model.cv_results_['mean_score_time'], axis=0)) )
    
    # confusion matrix
    print("\nConfusion matrix:")
    print ( confusion_matrix(test_label, predicted_test) )
    print("\nClassification Report:")
    print ( classification_report(test_label, predicted_test) )
    
    
    f = open(file_name+'.txt','w')
    f.write("Best parameters are %s\n"% (grid_model.best_params_))
    f.write("Train Accuracy: %0.3f\n" % (acc_train))
    f.write("Validation Accuracy: %0.3f\n" % (grid_model.best_score_))
    f.write("Test Accuracy: %0.3f\n" % (acc_test ))
    f.write("\nConfusion Matrix:")
    f.write(str(confusion_matrix(test_label, predicted_test)) + "\n")
    f.write("\nClassification Report:")
    f.write(str(classification_report(test_label, predicted_test)) + "\n\n")
    f.close()
    

In [14]:
rfc = RandomForestClassifier() # classifier

# classifier parameters
max_depth = [4] #[2, 4, 8, 10, 20]
n_estimators = [100] # [100, 500, 1000]
max_features = [2 ,'sqrt']
# min_samples_split = [None, 10, 50]

# cross validation
folds = 2
test_size = 0.1

embedding = "glove"
# parameters to grid search
param_grid = dict(max_depth=max_depth, n_estimators=n_estimators, max_features=max_features)

rfc_model = create_grid_model(rfc, param_grid, folds, test_size)
rfc_model.fit(X_train, y_train)

predicted_test = rfc_model.predict(X_test)
predicted_train = rfc_model.predict(X_train)
decision_function = rfc_model.predict_proba(X_test)

evaluation(rfc_model, predicted_test, y_test, predicted_train, y_train, 
       "./results/rfc_"+embedding, decision_function, clf_name="RandomForest")

Fitting 2 folds for each of 2 candidates, totalling 4 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done   4 out of   4 | elapsed:   11.7s remaining:    0.0s
[Parallel(n_jobs=-1)]: Done   4 out of   4 | elapsed:   11.7s finished


Results for  RandomForest : 

Best parameters are {'max_depth': 4, 'max_features': 'sqrt', 'n_estimators': 100}
Train Accuracy:  0.532
Validation Accuracy:   0.522
Test Accuracy   0.502
Mean training time: 7.235193
Mean test time: 0.037036

Confusion matrix:
[[ 516    8 1109]
 [ 136   26  457]
 [ 178    5 1363]]

Classification Report:
              precision    recall  f1-score   support

           0       0.62      0.32      0.42      1633
           1       0.67      0.04      0.08       619
           2       0.47      0.88      0.61      1546

    accuracy                           0.50      3798
   macro avg       0.58      0.41      0.37      3798
weighted avg       0.57      0.50      0.44      3798

