# Feature Preparation

In [4]:
import pandas as pd
import numpy as np
import pickle
from sklearn.model_selection import train_test_split

In [2]:
np.random.seed(42)

### Load data

In [5]:
with open('data/rev_sentiment_gold.pkl', 'rb') as f:
    rev_sentiment_dict = pickle.load(f)

with open('data/sentiment_gold.pkl', 'rb') as f:
    sentiment_dict = pickle.load(f)

In [6]:
with open('data/sentiment_subject_data_new.pkl', 'rb') as f:
    subjects_data = pickle.load(f)

In [7]:
subject_ids = ['ZAB', 'ZDM', 'ZGW', 'ZJM', 'ZJN', 'ZJS', 'ZKB', 'ZKH','ZKW', 'ZMG', 'ZPH', 'ZDN']
sentence_level_feats = ["sent_mean", "sm_a1", "sm_a2", "sm_b1", "sm_b2", "sm_g1", "sm_g2", "sm_t1", "sm_t2"]
word_level_feats = ["word_mean", "FFD", "TRT", "GD", "GPT", "SFD", "wm_a1", "wm_a2", "wm_b1", "wm_b2", "wm_g1", "wm_g2", 
                    "wm_t1", "wm_t2"]

In [8]:
sum(len(subjects_data[i]) for i in subject_ids)

4675

In [23]:
freq_feats = ["sm_a1", "sm_a2", "sm_b1", "sm_b2", "sm_g1", "sm_g2", "sm_t1", "sm_t2"]

def create_sentence_level_feature_df():
    df = pd.DataFrame(columns=["sm_"+str(x) for x in range(1,106)])
    
    xtlist = []
    ytlist = []
    
    for subid in subject_ids:
        for sentid in subjects_data[subid]:
#             try:
            new_vec = []
            for fqfeat in freq_feats:
                arr = subjects_data[subid][sentid][fqfeat]
                where_are_NaNs = np.isnan(arr)
                arr[where_are_NaNs] = 0
                new_vec = new_vec + list(arr)
#             xtlist.append([subid+sentid]+new_vec)
            xtlist.append(new_vec)
            ytlist.append(sentiment_dict[sentid]['label'])
#             except Exception as ex:
#                 print(sentid, subid, ex)
    return xtlist, ytlist

In [24]:
Xt, yt = create_sentence_level_feature_df()

In [25]:
from sklearn.decomposition import PCA

In [33]:
pca100 = PCA(n_components=100, svd_solver='auto')
X100 = pca100.fit_transform(np.array(Xt))

pca200 = PCA(n_components=200, svd_solver='auto')
X200 = pca200.fit_transform(np.array(Xt))

pca300 = PCA(n_components=300, svd_solver='auto')
X300 = pca300.fit_transform(np.array(Xt))

pca500 = PCA(n_components=500, svd_solver='auto')
X500 = pca500.fit_transform(np.array(Xt))

In [31]:
np.shape(Xt), np.shape(X100), 

((4675, 840), (4675, 100))

In [None]:
# for xtt in [X100, X200, X300, X500]:
#     MX = pd.DataFrame(xtt)
#     my = pd.DataFrame(yt)

#     # X = X.rename(columns={0: 'UID'})
#     my = my.rename(columns={0: 'label'})

#     print(MX.shape)
#     print(my.shape)
    
#     X_train, X_test, y_train, y_test = train_test_split(MX, my, test_size=0.20, random_state=42)

(4515, 106)
(4515, 1)


# Models

In [34]:
# Preprocessing & results----------------
from sklearn.model_selection import train_test_split, cross_validate, cross_val_score, GridSearchCV
from sklearn.metrics import classification_report, accuracy_score, f1_score
from sklearn.preprocessing import LabelEncoder, StandardScaler

# nlp preprocessing
import spacy

# Models-------------------------
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression, RidgeClassifier, SGDClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.gaussian_process import GaussianProcessClassifier
import sklearn.gaussian_process.kernels as kls
from sklearn.neural_network import MLPClassifier
from sklearn.svm import SVC
from sklearn.gaussian_process import GaussianProcessClassifier
from sklearn.ensemble import AdaBoostClassifier, RandomForestClassifier, BaggingClassifier, ExtraTreesClassifier

# for visualizing ---------------
from sklearn import tree
from six import StringIO 
from IPython.display import Image, display
import seaborn as sns
import graphviz
import matplotlib.pyplot as plt

# General purpose
import re
import pandas as pd
import pickle
import numpy as np
from collections import Counter
import warnings
warnings.filterwarnings('ignore')

In [35]:
from sklearn.metrics import precision_recall_fscore_support

In [36]:
# np.random.seed(42)

In [37]:
clf_dict = {
    'DecisionTree': {"model": DecisionTreeClassifier(random_state=42), "params": {'max_depth': list(range(10, 250, 20))}},
    'RandomForest': {"model": RandomForestClassifier(random_state=42),
                     "params": {'n_estimators': list(range(5, 100, 5)), 'max_depth': list(range(10, 250, 20))}},
    'LogisticR_L1': {"model": LogisticRegression(random_state=42, max_iter=1000),
                     "params": {'penalty': ['l1'], 'solver': ['liblinear', 'saga']}},
    'LogisticR_L2': {"model": LogisticRegression(random_state=42, max_iter=1000),
                     "params": {'penalty': ['l2'], 'solver': ['newton-cg', 'lbfgs', 'liblinear', 'sag', 'saga']}},
    'LogisticR': {"model": LogisticRegression(random_state=42, max_iter=1000),
                  "params": {'penalty': ['none'], 'solver': ['newton-cg', 'lbfgs', 'sag', 'saga']}},
    'RidgeClf': {"model": RidgeClassifier(max_iter=1000), "params": {}},
    'SVC_linear': {"model": SVC(random_state=42), "params": {'kernel': ['linear'], 
                                                             'C': [0.5, 1.0, 1.5, 2.0, 2.5]}},
    'SVC_poly': {"model": SVC(random_state=42),
                 "params": {'kernel': ['poly'], 'degree': [3, 4, 5], 'gamma': ['scale', 'auto'], 
                            'C': [0.5, 1.0, 1.5, 2.0, 2.5]}},
    'SVC_others': {"model": SVC(random_state=42), "params": {'kernel': ['rbf', 'sigmoid'], 
                                                             'gamma': ['scale', 'auto'], 
                                                             'C': [0.5, 1.0, 1.5, 2.0, 2.5]}},
    'GussianNB': {"model": GaussianNB(), "params": {}},
    'KNN': {"model": KNeighborsClassifier(), "params": {'n_neighbors': list(range(1, 20))}},
    'GaussianProcessClf': {"model": GaussianProcessClassifier(random_state=42, kernel=kls.RBF()), "params": {}},
    'Bagging_SVC': {"model": BaggingClassifier(random_state=42), "params": {'n_estimators': list(range(5, 100, 5)),
                                                                            'base_estimator': [SVC(kernel='linear'),
                                                                                               SVC(kernel='poly',
                                                                                                   degree=3,
                                                                                                   gamma='scale')]}},
    'BaggingDT': {"model": BaggingClassifier(random_state=42), "params": {'n_estimators': list(range(5, 100, 5)),
                                                                          'base_estimator': [
                                                                              DecisionTreeClassifier(random_state=42,
                                                                                                     max_depth=10),
                                                                              DecisionTreeClassifier(random_state=42,
                                                                                                     max_depth=50),
                                                                              DecisionTreeClassifier(random_state=42,
                                                                                                     max_depth=100)]}},
    'AdaBoost': {"model": AdaBoostClassifier(random_state=42), "params": {'n_estimators': list(range(5, 100, 5)),
                                                                          'base_estimator': [DecisionTreeClassifier(
                                                                                                 random_state=42,
                                                                                                 max_depth=10),
                                                                                             DecisionTreeClassifier(
                                                                                                 random_state=42,
                                                                                                 max_depth=50),
                                                                                             DecisionTreeClassifier(
                                                                                                 random_state=42,
                                                                                                 max_depth=100)]}},
    'ExtraTrees': {"model": ExtraTreesClassifier(random_state=42), "params": {'n_estimators': list(range(5, 105, 5)), 
                                                                              'max_depth': [10, 50, 100, 250, 400]}},
    'MLP_l1': {"model": MLPClassifier(random_state=42), "params": {'hidden_layer_sizes': [(x,) for x in 
                                                                                          range(50, 600, 100)], 
                                                                  'activation': ['logistic', 'tanh', 'relu'],
                                                                  'solver': ['adam', 'sgd'], 'early_stopping': 
                                                                   [True]}},
    'MLP_l2': {"model": MLPClassifier(random_state=42), "params": {'hidden_layer_sizes': [(x, y) for x in 
                                                                                          range(50, 600, 100) 
                                                                                          for y in range(50, 360, 100)], 
                                                                  'activation': ['logistic', 'tanh', 'relu'],
                                                                  'solver': ['adam', 'sgd'], 'early_stopping': 
                                                                                               [True]}},
    'MLP_l3': {"model": MLPClassifier(random_state=42), "params": {'hidden_layer_sizes': [(x, y, z) for x in 
                                                                                          range(50, 600, 100) 
                                                                                          for y in range(50, 600, 100)
                                                                                          for z in range(50, 360, 100)], 
                                                                  'activation': ['logistic', 'tanh', 'relu'],
                                                                  'solver': ['adam', 'sgd'], 'early_stopping': 
                                                                                               [True]}},
    }

In [None]:
for xtt in [X100]:#, X200, X300, X500]:
    MX = pd.DataFrame(xtt)
    my = pd.DataFrame(yt)

    # X = X.rename(columns={0: 'UID'})
    my = my.rename(columns={0: 'label'})

    print(MX.shape)
    print(my.shape)
    
    X_train, X_test, y_train, y_test = train_test_split(MX, my, test_size=0.20, random_state=42)
    print("=================================================================================")
    print("=================================================================================")
    
    model_results = pd.DataFrame()
    model_results['Train_Accuracy'] = None
    model_results['Test_Accuracy'] = None
    model_results['best_params'] = None

    # X_train, X_test, y_train, y_test
    # X_train_final = X_train_normalized.drop(columns=["ref_latest"])
    # X_test_normalized_remgsdata = X_test_normalized.drop(columns=["ref_latest"])
    # X_train_normalized_remgsdata = X_train_normalized.copy()
    # X_test_normalized_remgsdata = X_test_normalized.copy()

    xtrain_final = X_train#.drop(columns=["UID"])
    ytrain_final = y_train
    # ytrain_final = ytrain.drop(columns=["UID"])

    xtest_final = X_test#.drop(columns=["UID"])
    ytest_final = y_test
    # ytest_final = ytest.drop(columns=["UID"])


    best_clf_ours = None
    best_clf_val = 0

    for clf_name, clf in clf_dict.items():
        classifier = GridSearchCV(clf['model'], clf['params'], n_jobs=5)
        classifier.fit(xtrain_final, ytrain_final)
        best_model = classifier.best_estimator_
        

        y_predicted = best_model.predict(xtest_final)
        test_acc_macro = precision_recall_fscore_support(ytest_final, y_predicted, average='macro')[2]#accuracy_score(ytest_final, y_predicted)
        print(clf_name, test_acc_macro, classifier.best_score_, classifier.best_params_)

        if test_acc_macro > best_clf_val:
            best_clf_val = test_acc_macro
            best_clf_ours = best_model

        model_results.loc[clf_name, ['Train_Accuracy', 'Test_Accuracy', 'best_params']] = [classifier.best_score_, test_acc_macro, classifier.best_params_]
        clsr = classification_report(ytest_final, y_predicted)

    print("================================================================================")
    print(best_clf_ours)
    best_y_hat = best_clf_ours.predict(xtest_final)
    clsr = classification_report(ytest_final, best_y_hat)
    print(clsr)
    test_acc = accuracy_score(ytest_final, best_y_hat)
    print("Test acc:", test_acc )
    print("Weighted F1 score: ", f1_score(ytest_final, best_y_hat, average='weighted'))

(4675, 100)
(4675, 1)
DecisionTree 0.29990427098151967 0.3462566844919786 {'max_depth': 10}
RandomForest 0.34822584585078853 0.36711229946524065 {'max_depth': 10, 'n_estimators': 30}
LogisticR_L1 0.3331653061459124 0.33475935828877007 {'penalty': 'l1', 'solver': 'liblinear'}
LogisticR_L2 0.3293207583478524 0.33502673796791443 {'penalty': 'l2', 'solver': 'saga'}
LogisticR 0.33080186442523385 0.33449197860962565 {'penalty': 'none', 'solver': 'saga'}
RidgeClf 0.33179230736745474 0.3358288770053476 {}
SVC_linear 0.34992774080795863 0.33636363636363636 {'C': 1.5, 'kernel': 'linear'}
SVC_poly 0.22648000292588052 0.34946524064171125 {'C': 0.5, 'degree': 5, 'gamma': 'scale', 'kernel': 'poly'}
SVC_others 0.28368712301738425 0.3548128342245989 {'C': 0.5, 'gamma': 'scale', 'kernel': 'rbf'}
GussianNB 0.3121435538272275 0.3323529411764706 {}
KNN 0.35944723793561 0.3352941176470588 {'n_neighbors': 13}
GaussianProcessClf 0.1699867197875166 0.3518716577540107 {}


In [None]:
# model_results = pd.DataFrame()
# model_results['Train_Accuracy'] = None
# model_results['Test_Accuracy'] = None
# model_results['best_params'] = None

# # X_train, X_test, y_train, y_test
# # X_train_final = X_train_normalized.drop(columns=["ref_latest"])
# # X_test_normalized_remgsdata = X_test_normalized.drop(columns=["ref_latest"])
# # X_train_normalized_remgsdata = X_train_normalized.copy()
# # X_test_normalized_remgsdata = X_test_normalized.copy()

# xtrain_final = X_train.drop(columns=["UID"])
# ytrain_final = y_train
# # ytrain_final = ytrain.drop(columns=["UID"])

# xtest_final = X_test.drop(columns=["UID"])
# ytest_final = y_test
# # ytest_final = ytest.drop(columns=["UID"])


# best_clf_ours = None
# best_clf_val = 0

# for clf_name, clf in clf_dict.items():
#     classifier = GridSearchCV(clf['model'], clf['params'], n_jobs=5)
#     classifier.fit(xtrain_final, ytrain_final)
#     best_model = classifier.best_estimator_
#     print(clf_name, classifier.best_score_, classifier.best_params_)
    
#     y_predicted = best_model.predict(xtest_final)
#     test_acc_macro = precision_recall_fscore_support(ytest_final, y_predicted, average='macro')[2]#accuracy_score(ytest_final, y_predicted)
    
#     if test_acc_macro > best_clf_val:
#         best_clf_val = test_acc_macro
#         best_clf_ours = best_model
    
#     model_results.loc[clf_name, ['Train_Accuracy', 'Test_Accuracy', 'best_params']] = [classifier.best_score_, test_acc_macro, classifier.best_params_]
#     clsr = classification_report(ytest_final, y_predicted)

# print("================================================================================")
# print(best_clf_ours)
# best_y_hat = best_clf_ours.predict(xtest_final)
# clsr = classification_report(ytest_final, best_y_hat)
# print(clsr)
# test_acc = accuracy_score(ytest_final, best_y_hat)
# print("Test acc:", test_acc )
# print("Weighted F1 score: ", f1_score(ytest_final, best_y_hat, average='weighted'))

In [42]:
model_results

Unnamed: 0,Train_Accuracy,Test_Accuracy,best_params
DecisionTree,0.346257,0.299904,{'max_depth': 10}
RandomForest,0.367112,0.348226,"{'max_depth': 10, 'n_estimators': 30}"
LogisticR_L1,0.334759,0.333165,"{'penalty': 'l1', 'solver': 'liblinear'}"
LogisticR_L2,0.335027,0.329321,"{'penalty': 'l2', 'solver': 'saga'}"
LogisticR,0.334492,0.330802,"{'penalty': 'none', 'solver': 'saga'}"
RidgeClf,0.335829,0.331792,{}
SVC_linear,0.336364,0.349928,"{'C': 1.5, 'kernel': 'linear'}"
SVC_poly,0.349465,0.22648,"{'C': 0.5, 'degree': 5, 'gamma': 'scale', 'ker..."
SVC_others,0.354813,0.283687,"{'C': 0.5, 'gamma': 'scale', 'kernel': 'rbf'}"
GussianNB,0.332353,0.312144,{}


In [None]:
kNN - 36.8