# Feature Preparation

In [11]:
import pickle

import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split

In [2]:
# Seed NumPy's global random number generator so NumPy-based randomness is repeatable.
np.random.seed(42)

### Load data

In [6]:
# Load the two sentiment pickle files from the local data/ directory.
# `sentiment_dict` is later indexed as sentiment_dict[sentid]['label'];
# the layout of `rev_sentiment_dict` is not shown in this notebook — TODO confirm.
#
# Requires `pickle` to be imported in the notebook's first import cell
# (it was previously only imported in the later "Models" cell).
# NOTE(security): pickle.load can execute arbitrary code while unpickling;
# only load files that come from a trusted source.
with open('data/rev_sentiment_gold.pkl', 'rb') as rev_gold_file:
    rev_sentiment_dict = pickle.load(rev_gold_file)

with open('data/sentiment_gold.pkl', 'rb') as gold_file:
    sentiment_dict = pickle.load(gold_file)

In [30]:
# Per-subject recordings: indexed below as subjects_data[subject_id][sentence_id][feature_name].
# NOTE(security): unpickling runs arbitrary code from the file; keep the source trusted.
with open('data/sentiment_subject_data_new.pkl', 'rb') as subjects_file:
    subjects_data = pickle.load(subjects_file)

In [12]:
# Subject identifiers used as top-level keys of `subjects_data`.
subject_ids = [
    'ZAB', 'ZDM', 'ZGW', 'ZJM', 'ZJN', 'ZJS',
    'ZKB', 'ZKH', 'ZKW', 'ZMG', 'ZPH', 'ZDN',
]

# Names of the sentence-level feature entries: the overall "sent_mean" plus
# eight "sm_*" variants (suffixes presumably denote frequency bands — TODO confirm).
sentence_level_feats = ["sent_mean"] + [
    "sm_" + suffix for suffix in ("a1", "a2", "b1", "b2", "g1", "g2", "t1", "t2")
]

# Names of the word-level feature entries: "word_mean", five upper-case
# measures, and eight "wm_*" variants mirroring the sentence-level suffixes.
word_level_feats = ["word_mean", "FFD", "TRT", "GD", "GPT", "SFD"] + [
    "wm_" + suffix for suffix in ("a1", "a2", "b1", "b2", "g1", "g2", "t1", "t2")
]

In [10]:
# Total number of sentence entries stored across all subjects in `subject_ids`.
sum(len(subjects_data[i]) for i in subject_ids)

4675

In [35]:
def create_sentence_level_feature_df():
    """Collect sentence-level feature rows and their sentiment labels.

    Walks every subject in the module-level ``subject_ids`` and every sentence
    id stored for that subject in ``subjects_data``. For each sentence it builds

    * a feature row ``[subid + sentid, *sent_mean values]`` and
    * the label ``sentiment_dict[sentid]['label']``.

    A sentence is skipped, and ``sentid subid`` is printed, when either piece
    cannot be built: for example ``sent_mean`` is a scalar NaN instead of an
    array (``TypeError``), or a key is missing (``KeyError``).

    Returns:
        tuple[list, list]: ``(xtlist, ytlist)`` where ``xtlist[i]`` is the
        feature row and ``ytlist[i]`` the label of the same sentence. The two
        lists always have the same length.
    """
    xtlist = []
    ytlist = []

    for subid in subject_ids:
        for sentid in subjects_data[subid]:
            try:
                # Build BOTH pieces before appending anything: if the label
                # lookup fails after the row was already stored, the two lists
                # would silently go out of alignment.
                row = [subid + sentid] + list(subjects_data[subid][sentid]["sent_mean"])
                label = sentiment_dict[sentid]['label']
            except (KeyError, TypeError):
                # Report the skipped (sentence id, subject id) pair and move on.
                print(sentid, subid)
                continue
            xtlist.append(row)
            ytlist.append(label)
    return xtlist, ytlist

In [42]:
# Build the raw feature rows and labels. Each line printed by this call is a
# "sentence_id subject_id" pair that was skipped because building its row or label failed.
Xt, yt = create_sentence_level_feature_df()

33 ZAB
75 ZAB
178 ZAB
188 ZAB
210 ZAB
211 ZAB
256 ZAB
355 ZAB
49 ZDM
55 ZDM
73 ZDM
90 ZDM
145 ZDM
146 ZDM
198 ZDM
203 ZDM
218 ZDM
267 ZDM
268 ZDM
281 ZDM
332 ZDM
346 ZDM
392 ZDM
87 ZGW
92 ZGW
102 ZGW
103 ZGW
124 ZGW
140 ZGW
159 ZGW
160 ZGW
169 ZGW
186 ZGW
187 ZGW
194 ZGW
197 ZGW
204 ZGW
226 ZGW
233 ZGW
286 ZGW
287 ZGW
318 ZGW
341 ZGW
370 ZGW
388 ZGW
396 ZGW
399 ZGW
28 ZJN
75 ZJN
105 ZJN
122 ZJN
190 ZJN
207 ZJN
390 ZJN
391 ZJN
392 ZJN
396 ZJN
400 ZJN
401 ZJN
402 ZJN
403 ZJN
404 ZJN
405 ZJN
5 ZJS
10 ZJS
19 ZJS
23 ZJS
28 ZJS
31 ZJS
35 ZJS
36 ZJS
40 ZJS
44 ZJS
45 ZJS
51 ZJS
55 ZJS
58 ZJS
60 ZJS
66 ZJS
67 ZJS
76 ZJS
83 ZJS
85 ZJS
102 ZJS
103 ZJS
104 ZJS
105 ZJS
109 ZJS
110 ZJS
115 ZJS
116 ZJS
122 ZJS
129 ZJS
144 ZJS
150 ZJS
163 ZJS
164 ZJS
166 ZJS
168 ZJS
169 ZJS
174 ZJS
178 ZJS
188 ZJS
205 ZJS
234 ZJS
240 ZJS
251 ZJS
256 ZJS
264 ZJS
265 ZJS
277 ZJS
283 ZJS
284 ZJS
287 ZJS
292 ZJS
355 ZJS
357 ZJS
363 ZJS
367 ZJS
375 ZJS
382 ZJS
391 ZJS
402 ZJS
403 ZJS
93 ZKB
108 ZKH
263 ZKH
277 ZKH
279 ZKH


In [38]:
# Inspect one of the skipped entries (subject ZAB, sentence 33) to see why it was skipped.
subjects_data['ZAB']['33']

{'sent_mean': nan,
 'sm_a1': array([0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0.]),
 'sm_a2': array([0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0.]),
 'sm_b1

In [43]:
# Turn the raw row lists into DataFrames and give the first column a name:
# in X it is the subject+sentence identifier, in y it is the sentiment label.
X = pd.DataFrame(Xt).rename(columns={0: 'UID'})
y = pd.DataFrame(yt).rename(columns={0: 'label'})

# Print both shapes, one per line, as a quick row-count consistency check.
print(X.shape, y.shape, sep="\n")

(4515, 106)
(4515, 1)


In [44]:
# Preview the first rows of the feature frame (UID plus numbered feature columns).
X.head()

Unnamed: 0,UID,1,2,3,4,5,6,7,8,9,...,96,97,98,99,100,101,102,103,104,105
0,ZAB5,0.136908,0.168137,0.385452,0.443554,0.48943,0.469545,0.357853,0.374205,0.579966,...,0.263437,0.205831,0.157749,0.339924,0.161557,0.739019,0.210424,0.184606,0.28561,0.0
1,ZAB6,0.149611,0.189901,0.405646,0.417036,0.484388,0.477953,0.335813,0.361789,0.568168,...,0.261565,0.207844,0.166264,0.323565,0.161425,0.737319,0.216307,0.190234,0.272997,0.0
2,ZAB7,0.141546,0.178522,0.421334,0.474207,0.53157,0.480027,0.312287,0.327484,0.550506,...,0.279398,0.21679,0.159647,0.359645,0.166762,0.689524,0.219103,0.183957,0.30054,0.0
3,ZAB8,0.148731,0.191334,0.437172,0.446185,0.486084,0.495821,0.381391,0.395134,0.63792,...,0.302406,0.228788,0.160929,0.33009,0.165703,0.7182,0.233925,0.187475,0.290386,0.0
4,ZAB10,0.133797,0.174693,0.378351,0.427469,0.481511,0.475317,0.366191,0.380294,0.626149,...,0.290178,0.221566,0.171737,0.325676,0.17116,0.727944,0.228801,0.192619,0.279246,0.0


In [45]:
# Hold out 20% of the rows as a test set; random_state fixes the shuffle.
# NOTE(review): rows are (subject, sentence) pairs and labels are looked up per
# sentence id, so with this ungrouped, unstratified split the same sentence can
# land in both train and test via different subjects — confirm this is intended.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20, random_state=42)

In [48]:
# Preview the training features; the index shows the shuffled original row positions.
X_train.head()

Unnamed: 0,UID,1,2,3,4,5,6,7,8,9,...,96,97,98,99,100,101,102,103,104,105
2045,ZJS158,0.253507,0.452405,0.460751,0.510411,0.399087,0.158251,0.461659,0.54242,0.565835,...,0.587317,0.751763,0.562027,0.505805,0.450746,0.49035,0.379354,0.498996,0.380836,0.0
3902,ZPH78,0.648988,1.221851,1.132945,1.150835,0.94792,0.59201,0.648121,0.910746,1.086167,...,1.187955,1.112423,1.337752,1.36959,0.867618,0.660058,0.717443,1.274624,1.364582,0.0
4047,ZPH229,0.604066,1.243464,1.769672,1.259108,0.963724,0.564908,1.303813,1.365414,1.197636,...,1.357563,1.102438,1.158808,1.452245,1.947037,0.823774,0.887572,1.134499,1.428995,0.0
2836,ZKH184,1.869645,1.875095,2.317404,2.345925,1.893533,1.621276,1.508192,1.910222,2.528769,...,3.579909,2.837919,2.705813,2.650997,1.623517,1.407117,2.085069,2.2779,2.245489,0.0
2772,ZKH120,1.191401,1.286283,1.693675,1.748532,1.371537,1.210537,1.037266,1.495037,1.869668,...,2.687652,2.253665,2.152418,1.935144,1.395988,1.125359,1.62439,1.533696,1.617401,0.0


In [49]:
# Preview the training labels; the index matches the rows of X_train shown above.
y_train.head()

Unnamed: 0,label
2045,1
3902,-1
4047,-1
2836,1
2772,0


# Models

In [3]:
# Preprocessing & results----------------
from sklearn.model_selection import train_test_split, cross_validate, cross_val_score, GridSearchCV
from sklearn.metrics import classification_report, accuracy_score, f1_score
from sklearn.preprocessing import LabelEncoder, StandardScaler

# nlp preprocessing
import spacy

# Models-------------------------
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression, RidgeClassifier, SGDClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.gaussian_process import GaussianProcessClassifier
import sklearn.gaussian_process.kernels as kls
from sklearn.neural_network import MLPClassifier
from sklearn.svm import SVC
from sklearn.gaussian_process import GaussianProcessClassifier
from sklearn.ensemble import AdaBoostClassifier, RandomForestClassifier, BaggingClassifier, ExtraTreesClassifier

# for visualizing ---------------
from sklearn import tree
from six import StringIO 
from IPython.display import Image, display
import seaborn as sns
import graphviz
import matplotlib.pyplot as plt

# General purpose
import re
import pandas as pd
import pickle
import numpy as np
from collections import Counter
import warnings
warnings.filterwarnings('ignore')

In [4]:
from sklearn.metrics import precision_recall_fscore_support

In [2]:
# np.random.seed(42)

In [5]:
# Registry of candidate classifiers.
# Each entry maps a display name to
#   "model":  an unfitted estimator instance, and
#   "params": the hyper-parameter grid handed to GridSearchCV for that estimator
#             (an empty dict means no hyper-parameters are searched).
clf_dict = {
    # ---- single tree / random forest ----
    'DecisionTree': {
        "model": DecisionTreeClassifier(random_state=42),
        "params": {'max_depth': list(range(10, 250, 20))},
    },
    'RandomForest': {
        "model": RandomForestClassifier(random_state=42),
        "params": {
            'n_estimators': list(range(5, 100, 5)),
            'max_depth': list(range(10, 250, 20)),
        },
    },

    # ---- linear models: logistic regression per penalty type, then ridge ----
    'LogisticR_L1': {
        "model": LogisticRegression(random_state=42, max_iter=1000),
        "params": {
            'penalty': ['l1'],
            'solver': ['liblinear', 'saga'],
        },
    },
    'LogisticR_L2': {
        "model": LogisticRegression(random_state=42, max_iter=1000),
        "params": {
            'penalty': ['l2'],
            'solver': ['newton-cg', 'lbfgs', 'liblinear', 'sag', 'saga'],
        },
    },
    'LogisticR': {
        "model": LogisticRegression(random_state=42, max_iter=1000),
        "params": {
            'penalty': ['none'],
            'solver': ['newton-cg', 'lbfgs', 'sag', 'saga'],
        },
    },
    'RidgeClf': {
        "model": RidgeClassifier(max_iter=1000),
        "params": {},
    },

    # ---- support vector machines, one entry per kernel family ----
    'SVC_linear': {
        "model": SVC(random_state=42),
        "params": {
            'kernel': ['linear'],
            'C': [0.5, 1.0, 1.5, 2.0, 2.5],
        },
    },
    'SVC_poly': {
        "model": SVC(random_state=42),
        "params": {
            'kernel': ['poly'],
            'degree': [3, 4, 5],
            'gamma': ['scale', 'auto'],
            'C': [0.5, 1.0, 1.5, 2.0, 2.5],
        },
    },
    'SVC_others': {
        "model": SVC(random_state=42),
        "params": {
            'kernel': ['rbf', 'sigmoid'],
            'gamma': ['scale', 'auto'],
            'C': [0.5, 1.0, 1.5, 2.0, 2.5],
        },
    },

    # ---- naive Bayes, nearest neighbours, Gaussian process ----
    'GussianNB': {
        "model": GaussianNB(),
        "params": {},
    },
    'KNN': {
        "model": KNeighborsClassifier(),
        "params": {'n_neighbors': list(range(1, 20))},
    },
    'GaussianProcessClf': {
        "model": GaussianProcessClassifier(random_state=42, kernel=kls.RBF()),
        "params": {},
    },

    # ---- bagging / boosting over SVC or depth-limited decision trees ----
    'Bagging_SVC': {
        "model": BaggingClassifier(random_state=42),
        "params": {
            'n_estimators': list(range(5, 100, 5)),
            'base_estimator': [
                SVC(kernel='linear'),
                SVC(kernel='poly', degree=3, gamma='scale'),
            ],
        },
    },
    'BaggingDT': {
        "model": BaggingClassifier(random_state=42),
        "params": {
            'n_estimators': list(range(5, 100, 5)),
            'base_estimator': [
                DecisionTreeClassifier(random_state=42, max_depth=10),
                DecisionTreeClassifier(random_state=42, max_depth=50),
                DecisionTreeClassifier(random_state=42, max_depth=100),
            ],
        },
    },
    'AdaBoost': {
        "model": AdaBoostClassifier(random_state=42),
        "params": {
            'n_estimators': list(range(5, 100, 5)),
            'base_estimator': [
                DecisionTreeClassifier(random_state=42, max_depth=10),
                DecisionTreeClassifier(random_state=42, max_depth=50),
                DecisionTreeClassifier(random_state=42, max_depth=100),
            ],
        },
    },
    'ExtraTrees': {
        "model": ExtraTreesClassifier(random_state=42),
        "params": {
            'n_estimators': list(range(5, 105, 5)),
            'max_depth': [10, 50, 100, 250, 400],
        },
    },

    # ---- multi-layer perceptrons with one, two and three hidden layers ----
    'MLP_l1': {
        "model": MLPClassifier(random_state=42),
        "params": {
            'hidden_layer_sizes': [(h1,) for h1 in range(50, 600, 100)],
            'activation': ['logistic', 'tanh', 'relu'],
            'solver': ['adam', 'sgd'],
            'early_stopping': [True],
        },
    },
    'MLP_l2': {
        "model": MLPClassifier(random_state=42),
        "params": {
            'hidden_layer_sizes': [
                (h1, h2)
                for h1 in range(50, 600, 100)
                for h2 in range(50, 360, 100)
            ],
            'activation': ['logistic', 'tanh', 'relu'],
            'solver': ['adam', 'sgd'],
            'early_stopping': [True],
        },
    },
    'MLP_l3': {
        "model": MLPClassifier(random_state=42),
        "params": {
            'hidden_layer_sizes': [
                (h1, h2, h3)
                for h1 in range(50, 600, 100)
                for h2 in range(50, 600, 100)
                for h3 in range(50, 360, 100)
            ],
            'activation': ['logistic', 'tanh', 'relu'],
            'solver': ['adam', 'sgd'],
            'early_stopping': [True],
        },
    },
}

In [51]:
# Summary table with one row per classifier, filled in inside the loop below.
# NOTE: the column names are kept unchanged for compatibility, but
#   'Train_Accuracy' stores GridSearchCV.best_score_ (the cross-validated score
#                    of the best parameter setting, not a training accuracy), and
#   'Test_Accuracy'  stores the macro-averaged F1 on the held-out test split.
model_results = pd.DataFrame()
model_results['Train_Accuracy'] = None
model_results['Test_Accuracy'] = None
model_results['best_params'] = None

# 'UID' is only a row identifier, so it is dropped before fitting/predicting.
xtrain_final = X_train.drop(columns=["UID"])
ytrain_final = y_train

xtest_final = X_test.drop(columns=["UID"])
ytest_final = y_test

# scikit-learn estimators expect a 1-D target; y_train is a one-column
# DataFrame, so flatten it once instead of relying on implicit conversion.
ytrain_1d = ytrain_final.values.ravel()

best_clf_ours = None
# Start below any achievable F1 so the first fitted model is always accepted
# and best_clf_ours cannot still be None when it is used after the loop.
best_clf_val = float("-inf")

for clf_name, clf in clf_dict.items():
    # Grid-search this estimator's hyper-parameters with cross-validation.
    classifier = GridSearchCV(clf['model'], clf['params'], n_jobs=5)
    classifier.fit(xtrain_final, ytrain_1d)
    best_model = classifier.best_estimator_
    print(clf_name, classifier.best_score_, classifier.best_params_)

    # Score the refitted best estimator on the test split (macro F1).
    y_predicted = best_model.predict(xtest_final)
    test_acc_macro = f1_score(ytest_final, y_predicted, average='macro')

    # NOTE(review): the overall winner is chosen by its *test* score, so the
    # figures reported for it below are optimistically biased. Selecting on
    # classifier.best_score_ would avoid this, but would change which model
    # is reported.
    if test_acc_macro > best_clf_val:
        best_clf_val = test_acc_macro
        best_clf_ours = best_model

    model_results.loc[clf_name, ['Train_Accuracy', 'Test_Accuracy', 'best_params']] = [classifier.best_score_, test_acc_macro, classifier.best_params_]

# Detailed report for the best model found above.
print("================================================================================")
print(best_clf_ours)
best_y_hat = best_clf_ours.predict(xtest_final)
clsr = classification_report(ytest_final, best_y_hat)
print(clsr)
test_acc = accuracy_score(ytest_final, best_y_hat)
print("Test acc:", test_acc )
print("Weighted F1 score: ", f1_score(ytest_final, best_y_hat, average='weighted'))

DecisionTree 0.3419158361018826 {'max_depth': 10}
RandomForest 0.33970099667774084 {'max_depth': 10, 'n_estimators': 65}
LogisticR_L1 0.3648947951273533 {'penalty': 'l1', 'solver': 'saga'}
LogisticR_L2 0.3590808416389812 {'penalty': 'l2', 'solver': 'sag'}
LogisticR 0.3696013289036545 {'penalty': 'none', 'solver': 'newton-cg'}
RidgeClf 0.3701550387596899 {}
SVC_linear 0.3637873754152824 {'C': 0.5, 'kernel': 'linear'}
SVC_poly 0.3607419712070875 {'C': 2.5, 'degree': 4, 'gamma': 'scale', 'kernel': 'poly'}
SVC_others 0.35741971207087486 {'C': 2.5, 'gamma': 'scale', 'kernel': 'rbf'}
GussianNB 0.3438538205980066 {}
KNN 0.34246954595791806 {'n_neighbors': 18}
GaussianProcessClf 0.3507751937984496 {}
Bagging_SVC 0.3648947951273533 {'base_estimator': SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
    decision_function_shape='ovr', degree=3, gamma='auto_deprecated',
    kernel='linear', max_iter=-1, probability=False, random_state=None,
    shrinking=True, tol=0.001, verbose=False), 'n

In [52]:
# Show the per-model summary. 'Train_Accuracy' holds GridSearchCV's best_score_
# and 'Test_Accuracy' holds the macro-averaged F1 on the test split.
model_results

Unnamed: 0,Train_Accuracy,Test_Accuracy,best_params
DecisionTree,0.341916,0.325685,{'max_depth': 10}
RandomForest,0.339701,0.332902,"{'max_depth': 10, 'n_estimators': 65}"
LogisticR_L1,0.364895,0.337471,"{'penalty': 'l1', 'solver': 'saga'}"
LogisticR_L2,0.359081,0.333438,"{'penalty': 'l2', 'solver': 'sag'}"
LogisticR,0.369601,0.334593,"{'penalty': 'none', 'solver': 'newton-cg'}"
RidgeClf,0.370155,0.342032,{}
SVC_linear,0.363787,0.33438,"{'C': 0.5, 'kernel': 'linear'}"
SVC_poly,0.360742,0.303405,"{'C': 2.5, 'degree': 4, 'gamma': 'scale', 'ker..."
SVC_others,0.35742,0.309608,"{'C': 2.5, 'gamma': 'scale', 'kernel': 'rbf'}"
GussianNB,0.343854,0.317988,{}
