In [129]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from scipy.sparse import hstack
from sklearn.base import BaseEstimator, TransformerMixin
from scipy import sparse
import lightgbm
from sklearn.model_selection import train_test_split 
from sklearn.metrics import f1_score
from sklearn.feature_extraction.text  import CountVectorizer
import starterkit
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import LabelBinarizer
import os
import io

In [50]:
from importlib import reload

reload(starterkit)

<module 'starterkit' from 'D:\\Machine Learning\\Notebooks\\EmoContext\\starterkit.py'>

In [104]:
train_data = pd.read_csv('D:\\Machine Learning\\Datasets\\EmoContext\\train.txt',
                        sep='\t', index_col='id')
X = train_data.iloc[:,:-1]
y = train_data.iloc[:,-1]
le = LabelEncoder()
y = le.fit_transform(y)

In [106]:
dev_data = pd.read_csv('D:\\Machine Learning\\Datasets\\EmoContext\\dev.txt',
                        sep='\t', index_col='id')
X_dev = dev_data.iloc[:,:-1]
y_dev = dev_data.iloc[:,-1]
y_dev = le.transform(y_dev)

In [7]:
def lightGbmTestScore(X, y):
    train_x, val_x, train_y, val_y = train_test_split(X,
                                                  y,
                                                 test_size=0.2,
                                                 random_state=42)
    clf = lightgbm.LGBMClassifier(random_state=42)
    clf.fit(train_x, train_y)
    val_pred = clf.predict(val_x)
    return f1_score(y_true=val_y, y_pred=val_pred,average='micro' )

In [8]:
def lightGbmTraingScore(X, y):
    clf = lightgbm.LGBMClassifier(random_state=42)
    clf.fit(X, y)
    pred = clf.predict(X)
    return f1_score(y_true=y, y_pred=pred,average='micro' )

In [89]:
def lightGbmValidationKitScore(X, y):
    train_x, val_x, train_y, val_y = train_test_split(X,
                                                  y,
                                                 test_size=0.2,
                                                 random_state=42)
    clf = lightgbm.LGBMClassifier(random_state=42)
    clf.fit(train_x, train_y)
    val_pred = clf.predict(val_x)
    lb = LabelBinarizer()
    ground = lb.fit_transform(val_y)
    predictions = lb.transform(val_pred)
    return  starterkit.getMetrics(predictions, ground)

In [222]:
def lightGbmTrainDevKitScore(X_train, y_train, X_dev, y_dev, showTrainingScore=False):
    return trainDevKitScore(lightgbm.LGBMClassifier(random_state=42),X_train, y_train,
                            X_dev, y_dev,showTrainingScore)

In [210]:
def trainDevKitScore(clf, X_train, y_train, X_dev, y_dev, showTrainingScore=False):
    clf.fit(X_train, y_train)
    
    lb = LabelBinarizer()
    lb.fit(y_train)
    
    if showTrainingScore:
        print('Training score \n')
        train_pred = clf.predict(X_train)
        ground = lb.transform(y_train)
        predictions = lb.transform(train_pred)
        starterkit.getMetrics(predictions=predictions, ground=ground)
    
    print('\nDev score \n')
    dev_pred = clf.predict(X_dev)
    ground = lb.transform(y_dev)
    predictions = lb.transform(dev_pred)
    return  starterkit.getMetrics(predictions=predictions, ground=ground)

In [86]:
def lightGbmTrainingKitScore(X, y):
    clf = lightgbm.LGBMClassifier()
    clf.fit(X, y)
    pred = clf.predict(X)
    lb = LabelBinarizer()
    ground = lb.fit_transform(y)
    predictions = lb.transform(pred)
    return  starterkit.getMetrics(predictions, ground)

In [258]:
def load_feats(feats_dir, folder):
    dict_to_feed = {}
    full_path = os.path.join(feats_dir, folder)
    files = [f for f in os.listdir(full_path) 
             if os.path.isfile(os.path.join(full_path, f)) and f[-4:] =='.npz']
    
    for f in files:
        dict_to_feed[f[:-4]] = sparse.load_npz(os.path.join(full_path, f))
    return dict_to_feed    

In [238]:
feats_path = 'D:\\Machine Learning\\Datasets\\EmoContext\\Features'

train_feats_dict = load_feats(feats_path, 'train')
dev_feats_dict = load_feats(feats_path, 'dev')
test_feats_dict = load_feats(feats_path, 'test')

train_feats = hstack(list(train_feats_dict.values()))
dev_feats = hstack(list(dev_feats_dict.values()))
test_feats = hstack(list(test_feats_dict.values()))

# Testing with some basic combination of features

In [None]:
### Basic features only

In [237]:
lightGbmTrainDevKitScore(train_feats, y, dev_feats, y_dev, True)

Training score 





True Positives per class :  [ 4195.  3213. 13790.  4194.]
False Positives per class :  [ 878.  439. 2795.  656.]
False Negatives per class :  [1311. 1030. 1158. 1269.]
Class happy : Precision : 0.880, Recall : 0.757, F1 : 0.814
Class sad : Precision : 0.831, Recall : 0.923, F1 : 0.875
Class angry : Precision : 0.865, Recall : 0.768, F1 : 0.813
Ignoring the Others class, Macro Precision : 0.8587, Macro Recall : 0.8158, Macro F1 : 0.8367
Ignoring the Others class, Micro TP : 21197, FP : 3890, FN : 3457
Accuracy : 0.8419, Micro Precision : 0.8449, Micro Recall : 0.8598, Micro F1 : 0.8523

Dev score 

True Positives per class :  [ 105.   87. 2095.   76.]
False Positives per class :  [143.  76. 112.  61.]
False Negatives per class :  [ 45.  55. 243.  49.]
Class happy : Precision : 0.534, Recall : 0.613, F1 : 0.570
Class sad : Precision : 0.949, Recall : 0.896, F1 : 0.922
Class angry : Precision : 0.555, Recall : 0.608, F1 : 0.580
Ignoring the Others class, Macro Precision : 0.6792, Macro Re



(0.8577132486388385,
 0.9006781013163143,
 0.8667946257197697,
 0.8834115805946793)

In [None]:
### Lexicon only

In [240]:
lightGbmTrainDevKitScore(train_feats_dict['dm_lexicon_feats'], y,
                         dev_feats_dict['dm_lexicon_feats'], y_dev, True)

Training score 

True Positives per class :  [ 3316.  2244. 13864.  2575.]
False Positives per class :  [ 773.  596. 6248.  544.]
False Negatives per class :  [2190. 1999. 1084. 2888.]
Class happy : Precision : 0.790, Recall : 0.529, F1 : 0.634
Class sad : Precision : 0.689, Recall : 0.927, F1 : 0.791
Class angry : Precision : 0.826, Recall : 0.471, F1 : 0.600
Ignoring the Others class, Macro Precision : 0.7684, Macro Recall : 0.6426, Macro F1 : 0.6999
Ignoring the Others class, Micro TP : 18683, FP : 7388, FN : 5971
Accuracy : 0.7294, Micro Precision : 0.7166, Micro Recall : 0.7578, Micro F1 : 0.7366

Dev score 

True Positives per class :  [  83.   50. 2047.   51.]
False Positives per class :  [121. 104. 208.  91.]
False Negatives per class :  [ 67.  92. 291.  74.]
Class happy : Precision : 0.325, Recall : 0.352, F1 : 0.338
Class sad : Precision : 0.908, Recall : 0.876, F1 : 0.891
Class angry : Precision : 0.359, Recall : 0.408, F1 : 0.382
Ignoring the Others class, Macro Precision :

(0.8098003629764066,
 0.8420227361818895,
 0.8245681381957773,
 0.8332040341349883)

In [242]:
lightGbmTrainDevKitScore(train_feats_dict['nrc_lexicon_feats'], y,
                         dev_feats_dict['nrc_lexicon_feats'], y_dev, True)

Training score 

True Positives per class :  [ 2983.    79. 13565.   995.]
False Positives per class :  [2871.   56. 8734.  877.]
False Negatives per class :  [2523. 4164. 1383. 4468.]
Class happy : Precision : 0.585, Recall : 0.019, F1 : 0.036
Class sad : Precision : 0.608, Recall : 0.907, F1 : 0.728
Class angry : Precision : 0.532, Recall : 0.182, F1 : 0.271
Ignoring the Others class, Macro Precision : 0.5750, Macro Recall : 0.3694, Macro F1 : 0.4498
Ignoring the Others class, Micro TP : 14639, FP : 9667, FN : 10015
Accuracy : 0.5843, Micro Precision : 0.6023, Micro Recall : 0.5938, Micro F1 : 0.5980

Dev score 

True Positives per class :  [7.100e+01 1.000e+00 2.122e+03 2.300e+01]
False Positives per class :  [217.   6. 260.  55.]
False Negatives per class :  [ 79. 141. 216. 102.]
Class happy : Precision : 0.143, Recall : 0.007, F1 : 0.013
Class sad : Precision : 0.891, Recall : 0.908, F1 : 0.899
Class angry : Precision : 0.295, Recall : 0.184, F1 : 0.227
Ignoring the Others class, 

(0.8047186932849365,
 0.8698824483177949,
 0.8238003838771593,
 0.8462145110410094)

In [243]:
lightGbmTrainDevKitScore(hstack([train_feats_dict['nrc_lexicon_feats'],
                                train_feats_dict['dm_lexicon_feats']]), y,
                         hstack([dev_feats_dict['nrc_lexicon_feats'],
                                 dev_feats_dict['dm_lexicon_feats']]), y_dev, True)

Training score 

True Positives per class :  [ 3881.  2297. 13724.  2938.]
False Positives per class :  [1038.  551. 5032.  699.]
False Negatives per class :  [1625. 1946. 1224. 2525.]
Class happy : Precision : 0.807, Recall : 0.541, F1 : 0.648
Class sad : Precision : 0.732, Recall : 0.918, F1 : 0.814
Class angry : Precision : 0.808, Recall : 0.538, F1 : 0.646
Ignoring the Others class, Macro Precision : 0.7820, Macro Recall : 0.6658, Macro F1 : 0.7192
Ignoring the Others class, Micro TP : 18959, FP : 6282, FN : 5695
Accuracy : 0.7573, Micro Precision : 0.7511, Micro Recall : 0.7690, Micro F1 : 0.7600

Dev score 

True Positives per class :  [  94.   54. 2050.   66.]
False Positives per class :  [130. 106. 161.  94.]
False Negatives per class :  [ 56.  88. 288.  59.]
Class happy : Precision : 0.338, Recall : 0.380, F1 : 0.358
Class sad : Precision : 0.927, Recall : 0.877, F1 : 0.901
Class angry : Precision : 0.412, Recall : 0.528, F1 : 0.463
Ignoring the Others class, Macro Precision :

(0.8217785843920146,
 0.8573686290003951,
 0.8330134357005758,
 0.8450155763239876)

In [None]:
### Emojis

In [18]:
emoji_feats

<30160x196 sparse matrix of type '<class 'numpy.int64'>'
	with 7196 stored elements in COOrdinate format>

In [245]:
lightGbmTrainDevKitScore(train_feats_dict['emoji_feats'].astype(np.float64), y,
                         dev_feats_dict['emoji_feats'].astype(np.float64), y_dev, True)

Training score 





True Positives per class :  [  276.  1390. 14602.  1043.]
False Positives per class :  [   92.   270. 12339.   148.]
False Negatives per class :  [5230. 2853.  346. 4420.]
Class happy : Precision : 0.837, Recall : 0.328, F1 : 0.471
Class sad : Precision : 0.542, Recall : 0.977, F1 : 0.697
Class angry : Precision : 0.876, Recall : 0.191, F1 : 0.313
Ignoring the Others class, Macro Precision : 0.7517, Macro Recall : 0.4985, Macro F1 : 0.5994
Ignoring the Others class, Micro TP : 17035, FP : 12757, FN : 7619
Accuracy : 0.5740, Micro Precision : 0.5718, Micro Recall : 0.6910, Micro F1 : 0.6258

Dev score 

True Positives per class :  [   9.   55. 2291.   15.]
False Positives per class :  [  6.  31. 336.  12.]
False Negatives per class :  [141.  87.  47. 110.]
Class happy : Precision : 0.640, Recall : 0.387, F1 : 0.482
Class sad : Precision : 0.872, Recall : 0.980, F1 : 0.923
Class angry : Precision : 0.556, Recall : 0.120, F1 : 0.197
Ignoring the Others class, Macro Precision : 0.6891, Mac



(0.8602540834845736,
 0.8616788321167883,
 0.9063339731285989,
 0.8834424695977549)

In [None]:
###Count_vectors

In [79]:
ctv = CountVectorizer(analyzer='word',token_pattern=r'\w{1,}',
            ngram_range=(1, 3), stop_words = 'english')
full_conv = X[['turn1', 'turn2', 'turn3']].apply(lambda x: ' '.join(x), axis=1)
full_conv_CVect = ctv.fit_transform(full_conv)

full_conv_dev = X_dev[['turn1', 'turn2', 'turn3']].apply(lambda x: ' '.join(x), axis=1)
full_conv_CVect_dev = ctv.transform(full_conv_dev)

In [250]:
lightGbmTrainDevKitScore(full_conv_CVect.astype(np.float64), y,
                       full_conv_CVect_dev.astype(np.float64), y_dev, True)

Training score 

True Positives per class :  [ 4398.  2720. 13569.  3408.]
False Positives per class :  [ 707.  786. 3935.  637.]
False Negatives per class :  [1108. 1523. 1379. 2055.]
Class happy : Precision : 0.776, Recall : 0.641, F1 : 0.702
Class sad : Precision : 0.775, Recall : 0.908, F1 : 0.836
Class angry : Precision : 0.843, Recall : 0.624, F1 : 0.717
Ignoring the Others class, Macro Precision : 0.7978, Macro Recall : 0.7242, Macro F1 : 0.7592
Ignoring the Others class, Micro TP : 19697, FP : 5358, FN : 4957
Accuracy : 0.7989, Micro Precision : 0.7862, Micro Recall : 0.7989, Micro F1 : 0.7925

Dev score 

True Positives per class :  [ 122.   74. 2067.   76.]
False Positives per class :  [100. 117. 122.  77.]
False Negatives per class :  [ 28.  68. 271.  49.]
Class happy : Precision : 0.387, Recall : 0.521, F1 : 0.444
Class sad : Precision : 0.944, Recall : 0.884, F1 : 0.913
Class angry : Precision : 0.497, Recall : 0.608, F1 : 0.547
Ignoring the Others class, Macro Precision :

(0.8490018148820326, 0.875246742992499, 0.8510556621880998, 0.8629817049435579)

In [None]:
### Manual features + Count_vectors 

In [247]:
lightGbmTrainDevKitScore(hstack([full_conv_CVect, train_feats]), y,
                         hstack([full_conv_CVect_dev, dev_feats]), y_dev, True)

Training score 

True Positives per class :  [ 4737.  3727. 14067.  4677.]
False Positives per class :  [ 430.  457. 1649.  416.]
False Negatives per class :  [769. 516. 881. 786.]
Class happy : Precision : 0.891, Recall : 0.878, F1 : 0.885
Class sad : Precision : 0.895, Recall : 0.941, F1 : 0.917
Class angry : Precision : 0.918, Recall : 0.856, F1 : 0.886
Ignoring the Others class, Macro Precision : 0.9014, Macro Recall : 0.8919, Macro F1 : 0.8966
Ignoring the Others class, Micro TP : 22471, FP : 2522, FN : 2183
Accuracy : 0.9021, Micro Precision : 0.8991, Micro Recall : 0.9115, Micro F1 : 0.9052

Dev score 

True Positives per class :  [ 119.  108. 2153.   95.]
False Positives per class :  [79. 81. 81. 39.]
False Negatives per class :  [ 31.  34. 185.  30.]
Class happy : Precision : 0.571, Recall : 0.761, F1 : 0.653
Class sad : Precision : 0.964, Recall : 0.921, F1 : 0.942
Class angry : Precision : 0.709, Recall : 0.760, F1 : 0.734
Ignoring the Others class, Macro Precision : 0.7480,



(0.8983666061705989, 0.9213922565506453, 0.9044145873320537, 0.912824486633088)

In [None]:
### using lemmatized train data for count vect

In [252]:
train_data_lemmatized = pd.read_csv(features_path+r'\X_repld_low_lemmatized.csv')
dev_data_lemmatized = pd.read_csv(features_path+r'\dev_repld_low_lemmatized.csv')

In [255]:
ctv_lem = CountVectorizer(analyzer='word',token_pattern=r'\w{1,}',
            ngram_range=(1, 3), stop_words = 'english')
full_conv_lem = train_data_lemmatized[['turn1', 'turn2', 'turn3']].apply(lambda x: ' '.join(x), axis=1)
full_conv_CVect_lem = ctv_lem.fit_transform(full_conv_lem)
full_conv_lem_dev = dev_data_lemmatized[['turn1', 'turn2', 'turn3']].apply(lambda x: ' '.join(x), axis=1)
full_conv_CVect_lem_dev = ctv_lem.transform(full_conv_lem_dev)

In [257]:
lightGbmTrainDevKitScore(hstack([full_conv_CVect_lem, train_feats]), y,
                         hstack([full_conv_CVect_lem_dev, dev_feats]), y_dev, True)

Training score 





True Positives per class :  [ 4746.  3729. 14036.  4680.]
False Positives per class :  [ 446.  483. 1622.  418.]
False Negatives per class :  [760. 514. 912. 783.]
Class happy : Precision : 0.885, Recall : 0.879, F1 : 0.882
Class sad : Precision : 0.896, Recall : 0.939, F1 : 0.917
Class angry : Precision : 0.918, Recall : 0.857, F1 : 0.886
Ignoring the Others class, Macro Precision : 0.8999, Macro Recall : 0.8915, Macro F1 : 0.8957
Ignoring the Others class, Micro TP : 22445, FP : 2523, FN : 2209
Accuracy : 0.9016, Micro Precision : 0.8990, Micro Recall : 0.9104, Micro F1 : 0.9046

Dev score 

True Positives per class :  [ 118.  106. 2168.   95.]
False Positives per class :  [66. 74. 84. 44.]
False Negatives per class :  [ 32.  36. 170.  30.]
Class happy : Precision : 0.589, Recall : 0.746, F1 : 0.658
Class sad : Precision : 0.963, Recall : 0.927, F1 : 0.945
Class angry : Precision : 0.683, Recall : 0.760, F1 : 0.720
Ignoring the Others class, Macro Precision : 0.7450, Macro Recall : 0



(0.9027223230490018, 0.9214313496693893, 0.909404990403071, 0.9153786707882534)

In [None]:
### Choose top feats from CountVector

In [259]:
clf = lightgbm.LGBMClassifier()
clf.fit(full_conv_CVect.astype(np.float64), y)

LGBMClassifier(boosting_type='gbdt', class_weight=None, colsample_bytree=1.0,
        importance_type='split', learning_rate=0.1, max_depth=-1,
        min_child_samples=20, min_child_weight=0.001, min_split_gain=0.0,
        n_estimators=100, n_jobs=-1, num_leaves=31, objective=None,
        random_state=None, reg_alpha=0.0, reg_lambda=0.0, silent=True,
        subsample=1.0, subsample_for_bin=200000, subsample_freq=0)

In [260]:
feat_import = dict(zip(np.arange(full_conv_CVect.shape[1]), clf.feature_importances_))
feat_import = sorted(feat_import.items(), key=lambda value: value[1], reverse=True)
feat_import = {v[0]:v[1] for v in feat_import if v[1]>0}

In [261]:
len(feat_import)

810

In [266]:
features_most_import = full_conv_CVect.tocsc()[:,[*feat_import]]
features_most_import_dev = full_conv_CVect_dev.tocsc()[:,[*feat_import]]

In [268]:
lightGbmTrainDevKitScore(features_most_import.astype(np.float64), y,
                         features_most_import_dev.astype(np.float64), y_dev, True)

Training score 

True Positives per class :  [ 4398.  2720. 13569.  3408.]
False Positives per class :  [ 707.  786. 3935.  637.]
False Negatives per class :  [1108. 1523. 1379. 2055.]
Class happy : Precision : 0.776, Recall : 0.641, F1 : 0.702
Class sad : Precision : 0.775, Recall : 0.908, F1 : 0.836
Class angry : Precision : 0.843, Recall : 0.624, F1 : 0.717
Ignoring the Others class, Macro Precision : 0.7978, Macro Recall : 0.7242, Macro F1 : 0.7592
Ignoring the Others class, Micro TP : 19697, FP : 5358, FN : 4957
Accuracy : 0.7989, Micro Precision : 0.7862, Micro Recall : 0.7989, Micro F1 : 0.7925

Dev score 

True Positives per class :  [ 122.   74. 2067.   76.]
False Positives per class :  [100. 117. 122.  77.]
False Negatives per class :  [ 28.  68. 271.  49.]
Class happy : Precision : 0.387, Recall : 0.521, F1 : 0.444
Class sad : Precision : 0.944, Recall : 0.884, F1 : 0.913
Class angry : Precision : 0.497, Recall : 0.608, F1 : 0.547
Ignoring the Others class, Macro Precision :

(0.8490018148820326, 0.875246742992499, 0.8510556621880998, 0.8629817049435579)

In [269]:
lightGbmTrainDevKitScore(hstack([features_most_import, train_feats]), y,
                         hstack([features_most_import_dev, dev_feats]), y_dev, True)

Training score 





True Positives per class :  [ 4744.  3722. 14052.  4665.]
False Positives per class :  [ 449.  466. 1654.  408.]
False Negatives per class :  [762. 521. 896. 798.]
Class happy : Precision : 0.889, Recall : 0.877, F1 : 0.883
Class sad : Precision : 0.895, Recall : 0.940, F1 : 0.917
Class angry : Precision : 0.920, Recall : 0.854, F1 : 0.886
Ignoring the Others class, Macro Precision : 0.9010, Macro Recall : 0.8904, Macro F1 : 0.8957
Ignoring the Others class, Micro TP : 22439, FP : 2528, FN : 2215
Accuracy : 0.9013, Micro Precision : 0.8987, Micro Recall : 0.9102, Micro F1 : 0.9044

Dev score 

True Positives per class :  [ 122.  108. 2159.   93.]
False Positives per class :  [77. 77. 81. 38.]
False Negatives per class :  [ 28.  34. 179.  32.]
Class happy : Precision : 0.584, Recall : 0.761, F1 : 0.661
Class sad : Precision : 0.964, Recall : 0.923, F1 : 0.943
Class angry : Precision : 0.710, Recall : 0.744, F1 : 0.727
Ignoring the Others class, Macro Precision : 0.7525, Macro Recall : 0



(0.9009074410163339,
 0.9233176838810642,
 0.9059500959692899,
 0.9145514435186979)

# Summary so far 

Dev scores for different set of features:

Manual features Only ==================
Class happy : Precision : 0.534, Recall : 0.613, F1 : 0.570
Class sad : Precision : 0.949, Recall : 0.896, F1 : 0.922
Class angry : Precision : 0.555, Recall : 0.608, F1 : 0.580
Ignoring the Others class, Macro Precision : 0.6792, Macro Recall : 0.7056, Macro F1 : 0.6922
Ignoring the Others class, Micro TP : 2258, FP : 249, FN : 347
Accuracy : 0.8577, Micro Precision : 0.9007, Micro Recall : 0.8668, Micro F1 : 0.8834

nrc_lexicon_feats and dm_lexicon_feats only ===========

Class happy : Precision : 0.338, Recall : 0.380, F1 : 0.358
Class sad : Precision : 0.927, Recall : 0.877, F1 : 0.901
Class angry : Precision : 0.412, Recall : 0.528, F1 : 0.463
Ignoring the Others class, Macro Precision : 0.5591, Macro Recall : 0.5950, Macro F1 : 0.5765
Ignoring the Others class, Micro TP : 2170, FP : 361, FN : 435
Accuracy : 0.8218, Micro Precision : 0.8574, Micro Recall : 0.8330, Micro F1 : 0.8450

emojis only ==============

Class happy : Precision : 0.640, Recall : 0.387, F1 : 0.482
Class sad : Precision : 0.872, Recall : 0.980, F1 : 0.923
Class angry : Precision : 0.556, Recall : 0.120, F1 : 0.197
Ignoring the Others class, Macro Precision : 0.6891, Macro Recall : 0.4957, Macro F1 : 0.5766
Ignoring the Others class, Micro TP : 2361, FP : 379, FN : 244
Accuracy : 0.8603, Micro Precision : 0.8617, Micro Recall : 0.9063, Micro F1 : 0.8834

Word Count Vectors only  ============

Class happy : Precision : 0.387, Recall : 0.521, F1 : 0.444
Class sad : Precision : 0.944, Recall : 0.884, F1 : 0.913
Class angry : Precision : 0.497, Recall : 0.608, F1 : 0.547
Ignoring the Others class, Macro Precision : 0.6095, Macro Recall : 0.6711, Macro F1 : 0.6388
Ignoring the Others class, Micro TP : 2217, FP : 316, FN : 388
Accuracy : 0.8490, Micro Precision : 0.8752, Micro Recall : 0.8511, Micro F1 : 0.8630

Manual + Word Count Vect =======

Class happy : Precision : 0.571, Recall : 0.761, F1 : 0.653
Class sad : Precision : 0.964, Recall : 0.921, F1 : 0.942
Class angry : Precision : 0.709, Recall : 0.760, F1 : 0.734
Ignoring the Others class, Macro Precision : 0.7480, Macro Recall : 0.8138, Macro F1 : 0.7795
Ignoring the Others class, Micro TP : 2356, FP : 201, FN : 249
Accuracy : 0.8984, Micro Precision : 0.9214, Micro Recall : 0.9044, Micro F1 : 0.9128

Word count vect LightGBM most important + Manual Feats ====

Class happy : Precision : 0.584, Recall : 0.761, F1 : 0.661
Class sad : Precision : 0.964, Recall : 0.923, F1 : 0.943
Class angry : Precision : 0.710, Recall : 0.744, F1 : 0.727
Ignoring the Others class, Macro Precision : 0.7525, Macro Recall : 0.8093, Macro F1 : 0.7799
Ignoring the Others class, Micro TP : 2360, FP : 196, FN : 245
Accuracy : 0.9009, Micro Precision : 0.9233, Micro Recall : 0.9060, Micro F1 : 0.9146


In [82]:
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.linear_model import LogisticRegression
from sklearn.feature_selection import SelectFromModel

# Feature selection with Logistic Regeression

In [None]:
ctv = CountVectorizer(analyzer='word',token_pattern=r'\w{1,}',
            ngram_range=(1, 3), stop_words = 'english')
full_conv = X[['turn1', 'turn2', 'turn3']].apply(lambda x: ' '.join(x), axis=1)
full_conv_CVect = ctv.fit_transform(full_conv)
full_conv_dev = X_dev[['turn1', 'turn2', 'turn3']].apply(lambda x: ' '.join(x), axis=1)
full_conv_CVect_dev = ctv.transform(full_conv_dev)

In [285]:
print('Before Selection: ', full_conv_CVect.shape)
lr = LogisticRegression(penalty='l1')
lr.fit(full_conv_CVect, y)
model = SelectFromModel(lr, prefit=True)
X_new = model.transform(full_conv_CVect)
X_new_dev = model.transform(full_conv_CVect_dev)
print('After Selection:', X_new.shape)

Before Selection:  (30160, 228952)




After Selection: (30160, 5483)


In [228]:
lightGbmTrainDevKitScore(hstack([X_new, train_feats]), y,
                         hstack([X_new_dev, dev_feats]), y_dev, True)

Training score 





True Positives per class :  [ 4739.  3719. 14061.  4680.]
False Positives per class :  [ 437.  459. 1653.  412.]
False Negatives per class :  [767. 524. 887. 783.]
Class happy : Precision : 0.890, Recall : 0.877, F1 : 0.883
Class sad : Precision : 0.895, Recall : 0.941, F1 : 0.917
Class angry : Precision : 0.919, Recall : 0.857, F1 : 0.887
Ignoring the Others class, Macro Precision : 0.9013, Macro Recall : 0.8913, Macro F1 : 0.8963
Ignoring the Others class, Micro TP : 22460, FP : 2524, FN : 2194
Accuracy : 0.9018, Micro Precision : 0.8990, Micro Recall : 0.9110, Micro F1 : 0.9050

Dev score 

True Positives per class :  [ 120.  107. 2155.   95.]
False Positives per class :  [79. 80. 82. 37.]
False Negatives per class :  [ 30.  35. 183.  30.]
Class happy : Precision : 0.572, Recall : 0.754, F1 : 0.650
Class sad : Precision : 0.963, Recall : 0.922, F1 : 0.942
Class angry : Precision : 0.720, Recall : 0.760, F1 : 0.739
Ignoring the Others class, Macro Precision : 0.7517, Macro Recall : 0



(0.899092558983666, 0.9221439749608764, 0.9047984644913628, 0.9133888781243944)

In [231]:
print('Before Selection: ', hstack([X_new, all_feats]).shape)
lr = LogisticRegression(penalty='l1')
lr.fit(hstack([X_new, train_feats]), y)
model = SelectFromModel(lr, prefit=True)
X_new_2 = model.transform(hstack([X_new, train_feats]))
X_new_2_dev = model.transform(hstack([X_new_dev, dev_feats]))
print('After Selection:', X_new_2.shape)

Before Selection:  (30160, 5859)




After Selection: (30160, 3745)


In [232]:
lightGbmTrainDevKitScore(X_new_2, y, X_new_2_dev, y_dev, True)

Training score 

True Positives per class :  [ 4730.  3731. 14053.  4676.]
False Positives per class :  [ 437.  468. 1651.  414.]
False Negatives per class :  [776. 512. 895. 787.]
Class happy : Precision : 0.889, Recall : 0.879, F1 : 0.884
Class sad : Precision : 0.895, Recall : 0.940, F1 : 0.917
Class angry : Precision : 0.919, Recall : 0.856, F1 : 0.886
Ignoring the Others class, Macro Precision : 0.9007, Macro Recall : 0.8918, Macro F1 : 0.8962
Ignoring the Others class, Micro TP : 22460, FP : 2533, FN : 2194
Accuracy : 0.9015, Micro Precision : 0.8987, Micro Recall : 0.9110, Micro F1 : 0.9048

Dev score 

True Positives per class :  [ 122.  107. 2158.   92.]
False Positives per class :  [80. 78. 83. 35.]
False Negatives per class :  [ 28.  35. 180.  33.]
Class happy : Precision : 0.578, Recall : 0.754, F1 : 0.654
Class sad : Precision : 0.963, Recall : 0.923, F1 : 0.943
Class angry : Precision : 0.724, Recall : 0.736, F1 : 0.730
Ignoring the Others class, Macro Precision : 0.7553,

(0.8998185117967332,
 0.9232275754014885,
 0.9047984644913628,
 0.9139201240791005)

In [294]:
print('Before Selection: ', hstack([full_conv_CVect, train_feats]).shape)
lr = LogisticRegression(penalty='l1')
lr.fit(hstack([full_conv_CVect, train_feats]), y)
model = SelectFromModel(lr, prefit=True)
X_new_3 = model.transform(hstack([full_conv_CVect, train_feats]))
X_new_dev_3 = model.transform(hstack([full_conv_CVect_dev, dev_feats]))
print('After Selection:', X_new_3.shape)
lightGbmTrainDevKitScore(X_new_3, y, X_new_dev_3, y_dev, True)

Before Selection:  (30160, 229340)




After Selection: (30160, 4344)
Training score 

True Positives per class :  [ 4730.  3731. 14053.  4676.]
False Positives per class :  [ 437.  468. 1651.  414.]
False Negatives per class :  [776. 512. 895. 787.]
Class happy : Precision : 0.889, Recall : 0.879, F1 : 0.884
Class sad : Precision : 0.895, Recall : 0.940, F1 : 0.917
Class angry : Precision : 0.919, Recall : 0.856, F1 : 0.886
Ignoring the Others class, Macro Precision : 0.9007, Macro Recall : 0.8918, Macro F1 : 0.8962
Ignoring the Others class, Micro TP : 22460, FP : 2533, FN : 2194
Accuracy : 0.9015, Micro Precision : 0.8987, Micro Recall : 0.9110, Micro F1 : 0.9048

Dev score 

True Positives per class :  [ 122.  107. 2158.   92.]
False Positives per class :  [80. 78. 83. 35.]
False Negatives per class :  [ 28.  35. 180.  33.]
Class happy : Precision : 0.578, Recall : 0.754, F1 : 0.654
Class sad : Precision : 0.963, Recall : 0.923, F1 : 0.943
Class angry : Precision : 0.724, Recall : 0.736, F1 : 0.730
Ignoring the Others c

(0.8998185117967332,
 0.9232275754014885,
 0.9047984644913628,
 0.9139201240791005)

# Summary for Logistic Regression Feats Selection

Dev score
 
Feature selection from the VectorCount and then stacking with the manual feats ====

Class happy : Precision : 0.572, Recall : 0.754, F1 : 0.650
Class sad : Precision : 0.963, Recall : 0.922, F1 : 0.942
Class angry : Precision : 0.720, Recall : 0.760, F1 : 0.739
Ignoring the Others class, Macro Precision : 0.7517, Macro Recall : 0.8117, Macro F1 : 0.7806
Ignoring the Others class, Micro TP : 2357, FP : 199, FN : 248
Accuracy : 0.8991, Micro Precision : 0.9221, Micro Recall : 0.9048, Micro F1 : 0.9134

Feature selection from the result from the first selection and the manual feats

Class happy : Precision : 0.578, Recall : 0.754, F1 : 0.654
Class sad : Precision : 0.963, Recall : 0.923, F1 : 0.943
Class angry : Precision : 0.724, Recall : 0.736, F1 : 0.730
Ignoring the Others class, Macro Precision : 0.7553, Macro Recall : 0.8042, Macro F1 : 0.7789
Ignoring the Others class, Micro TP : 2357, FP : 196, FN : 248
Accuracy : 0.8998, Micro Precision : 0.9232, Micro Recall : 0.9048, Micro F1 : 0.9139

Feature selection from the VectorCount and the manual feats ====
 
Class happy : Precision : 0.578, Recall : 0.754, F1 : 0.654
Class sad : Precision : 0.963, Recall : 0.923, F1 : 0.943
Class angry : Precision : 0.724, Recall : 0.736, F1 : 0.730
Ignoring the Others class, Macro Precision : 0.7553, Macro Recall : 0.8042, Macro F1 : 0.7789
Ignoring the Others class, Micro TP : 2357, FP : 196, FN : 248
Accuracy : 0.8998, Micro Precision : 0.9232, Micro Recall : 0.9048, Micro F1 : 0.9139

# Feature selection with Linear Discriminant Analysis

In [133]:
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis as LDA

In [288]:
sklearn_lda_2000 = LDA(n_components=2000)
X_train_lda_2000 = sklearn_lda_2000.fit_transform(hstack([X_new, train_feats]).toarray(), y)
X_dev_lda_2000 = sklearn_lda_2000.transform(hstack([X_new_dev, dev_feats]).toarray())
lightGbmTrainDevKitScore(X_train_lda_2000, y, X_dev_lda_2000, y_dev, True)

Training score 

True Positives per class :  [ 5204.  3991. 14192.  5116.]
False Positives per class :  [277. 326. 692. 362.]
False Negatives per class :  [302. 252. 756. 347.]
Class happy : Precision : 0.924, Recall : 0.941, F1 : 0.932
Class sad : Precision : 0.954, Recall : 0.949, F1 : 0.951
Class angry : Precision : 0.934, Recall : 0.936, F1 : 0.935
Ignoring the Others class, Macro Precision : 0.9373, Macro Recall : 0.9422, Macro F1 : 0.9397
Ignoring the Others class, Micro TP : 23299, FP : 1380, FN : 1355
Accuracy : 0.9451, Micro Precision : 0.9441, Micro Recall : 0.9450, Micro F1 : 0.9446

Dev score 

True Positives per class :  [ 117.   93. 1969.   85.]
False Positives per class :  [126. 152.  87. 126.]
False Negatives per class :  [ 33.  49. 369.  40.]
Class happy : Precision : 0.380, Recall : 0.655, F1 : 0.481
Class sad : Precision : 0.958, Recall : 0.842, F1 : 0.896
Class angry : Precision : 0.403, Recall : 0.680, F1 : 0.506
Ignoring the Others class, Macro Precision : 0.5800,

(0.8217785843920146,
 0.8546974522292994,
 0.8241842610364684,
 0.8391635724057065)

In [135]:
sklearn_lda = LDA(n_components=1000)
X_train_lda = sklearn_lda.fit_transform(X_new_2.toarray(), y)



In [139]:
X_dev_lda = sklearn_lda.transform(X_new_2_dev.toarray())

In [230]:
lightGbmTrainDevKitScore(X_train_lda, y, X_dev_lda, y_dev, True)

Training score 

True Positives per class :  [ 5112.  3944. 13977.  4981.]
False Positives per class :  [386. 437. 864. 459.]
False Negatives per class :  [394. 299. 971. 482.]
Class happy : Precision : 0.900, Recall : 0.930, F1 : 0.915
Class sad : Precision : 0.942, Recall : 0.935, F1 : 0.938
Class angry : Precision : 0.916, Recall : 0.912, F1 : 0.914
Ignoring the Others class, Macro Precision : 0.9192, Macro Recall : 0.9254, Macro F1 : 0.9223
Ignoring the Others class, Micro TP : 22902, FP : 1760, FN : 1752
Accuracy : 0.9288, Micro Precision : 0.9286, Micro Recall : 0.9289, Micro F1 : 0.9288

Dev score 

True Positives per class :  [ 118.   99. 2008.   85.]
False Positives per class :  [115. 145.  79. 106.]
False Negatives per class :  [ 32.  43. 330.  40.]
Class happy : Precision : 0.406, Recall : 0.697, F1 : 0.513
Class sad : Precision : 0.962, Recall : 0.859, F1 : 0.908
Class angry : Precision : 0.445, Recall : 0.680, F1 : 0.538
Ignoring the Others class, Macro Precision : 0.6043,

(0.838475499092559, 0.8691514670896114, 0.8414587332053742, 0.8550809440218452)

In [142]:
sklearn_lda_2 = LDA(n_components=1000)
X_train_lda_2 = sklearn_lda_2.fit_transform(hstack([X_new, train_feats]).toarray(), y)



In [145]:
X_dev_lda_2 = sklearn_lda_2.transform(hstack([X_new_dev, dev_feats]).toarray())

In [233]:
lightGbmTrainDevKitScore(X_train_lda_2, y, X_dev_lda_2, y_dev, True)

Training score 

True Positives per class :  [ 5207.  3983. 14192.  5119.]
False Positives per class :  [279. 332. 690. 358.]
False Negatives per class :  [299. 260. 756. 344.]
Class happy : Precision : 0.923, Recall : 0.939, F1 : 0.931
Class sad : Precision : 0.954, Recall : 0.949, F1 : 0.952
Class angry : Precision : 0.935, Recall : 0.937, F1 : 0.936
Ignoring the Others class, Macro Precision : 0.9371, Macro Recall : 0.9417, Macro F1 : 0.9394
Ignoring the Others class, Micro TP : 23294, FP : 1380, FN : 1360
Accuracy : 0.9450, Micro Precision : 0.9441, Micro Recall : 0.9448, Micro F1 : 0.9445

Dev score 

True Positives per class :  [ 116.   95. 1972.   87.]
False Positives per class :  [121. 153.  86. 125.]
False Negatives per class :  [ 34.  47. 366.  38.]
Class happy : Precision : 0.383, Recall : 0.669, F1 : 0.487
Class sad : Precision : 0.958, Recall : 0.843, F1 : 0.897
Class angry : Precision : 0.410, Recall : 0.696, F1 : 0.516
Ignoring the Others class, Macro Precision : 0.5839,

(0.8239564428312159,
 0.8554408260524226,
 0.8268714011516315,
 0.8409135272301387)

In [149]:
sklearn_lda_3 = LDA(n_components=400)
X_train_lda_3 = sklearn_lda_3.fit_transform(hstack([X_new, train_feats]).toarray(), y)



In [150]:
X_dev_lda_3 = sklearn_lda_3.transform(hstack([X_new_dev, dev_feats]).toarray())

In [234]:
lightGbmTrainDevKitScore(X_train_lda_3, y, X_dev_lda_3, y_dev, True)

Training score 

True Positives per class :  [ 5207.  3983. 14192.  5119.]
False Positives per class :  [279. 332. 690. 358.]
False Negatives per class :  [299. 260. 756. 344.]
Class happy : Precision : 0.923, Recall : 0.939, F1 : 0.931
Class sad : Precision : 0.954, Recall : 0.949, F1 : 0.952
Class angry : Precision : 0.935, Recall : 0.937, F1 : 0.936
Ignoring the Others class, Macro Precision : 0.9371, Macro Recall : 0.9417, Macro F1 : 0.9394
Ignoring the Others class, Micro TP : 23294, FP : 1380, FN : 1360
Accuracy : 0.9450, Micro Precision : 0.9441, Micro Recall : 0.9448, Micro F1 : 0.9445

Dev score 

True Positives per class :  [ 116.   95. 1972.   87.]
False Positives per class :  [121. 153.  86. 125.]
False Negatives per class :  [ 34.  47. 366.  38.]
Class happy : Precision : 0.383, Recall : 0.669, F1 : 0.487
Class sad : Precision : 0.958, Recall : 0.843, F1 : 0.897
Class angry : Precision : 0.410, Recall : 0.696, F1 : 0.516
Ignoring the Others class, Macro Precision : 0.5839,

(0.8239564428312159,
 0.8554408260524226,
 0.8268714011516315,
 0.8409135272301387)

# Summary of feats selection with LDA

Dev score:


LDA with n_components 2000, using the features from  the first log regresion feat selection and stacking them with the manual feats

Class happy : Precision : 0.380, Recall : 0.655, F1 : 0.481
Class sad : Precision : 0.958, Recall : 0.842, F1 : 0.896
Class angry : Precision : 0.403, Recall : 0.680, F1 : 0.506
Ignoring the Others class, Macro Precision : 0.5800, Macro Recall : 0.7257, Macro F1 : 0.6447
Ignoring the Others class, Micro TP : 2147, FP : 365, FN : 458
Accuracy : 0.8218, Micro Precision : 0.8547, Micro Recall : 0.8242, Micro F1 : 0.8392

LDA with n_components 1000, using the features from  the second log regresion feat selection

Class happy : Precision : 0.406, Recall : 0.697, F1 : 0.513
Class sad : Precision : 0.962, Recall : 0.859, F1 : 0.908
Class angry : Precision : 0.445, Recall : 0.680, F1 : 0.538
Ignoring the Others class, Macro Precision : 0.6043, Macro Recall : 0.7453, Macro F1 : 0.6675
Ignoring the Others class, Micro TP : 2192, FP : 330, FN : 413
Accuracy : 0.8385, Micro Precision : 0.8692, Micro Recall : 0.8415, Micro F1 : 0.8551

LDA with n_components 1000, using the features from  the first log regresion feat selection and stacking them with the manual feats

Class happy : Precision : 0.383, Recall : 0.669, F1 : 0.487
Class sad : Precision : 0.958, Recall : 0.843, F1 : 0.897
Class angry : Precision : 0.410, Recall : 0.696, F1 : 0.516
Ignoring the Others class, Macro Precision : 0.5839, Macro Recall : 0.7362, Macro F1 : 0.6512
Ignoring the Others class, Micro TP : 2154, FP : 364, FN : 451
Accuracy : 0.8240, Micro Precision : 0.8554, Micro Recall : 0.8269, Micro F1 : 0.8409

LDA with n_components 400, using the features from  the first log regresion feat selection and stacking them with the manual feats

Class happy : Precision : 0.383, Recall : 0.669, F1 : 0.487
Class sad : Precision : 0.958, Recall : 0.843, F1 : 0.897
Class angry : Precision : 0.410, Recall : 0.696, F1 : 0.516
Ignoring the Others class, Macro Precision : 0.5839, Macro Recall : 0.7362, Macro F1 : 0.6512
Ignoring the Others class, Micro TP : 2154, FP : 364, FN : 451
Accuracy : 0.8240, Micro Precision : 0.8554, Micro Recall : 0.8269, Micro F1 : 0.8409

# Feature selection with Chi-Square

In [154]:
from sklearn.feature_selection import SelectKBest, chi2

In [201]:
from sklearn.naive_bayes import GaussianNB

In [219]:
selector = SelectKBest(chi2, k=3000)
X_new = selector.fit_transform(full_conv_CVect, y)
X_new_dev = selector.transform(full_conv_CVect_dev)
trainDevKitScore(GaussianNB(),
                 hstack([X_new, train_feats]).toarray(), y,
                 hstack([X_new_dev, dev_feats]).toarray(), y_dev, True)

Training score 

True Positives per class :  [  684.   437. 14067.    57.]
False Positives per class :  [1.2030e+03 3.2800e+02 1.3376e+04 8.0000e+00]
False Negatives per class :  [4822. 3806.  881. 5406.]
Class happy : Precision : 0.571, Recall : 0.103, F1 : 0.175
Class sad : Precision : 0.513, Recall : 0.941, F1 : 0.664
Class angry : Precision : 0.877, Recall : 0.010, F1 : 0.021
Ignoring the Others class, Macro Precision : 0.6536, Macro Recall : 0.3515, Macro F1 : 0.4571
Ignoring the Others class, Micro TP : 14561, FP : 13712, FN : 10093
Accuracy : 0.5055, Micro Precision : 0.5150, Micro Recall : 0.5906, Micro F1 : 0.5502

Dev score 

True Positives per class :  [1.700e+01 1.300e+01 2.172e+03 2.000e+00]
False Positives per class :  [149.  40. 362.   0.]
False Negatives per class :  [133. 129. 166. 123.]
Class happy : Precision : 0.245, Recall : 0.092, F1 : 0.133
Class sad : Precision : 0.857, Recall : 0.929, F1 : 0.892
Class angry : Precision : 1.000, Recall : 0.016, F1 : 0.031
Ignori

(0.8, 0.8447276940903824, 0.8395393474088292, 0.8421255294570659)

In [224]:
lightGbmTrainDevKitScore(hstack([X_new, train_feats]), y, hstack([X_new_dev, dev_feats]), y_dev, True)

Training score 





True Positives per class :  [ 4743.  3727. 14060.  4673.]
False Positives per class :  [ 429.  464. 1656.  408.]
False Negatives per class :  [763. 516. 888. 790.]
Class happy : Precision : 0.889, Recall : 0.878, F1 : 0.884
Class sad : Precision : 0.895, Recall : 0.941, F1 : 0.917
Class angry : Precision : 0.920, Recall : 0.855, F1 : 0.886
Ignoring the Others class, Macro Precision : 0.9012, Macro Recall : 0.8915, Macro F1 : 0.8963
Ignoring the Others class, Micro TP : 22460, FP : 2528, FN : 2194
Accuracy : 0.9020, Micro Precision : 0.8988, Micro Recall : 0.9110, Micro F1 : 0.9049

Dev score 

True Positives per class :  [ 122.  107. 2159.   93.]
False Positives per class :  [76. 81. 81. 36.]
False Negatives per class :  [ 28.  35. 179.  32.]
Class happy : Precision : 0.569, Recall : 0.754, F1 : 0.648
Class sad : Precision : 0.964, Recall : 0.923, F1 : 0.943
Class angry : Precision : 0.721, Recall : 0.744, F1 : 0.732
Ignoring the Others class, Macro Precision : 0.7513, Macro Recall : 0



(0.9005444646098003,
 0.9225655064528745,
 0.9055662188099808,
 0.9139868268113134)

In [278]:
selector = SelectKBest(chi2, k=1000)
X_new = selector.fit_transform(full_conv_CVect, y)
X_new_dev = selector.transform(full_conv_CVect_dev)

In [218]:
trainDevKitScore(GaussianNB(),
                 hstack([X_new, train_feats]).toarray(), y,
                 hstack([X_new_dev, dev_feats]).toarray(), y_dev, True)

Training score 

True Positives per class :  [  708.   462. 14027.    59.]
False Positives per class :  [1.2450e+03 3.5700e+02 1.3294e+04 8.0000e+00]
False Negatives per class :  [4798. 3781.  921. 5404.]
Class happy : Precision : 0.564, Recall : 0.109, F1 : 0.183
Class sad : Precision : 0.513, Recall : 0.938, F1 : 0.664
Class angry : Precision : 0.881, Recall : 0.011, F1 : 0.021
Ignoring the Others class, Macro Precision : 0.6527, Macro Recall : 0.3527, Macro F1 : 0.4579
Ignoring the Others class, Micro TP : 14548, FP : 13659, FN : 10106
Accuracy : 0.5058, Micro Precision : 0.5158, Micro Recall : 0.5901, Micro F1 : 0.5504

Dev score 

True Positives per class :  [1.80e+01 1.30e+01 2.17e+03 2.00e+00]
False Positives per class :  [151.  40. 361.   0.]
False Negatives per class :  [132. 129. 168. 123.]
Class happy : Precision : 0.245, Recall : 0.092, F1 : 0.133
Class sad : Precision : 0.857, Recall : 0.928, F1 : 0.891
Class angry : Precision : 1.000, Recall : 0.016, F1 : 0.031
Ignoring t

(0.7996370235934664,
 0.8449342614075793,
 0.8387715930902111,
 0.8418416490078983)

In [279]:
lightGbmTrainDevKitScore(hstack([X_new, train_feats]), y, hstack([X_new_dev, dev_feats]), y_dev, True)

Training score 





True Positives per class :  [ 4735.  3728. 14071.  4667.]
False Positives per class :  [ 437.  464. 1652.  406.]
False Negatives per class :  [771. 515. 877. 796.]
Class happy : Precision : 0.889, Recall : 0.879, F1 : 0.884
Class sad : Precision : 0.895, Recall : 0.941, F1 : 0.918
Class angry : Precision : 0.920, Recall : 0.854, F1 : 0.886
Ignoring the Others class, Macro Precision : 0.9014, Macro Recall : 0.8914, Macro F1 : 0.8964
Ignoring the Others class, Micro TP : 22466, FP : 2522, FN : 2188
Accuracy : 0.9019, Micro Precision : 0.8991, Micro Recall : 0.9113, Micro F1 : 0.9051

Dev score 

True Positives per class :  [ 122.  108. 2161.   94.]
False Positives per class :  [77. 78. 81. 34.]
False Negatives per class :  [ 28.  34. 177.  31.]
Class happy : Precision : 0.581, Recall : 0.761, F1 : 0.659
Class sad : Precision : 0.964, Recall : 0.924, F1 : 0.944
Class angry : Precision : 0.734, Recall : 0.752, F1 : 0.743
Ignoring the Others class, Macro Precision : 0.7596, Macro Recall : 0



(0.9019963702359347,
 0.9244913928012519,
 0.9071017274472168,
 0.9157140089130014)

In [280]:
selector = SelectKBest(chi2, k=500)
X_new = selector.fit_transform(hstack([full_conv_CVect, train_feats]), y)
X_new_dev = selector.transform(hstack([full_conv_CVect_dev, dev_feats]))
lightGbmTrainDevKitScore(X_new, y, X_new_dev, y_dev, True)

Training score 

True Positives per class :  [ 4718.  3702. 14025.  4655.]
False Positives per class :  [ 449.  466. 1708.  437.]
False Negatives per class :  [788. 541. 923. 808.]
Class happy : Precision : 0.888, Recall : 0.872, F1 : 0.880
Class sad : Precision : 0.891, Recall : 0.938, F1 : 0.914
Class angry : Precision : 0.914, Recall : 0.852, F1 : 0.882
Ignoring the Others class, Macro Precision : 0.8979, Macro Recall : 0.8876, Macro F1 : 0.8927
Ignoring the Others class, Micro TP : 22382, FP : 2611, FN : 2272
Accuracy : 0.8985, Micro Precision : 0.8955, Micro Recall : 0.9078, Micro F1 : 0.9016

Dev score 

True Positives per class :  [ 122.  106. 2156.   93.]
False Positives per class :  [81. 76. 83. 38.]
False Negatives per class :  [ 28.  36. 182.  32.]
Class happy : Precision : 0.582, Recall : 0.746, F1 : 0.654
Class sad : Precision : 0.963, Recall : 0.922, F1 : 0.942
Class angry : Precision : 0.710, Recall : 0.744, F1 : 0.727
Ignoring the Others class, Macro Precision : 0.7518,

(0.899092558983666, 0.9228056426332288, 0.9040307101727447, 0.9133216986620128)

In [211]:
trainDevKitScore(GaussianNB(),X_new.toarray(), y, X_new_dev.toarray(), y_dev, True)

Training score 

True Positives per class :  [  715.   463. 14012.    53.]
False Positives per class :  [1.2720e+03 3.5900e+02 1.3282e+04 4.0000e+00]
False Negatives per class :  [4791. 3780.  936. 5410.]
Class happy : Precision : 0.563, Recall : 0.109, F1 : 0.183
Class sad : Precision : 0.513, Recall : 0.937, F1 : 0.663
Class angry : Precision : 0.930, Recall : 0.010, F1 : 0.019
Ignoring the Others class, Macro Precision : 0.6688, Macro Recall : 0.3521, Macro F1 : 0.4613
Ignoring the Others class, Micro TP : 14528, FP : 13645, FN : 10126
Accuracy : 0.5054, Micro Precision : 0.5157, Micro Recall : 0.5893, Micro F1 : 0.5500

Dev score 

True Positives per class :  [1.900e+01 1.400e+01 2.171e+03 2.000e+00]
False Positives per class :  [152.  38. 359.   0.]
False Negatives per class :  [131. 128. 167. 123.]
Class happy : Precision : 0.269, Recall : 0.099, F1 : 0.144
Class sad : Precision : 0.858, Recall : 0.929, F1 : 0.892
Class angry : Precision : 1.000, Recall : 0.016, F1 : 0.031
Ignori

(0.8007259528130671,
 0.8463622291021672,
 0.8395393474088292,
 0.8429369820774716)

In [281]:
selector = SelectKBest(chi2, k=400)
X_new = selector.fit_transform(hstack([full_conv_CVect, train_feats]), y)
X_new_dev = selector.transform(hstack([full_conv_CVect_dev, dev_feats]))
lightGbmTrainDevKitScore(X_new, y, X_new_dev, y_dev, True)

Training score 

True Positives per class :  [ 4699.  3689. 14025.  4635.]
False Positives per class :  [ 461.  462. 1746.  443.]
False Negatives per class :  [807. 554. 923. 828.]
Class happy : Precision : 0.889, Recall : 0.869, F1 : 0.879
Class sad : Precision : 0.889, Recall : 0.938, F1 : 0.913
Class angry : Precision : 0.913, Recall : 0.848, F1 : 0.879
Ignoring the Others class, Macro Precision : 0.8969, Macro Recall : 0.8854, Macro F1 : 0.8911
Ignoring the Others class, Micro TP : 22349, FP : 2651, FN : 2305
Accuracy : 0.8968, Micro Precision : 0.8940, Micro Recall : 0.9065, Micro F1 : 0.9002

Dev score 

True Positives per class :  [ 122.  108. 2161.   94.]
False Positives per class :  [74. 81. 81. 34.]
False Negatives per class :  [ 28.  34. 177.  31.]
Class happy : Precision : 0.571, Recall : 0.761, F1 : 0.653
Class sad : Precision : 0.964, Recall : 0.924, F1 : 0.944
Class angry : Precision : 0.734, Recall : 0.752, F1 : 0.743
Ignoring the Others class, Macro Precision : 0.7566,

(0.9019963702359347,
 0.9234075810863619,
 0.9071017274472168,
 0.9151820294345467)

In [282]:
selector = SelectKBest(chi2, k=300)
X_new = selector.fit_transform(hstack([full_conv_CVect, train_feats]), y)
X_new_dev = selector.transform(hstack([full_conv_CVect_dev, dev_feats]))
lightGbmTrainDevKitScore(X_new, y, X_new_dev, y_dev, True)

Training score 

True Positives per class :  [ 4670.  3671. 14011.  4604.]
False Positives per class :  [ 467.  486. 1819.  432.]
False Negatives per class :  [836. 572. 937. 859.]
Class happy : Precision : 0.883, Recall : 0.865, F1 : 0.874
Class sad : Precision : 0.885, Recall : 0.937, F1 : 0.910
Class angry : Precision : 0.914, Recall : 0.843, F1 : 0.877
Ignoring the Others class, Macro Precision : 0.8941, Macro Recall : 0.8818, Macro F1 : 0.8879
Ignoring the Others class, Micro TP : 22286, FP : 2737, FN : 2368
Accuracy : 0.8938, Micro Precision : 0.8906, Micro Recall : 0.9040, Micro F1 : 0.8972

Dev score 

True Positives per class :  [ 121.  107. 2154.   93.]
False Positives per class :  [77. 81. 87. 35.]
False Negatives per class :  [ 29.  35. 184.  32.]
Class happy : Precision : 0.569, Recall : 0.754, F1 : 0.648
Class sad : Precision : 0.961, Recall : 0.921, F1 : 0.941
Class angry : Precision : 0.727, Recall : 0.744, F1 : 0.735
Ignoring the Others class, Macro Precision : 0.7523,

(0.8983666061705989,
 0.9206100899491592,
 0.9036468330134357,
 0.9120495931809376)

In [283]:
selector = SelectKBest(chi2, k=200)
X_new = selector.fit_transform(hstack([full_conv_CVect, train_feats]), y)
X_new_dev = selector.transform(hstack([full_conv_CVect_dev, dev_feats]))
lightGbmTrainDevKitScore(X_new, y, X_new_dev, y_dev, True)

Training score 

True Positives per class :  [ 4618.  3623. 13970.  4519.]
False Positives per class :  [ 490.  501. 1981.  458.]
False Negatives per class :  [888. 620. 978. 944.]
Class happy : Precision : 0.879, Recall : 0.854, F1 : 0.866
Class sad : Precision : 0.876, Recall : 0.935, F1 : 0.904
Class angry : Precision : 0.908, Recall : 0.827, F1 : 0.866
Ignoring the Others class, Macro Precision : 0.8874, Macro Recall : 0.8719, Macro F1 : 0.8796
Ignoring the Others class, Micro TP : 22112, FP : 2940, FN : 2542
Accuracy : 0.8863, Micro Precision : 0.8826, Micro Recall : 0.8969, Micro F1 : 0.8897

Dev score 

True Positives per class :  [ 118.  108. 2135.   89.]
False Positives per class :  [93. 83. 89. 40.]
False Negatives per class :  [ 32.  34. 203.  36.]
Class happy : Precision : 0.565, Recall : 0.761, F1 : 0.649
Class sad : Precision : 0.960, Recall : 0.913, F1 : 0.936
Class angry : Precision : 0.690, Recall : 0.712, F1 : 0.701
Ignoring the Others class, Macro Precision : 0.7384,

(0.8892921960072595,
 0.9166666666666666,
 0.8952015355086372,
 0.9058069528063701)

# Summary for Chi_2

k=3000, Selection from CountVector + stacking manual features

Class happy : Precision : 0.569, Recall : 0.754, F1 : 0.648
Class sad : Precision : 0.964, Recall : 0.923, F1 : 0.943
Class angry : Precision : 0.721, Recall : 0.744, F1 : 0.732
Ignoring the Others class, Macro Precision : 0.7513, Macro Recall : 0.8070, Macro F1 : 0.7782
Ignoring the Others class, Micro TP : 2359, FP : 198, FN : 246
Accuracy : 0.9005, Micro Precision : 0.9226, Micro Recall : 0.9056, Micro F1 : 0.9140

k=1000, Selection from CountVector + stacking manual features

Class happy : Precision : 0.581, Recall : 0.761, F1 : 0.659
Class sad : Precision : 0.964, Recall : 0.924, F1 : 0.944
Class angry : Precision : 0.734, Recall : 0.752, F1 : 0.743
Ignoring the Others class, Macro Precision : 0.7596, Macro Recall : 0.8123, Macro F1 : 0.7851
Ignoring the Others class, Micro TP : 2363, FP : 193, FN : 242
Accuracy : 0.9020, Micro Precision : 0.9245, Micro Recall : 0.9071, Micro F1 : 0.9157

k=500, Selection from CountVector and manual features

Class happy : Precision : 0.582, Recall : 0.746, F1 : 0.654
Class sad : Precision : 0.963, Recall : 0.922, F1 : 0.942
Class angry : Precision : 0.710, Recall : 0.744, F1 : 0.727
Ignoring the Others class, Macro Precision : 0.7518, Macro Recall : 0.8042, Macro F1 : 0.7771
Ignoring the Others class, Micro TP : 2355, FP : 197, FN : 250
Accuracy : 0.8991, Micro Precision : 0.9228, Micro Recall : 0.9040, Micro F1 : 0.9133

k=400, Selection from CountVector and manual features

Class happy : Precision : 0.571, Recall : 0.761, F1 : 0.653
Class sad : Precision : 0.964, Recall : 0.924, F1 : 0.944
Class angry : Precision : 0.734, Recall : 0.752, F1 : 0.743
Ignoring the Others class, Macro Precision : 0.7566, Macro Recall : 0.8123, Macro F1 : 0.7834
Ignoring the Others class, Micro TP : 2363, FP : 196, FN : 242
Accuracy : 0.9020, Micro Precision : 0.9234, Micro Recall : 0.9071, Micro F1 : 0.9152

k=300, Selection from CountVector and manual features

Class happy : Precision : 0.569, Recall : 0.754, F1 : 0.648
Class sad : Precision : 0.961, Recall : 0.921, F1 : 0.941
Class angry : Precision : 0.727, Recall : 0.744, F1 : 0.735
Ignoring the Others class, Macro Precision : 0.7523, Macro Recall : 0.8063, Macro F1 : 0.7784
Ignoring the Others class, Micro TP : 2354, FP : 203, FN : 251
Accuracy : 0.8984, Micro Precision : 0.9206, Micro Recall : 0.9036, Micro F1 : 0.9120

k=200, Selection from CountVector and manual features

Class happy : Precision : 0.565, Recall : 0.761, F1 : 0.649
Class sad : Precision : 0.960, Recall : 0.913, F1 : 0.936
Class angry : Precision : 0.690, Recall : 0.712, F1 : 0.701
Ignoring the Others class, Macro Precision : 0.7384, Macro Recall : 0.7952, Macro F1 : 0.7658
Ignoring the Others class, Micro TP : 2332, FP : 212, FN : 273
Accuracy : 0.8893, Micro Precision : 0.9167, Micro Recall : 0.8952, Micro F1 : 0.9058


# Final Summary: Best results for each selection

## LightGBM feats selection

Class happy : Precision : 0.584, Recall : 0.761, F1 : 0.661
Class sad : Precision : 0.964, Recall : 0.923, F1 : 0.943
Class angry : Precision : 0.710, Recall : 0.744, F1 : 0.727
Ignoring the Others class, Macro Precision : 0.7525, Macro Recall : 0.8093, Macro F1 : 0.7799
Ignoring the Others class, Micro TP : 2360, FP : 196, FN : 245
Accuracy : 0.9009, Micro Precision : 0.9233, Micro Recall : 0.9060, Micro F1 : 0.9146

## Logistic Regresion feats selection

Dev score
 
Feature selection from the VectorCount and then stacking with the manual feats ====

Class happy : Precision : 0.572, Recall : 0.754, F1 : 0.650
Class sad : Precision : 0.963, Recall : 0.922, F1 : 0.942
Class angry : Precision : 0.720, Recall : 0.760, F1 : 0.739
Ignoring the Others class, Macro Precision : 0.7517, Macro Recall : 0.8117, Macro F1 : 0.7806
Ignoring the Others class, Micro TP : 2357, FP : 199, FN : 248
Accuracy : 0.8991, Micro Precision : 0.9221, Micro Recall : 0.9048, Micro F1 : 0.9134

Feature selection from the result from the first selection and the manual feats

Class happy : Precision : 0.578, Recall : 0.754, F1 : 0.654
Class sad : Precision : 0.963, Recall : 0.923, F1 : 0.943
Class angry : Precision : 0.724, Recall : 0.736, F1 : 0.730
Ignoring the Others class, Macro Precision : 0.7553, Macro Recall : 0.8042, Macro F1 : 0.7789
Ignoring the Others class, Micro TP : 2357, FP : 196, FN : 248
Accuracy : 0.8998, Micro Precision : 0.9232, Micro Recall : 0.9048, Micro F1 : 0.9139

## LDA feats selection

LDA with n_components 1000, using the features from  the second log regresion feat selection

Class happy : Precision : 0.406, Recall : 0.697, F1 : 0.513
Class sad : Precision : 0.962, Recall : 0.859, F1 : 0.908
Class angry : Precision : 0.445, Recall : 0.680, F1 : 0.538
Ignoring the Others class, Macro Precision : 0.6043, Macro Recall : 0.7453, Macro F1 : 0.6675
Ignoring the Others class, Micro TP : 2192, FP : 330, FN : 413
Accuracy : 0.8385, Micro Precision : 0.8692, Micro Recall : 0.8415, Micro F1 : 0.8551

## Chi_2 feats selection

k=1000, Selection from CountVector + stacking manual features

Class happy : Precision : 0.581, Recall : 0.761, F1 : 0.659
Class sad : Precision : 0.964, Recall : 0.924, F1 : 0.944
Class angry : Precision : 0.734, Recall : 0.752, F1 : 0.743
Ignoring the Others class, Macro Precision : 0.7596, Macro Recall : 0.8123, Macro F1 : 0.7851
Ignoring the Others class, Micro TP : 2363, FP : 193, FN : 242
Accuracy : 0.9020, Micro Precision : 0.9245, Micro Recall : 0.9071, Micro F1 : 0.9157

# Saving Some of the selected features

In [295]:
X_test = pd.read_csv('D:\\Machine Learning\\Datasets\\EmoContext\\testwithoutlabels.txt',
                        sep='\t', index_col='id')

In [None]:
ctv = CountVectorizer(analyzer='word',token_pattern=r'\w{1,}',
            ngram_range=(1, 3), stop_words = 'english')
full_conv = X[['turn1', 'turn2', 'turn3']].apply(lambda x: ' '.join(x), axis=1)
full_conv_CVect = ctv.fit_transform(full_conv)
full_conv_dev = X_dev[['turn1', 'turn2', 'turn3']].apply(lambda x: ' '.join(x), axis=1)
full_conv_CVect_dev = ctv.transform(full_conv_dev)
full_conv_test = X_test[['turn1', 'turn2', 'turn3']].apply(lambda x: ' '.join(x), axis=1)
full_conv_CVect_test = ctv.transform(full_conv_test)

In [296]:

print('Before Selection: ', hstack([full_conv_CVect, train_feats]).shape)
lr = LogisticRegression(penalty='l1')
lr.fit(hstack([full_conv_CVect, train_feats]), y)
model = SelectFromModel(lr, prefit=True)
X_log_select = model.transform(hstack([full_conv_CVect, train_feats]))
X_log_select_dev = model.transform(hstack([full_conv_CVect_dev, dev_feats]))
X_log_select_test = model.transform(hstack([full_conv_CVect_test, test_feats]))
print('After Selection:', X_log_select.shape)

Before Selection:  (30160, 229340)




After Selection: (30160, 4340)


In [300]:
features_path = 'D:\\Machine Learning\\Datasets\\EmoContext\\Features\\FeatSelection\\' 
sparse.save_npz(features_path + r'\log_reg\train.npz', X_log_select)
sparse.save_npz(features_path + r'\log_reg\dev.npz', X_log_select_dev)
sparse.save_npz(features_path + r'\log_reg\test.npz', X_log_select_test)

In [301]:
clf = lightgbm.LGBMClassifier()
clf.fit(full_conv_CVect.astype(np.float64), y)
feat_import = dict(zip(np.arange(full_conv_CVect.shape[1]), clf.feature_importances_))
feat_import = sorted(feat_import.items(), key=lambda value: value[1], reverse=True)
feat_import = {v[0]:v[1] for v in feat_import if v[1]>0}
features_most_import = full_conv_CVect.tocsc()[:,[*feat_import]]
features_most_import_dev = full_conv_CVect_dev.tocsc()[:,[*feat_import]]
features_most_import_test = full_conv_CVect_test.tocsc()[:,[*feat_import]]

In [302]:
sparse.save_npz(features_path + r'\lgbm\train.npz',
                hstack([features_most_import, train_feats]))
sparse.save_npz(features_path + r'\lgbm\dev.npz',
                hstack([features_most_import_dev, dev_feats]))
sparse.save_npz(features_path + r'\lgbm\test.npz', 
                hstack([features_most_import_test, test_feats]))

In [303]:
selector = SelectKBest(chi2, k=1000)
X_chi2 = selector.fit_transform(full_conv_CVect, y)
X_chi2_dev = selector.transform(full_conv_CVect_dev)
X_chi2_test = selector.transform(full_conv_CVect_test)

In [305]:
sparse.save_npz(features_path + r'\chi2\train.npz',
                hstack([X_chi2, train_feats]))
sparse.save_npz(features_path + r'\chi2\dev.npz',
                hstack([X_chi2_dev, dev_feats]))
sparse.save_npz(features_path + r'\chi2\test.npz', 
                hstack([X_chi2_test, test_feats]))

In [None]:
================================

In [None]:
================================

In [120]:
X_test = pd.read_csv('D:\\Machine Learning\\Datasets\\EmoContext\\testwithoutlabels.txt',
                        sep='\t', index_col='id')


In [180]:
full_conv_test = X_test[['turn1', 'turn2', 'turn3']].apply(lambda x: ' '.join(x), axis=1)
full_conv_CVect_test = ctv.transform(full_conv_test)
X_new_test = selector.transform(hstack([full_conv_CVect_test, test_feats]))

In [271]:
full_conv_CVect_test = ctv.transform(full_conv_test)
X_new_test = full_conv_CVect_test.tocsc()[:,[*feat_import]]

In [274]:
clf = lightgbm.LGBMClassifier(random_state=42)
clf.fit(hstack([features_most_import, train_feats]), y)
test_pred = clf.predict(hstack([X_new_test, test_feats]))



In [275]:
test_labels = le.inverse_transform(test_pred)

In [276]:
test_labels

array(['others', 'others', 'angry', ..., 'others', 'others', 'others'],
      dtype=object)

In [277]:
solutionPath = 'D:\\Machine Learning\\Datasets\\EmoContext\\test.txt'
testDataPath = 'D:\\Machine Learning\\Datasets\\EmoContext\\testwithoutlabels.txt'
with io.open(solutionPath, "w", encoding="utf8") as fout:
        fout.write('\t'.join(["id", "turn1", "turn2", "turn3", "label"]) + '\n')        
        with io.open(testDataPath, encoding="utf8") as fin:
            fin.readline()
            for lineNum, line in enumerate(fin):
                fout.write('\t'.join(line.strip().split('\t')[:4]) + '\t')
                fout.write(test_labels[lineNum] + '\n')