In [41]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import mean_absolute_error, mean_squared_error
from sklearn.svm import SVR

sns.set_style('whitegrid')
%matplotlib inline
pd.set_option('max_colwidth', 500)

In [10]:
# Locations of the train and test CSV files, relative to the notebook's
# working directory.
train_file_path = '../datasets/calculated_features/train.csv'
test_file_path = '../datasets/calculated_features/test.csv'

In [11]:
# Read both splits into DataFrames using pandas' default CSV parsing options.
train, test = (pd.read_csv(csv_path) for csv_path in (train_file_path, test_file_path))

In [12]:
# Preview the first five rows to sanity-check the loaded columns.
train.head()

Unnamed: 0,id,tweet,intensity,emotion,no_of_tokens,avg_token_len,upper_case_tokens,title_case_tokens,exclamation_mark_count,question_mark_count,...,max_anger_sim,max_fear_sim,joy_affect_score,sad_affect_score,anger_affect_score,fear_affect_score,joy_senselevel_score,sad_senselevel_score,anger_senselevel_score,fear_senselevel_score
0,10000,How the fu*k! Who the heck! moved my fridge!... should I knock the landlord door. #angry #mad ##,0.938,anger,24,3.208333,0.041667,0.125,3,0,...,0.683652,0.57411,0.0,0.5,1.512,0.547,0,1,2,1
1,10001,So my Indian Uber driver just called someone the N word. If I wasn't in a moving vehicle I'd have jumped out #disgusted,0.896,anger,27,3.592593,0.111111,0.259259,0,0,...,0.465649,0.499185,0.0,0.295,0.0,0.0,0,0,0,0
2,10002,@DPD_UK I asked for my parcel to be delivered to a pick up store not my address #fuming #poorcustomerservice,0.896,anger,22,4.045455,0.090909,0.045455,0,0,...,0.43386,0.532098,0.172,0.0,0.812,0.0,0,0,1,0
3,10003,so ef whichever butt wipe pulled the fire alarm in davis bc I was sound asleep #pissed #angry #upset #tired #sad #tired #hangry ######,0.896,anger,29,3.586207,0.034483,0.034483,0,0,...,0.683652,0.57411,0.0,1.406,2.672,1.324,0,1,2,2
4,10004,"Don't join @BTCare they put the phone down on you, talk over you and are rude. Taking money out of my acc willynilly! #fuming",0.896,anger,29,3.517241,0.0,0.068966,1,0,...,0.43386,0.556819,0.531,0.281,0.886,0.0,1,0,2,0


In [60]:
# Input columns used as regression features, listed in the order they are
# handed to the models and grouped here by column-name family.
features = [
    # token, casing and punctuation / emoji columns
    'no_of_tokens',
    'avg_token_len',
    'upper_case_tokens',
    'title_case_tokens',
    'exclamation_mark_count',
    'question_mark_count',
    'quote_mark_count',
    'emoji_count',
    # '*_valuable_*_score' columns
    # NOTE(review): the fear column is spelled '..._token_score' while its
    # siblings use '..._tok_score'; presumably this mirrors the CSV header -- confirm.
    'joy_valuable_tok_score',
    'sad_valuable_tok_score',
    'anger_valuable_tok_score',
    'fear_valuable_token_score',
    # '*_syn_ant_score' columns
    'joy_syn_ant_score',
    'sad_syn_ant_score',
    'fear_syn_ant_score',
    'anger_syn_ant_score',
    'cuss_count',
    # '*_sim' columns (average, then maximum)
    'avg_joy_sim',
    'avg_sad_sim',
    'avg_anger_sim',
    'avg_fear_sim',
    'max_joy_sim',
    'max_sad_sim',
    'max_anger_sim',
    'max_fear_sim',
    # '*_affect_score' columns
    'joy_affect_score',
    'sad_affect_score',
    'anger_affect_score',
    'fear_affect_score',
    # '*_senselevel_score' columns
    'joy_senselevel_score',
    'sad_senselevel_score',
    'anger_senselevel_score',
    'fear_senselevel_score',
]

# Target column; kept as a list, so indexing a frame with it yields a
# one-column DataFrame rather than a Series.
label = ['intensity']

# Values of the 'emotion' column for which separate models are fitted.
emotions = ['joy', 'sadness', 'fear', 'anger']

In [147]:
def _as_target_series(y, name='intensity'):
    """Return the regression target ``y`` as a 1-D pandas Series.

    Accepts a Series (returned unchanged), a DataFrame holding a ``name``
    column or exactly one column, or any array-like (flattened).
    """
    if isinstance(y, pd.Series):
        return y
    if isinstance(y, pd.DataFrame):
        if name in y.columns:
            return y[name]
        if y.shape[1] == 1:
            return y.iloc[:, 0]
        raise ValueError(
            "target frame must contain a {!r} column or exactly one column".format(name)
        )
    return pd.Series(np.asarray(y).ravel(), name=name)


def fit_predict_evaluate(gs_clf, X_train, y_train, X_test, y_test):
    """Fit a search object, predict on both splits and compute error metrics.

    Parameters
    ----------
    gs_clf : object
        Unfitted search/estimator exposing ``fit``, ``predict`` and, once
        fitted, ``best_estimator_`` (e.g. a ``GridSearchCV`` instance).
    X_train, X_test : pandas.DataFrame
        Feature matrices for the two splits.
    y_train, y_test : pandas.DataFrame, pandas.Series or array-like
        Regression targets. A one-column frame (such as ``df[['intensity']]``)
        is accepted for backward compatibility.

    Returns
    -------
    tuple
        ``(best_estimator, res)`` where ``res`` maps ``mae_*``, ``mse_*`` and
        ``*_corr`` (Pearson correlation between truth and prediction) to
        their values for the train and test splits.
    """
    # Work with 1-D targets throughout: passing a column vector to ``fit``
    # makes scikit-learn emit DataConversionWarning for several regressors.
    y_train_s = _as_target_series(y_train)
    y_test_s = _as_target_series(y_test)

    gs_clf.fit(X_train, y_train_s.values)

    # Some estimators return (n, 1) predictions; flatten to a common 1-D shape.
    pred_train = np.asarray(gs_clf.predict(X_train)).ravel()
    pred_test = np.asarray(gs_clf.predict(X_test)).ravel()

    mae_test = mean_absolute_error(y_test_s, pred_test)
    mse_test = mean_squared_error(y_test_s, pred_test)

    mae_train = mean_absolute_error(y_train_s, pred_train)
    mse_train = mean_squared_error(y_train_s, pred_train)

    # Series.corr aligns on index, so predictions must carry the targets' index.
    pred_train_s = pd.Series(data=pred_train, index=y_train_s.index, name='intensity')
    pred_test_s = pd.Series(data=pred_test, index=y_test_s.index, name='intensity')

    train_corr = y_train_s.corr(pred_train_s)
    test_corr = y_test_s.corr(pred_test_s)

    res = {
        'mae_train': mae_train, 'mae_test': mae_test,
        'mse_train': mse_train, 'mse_test': mse_test,
        'train_corr': train_corr, 'test_corr': test_corr
    }
    return gs_clf.best_estimator_, res

In [153]:
def get_results_for_emotion(emotion, train=train, test=test):
    """Grid-search three regressors on the rows of a single emotion.

    Parameters
    ----------
    emotion : str
        Value of the ``'emotion'`` column used to select rows.
    train, test : pandas.DataFrame
        Frames containing ``'emotion'``, the ``features`` columns and the
        ``label`` column. The defaults are bound to the module-level frames
        when this cell is executed, so re-run the cell after reloading data
        (or pass the frames explicitly).

    Returns
    -------
    tuple
        ``(results_df, trained_classifiers)``: a DataFrame with one column
        per regressor class name and one row per metric returned by
        ``fit_predict_evaluate``, and a dict mapping the same class names to
        the best fitted estimator.

    Raises
    ------
    ValueError
        If ``emotion`` selects no rows in either split.
    """
    train_e = train[train['emotion'] == emotion]
    test_e = test[test['emotion'] == emotion]
    print(train_e.shape, test_e.shape)

    # Fail fast with a clear message instead of an obscure scikit-learn error
    # raised mid-way through (or after) an expensive grid search.
    if train_e.empty or test_e.empty:
        raise ValueError(
            'No rows found for emotion {!r} in the train and/or test data'.format(emotion)
        )

    X_train = train_e[features]
    y_train = train_e[label]
    X_test = test_e[features]
    y_test = test_e[label]

    # Each regressor is paired with the hyper-parameter grid searched for it.
    # NOTE(review): `normalize` was deprecated in scikit-learn 1.0 and removed
    # in 1.2; on newer versions this grid has to change (e.g. search
    # `fit_intercept`, or scale inputs in a Pipeline).
    search_spaces = [
        (
            LinearRegression(n_jobs=-1),
            {"normalize": [True, False]},
        ),
        (
            RandomForestRegressor(random_state=42, n_jobs=-1),
            {
                "n_estimators": [10, 20, 50, 100, 250],
                'max_depth': [2, 4, 6, 8, 10]
            },
        ),
        (
            SVR(),
            {
                'kernel': ['linear', 'rbf'],
                'C': [0.01, 0.3, 0.1, 1, 10, 50]
            },
        ),
    ]

    trained_classifiers = {}
    results = {}

    for estimator, param_grid in search_spaces:
        gs_clf = GridSearchCV(estimator, param_grid, n_jobs=-1, verbose=5, cv=5)
        best_estimator, res = fit_predict_evaluate(
            gs_clf, X_train=X_train, y_train=y_train, X_test=X_test, y_test=y_test
        )
        # Key both outputs by class name, e.g. 'RandomForestRegressor'.
        model_name = best_estimator.__class__.__name__
        trained_classifiers[model_name] = best_estimator
        results[model_name] = res

    return pd.DataFrame(results), trained_classifiers

In [154]:
# Run the model comparison once per emotion and collect, per emotion, the
# metric table and the fitted estimators.
all_emotion_results = {}

for emotion in emotions:
    print('Solving {}...'.format(emotion))
    res_df, classifiers = get_results_for_emotion(emotion, train=train, test=test)
    this_dict = dict(result_df=res_df, classifiers=classifiers)
    all_emotion_results.update({emotion: this_dict})

Solving joy...
(902, 37) (714, 37)
Fitting 5 folds for each of 2 candidates, totalling 10 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done   4 out of  10 | elapsed:    2.1s remaining:    3.2s
[Parallel(n_jobs=-1)]: Done   7 out of  10 | elapsed:    2.3s remaining:    1.0s
[Parallel(n_jobs=-1)]: Done  10 out of  10 | elapsed:    2.3s remaining:    0.0s
[Parallel(n_jobs=-1)]: Done  10 out of  10 | elapsed:    2.3s finished


Fitting 5 folds for each of 25 candidates, totalling 125 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done   2 tasks      | elapsed:    0.3s
[Parallel(n_jobs=-1)]: Done  56 tasks      | elapsed:    3.8s
[Parallel(n_jobs=-1)]: Done 125 out of 125 | elapsed:   15.4s finished
  self.best_estimator_.fit(X, y, **fit_params)


Fitting 5 folds for each of 12 candidates, totalling 60 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done   2 tasks      | elapsed:    0.2s
[Parallel(n_jobs=-1)]: Done  32 out of  60 | elapsed:    1.2s remaining:    1.0s
[Parallel(n_jobs=-1)]: Done  58 out of  60 | elapsed:   50.1s remaining:    1.7s
[Parallel(n_jobs=-1)]: Done  60 out of  60 | elapsed:   51.3s finished
  y = column_or_1d(y, warn=True)


Solving sadness...
(860, 37) (673, 37)
Fitting 5 folds for each of 2 candidates, totalling 10 fits
Fitting 5 folds for each of 25 candidates, totalling 125 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done   4 out of  10 | elapsed:    0.0s remaining:    0.0s
[Parallel(n_jobs=-1)]: Done   7 out of  10 | elapsed:    0.1s remaining:    0.0s
[Parallel(n_jobs=-1)]: Done  10 out of  10 | elapsed:    0.1s remaining:    0.0s
[Parallel(n_jobs=-1)]: Done  10 out of  10 | elapsed:    0.1s finished
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done   2 tasks      | elapsed:    0.3s
[Parallel(n_jobs=-1)]: Done  56 tasks      | elapsed:    6.9s
[Parallel(n_jobs=-1)]: Done 125 out of 125 | elapsed:   21.7s finished
  self.best_estimator_.fit(X, y, **fit_params)


Fitting 5 folds for each of 12 candidates, totalling 60 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done   2 tasks      | elapsed:    0.1s
[Parallel(n_jobs=-1)]: Done  32 out of  60 | elapsed:    0.9s remaining:    0.8s
[Parallel(n_jobs=-1)]: Done  60 out of  60 | elapsed:  1.1min finished
  y = column_or_1d(y, warn=True)
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done   4 out of  10 | elapsed:    0.0s remaining:    0.1s
[Parallel(n_jobs=-1)]: Done   7 out of  10 | elapsed:    0.1s remaining:    0.0s


Solving fear...
(1257, 37) (993, 37)
Fitting 5 folds for each of 2 candidates, totalling 10 fits
Fitting 5 folds for each of 25 candidates, totalling 125 fits


[Parallel(n_jobs=-1)]: Done  10 out of  10 | elapsed:    0.1s remaining:    0.0s
[Parallel(n_jobs=-1)]: Done  10 out of  10 | elapsed:    0.1s finished
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done   2 tasks      | elapsed:    0.3s
[Parallel(n_jobs=-1)]: Done  56 tasks      | elapsed:    5.6s
[Parallel(n_jobs=-1)]: Done 125 out of 125 | elapsed:   17.9s finished
  self.best_estimator_.fit(X, y, **fit_params)


Fitting 5 folds for each of 12 candidates, totalling 60 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done   2 tasks      | elapsed:    0.3s
[Parallel(n_jobs=-1)]: Done  58 out of  60 | elapsed:   55.1s remaining:    1.9s
[Parallel(n_jobs=-1)]: Done  60 out of  60 | elapsed:  1.0min finished
  y = column_or_1d(y, warn=True)
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done   4 out of  10 | elapsed:    0.0s remaining:    0.0s


Solving anger...
(941, 37) (760, 37)
Fitting 5 folds for each of 2 candidates, totalling 10 fits
Fitting 5 folds for each of 25 candidates, totalling 125 fits


[Parallel(n_jobs=-1)]: Done   7 out of  10 | elapsed:    0.1s remaining:    0.0s
[Parallel(n_jobs=-1)]: Done  10 out of  10 | elapsed:    0.1s remaining:    0.0s
[Parallel(n_jobs=-1)]: Done  10 out of  10 | elapsed:    0.1s finished
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done   2 tasks      | elapsed:    0.3s
[Parallel(n_jobs=-1)]: Done  56 tasks      | elapsed:    3.9s
[Parallel(n_jobs=-1)]: Done 125 out of 125 | elapsed:   13.8s finished
  self.best_estimator_.fit(X, y, **fit_params)


Fitting 5 folds for each of 12 candidates, totalling 60 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done   2 tasks      | elapsed:    0.1s
[Parallel(n_jobs=-1)]: Done  60 out of  60 | elapsed:  1.0min finished
  y = column_or_1d(y, warn=True)


In [170]:
# Show the metric table for the 'fear' models (rows: metrics, columns: regressors).
all_emotion_results['fear']['result_df']

Unnamed: 0,LinearRegression,RandomForestRegressor,SVR
mae_train,0.10713,0.06581,0.108795
mae_test,0.129895,0.12796,0.133119
mse_train,0.018047,0.006627,0.018725
mse_test,0.025433,0.025528,0.026599
train_corr,0.720996,0.916962,0.708723
test_corr,0.606618,0.607646,0.584229


In [171]:
# For every emotion keep the highest test-set correlation reached by any of
# the regressors, then average those per-emotion maxima.
# NOTE(review): choosing the best model per emotion on the test split makes
# this average optimistic; select on cross-validation scores for an unbiased figure.
corr_list = []
for emo in emotions:
    # Series.max() skips NaN (e.g. from constant predictions), whereas the
    # builtin max() gives order-dependent results when a NaN is present.
    max_corr_val = all_emotion_results[emo]['result_df'].loc['test_corr'].max()
    corr_list.append(max_corr_val)

print('Mean correlation over all {} emotions: {}'.format(len(emotions), np.mean(corr_list)))

Mean correlation over all 4 emotions: 0.6263797193608739
