# 3. MELD+Tweets, BoW

## 3.1 Data preparation and filtering

(a) Loading the MELD data and dropping the 'Neutral' label

In [1]:
import pandas as pd
filepath = './data/MELD/train_sent_emo.csv'
meld_dftrain = pd.read_csv(filepath)
meld_dftrain['Utterance'] = meld_dftrain['Utterance'].str.replace("\x92|\x97|\x91|\x93|\x94|\x85", "'")

filepath = './data/MELD/test_sent_emo.csv'
meld_dftest = pd.read_csv(filepath)
meld_dftest['Utterance'] = meld_dftest['Utterance'].str.replace("\x92|\x97|\x91|\x93|\x94|\x85", "'")

meld_dftrain = meld_dftrain.set_index("Emotion", drop=False)
meld_dftrain = meld_dftrain.drop("neutral", axis=0)

meld_dftest = meld_dftest.set_index("Emotion", drop=False)
meld_dftest = meld_dftest.drop("neutral", axis=0)

  meld_dftrain['Utterance'] = meld_dftrain['Utterance'].str.replace("\x92|\x97|\x91|\x93|\x94|\x85", "'")
  meld_dftest['Utterance'] = meld_dftest['Utterance'].str.replace("\x92|\x97|\x91|\x93|\x94|\x85", "'")


(b) Loading the Tweets data

In [2]:
filepath = 'data/wassa/training/all.train.tsv'
tweets_dftrain = pd.read_csv(filepath, sep='\t')

filepath = 'data/wassa/testing/all.test.tsv'
tweets_dftest = pd.read_csv(filepath, sep='\t')

(c) Combining the two data sets
  -  Rename the following axes the same names:
      -  MELD  "Sr No.": "ID", "Utterance": "Sent"
      -  Tweets  "Tweet": "Sent", "Label": "Emotion"
  - Concatenate the two dataframes in a way that their IDs, sentences and labels are aligned. Two additional keys: "MELD" and "Tweets" are added to identify from where a particular entry is.

In [10]:
# Change index of MELD back to number sequence
meld_dftrain = meld_dftrain.set_index(pd.Series(list(range(len(meld_dftrain)))))
meld_dftest = meld_dftest.set_index(pd.Series(list(range(len(meld_dftest)))))

# Training data
meld_dftrain = meld_dftrain.rename(columns={"Sr No.": "ID", "Utterance": "Sent"})
tweets_dftrain = tweets_dftrain.rename(columns={"Tweet": "Sent", "Label": "Emotion"})
combined_dftrain = pd.concat([meld_dftrain, tweets_dftrain], keys=['MELD', 'Tweets'])

# Test data
meld_dftest = meld_dftest.rename(columns={"Sr No.": "ID", "Utterance": "Sent"})
tweets_dftest = tweets_dftest.rename(columns={"Tweet": "Sent", "Label": "Emotion"})
combined_dftest = pd.concat([meld_dftest, tweets_dftest], keys=['MELD', 'Tweets'])

In [11]:
# Check the changes
meld_dftrain.head()

Unnamed: 0,ID,Sent,Speaker,Emotion,Sentiment,Dialogue_ID,Utterance_ID,Season,Episode,StartTime,EndTime
0,5,My duties? All right.,Chandler,surprise,positive,0,4,8,21,"00:16:34,452","00:16:40,917"
1,11,No don't I beg of you!,Chandler,fear,negative,0,10,8,21,"00:17:02,856","00:17:04,858"
2,13,Really?!,Chandler,surprise,positive,0,12,8,21,"00:17:13,491","00:17:16,536"
3,15,But then who? The waitress I went out with las...,Joey,surprise,negative,1,0,9,23,"00:36:40,364","00:36:42,824"
4,16,You know? Forget it!,Rachel,sadness,negative,1,1,9,23,"00:36:44,368","00:36:46,578"


In [12]:
tweets_dftrain.head()

Unnamed: 0,ID,Sent,Emotion,Score
0,10000,How the fu*k! Who the heck! moved my fridge!.....,anger,0.938
1,10001,So my Indian Uber driver just called someone t...,anger,0.896
2,10002,@DPD_UK I asked for my parcel to be delivered ...,anger,0.896
3,10003,so ef whichever butt wipe pulled the fire alar...,anger,0.896
4,10004,Don't join @BTCare they put the phone down on ...,anger,0.896


In [13]:
combined_dftrain.head()

Unnamed: 0,Unnamed: 1,ID,Sent,Speaker,Emotion,Sentiment,Dialogue_ID,Utterance_ID,Season,Episode,StartTime,EndTime,Score
MELD,0,5,My duties? All right.,Chandler,surprise,positive,0.0,4.0,8.0,21.0,"00:16:34,452","00:16:40,917",
MELD,1,11,No don't I beg of you!,Chandler,fear,negative,0.0,10.0,8.0,21.0,"00:17:02,856","00:17:04,858",
MELD,2,13,Really?!,Chandler,surprise,positive,0.0,12.0,8.0,21.0,"00:17:13,491","00:17:16,536",
MELD,3,15,But then who? The waitress I went out with las...,Joey,surprise,negative,1.0,0.0,9.0,23.0,"00:36:40,364","00:36:42,824",
MELD,4,16,You know? Forget it!,Rachel,sadness,negative,1.0,1.0,9.0,23.0,"00:36:44,368","00:36:46,578",


In [14]:
combined_dftrain.tail()

# The data sets are concatenated

Unnamed: 0,Unnamed: 1,ID,Sent,Speaker,Emotion,Sentiment,Dialogue_ID,Utterance_ID,Season,Episode,StartTime,EndTime,Score
Tweets,3608,40781,@VivienLloyd Thank you so much! Just home - st...,,sadness,,,,,,,,0.104
Tweets,3609,40782,Just put the winter duvet on ☃️❄️🌬☔️,,sadness,,,,,,,,0.104
Tweets,3610,40783,@SilkInSide @TommyJoeRatliff that's so pretty!...,,sadness,,,,,,,,0.088
Tweets,3611,40784,@BluesfestByron second artist announcement loo...,,sadness,,,,,,,,0.083
Tweets,3612,40785,I can literally eat creamy pesto pasta topped ...,,sadness,,,,,,,,0.083


(d) Tokenizing and filtering

In [40]:
import spacy
nlp = spacy.load("en_core_web_sm")

# Using spaCy to tokenize the sentences
training_data_3 = [nlp(sent) for sent in list(combined_dftrain['Sent'])]
training_labels_3 = list(combined_dftrain['Emotion'])

test_data_3 = [nlp(sent) for sent in list(combined_dftest['Sent'])]
test_labels_3 = list(combined_dftest['Emotion'])

In [10]:
# for sent in training_data_1:
#     for token in sent:
#         if "I'm" == token.text:
#             print(sent)

    (i) Filter A: MaxDF

In [16]:
from utils import low_high_mid_df

min_df = 2
max_df = len(training_data_3)//10

low_df, high_df, clean3A = low_high_mid_df(min_df, max_df, training_data_3)

print("Rare words with low df = ", len(low_df), "words. Examples: ", list(low_df)[:20])
print("Stop words with high df:", high_df)
vocab_3A = set()
for sent in clean3A:
    for t in sent:
        vocab_3A.add(t)
print("Size of the rest vocab:", len(vocab_3A))
print("Samples:", clean3A[100:103])

Min_df 2
Max_df 889
Rare words with low df =  5972 words. Examples:  ['lolololol', '@fireemblemlord', '@childrensissue', 'witout', 'lookd', '@gopro', 'unconsciously', 'gaze', 'netizen', 'hey!-hey', 'quotient', '@hunterhaye', 'laudrup', 'lakeside', 'crisis', 'shrill', 'lousy', 'charity', 'momentarily', 'fighting']
Stop words with high df: {'do', 'you', 'have', 'i', '.', 'be', 'a', '?', 'to', 'it', "n't", 'and', ',', '!', 'the', 'that', '#', 'my', 'of'}
Size of the rest vocab: 4915
Samples: [['hey', 'ross', 'would', 'great', 'if', 'we', 'could', 'go', 'two', 'straight', 'hour', 'without', 'drop'], ['okay'], ['uh', '-', 'oh']]


    (ii) Filter B: DTandPRP

In [17]:
from utils import remove_DT_PRP

min_df = 2

low_df, DTandPRP_tok, clean3B = remove_DT_PRP(min_df, training_data_3)

print("Rare words with low df = ", len(low_df), "words. Examples:", list(low_df)[:20])
vocab_3B = set()
for sent in clean3B:
    for t in sent:
        vocab_3B.add(t)
print("Size of the rest vocab:", len(vocab_3B))
print("Samples:", clean3B[100:105])

Determiner and pronouns {'bridgetjonesbaby', 'no', "it's", 'i', '@snub23', "i'm", '@ryyyshh', 'tbh', 'tux', 'em', 'ba', 'hbu', 'strength.\\nthey', 'himself', 'every', '\\n\\nsam', 'this', "i'y'know", 'myself', 'jut', 'ours', 'your', 'yourself', '👅', 'oldham\\nnext', "up'i", '🍁', '@blackeyed_susie', '✨', 'itself', '#', "i'i'm", 'ty', 'you', '@british_airways', '@relaqss', 'ourselves', "'s", 'its', "you're'you", 'xx', "mean'i", 'an', 'don’t', "you're", '@mhchat', '’s', 'any', 'one', 'those', 'each', '😡', '\\nit', 'd', 'the-', 'themselves', 'his', '@its.finfin', '@m_t_f_72', '\uf62b', '@fra93_bruno', "i'll", "that'you", 'my', 'they', 'ya', 'yours', '@neyaphemmaster', '\\n\\nother', 'also-', 'another', "was'the", '😧', 'eagles.\\nthey', 'tho', 'ek', "y'", 'it', 'scarred,\\nthis', 'near,\\nthe', 'n', 'her', 'the', 'y', "fact'yes", '@kevincanwaitcbs', "underwear'you", '🐮', 'their', 'memphis', '@themathofyou', '@weebtard', "film'that", 'she', '_', '@sargon_of_akkad', 'either', 'we', 'some', '@

## 3.2 BoW vectorization and training the classifiers

In [19]:
from sklearn import preprocessing
import pickle
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn import svm

(a) Encoding training labels:

In [20]:
label_encoder = preprocessing.LabelEncoder()
label_encoder.fit(training_labels_3+test_labels_3)
print(list(label_encoder.classes_))

['anger', 'disgust', 'fear', 'joy', 'sadness', 'surprise']


In [21]:
training_classes = label_encoder.transform(training_labels_3)
print(training_classes[:5])
print(list(combined_dftrain['Emotion'])[:5])
print(list(combined_dftrain['Sent'])[:5])

[5 2 5 5 4]
['surprise', 'fear', 'surprise', 'surprise', 'sadness']
['My duties?  All right.', "No don't I beg of you!", 'Really?!', 'But then who? The waitress I went out with last month?', 'You know? Forget it!']


### Filter A

(a) Vectorise

In [22]:
# A CountVectorizer which takes tokenized lists as input
### Taken from https://stackoverflow.com/questions/35867484/pass-tokens-to-countvectorizer,
### https://stackoverflow.com/questions/27673527/how-should-i-vectorize-the-following-list-of-lists-with-scikit-learn, 26 Oct 2021

def dummy(x):
    return x

utterance_vec_3A = CountVectorizer(tokenizer=dummy, lowercase=False)

training_count_vectors_3A = utterance_vec_3A.fit_transform(clean3A)

In [31]:
print(training_count_vectors_3A .toarray()[0][0:10])

[1 0 0 0 0 0 0 0 0 0]


In [26]:
#Total number of word features or the length of the total vector
print(len(utterance_vec_3A.vocabulary_))

4915


In [27]:
# First 50 feature names
print(list(utterance_vec_3A.get_feature_names())[:50])

[' ', '  ', '   ', '"', '#funny', '#whatever', '$', '%', '&', "'", "'\\n\\nhe", "'cause", "'d", "'em", "'everywhere", "'i", "'ll", "'s", "'ve", '(', ')', '):', '*', '+', '-', '--', '-2.5', '-dalai', '-terrible-', '..', '...', '....', '.....', '......', '.......', '..........', '.@divamagazine', '.@simonnricketts', '.@tolumanda', '/', '0', '1', '1,000', '1/2', '10', '10/11', '100', '1000', '100k', '101']


In [32]:
# Convert raw frequency counts into TF-IDF values
tfidf_transformer = TfidfTransformer()
training_tfidf_3A = tfidf_transformer.fit_transform(training_count_vectors_3A)

In [33]:
print(training_tfidf_3A.toarray()[0][:10])

[0.49906941 0.         0.         0.         0.         0.
 0.         0.         0.         0.        ]


In [34]:
from sklearn.calibration import CalibratedClassifierCV
linear_model = svm.LinearSVC(max_iter=2000)
svm_linear_clf_3A = CalibratedClassifierCV(linear_model , method='sigmoid', cv=10)

svm_linear_clf_3A.fit(training_tfidf_3A, training_classes)

CalibratedClassifierCV(base_estimator=LinearSVC(max_iter=2000), cv=10)

### Filter B

(a) Vectorise

In [35]:
utterance_vec_3B = CountVectorizer(tokenizer=dummy, lowercase=False)

training_count_vectors_3B = utterance_vec_3B.fit_transform(clean3B)
training_tfidf_3B = tfidf_transformer.fit_transform(training_count_vectors_3B)

In [36]:
#Total number of word features or the length of the total vector
print(len(utterance_vec_3B.vocabulary_))

4887


In [37]:
# First 50 feature names
print(list(utterance_vec_3B.get_feature_names())[:50])

[' ', '  ', '   ', '!', '"', '#', '#funny', '#whatever', '$', '%', '&', "'", "'\\n\\nhe", "'cause", "'d", "'everywhere", "'i", "'ll", "'s", "'ve", '(', ')', '):', '*', '+', ',', '-', '--', '-2.5', '-dalai', '-terrible-', '.', '..', '...', '....', '.....', '......', '.......', '..........', '.@divamagazine', '.@simonnricketts', '.@tolumanda', '/', '0', '1', '1,000', '1/2', '10', '10/11', '100']


(b) Train the classifier

In [38]:
linear_model = svm.LinearSVC(max_iter=2000)
svm_linear_clf_3B = CalibratedClassifierCV(linear_model , method='sigmoid', cv=10)

svm_linear_clf_3B.fit(training_tfidf_3B, training_classes)

CalibratedClassifierCV(base_estimator=LinearSVC(max_iter=2000), cv=10)

## 3.3 Predicting the test data and results

In [45]:
import sklearn
from sklearn.metrics import classification_report

Encode the test labels

In [41]:
test_classes = label_encoder.transform(test_labels_3)
print(test_classes[:20])
print(list(combined_dftest['Emotion'])[:5])
print(list(combined_dftest['Sent'])[:5])

[5 0 3 3 3 3 3 3 3 4 5 0 0 0 3 3 2 0 1 5]
['surprise', 'anger', 'joy', 'joy', 'joy']
["Why do all you're coffee mugs have numbers on the bottom?", "Oh. That's so Monica can keep track. That way if one on them is missing, she can be like, 'Where's number 27?!'", 'Push!', "Push 'em out, push 'em out, harder, harder.", "Push 'em out, push 'em out, way out!"]


### Filter A:

In [42]:
max_df_test = len(test_data_3)//10

low_df_test_3A, high_df_test_3A, test_mid_df_3A = \
low_high_mid_df(2, max_df_test, test_data_3)

Min_df 2
Max_df 449


In [43]:
test_count_3A = utterance_vec_3A.transform(test_mid_df_3A)
test_tfidf_3A = tfidf_transformer.fit_transform(test_count_3A)

y_pred_svm_3A = svm_linear_clf_3A.predict(test_tfidf_3A)

In [46]:
report3A = classification_report(test_classes,y_pred_svm_3A,digits = 6)
print(label_encoder.classes_)
print('BoW TFIDF SVM LINEAR: MELD+Tweets, Filter A')
print('Word mininum document frequency', min_df, "; max:", max_df_test)
print(report3A)

print('Confusion matrix SVM, BoW MELD+Tweets, Filter A')
print(label_encoder.classes_)
print(sklearn.metrics.confusion_matrix(test_classes,y_pred_svm_3A))

['anger' 'disgust' 'fear' 'joy' 'sadness' 'surprise']
BoW TFIDF SVM LINEAR: MELD+Tweets, Filter A
Word mininum document frequency 2 ; max: 449
              precision    recall  f1-score   support

           0   0.697248  0.619005  0.655801      1105
           1   0.333333  0.014706  0.028169        68
           2   0.812780  0.693780  0.748580      1045
           3   0.608754  0.810036  0.695117      1116
           4   0.694724  0.627696  0.659511       881
           5   0.433628  0.523132  0.474194       281

    accuracy                       0.670374      4496
   macro avg   0.596745  0.548059  0.543562      4496
weighted avg   0.679660  0.670374  0.667008      4496

Confusion matrix SVM, BoW MELD+Tweets, Filter A
['anger' 'disgust' 'fear' 'joy' 'sadness' 'surprise']
[[684   1  56 213  87  64]
 [ 22   1   2  23   5  15]
 [ 72   1 725 125  92  30]
 [ 71   0  47 904  47  47]
 [100   0  52 140 553  36]
 [ 32   0  10  80  12 147]]


In [48]:
pred_probabilities_3A = svm_linear_clf_3A.predict_proba(test_tfidf_3A)

pred_labels_3A = []
for predicted_label in y_pred_svm_3A:
    pred_labels_3A.append(label_encoder.classes_[predicted_label])

gold_labels_3A = []
for gold_label in test_classes:
    gold_labels_3A.append(label_encoder.classes_[gold_label])

result_frame3A = pd.DataFrame(pred_probabilities_3A*100, columns=label_encoder.classes_)

result_frame3A['Chat']= list(combined_dftest['Sent'])
result_frame3A['Prediction']=pred_labels_3A
result_frame3A['Gold']=gold_labels_3A

result_frame3A.to_csv("result_frame3A.csv")
result_frame3A.head()

Unnamed: 0,anger,disgust,fear,joy,sadness,surprise,Chat,Prediction,Gold
0,7.552957,5.227196,23.327851,17.448887,17.569593,28.873515,Why do all you're coffee mugs have numbers on ...,surprise,surprise
1,14.510428,2.16884,13.283216,45.837272,20.615817,3.584428,Oh. That's so Monica can keep track. That way ...,joy,anger
2,26.853089,3.108969,5.722333,9.683424,43.172987,11.459197,Push!,sadness,joy
3,17.164984,5.033923,3.576142,17.268503,54.412347,2.544101,"Push 'em out, push 'em out, harder, harder.",sadness,joy
4,24.564224,5.020865,4.587863,28.055049,27.917534,9.854465,"Push 'em out, push 'em out, way out!",joy,joy


In [49]:
def average_importances(model):
    coef_avg = 0
    for classifier in model.calibrated_classifiers_:
        coef_avg = coef_avg + classifier.base_estimator.coef_
        
    coef_avg  = coef_avg/len(model.calibrated_classifiers_)
    return coef_avg

def f_importances(importances, names, n=20):
    class_labels = label_encoder.classes_
    
    for num, imp in enumerate(importances):
        emotion = class_labels[num]
        topn = sorted(zip(imp,names), reverse=True)[:n]
        
        print("Important words in {} utterances".format(emotion))
        for coef, feat in topn:
            print(emotion, coef, feat)
        print("-----------------------------------------")

print('Most important features per emotion: 3A')
feature_names = utterance_vec_3A.get_feature_names()
importances = average_importances(svm_linear_clf_3A)
f_importances(importances, feature_names)

Most important features per emotion: 3A
Important words in anger utterances
anger 3.6500079099561296 anger
anger 3.6415085332939965 rage
anger 3.4825419087377254 angry
anger 3.445447920127659 offend
anger 3.136321475267664 bitter
anger 2.933377672989411 fury
anger 2.923266443202013 revenge
anger 2.8706648768318184 offense
anger 2.8041745428527336 fume
anger 2.7846500212780403 burst
anger 2.6243876260005665 furious
anger 2.6138535976487836 wrath
anger 2.6059136669177465 madden
anger 2.576526635745796 rabid
anger 2.5637964000920155 resent
anger 2.490842286268994 outrage
anger 2.482362431406309 irritate
anger 2.451210075508196 snap
anger 2.449743090030281 relentless
anger 2.4422051384880628 insult
-----------------------------------------
Important words in disgust utterances
disgust 1.800699329624385 disgusting
disgust 1.7185263751602124 violate
disgust 1.6831454546496154 eww
disgust 1.6328923104597837 ew
disgust 1.5755829685543976 behave
disgust 1.513616158771912 clinic
disgust 1.507183

### Filter B:

In [50]:
low_df_test_3B, DTandPRP_test_3B, clean_test_3B = \
remove_DT_PRP(2, test_data_3)

Determiner and pronouns {'@sarahb45', 'no', 'i', "\\n\\n'you", '@capitalone', 'blm', "naya'\\n\\n'i", "i'm", '😿', '#behaviour', 'tbh', 'em', '@interception225', 'himself', 'ios10', 'every', '@aefadul22', 'this', 'myself', 'it.\\n#funny', '@barackobama', 'ours', 'your', '@the', 'yourself', '@ritujai18874', '@messyourself', '✨', 'that,\\ngives', '#', 'itself', '@bbnicole', 'theirs', '@barbour', 'you', '@digger_forum', 'ourselves', "'s", 'its', '@rosie', '»', '@johnjharwood', '@jankhambrams', 'an', "you're", 'lv', '’s', 'any', 'one', '@eliroth', '\\nso', 'those', 'each', 'd', '@realdonaldtrump', '@your', 'themselves', 'his', '😑', '🍂', '@space_gayz', 'lt', '😄', 'my', 'they', 'ya', '\\nmatt', 'yours', '@jdegrom19', '@jbanks88', 'another', 'yhat', 'nj@latimes', 'tho', "y'", 'it', 'her', '@talktalkcare', 'tvgirl', 'the', 'y', 'boys', '@ryuredwings2', '\\nindia', '@xmaseveevil1', 'their', '@adele', 'she', '_', 'either', 'stupid?that', '\\nwhat', "they're", 'we', 'some', 'these', "i'i", 'all', 

In [51]:
test_count_3B = utterance_vec_3B.transform(clean_test_3B)
test_tfidf_3B = tfidf_transformer.fit_transform(test_count_3B)

y_pred_svm_3B = svm_linear_clf_3B.predict(test_tfidf_3B)

In [54]:
report3B = classification_report(test_classes,y_pred_svm_3B,digits = 6)
print(label_encoder.classes_)
print('BoW TFIDF SVM LINEAR: MELD+Tweets, Filter B')
print('Word mininum document frequency', min_df, "; DT PRP removed")
print(report3B)

print('Confusion matrix SVM, BoW MELD+Tweets, Filter B')
print(label_encoder.classes_)
print(sklearn.metrics.confusion_matrix(test_classes,y_pred_svm_3B))

['anger' 'disgust' 'fear' 'joy' 'sadness' 'surprise']
BoW TFIDF SVM LINEAR: MELD+Tweets, Filter B
Word mininum document frequency 2 ; DT PRP removed
              precision    recall  f1-score   support

           0   0.705584  0.628959  0.665072      1105
           1   0.444444  0.058824  0.103896        68
           2   0.809101  0.697608  0.749229      1045
           3   0.626834  0.803763  0.704358      1116
           4   0.701887  0.633371  0.665871       881
           5   0.477333  0.637011  0.545732       281

    accuracy                       0.681050      4496
   macro avg   0.627531  0.576589  0.572360      4496
weighted avg   0.691157  0.681050  0.678594      4496

Confusion matrix SVM, BoW MELD+Tweets, Filter B
['anger' 'disgust' 'fear' 'joy' 'sadness' 'surprise']
[[695   0  61 207  74  68]
 [ 19   4   3  23   7  12]
 [ 76   3 729 111  93  33]
 [ 65   1  49 897  56  48]
 [103   1  54 130 558  35]
 [ 27   0   5  63   7 179]]


In [55]:
pred_probabilities_3B = svm_linear_clf_3B.predict_proba(test_tfidf_3B)

pred_labels_3B = []
for predicted_label in y_pred_svm_3B:
    pred_labels_3B.append(label_encoder.classes_[predicted_label])

gold_labels_3B = []
for gold_label in test_classes:
    gold_labels_3B.append(label_encoder.classes_[gold_label])

result_frame3B = pd.DataFrame(pred_probabilities_3B*100, columns=label_encoder.classes_)

result_frame3B['Chat']= list(combined_dftest['Sent'])
result_frame3B['Prediction']=pred_labels_3B
result_frame3B['Gold']=gold_labels_3B

result_frame3B.to_csv("result_frame3B.csv")

In [56]:
result_frame3B.head()

Unnamed: 0,anger,disgust,fear,joy,sadness,surprise,Chat,Prediction,Gold
0,7.635362,5.189763,16.871482,17.626829,22.579746,30.096819,Why do all you're coffee mugs have numbers on ...,surprise,surprise
1,13.809647,3.599743,4.47102,54.886823,21.225968,2.006798,Oh. That's so Monica can keep track. That way ...,joy,anger
2,27.387124,4.07207,4.512778,15.452332,31.10308,17.472617,Push!,sadness,joy
3,15.59684,4.697395,4.286271,6.929665,65.700451,2.789377,"Push 'em out, push 'em out, harder, harder.",sadness,joy
4,30.886224,4.271482,6.069298,14.626383,28.512423,15.63419,"Push 'em out, push 'em out, way out!",anger,joy


In [57]:
print('Most important features per emotion for the SVM classifier')
feature_names = utterance_vec_3B.get_feature_names()
importances = average_importances(svm_linear_clf_3B)
f_importances(importances, feature_names)

Most important features per emotion for the SVM classifier
Important words in anger utterances
anger 3.9517599889013715 rage
anger 3.922396823933238 anger
anger 3.517524094713816 offend
anger 3.493768271415148 angry
anger 3.332648121069419 bitter
anger 2.98024029625986 fury
anger 2.963529688697942 revenge
anger 2.9311406328470833 offense
anger 2.877865890601306 burst
anger 2.831146054666674 fume
anger 2.703502622967309 furious
anger 2.606341356066631 wrath
anger 2.5946693596563235 madden
anger 2.5847650912785753 rabid
anger 2.578290625839091 snap
anger 2.5721717238101998 relentless
anger 2.5538142586967756 resent
anger 2.5461598529838287 outrage
anger 2.4712136549583734 insult
anger 2.4512561381079156 irritate
-----------------------------------------
Important words in disgust utterances
disgust 1.7942395097464097 disgusting
disgust 1.6954919980888725 eww
disgust 1.672789381620575 violate
disgust 1.665964701307622 ew
disgust 1.557465856724705 behave
disgust 1.5512814425383408 clinic
d