# 4. MELD, Word-embeddings

## 4.1 Data preparation and filtering

(a) Loading the training and test data

In [1]:
import pandas as pd

In [2]:
filepath = './data/MELD/train_sent_emo.csv'
meld_dftrain = pd.read_csv(filepath)
meld_dftrain['Utterance'] = meld_dftrain['Utterance'].str.replace("\x92|\x97|\x91|\x93|\x94|\x85", "'")

filepath = './data/MELD/test_sent_emo.csv'
meld_dftest = pd.read_csv(filepath)
meld_dftest['Utterance'] = meld_dftest['Utterance'].str.replace("\x92|\x97|\x91|\x93|\x94|\x85", "'")

  meld_dftrain['Utterance'] = meld_dftrain['Utterance'].str.replace("\x92|\x97|\x91|\x93|\x94|\x85", "'")
  meld_dftest['Utterance'] = meld_dftest['Utterance'].str.replace("\x92|\x97|\x91|\x93|\x94|\x85", "'")


(b) MELD data preprocessing: removing 'Neutral' utterances

In [3]:
meld_dftrain = meld_dftrain.set_index("Emotion", drop=False)
meld_dftrain = meld_dftrain.drop("neutral", axis=0)

meld_dftest = meld_dftest.set_index("Emotion", drop=False)
meld_dftest = meld_dftest.drop("neutral", axis=0)

(c) Tokenizing and filtering the data

In [5]:
# Using spaCy to tokenize the sentences

import spacy
nlp = spacy.load("en_core_web_sm")
                 
training_data_4 = [nlp(sent) for sent in list(meld_dftrain['Utterance'])]
training_labels_4 = list(meld_dftrain['Emotion'])

test_data_4 = [nlp(sent) for sent in list(meld_dftest['Utterance'])]
test_labels_4 = list(meld_dftest['Emotion'])

### Filter A

In [8]:
from utils import low_high_mid_df
min_df = 2
max_df = len(training_data_4)//10

low_df, high_df, clean4A = low_high_mid_df(min_df, max_df, training_data_4)

print("Rare words with low df = ", len(low_df), "words. Examples: ", list(low_df)[:20])
print("Stop words with high df:", high_df)
vocab_4A = set()
for sent in clean4A:
    for t in sent:
        vocab_4A.add(t)
print("Size of the rest vocab:", len(vocab_4A))
print("Samples:", clean4A[10:20])

Min_df 2
Max_df 527
Rare words with low df =  1680 words. Examples:  ['paolo', 'liking', 'safety', 'juice', 'crime', 'series', "or'oh", '12', 'thquirt', 'pushover', 'we?re', 'greet', 'surgeon', 'law', 'chuck', 'bend', 'expressly', 'hooohhh', '22', 'coyote']
Stop words with high df: {'a', 'and', 'be', 'to', 'the', 'i', ',', 'you', '.', '!', 'what', 'that', '?', 'do', 'it', "n't", 'oh'}
Size of the rest vocab: 1530
Samples: [['just', 'coffee', 'where', 'we', 'gon', 'na', 'hang', 'out', 'now'], ['got'], [], ['um', '-', 'mm', 'yeah', 'right'], ['my', 'god', 'my', 'god', 'poor', 'monica'], [], [], ['he', 'think', 'monica', 'empty', 'she', 'empty', 'vase'], ['totally', 'god', 'she', 'seem', 'so', 'happy', 'too'], ['hey']]


### Filter B

In [9]:
from utils import remove_DT_PRP

min_df = 2

low_df, DTandPRP_tok, clean4B = remove_DT_PRP(min_df, training_data_4)

print("Rare words with low df = ", len(low_df), "words. Examples:", list(low_df)[:20])
vocab_4B = set()
for sent in clean4B:
    for t in sent:
        vocab_4B.add(t)
print("Size of the rest vocab:", len(vocab_4B))
print("Samples:", clean4B[10:20])

Determiner and pronouns {'she', "underwear'you", "i'll", "you're", "was'the", 'no', 'yours', "you're'you", 'his', "it's", 'some', 'you', 'tux', "i'm", 'their', 'i-', 'themselves', 'either', 'this', "mean'i", 'both', "'em", 'a', "i'y'know", "fact'yes", 'the', 'they', 'ours', 'that', 'neither', "film'that", 'those', "'s", 'mine', 'ya', 'each', 'myself', 'all', 'hers', 'i', 'that?s', 'its', 'her', "i'i'm", "that'you", "up'i", 'your', 'itself', 'ourselves', 'ba', 'my', 'the-', 'any', 'we', 'our', 'he', 'every', 'another', 'it', 'herself', 'an', 'yourself', 'these'}
Min_df 2
Rare words with low df =  1670 words. Examples: ['paolo', 'liking', 'safety', 'juice', 'crime', 'series', "or'oh", '12', 'thquirt', 'pushover', 'we?re', 'greet', 'surgeon', 'law', 'chuck', 'bend', 'expressly', 'hooohhh', '22', 'coyote']
Size of the rest vocab: 1515
Samples: [['just', 'coffee', '!', 'where', 'be', 'gon', 'na', 'hang', 'out', 'now', '?'], ['got', '.'], ['!'], ['um', '-', 'mm', ',', 'yeah', 'right', '!'], 

## 4.2 Word-embedding model and training the classifiers

(a) Encoding the labels

In [38]:
from sklearn import preprocessing

label_encoder = preprocessing.LabelEncoder()
label_encoder.fit(training_labels_4+test_labels_4)
print(list(label_encoder.classes_))

['anger', 'disgust', 'fear', 'joy', 'sadness', 'surprise']


In [39]:
training_classes = label_encoder.transform(training_labels_4)
print(training_classes[:5])
print(list(meld_dftrain['Emotion'])[:5])
print(list(meld_dftrain['Utterance'])[:5])

[5 2 5 5 4]
['surprise', 'fear', 'surprise', 'surprise', 'sadness']
['My duties?  All right.', "No don't I beg of you!", 'Really?!', 'But then who? The waitress I went out with last month?', 'You know? Forget it!']


(b) Loading the embedding model

In [11]:
from gensim.test.utils import datapath, get_tmpfile
from gensim.models import KeyedVectors
from gensim.scripts.glove2word2vec import glove2word2vec

from os import path

wordembeddings="glove.twitter.27B.200d.txt"
glove_file = datapath(path.abspath('../glove/glove.twitter.27B.200d.txt'))

# Create a word2vec model from the Glove text data
tmp_file = get_tmpfile("test_word2vec.txt")
_ = glove2word2vec(glove_file, tmp_file)

word_embedding_model = KeyedVectors.load_word2vec_format(tmp_file)

# Dimensions set to 200.
num_features = 200

# Converting Index2Word
index2word_set = set(word_embedding_model.index_to_key)

  _ = glove2word2vec(glove_file, tmp_file)


### Filter A

In [35]:
from utils import featureVecMethod, getAvgFeatureVecs

trainFeatureVecs_4A, embedding_words_4A, no_embedding_words_4A = \
getAvgFeatureVecs(clean4A,
                  word_embedding_model, 
                  index2word_set, 
                  num_features
                  )

Shape of our matrix is: (5279, 200)
Review 0 of 5279
Review 1000 of 5279
Review 2000 of 5279
Review 3000 of 5279
Review 4000 of 5279
Review 5000 of 5279


In [34]:
print(embedding_words_4A[:50])
print()
print(no_embedding_words_4A[:50])

['my', 'all', 'right', 'no', 'of', 'really', 'but', 'then', 'who', 'waitress', 'go', 'out', 'with', 'last', 'month', 'know', 'forget', 'no', '-', 'no', '-', 'no', '-', 'no', 'no', 'who', 'who', 'talk', 'about', 'no', '-', '-', '-', 'actually', 'know', 'ever', 'say', 'they', 'close', 'down', 'bar', 'no', 'way', 'just', 'coffee', 'where', 'we', 'gon', 'na', 'hang']

[' ', "y'know", '...', '...', ' ', ' ', ' ', "y'know", ' ', '15', '...', '...', '...', ' ', ' ', '...', ' ', ' ', ' ', "i'm", ' ', ' ', "y'know", ' ', ' ', ' ', ' ', '...', '  ', ' ', "nothin'", "nothin'", "it's", "y'know", '  ', '...', ' ', ' ', ' ', ' ', ' ', '  ', "y'know", '...', ' ', ' ', ' ', ' ', ' ', '...']


In [42]:
from sklearn import svm
from sklearn.calibration import CalibratedClassifierCV

linear_model = svm.LinearSVC(max_iter=2000)
svm_linear_clf_4A = CalibratedClassifierCV(linear_model , method='sigmoid', cv=10)

svm_linear_clf_4A.fit(trainFeatureVecs_4A, training_classes)

CalibratedClassifierCV(base_estimator=LinearSVC(max_iter=2000), cv=10)

### Filter B

In [43]:
trainFeatureVecs_4B, embedding_words_4B, no_embedding_words_4B = \
getAvgFeatureVecs(clean4B,
                  word_embedding_model, 
                  index2word_set, 
                  num_features
                  )

Shape of our matrix is: (5279, 200)
Review 0 of 5279
Review 1000 of 5279
Review 2000 of 5279
Review 3000 of 5279
Review 4000 of 5279
Review 5000 of 5279


In [44]:
print(embedding_words_4B[:50])
print()
print(no_embedding_words_4B[:50])

['?', 'all', 'right', '.', 'no', 'do', "n't", 'of', '!', 'really', '?', '!', 'but', 'then', 'who', '?', 'waitress', 'go', 'out', 'with', 'last', 'month', '?', 'know', '?', 'forget', '!', 'no', '-', '-', 'no', '-', 'no', ',', 'no', '!', 'who', ',', 'who', 'be', 'talk', 'about', '?', 'no', ',', 'i', '-', '-', 'i', '-']

[' ', "y'know", '...', '...', ' ', ' ', ' ', "y'know", ' ', '15', '...', '...', '...', ' ', ' ', '...', ' ', ' ', ' ', "i'm", ' ', ' ', "y'know", ' ', ' ', ' ', ' ', '...', '  ', ' ', "nothin'", "nothin'", "y'know", '  ', '...', ' ', ' ', ' ', ' ', ' ', '  ', "y'know", '...', ' ', ' ', ' ', ' ', ' ', '...', 'goodacre']


In [45]:
linear_model = svm.LinearSVC(max_iter=2000)
svm_linear_clf_4B = CalibratedClassifierCV(linear_model , method='sigmoid', cv=10)

svm_linear_clf_4B.fit(trainFeatureVecs_4B, training_classes)

CalibratedClassifierCV(base_estimator=LinearSVC(max_iter=2000), cv=10)

## 4.3 Predicting the test data and results

Loading the systems and encode the labels

In [50]:
test_classes_4 = label_encoder.transform(test_labels_4)
print(test_classes_4[:5])
print(list(meld_dftest['Emotion'])[:5])
print(list(meld_dftest['Utterance'])[:5])

[5 0 3 3 3]
['surprise', 'anger', 'joy', 'joy', 'joy']
["Why do all you're coffee mugs have numbers on the bottom?", "Oh. That's so Monica can keep track. That way if one on them is missing, she can be like, 'Where's number 27?!'", 'Push!', "Push 'em out, push 'em out, harder, harder.", "Push 'em out, push 'em out, way out!"]


### Filter A

(a) Prediction

In [70]:
max_df_test = len(test_data_4)//10

low_df_test_4A, high_df_test_4A, test_mid_df_4A = \
low_high_mid_df(2, max_df_test, test_data_4)

Min_df 2
Max_df 135


In [80]:
print(high_df_test_4A)

{'a', 'and', 'be', 'to', 'the', 'i', ',', 'you', '.', '!', 'what', 'that', '?', 'do', 'it', "n't", 'oh'}


In [71]:
testDataVecs_4A, test_4A_known_words, test_4A_unknown_words =\
getAvgFeatureVecs(test_mid_df_4A,
                  word_embedding_model, 
                  index2word_set, 
                  num_features) 

Shape of our matrix is: (1354, 200)
Review 0 of 1354
Review 1000 of 1354


In [72]:
y_pred_svm_4A = svm_linear_clf_4A.predict(testDataVecs_4A)

(b) Results

In [77]:
# 4. Evaluating and analyzing the result
from sklearn.metrics import classification_report

report_4A = classification_report(test_classes_4,y_pred_svm_4A,digits = 6)
print(label_encoder.classes_)
print('Embeddings SVM LINEAR: MELD, Filter A')
print('Word embedding model used', wordembeddings)
print(f'Word mininum document frequency: {min_df}; maximum: {max_df_test}')
print(report_4A)

['anger' 'disgust' 'fear' 'joy' 'sadness' 'surprise']
Embeddings SVM LINEAR: MELD, Filter A
Word embedding model used glove.twitter.27B.200d.txt
Word mininum document frequency: 2; maximum: 135
              precision    recall  f1-score   support

           0   0.426877  0.313043  0.361204       345
           1   1.000000  0.014706  0.028986        68
           2   0.666667  0.040000  0.075472        50
           3   0.433333  0.808458  0.564236       402
           4   0.380952  0.115385  0.177122       208
           5   0.461268  0.466192  0.463717       281

    accuracy                       0.436484      1354
   macro avg   0.561516  0.292964  0.278456      1354
weighted avg   0.466514  0.436484  0.387244      1354



In [78]:
import sklearn
print('Confusion matrix SVM, embeddings, MELD, Filter A')
print(label_encoder.classes_)
print(sklearn.metrics.confusion_matrix(test_classes_4,y_pred_svm_4A))

Confusion matrix SVM, embeddings, MELD, Filter A
['anger' 'disgust' 'fear' 'joy' 'sadness' 'surprise']
[[108   0   0 154  13  70]
 [ 25   1   0  24   3  15]
 [ 11   0   2  26   5   6]
 [ 33   0   0 325   7  37]
 [ 49   0   0 110  24  25]
 [ 27   0   1 111  11 131]]


In [74]:
pred_probabilities_4A = svm_linear_clf_4A.predict_proba(testDataVecs_4A)

pred_labels_4A = []
for predicted_label in y_pred_svm_4A:
    pred_labels_4A.append(label_encoder.classes_[predicted_label])

gold_labels_4A = []
for gold_label in test_classes_4:
    gold_labels_4A.append(label_encoder.classes_[gold_label])

result_frame4A = pd.DataFrame(pred_probabilities_4A*100, columns=label_encoder.classes_)

result_frame4A['Chat']= list(meld_dftest['Utterance'])
result_frame4A['Prediction']=pred_labels_4A
result_frame4A['Gold']=gold_labels_4A

result_frame4A.to_csv("result_frame4A.csv")
result_frame4A.head()

Unnamed: 0,anger,disgust,fear,joy,sadness,surprise,Chat,Prediction,Gold
0,21.974532,4.69751,1.819319,36.286647,9.914662,25.307331,Why do all you're coffee mugs have numbers on ...,joy,surprise
1,23.044432,4.281091,4.068578,43.770453,12.257258,12.578188,Oh. That's so Monica can keep track. That way ...,joy,anger
2,40.557398,6.361188,16.760104,21.53718,5.397572,9.386557,Push!,anger,joy
3,29.797333,4.415145,4.759714,35.45502,20.440459,5.132329,"Push 'em out, push 'em out, harder, harder.",joy,joy
4,29.587861,5.569622,6.130495,41.155173,13.071929,4.48492,"Push 'em out, push 'em out, way out!",joy,joy


### Filter B

In [63]:
low_df_test_4B, DTandPRP_test_4B, clean_test_4B = \
remove_DT_PRP(2, test_data_4)

Determiner and pronouns {'she', "you're", "they're", 'no', 'yours', 'his', "i'm", 'some', 'you', 'themselves', 'their', 'either', 'this', 'both', "'em", 'a', 'the', 'they', 'himself', 'that', '’s', "my'this", 'those', "'s", 'mine', 'ya', 'each', 'myself', 'all', 'hers', 'i', 'its', 'her', 'your', 'my', "i'i", 'any', 'we', 'our', 'he', 'every', 'one', 'another', 'it', 'an', 'yourself', 'these'}
Min_df 2


In [65]:
testDataVecs_4B, test_4B_known_words, test_4B_unknown_words =\
getAvgFeatureVecs(clean_test_4B,
                  word_embedding_model, 
                  index2word_set, 
                  num_features) 

Shape of our matrix is: (1354, 200)
Review 0 of 1354
Review 1000 of 1354


In [66]:
y_pred_svm_4B = svm_linear_clf_4B.predict(testDataVecs_4B)

In [75]:
report_4B = classification_report(test_classes_4,y_pred_svm_4B,digits = 6)
print(label_encoder.classes_)
print('Embeddings SVM LINEAR: MELD, Filter B')
print('Word embedding model used', wordembeddings)
print('Word mininum document frequency', min_df, "; DT PRP removed")
print(report_4B)

['anger' 'disgust' 'fear' 'joy' 'sadness' 'surprise']
Embeddings SVM LINEAR: MELD, Filter B
Word embedding model used glove.twitter.27B.200d.txt
Word mininum document frequency 2 ; DT PRP removed
              precision    recall  f1-score   support

           0   0.458955  0.356522  0.401305       345
           1   0.333333  0.029412  0.054054        68
           2   0.300000  0.060000  0.100000        50
           3   0.482036  0.800995  0.601869       402
           4   0.526316  0.240385  0.330033       208
           5   0.573290  0.626335  0.598639       281

    accuracy                       0.499261      1354
   macro avg   0.445655  0.352275  0.347650      1354
weighted avg   0.487705  0.499261  0.462291      1354



In [79]:
print('Confusion matrix SVM, embeddings, MELD, Filter B')
print(label_encoder.classes_)
print(sklearn.metrics.confusion_matrix(test_classes_4,y_pred_svm_4B))

Confusion matrix SVM, embeddings, MELD, Filter B
['anger' 'disgust' 'fear' 'joy' 'sadness' 'surprise']
[[123   2   2 136  16  66]
 [ 25   2   0  29   5   7]
 [ 10   0   3  21   6  10]
 [ 37   1   1 322  13  28]
 [ 46   1   2  89  50  20]
 [ 27   0   2  71   5 176]]


In [76]:
pred_probabilities_4B = svm_linear_clf_4B.predict_proba(testDataVecs_4B)

pred_labels_4B = []
for predicted_label in y_pred_svm_4B:
    pred_labels_4B.append(label_encoder.classes_[predicted_label])

gold_labels_4B = []
for gold_label in test_classes:
    gold_labels_4B.append(label_encoder.classes_[gold_label])

result_frame4B = pd.DataFrame(pred_probabilities_4B*100, columns=label_encoder.classes_)

result_frame4B['Chat']= list(meld_dftest['Utterance'])
result_frame4B['Prediction']=pred_labels_4B
result_frame4B['Gold']=gold_labels_4B

result_frame4B.to_csv("result_frame4B.csv")
result_frame4B.head()

Unnamed: 0,anger,disgust,fear,joy,sadness,surprise,Chat,Prediction,Gold
0,25.467972,4.532039,2.01875,15.162789,8.827758,43.990691,Why do all you're coffee mugs have numbers on ...,surprise,surprise
1,19.545227,4.368579,3.6652,46.422368,13.581513,12.417113,Oh. That's so Monica can keep track. That way ...,joy,anger
2,53.459969,3.640484,4.550625,29.936859,2.548967,5.863097,Push!,anger,joy
3,31.274717,3.971997,6.440881,25.025715,29.345872,3.940817,"Push 'em out, push 'em out, harder, harder.",anger,joy
4,43.620567,4.945454,6.789892,27.679692,11.559821,5.404574,"Push 'em out, push 'em out, way out!",anger,joy
