# 1. MELD, BoW

## 1.1 Data preparation and filtering

(a) Loading the training and test data

In [1]:
import pandas as pd

In [2]:
filepath = './data/MELD/train_sent_emo.csv'
meld_dftrain = pd.read_csv(filepath)
meld_dftrain['Utterance'] = meld_dftrain['Utterance'].str.replace("\x92|\x97|\x91|\x93|\x94|\x85", "'")

filepath = './data/MELD/test_sent_emo.csv'
meld_dftest = pd.read_csv(filepath)
meld_dftest['Utterance'] = meld_dftest['Utterance'].str.replace("\x92|\x97|\x91|\x93|\x94|\x85", "'")

  meld_dftrain['Utterance'] = meld_dftrain['Utterance'].str.replace("\x92|\x97|\x91|\x93|\x94|\x85", "'")
  meld_dftest['Utterance'] = meld_dftest['Utterance'].str.replace("\x92|\x97|\x91|\x93|\x94|\x85", "'")


(b) MELD data preprocessing: removing 'Neutral' utterances

In [3]:
meld_dftrain = meld_dftrain.set_index("Emotion", drop=False)
meld_dftrain = meld_dftrain.drop("neutral", axis=0)

meld_dftest = meld_dftest.set_index("Emotion", drop=False)
meld_dftest = meld_dftest.drop("neutral", axis=0)

In [4]:
list(meld_dftrain['Utterance'][:10])

['My duties?  All right.',
 "No don't I beg of you!",
 'Really?!',
 'But then who? The waitress I went out with last month?',
 'You know? Forget it!',
 'No-no-no-no, no! Who, who were you talking about?',
 "No, I-I-I-I don't, I actually don't know",
 'Do I ever.',
 "Chris says they're closing down the bar.",
 'No way!']

(c) Building the filters.
These filters are written here, and saved in utils.py.

### Filter A: MaxDF

In [5]:
def low_high_mid_df(min_df, max_df, texts):
    """
    This function separates texts into three iterables according to their document frequencies.
    :min_df: int, words occuring in documents less than this number will be put into low_df.
    :max_df: int, words occuring in documents more than this number will be put into high_df.
    :texts: a list of lists with sentences of spaCy-tokenized words.
    :return: low_df - set of words of which df is too low.
        high_df - set of words of which df is too high.
        mid_df_texts - list of texts in which the dfs of their words are in the middle.
    """
    new_texts = []
    alltokens = set()
    for text in texts:
        sent = []
        for token in text:
            token = token.lemma_.lower()  # Lemmatize the input and make them lowercase.
            sent.append(token)
            alltokens.add(token)
        new_texts.append(sent)
    
    kw_count = dict.fromkeys(alltokens, 0)
    for text in new_texts:
        for key in kw_count:
            if key in text:  # If a word is in a document,
                kw_count[key] += 1  # Df += 1

    low_df = set()
    high_df = set()
    for word, count in kw_count.items():
        if count>max_df:
            high_df.add(word)
        elif count<min_df:
            low_df.add(word)
            
    mid_df_texts = []
    for text in new_texts:
        mid_df_texts.append([tok for tok in text if (tok not in high_df) and (tok not in low_df)])
    
    print('Min_df', min_df)
    print('Max_df', max_df)
    return low_df, high_df, mid_df_texts

In [6]:
# A test for the filter
test_str_a = ["So beat it, just beat it",
"You better run, you better do what you can",
"Don't wanna see no blood, don't be a macho man",
"You wanna be tough, better do what you can",
"So beat it, but you wanna be bad",
"Just beat it (beat it), beat it (beat it)",
"No one wants to be defeated"]

import spacy
nlp = spacy.load("en_core_web_sm")

test_list_a = [nlp(sent) for sent in test_str_a]

low_high_mid_df(2, 3, test_list_a)

Min_df 2
Max_df 3


({'(',
  ')',
  'a',
  'bad',
  'blood',
  'but',
  'defeat',
  'macho',
  'man',
  "n't",
  'one',
  'run',
  'see',
  'to',
  'tough',
  'want'},
 {',', 'be'},
 [['so', 'beat', 'it', 'just', 'beat', 'it'],
  ['you', 'well', 'you', 'well', 'do', 'what', 'you', 'can'],
  ['do', 'wanna', 'no', 'do'],
  ['you', 'wanna', 'well', 'do', 'what', 'you', 'can'],
  ['so', 'beat', 'it', 'you', 'wanna'],
  ['just', 'beat', 'it', 'beat', 'it', 'beat', 'it', 'beat', 'it'],
  ['no']])

### Filter B: DTandPRP

In [7]:
def remove_DT_PRP(min_df, texts):
    """
    This function 1) removes determiners and pronouns in texts; 2) separates rare words with low df.
    :min_df: int, words occuring in documents less than this number will be put into low_df.
    :texts: a list of lists with sentences of spaCy-tokenized, POS-tagged words.
    :return: low_df - list of words of which df is too low.
        clean_texts - list of sentences with no determiners, pronouns and low-df words.
    """
    DTandPRP_tag = ["DT", "PRP", "PRP$"]
    DTandPRP_tok = set()
    vocab = set()
    new_texts = []
    for text in texts:
        sent = []
        for token in text:
            if token.tag_ in DTandPRP_tag:
                DTandPRP_tok.add(token.lemma_.lower())
            else:
                token = token.lemma_.lower()
                sent.append(token)
                vocab.add(token)  # Create a set a vocab without the DTs and PRPs
        new_texts.append(sent)
    
    kw_count = dict.fromkeys(vocab, 0)
    for text in new_texts:
        for key in kw_count:
            if key in text:  # If a word is in a document,
                kw_count[key] += 1  # Df += 1
    
    low_df = set()
    for word, count in kw_count.items():
        if count<min_df:
            low_df.add(word)
    
    clean_texts = []
    for text in new_texts:
        # Keep words if they are not DTs nor PRPs, and not lower than min_df
        clean_texts.append([tok for tok in text if (tok in vocab) and (tok not in low_df)])
    
    print('Determiner and pronouns', DTandPRP_tok)
    print('Min_df', min_df)
    return low_df, DTandPRP_tok, clean_texts

In [8]:
# A test for the filter
test_str_b = ["So beat it, just beat it",
"You better run, you better do what you can",
"Don't wanna see no blood, don't be a macho man",
"You wanna be tough, better do what you can",
"So beat it, but you wanna be bad",
"Just beat it (beat it), beat it (beat it)",
"No one wants to be defeated"]

test_list_b = [nlp(sent) for sent in test_str_b]

remove_DT_PRP(2, test_list_b)

Determiner and pronouns {'it', 'a', 'no', 'you'}
Min_df 2


({'(',
  ')',
  'bad',
  'blood',
  'but',
  'defeat',
  'macho',
  'man',
  "n't",
  'one',
  'run',
  'see',
  'to',
  'tough',
  'want'},
 {'a', 'it', 'no', 'you'},
 [['so', 'beat', ',', 'just', 'beat'],
  ['well', ',', 'well', 'do', 'what', 'can'],
  ['do', 'wanna', ',', 'do', 'be'],
  ['wanna', 'be', ',', 'well', 'do', 'what', 'can'],
  ['so', 'beat', ',', 'wanna', 'be'],
  ['just', 'beat', 'beat', ',', 'beat', 'beat'],
  ['be']])

(d) Tokenizing and filtering

In [9]:
# Using spaCy to tokenize the sentences
training_data_1 = [nlp(sent) for sent in list(meld_dftrain['Utterance'])]
training_labels_1 = list(meld_dftrain['Emotion'])

test_data_1 = [nlp(sent) for sent in list(meld_dftest['Utterance'])]
test_labels_1 = list(meld_dftest['Emotion'])

In [10]:
# for sent in training_data_1:
#     for token in sent:
#         if "I'm" == token.text:
#             print(sent)

    (i) Filter A: MaxDF

 - min_df would be 2, considering words appearing in less than 2 documents are rare.
 - max_df would be 1/10 of the amount of documents, which means if a word appear once in every 10 documents, its df is considered to be too high. Besides subjective and intuitive judgment, this number is also manipulated to make the vocabulary sizes of the two filters more similar and comparable.

In [11]:
#from utils import low_high_mid_df

min_df = 2
max_df = len(training_data_1)//10

low_df, high_df, clean1A = low_high_mid_df(min_df, max_df, training_data_1)

print("Rare words with low df = ", len(low_df), "words. Examples: ", list(low_df)[:20])
print("Stop words with high df:", high_df)
vocab_1A = set()
for sent in clean1A:
    for t in sent:
        vocab_1A.add(t)
print("Size of the rest vocab:", len(vocab_1A))
print("Samples:", clean1A[10:20])

Min_df 2
Max_df 527
Rare words with low df =  1680 words. Examples:  ['address', 'scooter', 'swim', 'island', 'mellow', 'pink', 'yentel', 'paste', '33', 'mattress', 'tibidaybo', 'meat', 'confident', 'thin', 'swirl', 'adorable', 'requires', 'lord', 'frustrating', 'lay']
Stop words with high df: {'and', '!', 'a', 'what', "n't", 'that', 'do', '?', 'the', 'you', 'it', 'to', '.', 'be', 'i', 'oh', ','}
Size of the rest vocab: 1530
Samples: [['just', 'coffee', 'where', 'we', 'gon', 'na', 'hang', 'out', 'now'], ['got'], [], ['um', '-', 'mm', 'yeah', 'right'], ['my', 'god', 'my', 'god', 'poor', 'monica'], [], [], ['he', 'think', 'monica', 'empty', 'she', 'empty', 'vase'], ['totally', 'god', 'she', 'seem', 'so', 'happy', 'too'], ['hey']]


    (ii) Filter B: DTandPRP

In [12]:
#from utils import remove_DT_PRP

min_df = 2

low_df, DTandPRP_tok, clean1B = remove_DT_PRP(min_df, training_data_1)

print("Rare words with low df = ", len(low_df), "words. Examples:", list(low_df)[:20])
vocab_1B = set()
for sent in clean1B:
    for t in sent:
        vocab_1B.add(t)
print("Size of the rest vocab:", len(vocab_1B))
print("Samples:", clean1B[10:20])

Determiner and pronouns {'some', "underwear'you", 'that', 'all', 'that?s', "i'll", 'they', "mean'i", "was'the", 'he', 'an', 'i', 'yourself', "i'i'm", 'those', 'every', 'both', 'its', 'ya', 'either', "you're", 'hers', 'herself', "you're'you", 'yours', 'ourselves', 'your', "i'm", "fact'yes", 'her', 'i-', 'his', 'we', 'ba', 'any', 'these', 'a', "it's", "up'i", 'you', 'each', 'my', 'no', 'our', 'themselves', "that'you", "'em", 'myself', "film'that", 'itself', 'tux', 'mine', 'this', 'the', 'their', 'ours', 'it', 'another', 'the-', "i'y'know", "'s", 'she', 'neither'}
Min_df 2
Rare words with low df =  1670 words. Examples: ['address', 'scooter', 'swim', 'island', 'mellow', 'pink', 'yentel', 'paste', '33', 'mattress', 'tibidaybo', 'meat', 'confident', 'thin', 'swirl', 'adorable', 'requires', 'lord', 'frustrating', 'lay']
Size of the rest vocab: 1515
Samples: [['just', 'coffee', '!', 'where', 'be', 'gon', 'na', 'hang', 'out', 'now', '?'], ['got', '.'], ['!'], ['um', '-', 'mm', ',', 'yeah', 'ri

## 1.2 BoW vectorization and training the classifiers

In [13]:
from sklearn import preprocessing
import pickle
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn import svm

In [14]:
# loaded_classifier = pickle.load(open(filename_classifier, 'rb'))
# loaded_vectorizer = pickle.load(open(filename_vectorizer, 'rb'))
# loaded_transformer = pickle.load(open(filename_transformer, 'rb'))
# loaded_label_encoder = pickle.load(open(filename_encoder, 'rb'))

(a) Encoding training labels:

In [15]:
label_encoder = preprocessing.LabelEncoder()
label_encoder.fit(training_labels_1+test_labels_1)
print(list(label_encoder.classes_))

['anger', 'disgust', 'fear', 'joy', 'sadness', 'surprise']


In [16]:
training_classes = label_encoder.transform(training_labels_1)
print(training_classes[:20])
print(list(meld_dftrain['Emotion'])[:20])
print(list(meld_dftrain['Utterance'])[:20])

[5 2 5 5 4 5 2 3 4 5 1 4 3 3 5 5 5 5 4 5]
['surprise', 'fear', 'surprise', 'surprise', 'sadness', 'surprise', 'fear', 'joy', 'sadness', 'surprise', 'disgust', 'sadness', 'joy', 'joy', 'surprise', 'surprise', 'surprise', 'surprise', 'sadness', 'surprise']
['My duties?  All right.', "No don't I beg of you!", 'Really?!', 'But then who? The waitress I went out with last month?', 'You know? Forget it!', 'No-no-no-no, no! Who, who were you talking about?', "No, I-I-I-I don't, I actually don't know", 'Do I ever.', "Chris says they're closing down the bar.", 'No way!', 'Just coffee! Where are we gonna hang out now?', 'Got me.', 'You betcha!', 'Um-mm, yeah right!', 'Oh my God, oh my God! Poor Monica!', 'What, what, what?!', 'What?!', 'He thinks Monica is empty, she is the empty vase!', 'Oh, totally. Oh, God, oh, she seemed so happy too.', 'Hey!']


### Filter A: 

(a) Vectorise

In [17]:
# A CountVectorizer which takes tokenized lists as input
### Taken from https://stackoverflow.com/questions/35867484/pass-tokens-to-countvectorizer,
### https://stackoverflow.com/questions/27673527/how-should-i-vectorize-the-following-list-of-lists-with-scikit-learn, 26 Oct 2021

def dummy(x):
    return x

utterance_vec_1A = CountVectorizer(tokenizer=dummy, lowercase=False)

training_count_vectors_1A = utterance_vec_1A.fit_transform(clean1A)

In [18]:
print(training_count_vectors_1A .toarray()[0][:200])

[1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]


In [19]:
#Total number of word features or the length of the total vector
print(len(utterance_vec_1A.vocabulary_))

1530


In [20]:
# First 50 feature names
print(list(utterance_vec_1A.get_feature_names())[:50])

[' ', '  ', '   ', '"', '$', "'", "'cause", "'d", "'em", "'ll", "'s", "'ve", '(', '-', '--', '..', '...', '....', '.....', '1', '10', '15', '17', '18', '19', '2,000', '20', '200', '25', '40', '50', '500', '7', '700', '74', '80', ':', ';', '[', ']', 'aaron', 'able', 'about', 'absolutely', 'accent', 'accept', 'ace', 'across', 'act', 'actor']


In [21]:
# Convert raw frequency counts into TF-IDF values
tfidf_transformer = TfidfTransformer()
training_tfidf_1A = tfidf_transformer.fit_transform(training_count_vectors_1A)

In [22]:
print(training_tfidf_1A.toarray()[0][:200])

[0.47132701 0.         0.         0.         0.         0.
 0.         0.         0.         0.         0.         0.
 0.         0.         0.         0.         0.         0.
 0.         0.         0.         0.         0.         0.
 0.         0.         0.         0.         0.         0.
 0.         0.         0.         0.         0.         0.
 0.         0.         0.         0.         0.         0.
 0.         0.         0.         0.         0.         0.
 0.         0.         0.         0.         0.         0.
 0.         0.         0.         0.         0.         0.
 0.         0.         0.         0.         0.         0.
 0.         0.         0.         0.         0.         0.
 0.53124026 0.         0.         0.         0.         0.
 0.         0.         0.         0.         0.         0.
 0.         0.         0.         0.         0.         0.
 0.         0.         0.         0.         0.         0.
 0.         0.         0.         0.         0.         

In [24]:
from sklearn.calibration import CalibratedClassifierCV
linear_model = svm.LinearSVC(max_iter=2000)
svm_linear_clf_1A = CalibratedClassifierCV(linear_model , method='sigmoid', cv=10)

svm_linear_clf_1A.fit(training_tfidf_1A, training_classes)

CalibratedClassifierCV(base_estimator=LinearSVC(max_iter=2000), cv=10)

### Filter B

(a) Vectorise

In [25]:
utterance_vec_1B = CountVectorizer(tokenizer=dummy, lowercase=False)

training_count_vectors_1B = utterance_vec_1B.fit_transform(clean1B)
training_tfidf_1B = tfidf_transformer.fit_transform(training_count_vectors_1B)

In [26]:
#Total number of word features or the length of the total vector
print(len(utterance_vec_1B.vocabulary_))

1515


In [27]:
# First 50 feature names
print(list(utterance_vec_1B.get_feature_names())[1000:1010])

['personal', 'pete', 'pheebs', 'phil', 'phoebe', 'phone', 'pick', 'picnic', 'picture', 'pie']


(b) Train the classifier

In [28]:
linear_model = svm.LinearSVC(max_iter=2000)
svm_linear_clf_1B = CalibratedClassifierCV(linear_model , method='sigmoid', cv=10)

svm_linear_clf_1B.fit(training_tfidf_1B, training_classes)

CalibratedClassifierCV(base_estimator=LinearSVC(max_iter=2000), cv=10)

## 1.3 Predicting the test data and results

In [29]:
import sklearn
from sklearn.metrics import classification_report

Encode the test labels

In [30]:
test_classes = label_encoder.transform(test_labels_1)
print(test_classes[:20])
print(list(meld_dftest['Emotion'])[:20])
print(list(meld_dftest['Utterance'])[:20])

[5 0 3 3 3 3 3 3 3 4 5 0 0 0 3 3 2 0 1 5]
['surprise', 'anger', 'joy', 'joy', 'joy', 'joy', 'joy', 'joy', 'joy', 'sadness', 'surprise', 'anger', 'anger', 'anger', 'joy', 'joy', 'fear', 'anger', 'disgust', 'surprise']
["Why do all you're coffee mugs have numbers on the bottom?", "Oh. That's so Monica can keep track. That way if one on them is missing, she can be like, 'Where's number 27?!'", 'Push!', "Push 'em out, push 'em out, harder, harder.", "Push 'em out, push 'em out, way out!", "Let's get that ball and really move, hey, hey, ho, ho.", "Let's'  I was just'yeah, right.", 'Push!', 'Push!', "Uhh, yes I did but there isn't. Okay, here we go.", 'Okay, go left. Left! Left!', "Okay, y'know what? There is no more left, left!", 'Oh okay, lift it straight up over your head!', 'Straight up over your head!', 'You can do it!', 'You can do it!', "No wait, look. Look! I'm sorry, it's just I've never even", 'Okay, fine, whatever. Welcome to the building.', 'Ugh, can you believe that guy!', 'Ohh!

### Filter A:

In [49]:
max_df_test = len(test_data_1)//10

low_df_test_1A, high_df_test_1A, test_mid_df_1A = \
low_high_mid_df(2, max_df_test, test_data_1)

Min_df 2
Max_df 135


In [45]:
test_count_1A = utterance_vec_1A.transform(test_mid_df_1A)
test_tfidf_1A = tfidf_transformer.fit_transform(test_count_1A)

y_pred_svm_1A = svm_linear_clf_1A.predict(test_tfidf_1A)

In [51]:
report1A = classification_report(test_classes,y_pred_svm_1A,digits = 6)
print(label_encoder.classes_)
print('BoW TFIDF SVM LINEAR: MELD, Filter A')
print(f'Word mininum document frequency: {min_df}; maximum: {max_df_test}')
print(report1A)

print('Confusion matrix SVM, BoW MELD, Filter A')
print(label_encoder.classes_)
print(sklearn.metrics.confusion_matrix(test_classes,y_pred_svm_1A))

['anger' 'disgust' 'fear' 'joy' 'sadness' 'surprise']
BoW TFIDF SVM LINEAR: MELD, Filter A
Word mininum document frequency: 2; maximum: 135
              precision    recall  f1-score   support

           0   0.418803  0.284058  0.338515       345
           1   0.500000  0.014706  0.028571        68
           2   0.500000  0.020000  0.038462        50
           3   0.430636  0.741294  0.544790       402
           4   0.432432  0.153846  0.226950       208
           5   0.437143  0.544484  0.484945       281

    accuracy                       0.430576      1354
   macro avg   0.453169  0.293065  0.277039      1354
weighted avg   0.435292  0.430576  0.386362      1354

Confusion matrix SVM, BoW MELD, Filter A
['anger' 'disgust' 'fear' 'joy' 'sadness' 'surprise']
[[ 98   1   0 150  11  85]
 [ 21   1   0  22   4  20]
 [ 13   0   1  24   4   8]
 [ 38   0   0 298  12  54]
 [ 39   0   1 106  32  30]
 [ 25   0   0  92  11 153]]


In [48]:
pred_probabilities_1A = svm_linear_clf_1A.predict_proba(test_tfidf_1A)

pred_labels_1A = []
for predicted_label in y_pred_svm_1A:
    pred_labels_1A.append(label_encoder.classes_[predicted_label])

gold_labels_1A = []
for gold_label in test_classes:
    gold_labels_1A.append(label_encoder.classes_[gold_label])

result_frame1A = pd.DataFrame(pred_probabilities_1A*100, columns=label_encoder.classes_)

result_frame1A['Chat']= list(meld_dftest['Utterance'])
result_frame1A['Prediction']=pred_labels_1A
result_frame1A['Gold']=gold_labels_1A

result_frame1A.to_csv("result_frame1A.csv")
result_frame1A.head()

Unnamed: 0,anger,disgust,fear,joy,sadness,surprise,Chat,Prediction,Gold
0,14.956565,5.370529,8.36213,22.724746,7.195511,41.390519,Why do all you're coffee mugs have numbers on ...,surprise,surprise
1,21.956232,3.814577,6.349448,48.573109,14.965021,4.341613,Oh. That's so Monica can keep track. That way ...,joy,anger
2,19.088784,4.87963,4.239784,28.195862,9.667696,33.928244,Push!,surprise,joy
3,19.732127,7.285264,4.7349,28.663558,33.072693,6.511459,"Push 'em out, push 'em out, harder, harder.",sadness,joy
4,23.146414,6.687672,4.101298,35.686163,11.287242,19.09121,"Push 'em out, push 'em out, way out!",joy,joy


In [52]:
def average_importances(model):
    coef_avg = 0
    for classifier in model.calibrated_classifiers_:
        coef_avg = coef_avg + classifier.base_estimator.coef_
        
    coef_avg  = coef_avg/len(model.calibrated_classifiers_)
    return coef_avg

def f_importances(importances, names, n=20):
    class_labels = label_encoder.classes_
    
    for num, imp in enumerate(importances):
        emotion = class_labels[num]
        topn = sorted(zip(imp,names), reverse=True)[:n]
        
        print("Important words in {} utterances".format(emotion))
        for coef, feat in topn:
            print(emotion, coef, feat)
        print("-----------------------------------------")

print('Most important features per emotion: 1A')
feature_names = utterance_vec_1A.get_feature_names()
importances = average_importances(svm_linear_clf_1A)
f_importances(importances, feature_names)

Most important features per emotion: 1A
Important words in anger utterances
anger 1.7819809207908481 tape
anger 1.5872583100466553 mad
anger 1.5782512719487427 calm
anger 1.5536727080819999 oboe
anger 1.4735916728142802 knuckle
anger 1.4526197530277458 garbage
anger 1.4518083936438988 quarter
anger 1.429492054025632 gimme
anger 1.4191189726675855 york
anger 1.3879065982043068 fancy
anger 1.3569580668773875 easy
anger 1.3538630374952905 pack
anger 1.3516010026819918 bloody
anger 1.344249194703592 table
anger 1.3397681910913177 jackass
anger 1.3272354867985725 scrud
anger 1.3195553028345093 choice
anger 1.3099244715665272 care
anger 1.3024775420366317 laminate
anger 1.2971931691062708 suggestion
-----------------------------------------
Important words in disgust utterances
disgust 1.774723789387489 disgusting
disgust 1.7699263786385646 ew
disgust 1.7391020527921757 violate
disgust 1.6597005426841491 behave
disgust 1.634613306739066 eww
disgust 1.5211811408468356 boat
disgust 1.495124304

### Filter B:

In [36]:
low_df_test_1B, DTandPRP_test_1B, clean_test_1B = \
remove_DT_PRP(2, test_data_1)

Determiner and pronouns {'some', 'that', 'all', 'they', 'he', 'an', 'i', 'yourself', 'those', 'every', 'its', 'either', 'ya', "you're", 'hers', 'both', "my'this", 'yours', 'your', "i'm", '’s', 'her', "they're", 'his', 'we', 'any', 'these', 'a', 'you', "i'i", 'each', 'my', 'no', 'our', 'himself', 'themselves', 'one', "'em", 'myself', 'mine', 'this', 'the', 'their', 'it', 'another', "'s", 'she'}
Min_df 2


In [37]:
test_count_1B = utterance_vec_1B.transform(clean_test_1B)
test_tfidf_1B = tfidf_transformer.fit_transform(test_count_1B)

y_pred_svm_1B = svm_linear_clf_1B.predict(test_tfidf_1B)

In [53]:
report1B = classification_report(test_classes,y_pred_svm_1B,digits = 6)
print(label_encoder.classes_)
print('BoW TFIDF SVM LINEAR: MELD, Filter B')
print('Word mininum document frequency', min_df, "; DT PRP removed")
print(report1B)

print('Confusion matrix SVM, BoW MELD, Filter B')
print(label_encoder.classes_)
print(sklearn.metrics.confusion_matrix(test_classes,y_pred_svm_1B))

['anger' 'disgust' 'fear' 'joy' 'sadness' 'surprise']
BoW TFIDF SVM LINEAR: MELD, Filter B
Word mininum document frequency 2 ; DT PRP removed
              precision    recall  f1-score   support

           0   0.442688  0.324638  0.374582       345
           1   1.000000  0.029412  0.057143        68
           2   0.500000  0.060000  0.107143        50
           3   0.443953  0.748756  0.557407       402
           4   0.505495  0.221154  0.307692       208
           5   0.546296  0.629893  0.585124       281

    accuracy                       0.473412      1354
   macro avg   0.573072  0.335642  0.331515      1354
weighted avg   0.504319  0.473412  0.436463      1354

Confusion matrix SVM, BoW MELD, Filter B
['anger' 'disgust' 'fear' 'joy' 'sadness' 'surprise']
[[112   0   0 163  11  59]
 [ 19   2   0  27   6  14]
 [ 14   0   3  18   5  10]
 [ 44   0   1 301  16  40]
 [ 39   0   2  97  46  24]
 [ 25   0   0  72   7 177]]


In [39]:
pred_probabilities_1B = svm_linear_clf_1B.predict_proba(test_tfidf_1B)

pred_labels_1B = []
for predicted_label in y_pred_svm_1B:
    pred_labels_1B.append(label_encoder.classes_[predicted_label])

gold_labels_1B = []
for gold_label in test_classes:
    gold_labels_1B.append(label_encoder.classes_[gold_label])

result_frame1B = pd.DataFrame(pred_probabilities_1B*100, columns=label_encoder.classes_)

result_frame1B['Chat']= list(meld_dftest['Utterance'])
result_frame1B['Prediction']=pred_labels_1B
result_frame1B['Gold']=gold_labels_1B

result_frame1B.to_csv("result_frame1B.csv")

In [40]:
result_frame1B.head()

Unnamed: 0,anger,disgust,fear,joy,sadness,surprise,Chat,Prediction,Gold
0,13.914321,5.744996,8.733563,29.244748,7.023625,35.338748,Why do all you're coffee mugs have numbers on ...,surprise,surprise
1,19.148226,5.112113,5.458403,52.121119,14.897871,3.262269,Oh. That's so Monica can keep track. That way ...,joy,anger
2,24.664124,4.624781,0.78458,45.874474,2.186192,21.86585,Push!,joy,joy
3,17.951878,6.663199,7.649186,13.439925,51.485383,2.81043,"Push 'em out, push 'em out, harder, harder.",sadness,joy
4,31.840525,6.647776,7.327343,26.336274,11.931238,15.916844,"Push 'em out, push 'em out, way out!",anger,joy


In [54]:
print('Most important features per emotion: 1B')
feature_names = utterance_vec_1B.get_feature_names()
importances = average_importances(svm_linear_clf_1B)
f_importances(importances, feature_names)

Most important features per emotion: 1B
Important words in anger utterances
anger 1.693612461876011 tape
anger 1.614785222225591 calm
anger 1.5201460511540483 oboe
anger 1.504128271368633 mad
anger 1.4819404056686212 puck
anger 1.462976671944109 garbage
anger 1.445775564305268 table
anger 1.4358343821895163 easy
anger 1.4311620711579942 gimme
anger 1.353733224177083 bloody
anger 1.344203950395651 bitch
anger 1.3426957676864837 quarter
anger 1.3424136965059186 scrud
anger 1.3278274654237006 knuckle
anger 1.3131064638616585 choice
anger 1.3020290384125368 york
anger 1.3011097227434945 cramp
anger 1.2867341345561323 jackass
anger 1.2844905336447041 nothin'
anger 1.2485350603651733 fancy
-----------------------------------------
Important words in disgust utterances
disgust 1.853043201820205 ew
disgust 1.771912102216519 disgusting
disgust 1.7039667355156578 behave
disgust 1.6953516943282883 violate
disgust 1.6516117916952766 eww
disgust 1.6268312307781705 boat
disgust 1.5165538761449802 cl