# 0. Overview of Data

In [1]:
import pandas as pd
import spacy
from spacy import displacy
import nltk

## 0.1 Loading the MELD data set

In [2]:
# Load the MELD training split from disk.
filepath = './data/MELD/train_sent_emo.csv'
meld_train = pd.read_csv(filepath)

# Some utterances contain stray characters left over from an encoding problem
# (\x85, \x91-\x94, \x97 -- presumably Windows-1252 punctuation; TODO confirm).
# Every one of them is replaced with a plain apostrophe.
#
# regex=True is passed explicitly: the pattern is an alternation, and since
# pandas 2.0 the default is regex=False, which would search for the literal
# text "\x92|\x97|..." and silently leave the utterances unchanged.
meld_train['Utterance'] = meld_train['Utterance'].str.replace(
    "\x92|\x97|\x91|\x93|\x94|\x85", "'", regex=True
)

  meld_train['Utterance'] = meld_train['Utterance'].str.replace("\x92|\x97|\x91|\x93|\x94|\x85", "'")


In [3]:
# Remove every row labelled "neutral".
# The frame is re-indexed by its "Emotion" column (the column itself is kept,
# because drop=False) so that the unwanted label can be dropped by index value.
meld_train = (
    meld_train
    .set_index("Emotion", drop=False)
    .drop(index="neutral")
)
meld_train.info()

<class 'pandas.core.frame.DataFrame'>
Index: 5279 entries, surprise to joy
Data columns (total 11 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   Sr No.        5279 non-null   int64 
 1   Utterance     5279 non-null   object
 2   Speaker       5279 non-null   object
 3   Emotion       5279 non-null   object
 4   Sentiment     5279 non-null   object
 5   Dialogue_ID   5279 non-null   int64 
 6   Utterance_ID  5279 non-null   int64 
 7   Season        5279 non-null   int64 
 8   Episode       5279 non-null   int64 
 9   StartTime     5279 non-null   object
 10  EndTime       5279 non-null   object
dtypes: int64(5), object(6)
memory usage: 494.9+ KB


In [4]:
# Preview the first five rows that remain after filtering.
meld_train.head(n=5)

Unnamed: 0_level_0,Sr No.,Utterance,Speaker,Emotion,Sentiment,Dialogue_ID,Utterance_ID,Season,Episode,StartTime,EndTime
Emotion,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
surprise,5,My duties? All right.,Chandler,surprise,positive,0,4,8,21,"00:16:34,452","00:16:40,917"
fear,11,No don't I beg of you!,Chandler,fear,negative,0,10,8,21,"00:17:02,856","00:17:04,858"
surprise,13,Really?!,Chandler,surprise,positive,0,12,8,21,"00:17:13,491","00:17:16,536"
surprise,15,But then who? The waitress I went out with las...,Joey,surprise,negative,1,0,9,23,"00:36:40,364","00:36:42,824"
sadness,16,You know? Forget it!,Rachel,sadness,negative,1,1,9,23,"00:36:44,368","00:36:46,578"


### Looking at examples of each emotion:

In [5]:
# Bucket utterances by emotion label, looking only at the first 500 rows.
# Labels are inserted in order of first appearance; utterances keep row order.
examples = {}

for label, text in zip(meld_train['Emotion'].iloc[:500], meld_train['Utterance'][:500]):
    examples.setdefault(label, []).append(text)

# For each emotion print its name in upper case, then at most 15 numbered
# utterances, then a blank separator line.
for label, texts in examples.items():
    print(label.upper())
    for rank, text in enumerate(texts[:15], start=1):
        print(rank, text)
    print()

SURPRISE
1 My duties?  All right.
2 Really?!
3 But then who? The waitress I went out with last month?
4 No-no-no-no, no! Who, who were you talking about?
5 No way!
6 Oh my God, oh my God! Poor Monica!
7 What, what, what?!
8 What?!
9 He thinks Monica is empty, she is the empty vase!
10 Hey!
11 What are you doing here?
12 Everybody!!
13 You liked it? You really liked it?
14 I was surprised to see a kangaroo in a World War I epic.
15 Look!

FEAR
1 No don't I beg of you!
2 No, I-I-I-I don't, I actually don't know
3 Well, I-I got this blinding pain in my stomach when I was lifting weights before, then I uh passed out and uh, haven't been able to stand up since.
4 No way! 'Kay look, if I have to go to the doctor for anything it's gonna be for this thing sticking out of my stomach!
5 Oh boy, I just can't watch. It's too scary!
6 Oh yeah well, you know me, babies, responsibilities, ahhh!!!
7 Uh-oh.
8 I know!  Don't switch hands, okay?
9 What should I wear, now I'm all nervous.
10 I think you'v

The shape of the data:ANGER
1 Oh no-no-no, give me some specifics.
2 You fell asleep!!
3 There was no kangaroo!
4 They didn't take any of my suggestions!
5 This guy fell asleep!
6 He fell asleep too!
7 Be mad at him!
8 Or, call an ambulance.
9 What?! What is with everybody? It's Thanksgiving, not...Truth-Day!
10 Ok, pure evil , horny and alone . I've done this
11 Nobody!
12 Nobody respects the bucket!
13 You wouldn't believe what people put in here!
14 Look!
15 Okay, does this look like a garbage can to you?

In [6]:
# Number of utterances per emotion label.
meld_train["Emotion"].value_counts()

joy         1743
surprise    1205
anger       1109
sadness      683
disgust      271
fear         268
Name: Emotion, dtype: int64

## 0.2 Loading the Tweets data set, and looking at examples of each emotion

In [7]:
# Load the tweet training data (tab-separated file).
filepath = './data/wassa/training/all.train.tsv'
tweets_train = pd.read_csv(filepath, sep='\t')

# Bucket every tweet under its label; labels are inserted in order of first
# appearance and tweets keep their row order.
examples = {}
for label, tweet_text in zip(tweets_train['Label'], tweets_train['Tweet']):
    examples.setdefault(label, []).append(tweet_text)

# For each label print its name in upper case, then at most 15 numbered
# tweets, then a blank separator line.
for label, tweet_texts in examples.items():
    print(label.upper())
    for rank, tweet_text in enumerate(tweet_texts[:15], start=1):
        print(rank, tweet_text)
    print()

ANGER
1 How the fu*k! Who the heck! moved my fridge!... should I knock the landlord door. #angry #mad ##
2 So my Indian Uber driver just called someone the N word. If I wasn't in a moving vehicle I'd have jumped out #disgusted 
3 @DPD_UK I asked for my parcel to be delivered to a pick up store not my address #fuming #poorcustomerservice
4 so ef whichever butt wipe pulled the fire alarm in davis bc I was sound asleep #pissed #angry #upset #tired #sad #tired #hangry ######
5 Don't join @BTCare they put the phone down on you, talk over you and are rude. Taking money out of my acc willynilly! #fuming
6 My blood is boiling
7 When you've still got a whole season of Wentworth to watch and a stupid cunt in work ruins it for us 😭😭 @__KirstyGA #raging #oldcunt
8 @bt_uk why does tracking show my equipment delivered, when it wasn't? Why is my service suddenly delayed? We've already 3 weeks. #fuming
9 @TeamShanny legit why i am so furious with him, people are such fucking idiots.
10 How is it suppo

The shape of the data:

In [8]:
# Column overview first, then the number of tweets per label.
tweets_train.info()
tweets_train["Label"].value_counts()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3613 entries, 0 to 3612
Data columns (total 4 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   ID      3613 non-null   int64  
 1   Tweet   3613 non-null   object 
 2   Label   3613 non-null   object 
 3   Score   3613 non-null   float64
dtypes: float64(1), int64(1), object(2)
memory usage: 113.0+ KB


fear       1147
anger       857
joy         823
sadness     786
Name: Label, dtype: int64

## 0.3 Understanding the dependencies and POS tags of some samples

In [9]:
# Concatenate the first 20 tweets into one string; each tweet is followed by a
# single space (so the result also ends with a trailing space).
some_sample = "".join(f"{tweet} " for tweet in tweets_train['Tweet'][:20])

In [10]:
# Load the spaCy pipeline named "en_core_web_sm" and run it over the
# concatenated tweet sample; `nlp` and `doc` are reused by later cells.
nlp = spacy.load("en_core_web_sm")
doc = nlp(some_sample)

In [17]:
# Render displaCy's dependency view (style='dep') of the parsed sample inline
# in the notebook (jupyter=True).
displacy.render(doc, jupyter=True, style='dep')


In [22]:
# Keep the tokens whose dependency label is exactly "nsubj".
# (A check for "nsubjpass" was left disabled in the original cell, so those
# tokens are not collected.)
subj_list = [token for token in doc if token.dep_ == "nsubj"]

print(subj_list)

[I, driver, I, I, I, wipe, I, #, #, they, #, you, cunt, tracking, equipment, it, We, i, people, it, you, i, i, i, guys, Bitches, what, @dapperlaughs, I, #, that, guy, 's, blood, I, it, I, people, he, I, I, everyone]


In [19]:
# Split the sample into sentences with NLTK, then print the list of
# (token, POS tag) pairs for each sentence on its own line.
nltk_sents = nltk.tokenize.sent_tokenize(some_sample)
for sentence in nltk_sents:
    print(nltk.pos_tag(nltk.tokenize.word_tokenize(sentence)))

[('How', 'WRB'), ('the', 'DT'), ('fu', 'NN'), ('*', 'NNP'), ('k', 'NN'), ('!', '.')]
[('Who', 'WP'), ('the', 'DT'), ('heck', 'NN'), ('!', '.')]
[('moved', 'VBN'), ('my', 'PRP$'), ('fridge', 'NN'), ('!', '.'), ('...', ':')]
[('should', 'MD'), ('I', 'PRP'), ('knock', 'VB'), ('the', 'DT'), ('landlord', 'NN'), ('door', 'NN'), ('.', '.')]
[('#', '#'), ('angry', 'JJ'), ('#', '#'), ('mad', 'JJ'), ('#', '#'), ('#', '#'), ('So', 'NNP'), ('my', 'PRP$'), ('Indian', 'JJ'), ('Uber', 'NNP'), ('driver', 'NN'), ('just', 'RB'), ('called', 'VBD'), ('someone', 'NN'), ('the', 'DT'), ('N', 'NNP'), ('word', 'NN'), ('.', '.')]
[('If', 'IN'), ('I', 'PRP'), ('was', 'VBD'), ("n't", 'RB'), ('in', 'IN'), ('a', 'DT'), ('moving', 'VBG'), ('vehicle', 'NN'), ('I', 'PRP'), ("'d", 'MD'), ('have', 'VB'), ('jumped', 'VBN'), ('out', 'RP'), ('#', '#'), ('disgusted', 'JJ'), ('@', 'NNP'), ('DPD_UK', 'NNP'), ('I', 'PRP'), ('asked', 'VBD'), ('for', 'IN'), ('my', 'PRP$'), ('parcel', 'NN'), ('to', 'TO'), ('be', 'VB'), ('delivere

In [26]:
# Probe sentence in which "no" occurs several times, used to inspect how the
# NLTK tagger labels each occurrence.
teststr = "no-no, no, no one is taking no for an answer."
test_tokens = nltk.tokenize.word_tokenize(teststr)
print(nltk.pos_tag(test_tokens))

[('no-no', 'JJ'), (',', ','), ('no', 'DT'), (',', ','), ('no', 'DT'), ('one', 'NN'), ('is', 'VBZ'), ('taking', 'VBG'), ('no', 'DT'), ('for', 'IN'), ('an', 'DT'), ('answer', 'NN'), ('.', '.')]


In [37]:
# Run the same probe sentence through the spaCy pipeline and pair each
# token's text with its `tag_` value for comparison with the NLTK output.
testonspacy = nlp(teststr)
[(token.text, token.tag_) for token in testonspacy]

[('no', 'UH'),
 ('-', 'HYPH'),
 ('no', 'UH'),
 (',', ','),
 ('no', 'UH'),
 (',', ','),
 ('no', 'DT'),
 ('one', 'NN'),
 ('is', 'VBZ'),
 ('taking', 'VBG'),
 ('no', 'RB'),
 ('for', 'IN'),
 ('an', 'DT'),
 ('answer', 'NN'),
 ('.', '.')]

In [44]:
# Print the text of every sentence span spaCy detected in the sample,
# one sentence per line.
for sentence in doc.sents:
    print(sentence.text)

How the fu*k!
Who the heck!
moved my fridge!...
should I knock the landlord door.
#angry #mad ##
So my Indian Uber driver just called someone the N word.
If I wasn't in a moving vehicle I'd have jumped out #disgusted  @DPD_UK I asked for my parcel to be delivered to a pick up store not my address #fuming #poorcustomerservice so ef whichever butt wipe pulled the fire alarm in davis bc
I was sound asleep #pissed #angry #upset
#tired #sad #tired #hangry ###### Don't join @BTCare they put the phone down on you, talk over you and are rude.
Taking money out of my acc willynilly!
#fuming My blood is boiling When you've still got a whole season of Wentworth to watch and a stupid cunt in work ruins it for us 😭
😭
@__KirstyGA #raging #oldcunt @bt_uk why does tracking show my equipment delivered, when it wasn't?
Why is my service suddenly delayed?
We've already 3 weeks.
#fuming @TeamShanny legit why i am so furious with him, people are such fucking idiots.
How is it suppose to work if you do that?

In [5]:
# Tokenizer probe: lower-case a string containing a hyphen, an ellipsis,
# contractions and quotes, then print each resulting spaCy token on its own line.
spacy_tok_test = "Sorry-but you're...? People say I'm, 'Pip.'"
runtest = nlp(spacy_tok_test.lower())

for token in runtest:
    print(token)

sorry
-
but
you
're
...
?
people
say
i
'm
,
'
pip
.
'
