# 2. Tweets, BoW

## 2.1 Data preparation and filtering

(a) Loading the training and test data

In [2]:
import pandas as pd
filepath = 'data/wassa/training/all.train.tsv'
tweets_dftrain = pd.read_csv(filepath, sep='\t')

filepath = 'data/wassa/testing/all.test.tsv'
tweets_dftest = pd.read_csv(filepath, sep='\t')

(b) Tokenizing and filtering

In [6]:
import spacy
nlp = spacy.load("en_core_web_sm")

# Using spaCy to tokenize the sentences
training_data_2 = [nlp(sent) for sent in list(tweets_dftrain['Tweet'])]
training_labels_2 = list(tweets_dftrain['Label'])

test_data_2 = [nlp(sent) for sent in list(tweets_dftest['Tweet'])]
test_labels_2 = list(tweets_dftest['Label'])

In [10]:
# for sent in training_data_1:
#     for token in sent:
#         if "I'm" == token.text:
#             print(sent)

    (i) Filter A: MaxDF

In [9]:
from utils import low_high_mid_df

min_df = 2
max_df = len(training_data_2)//10

low_df, high_df, clean2A = low_high_mid_df(min_df, max_df, training_data_2)

print("Rare words with low df = ", len(low_df), "words. Examples: ", list(low_df)[:20])
print("Stop words with high df:", high_df)
vocab_2A = set()
for sent in clean2A:
    for t in sent:
        vocab_2A.add(t)
print("Size of the rest vocab:", len(vocab_2A))
print("Samples:", clean2A[10:20])

Min_df 2
Max_df 361
Rare words with low df =  5295 words. Examples:  ['athlete', 'rescue', 'part2', '81', 'victory.\\n\\ntomiho', '珞', 'cowardliness', 'dnt', 'vest', '@pottzgame', 'raaar', '@mannequinpussy', 'lasting', 'yh', 'gameface', 'tracey', '@a_rockasthe', 'bbfail', 'mindset', '//rip//']
Stop words with high df: {'the', ',', 'that', 'for', 'my', 'it', 'of', 'and', 'be', 'in', 'i', '#', 'on', '!', 'a', 'have', '.', 'you', ' ', 'do', 'to', "n't"}
Size of the rest vocab: 4190
Samples: [['m', 'so', 'mad', 'about', 'power', 'ranger', 'm', 'incense', 'm', 'furious'], ['wo', 'nt', 'use', 'use', '@mothercareuk', '@mothercarehelp', 'again', 'these', 'guy', 'ca', 'nt', 'get', 'nothing', 'right', 'fume'], ['bitch', 'aggravate', 'like', 'what', 'inspire', 'big', 'cunt', 'know', 'man', 'kind', '?'], ['why', '@dapperlaugh', 'come', 'glasgow', 'night', 'work', 'fucking', 'gutte', 'wait', 'an', 'appearance', 'age', 'rage'], ['fume', '😤'], ['zero', 'help', 'from', '@up', 'customer', 'service', 'j

    (ii) Filter B: DTandPRP

In [12]:
from utils import remove_DT_PRP

min_df = 2

low_df, DTandPRP_tok, clean2B = remove_DT_PRP(min_df, training_data_2)

print("Rare words with low df = ", len(low_df), "words. Examples:", list(low_df)[:20])
vocab_2B = set()
for sent in clean2B:
    for t in sent:
        vocab_2B.add(t)
print("Size of the rest vocab:", len(vocab_2B))
print("Samples:", clean2B[10:20])

Determiner and pronouns {'no', 'this', '_', 'our', '@relaqss', '\\n\\nsam', '❤', 'herself', '#', 'strength.\\nthey', '🐈', 'they', 'myself', '@rowillfindyou', 'em', '@melissajoyrd', '😡', 'tbh', 'every', 'memphis', "'s", '’s', '@snub23', 'each', 'its', 'don’t', "y'", '@blackeyed_susie', '👅', '@ryyyshh', '@m_t_f_72', 'hbu', 'hers', '😧', 'he', 'a', 'both', '@neyaphemmaster', 'these', 'yours', '🍁', 'her', ':)', 'you', 'himself', 'either', 'we', '@reyesaverie', '@missmeliss465', 'that', '\\nit', 'mine', '\uf62b', 'i', 'one', '@fra93_bruno', 'scarred,\\nthis', 'thy', 'those', 'jut', 'ya', 'an', '@weebtard', 'y', 'another', '\\n\\nother', '✨', 'oldham\\nnext', 'some', 'near,\\nthe', 'ty', 'itself', 'the', '🐮', 'themselves', 'his', 'my', 'isthereahelplineforthis', 'it', 'd', '@british_airways', "you're", '@ntfc', 'xx', '@its.finfin', 'ek', 'your', 'she', 'u', 'eagles.\\nthey', '@kevincanwaitcbs', 'n', 'also-', 'yourself', '@adsbyflaherty', '@themathofyou', 'any', 'their', 'all', 'bridgetjonesba

## 2.2 BoW vectorization and training the classifiers

In [13]:
from sklearn import preprocessing
import pickle
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn import svm

(a) Encoding training labels:

In [14]:
label_encoder = preprocessing.LabelEncoder()
label_encoder.fit(training_labels_2+test_labels_2)
print(list(label_encoder.classes_))

['anger', 'fear', 'joy', 'sadness']


In [18]:
training_classes = label_encoder.transform(training_labels_2)
print(training_classes[:5])
print(list(tweets_dftrain['Label'])[:5])
print(list(tweets_dftrain['Tweet'])[:5])

[0 0 0 0 0]
['anger', 'anger', 'anger', 'anger', 'anger']
['How the fu*k! Who the heck! moved my fridge!... should I knock the landlord door. #angry #mad ##', "So my Indian Uber driver just called someone the N word. If I wasn't in a moving vehicle I'd have jumped out #disgusted ", '@DPD_UK I asked for my parcel to be delivered to a pick up store not my address #fuming #poorcustomerservice', 'so ef whichever butt wipe pulled the fire alarm in davis bc I was sound asleep #pissed #angry #upset #tired #sad #tired #hangry ######', "Don't join @BTCare they put the phone down on you, talk over you and are rude. Taking money out of my acc willynilly! #fuming"]


### Filter A: 

(a) Vectorise

In [19]:
# A CountVectorizer which takes tokenized lists as input
### Taken from https://stackoverflow.com/questions/35867484/pass-tokens-to-countvectorizer,
### https://stackoverflow.com/questions/27673527/how-should-i-vectorize-the-following-list-of-lists-with-scikit-learn, 26 Oct 2021

def dummy(x):
    return x

utterance_vec_2A = CountVectorizer(tokenizer=dummy, lowercase=False)

training_count_vectors_2A = utterance_vec_2A.fit_transform(clean2A)

In [34]:
print(training_count_vectors_2A .toarray()[2][200:280])

[0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0]


In [22]:
#Total number of word features or the length of the total vector
print(len(utterance_vec_2A.vocabulary_))

4190


In [35]:
# First 50 feature names
print(list(utterance_vec_2A.get_feature_names())[:50])

['  ', '   ', '"', '#funny', '#whatever', '$', '%', '&', "'", "'\\n\\nhe", "'d", "'everywhere", "'i", "'ll", "'s", "'ve", '(', ')', '):', '*', '+', '-', '--', '-2.5', '-dalai', '-terrible-', '..', '...', '....', '.....', '..........', '.@divamagazine', '.@simonnricketts', '.@tolumanda', '/', '0', '1', '1/2', '10', '10/11', '100', '1000', '100k', '101', '10golds24', '12', '13', '130', '148', '15']


In [36]:
# Convert raw frequency counts into TF-IDF values
tfidf_transformer = TfidfTransformer()
training_tfidf_2A = tfidf_transformer.fit_transform(training_count_vectors_2A)

In [37]:
print(training_tfidf_2A.toarray()[2][200:280])

[0.        0.        0.        0.        0.        0.        0.
 0.        0.        0.        0.        0.        0.        0.
 0.        0.        0.        0.        0.        0.        0.
 0.        0.        0.3738906 0.        0.        0.        0.
 0.        0.        0.        0.        0.        0.        0.
 0.        0.        0.        0.        0.        0.        0.
 0.        0.        0.        0.        0.        0.        0.
 0.        0.        0.        0.        0.        0.        0.
 0.        0.        0.        0.        0.        0.        0.
 0.        0.        0.        0.        0.        0.        0.
 0.        0.        0.        0.        0.        0.        0.
 0.        0.        0.       ]


In [38]:
from sklearn.calibration import CalibratedClassifierCV
linear_model = svm.LinearSVC(max_iter=2000)
svm_linear_clf_2A = CalibratedClassifierCV(linear_model , method='sigmoid', cv=10)

svm_linear_clf_2A.fit(training_tfidf_2A, training_classes)

CalibratedClassifierCV(base_estimator=LinearSVC(max_iter=2000), cv=10)

### Filter B

(a) Vectorise

In [39]:
utterance_vec_2B = CountVectorizer(tokenizer=dummy, lowercase=False)

training_count_vectors_2B = utterance_vec_2B.fit_transform(clean2B)
training_tfidf_2B = tfidf_transformer.fit_transform(training_count_vectors_2B)

In [40]:
#Total number of word features or the length of the total vector
print(len(utterance_vec_2B.vocabulary_))

4167


In [41]:
# First 50 feature names
print(list(utterance_vec_2B.get_feature_names())[:50])

[' ', '  ', '   ', '!', '"', '#', '#funny', '#whatever', '$', '%', '&', "'", "'\\n\\nhe", "'d", "'everywhere", "'i", "'ll", "'s", "'ve", '(', ')', '):', '*', '+', ',', '-', '--', '-2.5', '-dalai', '-terrible-', '.', '..', '...', '....', '.....', '..........', '.@divamagazine', '.@simonnricketts', '.@tolumanda', '/', '0', '1', '1/2', '10', '10/11', '100', '1000', '100k', '101', '10golds24']


(b) Train the classifier

In [42]:
linear_model = svm.LinearSVC(max_iter=2000)
svm_linear_clf_2B = CalibratedClassifierCV(linear_model , method='sigmoid', cv=10)

svm_linear_clf_2B.fit(training_tfidf_2B, training_classes)

CalibratedClassifierCV(base_estimator=LinearSVC(max_iter=2000), cv=10)

## 2.3 Predicting the test data and results

In [48]:
import sklearn
from sklearn.metrics import classification_report

Encode the test labels

In [43]:
test_classes = label_encoder.transform(test_labels_2)
print(test_classes[:20])
print(list(tweets_dftest['Label'])[:5])
print(list(tweets_dftest['Tweet'])[:5])

[0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]
['anger', 'anger', 'anger', 'anger', 'anger']
['At the point today where if someone says something remotely kind to me, a waterfall will burst out of my eyes', "@CorningFootball  IT'S GAME DAY!!!!      T MINUS 14:30  #relentless", 'This game has pissed me off more than any other game this year. My blood is boiling! Time to turn it off! #STLCards', "@spamvicious I've just found out it's Candice and not Candace. She can pout all she likes for me 😍", "@moocowward @mrsajhargreaves @Melly77 @GaryBarlow if he can't come to my Mum'a 60th after 25k tweets then why should I 🙈  #soreloser"]


### Filter A:

In [45]:
max_df_test = len(test_data_2)//10

low_df_test_2A, high_df_test_2A, test_mid_df_2A = \
low_high_mid_df(2, max_df_test, test_data_2)

Min_df 2
Max_df 314


In [46]:
test_count_2A = utterance_vec_2A.transform(test_mid_df_2A)
test_tfidf_2A = tfidf_transformer.fit_transform(test_count_2A)

y_pred_svm_2A = svm_linear_clf_2A.predict(test_tfidf_2A)

In [62]:
report2A = classification_report(test_classes,y_pred_svm_2A,digits = 4)
print(label_encoder.classes_)
print('BoW TFIDF SVM LINEAR: Tweets, Filter A')
print('Word mininum document frequency', min_df, "; max:", max_df_test)
print(report2A)

print('Confusion matrix SVM, BoW Tweets, Filter A')
print(label_encoder.classes_)
print(sklearn.metrics.confusion_matrix(test_classes,y_pred_svm_2A))

['anger' 'fear' 'joy' 'sadness']
BoW TFIDF SVM LINEAR: Tweets, Filter A
Word mininum document frequency 2 ; max: 314
              precision    recall  f1-score   support

           0   0.826265  0.794737  0.810195       760
           1   0.810811  0.783920  0.797138       995
           2   0.835598  0.861345  0.848276       714
           3   0.761571  0.806835  0.783550       673

    accuracy                       0.809039      3142
   macro avg   0.808561  0.811709  0.809790      3142
weighted avg   0.809635  0.809039  0.809007      3142

Confusion matrix SVM, BoW Tweets, Filter A
['anger' 'fear' 'joy' 'sadness']
[[604  68  34  54]
 [ 60 780  61  94]
 [ 21  56 615  22]
 [ 46  58  26 543]]


In [52]:
pred_probabilities_2A = svm_linear_clf_2A.predict_proba(test_tfidf_2A)

pred_labels_2A = []
for predicted_label in y_pred_svm_2A:
    pred_labels_2A.append(label_encoder.classes_[predicted_label])

gold_labels_2A = []
for gold_label in test_classes:
    gold_labels_2A.append(label_encoder.classes_[gold_label])

result_frame2A = pd.DataFrame(pred_probabilities_2A*100, columns=label_encoder.classes_)

result_frame2A['Chat']= list(tweets_dftest['Tweet'])
result_frame2A['Prediction']=pred_labels_2A
result_frame2A['Gold']=gold_labels_2A

result_frame2A.to_csv("result_frame2A.csv")
result_frame2A.head()

Unnamed: 0,anger,fear,joy,sadness,Chat,Prediction,Gold
0,82.695987,15.14031,0.424242,1.739462,At the point today where if someone says somet...,anger,anger
1,96.369466,0.461471,0.592322,2.576741,@CorningFootball IT'S GAME DAY!!!! T MIN...,anger,anger
2,88.33753,3.101388,0.394256,8.166827,This game has pissed me off more than any othe...,anger,anger
3,67.636159,1.85954,3.174199,27.330102,@spamvicious I've just found out it's Candice ...,anger,anger
4,44.852154,47.804198,1.132859,6.210789,@moocowward @mrsajhargreaves @Melly77 @GaryBar...,fear,anger


In [50]:
def average_importances(model):
    coef_avg = 0
    for classifier in model.calibrated_classifiers_:
        coef_avg = coef_avg + classifier.base_estimator.coef_
        
    coef_avg  = coef_avg/len(model.calibrated_classifiers_)
    return coef_avg

def f_importances(importances, names, n=20):
    class_labels = label_encoder.classes_
    
    for num, imp in enumerate(importances):
        emotion = class_labels[num]
        topn = sorted(zip(imp,names), reverse=True)[:n]
        
        print("Important words in {} utterances".format(emotion))
        for coef, feat in topn:
            print(emotion, coef, feat)
        print("-----------------------------------------")

print('Most important features per emotion: 2A')
feature_names = utterance_vec_2A.get_feature_names()
importances = average_importances(svm_linear_clf_2A)
f_importances(importances, feature_names)

Most important features per emotion: 2A
Important words in anger utterances
anger 3.6531723579385096 anger
anger 3.642727066735774 rage
anger 3.1845846298112006 offend
anger 3.125060604146694 bitter
anger 3.112672764842934 angry
anger 2.9477823412368043 revenge
anger 2.7428586341517507 fury
anger 2.741317666066721 offense
anger 2.680995221601827 fume
anger 2.597124900468489 snap
anger 2.5961267143253854 burst
anger 2.506765325183344 madden
anger 2.4054276084847963 relentless
anger 2.400482322595806 outrage
anger 2.391648001364326 rabid
anger 2.365590793005136 resent
anger 2.347488282200855 wrath
anger 2.283261928110445 burn
anger 2.2735609888285153 irritate
anger 2.225944378750616 insult
-----------------------------------------
Important words in fear utterances
fear 3.125602040902827 terrorism
fear 2.9986588971243657 shake
fear 2.7834003795187847 horror
fear 2.701618724341592 nightmare
fear 2.7009975802301063 bully
fear 2.6737671580995594 fear
fear 2.6405042449598675 panic
fear 2.525

### Filter B:

In [53]:
low_df_test_2B, DTandPRP_test_2B, clean_test_2B = \
remove_DT_PRP(2, test_data_2)

Determiner and pronouns {'»', '@xmaseveevil1', 'boys', 'no', 'this', '_', 'our', '@ryuredwings2', '\\nindia', '\\nmatt', '#', '@the', 'half', 'they', '@realdonaldtrump', 'myself', 'nj@latimes', '@messyourself', 'ios10', '@bbnicole', 'em', '@talktalkcare', "\\n\\n'you", 'tbh', 'lt', 'every', '@space_gayz', "'s", 'said!!!!\\nthey', 'each', 'its', "y'", '😿', '@aefadul22', '\\nso', '@ritujai18874', '@johnjharwood', 'he', 'a', 'both', 'these', '@jankhambrams', 'yours', 'her', '@eliroth', 'you', 'himself', 'either', 'tvgirl', "naya'\\n\\n'i", 'we', '#behaviour', 'that,\\ngives', 'that', '@sarahb45', 'neither', '@rosie', "'em", 'blm', 'mine', '@colinoccupantz', '💦', 'lv', 'i', '@digger_forum', 'one', 'thy', '@jbanks88', 'those', 'yhat', '\\nwhat', 'ya', 'an', '@adele', '@capitalone', 'it.\\n#funny', 'y', 'another', '@barbour', '✨', '😑', 'some', 'itself', 'the', '@your', 'themselves', 'his', 'my', 'isthereahelplineforthis', 'it', 'd', '🍂', 'theirs', 'thee', 'your', 'she', 'u', '@interception22

In [55]:
test_count_2B = utterance_vec_2B.transform(clean_test_2B)
test_tfidf_2B = tfidf_transformer.fit_transform(test_count_2B)

y_pred_svm_2B = svm_linear_clf_2B.predict(test_tfidf_2B)

In [63]:
report2B = classification_report(test_classes,y_pred_svm_2B,digits = 4)
print(label_encoder.classes_)
print('BoW TFIDF SVM LINEAR: Tweets, Filter B')
print('Word mininum document frequency', min_df, "; DT PRP removed")
print(report2B)

print('Confusion matrix SVM, BoW Tweets, Filter B')
print(label_encoder.classes_)
print(sklearn.metrics.confusion_matrix(test_classes,y_pred_svm_2B))

['anger' 'fear' 'joy' 'sadness']
BoW TFIDF SVM LINEAR: Tweets, Filter B
Word mininum document frequency 2 ; DT PRP removed
              precision    recall  f1-score   support

           0   0.837838  0.775000  0.805195       760
           1   0.801628  0.791960  0.796764       995
           2   0.841816  0.857143  0.849410       714
           3   0.746228  0.808321  0.776034       673

    accuracy                       0.806174      3142
   macro avg   0.806877  0.808106  0.806851      3142
weighted avg   0.807652  0.806174  0.806327      3142

Confusion matrix SVM, BoW Tweets, Filter B
['anger' 'fear' 'joy' 'sadness']
[[589  77  32  62]
 [ 55 788  55  97]
 [ 16  60 612  26]
 [ 43  58  28 544]]


In [58]:
pred_probabilities_2B = svm_linear_clf_2B.predict_proba(test_tfidf_2B)

pred_labels_2B = []
for predicted_label in y_pred_svm_2B:
    pred_labels_2B.append(label_encoder.classes_[predicted_label])

gold_labels_2B = []
for gold_label in test_classes:
    gold_labels_2B.append(label_encoder.classes_[gold_label])

result_frame2B = pd.DataFrame(pred_probabilities_2B*100, columns=label_encoder.classes_)

result_frame2B['Chat']= list(tweets_dftest['Tweet'])
result_frame2B['Prediction']=pred_labels_2B
result_frame2B['Gold']=gold_labels_2B

result_frame2B.to_csv("result_frame2B.csv")

In [60]:
result_frame2B.head()

Unnamed: 0,anger,fear,joy,sadness,Chat,Prediction,Gold
0,80.252924,18.248005,0.489399,1.009673,At the point today where if someone says somet...,anger,anger
1,93.832984,2.342541,1.199966,2.624508,@CorningFootball IT'S GAME DAY!!!! T MIN...,anger,anger
2,88.26634,1.443278,0.373367,9.917015,This game has pissed me off more than any othe...,anger,anger
3,45.246627,2.054475,2.466968,50.23193,@spamvicious I've just found out it's Candice ...,sadness,anger
4,40.80096,55.035215,0.928871,3.234954,@moocowward @mrsajhargreaves @Melly77 @GaryBar...,fear,anger


In [61]:
print('Most important features per emotion for the SVM classifier')
feature_names = utterance_vec_2B.get_feature_names()
importances = average_importances(svm_linear_clf_2B)
f_importances(importances, feature_names)

Most important features per emotion for the SVM classifier
Important words in anger utterances
anger 3.8134984964311327 anger
anger 3.786781189451753 rage
anger 3.230216875654725 bitter
anger 3.2152211955083283 offend
anger 3.213678089849894 angry
anger 3.0240640140831125 revenge
anger 2.772870201902399 fume
anger 2.7264526576923207 fury
anger 2.7004319275203663 offense
anger 2.6379413900302477 burst
anger 2.606032670892099 snap
anger 2.5410851452003143 madden
anger 2.5362554842308036 relentless
anger 2.477317025828108 rabid
anger 2.408564394524163 wrath
anger 2.4068473518754994 outrage
anger 2.3591358697076643 burn
anger 2.254733474390254 insult
anger 2.253847730574333 irritate
anger 2.250037624745847 resent
-----------------------------------------
Important words in fear utterances
fear 3.21861375239446 terrorism
fear 3.138634628898958 shake
fear 2.894621260276061 horror
fear 2.769012346630014 bully
fear 2.752972084147799 fear
fear 2.728016022227951 nightmare
fear 2.7110650521048485