# 5. Tweets, Word-embeddings

## 5.1 Data preparation and filtering

(a) Loading the training and test data

In [1]:
import pandas as pd
filepath = 'data/wassa/training/all.train.tsv'
tweets_dftrain = pd.read_csv(filepath, sep='\t')

filepath = 'data/wassa/testing/all.test.tsv'
tweets_dftest = pd.read_csv(filepath, sep='\t')

(b) Tokenizing and filtering the data

In [3]:
# Using spaCy to tokenize the sentences

import spacy
nlp = spacy.load("en_core_web_sm")
                 
training_data_5 = [nlp(sent) for sent in list(tweets_dftrain['Tweet'])]
training_labels_5 = list(tweets_dftrain['Label'])

test_data_5 = [nlp(sent) for sent in list(tweets_dftest['Tweet'])]
test_labels_5 = list(tweets_dftest['Label'])

### Filter A

In [4]:
from utils import low_high_mid_df
min_df = 2
max_df = len(training_data_5)//10

low_df, high_df, clean5A = low_high_mid_df(min_df, max_df, training_data_5)

print("Rare words with low df = ", len(low_df), "words. Examples: ", list(low_df)[:20])
print("Stop words with high df:", high_df)
vocab_5A = set()
for sent in clean5A:
    for t in sent:
        vocab_5A.add(t)
print("Size of the rest vocab:", len(vocab_5A))
print("Samples:", clean5A[10:20])

Min_df 2
Max_df 361
Rare words with low df =  5295 words. Examples:  ['@rbrutti', '@kevinrouth', 'afaik', 'ollaf', 'making', '@moamali', 'attendance', 'stfuuu', 'me?\\n', 'dnt', '️&amp', 'blessings', 'cn', 'ku', '@blvdcenter', '@jackie_mansky', '@courtneymee', 'lesson', 'grapefruit', 'division']
Stop words with high df: {'do', 'in', 'to', '#', 'that', 'have', 'my', 'a', 'of', '!', 'i', 'the', "n't", 'and', 'on', '.', 'it', 'for', ' ', 'be', 'you', ','}
Size of the rest vocab: 4190
Samples: [['m', 'so', 'mad', 'about', 'power', 'ranger', 'm', 'incense', 'm', 'furious'], ['wo', 'nt', 'use', 'use', '@mothercareuk', '@mothercarehelp', 'again', 'these', 'guy', 'ca', 'nt', 'get', 'nothing', 'right', 'fume'], ['bitch', 'aggravate', 'like', 'what', 'inspire', 'big', 'cunt', 'know', 'man', 'kind', '?'], ['why', '@dapperlaugh', 'come', 'glasgow', 'night', 'work', 'fucking', 'gutte', 'wait', 'an', 'appearance', 'age', 'rage'], ['fume', '😤'], ['zero', 'help', 'from', '@up', 'customer', 'service', 

### Filter B

In [5]:
from utils import remove_DT_PRP

min_df = 2

low_df, DTandPRP_tok, clean5B = remove_DT_PRP(min_df, training_data_5)

print("Rare words with low df = ", len(low_df), "words. Examples:", list(low_df)[:20])
vocab_5B = set()
for sent in clean5B:
    for t in sent:
        vocab_5B.add(t)
print("Size of the rest vocab:", len(vocab_5B))
print("Samples:", clean5B[10:20])

Determiner and pronouns {'@neyaphemmaster', 'no', 'tbh', '😊', 'ty', '’s', 'tho', '🍁', 'himself', 'the', '@melissajoyrd', '@m_t_f_72', '\\nit', 'oldham\\nnext', 'they', 'em', 'either', 'scarred,\\nthis', 'itself', 'we', 'it', 'its', ':)', 'an', 'don’t', 'our', 'n', 'these', '@smshow', '@rowillfindyou', '❤', 'each', 'u', 'isthereahelplineforthis', '\\nimagine', 'some', '@british_airways', 'i', 'those', 'bridgetjonesbaby', 'ourselves', '@sargon_of_akkad', '@weebtard', 'their', 'hers', 'themselves', 'herself', 'he', 'this', '🐮', 'also-', 'yours', 'myself', 'hbu', 'her', '@its.finfin', 'every', 'yourself', '@missmeliss465', '@reyesaverie', '#', 'that', "'s", '@adsbyflaherty', 'a', 'y', 'another', "y'", 'one', 'both', '@kevincanwaitcbs', 'she', 'your', 'strength.\\nthey', '\\n\\nother', '🐈', '@mhchat', '✨', '@blackeyed_susie', "you're", '_', '\uf62b', '@themathofyou', 'near,\\nthe', '😧', 'all', '@ntfc', '@fra93_bruno', '😡', 'my', 'his', 'memphis', 'any', 'xx', '@relaqss', '@ryyyshh', 'ya', '

## 5.2 Word-embedding model and training the classifiers

(a) Encoding the labels

In [6]:
from sklearn import preprocessing

label_encoder = preprocessing.LabelEncoder()
label_encoder.fit(training_labels_5+test_labels_5)
print(list(label_encoder.classes_))

['anger', 'fear', 'joy', 'sadness']


In [7]:
training_classes = label_encoder.transform(training_labels_5)
print(training_classes[:5])
print(list(tweets_dftrain['Label'])[:5])
print(list(tweets_dftrain['Tweet'])[:5])

[0 0 0 0 0]
['anger', 'anger', 'anger', 'anger', 'anger']
['How the fu*k! Who the heck! moved my fridge!... should I knock the landlord door. #angry #mad ##', "So my Indian Uber driver just called someone the N word. If I wasn't in a moving vehicle I'd have jumped out #disgusted ", '@DPD_UK I asked for my parcel to be delivered to a pick up store not my address #fuming #poorcustomerservice', 'so ef whichever butt wipe pulled the fire alarm in davis bc I was sound asleep #pissed #angry #upset #tired #sad #tired #hangry ######', "Don't join @BTCare they put the phone down on you, talk over you and are rude. Taking money out of my acc willynilly! #fuming"]


(b) Loading the embedding model

In [8]:
from gensim.test.utils import datapath, get_tmpfile
from gensim.models import KeyedVectors
from gensim.scripts.glove2word2vec import glove2word2vec

from os import path

wordembeddings="glove.twitter.27B.200d.txt"
glove_file = datapath(path.abspath('../glove/glove.twitter.27B.200d.txt'))

# Create a word2vec model from the Glove text data
tmp_file = get_tmpfile("test_word2vec.txt")
_ = glove2word2vec(glove_file, tmp_file)

word_embedding_model = KeyedVectors.load_word2vec_format(tmp_file)

# Dimensions set to 200.
num_features = 200

# Converting Index2Word
index2word_set = set(word_embedding_model.index_to_key)

  _ = glove2word2vec(glove_file, tmp_file)


### Filter A

In [9]:
from utils import featureVecMethod, getAvgFeatureVecs

trainFeatureVecs_5A, embedding_words_5A, no_embedding_words_5A = \
getAvgFeatureVecs(clean5A,
                  word_embedding_model, 
                  index2word_set, 
                  num_features
                  )

Shape of our matrix is: (3613, 200)
Review 0 of 3613
Review 1000 of 3613
Review 2000 of 3613
Review 3000 of 3613


In [10]:
print(embedding_words_5A[:50])
print()
print(no_embedding_words_5A[:50])

['how', 'who', 'heck', 'move', 'fridge', 'should', 'knock', 'landlord', 'door', 'angry', 'mad', 'so', 'indian', 'uber', 'driver', 'just', 'call', 'someone', 'n', 'word', 'if', 'move', 'vehicle', "'d", 'jump', 'out', 'disgusted', 'ask', 'parcel', 'deliver', 'pick', 'up', 'store', 'not', 'address', 'fume', 'so', 'ef', 'whichever', 'butt', 'wipe', 'pull', 'fire', 'alarm', 'davis', 'bc', 'sound', 'asleep', 'piss', 'angry']

['fu*k', '...', '@dpd_uk', 'poorcustomerservice', '@btcare', 'willynilly', '😭', '😭', '@__kirstyga', 'oldcunt', '@bt_uk', '3', '@mothercareuk', '@mothercarehelp', '@dapperlaugh', '😤', '@up', '2', '🤗', '👌', '🏻', 'hoopjunkie', 'f*c@n', '😞', '😞', '\\nwhy', '..', 'immobilize', '.@divamagazine', '😒', '💸', '😒', '@vodafoneukhelp', '@vodafoneuk', '44.77', '148', '@iphone', '40', '10', '@barclaysuk', 'treatcustomersfairly', '@ggreenwald', '5', '6', '@thomsoncare', 'sam-', '@ya_boi_huck', 'huckfp2', '@dapperlaugh', '😂']


In [11]:
from sklearn import svm
from sklearn.calibration import CalibratedClassifierCV

linear_model = svm.LinearSVC(max_iter=2000)
svm_linear_clf_5A = CalibratedClassifierCV(linear_model , method='sigmoid', cv=10)

svm_linear_clf_5A.fit(trainFeatureVecs_5A, training_classes)

CalibratedClassifierCV(base_estimator=LinearSVC(max_iter=2000), cv=10)

### Filter B

In [12]:
trainFeatureVecs_5B, embedding_words_5B, no_embedding_words_5B = \
getAvgFeatureVecs(clean5B,
                  word_embedding_model, 
                  index2word_set, 
                  num_features
                  )

Shape of our matrix is: (3613, 200)
Review 0 of 3613
Review 1000 of 3613
Review 2000 of 3613
Review 3000 of 3613


In [13]:
print(embedding_words_5B[:50])
print()
print(no_embedding_words_5B[:50])

['how', '!', 'who', 'heck', '!', 'move', 'fridge', '!', 'should', 'knock', 'landlord', 'door', '.', '#', 'angry', '#', 'mad', '#', '#', 'so', 'indian', 'uber', 'driver', 'just', 'call', 'someone', 'n', 'word', '.', 'if', 'be', "n't", 'in', 'move', 'vehicle', "'d", 'have', 'jump', 'out', '#', 'disgusted', 'ask', 'for', 'parcel', 'to', 'be', 'deliver', 'to', 'pick', 'up']

['fu*k', '...', '@dpd_uk', 'poorcustomerservice', '@btcare', 'willynilly', '😭', '😭', '@__kirstyga', 'oldcunt', '@bt_uk', '3', '@mothercareuk', '@mothercarehelp', '@dapperlaugh', '😤', '@up', '2', '🤗', '👌', '🏻', 'hoopjunkie', 'f*c@n', '😞', '😞', '\\nwhy', '..', 'immobilize', ' ', '.@divamagazine', '😒', '💸', '😒', '@vodafoneukhelp', '@vodafoneuk', '44.77', '148', '@iphone', '40', ' ', '10', '@barclaysuk', 'treatcustomersfairly', '@ggreenwald', '5', '6', '@thomsoncare', 'sam-', '@ya_boi_huck', 'huckfp2']


In [14]:
linear_model = svm.LinearSVC(max_iter=2000)
svm_linear_clf_5B = CalibratedClassifierCV(linear_model , method='sigmoid', cv=10)

svm_linear_clf_5B.fit(trainFeatureVecs_5B, training_classes)

CalibratedClassifierCV(base_estimator=LinearSVC(max_iter=2000), cv=10)

## 5.3 Predicting the test data and results

Encode the labels

In [16]:
test_classes_5 = label_encoder.transform(test_labels_5)
print(test_classes_5[:5])
print(list(tweets_dftest['Label'])[:5])
print(list(tweets_dftest['Tweet'])[:5])

[0 0 0 0 0]
['anger', 'anger', 'anger', 'anger', 'anger']
['At the point today where if someone says something remotely kind to me, a waterfall will burst out of my eyes', "@CorningFootball  IT'S GAME DAY!!!!      T MINUS 14:30  #relentless", 'This game has pissed me off more than any other game this year. My blood is boiling! Time to turn it off! #STLCards', "@spamvicious I've just found out it's Candice and not Candace. She can pout all she likes for me 😍", "@moocowward @mrsajhargreaves @Melly77 @GaryBarlow if he can't come to my Mum'a 60th after 25k tweets then why should I 🙈  #soreloser"]


### Filter A

(a) Prediction

In [17]:
max_df_test = len(test_data_5)//10

low_df_test_5A, high_df_test_5A, test_mid_df_5A = \
low_high_mid_df(2, max_df_test, test_data_5)

Min_df 2
Max_df 314


In [35]:
print(high_df_test_5A)

{'do', 'in', 'to', '#', 'that', 'have', 'my', 'a', 'of', '!', 'i', 'the', "n't", 'and', 'on', '.', 'it', 'for', ' ', 'be', 'you', ','}


In [18]:
testDataVecs_5A, test_5A_known_words, test_5A_unknown_words =\
getAvgFeatureVecs(test_mid_df_5A,
                  word_embedding_model, 
                  index2word_set, 
                  num_features) 

Shape of our matrix is: (3142, 200)
Review 0 of 3142
Review 1000 of 3142
Review 2000 of 3142
Review 3000 of 3142


In [19]:
y_pred_svm_5A = svm_linear_clf_5A.predict(testDataVecs_5A)

(b) Results

In [32]:
# 4. Evaluating and analyzing the result
from sklearn.metrics import classification_report

report_5A = classification_report(test_classes_5,y_pred_svm_5A,digits = 6)
print(label_encoder.classes_)
print('Embeddings SVM LINEAR: Tweets, Filter A')
print('Word embedding model used', wordembeddings)
print('Word mininum document frequency', min_df, ": max:", max_df_test)
print(report_5A)

['anger' 'fear' 'joy' 'sadness']
Embeddings SVM LINEAR: Tweets, Filter A
Word embedding model used glove.twitter.27B.200d.txt
Word mininum document frequency 2 : max: 314
              precision    recall  f1-score   support

           0   0.691761  0.640789  0.665301       760
           1   0.683744  0.697487  0.690547       995
           2   0.682219  0.757703  0.717983       714
           3   0.636508  0.595840  0.615503       673

    accuracy                       0.675684      3142
   macro avg   0.673558  0.672955  0.672333      3142
weighted avg   0.675219  0.675684  0.674601      3142



In [22]:
import sklearn
print('Confusion matrix SVM, embeddings, Tweets, Filter A')
print(label_encoder.classes_)
print(sklearn.metrics.confusion_matrix(test_classes_5,y_pred_svm_5A))

Confusion matrix SVM, embeddings, Tweets, Filter A
['anger' 'fear' 'joy' 'sadness']
[[487 124  79  70]
 [104 694  92 105]
 [ 38  81 541  54]
 [ 75 116  81 401]]


In [24]:
pred_probabilities_5A = svm_linear_clf_5A.predict_proba(testDataVecs_5A)

pred_labels_5A = []
for predicted_label in y_pred_svm_5A:
    pred_labels_5A.append(label_encoder.classes_[predicted_label])

gold_labels_5A = []
for gold_label in test_classes_5:
    gold_labels_5A.append(label_encoder.classes_[gold_label])

result_frame5A = pd.DataFrame(pred_probabilities_5A*100, columns=label_encoder.classes_)

result_frame5A['Chat']= list(tweets_dftest['Tweet'])
result_frame5A['Prediction']=pred_labels_5A
result_frame5A['Gold']=gold_labels_5A

result_frame5A.to_csv("result_frame5A.csv")
result_frame5A.head()

Unnamed: 0,anger,fear,joy,sadness,Chat,Prediction,Gold
0,37.646578,22.134382,16.334299,23.884741,At the point today where if someone says somet...,anger,anger
1,60.890805,15.365714,18.8205,4.92298,@CorningFootball IT'S GAME DAY!!!! T MIN...,anger,anger
2,72.422019,5.877677,7.032385,14.66792,This game has pissed me off more than any othe...,anger,anger
3,18.053478,15.828589,55.478675,10.639258,@spamvicious I've just found out it's Candice ...,joy,anger
4,38.196309,29.817172,11.337498,20.64902,@moocowward @mrsajhargreaves @Melly77 @GaryBar...,anger,anger


### Filter B

In [25]:
low_df_test_5B, DTandPRP_test_5B, clean_test_5B = \
remove_DT_PRP(2, test_data_5)

Determiner and pronouns {'half', '@capitalone', 'no', '@realdonaldtrump', 'tbh', '😊', '@eliroth', 'tho', 'thee', 'himself', 'the', 'boys', "'em", '@ritujai18874', 'they', 'em', 'either', 'itself', 'we', 'it', '@talktalkcare', 'its', 'an', 'our', 'these', 'each', '»', 'u', '@messyourself', 'isthereahelplineforthis', 'some', '@xmaseveevil1', 'nj@latimes', '#behaviour', 'i', '😑', 'those', '@sarahb45', 'ourselves', 'lv', 'their', '@ryuredwings2', 'themselves', '🍂', 'he', 'tvgirl', '@aefadul22', '💦', 'this', 'ours', 'yours', 'myself', 'her', 'said!!!!\\nthey', 'every', 'stupid?that', 'yourself', 'that,\\ngives', '\\nindia', 'lt', "naya'\\n\\n'i", '@jbanks88', '#', 'that', "'s", 'blm', '@kristasaidthis', '@bbnicole', 'a', 'y', 'another', 'yhat', "y'", 'one', '@colinoccupantz', 'both', '😿', 'she', 'your', 'ios10', '@jankhambrams', '\\nmatt', '✨', '@barbour', '@johnjharwood', '\\nwhat', '_', 'all', 'my', 'his', '@the', '@interception225', 'it.\\n#funny', 'theirs', 'any', '@jdegrom19', 'happy\\

In [26]:
testDataVecs_5B, test_5B_known_words, test_5B_unknown_words =\
getAvgFeatureVecs(clean_test_5B,
                  word_embedding_model, 
                  index2word_set, 
                  num_features) 

Shape of our matrix is: (3142, 200)
Review 0 of 3142
Review 1000 of 3142
Review 2000 of 3142
Review 3000 of 3142


In [28]:
y_pred_svm_5B = svm_linear_clf_5B.predict(testDataVecs_5B)

In [33]:
report_5B = classification_report(test_classes_5,y_pred_svm_5B,digits = 6)
print(label_encoder.classes_)
print('Embeddings SVM LINEAR: Tweets, Filter B')
print('Word embedding model used', wordembeddings)
print('Word mininum document frequency', min_df, "; DT PRP removed")
print(report_5B)

['anger' 'fear' 'joy' 'sadness']
Embeddings SVM LINEAR: Tweets, Filter B
Word embedding model used glove.twitter.27B.200d.txt
Word mininum document frequency 2 ; DT PRP removed
              precision    recall  f1-score   support

           0   0.673582  0.640789  0.656777       760
           1   0.670647  0.677387  0.674000       995
           2   0.664122  0.731092  0.696000       714
           3   0.617834  0.576523  0.596464       673

    accuracy                       0.659134      3142
   macro avg   0.656546  0.656448  0.655810      3142
weighted avg   0.658562  0.659134  0.658226      3142



In [30]:
print('Confusion matrix SVM, embeddings, Tweet, Filter B')
print(label_encoder.classes_)
print(sklearn.metrics.confusion_matrix(test_classes_5,y_pred_svm_5B))

Confusion matrix SVM, embeddings, Tweet, Filter B
['anger' 'fear' 'joy' 'sadness']
[[487 123  83  67]
 [113 674  95 113]
 [ 51  81 522  60]
 [ 72 127  86 388]]


In [34]:
pred_probabilities_5B = svm_linear_clf_5B.predict_proba(testDataVecs_5B)

pred_labels_5B = []
for predicted_label in y_pred_svm_5B:
    pred_labels_5B.append(label_encoder.classes_[predicted_label])

gold_labels_5B = []
for gold_label in test_classes_5:
    gold_labels_5B.append(label_encoder.classes_[gold_label])

result_frame5B = pd.DataFrame(pred_probabilities_5B*100, columns=label_encoder.classes_)

result_frame5B['Chat']= list(tweets_dftest['Tweet'])
result_frame5B['Prediction']=pred_labels_5B
result_frame5B['Gold']=gold_labels_5B

result_frame5B.to_csv("result_frame5B.csv")
result_frame5B.head()

Unnamed: 0,anger,fear,joy,sadness,Chat,Prediction,Gold
0,45.794562,16.824655,16.870062,20.510721,At the point today where if someone says somet...,anger,anger
1,32.726648,24.080148,40.636461,2.556743,@CorningFootball IT'S GAME DAY!!!! T MIN...,joy,anger
2,72.988061,6.523268,7.924339,12.564331,This game has pissed me off more than any othe...,anger,anger
3,21.944637,9.637974,54.143579,14.273811,@spamvicious I've just found out it's Candice ...,joy,anger
4,44.027014,34.681069,5.279899,16.012018,@moocowward @mrsajhargreaves @Melly77 @GaryBar...,anger,anger
