In [1]:
import pandas as pd
from gensim.models import Word2Vec
import numpy as np

In [2]:
# Load the training tweets from a CSV file in the current working directory.
# NOTE(review): the file is decoded as latin-1, yet the preview below contains
# sequences such as 'itÂs', which looks like mis-decoded text -- confirm the
# file's real encoding before relying on the raw tweet strings.
df_raw = pd.read_csv('Corona_NLP_train.csv', encoding="latin-1")

df_raw

Unnamed: 0,UserName,ScreenName,Location,TweetAt,OriginalTweet,Sentiment
0,3799,48751,London,16-03-2020,@MeNyrbie @Phil_Gahan @Chrisitv https://t.co/i...,Neutral
1,3800,48752,UK,16-03-2020,advice Talk to your neighbours family to excha...,Positive
2,3801,48753,Vagabonds,16-03-2020,Coronavirus Australia: Woolworths to give elde...,Positive
3,3802,48754,,16-03-2020,My food stock is not the only one which is emp...,Positive
4,3803,48755,,16-03-2020,"Me, ready to go at supermarket during the #COV...",Extremely Negative
...,...,...,...,...,...,...
41152,44951,89903,"Wellington City, New Zealand",14-04-2020,Airline pilots offering to stock supermarket s...,Neutral
41153,44952,89904,,14-04-2020,Response to complaint not provided citing COVI...,Extremely Negative
41154,44953,89905,,14-04-2020,You know itÂs getting tough when @KameronWild...,Positive
41155,44954,89906,,14-04-2020,Is it wrong that the smell of hand sanitizer i...,Neutral


In [3]:
# Keep only the tweet text and its sentiment label; every row is retained.
df_text_with_labels = df_raw.loc[:, ['OriginalTweet', 'Sentiment']]

df_text_with_labels

Unnamed: 0,OriginalTweet,Sentiment
0,@MeNyrbie @Phil_Gahan @Chrisitv https://t.co/i...,Neutral
1,advice Talk to your neighbours family to excha...,Positive
2,Coronavirus Australia: Woolworths to give elde...,Positive
3,My food stock is not the only one which is emp...,Positive
4,"Me, ready to go at supermarket during the #COV...",Extremely Negative
...,...,...
41152,Airline pilots offering to stock supermarket s...,Neutral
41153,Response to complaint not provided citing COVI...,Extremely Negative
41154,You know itÂs getting tough when @KameronWild...,Positive
41155,Is it wrong that the smell of hand sanitizer i...,Neutral


In [4]:
# Tokenise every tweet by splitting on single spaces, keeping the row order.
# No lower-casing or punctuation/URL stripping is applied, so tokens such as
# 'Australia:' or 'elderly,' keep their trailing punctuation.
#
# The column is iterated directly instead of using DataFrame.iterrows(), which
# materialises a full Series per row only to read one field from it.
sentences = [
    tweet.split(' ')
    for tweet in df_text_with_labels['OriginalTweet']
]

sentences

[['@MeNyrbie',
  '@Phil_Gahan',
  '@Chrisitv',
  'https://t.co/iFz9FAn2Pa',
  'and',
  'https://t.co/xX6ghGFzCC',
  'and',
  'https://t.co/I2NlzdxNo8'],
 ['advice',
  'Talk',
  'to',
  'your',
  'neighbours',
  'family',
  'to',
  'exchange',
  'phone',
  'numbers',
  'create',
  'contact',
  'list',
  'with',
  'phone',
  'numbers',
  'of',
  'neighbours',
  'schools',
  'employer',
  'chemist',
  'GP',
  'set',
  'up',
  'online',
  'shopping',
  'accounts',
  'if',
  'poss',
  'adequate',
  'supplies',
  'of',
  'regular',
  'meds',
  'but',
  'not',
  'over',
  'order'],
 ['Coronavirus',
  'Australia:',
  'Woolworths',
  'to',
  'give',
  'elderly,',
  'disabled',
  'dedicated',
  'shopping',
  'hours',
  'amid',
  'COVID-19',
  'outbreak',
  'https://t.co/bInCA9Vp8P'],
 ['My',
  'food',
  'stock',
  'is',
  'not',
  'the',
  'only',
  'one',
  'which',
  'is',
  'empty...\r\r\n\r\r\nPLEASE,',
  "don't",
  'panic,',
  'THERE',
  'WILL',
  'BE',
  'ENOUGH',
  'FOOD',
  'FOR',
  'EVE

In [5]:
# Dimensionality of each learned word embedding.
vector_size = 100

# Train a Word2Vec model on the tokenised tweets; all other hyper-parameters
# are left at the library defaults.
model = Word2Vec(
    sentences=sentences,
    vector_size=vector_size,
)

In [6]:
# Tokens the trained model holds a vector for, as exposed by the model's
# index -> key list.
vocab = model.wv.index_to_key
vocab_length = len(vocab)

# The f-string already converts the integer to text, so the extra format()
# call was redundant; the printed line is unchanged.
print(f'Vocabulary Size: {vocab_length}')

Vocabulary Size: 15404


In [7]:
# Spot-check the trained embeddings: ask the model for the similarity score
# between two emotion-related words.
# NOTE(review): presumably both words must be in the model vocabulary for this
# lookup to succeed -- confirm against the gensim documentation.
model.wv.similarity('fear', 'paranoid')

0.41544273

In [8]:
# Collect the embedding of every vocabulary token, in vocabulary order.
vectors = [model.wv[token] for token in vocab]

# Length of the first embedding, shown as a check on the embedding size.
vectors[0].shape[0]

100

In [9]:
def sentence_to_vector_averaging(sentence, model):
    """Embed a tokenised sentence as the mean of its known word vectors.

    Parameters
    ----------
    sentence : iterable of str
        Tokens of a single sentence.
    model : object
        Trained embedding model exposing ``model.wv`` (supporting
        ``model.wv[token]`` lookup and a ``key_to_index`` mapping) and
        ``model.vector_size``.

    Returns
    -------
    numpy.ndarray
        Element-wise mean of the vectors of the tokens found in the model's
        vocabulary. If no token is known (including an empty sentence), a
        zero vector of length ``model.vector_size`` is returned.
    """
    # Test membership against the model's own key -> index mapping instead of
    # the notebook-level `vocab` list: the function no longer depends on
    # hidden global state, and each lookup is a hash probe rather than a
    # linear scan over the whole vocabulary.
    known_tokens = model.wv.key_to_index
    embeddings = [model.wv[word] for word in sentence if word in known_tokens]

    if len(embeddings) == 0:
        return np.zeros(model.vector_size)
    return np.mean(embeddings, axis=0)

In [10]:
# One averaged embedding per tokenised tweet, in the same order as `sentences`.
features = [
    sentence_to_vector_averaging(tokens, model)
    for tokens in sentences
]

features

[array([-0.00573337,  1.0931644 , -2.1076252 , -0.63127977,  1.7164165 ,
         0.30214608, -0.6672788 ,  0.33980134,  0.76966697,  0.7222553 ,
         0.8808998 ,  0.14142203,  0.4421455 ,  0.7004684 , -0.69566536,
         0.49906772,  0.99766225,  0.46969315,  0.6076586 ,  1.0108026 ,
        -0.17215894, -2.0037944 ,  0.0191701 , -0.39870155,  0.8294985 ,
        -0.5029277 ,  0.8066284 , -0.83381313,  0.09091681,  0.43877688,
         0.7976174 , -1.2106103 , -1.5608245 , -0.37981707, -0.96632844,
         1.0958953 ,  1.6561444 , -0.08279492, -0.03161278, -0.68127173,
        -0.41796097, -1.3040379 , -0.08747209, -0.13033977,  1.258233  ,
         0.9311807 ,  0.91752213, -0.01620101, -0.6802611 , -0.27311125,
         1.467576  , -0.8310564 , -0.4645253 ,  2.7021973 , -0.4749719 ,
         0.16533722, -1.1560311 ,  0.9429542 , -0.5486088 , -0.70429736,
         0.60265833,  1.1314561 ,  0.02333251,  0.4432629 ,  0.13795571,
         0.10781343,  0.16207416, -0.4285718 , -0.1

In [11]:
# Name the embedding dimensions x0, x1, ... up to the model's vector size.
columns = [f'x{i}' for i in range(model.vector_size)]

# One row per tweet, one column per embedding dimension.
df_text = pd.DataFrame(data=features, columns=columns)

df_text

Unnamed: 0,x0,x1,x2,x3,x4,x5,x6,x7,x8,x9,...,x90,x91,x92,x93,x94,x95,x96,x97,x98,x99
0,-0.005733,1.093164,-2.107625,-0.631280,1.716416,0.302146,-0.667279,0.339801,0.769667,0.722255,...,-0.204336,0.800533,0.418789,-0.559109,0.879544,0.481289,0.399432,-0.427920,-0.585766,0.033782
1,-0.648570,0.161324,-0.158983,-0.470081,0.748615,-0.375849,0.223543,0.308315,0.087388,-0.110033,...,0.362323,0.202472,0.131399,-0.173954,0.468886,0.094542,0.016025,-0.030525,-0.341174,0.472661
2,-0.718113,0.131502,-0.327362,-0.602941,0.110238,-0.972190,0.468776,0.842646,-0.041889,-0.545790,...,0.349131,0.050346,0.502296,-0.034400,0.141542,0.030141,-0.405373,-0.191749,-0.489962,0.501837
3,-0.397766,0.428532,-0.129252,-0.439629,0.780359,-0.422228,0.242449,0.140700,-0.120050,-0.423573,...,0.098021,0.303939,-0.051341,0.059579,0.636630,0.299569,-0.030050,-0.096401,0.070474,0.087033
4,-0.404937,0.409364,0.130549,-0.514409,0.913535,-0.397483,0.406027,0.510043,-0.120333,-0.341714,...,0.159937,0.371318,0.090010,-0.124761,0.685557,0.305242,-0.070251,0.175334,0.041705,0.298374
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
41152,-0.865131,0.209774,-0.054378,-0.532171,0.301757,-0.847475,0.352128,0.882750,-0.393040,-0.202818,...,0.536389,0.198135,-0.386475,-0.148794,0.232704,0.319675,0.242471,0.535173,0.005495,0.596527
41153,-0.499821,0.249284,-0.285645,-0.386648,0.435185,-0.711781,0.390174,0.389833,-0.173218,-0.079213,...,0.125375,0.060107,0.290178,-0.072304,0.196875,0.146808,-0.439601,-0.097496,-0.289766,0.316755
41154,-0.409229,0.214107,0.444568,-0.232377,0.905252,-0.417026,0.391207,0.252912,-0.497888,-0.673446,...,0.139926,0.790138,-0.206936,0.331219,0.944969,0.257871,0.087468,-0.304973,0.368083,-0.457824
41155,-0.612128,0.124839,0.189256,-0.163600,0.964981,-0.167563,0.281915,-0.005719,-0.268584,0.100728,...,0.052441,0.443983,0.227310,0.149179,0.826724,0.218149,-0.151992,-0.541875,-0.152920,0.069031


In [12]:
# Attach the sentiment label as the target column 'y' next to the features.
# This modifies df_text in place.
# NOTE(review): assumes df_text and df_raw share the same row index, so that
# each label lands on the matching tweet's feature row -- confirm if either
# frame is ever filtered or re-indexed upstream.
df_text['y'] = df_raw['Sentiment']

df_text

Unnamed: 0,x0,x1,x2,x3,x4,x5,x6,x7,x8,x9,...,x91,x92,x93,x94,x95,x96,x97,x98,x99,y
0,-0.005733,1.093164,-2.107625,-0.631280,1.716416,0.302146,-0.667279,0.339801,0.769667,0.722255,...,0.800533,0.418789,-0.559109,0.879544,0.481289,0.399432,-0.427920,-0.585766,0.033782,Neutral
1,-0.648570,0.161324,-0.158983,-0.470081,0.748615,-0.375849,0.223543,0.308315,0.087388,-0.110033,...,0.202472,0.131399,-0.173954,0.468886,0.094542,0.016025,-0.030525,-0.341174,0.472661,Positive
2,-0.718113,0.131502,-0.327362,-0.602941,0.110238,-0.972190,0.468776,0.842646,-0.041889,-0.545790,...,0.050346,0.502296,-0.034400,0.141542,0.030141,-0.405373,-0.191749,-0.489962,0.501837,Positive
3,-0.397766,0.428532,-0.129252,-0.439629,0.780359,-0.422228,0.242449,0.140700,-0.120050,-0.423573,...,0.303939,-0.051341,0.059579,0.636630,0.299569,-0.030050,-0.096401,0.070474,0.087033,Positive
4,-0.404937,0.409364,0.130549,-0.514409,0.913535,-0.397483,0.406027,0.510043,-0.120333,-0.341714,...,0.371318,0.090010,-0.124761,0.685557,0.305242,-0.070251,0.175334,0.041705,0.298374,Extremely Negative
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
41152,-0.865131,0.209774,-0.054378,-0.532171,0.301757,-0.847475,0.352128,0.882750,-0.393040,-0.202818,...,0.198135,-0.386475,-0.148794,0.232704,0.319675,0.242471,0.535173,0.005495,0.596527,Neutral
41153,-0.499821,0.249284,-0.285645,-0.386648,0.435185,-0.711781,0.390174,0.389833,-0.173218,-0.079213,...,0.060107,0.290178,-0.072304,0.196875,0.146808,-0.439601,-0.097496,-0.289766,0.316755,Extremely Negative
41154,-0.409229,0.214107,0.444568,-0.232377,0.905252,-0.417026,0.391207,0.252912,-0.497888,-0.673446,...,0.790138,-0.206936,0.331219,0.944969,0.257871,0.087468,-0.304973,0.368083,-0.457824,Positive
41155,-0.612128,0.124839,0.189256,-0.163600,0.964981,-0.167563,0.281915,-0.005719,-0.268584,0.100728,...,0.443983,0.227310,0.149179,0.826724,0.218149,-0.151992,-0.541875,-0.152920,0.069031,Neutral
