In [7]:
import numpy as np
import pandas as pd
import string

from gensim.models import Word2Vec

In [8]:
# Load the three tweet splits (train / validation / test) from CSV files
# in the current working directory, in that order.
df_train, df_validation, df_test = map(
    pd.read_csv,
    (
        "tweet_clean_train.csv",
        "tweet_clean_validation.csv",
        "tweet_clean_test.csv",
    ),
)

In [12]:
# Normalise the "text" column of the training frame. Steps, in order:
#     1. lower-case every token
#     2. drop tokens that are bare punctuation
#     3. strip every remaining punctuation character
#     4. drop purely numeric tokens
#     5. drop English stopwords
#     6. lemmatize each remaining word
#
#     * Each step is a single statement, so any of them can be commented out
#       in order to NOT apply it.

from nltk.corpus import stopwords
from textblob import Word

stop = stopwords.words("english")
# Set copy for O(1) membership tests in step 5; `stop` itself stays a list
# in case other cells rely on it.
stop_set = set(stop)

# 1. Lower-case (also collapses runs of whitespace into single spaces).
df_train["text"] = df_train["text"].apply(lambda text: " ".join(token.lower() for token in text.split()))
# 2. Drop tokens that occur as a substring of string.punctuation (e.g. "!" or "()").
df_train["text"] = df_train["text"].apply(lambda text: " ".join(token for token in text.split() if token not in string.punctuation))
# 3. Raw string avoids the invalid "\w" escape; regex=True is explicit because
#    pandas >= 2.0 treats the pattern as a literal string by default, which
#    would silently leave all punctuation in place.
df_train["text"] = df_train["text"].str.replace(r"[^\w\s]", "", regex=True)
# 4. Drop tokens made up only of digits.
df_train["text"] = df_train["text"].apply(lambda text: " ".join(token for token in text.split() if not token.isdigit()))
# 5. Drop stopwords.
# NOTE(review): this runs after punctuation stripping, so a contraction such as
# "don't" has already become "dont" and is only removed if that exact spelling
# is in the stopword list -- confirm this ordering is intended.
df_train["text"] = df_train["text"].apply(lambda text: " ".join(token for token in text.split() if token not in stop_set))
# 6. Lemmatize with TextBlob's default settings.
df_train["text"] = df_train["text"].apply(lambda text: " ".join(Word(token).lemmatize() for token in text.split()))

In [13]:
# Display the training frame after cleaning to eyeball the "text" column.
df_train

Unnamed: 0.1,Unnamed: 0,text,target
0,0,pauln got arse kicked crystal palace first dnf...,0
1,1,yeah pretty cool prefer everyones getting along,4
2,2,im happy real reason happy like,4
3,3,slept welltake shower nowhave tax declaration ...,0
4,4,yes cool favorite tune interestingly waiting r...,4
...,...,...,...
79995,79995,raidys bbq tonight free food morrellis ice cre...,4
79996,79996,dont luv follow friday thank flooding timeline...,4
79997,79997,ahh see need go recruit young lady,4
79998,79998,housewife material selfcooked rice turned wate...,0


In [16]:
# Train a Word2Vec model on the cleaned training tweets.
# - min_count=1 : ignores words with a total frequency lower than 1,
#                 i.e. every word that occurs is kept
# - size=300    : 300 dimension of the word vectors
# - workers=4   : 4 threads to train the model
# - window=2    : context window of 2 words on each side
# - sg=1, hs=1  : skip-gram architecture with hierarchical softmax
# - alpha=1e-3  : initial learning rate
# NOTE(review): `size` is the gensim < 4.0 keyword (renamed `vector_size` in
# 4.0) -- confirm the pinned gensim version before upgrading.

# One token list per tweet. str.split() with no argument never yields empty
# strings, so an empty tweet becomes [] instead of [""]; with min_count=1 the
# latter would add the empty string to the vocabulary.
wordList = [text.split() for text in df_train["text"]]
model = Word2Vec(wordList, min_count=1, size=300, workers=4, window=2, sg=1, hs=1, alpha=1e-3)

In [17]:
# Persist the trained W2V model for future usage; the same model is written
# under both file names using gensim's native save format.
for model_path in ("word2vec.model", "model.bin"):
    model.save(model_path)

['pauln',
 'got',
 'arse',
 'kicked',
 'crystal',
 'palace',
 'first',
 'dnf',
 'disappointing',
 'yeah',
 'pretty',
 'cool',
 'prefer',
 'everyones',
 'getting',
 'along',
 'im',
 'happy',
 'real',
 'reason',
 'like',
 'slept',
 'welltake',
 'shower',
 'nowhave',
 'tax',
 'declaration',
 'today',
 'veryvery',
 'boring',
 'yes',
 'favorite',
 'tune',
 'interestingly',
 'waiting',
 'remix',
 'sure',
 'haha',
 'read',
 'saw',
 'sad',
 'video',
 'iran',
 'dude',
 'pumped',
 'ill',
 'living',
 'block',
 'countdown',
 'week',
 'movie',
 'hilarious',
 'cried',
 'laughing',
 'l0l',
 'youre',
 'awesome',
 'david',
 'gonna',
 'give',
 'love',
 'u',
 'stuck',
 'blackberry',
 'sorry',
 'dont',
 'want',
 'argue',
 'anything',
 'always',
 'negate',
 'descision',
 'forgive',
 'well',
 'hope',
 'get',
 'good',
 'day',
 'anyway',
 'men',
 'mr',
 'darcy',
 'pride',
 'prejudice',
 'hey',
 'state',
 'origin',
 'start',
 'goin',
 'wait',
 'worth',
 'big',
 'warm',
 'hug',
 'fun',
 'made',
 'mine',
 'humph