##### Imports

In [15]:
import pandas as pd
import nltk
import numpy as np
from gensim.models import Word2Vec
from sklearn.model_selection import train_test_split
import pickle
from sklearn.preprocessing import LabelEncoder

##### Load in Text Data

In [16]:
# Read the pickled training frame and renumber its rows 0..n-1, discarding
# whatever index was stored in the pickle.
# NOTE: unpickling executes code embedded in the file -- only load pickles
# that come from a trusted source.
text_df = pd.read_pickle("merged_training.pkl").reset_index(drop = True)
text_df

Unnamed: 0,text,emotions
0,i feel awful about it too because it s my job ...,sadness
1,im alone i feel awful,sadness
2,ive probably mentioned this before but i reall...,joy
3,i was feeling a little low few days back,sadness
4,i beleive that i am much more sensitive to oth...,love
...,...,...
416804,that was what i felt when i was finally accept...,joy
416805,i take every day as it comes i m just focussin...,fear
416806,i just suddenly feel that everything was fake,sadness
416807,im feeling more eager than ever to claw back w...,joy


##### Tokenize Text

In [3]:
# Tokenize every document: data_by_words[k] is the list of tokens that
# nltk.word_tokenize produces for the k-th entry of the 'text' column.
data_by_words = [nltk.word_tokenize(document) for document in text_df['text']]
# show the first tokenized document as a sanity check
data_by_words[0]

['i',
 'feel',
 'awful',
 'about',
 'it',
 'too',
 'because',
 'it',
 's',
 'my',
 'job',
 'to',
 'get',
 'him',
 'in',
 'a',
 'position',
 'to',
 'succeed',
 'and',
 'it',
 'just',
 'didn',
 't',
 'happen',
 'here']

##### Get Document Embeddings

In [4]:
# Train the Word2Vec model exactly once.
# Passing `sentences=` to the constructor already builds the vocabulary AND
# runs a full training pass, so a following explicit train() call would train
# the same model a second time (restarting the learning-rate schedule).
# Construct the model without a corpus, build the vocabulary, then run the
# single intended 50-epoch training pass.
model = Word2Vec(vector_size = 100)
model.build_vocab(data_by_words)
# corpus_count is set by build_vocab() to the number of documents scanned.
# train() returns (effective word count, raw word count), shown as cell output.
model.train(data_by_words, total_examples = model.corpus_count, epochs = 50)

(271015015, 400544400)

In [5]:
# Export only the trained word vectors (not the full trainable model) in the
# plain-text word2vec format; despite the ".model" suffix the file is text.
word_vectors = model.wv
word_vectors.save_word2vec_format("word2vec.model", binary = False)

In [19]:
# Build one fixed-length vector per document by averaging the Word2Vec
# vectors of its in-vocabulary tokens.
# Documents with no in-vocabulary token are skipped (their position is
# printed), so the resulting array can have fewer rows than data_by_words.
vocab = model.wv.key_to_index
doc_vectors = []
for doc_idx, tokens in enumerate(data_by_words):
    # keep only tokens the model has a vector for
    known_vectors = [model.wv[token] for token in tokens if token in vocab]
    if not known_vectors:
        # nothing to average: report the document's position and move on
        print(doc_idx)
        continue
    # element-wise mean across the document's word vectors
    doc_vectors.append(np.mean(known_vectors, axis = 0))
embeddings_word_data = np.array(doc_vectors)
print(embeddings_word_data.shape)

45779
(416808, 100)


In [20]:
# split into train, val, test
SEED = 42  # fixed seed so the train/val/test partition is reproducible

# Keep a label only for documents that received an embedding. A document is
# dropped from embeddings_word_data when none of its tokens is in the
# Word2Vec vocabulary, so recompute that condition here instead of
# hard-coding the index of the dropped row (which silently misaligns labels
# and features whenever the data or the vocabulary changes).
vocab = model.wv.key_to_index
has_embedding = [any(token in vocab for token in tokens) for tokens in data_by_words]
y = [label for label, keep in zip(text_df['emotions'], has_embedding) if keep]
assert len(y) == len(embeddings_word_data), "labels and embeddings are misaligned"

# encode the emotion names as integer class ids
labelencoder = LabelEncoder()
y = labelencoder.fit_transform(y)

# 80% train, then the remaining 20% is halved into 10% val / 10% test
X_train, X_val_test, y_train, y_val_test = train_test_split(embeddings_word_data, y, test_size=0.2, random_state=SEED)

X_val, X_test, y_val, y_test = train_test_split(X_val_test, y_val_test, test_size=0.50, random_state=SEED)

In [24]:
# save embeddings: one .npy file per split
embedding_files = {
    'text_embeddings_train.npy': X_train,
    'text_embeddings_val.npy': X_val,
    'text_embeddings_test.npy': X_test,
}
for npy_path, split_matrix in embedding_files.items():
    np.save(npy_path, split_matrix)

In [25]:
# save labels: pickle the integer-encoded label array of each split
# NOTE(review): only the encoded ids are written in this cell; the fitted
# LabelEncoder (id -> emotion name mapping) is not saved here -- confirm it
# is persisted elsewhere if the names are needed downstream.
label_files = (
    ("train_y.pkl", y_train),
    ("val_y.pkl", y_val),
    ("test_y.pkl", y_test),
)
for pkl_path, split_labels in label_files:
    with open(pkl_path, "wb") as handle:
        pickle.dump(split_labels, handle)