In [1]:
import pandas as pd
import numpy as np
import ast

In [2]:
# CSV file to load, given relative to the notebook's working directory.
save_file = "saved.csv"
# Read the file into a DataFrame using pandas' default parsing options.
small_df = pd.read_csv(filepath_or_buffer=save_file)

In [3]:
# Re-wrap each of the two columns in a fresh Series and assign it back under
# the same name; a missing column raises KeyError here.
for _column in ('stemmed_tokens', 'sentiment'):
    small_df[_column] = pd.Series(small_df[_column])

In [5]:
def _parse_token_list(cell):
    """Convert the string representation of a token list into a real list.

    Values that are already lists are returned unchanged, so this cell can be
    re-run safely: calling ast.literal_eval on a list raises ValueError, which
    previously made a second execution of the cell fail.

    Any other value is handed to ast.literal_eval as before, so malformed or
    missing entries still raise instead of being silently accepted.
    """
    if isinstance(cell, list):
        return cell
    return ast.literal_eval(cell)


# Bracket assignment is used instead of attribute assignment so the column
# being replaced is explicit.
small_df['stemmed_tokens'] = small_df['stemmed_tokens'].map(_parse_token_list)

# Implementing the Word2Vec Model

This notebook presents an implementation of the model from the original Word2Vec paper.
It builds a vector space from the previously obtained stemmed tokens, so that a Random Forest classifier
can classify tweets based on their lexical content.

To learn more about implementing the Word2Vec model, see the [official tutorial from PyTorch](https://pytorch.org/tutorials/beginner/nlp/word_embeddings_tutorial.html).
If you'd like to read the original paper, see its [arXiv page](https://arxiv.org/abs/1301.3781).

In [9]:
import importlib
import src.models as models
import time

In [10]:
# Re-execute src.models so that edits made to the module file are picked up
# without restarting the kernel; the returned module object is the cell output.
importlib.reload(models)

<module 'src.models' from 'C:\\Users\\rzimm\\Workspace\\data\\word2vec_sa\\src\\models.py'>

In [8]:
# Feed the whole stemmed-token column to the project's CBOW_transform helper
# and display what it returns.
# NOTE(review): the meaning of the second argument (2) is defined in
# src.models and is not visible here — confirm before changing it.
example = small_df['stemmed_tokens']
models.CBOW_transform(example, 2)

[([['virginamerica',
    'plu',
    'you',
    've',
    'ad',
    'commerci',
    'to',
    'the',
    'experi',
    'tacki'],
   ['virginamerica',
    'ye',
    'nearli',
    'everi',
    'time',
    'fli',
    'vx',
    'thi',
    'ear',
    'worm',
    'won',
    'go',
    'away'],
   ['virginamerica',
    'it',
    'wa',
    'amaz',
    'and',
    'arriv',
    'an',
    'hour',
    'earli',
    'you',
    're',
    'too',
    'good',
    'to',
    'me']],
  ['virginamerica', 'well', 'didn', 'but', 'now', 'do']),
 ([['virginamerica',
    'ye',
    'nearli',
    'everi',
    'time',
    'fli',
    'vx',
    'thi',
    'ear',
    'worm',
    'won',
    'go',
    'away'],
   ['virginamerica', 'well', 'didn', 'but', 'now', 'do'],
   ['virginamerica',
    'lt',
    'pretti',
    'graphic',
    'so',
    'much',
    'better',
    'than',
    'minim',
    'iconographi']],
  ['virginamerica',
   'it',
   'wa',
   'amaz',
   'and',
   'arriv',
   'an',
   'hour',
   'earli',
   'you',
   'r

In [None]:

# Skip-gram model (sg = 1)
# NOTE(review): the helper is named CBOW yet sg = 1 is passed, which the
# heading above describes as skip-gram — confirm how models.CBOW uses the flag.
sg = 1
size = 400
window = 3
min_count = 1
workers = 3

model_file = 'word2vec.model'

# The clock starts before the column lookup, so the reported duration covers
# both the lookup and the training call.
start_time = time.time()
stemmed_tokens = small_df['stemmed_tokens']
w2v_model = models.CBOW(
    stemmed_tokens,
    vector_size=size,
    min_count=min_count,
    workers=workers,
    window=window,
    sg=sg,
)
print(f"Time taken to train word2vec model: {time.time() - start_time}")

# Write the returned model to model_file through its own save method.
w2v_model.save(model_file)

In [None]:
# Display the keys of the model's key_to_index mapping as the cell output.
vocabulary_index = w2v_model.wv.key_to_index
vocabulary_index.keys()