## Importing all the required libraries 

In [None]:
import pandas 
import numpy
import glob
import gensim
import logging
import sys, os
import nltk
import inflect
import itertools
import re, string, unicodedata
import multiprocessing
import tensorflow as tf

from nltk.tokenize import TweetTokenizer, sent_tokenize
from time import time  # To time our operations
from gensim.models import Word2Vec
from gensim.models import Doc2Vec
from gensim.models.phrases import Phraser, Phrases
from tensorflow.contrib.tensorboard.plugins import projector

# Locate the site-packages directory the current session imports from.
# os.path.basename is used rather than splitting on '/' so the check also
# works with Windows-style path separators.
site_path = ''
for path in sys.path:
    if 'site-packages' in os.path.basename(path):
        print(path)
        # When several entries match, the last one wins
        site_path = path
# Look for a gensim entry among the installed packages. sys.path may contain
# entries that do not exist on disk, so only list the directory when present.
if site_path and os.path.isdir(site_path):
    if 'gensim' not in os.listdir(site_path):
        print('package not found')
    else:
        print('gensim installed')
        

# Checking tensorflow installation. The package is imported under the alias
# `tf`, so the version has to be read through that alias.
print('TensorFlow version: \t%s' % tf.__version__)

## Defining directories for reading text files and saving checkpoints

In [None]:
# Configure root logging at INFO level ("<level> : <message>") so that
# gensim's log messages are displayed
logging.basicConfig(level=logging.INFO, format='%(levelname)s : %(message)s')

# Where the raw txt-files are read from
TEXT_DIR = 'data/train'

# Where the checkpoint and metadata are saved
MODEL_DIR = 'emb_yelp/'

# Word2vec embedding size
# NOTE(review): not referenced in the visible cells; the model below is
# built with num_features instead — confirm which value is intended
EMBEDDING_SIZE = 300

## Reading all the text files in the corpus and tokenization

In [None]:
def read_files(path):
    """
    Read, clean and tokenize every text file found directly inside ``path``.

    Each file is decoded as UTF-8, normalised with ``clean_doc`` and then
    split into tokens with ``gensim.utils.simple_preprocess``.

    Parameters
    ----------
    path : str
        Directory containing the raw text files.

    Returns
    -------
    list
        One token list per file, in sorted filename order. An empty list is
        returned when ``path`` is not an existing directory.
    """
    documents = list()

    # Nothing to read when the directory is missing
    if not os.path.isdir(path):
        return documents

    # Sorted so the document order does not depend on the order in which the
    # operating system happens to list the directory
    for filename in sorted(os.listdir(path)):
        file_path = os.path.join(path, filename)
        # Skip sub-directories and other non-file entries that open() cannot read
        if not os.path.isfile(file_path):
            continue
        with open(file_path, encoding='utf-8') as f:
            doc = clean_doc(f.read())
        # simple_preprocess is reached through the `gensim` package imported
        # at the top of the notebook; the bare name is not in scope
        documents.append(gensim.utils.simple_preprocess(doc))
    return documents

def clean_doc(doc):
    """
    Normalise raw text: lowercase it, drop the digits 0-9 and all characters
    in string.punctuation, and discard tokens shorter than two characters.

    Returns the surviving tokens joined by single spaces.
    """
    # Translation table that deletes every character in string.punctuation
    strip_punct = str.maketrans('', '', string.punctuation)
    # Lowercase first, then erase runs of digits
    text = re.sub(r"[0-9]+", "", doc.lower())
    kept = []
    # Whitespace-split, then strip punctuation from each token
    for raw in text.split():
        stripped = raw.translate(strip_punct)
        # Tokens with fewer than two characters are ignored
        if len(stripped) > 1:
            kept.append(stripped)
    return ' '.join(kept)

In [None]:
# Read, clean and tokenize every file under TEXT_DIR, then report how many
# documents were loaded
docs = read_files(TEXT_DIR)
print('Number of documents: %i' % len(docs))

## Building and training our Word2Vec model

Listing the hyperparameters needed to tune our Word2Vec model

In [None]:
# NOTE(review): `cores` is not referenced in the visible cells; num_workers
# below calls cpu_count() again — confirm whether this variable is still needed
cores = multiprocessing.cpu_count() # Count the number of cores in a computer

In [None]:
# Dimensions of each word embedding
num_features = 200
# Keep every word, even those seen only once: not advisable in general, but
# a feature vector has to be extracted for each word
min_word_count = 1
# Number of threads running in parallel (one per CPU reported by the system)
num_workers = multiprocessing.cpu_count()
# Context window length
context_size = 7
# Downsampling threshold for very frequent words
downsampling = 1e-3
# Seed for the random number generator, to make results reproducible
seed = 1

Now defining our Word2Vec model with the above declared hyperparameters

In [None]:
# Construct the model from the hyperparameters declared above. No corpus is
# passed here; the vocabulary is built and the model trained in later cells.
word2vec_ = Word2Vec(
    size=num_features,         # dimensionality of the word vectors
    window=context_size,       # context window length
    min_count=min_word_count,  # minimum frequency for a word to be kept
    sample=downsampling,       # downsampling threshold for frequent words
    workers=num_workers,       # number of worker threads
    seed=seed,                 # random number generator seed
    sg=1,                      # 1 selects the skip-gram training algorithm
)

It's important that we build the vocabulary before training the model

In [None]:
# Build the vocabulary from the tokenized documents returned by read_files
# (`docs`); no variable named `data` is defined in this notebook
word2vec_.build_vocab(docs)

Now training the Word2Vec model with the vocabulary generated above

In [None]:
# Train on the tokenized documents held in `docs` (the corpus returned by
# read_files); no variable named `data` is defined in this notebook
word2vec_.train(docs, total_examples = word2vec_.corpus_count, epochs = word2vec_.iter)

In [None]:
# Number of entries in the model's vocabulary
len(word2vec_.wv.vocab)

In [None]:
# Query nearest neighbours through the model's word vectors (`wv`), as the
# vocabulary cells do; the model-level most_similar is deprecated in gensim
word2vec_.wv.most_similar('credit')

### Iterate through the entire vocabulary

In [None]:
# Materialise the vocabulary words as a list and preview the first 100
vocab = list(word2vec_.wv.vocab)
vocab[:100]