## Importing all the required libraries 

In [9]:
import pandas 
import numpy
import gensim
import logging
import sys, os
import nltk
import re, string, unicodedata
import multiprocessing
import tensorflow as tf

from time import time  # To time our operations
from gensim.models import Word2Vec
from gensim.utils import simple_preprocess
from tensorflow.contrib.tensorboard.plugins import projector

# */site-packages is where your current session is running its python out of
site_path = ''
for path in sys.path:
    if 'site-packages' in path.split('/')[-1]:
        print(path)
        site_path = path
# search to see if gensim in installed packages
if len(site_path) > 0:
    if not 'gensim' in os.listdir(site_path):
        print('package not found')
    else:
        print('gensim installed')    
        

# Checking tensorflow installation
print('TensorFlow version: \t%s' % tf.__version__)

/home2/vvsaripalli/SECReports/Understanding-Financial-Reports-using-Natural-Language-Processing/Natural Language Processing/wordembeddings/lib/python3.5/site-packages
gensim installed
TensorFlow version: 	1.12.0


## Defining directories for reading text files and saving checkpoints

In [10]:
# For displaying gensim logs
logging.basicConfig(format='%(levelname)s : %(message)s', level=logging.INFO)

# Directory with raw txt-files
TEXT_DIR  = 'Train/'

# Directory for saving checkpoint and metadata
MODEL_DIR = 'Checkpoints/'

# Word2vec
EMBEDDING_SIZE = 300

## Reading all the text files in the corpus and tokeniztion

In [11]:
def read_files(path):
    """
    Read in text files
    """
    documents = list()
    tokenize  = lambda x: simple_preprocess(x)
    
    # Read in all files in directory
    if os.path.isdir(path):
        for filename in os.listdir(path):
            with open('%s/%s' % (path, filename), encoding='utf-8') as f:
                doc = f.read()
                doc = clean_doc(doc)
                documents.append(tokenize(doc))
    return documents

def clean_doc(doc):
    """
    Cleaning a document by several methods
    """
    # Lowercase
    doc = doc.lower()
    # Remove numbers
    doc = re.sub(r"[0-9]+", "", doc)
    # Split in tokens
    tokens = doc.split()
    # Remove punctuation
    tokens = [w.translate(str.maketrans('', '', string.punctuation)) for w in tokens]
    # Tokens with less then two characters will be ignored
    tokens = [word for word in tokens if len(word) > 1]
    return ' '.join(tokens)

In [12]:
docs = read_files(TEXT_DIR)
print('Number of documents: %i' % len(docs))

Number of documents: 7855


## Building and training our Word2Vec model

Listing the necessary hyperparameteres to tunr our word2Vec model

In [13]:
cores = multiprocessing.cpu_count() # Count the number of cores in a computer

Now defining and training our Word2Vec model

In [14]:
model = gensim.models.Word2Vec(docs, size=EMBEDDING_SIZE)

INFO : collecting all words and their counts
INFO : PROGRESS: at sentence #0, processed 0 words, keeping 0 word types
INFO : collected 49588 word types from a corpus of 33675431 raw words and 7855 sentences
INFO : Loading a fresh vocabulary
INFO : effective_min_count=5 retains 24966 unique words (50% of original 49588, drops 24622)
INFO : effective_min_count=5 leaves 33630646 word corpus (99% of original 33675431, drops 44785)
INFO : deleting the raw counts dictionary of 49588 items
INFO : sample=0.001 downsamples 59 most-common words
INFO : downsampling leaves estimated 24932968 word corpus (74.1% of prior 33630646)
INFO : estimated required memory for 24966 words and 300 dimensions: 72401400 bytes
INFO : resetting layer weights
INFO : training model with 3 workers on 24966 vocabulary and 300 features, using sg=0 hs=0 sample=0.001 negative=5 window=5
INFO : EPOCH 1 - PROGRESS: at 3.09% examples, 657036 words/s, in_qsize 5, out_qsize 0
INFO : EPOCH 1 - PROGRESS: at 6.70% examples, 7263

INFO : worker thread finished; awaiting finish of 1 more threads
INFO : worker thread finished; awaiting finish of 0 more threads
INFO : EPOCH - 3 : training on 33675431 raw words (21419134 effective words) took 26.3s, 813930 effective words/s
INFO : EPOCH 4 - PROGRESS: at 3.86% examples, 808815 words/s, in_qsize 5, out_qsize 0
INFO : EPOCH 4 - PROGRESS: at 7.50% examples, 837528 words/s, in_qsize 6, out_qsize 0
INFO : EPOCH 4 - PROGRESS: at 11.18% examples, 814573 words/s, in_qsize 6, out_qsize 0
INFO : EPOCH 4 - PROGRESS: at 14.74% examples, 812872 words/s, in_qsize 6, out_qsize 0
INFO : EPOCH 4 - PROGRESS: at 18.20% examples, 809635 words/s, in_qsize 5, out_qsize 0
INFO : EPOCH 4 - PROGRESS: at 22.08% examples, 814217 words/s, in_qsize 5, out_qsize 0
INFO : EPOCH 4 - PROGRESS: at 26.00% examples, 821046 words/s, in_qsize 5, out_qsize 0
INFO : EPOCH 4 - PROGRESS: at 30.40% examples, 824718 words/s, in_qsize 5, out_qsize 0
INFO : EPOCH 4 - PROGRESS: at 34.27% examples, 826127 words/s,

Let's save our trained model as a checkpoint

In [15]:
if not os.path.exists(MODEL_DIR):
    os.makedirs(MODEL_DIR)
model.save(os.path.join(MODEL_DIR,'word2vec'))

INFO : saving Word2Vec object under Checkpoints/word2vec, separately None
INFO : not storing attribute vectors_norm
INFO : not storing attribute cum_table
INFO : saved Checkpoints/word2vec


Creating metadata and checkpoint

In [16]:
weights     = model.wv.vectors
index_words = model.wv.index2word

vocab_size    = weights.shape[0]
embedding_dim = weights.shape[1]

print('Shape of weights:', weights.shape)
print('Vocabulary size: %i' % vocab_size)
print('Embedding size: %i'  % embedding_dim)

with open(os.path.join(MODEL_DIR,'metadata.tsv'), 'w') as f:
    f.writelines("\n".join(index_words))

# Required if you re-run without restarting the kernel
tf.reset_default_graph()
    
W = tf.Variable(tf.constant(0.0, shape=[vocab_size, embedding_dim]), trainable=False, name="W")
embedding_placeholder = tf.placeholder(tf.float32, [vocab_size, embedding_dim])

embedding_init = W.assign(embedding_placeholder)
writer = tf.summary.FileWriter(MODEL_DIR, graph=tf.get_default_graph())
saver = tf.train.Saver()

config = projector.ProjectorConfig()
embedding = config.embeddings.add()
embedding.tensor_name = W.name
embedding.metadata_path = './metadata.tsv'
projector.visualize_embeddings(writer, config)

with tf.Session() as sess:
    sess.run(embedding_init, feed_dict={embedding_placeholder: weights})
    save_path = saver.save(sess, os.path.join(MODEL_DIR, "model.cpkt"))

Shape of weights: (24966, 300)
Vocabulary size: 24966
Embedding size: 300


In [24]:
model.wv.most_similar(positive=['buy'], topn=10)

[('sell', 0.5410637855529785),
 ('contractsbuy', 0.4904049038887024),
 ('issuersbuy', 0.47991445660591125),
 ('valuesell', 0.4697945713996887),
 ('contractssell', 0.4678630232810974),
 ('valuebuy', 0.46492916345596313),
 ('balancebuy', 0.45454537868499756),
 ('issuessold', 0.45103883743286133),
 ('indicessell', 0.45057427883148193),
 ('indicesbuy', 0.44925206899642944)]