## Importing all the required libraries 

In [1]:
import pandas 
import numpy
import gensim
import logging
import sys, os
import nltk
import re, string, unicodedata
import multiprocessing
import tensorflow as tf

from time import time  # To time our operations
from gensim.models import Word2Vec
from gensim.utils import simple_preprocess
from tensorflow.contrib.tensorboard.plugins import projector

# */site-packages is where your current session is running its python out of
site_path = ''
for path in sys.path:
    if 'site-packages' in path.split('/')[-1]:
        print(path)
        site_path = path
# search to see if gensim in installed packages
if len(site_path) > 0:
    if not 'gensim' in os.listdir(site_path):
        print('package not found')
    else:
        print('gensim installed')    
        

# Checking tensorflow installation
print('TensorFlow version: \t%s' % tf.__version__)

/home2/vvsaripalli/SECReports/Understanding-Financial-Reports-using-Natural-Language-Processing/Natural Language Processing/wordembeddings/lib/python3.5/site-packages
gensim installed
TensorFlow version: 	1.12.0


## Defining directories for reading text files and saving checkpoints

In [2]:
# For displaying gensim logs
logging.basicConfig(format='%(levelname)s : %(message)s', level=logging.INFO)

# Directory with raw txt-files
TEXT_DIR  = 'Train/'

# Directory for saving checkpoint and metadata
MODEL_DIR = 'Checkpoints/'

# Word2vec
EMBEDDING_SIZE = 300

## Reading all the text files in the corpus and tokeniztion

In [3]:
def read_files(path):
    """
    Read in text files
    """
    documents = list()
    tokenize  = lambda x: simple_preprocess(x)
    
    # Read in all files in directory
    if os.path.isdir(path):
        for filename in os.listdir(path):
            with open('%s/%s' % (path, filename), encoding='utf-8') as f:
                doc = f.read()
                doc = clean_doc(doc)
                documents.append(tokenize(doc))
    return documents

def clean_doc(doc):
    """
    Cleaning a document by several methods
    """
    #doing basic cleaning
    doc = re.sub(r'\-+', '.', doc)
    doc = re.sub(r'\=+', '', doc)
    doc = re.sub(r'\(+', '', doc)
    doc = re.sub(r'\)+', '', doc)
    # Lowercase
    doc = doc.lower()
    #Replace fullstop with token to train on
    doc = doc.replace('.', ' __PERIOD__ ')
    # Remove numbers
    doc = re.sub(r"[0-9]+", "__NUMBER__", doc)
    # Remove ' and full stops and brackets
    doc = re.sub(r'[{}()\']', '', doc)
    #Remove other special characters
    doc = re.sub(r'[;,:-@#]', '', doc)
    # Split in tokens
    tokens = doc.split()
    # Tokens with less then two characters will be ignored
    tokens = [word for word in tokens if len(word) > 1]
    return ' '.join(tokens)

In [4]:
docs = read_files(TEXT_DIR)
print('Number of documents: %i' % len(docs))

Number of documents: 13848


## Building and training our Word2Vec model

Listing the necessary hyperparameteres to tunr our word2Vec model

In [5]:
cores = multiprocessing.cpu_count() # Count the number of cores in a computer

Now defining and training our Word2Vec model

In [6]:
model = gensim.models.Word2Vec(docs, size=EMBEDDING_SIZE)

INFO : collecting all words and their counts
INFO : PROGRESS: at sentence #0, processed 0 words, keeping 0 word types
INFO : PROGRESS: at sentence #10000, processed 34394597 words, keeping 48456 word types
INFO : collected 52587 word types from a corpus of 47086854 raw words and 13848 sentences
INFO : Loading a fresh vocabulary
INFO : effective_min_count=5 retains 28021 unique words (53% of original 52587, drops 24566)
INFO : effective_min_count=5 leaves 47040002 word corpus (99% of original 47086854, drops 46852)
INFO : deleting the raw counts dictionary of 52587 items
INFO : sample=0.001 downsamples 59 most-common words
INFO : downsampling leaves estimated 34921007 word corpus (74.2% of prior 47040002)
INFO : estimated required memory for 28021 words and 300 dimensions: 81260900 bytes
INFO : resetting layer weights
INFO : training model with 3 workers on 28021 vocabulary and 300 features, using sg=0 hs=0 sample=0.001 negative=5 window=5
INFO : EPOCH 1 - PROGRESS: at 2.77% examples, 8

INFO : EPOCH 3 - PROGRESS: at 31.51% examples, 879949 words/s, in_qsize 5, out_qsize 0
INFO : EPOCH 3 - PROGRESS: at 34.79% examples, 883664 words/s, in_qsize 5, out_qsize 1
INFO : EPOCH 3 - PROGRESS: at 37.88% examples, 889356 words/s, in_qsize 5, out_qsize 0
INFO : EPOCH 3 - PROGRESS: at 40.89% examples, 893364 words/s, in_qsize 5, out_qsize 0
INFO : EPOCH 3 - PROGRESS: at 43.95% examples, 896566 words/s, in_qsize 5, out_qsize 0
INFO : EPOCH 3 - PROGRESS: at 47.09% examples, 898655 words/s, in_qsize 5, out_qsize 0
INFO : EPOCH 3 - PROGRESS: at 50.26% examples, 899469 words/s, in_qsize 5, out_qsize 0
INFO : EPOCH 3 - PROGRESS: at 53.47% examples, 901873 words/s, in_qsize 5, out_qsize 0
INFO : EPOCH 3 - PROGRESS: at 56.28% examples, 904644 words/s, in_qsize 5, out_qsize 0
INFO : EPOCH 3 - PROGRESS: at 59.51% examples, 906299 words/s, in_qsize 5, out_qsize 0
INFO : EPOCH 3 - PROGRESS: at 62.61% examples, 908268 words/s, in_qsize 5, out_qsize 0
INFO : EPOCH 3 - PROGRESS: at 65.55% exampl

INFO : EPOCH 5 - PROGRESS: at 97.33% examples, 895535 words/s, in_qsize 5, out_qsize 0
INFO : worker thread finished; awaiting finish of 2 more threads
INFO : worker thread finished; awaiting finish of 1 more threads
INFO : worker thread finished; awaiting finish of 0 more threads
INFO : EPOCH - 5 : training on 47086854 raw words (30489151 effective words) took 34.0s, 896810 effective words/s
INFO : training on a 235434270 raw words (152434962 effective words) took 168.5s, 904865 effective words/s


Let's save our trained model as a checkpoint

In [7]:
if not os.path.exists(MODEL_DIR):
    os.makedirs(MODEL_DIR)
model.save(os.path.join(MODEL_DIR,'word2vec'))

INFO : saving Word2Vec object under Checkpoints/word2vec, separately None
INFO : not storing attribute vectors_norm
INFO : not storing attribute cum_table
INFO : saved Checkpoints/word2vec


Creating metadata and checkpoint

In [8]:
weights     = model.wv.vectors
index_words = model.wv.index2word

vocab_size    = weights.shape[0]
embedding_dim = weights.shape[1]

print('Shape of weights:', weights.shape)
print('Vocabulary size: %i' % vocab_size)
print('Embedding size: %i'  % embedding_dim)

with open(os.path.join(MODEL_DIR,'metadata.tsv'), 'w') as f:
    f.writelines("\n".join(index_words))

# Required if you re-run without restarting the kernel
tf.reset_default_graph()
    
W = tf.Variable(tf.constant(0.0, shape=[vocab_size, embedding_dim]), trainable=False, name="W")
embedding_placeholder = tf.placeholder(tf.float32, [vocab_size, embedding_dim])

embedding_init = W.assign(embedding_placeholder)
writer = tf.summary.FileWriter(MODEL_DIR, graph=tf.get_default_graph())
saver = tf.train.Saver()

config = projector.ProjectorConfig()
embedding = config.embeddings.add()
embedding.tensor_name = W.name
embedding.metadata_path = './metadata.tsv'
projector.visualize_embeddings(writer, config)

with tf.Session() as sess:
    sess.run(embedding_init, feed_dict={embedding_placeholder: weights})
    save_path = saver.save(sess, os.path.join(MODEL_DIR, "model.cpkt"))

Shape of weights: (28021, 300)
Vocabulary size: 28021
Embedding size: 300


In [9]:
model.wv.most_similar(positive=['notional'], topn=10)

INFO : precomputing L2-norms of word weight vectors


[('rate', 0.38391363620758057),
 ('counterparty', 0.3482518196105957),
 ('credit', 0.3377344012260437),
 ('notionalamount', 0.32945629954338074),
 ('default', 0.31928637623786926),
 ('levelsfair', 0.3183319568634033),
 ('spread', 0.31472811102867126),
 ('thenotional', 0.3118211030960083),
 ('longnotional', 0.2922995686531067),
 ('face', 0.286532461643219)]