## Importing all the required libraries 

In [1]:
import pandas 
import numpy
import gensim
import logging
import sys, os
import nltk
import re, string, unicodedata
import multiprocessing
import tensorflow as tf

from time import time  # To time our operations
from gensim.models import Word2Vec
from gensim.utils import simple_preprocess
from tensorflow.contrib.tensorboard.plugins import projector

# */site-packages is where your current session is running its python out of
site_path = ''
for path in sys.path:
    if 'site-packages' in path.split('/')[-1]:
        print(path)
        site_path = path
# search to see if gensim in installed packages
if len(site_path) > 0:
    if not 'gensim' in os.listdir(site_path):
        print('package not found')
    else:
        print('gensim installed')    
        

# Checking tensorflow installation
print('TensorFlow version: \t%s' % tf.__version__)

/home2/vvsaripalli/SECReports/Understanding-Financial-Reports-using-Natural-Language-Processing/Natural Language Processing/wordembeddings/lib/python3.5/site-packages
gensim installed
TensorFlow version: 	1.12.0


## Defining directories for reading text files and saving checkpoints

In [8]:
# For displaying gensim logs
logging.basicConfig(format='%(levelname)s : %(message)s', level=logging.INFO)

# Directory with raw txt-files
TEXT_DIR  = '/home2/vvsaripalli/WordEmbeddingCorpus/'

# Directory for saving checkpoint and metadata
MODEL_DIR = 'Checkpoints/'

# Word2vec
EMBEDDING_SIZE = 300

## Reading all the text files in the corpus and tokeniztion

In [9]:
def read_files(path):
    """
    Read in text files
    """
    documents = list()
    tokenize  = lambda x: simple_preprocess(x)
    
    # Read in all files in directory
    if os.path.isdir(path):
        for filename in os.listdir(path):
            with open('%s/%s' % (path, filename), encoding='utf-8') as f:
                doc = f.read()
                doc = clean_doc(doc)
                documents.append(tokenize(doc))
    return documents

def clean_doc(doc):
    """
    Cleaning a document by several methods
    """
    #doing basic cleaning
    doc = re.sub(r'\-+', '.', doc)
    doc = re.sub(r'\=+', '', doc)
    doc = re.sub(r'\(+', '', doc)
    doc = re.sub(r'\)+', '', doc)
    # Lowercase
    doc = doc.lower()
    #Replace fullstop with token to train on
    doc = doc.replace('.', ' __PERIOD__ ')
    # eplace numbers with token to train on
    doc = re.sub(r"[0-9]+", "__NUMBER__", doc)
    # Remove ' and full stops and brackets
    doc = re.sub(r'[{}()\']', '', doc)
    #Remove other special characters
    doc = re.sub(r'[;,:-@#]', '', doc)
    # Split in tokens
    tokens = doc.split()
    # Tokens with less then two characters will be ignored
    tokens = [word for word in tokens if len(word) > 1]
    return ' '.join(tokens)

In [11]:
docs = read_files(TEXT_DIR)
print('Number of documents: %i' % len(docs))

Number of documents: 4361


## Building and training our Word2Vec model

Listing the necessary hyperparameteres to tunr our word2Vec model

In [12]:
cores = multiprocessing.cpu_count() # Count the number of cores in a computer

Now defining and training our Word2Vec model

In [13]:
model = gensim.models.Word2Vec(docs, size=EMBEDDING_SIZE)

INFO : collecting all words and their counts
INFO : PROGRESS: at sentence #0, processed 0 words, keeping 0 word types
INFO : collected 36515 word types from a corpus of 20113566 raw words and 4361 sentences
INFO : Loading a fresh vocabulary
INFO : effective_min_count=5 retains 20985 unique words (57% of original 36515, drops 15530)
INFO : effective_min_count=5 leaves 20081026 word corpus (99% of original 20113566, drops 32540)
INFO : deleting the raw counts dictionary of 36515 items
INFO : sample=0.001 downsamples 71 most-common words
INFO : downsampling leaves estimated 14251841 word corpus (71.0% of prior 20081026)
INFO : estimated required memory for 20985 words and 300 dimensions: 60856500 bytes
INFO : resetting layer weights
INFO : training model with 3 workers on 20985 vocabulary and 300 features, using sg=0 hs=0 sample=0.001 negative=5 window=5
INFO : EPOCH 1 - PROGRESS: at 10.75% examples, 1058743 words/s, in_qsize 5, out_qsize 0
INFO : EPOCH 1 - PROGRESS: at 22.11% examples, 1

Let's save our trained model as a checkpoint

In [14]:
if not os.path.exists(MODEL_DIR):
    os.makedirs(MODEL_DIR)
model.save(os.path.join(MODEL_DIR,'word2vec'))

INFO : saving Word2Vec object under Checkpoints/word2vec, separately None
INFO : not storing attribute vectors_norm
INFO : not storing attribute cum_table
INFO : saved Checkpoints/word2vec


Creating metadata and checkpoint

In [15]:
weights     = model.wv.vectors
index_words = model.wv.index2word

vocab_size    = weights.shape[0]
embedding_dim = weights.shape[1]

print('Shape of weights:', weights.shape)
print('Vocabulary size: %i' % vocab_size)
print('Embedding size: %i'  % embedding_dim)

with open(os.path.join(MODEL_DIR,'metadata.tsv'), 'w') as f:
    f.writelines("\n".join(index_words))

# Required if you re-run without restarting the kernel
tf.reset_default_graph()
    
W = tf.Variable(tf.constant(0.0, shape=[vocab_size, embedding_dim]), trainable=False, name="W")
embedding_placeholder = tf.placeholder(tf.float32, [vocab_size, embedding_dim])

embedding_init = W.assign(embedding_placeholder)
writer = tf.summary.FileWriter(MODEL_DIR, graph=tf.get_default_graph())
saver = tf.train.Saver()

config = projector.ProjectorConfig()
embedding = config.embeddings.add()
embedding.tensor_name = W.name
embedding.metadata_path = './metadata.tsv'
projector.visualize_embeddings(writer, config)

with tf.Session() as sess:
    sess.run(embedding_init, feed_dict={embedding_placeholder: weights})
    save_path = saver.save(sess, os.path.join(MODEL_DIR, "model.cpkt"))

Shape of weights: (20985, 300)
Vocabulary size: 20985
Embedding size: 300


In [19]:
model.wv.most_similar(positive=['pay'], topn=10)

[('receive', 0.6883692741394043),
 ('rate', 0.3697010278701782),
 ('floating', 0.36679479479789734),
 ('usd', 0.35608696937561035),
 ('pays', 0.34642866253852844),
 ('receiving', 0.34415680170059204),
 ('krw', 0.33488595485687256),
 ('paying', 0.3230683505535126),
 ('sep', 0.32099348306655884),
 ('pribor', 0.30752915143966675)]