## Importing all the required libraries 

In [1]:
import pandas 
import numpy
import glob
import gensim
import logging
import sys, os
import nltk
import inflect
import itertools
import re, string, unicodedata
import multiprocessing 

from nltk.tokenize import TweetTokenizer, sent_tokenize
from time import time  # To time our operations
from gensim.models import Word2Vec
from gensim.models.phrases import Phraser, Phrases

# Locate the site-packages directory the current session's Python runs out of.
# When several sys.path entries match, the last one wins.
site_path = ''
for path in sys.path:
    # Use normpath/basename instead of splitting on '/' so that entries with a
    # trailing separator or platform-specific separators are recognised too.
    if 'site-packages' in os.path.basename(os.path.normpath(path)):
        print(path)
        site_path = path

# Report whether gensim is present in that directory.
if site_path:
    # sys.path may list directories that do not exist on disk; os.listdir
    # would raise on those, so treat a missing directory as "not found".
    if os.path.isdir(site_path) and 'gensim' in os.listdir(site_path):
        print('gensim installed')
    else:
        print('package not found')

/Users/Tarun/Code/Understanding-Financial-Reports-using-Natural-Language-Processing/Natural Language Processing/wordembedding/lib/python2.7/site-packages
gensim installed


## Reading all the text files in the corpus and tokenization

In [24]:
def readCorpus(pattern='Train/*.txt'):
    """Read every text file matching `pattern` and tokenize it.

    Each file is split into sentences with `nltk.sent_tokenize`, and every
    sentence is split into tokens with `TweetTokenizer`.

    Parameters
    ----------
    pattern : str, optional
        Glob pattern selecting the corpus files. Defaults to 'Train/*.txt'.

    Returns
    -------
    list of list
        One token list per sentence, concatenated across all matching files
        (files are visited in sorted filename order). An empty list is
        returned when no file matches.
    """
    # A single tokenizer instance is reused for every file.
    tokenizer_words = TweetTokenizer()
    tokens_sentences = []
    # Sort the matches: glob returns them in arbitrary order, and a stable
    # sentence order keeps downstream training reproducible.
    for filename in sorted(glob.glob(pattern)):
        print(filename)
        with open(filename, 'rb') as f:
            text = f.read()
        # The file is opened in binary mode; decode so the tokenizers receive
        # text rather than raw bytes. Undecodable bytes are dropped.
        if isinstance(text, bytes):
            text = text.decode('utf-8', 'ignore')
        # Accumulate instead of returning here, so that every file in the
        # corpus contributes sentences (not just the first one).
        tokens_sentences.extend(
            tokenizer_words.tokenize(sentence)
            for sentence in nltk.sent_tokenize(text)
        )
    return tokens_sentences

def normalize():
    """Return the tokenized corpus exactly as `readCorpus` produces it.

    No further normalization is applied; this is a pass-through wrapper.
    """
    return readCorpus()

In [25]:
# Tokenize the corpus once and keep the result in `data` for the cells below.
data = normalize()
# Bare expression: the notebook renders the whole nested token list as output.
data

Train/0001193125-17-056504_Parsed.txt


[[u'0001193125-17-056504.txt',
  u':',
  u'2017022',
  u'4',
  u'0001193125-17-056504.hdr.sgml',
  u':',
  u'2017022',
  u'4',
  u'2017022417',
  u'0355',
  u'ACCESSION',
  u'NUMBER',
  u':',
  u'00011931',
  u'25-17-',
  u'056504',
  u'CONFORMED',
  u'SUBMISSION',
  u'TYPE',
  u':',
  u'N-Q',
  u'PUBLIC',
  u'DOCUMENT',
  u'COUNT',
  u':',
  u'2',
  u'CONFORMED',
  u'PERIOD',
  u'OF',
  u'REPORT',
  u':',
  u'2016123',
  u'1',
  u'FILED',
  u'AS',
  u'OF',
  u'DATE',
  u':',
  u'2017022',
  u'4',
  u'DATE',
  u'AS',
  u'OF',
  u'CHANGE',
  u':',
  u'2017022',
  u'4',
  u'EFFECTIVENESS',
  u'DATE',
  u':',
  u'2017022',
  u'4',
  u'FILER',
  u':',
  u'COMPANY',
  u'DATA',
  u':',
  u'COMPANY',
  u'CONFORMED',
  u'NAME',
  u':',
  u'PIMCO',
  u'FUNDS',
  u'CENTRAL',
  u'INDEX',
  u'KEY',
  u':',
  u'00008108',
  u'93',
  u'IRS',
  u'NUMBER',
  u':',
  u'9526323',
  u'39',
  u'STATE',
  u'OF',
  u'INCORPORATION',
  u':',
  u'MA',
  u'FISCAL',
  u'YEAR',
  u'END',
  u':',
  u'0331',
  u'F

## Building and training our Word2Vec model

Listing the hyperparameters used to tune our Word2Vec model

In [26]:
# Number of CPUs reported for this machine.
# NOTE(review): `cores` is not referenced in the visible cells; `num_workers`
# repeats the same call.
cores = multiprocessing.cpu_count()

In [27]:
# Reproducibility: fixed seed for the random number generator.
seed = 1

# Shape of the embeddings and of the context they are learned from.
num_features = 200  # dimensions of each word embedding
context_size = 7    # context window length

# Vocabulary filtering. A minimum count of 1 is not advisable in general, but
# a feature vector is needed for every word, so nothing may be dropped.
min_word_count = 1
downsampling = 1e-3  # downsampling for very frequent words

# Parallelism: number of threads running in parallel.
num_workers = multiprocessing.cpu_count()

Now defining our Word2Vec model with the above declared hyperparameters

In [28]:
# Skip-gram Word2Vec model (sg=1) configured from the hyperparameters declared
# earlier. No corpus is passed here, so the vocabulary is built and the model
# trained in separate steps.
word2vec_ = Word2Vec(
    size=num_features,
    window=context_size,
    min_count=min_word_count,
    sample=downsampling,
    sg=1,
    seed=seed,
    workers=num_workers,
)

It's important that we build the vocabulary before training the model

In [29]:
# Build the model's vocabulary from the tokenized sentences in `data`.
word2vec_.build_vocab(data)

Now training the Word2Vec model with the vocabulary generated above

In [30]:
# Train on the tokenized sentences. The epoch count is read from the model's
# `epochs` attribute: the `iter` attribute is deprecated in gensim 3.x and
# emits a DeprecationWarning on access.
word2vec_.train(data, total_examples=word2vec_.corpus_count, epochs=word2vec_.epochs)

  """Entry point for launching an IPython kernel.


(3046606, 4175820)

In [31]:
# Number of entries in the model's vocabulary.
len(word2vec_.wv.vocab)

40518

In [33]:
# Query nearest neighbours through the model's KeyedVectors (`wv`); calling
# `most_similar` directly on the model is deprecated in gensim and emits a
# DeprecationWarning.
word2vec_.wv.most_similar('Reference')

  """Entry point for launching an IPython kernel.


[(u'Entity', 0.9815545082092285),
 (u'ReferenceEntity', 0.9602895975112915),
 (u'Fixed', 0.9542164206504822),
 (u'ReceiveRate', 0.9305229783058167),
 (u'FixedReceiveRate', 0.9270812273025513),
 (u'ImpliedCreditSpread', 0.9150458574295044),
 (u'ImpliedCreditSpreadatDecember', 0.913988471031189),
 (u'ReceiveVolatility', 0.9105256795883179),
 (u'Tranches', 0.9020254015922546),
 (u'ImpliedCredit', 0.9012861251831055)]

### Iterate through the entire vocabulary

In [23]:
# Collect every vocabulary key into a list and preview the first 100.
# NOTE(review): this cell's execution count (23) predates the training cells
# (24-33), so the saved output may come from an earlier run -- re-run it after
# training to refresh the listing.
vocab = list(word2vec_.wv.vocab.keys())
vocab[:100]

[u'1037 1188',
 u'481 456 1000',
 u'PTYCX',
 u'0 904  1220',
 u'741   738  0500',
 u'11542',
 u'11543',
 u'11540',
 u'11546',
 u'623639',
 u'11544',
 u'11545',
 u'11549',
 u'5988',
 u'5989',
 u'5982',
 u'5983',
 u'5980',
 u'5981',
 u'5986',
 u'5987',
 u'5984',
 u'5985',
 u'0470 1230201',
 u'Western',
 u'239 198 0916',
 u'0 102017 2500',
 u'Successor',
 u'012017  7909',
 u'644 8966',
 u'137041',
 u'990 1043',
 u'215306',
 u'1705 3421',
 u'458 426 2137',
 u'83899',
 u'748 771 0536',
 u'82991',
 u'Signatures',
 u'02012039',
 u'912 905 2990',
 u'1051Cost',
 u'101645 0111',
 u'46130',
 u'359  6073',
 u'279509',
 u'980 979 3158',
 u'0059',
 u'172 1982',
 u'1927 337 6625',
 u'0052',
 u'0051',
 u'0050',
 u'0057',
 u'0055',
 u'0054',
 u'270',
 u'271',
 u'272',
 u'273',
 u'274',
 u'275',
 u'276',
 u'277',
 u'278',
 u'279',
 u'16701',
 u'16700',
 u'727  2292',
 u'12019',
 u'05202017',
 u'12015',
 u'012017 8494',
 u'05202018',
 u'012017 8496',
 u'PEDAX',
 u'12013',
 u'01012033',
 u'01012032',
 u'0