In [None]:
# Word2Vec Embedding

# import libraries we need...

import glob
import inflect
import itertools
import numpy as np
import pandas as pd
import re
import warnings

from collections import Counter

from gensim.models import Word2Vec, KeyedVectors, Phrases
from gensim.parsing.preprocessing import strip_numeric, strip_multiple_whitespaces
from gensim.parsing.preprocessing import strip_short, strip_punctuation
from gensim.test.utils import get_tmpfile

from nltk import tokenize
from nltk.corpus import stopwords

from spherecluster import  VonMisesFisherMixture

warnings.filterwarnings(action='ignore')

In [None]:
# Read every extracted paper into memory, one string per file.
# `sorted` pins the article order (glob returns paths in arbitrary order) and
# the context manager closes each file handle as soon as it has been read.
articles = []
for path in sorted(glob.glob('./extracted_papers/*.txt')):
    with open(path, encoding='utf-8') as paper:
        articles.append(paper.read())

In [None]:
# Replace non-printable / non-ASCII characters of the Latin-1 range with a space,
# then lower-case each article.
#
# The removed set is: control characters (code points < 32, which includes
# newlines and tabs), DEL (127) and every code point from 128 to 255.
# NOTE(review): characters above U+00FF (e.g. typographic quotes) are not
# touched here; some of them are filtered later through the stop-word list.
unwanted = '[' + re.escape(''.join(chr(n) for n in range(256) if n < 32 or n > 126)) + ']'

cleaned_articles = [re.sub(unwanted, ' ', article).lower() for article in articles]

In [None]:
# Removing unwanted text formats (numeric, whitespace, punctuation, short words stripped)

In [None]:
def preprocess_text(s):
    """Return ``s`` with digits, punctuation and very short tokens stripped.

    The gensim filters are applied in this order:
    ``strip_multiple_whitespaces``, ``strip_numeric``, ``strip_punctuation``
    and finally ``strip_short`` with ``minsize=2``.
    """
    collapsed = strip_multiple_whitespaces(s)
    without_digits = strip_numeric(collapsed)
    without_punctuation = strip_punctuation(without_digits)
    return strip_short(without_punctuation, minsize=2)

In [None]:
# Run the text filters over every article (rebinds `cleaned_articles`).
cleaned_articles = [preprocess_text(article) for article in cleaned_articles]

In [None]:
# Removing all non-alphabetical characters

In [None]:
# Flatten the corpus into a single list of sentences with NLTK's sentence tokenizer.
# NOTE(review): preprocess_text has already stripped punctuation from these
# strings, so the tokenizer may find few sentence boundaries - confirm that
# splitting after punctuation removal is intended.
cleaned_sentences = []
for article in cleaned_articles:
    cleaned_sentences.extend(tokenize.sent_tokenize(article))

In [None]:
# NLTK's English stop words plus corpus-specific noise tokens: typographic
# quotes, two filler words and one sequence of private-use code points.
extra_stop_words = ['’', '“', '‘', 'within',
                    'however', '”', '\uf8f6\uf8f7\uf8f7\uf8f7\uf8f8']
stop_words = set(stopwords.words('english')).union(extra_stop_words)

In [None]:
# Tokenize each sentence into words and drop every token found in `stop_words`.
cleaned_sentences_w = [
    [token for token in tokenize.word_tokenize(sentence) if token not in stop_words]
    for sentence in cleaned_sentences
]

In [None]:
# Shared inflect engine, created on first use so that it is built once rather
# than once per sentence.
_INFLECT_ENGINE = None


def singularize(sentence):
    """Replace plural nouns in ``sentence`` with their singular form.

    Args:
        sentence: list of word tokens. The list is modified in place.

    Returns:
        The same list object, with every token for which
        ``inflect``'s ``singular_noun`` returned a truthy value replaced by
        that value; all other tokens are left unchanged.
    """
    global _INFLECT_ENGINE
    if _INFLECT_ENGINE is None:
        _INFLECT_ENGINE = inflect.engine()

    for i, word in enumerate(sentence):
        # singular_noun returns a falsy value when there is nothing to
        # singularize, so call it once and reuse the result.
        singular = _INFLECT_ENGINE.singular_noun(word)
        if singular:
            sentence[i] = singular
    return sentence
        
# Singularize the tokens of every sentence (rebinds `cleaned_sentences_w`).
cleaned_sentences_w = [singularize(sentence) for sentence in cleaned_sentences_w]

In [None]:
# Fit gensim's phrase model on the tokenized sentences.
bigram_transformer = Phrases(sentences=cleaned_sentences_w)

In [None]:
# Apply the phrase model once and use the result for both vocabulary building
# and the additional training epochs. Training must see the same phrase-merged
# sentences the vocabulary was built from; on the un-merged tokens the merged
# bigram tokens would never be updated.
bigram_sentences = list(bigram_transformer[cleaned_sentences_w])

model = Word2Vec(bigram_sentences, window=5, min_count=3, size=200)
model.train(bigram_sentences, total_examples=len(bigram_sentences), epochs=50)

In [None]:
# Sanity check: the 20 nearest neighbours of "derivative" in the embedding.
model.wv.most_similar(positive='derivative', topn=20)

In [None]:
# Persist the model's word vectors (`model.wv`) to disk.
word_vectors = model.wv
word_vectors.save("word_vectors.kv")

In [None]:
def build_vocabulary(sentences):
    """Count tokens and build index mappings ordered by descending frequency.

    Args:
        sentences: iterable of token iterables.

    Returns:
        A ``(word_counts, vocabulary, vocabulary_inv)`` tuple where
        ``word_counts`` is a ``Counter`` of all tokens, ``vocabulary_inv`` is
        the list of tokens in ``Counter.most_common()`` order, and
        ``vocabulary`` maps each token to its position in that list.
    """
    all_tokens = itertools.chain.from_iterable(sentences)
    word_counts = Counter(all_tokens)

    # Index -> word, most frequent first.
    vocabulary_inv = [word for word, _count in word_counts.most_common()]
    # Word -> index.
    vocabulary = dict(zip(vocabulary_inv, range(len(vocabulary_inv))))

    return word_counts, vocabulary, vocabulary_inv

In [None]:
# Token counts and frequency ranks over the phrase-merged corpus.
# build_vocabulary already implements this counting, so call it instead of
# repeating the logic inline. `vocabulary` maps each token to its frequency
# rank (0 = most frequent).
word_counts, vocabulary, vocabulary_inv = build_vocabulary(
    bigram_transformer[cleaned_sentences_w]
)

In [None]:
# Reload the vectors from "word_vectors.kv", the file written by the earlier
# save cell; mmap='r' is forwarded to KeyedVectors.load.
# Note: this rebinds `model`, which an earlier cell assigned a Word2Vec
# instance, to the object returned by KeyedVectors.load.
filename = "word_vectors.kv"
model = KeyedVectors.load(filename, mmap='r')

In [None]:
# Embedding matrix and the words labelling its rows.
# `index2word` is ordered by row index, so vocab[k] names word_embedding[k];
# iterating the `vocab` dict gives no such guarantee and can mislabel rows.
keyed_vectors = model.wv
word_embedding = keyed_vectors.vectors
vocab = list(keyed_vectors.index2word)

In [None]:
# Scale every word vector (row) by its norm.
# `linfnorm` holds the L2 norm of each row (ord=2), despite its name.
linfnorm = np.linalg.norm(word_embedding, ord=2, axis=1)
word_embedding_normalized = np.divide(word_embedding, linfnorm[:, np.newaxis])

In [None]:
# One column per entry of `vocab`; rows come from the transposed embedding matrix.
words_df = pd.DataFrame(data=word_embedding_normalized.T, columns=vocab)

# Display the columns of a few finance terms.
inspected_terms = ['equity', 'stock', 'fixed_income', 'bond', 'real_estate',
                   'derivative', 'cds', 'swap', 'mortgage']
words_df[inspected_terms]

In [None]:
# Parse the class-keyword file: one "<label>: <word>, <word>, ..." entry per line.
# The context manager closes the file handle once the text has been read.
with open('class keywords.txt', encoding='utf-8') as keyword_file:
    class_keywords_str = keyword_file.read()

class_keywords = {}
for line in class_keywords_str.splitlines():
    if not line.strip():
        # A trailing newline or blank line is not an entry; skip it.
        continue
    parts = line.split(': ')
    if len(parts) < 2:
        raise ValueError(
            "Malformed line in 'class keywords.txt' (expected 'label: w1, w2'): {!r}".format(line)
        )
    # Strip stray whitespace so each keyword matches a vocabulary token exactly.
    class_keywords[parts[0]] = [word.strip() for word in parts[1].split(', ')]

In [None]:
# For every class label, collect the `words_df` column of each of its keywords
# as a NumPy array.
class_keywords_supplied = {}
for class_label, words in class_keywords.items():
    class_keywords_supplied[class_label] = [np.array(words_df[word]) for word in words]

In [None]:
# Fit a single-cluster von Mises-Fisher mixture to each class's keyword vectors
# and record the fitted cluster center (mu_r) and concentration (kappa_r).
kappa_r = []
mu_r = []

for i, keyword_vectors in class_keywords_supplied.items():
    vmF = VonMisesFisherMixture(n_clusters=1, n_jobs=10)
    vmF.fit(np.vstack(keyword_vectors))

    kappa_r.append(vmF.concentrations_[0])
    mu_r.append(vmF.cluster_centers_[0])