In [None]:
import glob
import itertools
import os
import re
import warnings

import inflect
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

from gensim.test.utils import get_tmpfile
from gensim.models import Word2Vec, KeyedVectors, Phrases
from gensim.parsing.preprocessing import strip_short,strip_punctuation,\
                                         strip_numeric, strip_multiple_whitespaces

from keras import Sequential
from keras.callbacks import ModelCheckpoint

from keras.layers import Dense, Input, Flatten
from keras.layers import Conv1D, MaxPooling1D, Embedding, Dropout
from keras.models import Model

from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences

from nltk import tokenize
from nltk.corpus import stopwords

# Suppress every warning for the rest of the session: the filter is installed with
# action='ignore' and no category/module restriction, so nothing is exempt.
warnings.filterwarnings(action='ignore')

In [None]:
# Training CNN

In [None]:
# Both .npy files are unwrapped with ``.item()`` and then indexed by label name,
# i.e. each stores a single pickled mapping. NumPy >= 1.16.3 refuses to unpickle
# unless ``allow_pickle=True`` is passed explicitly.
# SECURITY: unpickling can execute arbitrary code -- only load files you created.
training_data = np.load('./pseudo_docs.npy', allow_pickle=True).item()
labels = np.load('./pseudo_labels.npy', allow_pickle=True).item()

# Stack the per-class arrays in a fixed class order so that rows of training_x
# and training_y line up with each other.
label_names = ['equity', 'fixed_income', 'derivatives', 'alternatives']
training_x = np.vstack([training_data[name] for name in label_names])
training_y = np.vstack([labels[name] for name in label_names])

filename = "./word vectors.kv"
word_vec = KeyedVectors.load(filename, mmap='r')
# ``.wv`` exists on gensim 3.x objects but was removed from KeyedVectors in
# gensim 4; fall back to the loaded object itself when the attribute is missing.
# np.array() copies the memory-mapped vectors into an ordinary in-memory array.
word_embedding = np.array(getattr(word_vec, 'wv', word_vec).vectors)

In [None]:
# Resolve the keyed-vectors object in a way that works on gensim 3.x (which
# exposes ``.wv``) and gensim 4.x (where a loaded KeyedVectors has no ``.wv``).
_keyed_vectors = getattr(word_vec, 'wv', word_vec)
try:
    # gensim >= 4: ``vocab`` was replaced by the ``key_to_index`` mapping.
    vocab = list(_keyed_vectors.key_to_index)
except AttributeError:
    # gensim 3.x: same expression as before.
    vocab = list(_keyed_vectors.vocab)

# Prepend one all-zero row, shifting every word vector down by one row.
# NOTE(review): presumably row 0 serves as the padding index -- confirm against
# the code that builds the integer sequences.
word_embedding_padded = np.vstack([np.zeros((1, word_embedding.shape[1])), word_embedding])

In [None]:
# Report the dimensions of the stacked inputs and targets.
print('Shape of Data Tensor:', training_x.shape)
print('Shape of Label Tensor:', training_y.shape)

# Draw a single random row order and apply it to inputs and targets alike, so
# every document stays paired with its own label row.
indices = np.random.permutation(training_x.shape[0])
x_train, y_train = training_x[indices], training_y[indices]

In [None]:
# Embedding lookup whose table has the same shape as word_embedding_padded and is
# initialised from it; trainable=False keeps the vectors frozen during training.
# Inputs are declared as sequences of exactly 5000 indices.
embedding_layer = Embedding(word_embedding_padded.shape[0],
                            word_embedding_padded.shape[1],
                            weights=[word_embedding_padded],
                            input_length=5000,
                            trainable=False)

# One sample = 5000 int32 values fed to the embedding layer.
# NOTE(review): presumably these are row indices into word_embedding_padded -- confirm
# against the code that produces training_x.
sequence_input = Input(shape=(5000,), dtype='int32')
embedded_sequences = embedding_layer(sequence_input)

# Three identical convolution stages (128 filters, kernel size 5, ReLU), each followed
# by max pooling. The first two pool over windows of 5, the last over a window of 35.
# NOTE(review): assuming Keras' default 'valid' padding and stride-equal-to-pool-size,
# the sequence length would go 5000 -> 4996 -> 999 -> 995 -> 199 -> 195 -> 5; verify
# with model.summary() below.
l_cov1= Conv1D(128, 5, activation='relu')(embedded_sequences)
l_pool1 = MaxPooling1D(5)(l_cov1)

l_cov2 = Conv1D(128, 5, activation='relu')(l_pool1)
l_pool2 = MaxPooling1D(5)(l_cov2)

l_cov3 = Conv1D(128, 5, activation='relu')(l_pool2)
l_pool3 = MaxPooling1D(35)(l_cov3)

# Collapse the pooled feature maps into one vector per sample for the dense head.
l_flat = Flatten()(l_pool3)

# Dense head: two 128-unit ReLU layers, with dropout rates of 0.25 and 0.4.
l_dense_1 = Dense(128, activation='relu')(l_flat)
l_dropout_1 = Dropout(0.25)(l_dense_1)

l_dense_2 = Dense(128, activation='relu')(l_dropout_1)
l_dropout_2 = Dropout(0.4)(l_dense_2)

# Softmax over 4 outputs.
# NOTE(review): 4 presumably matches the number of label classes -- confirm against
# the second dimension of the training targets.
preds = Dense(4, activation='softmax')(l_dropout_2)

# Assemble and compile: categorical cross-entropy loss, RMSprop optimizer,
# accuracy reported under the metric name 'acc'.
model = Model(sequence_input, preds)
model.compile(loss='categorical_crossentropy', optimizer='rmsprop', metrics=['acc'])

print("Simplified convolutional neural network")
model.summary()

In [None]:
# Train for 20 epochs in mini-batches of 50 samples; the value returned by
# fit() is kept in `history`.
history = model.fit(
    x_train,
    y_train,
    batch_size=50,
    epochs=20,
)

In [None]:
# Make sure the output directory exists so this cell also works on a fresh
# checkout (both writes below target ./model).
os.makedirs('./model', exist_ok=True)

# Serialise the model to JSON via to_json() and write it next to the .h5 file.
model_json = model.to_json()
with open('./model/model.json', 'w') as wf:
    wf.write(model_json)

# NOTE(review): this calls model.save(), not model.save_weights(), even though the
# file is named "model_weights.h5" -- confirm that whatever loads this file expects
# the output of save().
model.save('./model/model_weights.h5')

In [None]:
# Ordered (pattern, replacement) regex passes applied to each lower-cased article.
# They are kept in their original order because every pass operates on the
# output of the previous one.
_ARTICLE_CLEANUP_PASSES = (
    (r"[^a-zA-Z0-9()_-]", ' '),        # anything outside this character set -> space
    (r"- ", ""),                       # drop every "- " (hyphen followed by a space)
    (r"[\(\[].*?[\)\]]", ""),          # drop bracketed / parenthesised spans
    (r'\bx.*?\b', ''),                 # drop tokens that start with "x"
    (r'\by[^aeiou].*?\b', ''),         # drop word-initial "y" + non-vowel char, up to the next word boundary
    (r'\w*\d\w*\s*', ''),              # drop tokens containing a digit (plus trailing whitespace)
    (r'\bmax\b\s*', ''),               # drop the standalone words max/min/sup/lim/exp/eqz
    (r'\bmin\b\s*', ''),
    (r'\bsup\b\s*', ''),
    (r'\blim\b\s*', ''),
    (r'\bexp\b\s*', ''),
    (r'\beqz\b\s*', ''),
    (r'\b(\w+)\s+\1\b\s*', ''),        # drop immediately repeated words (both copies)
)


def _clean_article(text):
    """Lower-case one raw article and run every regex and gensim clean-up pass on it."""
    text = text.lower()
    for pattern, replacement in _ARTICLE_CLEANUP_PASSES:
        text = re.sub(pattern, replacement, text)

    # Cut the text at the last occurrence of "reference". Joining the remaining
    # pieces with '' also deletes every earlier occurrence of that substring.
    if 'reference' in text:
        text = ''.join(text.split('reference')[:-1])

    text = strip_multiple_whitespaces(text)
    text = strip_punctuation(text)
    # The original code also built a '[^\w]' regex here but discarded the result
    # of its .sub() call, so it never changed the text; that dead code is removed.
    return strip_short(text, minsize=3)


def docs_to_index(file_path):
    """Read every ``*.txt`` file in a directory and turn it into tokenised sentences.

    Parameters
    ----------
    file_path : str
        Directory that is searched (non-recursively) for ``*.txt`` files.

    Returns
    -------
    tuple (sentences, labels)
        ``sentences`` is a list of token lists produced by sentence-splitting the
        cleaned articles, word-tokenising, removing stop words and applying a
        gensim ``Phrases`` model trained on those same sentences.
        ``labels`` has one entry per successfully read file: the file name up to
        its first ``'.'`` with the first five characters dropped.
        The two lists are built independently (one per sentence vs. one per
        file), so they are not index-aligned in general.
        When no file could be read, ``([], [])`` is returned.

    Notes
    -----
    Files that cannot be opened or decoded as UTF-8 are skipped with a printed
    notice instead of aborting the whole run.
    """
    articles = []
    labels = []
    # sorted() makes the file order (and therefore the output order) reproducible.
    for path in sorted(glob.glob(file_path + '/*.txt')):
        try:
            with open(path, encoding='utf-8') as paper:
                text = paper.read()
        except (OSError, UnicodeDecodeError) as err:
            print('Skipping {}: {}'.format(path, err))
            continue
        articles.append(text)
        # os.path.basename handles both '/' and the platform's own separator.
        labels.append(os.path.basename(path).split('.')[0][5:])

    if not articles:
        # Nothing to tokenise; skip the tokenisers and the Phrases model.
        return [], labels

    cleaned_articles = [_clean_article(article) for article in articles]

    cleaned_sentences = []
    for article in cleaned_articles:
        cleaned_sentences.extend(tokenize.sent_tokenize(article))

    stop_words = set(stopwords.words('english') + ['within', 'however'])

    cleaned_sentences_w = [
        [w for w in tokenize.word_tokenize(sentence) if w not in stop_words]
        for sentence in cleaned_sentences
    ]

    bigram_transformer = Phrases(cleaned_sentences_w)
    return list(bigram_transformer[cleaned_sentences_w]), labels