In [1]:
# Importing the required packages

import nltk 
nltk.download('stopwords')
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
import numpy as np
from numpy import random
from pickle import load
from numpy import array
import os
import re
import pandas as pd
import tensorflow as tf
from keras.utils.vis_utils import plot_model
from keras.models import Model
from keras.layers import Input
from keras.layers import Dense
from keras.layers import Flatten
from keras.layers import Dropout
from keras.layers import Embedding
from keras import optimizers
from keras.layers.convolutional import Conv1D
from keras.layers.convolutional import MaxPooling1D
from keras.layers.merge import concatenate
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.metrics import RootMeanSquaredError
import matplotlib.pyplot as plt
from keras.layers.advanced_activations import LeakyReLU
from keras.layers.recurrent import LSTM

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\catuser\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [None]:
# Location of the pre-trained word vectors (parsed further down as one
# "<word> <coef> <coef> ..." entry per line)
embed_file = "10k-sample/sim.expand.200d.vec"

# ---- Hyper-parameters ----

# Input / embedding geometry
max_inp_len = 20000  # every document is padded or truncated to this many token ids
embed_dim = 200      # width of each embedding vector

# Optimisation
learning_rate = 0.001
epochs = 30

# Passed to define_model(), whose visible LSTM body does not read them
filter_sizes = [3, 4, 5]  # filter sizes of the different conv layers
num_filters = 1
pool_size = 199
drop = 0.5                # dropout probability

# Not referenced by any of the visible cells
# NOTE(review): batch_size is never handed to model.fit - confirm whether it should be
batch_size = 50
rounding = 6

In [None]:
# Load the pre-trained word vectors into a {word: float32 vector} dictionary.
# Every non-empty line of embed_file is split on whitespace: the first field is
# the word, the remaining fields are its coefficients.
# NOTE(review): if the file begins with a "<count> <dim>" header line, that line
# would be stored as an ordinary word - confirm the file has no such header.
embeddings_dic = dict()
with open(embed_file, 'r', encoding='utf-8') as e_file:
  for line in e_file:
    fields = line.split()
    if not fields:
      # blank line: there is no word to index
      continue
    embeddings_dic[fields[0]] = np.asarray(fields[1:], dtype='float32')

print("length of embedding dictionary",len(embeddings_dic))

In [None]:
# Fit a tokenizer on the embedding vocabulary and build the matching weight matrix.
embed_token = Tokenizer()
embed_token.fit_on_texts(embeddings_dic.keys())

# Tokenizer indices start at 1 (index 0 is reserved and is used as padding by
# process_doc), so the matrix needs one row more than there are indexed words.
# Sizing it from the dictionary instead can leave the highest index out of range.
vocabulary_size = len(embed_token.word_index) + 1
embedding_matrix = np.zeros((vocabulary_size, embed_dim))
for word, index in embed_token.word_index.items():
    vector = embeddings_dic.get(word)
    # The tokenizer normalises text (lower-casing / punctuation filtering by
    # default), so a token may have no identical dictionary key. Leave such rows
    # at zero: assigning None would silently turn the whole row into NaN.
    if vector is not None:
        embedding_matrix[index] = vector
print("embedding_matrix dimension",len(embedding_matrix),len(embedding_matrix[0]))
print("no of token in the tokenizer",len(embed_token.word_index) + 1)

In [None]:
#function to pre process the document
def process_doc(path_file,embed_token) :
  """Turn one document into a fixed-length vector of vocabulary indices.

  Steps: read the first line of ``path_file`` and split it on whitespace, drop
  English stop words and non-alphabetic tokens, stem with PorterStemmer, map
  each stem through ``embed_token.word_index`` and pad/truncate the result to
  the module-level ``max_inp_len``.

  Parameters:
    path_file: path of the tokenised document (opened as UTF-8).
    embed_token: fitted tokenizer exposing a ``word_index`` dict.

  Returns:
    numpy array of ``max_inp_len`` integer ids, right-padded with 0.
    An empty file produces an all-zero (all padding) vector.
  """
  # Only the first line is consumed; readline() avoids loading the rest of the
  # file and returns '' (instead of raising) when the file is empty.
  # NOTE(review): presumably each .mda file holds the whole document on one line - confirm.
  with open(path_file,'r', encoding='utf-8') as tok_file :
    file_words = tok_file.readline().split()

  #removing the stop words and anything that is not purely alphabetic
  stop_words = set(stopwords.words('english'))
  filtered_words = [word for word in file_words
                    if word not in stop_words and word.isalpha()]

  # Stemming is one-to-one, so words beyond max_inp_len would be stemmed only to
  # be thrown away by the truncation below; cut them off first.
  filtered_words = filtered_words[:max_inp_len]

  # applying stemming using PorterStemmer
  p_stemmer = PorterStemmer()
  stem_words = [p_stemmer.stem(word) for word in filtered_words]

  #tokenizing the words using the embed token
  # Unknown stems fall back to index 1.
  # NOTE(review): index 1 is a real vocabulary entry (the tokenizer is built
  # without an oov_token), so unknown words share that word's embedding -
  # confirm this is intended.
  word_index = embed_token.word_index
  tokens = [word_index.get(word, 1) for word in stem_words]

  # right-pad with the reserved index 0 up to the fixed input length
  if len(tokens) < max_inp_len:
    tokens.extend([0]*(max_inp_len-len(tokens)))

  return np.array(tokens)

In [None]:
#output dataset
def output_data(company_id, out_path_file):
  """Look up the target value stored for one company.

  Each line of ``out_path_file`` is split on whitespace; the second field is
  compared with ``company_id`` and the first field is the value to return.
  Lines with fewer than two fields (e.g. blank lines) are skipped.

  Parameters:
    company_id: identifier to search for (compared as a string).
    out_path_file: path of the value file (opened as UTF-8).

  Returns:
    The first field of the first matching line, as a string, or None when no
    line matches.
  """
  with open(out_path_file,'r', encoding='utf-8') as out_file :
    # iterate lazily so the scan stops reading at the first match
    for line in out_file:
      fields = line.split()
      if len(fields) < 2:
        continue
      if fields[1] == company_id:
        return fields[0]
  return None

In [None]:
def pre_processing(meta_file,output_file):
  """Build the (tokens, target) records for every company listed in a meta file.

  The year is the part of the meta file name before the first '.', and the
  documents are read from ``<root>/all.tok/<year>.tok/<company>.mda`` where
  ``<root>`` is the directory two levels above the meta file
  (e.g. '10k-sample/all.meta/2005.meta.txt' -> '10k-sample/all.tok/2005.tok').

  Parameters:
    meta_file: path of the meta file; the first whitespace-separated field of
      each non-blank line is the company id.
    output_file: path of the value file handed to output_data().

  Returns:
    List of dicts ``{'token': <array from process_doc>, 'value': <str or None>}``
    in meta-file order. 'value' is None when output_data() finds no entry.
  """
  # Derive the locations from the path structure rather than from fixed
  # positions in a '/'-split, so roots with more or fewer components also work.
  year = os.path.basename(meta_file).split('.')[0]
  root_dir = os.path.dirname(os.path.dirname(meta_file))
  dir_path = os.path.join(root_dir, 'all.tok', year + '.tok')

  data = []
  with open(meta_file,'r', encoding='utf-8') as m_file :
    for line in m_file:
      fields = line.split()
      if not fields:
        # blank line: no company id to process
        continue
      company_id = fields[0]

      # get input tokens from the company document
      inp_tokens = process_doc(os.path.join(dir_path, company_id + '.mda'),embed_token)

      # get output value for the company
      out_values = output_data(company_id,output_file)

      #insert values into the data list
      data.append({'token':inp_tokens,'value':out_values})

  return data

In [None]:
def define_model(max_inp_len,vocabulary_size,embed_dim,filter_sizes,num_filters,pool_size,drop,learning_rate):
  """Assemble and compile the LSTM regression network.

  Architecture: token ids -> frozen Embedding initialised from the module-level
  ``embedding_matrix`` -> LSTM(64, tanh) -> Dense(1) with leaky-ReLU activation.
  Compiled with mean-squared-error loss and an SGD optimizer.

  ``filter_sizes``, ``num_filters``, ``pool_size`` and ``drop`` are accepted
  but not used by this body.

  Returns:
    The compiled Keras Model.
  """
  # token-id input feeding the pre-trained, non-trainable embedding lookup
  token_ids = Input(shape=(max_inp_len,))
  embedded = Embedding(
      vocabulary_size,
      embed_dim,
      weights=[embedding_matrix],
      trainable=False,
  )(token_ids)

  # recurrent encoder: a single LSTM layer
  lstm_state = LSTM(units=64, activation='tanh')(embedded)

  # single-unit regression head
  prediction = Dense(1, activation=tf.nn.leaky_relu)(lstm_state)

  network = Model(inputs=[token_ids], outputs=prediction)
  network.compile(loss='mse', optimizer=optimizers.SGD(learning_rate=learning_rate))
  return network

In [None]:
# Build the network from the configured hyper-parameters and print its layer table
model = define_model(
    max_inp_len=max_inp_len,
    vocabulary_size=vocabulary_size,
    embed_dim=embed_dim,
    filter_sizes=filter_sizes,
    num_filters=num_filters,
    pool_size=pool_size,
    drop=drop,
    learning_rate=learning_rate,
)
model.summary()

In [None]:
# Rolling-window experiment: for every test year, train on the three preceding
# years with k-fold cross-validation and score each fold's best checkpoint on
# the training folds, the validation fold and the held-out test year.
test_loss_all_years = []
train_loss_all_years = []
val_loss_all_years = []
history_all_years = []
data = []
n_splits = 5


def _load_year(year):
    """Return the pre_processing() records for one year of the 10k-sample corpus."""
    return pre_processing('10k-sample/all.meta/'+str(year)+'.meta.txt',
                          '10k-sample/all.logfama/'+str(year)+'.logfama.txt')


def _kfold_indices(n_samples, n_splits):
    """Yield (train_index, validation_index) arrays for contiguous, unshuffled folds.

    The samples are cut into ``n_splits`` consecutive chunks (the first
    ``n_samples % n_splits`` chunks get one extra sample); each chunk is the
    validation fold once and the remaining samples form the training fold.
    """
    if not 2 <= n_splits <= n_samples:
        raise ValueError("n_splits must be between 2 and the number of samples")
    all_index = np.arange(n_samples)
    for held_out in np.array_split(all_index, n_splits):
        yield np.setdiff1d(all_index, held_out), held_out


# Start from a freshly built network so that re-running this cell does not
# continue from already trained weights, and remember its initial weights so
# every fold can start from the same untrained state.
model = define_model(max_inp_len,vocabulary_size,embed_dim,filter_sizes,num_filters,pool_size,drop,learning_rate)
initial_weights = model.get_weights()

for year in range(2008,2014):

    ######## extracting text and storing it in dataframes ########
    data_train = []
    for train_year in (year-3, year-2, year-1):
        data_train.extend(_load_year(train_year))
    train_df = pd.DataFrame(data_train,columns=['token','value'])

    data_test = _load_year(year)
    test_df = pd.DataFrame(data_test,columns=['token','value'])
    data.append({'year':year,'train_df_length':len(data_train),'test_df_length':len(data_test)})

    ######## reshaping data to required format ########
    # inputs: (n_documents, max_inp_len) id matrices; targets: (n_documents, 1) columns
    RNN_train_input = np.stack(train_df.token.values)
    RNN_test_input = np.stack(test_df.token.values)
    RNN_train_output = np.array([ float(x) for x in train_df.value.values ])
    RNN_train_output = RNN_train_output.reshape(len(RNN_train_output),1)
    RNN_test_output = np.array([ float(x) for x in test_df.value.values ])
    RNN_test_output = RNN_test_output.reshape(len(RNN_test_output),1)

    ### applying min-max scaling to the targets
    # The scaling range comes from the training targets only, so nothing about
    # the held-out year leaks into training; test targets may therefore fall
    # slightly outside [0, 1].
    target_min = RNN_train_output.min()
    target_range = RNN_train_output.max() - target_min
    if target_range == 0:
        # constant targets: avoid dividing by zero, everything maps to 0
        target_range = 1.0
    RNN_train_output = (RNN_train_output - target_min) / target_range
    RNN_test_output = (RNN_test_output - target_min) / target_range

    ######## Kfold training and saving checkpoints ########
    history =[]
    train_loss=[]
    vald_loss=[]
    test_loss = []

    for fold, (train_index, val_index) in enumerate(_kfold_indices(len(RNN_train_input), n_splits), start=1):

        # Reset to the untrained weights: continuing from the previous fold
        # would mean this fold's validation samples were already trained on.
        # Restoring weights is sufficient because define_model compiles with
        # plain SGD; an optimizer with internal state would need rebuilding.
        model.set_weights(initial_weights)

        checkpoint_filepath = 'Results/RNN_results_min/CheckPoints/'+str(year)+'RNN_checkpoint'+str(fold)
        # keep only the epoch with the lowest training loss
        model_checkpoint_callback = tf.keras.callbacks.ModelCheckpoint(
            filepath=checkpoint_filepath,
            save_weights_only=False,
            monitor='loss',
            mode='min',
            save_best_only=True
        )
        train_history = model.fit(
            RNN_train_input[train_index],
            RNN_train_output[train_index],
            epochs=epochs,
            verbose=1,
            callbacks=[model_checkpoint_callback]
        )

        # score the best checkpoint of this fold rather than the last epoch
        model_best = tf.keras.models.load_model(checkpoint_filepath)
        loss_T = model_best.evaluate(RNN_train_input[train_index],RNN_train_output[train_index], verbose=0)
        loss_V = model_best.evaluate(RNN_train_input[val_index],RNN_train_output[val_index], verbose=0)
        loss_test = model_best.evaluate(RNN_test_input,RNN_test_output, verbose=0)

        train_loss.append(loss_T)
        vald_loss.append(loss_V)
        history.append(train_history)
        test_loss.append(loss_test)

    test_loss_all_years.append(test_loss)
    train_loss_all_years.append(train_loss)
    val_loss_all_years.append(vald_loss)
    history_all_years.append(history)

In [None]:
# Persist the per-year train / test set sizes collected by the training loop
# (written without a header row: columns are year, train size, test size).
os.makedirs('Loss_values', exist_ok=True)  # to_csv does not create missing directories
stats_df = pd.DataFrame(data,columns=['year','train_df_length','test_df_length'])
stats_df.to_csv('Loss_values/RNN_stats_minmax.csv', header=False, index=False)

In [None]:
# Box plot of the test loss: one box per test year, each summarising that
# year's per-fold test losses. Plotting only `test_loss` would show the last
# year alone, which cannot carry six year labels.
years = list(range(2008,2014))

fig = plt.figure()
# Creating axes instance
ax = fig.add_axes([0, 0, 1, 1])

# Creating plot
ax.boxplot(test_loss_all_years)
ax.set_xticklabels(years)

# naming the y axis
ax.set_ylabel('MSE Loss')
ax.set_title("Box plot for Test Loss")

# overall mean (standard deviation) across every year and fold
textstr ='Test Loss for RNN : '+str(np.round(np.mean(test_loss_all_years),3))+' ('+str(np.round(np.std(test_loss_all_years),3))+')'
fig.text(0, -0.25, textstr, fontsize=14)

# save plot
os.makedirs('Plots', exist_ok=True)  # savefig does not create missing directories
fig.savefig('Plots/block_plot_RNN_minmax.png',bbox_inches='tight')

In [None]:
# Write the per-year fold losses (test / train / validation) to CSV, one row
# per year: the year followed by that year's list of per-fold losses.
# The years are defined here so the cell does not rely on the plotting cell
# having been run first.
years = list(range(2008,2014))
os.makedirs('Loss_values', exist_ok=True)  # to_csv does not create missing directories

loss_data = [{'year':year,'value':losses} for year, losses in zip(years, test_loss_all_years)]
train_data = [{'year':year,'value':losses} for year, losses in zip(years, train_loss_all_years)]
vald_data = [{'year':year,'value':losses} for year, losses in zip(years, val_loss_all_years)]

loss_data_test_df = pd.DataFrame(loss_data,columns=['year','value'])
loss_data_test_df.to_csv('Loss_values/RNN_Loss_test_minmax.csv', header=False, index=False)

loss_data_train_df = pd.DataFrame(train_data,columns=['year','value'])
loss_data_train_df.to_csv('Loss_values/RNN_Loss_train_minmax.csv', header=False, index=False)

loss_data_vald_df = pd.DataFrame(vald_data,columns=['year','value'])
loss_data_vald_df.to_csv('Loss_values/RNN_Loss_vald_minmax.csv', header=False, index=False)