<a href="https://colab.research.google.com/github/sravanisasu/volatality_pred/blob/master/CNN_2013_org.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Mount Google Drive so that the dataset paths under /content/drive/MyDrive
# used throughout this notebook resolve.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# Target year: its filings form the test set, and the three preceding years
# (year-3 .. year-1) form the training set in the data-loading cell.
year = 2013

**Required imports**

In [None]:
# Importing the required packages

import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
import numpy as np
from numpy import random
from pickle import load
from numpy import array
import os
import re
import pandas as pd
import tensorflow as tf
from keras.utils.vis_utils import plot_model
from keras.models import Model
from keras.layers import Input
from keras.layers import Dense
from keras.layers import Flatten
from keras.layers import Dropout
from keras.layers import Embedding
from keras import optimizers
from keras.layers.convolutional import Conv1D
from keras.layers.convolutional import MaxPooling1D
from keras.layers.merge import concatenate
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.metrics import RootMeanSquaredError
import matplotlib.pyplot as plt
from keras.layers.advanced_activations import LeakyReLU

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


**Building the word-embedding matrix**

In [None]:
# File paths required for the model

# Pre-trained word vectors, read as UTF-8 text: each line is a word followed
# by its coefficients.
embed_file = "/content/drive/MyDrive/10-K dataset/sim.expand.200d.vec"

# Hyper-parameters
# Every document is truncated or zero-padded to this many token ids.
max_inp_len = 20000
# Dimension of the word vectors (number of columns of the embedding matrix).
embed_dim = 200
# NOTE(review): `rounding` is not referenced in the visible cells - confirm it is still needed.
rounding = 6
# Kernel sizes of the parallel convolution channels.
filter_sizes = [3,4,5]
# Number of filters per convolution channel.
num_filters = 1
# Window of the local max-pooling applied after each convolution.
pool_size = 199
# Dropout probability applied to the merged channels.
drop = 0.5
# Mini-batch size intended for training.
batch_size = 10
# Learning rate of the SGD optimizer.
learning_rate = 0.001

# Number of training epochs passed to model.fit.
epochs = 30

In [None]:
# Load the pre-trained word vectors into a {word: vector} dictionary.
# Each non-empty line of the file is a word followed by its coefficients,
# separated by whitespace. The file is opened exactly once, inside a context
# manager, so the handle is always closed.
embeddings_dic = dict()
with open(embed_file, 'r', encoding='utf-8') as e_file:
  for line in e_file:
    splitlines = line.split()
    # Skip blank lines instead of failing on splitlines[0].
    if not splitlines:
      continue
    word = splitlines[0].strip()
    coefs = np.asarray(splitlines[1:], dtype='float32')
    embeddings_dic[word] = coefs

print("length of embedding dictionary",len(embeddings_dic))

length of embedding dictionary 70429


In [None]:
# Fit a tokenizer on the embedding vocabulary so every known word gets an
# integer id, then build the matching weight matrix for the Embedding layer.
embed_token = Tokenizer()
embed_token.fit_on_texts(embeddings_dic.keys())

# Tokenizer ids start at 1 (0 is reserved, and is used here as the padding id),
# so the matrix needs len(word_index) + 1 rows for every id to be a valid row.
vocabulary_size = len(embed_token.word_index) + 1
embedding_matrix = np.zeros((vocabulary_size, embed_dim))
for word, index in embed_token.word_index.items():
  vector = embeddings_dic.get(word)
  # The tokenizer normalises text (lower-casing, punctuation filtering), so a
  # tokenizer word may have no exact key in the dictionary. Keep the zero row
  # in that case: assigning None would fill the row with NaN.
  if vector is not None:
    embedding_matrix[index] = vector
print("embedding_matrix dimension",len(embedding_matrix),len(embedding_matrix[0]))
print("no of token in the tokenizer",len(embed_token.word_index) + 1)

embedding_matrix dimension 70429 200
no of token in the tokenizer 70429


**Preprocessing the inputs and outputs**

In [None]:
#function to pre process the document
def process_doc(path_file,embed_token) :
  """Convert one tokenized document into a fixed-length array of word ids.

  Only the first line of the file is read. Its whitespace-separated words are
  filtered (English stop words and non-alphabetic tokens are dropped),
  stemmed with the Porter stemmer, and mapped to ids through
  ``embed_token.word_index``. The result is truncated or right-padded with 0
  to exactly ``max_inp_len`` entries (``max_inp_len`` is a notebook-level
  setting).

  Args:
    path_file: path of the UTF-8 text file holding the document.
    embed_token: fitted tokenizer exposing a ``word_index`` dict.

  Returns:
    1-D numpy array of ``max_inp_len`` integer ids.
  """
  # An empty file yields an empty first line, hence an all-padding document,
  # instead of an IndexError.
  with open(path_file,'r', encoding='utf-8') as tok_file :
    file_words = tok_file.readline().split()

  stop_words = set(stopwords.words('english'))
  p_stemmer = PorterStemmer()
  word_index = embed_token.word_index

  # Words missing from the vocabulary map to id 1, as before.
  # NOTE(review): id 1 is presumably also a real word in the tokenizer (it is
  # built without an oov_token in this notebook) - confirm this is intended.
  tokens = [
      word_index.get(p_stemmer.stem(word), 1)
      for word in file_words
      if word not in stop_words and word.isalpha()
  ]

  # Force the sequence to exactly max_inp_len ids: cut the tail, then pad with 0.
  tokens = tokens[:max_inp_len]
  tokens.extend([0]*(max_inp_len-len(tokens)))

  return np.array(tokens)

In [None]:
#output dataset
def output_data(company_id, out_path_file):
  """Look up the target value recorded for one company.

  The file is scanned line by line; each line is split on whitespace and the
  second field is compared with ``company_id``.

  Args:
    company_id: identifier to look for (compared as a string).
    out_path_file: path of the UTF-8 text file with "value id" lines.

  Returns:
    The first field (as a string) of the first matching line, or None when no
    line matches.
  """
  with open(out_path_file,'r', encoding='utf-8') as out_file :
    for line in out_file:
      fields = line.split()
      # Blank or one-field lines cannot match; skip them rather than crash.
      if len(fields) < 2:
        continue
      if fields[1] == company_id:
        return fields[0]
  return None

In [None]:
def pre_processing(meta_file,output_file,year):
  """Build the list of (token ids, target) records for one year.

  For every non-blank line of ``meta_file`` the first whitespace-separated
  field is taken as the company/document id. The matching ``<id>.mda`` file in
  that year's token directory is converted with ``process_doc`` and the target
  is looked up with ``output_data``.

  Args:
    meta_file: path of the year's meta file (one document per line).
    output_file: path of the year's target-value file.
    year: year used to select the token directory.

  Returns:
    List of dicts with keys 'token' (numpy array of ids) and 'value' (the
    target as returned by ``output_data``; None when no target is found).
  """
  dir_path = '/content/drive/MyDrive/10-K dataset/all.tok' + '/' +str(year)+'.tok'
  data =[]

  with open(meta_file,'r', encoding='utf-8') as m_file :
    for line in m_file:
      fields = line.split()
      # Skip blank lines instead of failing on fields[0].
      if not fields:
        continue
      company_id = fields[0]
      inp_path_file = dir_path +'/'+ company_id + '.mda'

      # get input tokens from the company document
      inp_tokens = process_doc(inp_path_file,embed_token)

      # get output value for the company
      out_values = output_data(company_id,output_file)

      #insert values into the data list
      data.append({'token':inp_tokens,'value':out_values})

  return data

In [None]:
######## extracting text and storing it in dataframes ########
def _load_year(target_year):
  """Run pre_processing on the meta and logfama files of a single year."""
  meta_path = '/content/drive/MyDrive/10-K dataset/all.meta/'+str(target_year)+'.meta.txt'
  logfama_path = '/content/drive/MyDrive/10-K dataset/all.logfama/'+str(target_year)+'.logfama.txt'
  return pre_processing(meta_path,logfama_path,target_year)

# Training set: the three years before `year`, oldest first.
data_train = []
for years_back in (3, 2, 1):
  data_train.extend(_load_year(year-years_back))
train_df = pd.DataFrame(data_train,columns=['token','value'])
print("Length of training data",len(data_train))


# Test set: the target year itself.
data_test = _load_year(year)
test_df = pd.DataFrame(data_test,columns=['token','value'])
print("Length of testing data",len(data_test))

print("SAMPLE INPUT TEXT AND VOLATILITY VALUES")
for frame in (train_df, test_df):
  print(frame.sample(5)[['token','value']])

Length of training data 7261
Length of testing data 2336
SAMPLE INPUT TEXT AND VOLATILITY VALUES
                                                  token     value
5885  [170, 79, 197, 303, 45, 124, 25, 18, 462, 964,...  -4.44411
1187  [170, 79, 197, 303, 45, 124, 25, 18, 251, 121,...  -3.84645
5640  [170, 79, 197, 303, 45, 124, 25, 18, 111, 77, ...  -3.42651
1977  [170, 79, 197, 303, 45, 124, 25, 18, 181, 435,...  -3.49088
821   [170, 79, 197, 303, 45, 124, 25, 18, 181, 435,...  -3.43611
                                                  token     value
1253  [170, 79, 197, 303, 45, 124, 25, 18, 72, 47, 5...  -3.87449
1046  [170, 79, 197, 303, 45, 124, 25, 18, 181, 78, ...  -4.26236
1050  [170, 79, 197, 303, 45, 124, 25, 18, 964, 2195...  -2.89789
182   [170, 79, 197, 303, 45, 124, 25, 18, 150, 197,...  -4.19876
2024  [170, 79, 197, 303, 45, 124, 25, 18, 181, 435,...  -4.47229


In [None]:
# Split each dataframe into its token arrays (inputs) and float targets (outputs).
CNN_train_input = train_df['token'].values
CNN_train_output = list(map(float, train_df['value'].values))
CNN_test_input = test_df['token'].values
CNN_test_output = list(map(float, test_df['value'].values))

In [None]:
# Reshape the targets into column vectors of shape (n_samples, 1).
CNN_train_output = np.reshape(np.array(CNN_train_output), (len(CNN_train_output), 1))
CNN_test_output = np.reshape(np.array(CNN_test_output), (len(CNN_test_output), 1))

**Define the model**

In [None]:
def define_model(max_inp_len,vocabulary_size,embed_dim,filter_sizes,num_filters,pool_size,drop,learning_rate):
  """Build and compile the multi-channel 1-D CNN regressor.

  The input token ids go through a trainable Embedding layer initialised from
  the notebook-level ``embedding_matrix``. One convolution + local max-pooling
  channel is created per entry of ``filter_sizes``; the pooled channels are
  concatenated along the sequence axis, passed through dropout, flattened and
  fed to two dense layers. All layers use leaky ReLU.

  Args:
    max_inp_len: length of every input sequence.
    vocabulary_size: number of rows of the embedding matrix.
    embed_dim: dimension of the word vectors.
    filter_sizes: non-empty sequence of convolution kernel sizes, one channel each.
    num_filters: number of filters per convolution channel.
    pool_size: window of the max-pooling applied after each convolution.
    drop: dropout probability applied to the merged channels.
    learning_rate: learning rate of the SGD optimizer.

  Returns:
    A Keras Model compiled with mean-squared-error loss and SGD.

  Raises:
    ValueError: if ``filter_sizes`` is empty.
  """
  if len(filter_sizes) == 0:
    raise ValueError("filter_sizes must contain at least one kernel size")

  # input and embedding matrix
  inputs = Input(shape=(max_inp_len,))
  embedding = Embedding(vocabulary_size, embed_dim, weights=[embedding_matrix],trainable = True)(inputs)

  leaky_relu = tf.nn.leaky_relu

  # one convolution + local max-pooling channel per kernel size, created in
  # the order given by filter_sizes
  pooled_channels = []
  for kernel_size in filter_sizes:
    convolution = Conv1D(filters=num_filters, kernel_size=kernel_size, activation=leaky_relu)(embedding)
    pooled_channels.append(MaxPooling1D(pool_size=pool_size)(convolution))

  # merge and dropout (concatenate needs at least two tensors)
  if len(pooled_channels) == 1:
    merged = pooled_channels[0]
  else:
    merged = concatenate(pooled_channels,axis=1)
  drop_out = Dropout(drop)(merged)
  flat = Flatten()(drop_out)

  # 2 fully connected layers
  dense1 = Dense(100, activation=leaky_relu)(flat)
  # NOTE(review): the output also uses leaky ReLU while the sample targets in
  # this notebook are negative - confirm a linear output is not preferable.
  outputs = Dense(1, activation=leaky_relu)(dense1)
  model = Model(inputs=[inputs], outputs=outputs)

  opt = optimizers.SGD(learning_rate=learning_rate)
  model.compile(loss='mse', optimizer=opt)

  return model

In [None]:
# Build the CNN from the configured hyper-parameters and print its layer summary.
model = define_model(
    max_inp_len=max_inp_len,
    vocabulary_size=vocabulary_size,
    embed_dim=embed_dim,
    filter_sizes=filter_sizes,
    num_filters=num_filters,
    pool_size=pool_size,
    drop=drop,
    learning_rate=learning_rate,
)
model.summary()

Model: "model_1"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_2 (InputLayer)            [(None, 20000)]      0                                            
__________________________________________________________________________________________________
embedding_1 (Embedding)         (None, 20000, 200)   14085800    input_2[0][0]                    
__________________________________________________________________________________________________
conv1d_3 (Conv1D)               (None, 19998, 1)     601         embedding_1[0][0]                
__________________________________________________________________________________________________
conv1d_4 (Conv1D)               (None, 19997, 1)     801         embedding_1[0][0]                
____________________________________________________________________________________________

**Fit the model**

In [None]:
### Optional min-max scaling of the targets (currently disabled)
# The MinMaxScaler steps are commented out, so the targets are used unscaled;
# the only live code in this cell reshapes them into (n_samples, 1) columns.
# NOTE(review): if re-enabled as written, the scaler would be fitted on the
# training and test targets together.
#from sklearn.preprocessing import MinMaxScaler

#scaler = MinMaxScaler()
CNN_train_output = np.array(CNN_train_output).reshape(len(CNN_train_output),1)
CNN_test_output = np.array(CNN_test_output).reshape(len(CNN_test_output),1)
#output = np.concatenate((CNN_train_output, CNN_test_output))
#output = scaler.fit_transform(output)
#CNN_train_output = output[:len(CNN_train_input)]
#CNN_test_output = output[-len(CNN_test_input):]

In [None]:
# K-fold cross-validation: train one model per fold, keep the best checkpoint
# of each fold, and record its training / validation / test loss.
from sklearn.model_selection import KFold
n_splits = 5
data =[]
# Turn the object arrays of per-document id arrays into 2-D (n_docs, max_inp_len) arrays.
CNN_train_input = np.stack(CNN_train_input)
CNN_test_input = np.stack(CNN_test_input)
with tf.device('/device:GPU:0'):
  kf = KFold(n_splits=n_splits)
  for fold, (train_index, val_index) in enumerate(kf.split(CNN_train_input), start=1):

    # Start every fold from a freshly initialised model. Reusing one model
    # would carry weights over from fold to fold, so each validation fold
    # would already have been trained on.
    model = define_model(max_inp_len,vocabulary_size,embed_dim,filter_sizes,num_filters,pool_size,drop,learning_rate)

    checkpoint_filepath = '/content/drive/MyDrive/CNN_Results/CheckPoints/CNN'+str(year)+'_checkpoint_org'+str(fold)
    # Keep the epoch with the lowest loss on the held-out fold; selecting on
    # the training loss would simply favour the most over-fitted epoch.
    model_checkpoint_callback = tf.keras.callbacks.ModelCheckpoint(
        filepath=checkpoint_filepath,
        save_weights_only=False,
        monitor='val_loss',
        mode='min',
        save_best_only=True)

    train_history = model.fit(
        CNN_train_input[train_index],
        CNN_train_output[train_index],
        validation_data=(CNN_train_input[val_index], CNN_train_output[val_index]),
        batch_size=batch_size,  # configured mini-batch size (was silently ignored before)
        epochs=epochs,
        verbose=1,
        callbacks=[model_checkpoint_callback]
    )

    # Evaluate the best checkpoint of this fold on all three splits.
    model_best = tf.keras.models.load_model(checkpoint_filepath)
    loss_Tr = model_best.evaluate(CNN_train_input[train_index],CNN_train_output[train_index], verbose=0)
    loss_Va = model_best.evaluate(CNN_train_input[val_index],CNN_train_output[val_index], verbose=0)
    loss_Te = model_best.evaluate(CNN_test_input,CNN_test_output, verbose=0)
    data.append({'Training Loss':loss_Tr,'Validation Loss':loss_Va,'Test loss':loss_Te,'year':year,'fold':fold})

# Save the per-fold results and offer them for download from Colab.
df = pd.DataFrame(data)
df.to_csv("CNN_"+str(year)+"_results_org.csv")
from google.colab import files
files.download("CNN_"+str(year)+"_results_org.csv")

Epoch 1/30
INFO:tensorflow:Assets written to: /content/drive/MyDrive/CNN_Results/CheckPoints/CNN2013_checkpoint_org1/assets
Epoch 2/30
INFO:tensorflow:Assets written to: /content/drive/MyDrive/CNN_Results/CheckPoints/CNN2013_checkpoint_org1/assets
Epoch 3/30
INFO:tensorflow:Assets written to: /content/drive/MyDrive/CNN_Results/CheckPoints/CNN2013_checkpoint_org1/assets
Epoch 4/30
INFO:tensorflow:Assets written to: /content/drive/MyDrive/CNN_Results/CheckPoints/CNN2013_checkpoint_org1/assets
Epoch 5/30
INFO:tensorflow:Assets written to: /content/drive/MyDrive/CNN_Results/CheckPoints/CNN2013_checkpoint_org1/assets
Epoch 6/30
Epoch 7/30
INFO:tensorflow:Assets written to: /content/drive/MyDrive/CNN_Results/CheckPoints/CNN2013_checkpoint_org1/assets
Epoch 8/30

KeyboardInterrupt: ignored