**Setup GPU**

In [1]:
import tensorflow as tf
# Ask TensorFlow for the name of the GPU device it can see (empty if none).
device_name = tf.test.gpu_device_name()
if device_name == '/device:GPU:0':
  print('Found GPU at: {}'.format(device_name))
else:
  # Stop right away: this notebook expects to run on '/device:GPU:0'.
  raise SystemError('GPU device not found')

Found GPU at: /device:GPU:0


**Necessary imports and installations for the implementation of ALBERT Architecture**

In [2]:
import tensorflow_hub as hub
import tensorflow as tf
import os as os
import regex as re
import pandas as pd
import numpy as np
from transformers import AlbertTokenizer
from keras.models import Model
from keras.layers import Flatten
from keras import optimizers
from keras.metrics import MeanSquaredError
from keras.layers.advanced_activations import LeakyReLU
from sklearn.model_selection import KFold
import matplotlib.pyplot as plt


**Create an ALBERT layer from the TensorFlow Hub library**

In [3]:
# TF Hub handle of the ALBERT module to load (albert_en_base, version 1).
module_url = 'https://tfhub.dev/tensorflow/albert_en_base/1'
# Wrap the hub module as a Keras layer. trainable=False is passed, so the layer
# is created as non-trainable; it is reused by every model built in get_model().
albert_layer = hub.KerasLayer(module_url, trainable=False)

INFO:absl:Using /tmp/tfhub_modules to cache modules.


**Functions to preprocess input 10-K documents and output values**

In [4]:
######## Function to extract the input text from the files ########
def process_inp_doc(path_file) :
  """Read one document and strip digits, some punctuation and <PAGE> markers.

  Args:
    path_file: Path of the UTF-8 encoded text file to read.

  Returns:
    The cleaned file content as a single string.
  """
  # A context manager guarantees the file handle is closed even if read() fails.
  with open(path_file, encoding='utf8') as doc_file:
    file_text = doc_file.read()

  # Remove punctuation and digits.
  # NOTE: inside the character class, `%-:` is a RANGE ('%' through ':'), so
  # & ' ( ) * + , - . / are removed too, in addition to digits, $ ; and !.
  file_data = re.sub(r'[\d$%-:;!]', '', file_text)

  # Remove <PAGE>, which was used to mark page numbers.
  file_data = re.sub(r'<PAGE>', '', file_data)

  return file_data

######## Function to extract the output values from the file ########
def process_out(company_id,output_file):
  """Look up the output value recorded for one company.

  Every line of `output_file` is split on whitespace; the second field is
  compared with `company_id` and the first field of the first matching line
  is returned.

  Args:
    company_id: Identifier to search for (compared as a string).
    output_file: Path of the UTF-8 text file to scan.

  Returns:
    The first field of the first matching line (a string), or None after
    printing "not found" when no line matches.
  """
  with open(output_file,'r', encoding='utf-8') as m_file :
    # Iterate lazily so the scan can stop at the first match without
    # loading every line into memory.
    for line in m_file:
      fields = line.split()
      # Blank or truncated lines cannot match; skip them rather than
      # failing with IndexError on fields[1].
      if len(fields) < 2:
        continue
      if company_id == fields[1]:
        return fields[0]
    print("not found")
  return None

######## Function to pre-process the documents from meta-file of a given year ########
def pre_processing(meta_file,output_file):
  """Build the list of {'text', 'value'} records for one meta-file.

  The first whitespace-separated field of every meta-file line is used as
  the company id. The document is read from
  `<dir of meta_file>/<year>.tok/<company id>.mda`, where `<year>` is the part
  of the meta-file name before its first dot. The target value is looked up
  in `output_file`.

  Args:
    meta_file: Path of the meta-file, e.g. '10k-sample/2007.meta.txt'.
    output_file: Path of the file passed to process_out() for value lookup.

  Returns:
    A list of dicts with keys 'text' (cleaned document string) and 'value'
    (float).

  Raises:
    ValueError: If no value is found in `output_file` for a listed company.
  """
  # Derive the year from the file NAME, so the result does not depend on how
  # many directory components precede it (the old split('/')[1] only worked
  # for exactly one parent directory).
  year = os.path.basename(meta_file).split('.')[0]
  dir_path = os.path.join(os.path.dirname(meta_file), year + '.tok')
  data = []

  with open(meta_file,'r', encoding='utf-8') as m_file :
    for line in m_file:
      fields = line.split()
      # Skip blank lines instead of failing with IndexError.
      if not fields:
        continue
      company_id = fields[0]
      inp_path_file = os.path.join(dir_path, company_id + '.mda')

      # get input text from the company document
      inp_sentences = process_inp_doc(inp_path_file)

      # get output value for the company
      # NOTE: process_out() rescans output_file on every call.
      raw_value = process_out(company_id, output_file)
      if raw_value is None:
        # Fail with an explicit message rather than the opaque TypeError
        # that float(None) would raise.
        raise ValueError(
            "no output value for company '{}' in {}".format(company_id, output_file))
      out_values = float(raw_value)

      # insert values into the data list
      data.append({'text':inp_sentences,'value':out_values})

  return data

**Functions to build the model inputs (token ids, input mask, segment ids) and to encode the text for the model**

In [5]:
######## Function to get the encoded values ######## 
def albert_encode(sentences, tokenizer, MAX_SEQ_LEN=512):
  """Turn raw texts into fixed-length token-id, mask and segment arrays.

  Each text is tokenized, truncated to its LAST `MAX_SEQ_LEN - 2` tokens,
  wrapped in "[CLS]" ... "[SEP]", converted to ids and right-padded with 0.

  Args:
    sentences: Iterable of strings to encode.
    tokenizer: Object providing `tokenize(text)` and
      `convert_tokens_to_ids(tokens)` (the latter must return a list).
    MAX_SEQ_LEN: Length of every encoded row; must be at least 2.

  Returns:
    A tuple `(token_ids, masks, segments)` of numpy arrays, one row per
    input text. `masks` is 1 over real tokens and 0 over padding;
    `segments` is all zeros.

  Raises:
    ValueError: If `MAX_SEQ_LEN` is smaller than 2.
  """
  if MAX_SEQ_LEN < 2:
    raise ValueError("MAX_SEQ_LEN must be at least 2 to fit [CLS] and [SEP]")
  # Number of text tokens that fit between the two special tokens.
  body_len = MAX_SEQ_LEN - 2

  all_tokens = []
  all_masks = []
  all_segments = []
  for sentence in sentences:
    stokens = tokenizer.tokenize(sentence)
    # Keep the tail of the document. Guard body_len == 0 explicitly, because
    # stokens[-0:] would keep EVERY token and produce over-long rows.
    stokens = stokens[-body_len:] if body_len > 0 else []
    stokens = ["[CLS]"] + stokens + ["[SEP]"]

    token_ids = tokenizer.convert_tokens_to_ids(stokens)
    pad_len = MAX_SEQ_LEN - len(token_ids)

    all_tokens.append(token_ids + [0] * pad_len)
    all_masks.append([1] * len(token_ids) + [0] * pad_len)
    all_segments.append([0] * MAX_SEQ_LEN)

  return np.array(all_tokens), np.array(all_masks), np.array(all_segments)

**Data Preprocessing**

In [6]:
with tf.device('/device:GPU:0'):
  ######## extracting text and storing it in dataframes ########
  # (meta-file, output-value file) pairs that together form the training set.
  train_sources = [
    ('10k-sample/2007.meta.txt', '10k-sample/2007.logvol.+12.txt'),
    ('10k-sample/2008.meta.txt', '10k-sample/2008.logvol.+12.txt'),
    ('10k-sample/2009.meta.txt', '10k-sample/2009.logvol.+12.txt'),
  ]
  data_train = []
  for meta_path, logvol_path in train_sources:
    data_train.extend(pre_processing(meta_path, logvol_path))
  train_df = pd.DataFrame(data_train, columns=['text','value'])
  print("Length of training data", len(data_train))

  # The 2010 files are held out as the test set.
  data_test = pre_processing('10k-sample/2010.meta.txt','10k-sample/2010.logvol.+12.txt')
  test_df = pd.DataFrame(data_test, columns=['text','value'])
  print("Length of testing data", len(data_test))

  # Show five random rows from each frame as a sanity check.
  print("SAMPLE INPUT TEXT AND VOLATILITY VALUES")
  for frame in (train_df, test_df):
    print(frame.sample(5)[['text','value']])

Length of training data 7571
Length of testing data 2439
SAMPLE INPUT TEXT AND VOLATILITY VALUES
                                                   text    value
7095  item # management s discussion and analysis of... -2.24259
3719  item # management s discussion and analysis of... -2.87623
4110  item # management s discussion and analysis of... -2.52248
3197  item # management s discussion and analysis of... -2.37330
2586  item # management s discussion and analysis of... -3.15975
                                                   text    value
1656  item # management s discussion and analysis of... -3.45522
1032  item # management s discussion and analysis of... -3.14370
1988  item # management s discussion and analysis of... -3.35437
374   item # management s discussion and analysis of... -3.66717
806   item # management s discussion and analysis of... -3.61892


In [7]:
def _word_count(text):
  """Return the number of whitespace-separated words in `text`."""
  return len(text.split())

# Keep only the documents with more than 256 words.
train_df = train_df.loc[train_df["text"].apply(_word_count) > 256]
print(train_df)
# author's note: 88.7% (presumably the share of training rows kept)
test_df = test_df.loc[test_df["text"].apply(_word_count) > 256]
print(test_df)
# author's note: 89.3% (presumably the share of test rows kept)

                                                   text    value
0     item # management s discussion and analysis of... -3.46398
1     item # management s discussion and analysis of... -3.58048
2     item # management s discussion and analysis of... -3.87840
3     item # management s discussion and analysis of... -3.37969
4     item # management s discussion and analysis of... -4.34506
...                                                 ...      ...
7566  item # management s discussion and analysis of... -2.75096
7567  item # management s discussion and analysis of... -3.46372
7568  item # management s discussion and analysis of... -2.94439
7569  item # management s discussion and analysis of... -3.27556
7570  item # management s discussion and analysis of... -3.33055

[6717 rows x 2 columns]
                                                   text    value
0     item # management s discussion and analysis of... -3.87816
1     item # management s discussion and analysis of... -3.45482


In [8]:
######## extracting tokens from dataframes ########
tokenizer = AlbertTokenizer.from_pretrained("albert-base-v1")

# Fixed length of every encoded sequence; also read by get_model().
MAX_SEQ_LEN = 512

with tf.device('/device:GPU:0'):

  #### training split: target values, then encoded inputs
  sentences = train_df["text"].values
  albert_train_output = train_df["value"].values
  albert_train_input = albert_encode(sentences, tokenizer, MAX_SEQ_LEN)

  #### test split: target values, then encoded inputs
  sentences = test_df["text"].values
  albert_test_output = test_df["value"].values
  albert_test_input = albert_encode(sentences, tokenizer, MAX_SEQ_LEN)

In [9]:
from sklearn.preprocessing import MinMaxScaler

# Scale the regression targets with statistics taken from the TRAINING set only.
# Fitting on train and test together would leak the test targets' min/max into
# training. As a consequence, scaled test targets may fall slightly outside
# [0, 1].
scaler = MinMaxScaler()

# MinMaxScaler expects 2-D input: one column, one row per sample.
albert_train_output = np.array(albert_train_output).reshape(-1, 1)
albert_test_output = np.array(albert_test_output).reshape(-1, 1)

albert_train_output = scaler.fit_transform(albert_train_output)
albert_test_output = scaler.transform(albert_test_output)

**Function that defines the model architecture**

In [10]:
def get_model():
  """Build and compile the regression model on top of the shared ALBERT layer.

  Reads the notebook-level `MAX_SEQ_LEN` and `albert_layer`. The pooled output
  of `albert_layer` feeds Dense(64, leaky ReLU) -> Dropout(0.1) -> Flatten ->
  Dense(1, leaky ReLU). The model is compiled with Adam (learning rate 0.05)
  and mean-squared-error loss.

  Returns:
    The compiled Keras model taking [input_word_ids, input_mask, segment_ids].
  """
  # Three int32 inputs of length MAX_SEQ_LEN, created in the order in which
  # they are handed to the ALBERT layer.
  input_word_ids, input_mask, segment_ids = [
      tf.keras.layers.Input(shape=(MAX_SEQ_LEN,), dtype=tf.int32, name=input_name)
      for input_name in ("input_word_ids", "input_mask", "segment_ids")
  ]

  # Only the pooled output is used by the head; the sequence output is ignored.
  pooled_output, sequence_output = albert_layer([input_word_ids, input_mask, segment_ids])

  hidden = tf.keras.layers.Dense(64, activation=tf.nn.leaky_relu)(pooled_output)
  hidden = tf.keras.layers.Dropout(0.1)(hidden)
  hidden = Flatten()(hidden)
  out = tf.keras.layers.Dense(1, activation=tf.nn.leaky_relu)(hidden)

  model = Model(inputs=[input_word_ids, input_mask, segment_ids], outputs=out)
  model.compile(optimizer=optimizers.Adam(learning_rate=0.05), loss='mse')

  return model

**Model Summary**

In [11]:
# Build one compiled model instance and print its layer-by-layer summary.
model = get_model()
model.summary()

Model: "model"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_word_ids (InputLayer)     [(None, 512)]        0                                            
__________________________________________________________________________________________________
input_mask (InputLayer)         [(None, 512)]        0                                            
__________________________________________________________________________________________________
segment_ids (InputLayer)        [(None, 512)]        0                                            
__________________________________________________________________________________________________
keras_layer (KerasLayer)        [(None, 768), (None, 11683584    input_word_ids[0][0]             
                                                                 input_mask[0][0]             

**Fit the Model**

In [None]:
# Number of cross-validation folds and training epochs per fold.
n_splits = 10
epochs = 50
with tf.device('/device:GPU:0'):
  kf = KFold(n_splits=n_splits)
  history =[]     # Keras History object of every fold
  train_loss=[]   # loss of each fold's best checkpoint on its training part
  vald_loss=[]    # loss of each fold's best checkpoint on its held-out part
  for fold, (train_index, test_index) in enumerate(kf.split(albert_train_input[0]), start=1):

    # Slice the three input arrays (token ids, masks, segments) for this fold.
    fold_train_x = [part[train_index] for part in albert_train_input]
    fold_train_y = albert_train_output[train_index]
    fold_val_x = [part[test_index] for part in albert_train_input]
    fold_val_y = albert_train_output[test_index]

    # Start every fold from a freshly initialised model. Reusing one model
    # across folds carries weights over, so later folds would be validated on
    # samples the model was already trained on in earlier folds.
    fold_model = get_model()

    checkpoint_filepath = 'Albert_results/CheckPoints/albert_checkpoint'+str(fold)
    # Save the full model whenever the monitored loss reaches a new minimum.
    # NOTE(review): 'loss' is the TRAINING loss; consider passing
    # validation_data to fit() and monitoring 'val_loss' instead.
    model_checkpoint_callback = tf.keras.callbacks.ModelCheckpoint(
        filepath=checkpoint_filepath,
        save_weights_only=False,
        monitor='loss',
        mode='min',
        save_best_only=True)

    train_history = fold_model.fit(
        fold_train_x,   # input
        fold_train_y,   # output
        epochs=epochs,
        verbose=1,
        callbacks=[model_checkpoint_callback]
    )

    # Evaluate the best checkpoint of this fold, not the last-epoch weights.
    model_best = tf.keras.models.load_model(checkpoint_filepath)
    loss_T = model_best.evaluate(fold_train_x, fold_train_y, verbose=0)
    loss_V = model_best.evaluate(fold_val_x, fold_val_y, verbose=0)
    print(loss_T,loss_V)
    train_loss.append(loss_T)
    vald_loss.append(loss_V)
    history.append(train_history)

Train Index:  [ 672  673  674 ... 6714 6715 6716] 

Validation Index:  [  0   1   2   3   4   5   6   7   8   9  10  11  12  13  14  15  16  17
  18  19  20  21  22  23  24  25  26  27  28  29  30  31  32  33  34  35
  36  37  38  39  40  41  42  43  44  45  46  47  48  49  50  51  52  53
  54  55  56  57  58  59  60  61  62  63  64  65  66  67  68  69  70  71
  72  73  74  75  76  77  78  79  80  81  82  83  84  85  86  87  88  89
  90  91  92  93  94  95  96  97  98  99 100 101 102 103 104 105 106 107
 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125
 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143
 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161
 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179
 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197
 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215
 216 217 218 219 220 221 222 223 224 225 226 227 228 



INFO:tensorflow:Assets written to: Albert_results/CheckPoints/albert_checkpoint1/assets


INFO:tensorflow:Assets written to: Albert_results/CheckPoints/albert_checkpoint1/assets


Epoch 2/50




INFO:tensorflow:Assets written to: Albert_results/CheckPoints/albert_checkpoint1/assets


INFO:tensorflow:Assets written to: Albert_results/CheckPoints/albert_checkpoint1/assets


Epoch 3/50




INFO:tensorflow:Assets written to: Albert_results/CheckPoints/albert_checkpoint1/assets


INFO:tensorflow:Assets written to: Albert_results/CheckPoints/albert_checkpoint1/assets


Epoch 4/50




INFO:tensorflow:Assets written to: Albert_results/CheckPoints/albert_checkpoint1/assets


INFO:tensorflow:Assets written to: Albert_results/CheckPoints/albert_checkpoint1/assets


Epoch 5/50
Epoch 6/50




INFO:tensorflow:Assets written to: Albert_results/CheckPoints/albert_checkpoint1/assets


INFO:tensorflow:Assets written to: Albert_results/CheckPoints/albert_checkpoint1/assets


Epoch 7/50
Epoch 8/50




INFO:tensorflow:Assets written to: Albert_results/CheckPoints/albert_checkpoint1/assets


INFO:tensorflow:Assets written to: Albert_results/CheckPoints/albert_checkpoint1/assets


Epoch 9/50
Epoch 10/50
Epoch 11/50




INFO:tensorflow:Assets written to: Albert_results/CheckPoints/albert_checkpoint1/assets


INFO:tensorflow:Assets written to: Albert_results/CheckPoints/albert_checkpoint1/assets


Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
 27/189 [===>..........................] - ETA: 2:57 - loss: 0.1462

**Plot all the folds together**

In [None]:
# Make sure the output directory exists; savefig does not create it.
os.makedirs('Albert_results/Plots', exist_ok=True)

# One point per fold: loss of that fold's best checkpoint.
plt.plot(train_loss, label = "Training Loss")
plt.plot(vald_loss, label = "Validation Loss")
# naming the x axis
plt.xlabel('Folds')
# naming the y axis
plt.ylabel('Error')
# show the legend for the two curves
plt.legend()
plt.savefig('Albert_results/Plots/albert_loss_check.png')

**Predict with the model restored from each fold's checkpoint**

In [None]:
test_loss = []   # test-set loss of every fold's checkpoint
n_plot = 50      # number of leading test samples shown in each figure

# Make sure the output directory exists; savefig does not create it.
os.makedirs('Albert_results/Plots', exist_ok=True)

with tf.device('/device:GPU:0'):

    for i in range(n_splits):

        checkpoint_filepath = 'Albert_results/CheckPoints/albert_checkpoint'+str(i+1)
        best_model = tf.keras.models.load_model(
                                                checkpoint_filepath, custom_objects=None, compile=True, options=None
                                                )

        # albert_test_input is a tuple of three arrays, so albert_test_input[0:50]
        # would slice the TUPLE (returning all three full arrays), not the samples.
        # Slice each array instead, so only the plotted samples are predicted.
        predicted = best_model.predict([part[:n_plot] for part in albert_test_input])

        loss_test = best_model.evaluate([albert_test_input[0],albert_test_input[1],albert_test_input[2]]
                                          , albert_test_output, verbose=0)
        print("Test Error for the fold ",i+1," is",loss_test )

        plt.plot(predicted, label = "Predicted Values")
        plt.plot(albert_test_output[:n_plot], label = "Actual Values")
        # naming the x axis
        plt.xlabel('Test Samples')
        # naming the y axis
        plt.ylabel('Output Values')
        plt.legend()
        # Caption below the axes with this fold's test loss.
        textstr = "Test Error for the fold "+ str(i+1)+" is "+str(np.round(loss_test,3))
        plt.gcf().text(0, -0.25, textstr, fontsize=14)
        plt.savefig('Albert_results/Plots/albert_fold'+str(i+1)+'.png',bbox_inches='tight')
        # Clear the figure so the next fold starts from an empty canvas.
        plt.clf()

        test_loss.append(loss_test)

In [None]:
import matplotlib.pylab as plt
# Per-fold losses grouped in the order of the box-plot columns.
data = [train_loss, vald_loss, test_loss]
group_names = ['Training', 'Validation', 'Test']

# Make sure the output directory exists; savefig does not create it.
os.makedirs('Albert_results/Plots', exist_ok=True)

fig = plt.figure()
# Axes spanning the whole figure
ax = fig.add_axes([0, 0, 1, 1])

# One box per loss group
ax.boxplot(data)
ax.set_xticklabels(group_names)
ax.set_ylabel('MSE Loss')
ax.set_title("Box plot for Training, Validation and Test Loss")

# Caption below the axes: "<name> Loss  : mean (std)" for every group.
textstr = '\n'.join(
    name + ' Loss  : ' + str(np.round(np.mean(losses), 3))
    + ' (' + str(np.round(np.std(losses), 3)) + ')'
    for name, losses in zip(group_names, data)
)
fig.text(0, -0.25, textstr, fontsize=14)
fig.savefig('Albert_results/Plots/block_albert.png',bbox_inches='tight')

# Same summary on stdout, with three fixed decimals.
for name, losses in zip(group_names, data):
  print('%s Loss: %.3f (%.3f)' % (name, np.mean(losses), np.std(losses)))
