In [2]:
#### Required imports
import tensorflow_hub as hub
import tensorflow as tf
import os as os
import regex as re
import pandas as pd
import numpy as np
from transformers import BertTokenizer
from keras.models import Model
from keras import optimizers
from keras.metrics import RootMeanSquaredError
from sklearn.preprocessing import MinMaxScaler
import matplotlib.pyplot as plt
from sklearn.model_selection import KFold

In [3]:
###### BERT Layer
# TF-Hub handle of the uncased English BERT module that is wrapped as a Keras layer.
module_url = 'https://tfhub.dev/tensorflow/bert_en_uncased_L-12_H-768_A-12/1'
# trainable=False is passed so the hub layer is created as non-trainable.
bert_layer = hub.KerasLayer(handle=module_url, trainable=False)

INFO:absl:Using C:\Users\catuser\AppData\Local\Temp\tfhub_modules to cache modules.
INFO:absl:Downloading TF-Hub Module 'https://tfhub.dev/tensorflow/bert_en_uncased_L-12_H-768_A-12/1'.
INFO:absl:Downloading https://tfhub.dev/tensorflow/bert_en_uncased_L-12_H-768_A-12/1: 140.00MB
INFO:absl:Downloading https://tfhub.dev/tensorflow/bert_en_uncased_L-12_H-768_A-12/1: 280.00MB
INFO:absl:Downloading https://tfhub.dev/tensorflow/bert_en_uncased_L-12_H-768_A-12/1: 417.73MB
INFO:absl:Downloaded https://tfhub.dev/tensorflow/bert_en_uncased_L-12_H-768_A-12/1, Total size: 423.26MB
INFO:absl:Downloaded TF-Hub Module 'https://tfhub.dev/tensorflow/bert_en_uncased_L-12_H-768_A-12/1'.


In [4]:
######## Function to extract the input text from the files ########
def process_inp_doc(path_file):
  """Read one document and strip digits, some punctuation and page markers.

  Args:
    path_file: Path of the UTF-8 encoded text file to read.

  Returns:
    The file content as one string, with the filtered characters and every
    literal ``<PAGE>`` marker removed.

  Raises:
    OSError: If the file cannot be opened.
  """
  # Context manager so the handle is closed even if read() raises.
  with open(path_file, encoding='utf8') as doc_file:
    file_text = doc_file.read()

  # NOTE: inside the character class `%-:` is a *range* (U+0025..U+003A), so
  # besides digits, `$`, `;` and `!` this also removes % & ' ( ) * + , - . / :
  file_data = re.sub(r'[\d$%-:;!]', '', file_text)
  # <PAGE> was used as the page-number marker in the documents.
  file_data = re.sub(r'<PAGE>', '', file_data)

  return file_data

######## Function to extract the output values from the file ########
def process_out(company_id,output_file):
  """Look up the output value recorded for a company.

  Every line of ``output_file`` is split on whitespace; the first field is
  returned when the second field equals ``company_id``.

  Args:
    company_id: Identifier to search for (compared with the second field).
    output_file: Path of the UTF-8 file holding one "<value> <id>" pair per line.

  Returns:
    The first field (as a string) of the first matching line, or ``None``
    after printing "not found" when no line matches.
  """
  with open(output_file, 'r', encoding='utf-8') as m_file:
    # Iterate the file lazily instead of loading every line with readlines().
    for line in m_file:
      fields = line.split()
      # Blank or truncated lines carry no id field; skip them instead of
      # failing with IndexError.
      if len(fields) < 2:
        continue
      if company_id == fields[1]:
        return fields[0]
    print("not found")
  return None

######## Function to pre-process the documents from meta-file of a given year ########
def pre_processing(meta_file,output_file):
  """Collect the text and target value of every company listed in a meta file.

  The meta file is expected at ``<root>/all.meta/<year>.meta.txt``; the
  documents are then read from ``<root>/all.tok/<year>.tok/<id>.mda`` where
  ``<id>`` is the first whitespace-separated field of each meta line.

  Args:
    meta_file: Path of the meta file for one year.
    output_file: Path of the file passed to ``process_out`` for the targets.

  Returns:
    A list of ``{'text': str, 'value': float}`` dicts, one per non-empty
    meta line, in file order.

  Raises:
    OSError: If the meta file or one of the documents cannot be opened.
    TypeError: If ``process_out`` returns ``None`` for an id (``float(None)``).
    ValueError: If the looked-up value is not a valid float literal.
  """
  # Derive year and dataset root from the path itself rather than from fixed
  # '/'-split positions, so the function also works when the dataset does not
  # sit exactly one directory below the working directory.
  year = os.path.basename(meta_file).split('.')[0]
  root_dir = os.path.dirname(os.path.dirname(meta_file))
  dir_path = os.path.join(root_dir, 'all.tok', year + '.tok')

  data = []
  with open(meta_file, 'r', encoding='utf-8') as m_file:
    for line in m_file:
      fields = line.split()
      # Skip blank lines (e.g. a trailing newline) instead of raising IndexError.
      if not fields:
        continue
      company_id = fields[0]

      # get input text from the company document
      inp_sentences = process_inp_doc(os.path.join(dir_path, company_id + '.mda'))

      # get output value for the company
      out_values = float(process_out(company_id, output_file))

      # insert values into the data list
      data.append({'text': inp_sentences, 'value': out_values})

  return data

In [5]:
######## Function to get the encoded values ######## 
def bert_encode(sentences, tokenizer, MAX_SEQ_LEN=512):
  """Turn texts into fixed-length id, mask and segment arrays.

  Each text is tokenized, truncated to its LAST ``MAX_SEQ_LEN - 2`` tokens,
  wrapped in ``[CLS]`` / ``[SEP]``, converted to ids and right-padded with 0.

  Args:
    sentences: Iterable of strings to encode.
    tokenizer: Object providing ``tokenize(text)`` and
      ``convert_tokens_to_ids(tokens)``.
    MAX_SEQ_LEN: Length of every encoded row; must be at least 2.

  Returns:
    Tuple ``(ids, masks, segments)`` of NumPy arrays built from one row of
    length ``MAX_SEQ_LEN`` per input text. ``masks`` is 1 on real tokens and
    0 on padding; ``segments`` is all zeros.

  Raises:
    ValueError: If ``MAX_SEQ_LEN`` is smaller than 2.
  """
  if MAX_SEQ_LEN < 2:
    raise ValueError("MAX_SEQ_LEN must be at least 2 to hold [CLS] and [SEP]")
  # Room left for text tokens once [CLS] and [SEP] are accounted for.
  max_pieces = MAX_SEQ_LEN - 2

  all_tokens = []
  all_masks = []
  all_segments = []
  for sentence in sentences:
    stokens = tokenizer.tokenize(sentence)
    # Keep the trailing tokens of over-long inputs. The explicit guard is
    # needed because a slice starting at -0 keeps everything, not nothing.
    stokens = stokens[-max_pieces:] if max_pieces > 0 else []
    stokens = ["[CLS]"] + stokens + ["[SEP]"]

    token_ids = tokenizer.convert_tokens_to_ids(stokens)
    pad_len = MAX_SEQ_LEN - len(token_ids)

    all_tokens.append(token_ids + [0] * pad_len)
    all_masks.append([1] * len(token_ids) + [0] * pad_len)
    all_segments.append([0] * MAX_SEQ_LEN)

  return np.array(all_tokens), np.array(all_masks), np.array(all_segments)

In [6]:
##### function that defines the model
def get_model(max_seq_len=None, learning_rate=0.1):
  """Build and compile the regression model on top of the shared BERT layer.

  Three int32 inputs (word ids, mask, segment ids) are fed to the
  notebook-level ``bert_layer``; its first output is passed through a
  64-unit dense layer and a 1-unit dense layer named ``output``, both with
  leaky-ReLU activation. The model is compiled with Adam and MSE loss.

  Args:
    max_seq_len: Length of each input sequence. When ``None`` (default) the
      notebook-level ``MAX_SEQ_LEN`` is read at call time, which keeps
      existing ``get_model()`` calls working.
    learning_rate: Learning rate given to the Adam optimizer.

  Returns:
    The compiled Keras ``Model``.
  """
  if max_seq_len is None:
    # Resolved lazily so the constant may be assigned after this definition.
    max_seq_len = MAX_SEQ_LEN

  input_word_ids = tf.keras.layers.Input(shape=(max_seq_len,), dtype=tf.int32, name="input_word_ids")
  input_mask = tf.keras.layers.Input(shape=(max_seq_len,), dtype=tf.int32, name="input_mask")
  segment_ids = tf.keras.layers.Input(shape=(max_seq_len,), dtype=tf.int32, name="segment_ids")

  leaky_relu = tf.nn.leaky_relu

  # Only the first (pooled) output of the hub layer is used by the head.
  pooled_output, _ = bert_layer([input_word_ids, input_mask, segment_ids])
  net = tf.keras.layers.Dense(64, activation=leaky_relu)(pooled_output)
  out = tf.keras.layers.Dense(1, activation=leaky_relu, name='output')(net)

  model = Model(inputs=[input_word_ids, input_mask, segment_ids], outputs=out)

  # NOTE(review): 0.1 is far above Keras' Adam default of 0.001 - confirm it is intended.
  opt = optimizers.Adam(learning_rate=learning_rate)
  model.compile(optimizer=opt, loss='mse')

  return model

In [8]:
#### model summary 
# Sequence length of the model inputs. get_model() reads this notebook-level
# name when it is called, so it has to be assigned before the call below.
MAX_SEQ_LEN = 512
model = get_model()
# Print the layer-by-layer summary of the compiled model.
model.summary()

Model: "model"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_word_ids (InputLayer)     [(None, 512)]        0                                            
__________________________________________________________________________________________________
input_mask (InputLayer)         [(None, 512)]        0                                            
__________________________________________________________________________________________________
segment_ids (InputLayer)        [(None, 512)]        0                                            
__________________________________________________________________________________________________
keras_layer (KerasLayer)        [(None, 768), (None, 109482241   input_word_ids[0][0]             
                                                                 input_mask[0][0]             

In [9]:
##### Data extraction and Fitting the model
# Per-year results; each entry is a list with one value per fold.
test_loss_all_years = []
train_loss_all_years = []
val_loss_all_years = []
history_all_years = []
# Per-year document counts (before short documents are filtered out).
data = []
n_splits = 5
epochs = 5
min_doc_words = 256   # documents with this many words or fewer are dropped
n_train_years = 3     # number of years preceding the test year used for training


def _load_year(year):
    """Return the pre-processed records ({'text', 'value'} dicts) of one year."""
    return pre_processing('10k-sample/all.meta/' + str(year) + '.meta.txt',
                          '10k-sample/all.logfama/' + str(year) + '.logfama.txt')


def _drop_short_docs(df):
    """Keep only rows whose text has more than `min_doc_words` whitespace-separated words."""
    word_counts = df["text"].apply(lambda text: len(text.split()))
    return df.loc[word_counts > min_doc_words]


def _select(encoded, index):
    """Pick the rows in `index` from each of the (ids, masks, segments) arrays."""
    return [part[index] for part in encoded]


# The tokenizer does not depend on the year, so it is loaded once.
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")

for year in range(2008, 2014):

    ######## extracting text and storing it in dataframes ########
    data_train = []
    for train_year in range(year - n_train_years, year):
        data_train.extend(_load_year(train_year))
    train_df = pd.DataFrame(data_train, columns=['text', 'value'])

    data_test = _load_year(year)
    test_df = pd.DataFrame(data_test, columns=['text', 'value'])
    data.append({'year': year, 'train_df_length': len(data_train), 'test_df_length': len(data_test)})

    ###### removing the few documents which were not processed properly ######
    train_df = _drop_short_docs(train_df)
    test_df = _drop_short_docs(test_df)

    ######## encoding the texts ########
    bert_train_input = bert_encode(train_df['text'].values, tokenizer, MAX_SEQ_LEN)
    bert_test_input = bert_encode(test_df['text'].values, tokenizer, MAX_SEQ_LEN)

    ######## scaling the targets ########
    # The min-max scaler is fitted on the training targets only and then
    # applied to the test targets, so the test labels do not influence the
    # scaling seen during training.
    scaler = MinMaxScaler()
    bert_train_output = scaler.fit_transform(train_df['value'].to_numpy().reshape(-1, 1))
    bert_test_output = scaler.transform(test_df['value'].to_numpy().reshape(-1, 1))

    ######## K-fold training and saving checkpoints ########
    kf = KFold(n_splits=n_splits)
    history = []
    train_loss = []
    vald_loss = []
    test_loss = []

    for fold, (train_index, val_index) in enumerate(kf.split(bert_train_input[0]), start=1):

        checkpoint_filepath = 'Results/BERT_results_min/CheckPoints/'+str(year)+'BERT_checkpoint'+str(fold)
        # monitor='loss' tracks the training loss; fit() below gets no validation data.
        model_checkpoint_callback = tf.keras.callbacks.ModelCheckpoint(
            filepath=checkpoint_filepath,
            save_weights_only=False,
            monitor='loss',
            mode='min',
            save_best_only=True,
        )

        fold_train_input = _select(bert_train_input, train_index)
        fold_val_input = _select(bert_train_input, val_index)

        # A fresh model per fold: reusing one model across folds would let it
        # train on rows that later serve as validation data (and carry weights
        # over from previous years), which invalidates the validation loss.
        fold_model = get_model()
        train_history = fold_model.fit(
            fold_train_input,
            bert_train_output[train_index],
            epochs=epochs,
            verbose=1,
            callbacks=[model_checkpoint_callback],
        )

        # Evaluate the best checkpoint of this fold rather than the last epoch.
        model_best = tf.keras.models.load_model(checkpoint_filepath)
        loss_T = model_best.evaluate(fold_train_input, bert_train_output[train_index], verbose=0)
        loss_V = model_best.evaluate(fold_val_input, bert_train_output[val_index], verbose=0)
        loss_test = model_best.evaluate(list(bert_test_input), bert_test_output, verbose=0)

        train_loss.append(loss_T)
        vald_loss.append(loss_V)
        history.append(train_history)
        test_loss.append(loss_test)

    test_loss_all_years.append(test_loss)
    train_loss_all_years.append(train_loss)
    val_loss_all_years.append(vald_loss)
    history_all_years.append(history)

FileNotFoundError: [Errno 2] No such file or directory: '10k-sample/all.meta/2005.meta.txt'

In [None]:
# One row per test year: year, number of training documents, number of test
# documents (counts are recorded before short documents are filtered out).
stats_df = pd.DataFrame(data,columns=['year','train_df_length','test_df_length'])
# to_csv does not create missing parent directories.
os.makedirs('Loss_values', exist_ok=True)
stats_df.to_csv('Loss_values/BERT_stats_minmax.csv', header=False, index=False)

In [None]:
# Box plot of the test loss: one box per test year, built from that year's
# per-fold losses. (Plotting only `test_loss` would show the last year alone
# and would not match the six year labels.)
years = list(range(2008, 2014))
# Flat view of every fold loss of every year for the summary statistics.
all_test_losses = np.asarray(test_loss_all_years, dtype=float).ravel()

fig = plt.figure()
# Creating axes instance
ax = fig.add_axes([0, 0, 1, 1])

# Creating plot
ax.boxplot(test_loss_all_years)
# Label only as many boxes as were actually produced.
ax.set_xticklabels(years[:len(test_loss_all_years)])

# naming the y axis
ax.set_ylabel('MSE Loss')
ax.set_title("Box plot for Test Loss")
textstr ='Test Loss for BERT : '+str(np.round(np.mean(all_test_losses),3))+' ('+str(np.round(np.std(all_test_losses),3))+')'
fig.text(0, -0.25, textstr, fontsize=14)

# savefig does not create missing parent directories.
os.makedirs('Plots', exist_ok=True)
fig.savefig('Plots/block_plot_BERT_minmax.png',bbox_inches='tight')

In [None]:
def _save_losses(losses_by_year, years, csv_path):
    """Write one (year, per-fold loss list) row per year to `csv_path`.

    The CSV is written without header and index. Returns the DataFrame that
    was written.
    """
    rows = [{'year': year, 'value': losses} for losses, year in zip(losses_by_year, years)]
    frame = pd.DataFrame(rows, columns=['year', 'value'])
    frame.to_csv(csv_path, header=False, index=False)
    return frame


# Test years in the order the training loop processed them; defined here so
# this cell does not rely on a variable leaking from the plotting cell.
years = list(range(2008, 2014))

# to_csv does not create missing parent directories.
os.makedirs('Loss_values', exist_ok=True)

loss_data_test_df = _save_losses(test_loss_all_years, years, 'Loss_values/BERT_Loss_test_minmax.csv')
loss_data_train_df = _save_losses(train_loss_all_years, years, 'Loss_values/BERT_Loss_train_minmax.csv')
loss_data_vald_df = _save_losses(val_loss_all_years, years, 'Loss_values/BERT_Loss_vald_minmax.csv')