<a href="https://colab.research.google.com/github/sravanisasu/BERT_Regression/blob/main/RoBERTa_10K.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

**Setup GPU**

In [1]:
import tensorflow as tf

# Fail fast when the runtime does not expose a GPU under the expected name;
# every later cell pins its work to this device string.
device_name = tf.test.gpu_device_name()
if not device_name == '/device:GPU:0':
  raise SystemError('GPU device not found')
print(f'Found GPU at: {device_name}')

Found GPU at: /device:GPU:0


**Clone data from GitHub**

In [2]:
!git clone https://github.com/sravanisasu/10k-sample

Cloning into '10k-sample'...
remote: Enumerating objects: 9, done.[K
remote: Counting objects: 100% (9/9), done.[K
remote: Compressing objects: 100% (9/9), done.[K
remote: Total 9557 (delta 1), reused 1 (delta 0), pack-reused 9548[K
Receiving objects: 100% (9557/9557), 158.15 MiB | 21.20 MiB/s, done.
Resolving deltas: 100% (336/336), done.
Checking out files: 100% (10020/10020), done.


**Necessary imports and installations for the implementation of RoBERTa Architecture**

In [3]:
% pip install transformers

Collecting transformers
[?25l  Downloading https://files.pythonhosted.org/packages/ed/d5/f4157a376b8a79489a76ce6cfe147f4f3be1e029b7144fa7b8432e8acb26/transformers-4.4.2-py3-none-any.whl (2.0MB)
[K     |▏                               | 10kB 23.6MB/s eta 0:00:01[K     |▎                               | 20kB 16.5MB/s eta 0:00:01[K     |▌                               | 30kB 14.1MB/s eta 0:00:01[K     |▋                               | 40kB 12.8MB/s eta 0:00:01[K     |▉                               | 51kB 8.3MB/s eta 0:00:01[K     |█                               | 61kB 7.8MB/s eta 0:00:01[K     |█▏                              | 71kB 8.7MB/s eta 0:00:01[K     |█▎                              | 81kB 9.7MB/s eta 0:00:01[K     |█▌                              | 92kB 10.2MB/s eta 0:00:01[K     |█▋                              | 102kB 8.1MB/s eta 0:00:01[K     |█▉                              | 112kB 8.1MB/s eta 0:00:01[K     |██                              | 122kB 

In [4]:
import tensorflow_hub as hub
import tensorflow as tf
import os as os
import regex as re
import pandas as pd
import numpy as np
from transformers import RobertaTokenizer
from keras.models import Model
from keras import optimizers
from keras.metrics import MeanSquaredError
from transformers import RobertaConfig
from transformers import TFRobertaModel
from sklearn.model_selection import KFold
import matplotlib.pyplot as plt
from transformers import RobertaTokenizer, RobertaModel

**Create a RoBERTa model from the transformers library**

In [5]:
# Load the published 'roberta-base' configuration, then instantiate the
# TensorFlow RoBERTa encoder from the same pretrained checkpoint.
# `roberta_model` is a module-level object: get_model() further down calls it
# directly instead of receiving it as an argument.
config = RobertaConfig.from_pretrained('roberta-base')
roberta_model = TFRobertaModel.from_pretrained('roberta-base',config=config)

HBox(children=(FloatProgress(value=0.0, description='Downloading', max=481.0, style=ProgressStyle(description_…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=657434796.0, style=ProgressStyle(descri…




Some layers from the model checkpoint at roberta-base were not used when initializing TFRobertaModel: ['lm_head']
- This IS expected if you are initializing TFRobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFRobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
All the layers of TFRobertaModel were initialized from the model checkpoint at roberta-base.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFRobertaModel for predictions without further training.


**Functions to preprocess input 10-K documents and output values**

In [6]:
######## Function to extract the input text from the files ########
def process_inp_doc(path_file) :
  """Read one document and strip digits, punctuation and page markers.

  Args:
    path_file: Path of a UTF-8 encoded text file.

  Returns:
    The file content as a single string with the filtered characters and
    every literal ``<PAGE>`` marker removed.
  """
  # The context manager closes the handle as soon as the text is read; the
  # previous bare open(...).read() left one unclosed file per document.
  with open(path_file,encoding='utf8') as doc_file:
    file_text = doc_file.read()

  # Remove digits and punctuation. Inside the character class `%-:` is a
  # RANGE (from '%' to ':'), so besides the characters spelled out it also
  # strips & ' ( ) * + , - . / -- which is what removes commas and periods.
  file_data = re.sub(r'[\d$%-:;!]', '', file_text)
  # Remove <PAGE>, which was used for page numbers.
  file_data = re.sub(r'<PAGE>', '', file_data)

  return file_data

######## Function to extract the output values from the file ########
def process_out(company_id,output_file):
  """Look up the value recorded for one company in an output file.

  Every usable line of ``output_file`` is split on whitespace; the first
  field is the value and the second field is the company id.

  Args:
    company_id: Id to search for, compared as a string with the second field.
    output_file: Path of the UTF-8 encoded value file.

  Returns:
    The value field (a string) of the first matching line, or None after
    printing "not found" when no line matches.
  """
  with open(output_file,'r', encoding='utf-8') as m_file :
    # Iterate the handle directly instead of loading every line up front.
    for line in m_file:
      fields = line.split()
      # Blank or truncated lines carry no id field; skip them rather than
      # failing with IndexError on fields[1].
      if len(fields) < 2:
        continue
      if company_id == fields[1]:
        return fields[0]
  print("not found")
  return None

######## Function to pre-process the documents from meta-file of a given year ########
def pre_processing(meta_file,output_file):
  """Build the list of text/value records for one year of filings.

  The meta file is named ``<year>.meta.txt`` and the documents live next to
  it in ``<year>.tok/<company_id>.mda``, where the company id is the first
  whitespace-separated field of each meta line.

  Args:
    meta_file: Path of the meta file for one year.
    output_file: Path of the value file passed on to process_out.

  Returns:
    A list of dicts with keys 'text' (cleaned document text) and 'value'
    (float), one per non-blank meta line, in file order.

  Raises:
    TypeError: from float() when process_out finds no value for a company
      and returns None.
  """
  # Take the year from the file name itself ('2007.meta.txt' -> '2007').
  # Indexing a fixed position of the '/'-split path only worked for paths
  # that were exactly three components deep.
  year = os.path.basename(meta_file).split('.')[0]
  # os.path.join also handles a bare file name: dirname is '' then, and plain
  # string concatenation would have pointed at the filesystem root.
  dir_path = os.path.join(os.path.dirname(meta_file), year+'.tok')
  data =[]

  with open(meta_file,'r', encoding='utf-8') as m_file :
    for line in m_file:
      fields = line.split()
      if not fields:
        # Blank line (e.g. trailing newline at end of file).
        continue
      company_id = fields[0]
      inp_path_file = os.path.join(dir_path, company_id + '.mda')

      # get input sentences from the company document
      inp_sentences = process_inp_doc(inp_path_file)

      # get output value for the company
      out_values = float(process_out(company_id,output_file))

      # insert values into the data list
      data.append({'text':inp_sentences,'value':out_values})

  return data

**Function to encode the text into token IDs, attention masks, and segment IDs for the model**

In [7]:
######## Function to get the encoded values ######## 
def roberta_encode(sentences, tokenizer, MAX_SEQ_LEN=512):
  """Turn texts into fixed-length id, mask and segment arrays.

  Each text is tokenised, truncated to its LAST ``MAX_SEQ_LEN - 2`` tokens,
  wrapped in the tokenizer's start/end special tokens and right-padded to
  ``MAX_SEQ_LEN``.

  Args:
    sentences: Iterable of strings.
    tokenizer: Object providing ``tokenize`` and ``convert_tokens_to_ids``.
      Its ``cls_token``, ``sep_token`` and ``pad_token_id`` attributes are
      used when present.
    MAX_SEQ_LEN: Length of every output row, special tokens included.

  Returns:
    Tuple ``(token_ids, attention_masks, segment_ids)`` of numpy arrays with
    one row per input text. Masks are 1 for real tokens and 0 for padding;
    segment ids are all 0.

  Raises:
    ValueError: If ``MAX_SEQ_LEN`` is smaller than 2.
  """
  if MAX_SEQ_LEN < 2:
    raise ValueError('MAX_SEQ_LEN must be at least 2 to hold the two special tokens')

  # Ask the tokenizer for its own special tokens. "[CLS]"/"[SEP]" are BERT
  # markers; a RoBERTa vocabulary uses '<s>'/'</s>' instead, so the BERT
  # strings were converted to the unknown-token id. The literals remain as a
  # fallback for tokenizers that do not expose these attributes.
  cls_token = getattr(tokenizer, 'cls_token', None) or "[CLS]"
  sep_token = getattr(tokenizer, 'sep_token', None) or "[SEP]"
  # Same for padding: id 0 is not the pad id in every vocabulary.
  pad_id = getattr(tokenizer, 'pad_token_id', None)
  if pad_id is None:
    pad_id = 0

  max_body = MAX_SEQ_LEN - 2

  all_tokens = []
  all_masks = []
  all_segments = []
  for sentence in sentences:
    stokens = tokenizer.tokenize(sentence)
    # Keep the tail of the document. The explicit zero case matters because
    # a slice starting at -0 would keep everything instead of nothing.
    stokens = stokens[-max_body:] if max_body > 0 else []
    stokens = [cls_token] + stokens + [sep_token]

    token_ids = tokenizer.convert_tokens_to_ids(stokens)
    pad_len = MAX_SEQ_LEN - len(token_ids)

    all_tokens.append(token_ids + [pad_id] * pad_len)
    all_masks.append([1] * len(token_ids) + [0] * pad_len)
    all_segments.append([0] * MAX_SEQ_LEN)

  return np.array(all_tokens), np.array(all_masks), np.array(all_segments)

**Data Preprocessing**

In [8]:
with tf.device('/device:GPU:0'):
  ######## extracting text and storing it in dataframes ########
  # Training pool: the 2007, 2008 and 2009 filings, concatenated in that order.
  data_train = [
      record
      for meta_path, target_path in (
          ('/content/10k-sample/2007.meta.txt','/content/10k-sample/2007.logvol.+12.txt'),
          ('/content/10k-sample/2008.meta.txt','/content/10k-sample/2008.logvol.+12.txt'),
          ('/content/10k-sample/2009.meta.txt','/content/10k-sample/2009.logvol.+12.txt'),
      )
      for record in pre_processing(meta_path, target_path)
  ]
  train_df = pd.DataFrame(data_train,columns=['text','value'])
  print("Length of training data",len(data_train))

  # Held-out test set: the 2010 filings.
  data_test = pre_processing('/content/10k-sample/2010.meta.txt','/content/10k-sample/2010.logvol.+12.txt')
  test_df = pd.DataFrame(data_test,columns=['text','value'])
  print("Length of testing data",len(data_test))

  # Five random rows from each frame as a quick sanity check.
  print("SAMPLE INPUT TEXT AND VOLATILITY VALUES")
  print(train_df.sample(5)[['text','value']])
  print(test_df.sample(5)[['text','value']])

Length of training data 7571
Length of testing data 2439
SAMPLE INPUT TEXT AND VOLATILITY VALUES
                                                   text    value
1505  item # management s discussion and analysis of... -3.52373
5652  item # management s discussion and analysis of... -3.65403
7548  item # management s discussion and analysis of... -3.26501
4881  item # management s discussion and analysis of... -3.22109
265   item # management s discussion and analysis of... -2.94449
                                                   text    value
2353  item # management s discussion and analysis of... -3.55146
1321  item # management s discussion and analysis of... -3.37405
1451  item # management s discussion and analysis of... -3.96599
2194  item # management s discussion and analysis of... -3.58078
2218  item # management s discussion and analysis of... -4.37163


In [9]:
# Keep only documents with more than 256 whitespace-separated words.
train_df = train_df.loc[train_df["text"].apply(lambda doc: len(doc.split())).gt(256)]
print(train_df)
# 88.7% (presumably the share of training documents that remain)
test_df = test_df.loc[test_df["text"].apply(lambda doc: len(doc.split())).gt(256)]
print(test_df)
# 89.3% (presumably the share of test documents that remain)

                                                   text    value
0     item # management s discussion and analysis of... -3.46398
1     item # management s discussion and analysis of... -3.58048
2     item # management s discussion and analysis of... -3.87840
3     item # management s discussion and analysis of... -3.37969
4     item # management s discussion and analysis of... -4.34506
...                                                 ...      ...
7566  item # management s discussion and analysis of... -2.75096
7567  item # management s discussion and analysis of... -3.46372
7568  item # management s discussion and analysis of... -2.94439
7569  item # management s discussion and analysis of... -3.27556
7570  item # management s discussion and analysis of... -3.33055

[6717 rows x 2 columns]
                                                   text    value
0     item # management s discussion and analysis of... -3.87816
1     item # management s discussion and analysis of... -3.45482


In [10]:
MAX_SEQ_LEN = 512

######## extracting tokens from dataframes ########
tokenizer = RobertaTokenizer.from_pretrained("roberta-base")

with tf.device('/device:GPU:0'):

  #### training: regression targets, then the encoded inputs
  roberta_train_output = train_df["value"].values
  sentences = train_df["text"].values
  roberta_train_input = roberta_encode(sentences, tokenizer, MAX_SEQ_LEN)

  #### test: same two steps for the held-out frame
  roberta_test_output = test_df["value"].values
  sentences = test_df["text"].values
  roberta_test_input = roberta_encode(sentences, tokenizer, MAX_SEQ_LEN)

HBox(children=(FloatProgress(value=0.0, description='Downloading', max=898823.0, style=ProgressStyle(descripti…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=456318.0, style=ProgressStyle(descripti…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=1355863.0, style=ProgressStyle(descript…




In [11]:
# Turn both target vectors into single-column 2-D arrays, one row per sample.
roberta_train_output = np.array(roberta_train_output).reshape(-1, 1)
roberta_test_output = np.array(roberta_test_output).reshape(-1, 1)

**Function that defines the model architecture**

In [12]:
def get_model(learning_rate=0.05, dropout_rate=0.5, hidden_units=64):
  """Build and compile the RoBERTa regression network.

  The module-level ``roberta_model`` encodes three integer inputs of length
  ``MAX_SEQ_LEN``; its pooled output goes through dropout, a linear dense
  layer, layer normalisation and a single linear output unit. The model is
  compiled with Adam and mean-squared-error loss.

  Args:
    learning_rate: Adam learning rate. The default keeps the notebook's
      original value.
    dropout_rate: Dropout applied to the pooled encoder output.
    hidden_units: Width of the dense layer before the output unit.

  Returns:
    A compiled Keras model taking ``[input_word_ids, input_mask,
    segment_ids]`` and producing one value per sample.
  """
  # NOTE(review): 0.05 is far above the 1e-5..5e-5 range normally used when
  # fine-tuning a pretrained transformer -- confirm it is intentional.
  input_word_ids = tf.keras.layers.Input(shape=(MAX_SEQ_LEN,), dtype=tf.int32,name="input_word_ids")
  input_mask = tf.keras.layers.Input(shape=(MAX_SEQ_LEN,), dtype=tf.int32,name="input_mask")
  segment_ids = tf.keras.layers.Input(shape=(MAX_SEQ_LEN,), dtype=tf.int32,name="segment_ids")

  # Every call reuses the same encoder object, so models built by repeated
  # calls share its weights.
  model_roberta = roberta_model(input_word_ids, attention_mask  = input_mask, token_type_ids  = segment_ids)
  clf_output = model_roberta.pooler_output

  net = tf.keras.layers.Dropout(dropout_rate)(clf_output)
  net = tf.keras.layers.Dense(hidden_units, activation='linear')(net)
  net = tf.keras.layers.LayerNormalization()(net)
  out = tf.keras.layers.Dense(1, activation='linear', name='output')(net)

  # Use tf.keras throughout: the layers above are tf.keras objects, and
  # combining them with the standalone `keras` Model/optimizers classes is
  # only safe when both packages resolve to the same implementation.
  model = tf.keras.Model(inputs=[input_word_ids, input_mask, segment_ids], outputs=out)

  opt = tf.keras.optimizers.Adam(learning_rate=learning_rate)
  model.compile(optimizer=opt, loss='mse')

  return model

In [13]:
# Build the compiled regression model; the cross-validation cell below
# trains this single `model` object.
model = get_model()

# Print the layer table of the assembled network.
model.summary()

Please report this to the TensorFlow team. When filing the bug, set the verbosity to 10 (on Linux, `export AUTOGRAPH_VERBOSITY=10`) and attach the full output.
Cause: module, class, method, function, traceback, frame, or code object was expected, got cython_function_or_method
Please report this to the TensorFlow team. When filing the bug, set the verbosity to 10 (on Linux, `export AUTOGRAPH_VERBOSITY=10`) and attach the full output.
Cause: module, class, method, function, traceback, frame, or code object was expected, got cython_function_or_method
Cause: while/else statement not yet supported
Cause: while/else statement not yet supported

Model: "model"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_word_ids (InputLayer)     [(None, 512)]        0                                            
_________________________________________

**Fit the Model**

In [None]:
n_splits = 5
epochs = 10
batch_size = 10

# Nothing earlier in the notebook creates the output folders, and
# savefig fails when its target directory is missing -- which would only
# surface after the first fold has finished training.
os.makedirs('RoBERTa_results/CheckPoints', exist_ok=True)
os.makedirs('RoBERTa_results/Plots', exist_ok=True)

# Snapshot of the model as it is when this cell starts, so that every fold
# can begin from the same state. Run this cell on a freshly built model:
# re-running it after training would snapshot the trained weights.
initial_weights = model.get_weights()
optimizer_cls = type(model.optimizer)
optimizer_config = model.optimizer.get_config()
loss_spec = model.loss

with tf.device('/device:GPU:0'):
  kf = KFold(n_splits=n_splits)
  history =[]
  train_loss=[]
  vald_loss=[]
  test_loss = []

  test_inputs = list(roberta_test_input)
  # First 50 test samples, sliced per array. roberta_test_input is a tuple of
  # three arrays, so slicing the tuple itself with [0:50] kept every sample
  # and predicted on the whole test set.
  preview_inputs = [arr[:50] for arr in roberta_test_input]

  # enumerate keeps the fold number fixed for the whole iteration, so the
  # checkpoint, the printed message and the plot file all carry the same
  # label (1..n_splits).
  for fold, (train_index, test_index) in enumerate(kf.split(roberta_train_input[0]), start=1):
    # Restore the starting weights and a fresh optimizer. Without this the
    # model keeps training across folds and later folds are "validated" on
    # samples the model has already been fitted to.
    model.set_weights(initial_weights)
    model.compile(optimizer=optimizer_cls.from_config(optimizer_config), loss=loss_spec)

    fold_train_x = [arr[train_index] for arr in roberta_train_input]
    fold_train_y = roberta_train_output[train_index]
    fold_val_x = [arr[test_index] for arr in roberta_train_input]
    fold_val_y = roberta_train_output[test_index]

    checkpoint_filepath = 'RoBERTa_results/CheckPoints/roberta_checkpoint'+str(fold)
    # Select the best epoch on the held-out part of the fold, not on the
    # training loss.
    model_checkpoint_callback = tf.keras.callbacks.ModelCheckpoint(
        filepath=checkpoint_filepath,
        save_weights_only=False,
        monitor='val_loss',
        mode='min',
        save_best_only=True)

    train_history = model.fit(
        fold_train_x,  # input
        fold_train_y,  # output
        validation_data=(fold_val_x, fold_val_y),
        epochs=epochs,
        verbose=1,
        batch_size=batch_size,
        callbacks=[model_checkpoint_callback]
    )
    #model_best = tf.keras.models.load_model(checkpoint_filepath)

    # Losses of the final-epoch model on the fold's training and held-out parts.
    loss_T = model.evaluate(fold_train_x, fold_train_y, verbose=0)
    loss_V = model.evaluate(fold_val_x, fold_val_y, verbose=0)
    print(loss_T,loss_V)
    train_loss.append(loss_T)
    vald_loss.append(loss_V)
    history.append(train_history)

    predicted = model.predict(preview_inputs)
    loss_test = model.evaluate(test_inputs, roberta_test_output, verbose=0)
    print("Test Errror for the fold ",fold," is",loss_test )

    # Predicted vs. actual values for the first 50 test samples.
    fig, ax = plt.subplots()
    ax.plot(predicted, label = "Predicted Values")
    ax.plot(roberta_test_output[0:50], label = "Actual Values")
    ax.set_xlabel('Test Samples')
    ax.set_ylabel('Output Values')
    ax.legend()
    textstr = "Test Errror for the fold "+ str(fold)+" is "+str(np.round(loss_test,3))
    fig.text(0, -0.25, textstr, fontsize=14)
    fig.savefig('RoBERTa_results/Plots/roberta_fold'+str(fold)+'.png',bbox_inches='tight')
    # Release the figure so repeated folds do not accumulate open figures.
    plt.close(fig)

    test_loss.append(loss_test)

Epoch 1/10








































































































INFO:tensorflow:Assets written to: RoBERTa_results/CheckPoints/roberta_checkpoint1/assets


INFO:tensorflow:Assets written to: RoBERTa_results/CheckPoints/roberta_checkpoint1/assets


Epoch 2/10








































































































INFO:tensorflow:Assets written to: RoBERTa_results/CheckPoints/roberta_checkpoint1/assets


INFO:tensorflow:Assets written to: RoBERTa_results/CheckPoints/roberta_checkpoint1/assets


Epoch 3/10








































































































INFO:tensorflow:Assets written to: RoBERTa_results/CheckPoints/roberta_checkpoint1/assets


INFO:tensorflow:Assets written to: RoBERTa_results/CheckPoints/roberta_checkpoint1/assets


Epoch 4/10








































































































INFO:tensorflow:Assets written to: RoBERTa_results/CheckPoints/roberta_checkpoint1/assets


INFO:tensorflow:Assets written to: RoBERTa_results/CheckPoints/roberta_checkpoint1/assets


Epoch 5/10








































































































INFO:tensorflow:Assets written to: RoBERTa_results/CheckPoints/roberta_checkpoint1/assets


INFO:tensorflow:Assets written to: RoBERTa_results/CheckPoints/roberta_checkpoint1/assets


Epoch 6/10








































































































INFO:tensorflow:Assets written to: RoBERTa_results/CheckPoints/roberta_checkpoint1/assets


INFO:tensorflow:Assets written to: RoBERTa_results/CheckPoints/roberta_checkpoint1/assets


Epoch 7/10
Epoch 8/10
 72/538 [===>..........................] - ETA: 9:04 - loss: 0.2579

**Plot the results**

In [None]:
# Training vs. validation loss per fold, saved as a PNG.
# savefig does not create missing directories, so make sure the target exists.
os.makedirs('RoBERTa_results/Plots', exist_ok=True)

# A dedicated figure avoids drawing on whatever figure pyplot currently
# considers active from an earlier cell.
fig, ax = plt.subplots()
# Number the folds from 1 so the x axis matches its 'Folds' label.
ax.plot(range(1, len(train_loss) + 1), train_loss, label = "Trainng Loss")
ax.plot(range(1, len(vald_loss) + 1), vald_loss, label = "Validation Loss")
ax.set_xticks(list(range(1, max(len(train_loss), len(vald_loss)) + 1)))
# naming the x axis
ax.set_xlabel('Folds')
# naming the y axis
ax.set_ylabel('Error')
ax.legend()
fig.savefig('RoBERTa_results/Plots/roberta_loss_check.png')

In [None]:
# test_loss = []
# with tf.device('/device:GPU:0'):
    
#     for i in range(n_splits):

#         checkpoint_filepath = 'RoBERTa_results/CheckPoints/roberta_checkpoint'+str(i+1) 
#         best_model = tf.keras.models.load_model(
#                                                 checkpoint_filepath, custom_objects=None, compile=True, options=None
#                                                 )
#         predicted = best_model.predict(roberta_test_input[0:50])
        
#         loss_test = best_model.evaluate([roberta_test_input[0],roberta_test_input[1],roberta_test_input[2]]
#                                           , roberta_test_output, verbose=0)
#         print("Test Errror for the fold ",i+1," is",loss_test )
        
        
#         plt.plot(predicted[0:50], label = "Predicted Values")  
#         plt.plot(roberta_test_output[0:50], label = "Actual Values")
#         # naming the x axis 
#         plt.xlabel('Test Samples') 
#         # naming the y axis 
#         plt.ylabel('Output Values') 
#         # function to show the plot 
#         plt.legend()
#         textstr = "Test Errror for the fold "+ str(i+1)+" is "+str(np.round(loss_test,3))
#         plt.gcf().text(0, -0.25, textstr, fontsize=14)
#         plt.savefig('RoBERTa_results/Plots/roberta_fold'+str(i+1)+'.png',bbox_inches='tight')
#         plt.clf()

#         test_loss.append(loss_test)

In [None]:
# Box plot and mean (std) summary of the per-fold training, validation and
# test losses collected by the cross-validation cell.
# Import pyplot, matching the binding made in the notebook's import cell,
# rather than rebinding `plt` to the discouraged pylab namespace.
import matplotlib.pyplot as plt

# savefig does not create missing directories.
os.makedirs('RoBERTa_results/Plots', exist_ok=True)

loss_groups = [train_loss, vald_loss, test_loss]

fig = plt.figure()
# Creating axes instance
ax = fig.add_axes([0, 0, 1, 1])

# Creating plot
ax.boxplot(loss_groups)
ax.set_xticklabels(['Training', 'Validation','Test'])

# naming the y axis
ax.set_ylabel('MSE Loss')
ax.set_title("Box plot for Training, Validation and Test Loss")

# One "label : mean (std)" line per loss list, placed below the axes.
summary_rows = (
    ('Training Loss  : ', train_loss),
    ('Validation Loss  : ', vald_loss),
    ('Test Loss  : ', test_loss),
)
textstr = '\n'.join(
    prefix + str(np.round(np.mean(values),3)) + ' (' + str(np.round(np.std(values),3)) + ')'
    for prefix, values in summary_rows
)
fig.text(0, -0.25, textstr, fontsize=14)
fig.savefig('RoBERTa_results/Plots/block_roberta.png',bbox_inches='tight')

print('Training Loss: %.3f (%.3f)' % (np.mean(train_loss), np.std(train_loss)))
print('Validation Loss: %.3f (%.3f)' % (np.mean(vald_loss), np.std(vald_loss)))
print('Test Loss: %.3f (%.3f)' % (np.mean(test_loss), np.std(test_loss)))