<a href="https://colab.research.google.com/github/sahithikodali1/Drowsy-Driver-Detection-System/blob/master/BERTbase_final.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
#Mounting the drive
# Colab-only step: exposes Google Drive under /content/drive, which is where
# DATA_DIR and the checkpoint / result paths used later in this notebook point.

from google.colab import drive
drive.mount('/content/drive')

In [None]:
#import the libraries

import random
import numpy as np
import pandas as pd
import os
import json
import csv
import pickle
import matplotlib.pyplot as plt
import matplotlib.pyplot as plt
% matplotlib inline

# BERT imports
import torch

from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler
from keras.preprocessing.sequence import pad_sequences
from sklearn.model_selection import train_test_split
from tqdm import tqdm, trange

import copy
from sklearn.metrics import f1_score

!pip install transformers
from transformers import get_linear_schedule_with_warmup

# specify GPU device (falls back to CPU when no CUDA device is present)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
n_gpu = torch.cuda.device_count()
# get_device_name(0) raises on a CPU-only runtime, so only query it when a GPU exists.
if torch.cuda.is_available():
    print(torch.cuda.get_device_name(0))
else:
    print("CUDA not available; running on CPU")

#Data file path
DATA_DIR = "/content/drive/MyDrive/Thesis_B"
file = '8b_data.csv'

In [None]:
from transformers import BertTokenizer

# Tokenize with BERT tokenizer
# The module-level `tokenizer` is read by tokenize_sent() and token2ids() below.
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased', do_lower_case=True)

In [None]:
#Function to find the datasize
def find_datasize(path,filename):
    """Read the CSV and group its rows into runs that share the first-column value.

    Args:
        path: Directory containing the CSV file.
        filename: Name of the CSV file inside `path`.

    Returns:
        A tuple `(q_id_split, train_data_size, val_data_size, col_names)` where
        `q_id_split` is a list of 2-D arrays (one per run of consecutive rows
        with the same first-column value), the two sizes are an 80/20 split of
        the number of runs, and `col_names` are the CSV column labels.
    """
    frame = pd.read_csv(os.path.join(path, filename))
    f_values = frame.values
    col_names = frame.columns

    # A new group starts wherever the first column differs from the previous row.
    # Comparing neighbours (instead of np.diff) also works for non-numeric IDs.
    ids = f_values[:, 0]
    boundaries = np.flatnonzero(ids[1:] != ids[:-1]) + 1
    q_id_split = np.split(f_values, boundaries)

    print(col_names)
    print(frame[:10])

    train_data_size = round(len(q_id_split)*0.8)
    val_data_size = len(q_id_split) - train_data_size
    print("Training data size:{}".format(train_data_size))
    print("Validation data size:{}".format(val_data_size))
    return q_id_split,train_data_size,val_data_size,col_names

#Function to split data into train and validation data based on questionID's randomly
def _groups_to_frame(groups, col_names):
    """Stack a list of row arrays into one DataFrame; an empty list gives an empty frame."""
    if not groups:
        return pd.DataFrame(columns=col_names)
    return pd.DataFrame(np.concatenate(groups, axis=0), columns=col_names)


def split_dataframes(q_id_split,train_data_size,col_names):
    """Shuffle the question groups with a fixed seed and split them into train/validation frames.

    Args:
        q_id_split: List of 2-D row arrays, one per question group.
        train_data_size: Number of shuffled groups that go to the training frame.
        col_names: Column labels for the resulting DataFrames.

    Returns:
        `(train_df, val_df)` built from the first `train_data_size` shuffled
        groups and the remaining groups respectively.
    """
    # Shuffle a copy with a private generator: the permutation is the same one
    # `random.seed(3007); random.shuffle(...)` produces, but the caller's list and
    # the global `random` state are left untouched, so repeated calls give the same split.
    shuffled = list(q_id_split)
    random.Random(3007).shuffle(shuffled)

    train_data = shuffled[:train_data_size]
    val_data = shuffled[train_data_size:]
    print(len(train_data))
    print(len(val_data))

    train_df = _groups_to_frame(train_data, col_names)
    val_df = _groups_to_frame(val_data, col_names)
    return train_df,val_df

#Function to obtain labels list
def obtain_SU4labels_list(dataframe):
    """Return the values of the 'SU4_labels' column as a list and print how many there are."""
    labels_list = [label for label in dataframe['SU4_labels']]
    print('Labels size:{}'.format(len(labels_list)))
    return labels_list

In [None]:
#Splitting the data and obtaining labels
# Groups rows by the first CSV column, splits the groups 80/20, then pulls the
# 'SU4_labels' column out of each resulting frame.
q_id_split, train_data_size, val_data_size, col_names = find_datasize(DATA_DIR,file)
train_df, val_df = split_dataframes(q_id_split,train_data_size,col_names)
# NOTE: `train_labels` is a plain list here; a later cell rebinds the same name to a tensor.
train_labels = obtain_SU4labels_list(train_df)
val_labels = obtain_SU4labels_list(val_df)


In [None]:
#Add special tokens
def obtain_specialtokenized_list(dataframe):
    """Build one "[CLS] question [SEP] sentence [SEP]" string per row.

    Args:
        dataframe: Frame with string columns 'question' and 'sentence text'.

    Returns:
        A list with one combined string per row, in row order.
    """
    questions_list = list(dataframe['question'])
    sentences_list = list(dataframe['sentence text'])
    # Built in a single pass; re-creating the list on every row (`lst = lst + [...]`)
    # is quadratic in the number of rows.
    return [
        "[CLS] " + question + " [SEP] " + sentence + " [SEP]"
        for question, sentence in zip(questions_list, sentences_list)
    ]

#Tokenize texts
def tokenize_sent(givenlist):
    """Tokenize each string in `givenlist` with the module-level `tokenizer`; returns one token list per string."""
    tokenized_texts = []
    for sent in givenlist:
        tokenized_texts.append(tokenizer.tokenize(sent))
    return tokenized_texts

#Replace commas from tokens
def remove_token_commas(givenlist):
  """Return a copy of the nested token lists with every ',' inside a token replaced by ';'."""
  return [[token.replace(',', ';') for token in tokens] for tokens in givenlist]

#Create segment ids from tokens
def segment_id(givenlist, max_len=512):
  """Build fixed-length segment (token type) ids for "[CLS] q [SEP] s [SEP]" token lists.

  Tokens up to and including the first "[SEP]" get id 0; the following tokens up
  to and including the second "[SEP]" get id 1. Each result is truncated or
  right-padded with 0 to exactly `max_len` entries.

  Args:
    givenlist: List of token lists.
    max_len: Length of every returned id list (defaults to 512).

  Returns:
    A list with one list of ints (length `max_len`) per input token list.
  """
  sep = "[SEP]"
  segment_ids = []
  for tokens in givenlist:
    tokens = list(tokens)
    # Count tokens directly instead of joining on ',' and re-splitting: the
    # string round-trip miscounts whenever a token is itself a comma.
    if sep in tokens:
      first_len = tokens.index(sep) + 1
    else:
      # No separator at all: treat the whole sequence as the first segment.
      first_len = len(tokens)
    rest = tokens[first_len:]
    second_len = rest.index(sep) + 1 if sep in rest else len(rest)

    seg_ids = [0]*first_len + [1]*second_len
    seg_ids = seg_ids[:max_len]
    seg_ids += [0]*(max_len - len(seg_ids))
    segment_ids.append(seg_ids)
  return segment_ids

#Convert tokenized sentences to respective token ids
def token2ids(tokenized_texts):
    """Map token lists to vocabulary ids with the module-level `tokenizer`, then pad/truncate (at the end) to 512."""
    MAX_LEN = 512
    id_sequences = []
    for sent in tokenized_texts:
        id_sequences.append(tokenizer.convert_tokens_to_ids(sent))
    return pad_sequences(id_sequences, maxlen=MAX_LEN, dtype="long", truncating="post", padding="post")

#Create masks for the tokenids
def create_masks(token_ids):
    """Return, per id sequence, a list of floats: 1.0 where the id is greater than 0, else 0.0."""
    return [[float(token_id > 0) for token_id in row] for row in token_ids]


In [None]:
#Obtain the tokens, converting to inputs required
# Pipeline per split: add [CLS]/[SEP] markers -> tokenize -> replace ',' in tokens
# -> segment ids -> token ids (padded) -> attention masks.
train_specialtok_list = obtain_specialtokenized_list(train_df)  
val_specialtok_list = obtain_specialtokenized_list(val_df)  

train_tokenized = tokenize_sent(train_specialtok_list)
val_tokenized = tokenize_sent(val_specialtok_list)

train_tokenized_nocommas = remove_token_commas(train_tokenized)
val_tokenized_nocommas = remove_token_commas(val_tokenized)

train_token_type_ids =  segment_id(train_tokenized_nocommas)
val_token_type_ids =  segment_id(val_tokenized_nocommas)

# NOTE(review): token ids are built from the comma-replaced tokens, so every ','
# token is fed to the model as ';' — confirm this is intended.
train_tokenids = token2ids(train_tokenized_nocommas)
val_tokenids = token2ids(val_tokenized_nocommas)

train_masks = create_masks(train_tokenids)
val_masks = create_masks(val_tokenids)

In [None]:
# Convert all of our data into torch tensors, the required datatype for our model
train_inputs = torch.tensor(train_tokenids, dtype=torch.int64)
validation_inputs = torch.tensor(val_tokenids, dtype=torch.int64)

# NOTE: `train_labels`, `train_masks` and `train_token_type_ids` are rebound from
# Python lists to tensors here, so these names change type once this cell has run.
train_labels = torch.tensor(train_labels, dtype=torch.int64)
validation_labels = torch.tensor(val_labels, dtype=torch.int64)

# Masks are stored as float32 (1.0 = real token, 0.0 = padding, per create_masks).
train_masks = torch.tensor(train_masks, dtype=torch.float32)
validation_masks = torch.tensor(val_masks, dtype=torch.float32)

train_token_type_ids = torch.tensor(train_token_type_ids, dtype=torch.int64)
validation_token_type_ids = torch.tensor(val_token_type_ids, dtype=torch.int64)

# Sanity checks: print type, dtype and shape of the tensors that feed the DataLoaders.
train_tensors = (train_inputs, train_masks, train_labels, train_token_type_ids)
validation_tensors = (validation_inputs, validation_masks, validation_labels, validation_token_type_ids)
separator = '***************'

for tensor in train_tensors:
    print(type(tensor))
print(separator)
for tensor in train_tensors:
    print(tensor.dtype)
print(separator)
# All four validation entries report dtype (the last one used to print .shape,
# breaking the pattern of this group).
for tensor in validation_tensors:
    print(tensor.dtype)
print(separator)
for tensor in train_tensors:
    print(tensor.shape)
print(separator)
for tensor in train_tensors:
    print(tensor[0].shape)
print(separator)
# First training example of each tensor.
for tensor in train_tensors:
    print(tensor[0])


# Select a batch size for training. 
batch_size = 32

# Create an iterator of our data with torch DataLoader 
# Each batch is a 4-tuple in this order: (input ids, token type ids, attention masks, labels);
# the training and validation loops unpack batches in the same order.
train_data = TensorDataset(train_inputs, train_token_type_ids, train_masks, train_labels)
# Training batches are drawn in random order.
train_sampler = RandomSampler(train_data)
train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=batch_size)

validation_data = TensorDataset(validation_inputs, validation_token_type_ids, validation_masks, validation_labels)
# Validation keeps the dataset order.
validation_sampler = SequentialSampler(validation_data)
validation_dataloader = DataLoader(validation_data, sampler=validation_sampler, batch_size=batch_size)

In [None]:
#Declare random seed value

seed_val = 42

# Seed every random number generator this notebook touches: Python's `random`,
# NumPy, and PyTorch (CPU and all CUDA devices).
random.seed(seed_val)
np.random.seed(seed_val)
torch.manual_seed(seed_val)
torch.cuda.manual_seed_all(seed_val)

#Functions to calculate evaluation metrics
def flat_accuracy(preds, labels):
    """Fraction of rows whose arg-max over axis 1 of `preds` equals the flattened `labels`."""
    predicted = np.argmax(preds, axis=1).flatten()
    expected = labels.flatten()
    n_correct = np.sum(predicted == expected)
    return n_correct / len(expected)

def flat_f1score(preds, labels):
    """Weighted F1 between the flattened `labels` and the arg-max over axis 1 of `preds`."""
    predicted = np.argmax(preds, axis=1).flatten()
    expected = labels.flatten()
    score = f1_score(expected, predicted, average = 'weighted')
    return score

In [None]:
#Function to define model
def model(type,pretrainedmodel):
    """Load `type.from_pretrained(pretrainedmodel, num_labels=2)` and freeze its `.bert` sub-module.

    Every parameter under `.bert` gets `requires_grad = False`; parameters outside
    `.bert` are left as loaded. Returns the loaded model.
    """
    classifier = type.from_pretrained(pretrainedmodel, num_labels = 2)
    for frozen_param in classifier.bert.parameters():
        frozen_param.requires_grad_(False)
    return classifier

In [None]:
#Calling the desired model architecture
from transformers import BertForSequenceClassification
# NOTE: this rebinds the name `model` from the factory function to the network
# instance, so re-running this cell requires re-running the cell that defines `model()`.
model = model(BertForSequenceClassification,'bert-base-uncased')
# Move to the device selected at the top of the notebook; unlike .cuda(), this
# also works on a CPU-only runtime.
model.to(device)

In [None]:
#Define epochs
epochs = 4

#Optimizer & Scheduler fine-tuning parameters
lr=2e-5
num_warmup_steps = 10
# The training loop calls scheduler.step() once per batch, so the linear decay has
# to span every batch of every epoch. A fixed value (previously 1000) either drives
# the learning rate to 0 before training ends or never finishes the decay.
num_training_steps = len(train_dataloader) * epochs

#Optimizer and Scheduler
optimizer = torch.optim.AdamW(model.parameters(), lr = lr)
scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps = num_warmup_steps, num_training_steps = num_training_steps)


In [None]:
#Importing loss functions
from torch.nn import CrossEntropyLoss, BCELoss, Sigmoid, BCEWithLogitsLoss

In [None]:
# Store our loss and accuracy for plotting (one entry per epoch in each list)
train_loss_set = []
val_loss_set = []
best_accuracy_val = 0
best_epoch = -1
# Snapshot of the weights at the best validation epoch (initial weights until one is found)
best_epoch_weights = copy.deepcopy(model.state_dict())

# The loss holds no per-batch state, so build it once instead of on every step.
loss_fn = CrossEntropyLoss()

for epoch in trange(epochs, desc="Epoch"):
  ## TRAINING
  # Set our model to training mode
  model.train()
  tr_loss = 0
  tr_accuracy, tr_f1score = 0, 0
  nb_tr_steps = 0

  # Train the data for one epoch
  for step, batch in enumerate(train_dataloader):

    batch = tuple(t.to(device) for t in batch)
    # Batch order matches the TensorDataset: ids, token type ids, masks, labels
    b_input_ids, b_input_tokenids, b_input_masks, b_labels = batch

    # Clear out the gradients (by default they accumulate)
    optimizer.zero_grad()

    # Forward pass; labels=None so the loss is computed explicitly below
    outputs = model(b_input_ids, token_type_ids = b_input_tokenids, attention_mask = b_input_masks, labels = None)
    loss = loss_fn(outputs.logits, b_labels)

    loss.backward()
    optimizer.step()
    # One scheduler step per optimizer step
    scheduler.step()

    # Move logits and labels to CPU for the numpy-based metrics
    logits = outputs.logits.detach().cpu().numpy()
    label_ids = b_labels.to('cpu').numpy()

    tr_loss += loss.item()
    tr_accuracy += flat_accuracy(logits, label_ids)
    tr_f1score += flat_f1score(logits, label_ids)
    nb_tr_steps += 1

  # Epoch metrics are the mean of the per-batch values
  epoch_loss = tr_loss/nb_tr_steps
  epoch_accuracy = tr_accuracy/nb_tr_steps
  epoch_f1score = tr_f1score/nb_tr_steps
  train_loss_set.append(epoch_loss)

  print("Train loss: {}".format(epoch_loss))
  print("Training Accuracy for epoch: {}".format(epoch_accuracy))
  print("Training f1score: {}".format(epoch_f1score))

  ##VALIDATION
  model.eval()

  eval_loss, eval_accuracy, eval_f1score = 0, 0, 0
  nb_eval_steps = 0

  # Evaluate data for one epoch
  for batch in validation_dataloader:
    batch = tuple(t.to(device) for t in batch)

    # Unpack the inputs from our dataloader
    b_input_ids, b_input_tokenids, b_input_masks, b_labels = batch

    # Telling the model not to compute or store gradients, saving memory and speeding up validation
    with torch.no_grad():
      outputs = model(b_input_ids, token_type_ids = b_input_tokenids, attention_mask = b_input_masks, labels = None)
      loss = loss_fn(outputs.logits, b_labels)

    # Move logits and labels to CPU
    logits = outputs.logits.detach().cpu().numpy()
    label_ids = b_labels.to('cpu').numpy()

    eval_loss += loss.item()
    eval_accuracy += flat_accuracy(logits, label_ids)
    eval_f1score += flat_f1score(logits, label_ids)
    nb_eval_steps += 1

  # Computed once per epoch, after all validation batches have been seen
  val_epoch_loss = eval_loss/nb_eval_steps
  val_epoch_accuracy = eval_accuracy/nb_eval_steps
  val_epoch_f1score = eval_f1score/nb_eval_steps
  val_loss_set.append(val_epoch_loss)

  print("Validation loss: {}".format(val_epoch_loss))
  print("Validation Accuracy: {}".format(val_epoch_accuracy))
  print("Validation f1score: {}".format(val_epoch_f1score))

  # Model selection uses the validation accuracy; comparing the training accuracy
  # here would keep whichever epoch fits the training data best.
  if val_epoch_accuracy > best_accuracy_val:
    best_accuracy_val = val_epoch_accuracy
    best_epoch = epoch
    best_epoch_weights = copy.deepcopy(model.state_dict())
    torch.save(best_epoch_weights, os.path.join(DATA_DIR, 'Bert-8b_5b_epoch_bccloss{}.pth'.format(epoch)))

print(" Best epoch: {}".format(best_epoch))
print(" Best Accuracy: {}".format(best_accuracy_val))

In [None]:
#Load the saved best model
# NOTE: this file name differs from the 'Bert-8b_5b_epoch_bccloss{epoch}.pth' names
# written by the training loop; the checkpoint must already exist under DATA_DIR.
# map_location lets a checkpoint written on a GPU runtime load onto whichever
# `device` was selected (including CPU).
model.load_state_dict(torch.load(os.path.join(DATA_DIR, 'Bert-base-8b_5b_epoch_2_final.pth'), map_location=device))

In [None]:
###TESTING DATA
##Code to load jsonfile
# `pandas.io.json.json_normalize` is the deprecated location (since pandas 1.0);
# the supported import is the top-level one. The name stays available as before.
from pandas import json_normalize

#Test file path
test_file = '/content/drive/MyDrive/Thesis_B/8B1_golden.json'

# JSON is read as UTF-8 explicitly so the result does not depend on the runtime's locale.
with open(test_file, 'r', encoding='utf-8') as json_file:
    data = json.load(json_file)

##Code to get questions list
def get_testdatalists(data):
    """Split `data['questions']` into parallel lists.

    Returns `(id_list, body_list, type_list, all_text_list)`, where
    `all_text_list[k]` holds the 'text' of every entry in question k's 'snippets'.
    """
    id_list, body_list, type_list, all_text_list = [], [], [], []
    for question in data['questions']:
        id_list.append(question['id'])
        type_list.append(question['type'])
        body_list.append(question['body'])
        all_text_list.append([snippet['text'] for snippet in question['snippets']])
    return id_list, body_list, type_list, all_text_list

#Make a dataframe for test data
def get_dataframes(id_list, body_list, type_list, all_text_list):
    """Assemble the four parallel lists into a frame with columns id, body, type, sentences; prints the first 10 rows."""
    columns = (
        ('id', id_list),
        ('body', body_list),
        ('type', type_list),
        ('sentences', all_text_list),
    )
    test_df = pd.DataFrame()
    for column_name, column_values in columns:
        test_df[column_name] = column_values
    print(test_df[:10])
    return test_df

#Get list of questions and sentences
def get_datalists(test_df):
    """Return the 'body' and 'sentences' columns of `test_df` as two lists.

    Args:
        test_df: Frame with at least the columns 'body' and 'sentences'.

    Returns:
        `(questions_list_test_df, sentences_list_test_df)`.
    """
    # Only the two returned columns are read; the unused 'id'/'type' lookups were
    # dropped so the frame is not required to carry them.
    questions_list_test_df = list(test_df['body'])
    sentences_list_test_df = list(test_df['sentences'])
    return questions_list_test_df, sentences_list_test_df

#Add special tokens
def join_ques_sent(questions_list_test_df, sentences_list_test_df):
    """For each question, build one "[CLS] question [SEP] sentence [SEP]" string per candidate sentence.

    Returns a list of lists: entry k holds the combined strings for question k.
    """
    question_sentence_list_test_df = []
    for position, question in enumerate(questions_list_test_df):
        candidates = sentences_list_test_df[position]
        question_sentence_list_test_df.append(
            ["[CLS] " + question + " [SEP] " + sentence + " [SEP]" for sentence in candidates]
        )
    return question_sentence_list_test_df

#Tokenize and create tokenids, typeids and masks
def create_test_tokens_masks(question_sentence_list_test_df):
    """Run the tokenize / comma-replace / ids / masks / segment-ids steps for each question's strings.

    Returns three parallel lists (token ids, segment ids, masks), one entry per question.
    """
    token_ids_test_df_all, type_ids_test_df_all, masks_test_df_all = [], [], []
    for question_strings in question_sentence_list_test_df:
        clean_tokens = remove_token_commas(tokenize_sent(question_strings))

        question_token_ids = token2ids(clean_tokens)
        question_masks = create_masks(question_token_ids)
        question_type_ids = segment_id(clean_tokens)

        token_ids_test_df_all.append(question_token_ids)
        type_ids_test_df_all.append(question_type_ids)
        masks_test_df_all.append(question_masks)
    return token_ids_test_df_all, type_ids_test_df_all, masks_test_df_all

In [None]:
#obtain inputs for model
# Flatten the loaded JSON into parallel lists, then into a DataFrame (one row per question).
id_list, body_list, type_list, all_text_list = get_testdatalists(data)
test_df = get_dataframes(id_list, body_list, type_list, all_text_list)

# One "[CLS] question [SEP] snippet [SEP]" string per (question, snippet) pair, grouped by question.
questions_list_test_df, sentences_list_test_df = get_datalists(test_df)
question_sentence_list_test_df = join_ques_sent(questions_list_test_df, sentences_list_test_df)

# Three parallel lists, each with one entry per question.
token_ids_test_df_all, type_ids_test_df_all, masks_test_df_all = create_test_tokens_masks(question_sentence_list_test_df)

In [None]:
# Sanity checks on the test inputs: the three lists should have the same length
# (one entry per question).
print(len(token_ids_test_df_all))
print(len(type_ids_test_df_all))
print(len(masks_test_df_all))

# Container types of the first question's inputs.
print(type(token_ids_test_df_all[0]))
print(type(type_ids_test_df_all[0]))
print(type(masks_test_df_all[0]))

# Full contents for the first question (large output).
print(token_ids_test_df_all[0])
print(type_ids_test_df_all[0])
print(masks_test_df_all[0])

In [None]:
#Evaluating the model to produce a list of top sentences i.e summary
TOP_K = 5  # maximum number of snippets kept per question
topsentences_summary = []

# Inference only: switch to eval mode once, before the loop.
model.eval()

for i in range(len(token_ids_test_df_all)):
  # All snippets of a question go through the model as a single batch. A local
  # name is used so the training `batch_size` is not overwritten.
  n_snippets = len(token_ids_test_df_all[i])

  # A question without snippets cannot form a batch; keep an empty summary so
  # topsentences_summary stays aligned with the question ids.
  if n_snippets == 0:
    topsentences_summary.append("")
    continue

  test_inputs = torch.tensor(token_ids_test_df_all[i], dtype=torch.int64)
  test_masks = torch.tensor(masks_test_df_all[i], dtype=torch.float32)
  test_token_ids = torch.tensor(type_ids_test_df_all[i], dtype=torch.int64)

  test_data = TensorDataset(test_inputs, test_token_ids, test_masks)
  test_sampler = SequentialSampler(test_data)
  test_dataloader = DataLoader(test_data, sampler=test_sampler, batch_size=n_snippets)

  logits_list_test_df = []

  # One batch per question (batch size equals the number of snippets)
  for batch in test_dataloader:

    # Add batch to the selected device
    batch = tuple(t.to(device) for t in batch)

    # Unpack the inputs from our dataloader
    b_input_ids, b_input_tokenids, b_input_masks = batch

    # Telling the model not to compute or store gradients, saving memory and speeding up validation
    with torch.no_grad():

      # Forward pass, calculate logit predictions
      outputs = model(b_input_ids, token_type_ids = b_input_tokenids, attention_mask = b_input_masks, labels = None)
      logits_list_test_df.append(outputs.logits)

    # Element-wise sigmoid of the logits (the label previously said "Softmax").
    prediction = torch.sigmoid(outputs.logits)
    print("Sigmoid:{}".format(prediction))

    #Code to extract top 5 sentences based on probabilities
    # min() covers questions with fewer than TOP_K snippets.
    k = min(TOP_K, len(prediction))
    values, indices = torch.topk(prediction, k, dim=0)
    print("top_indices:{}".format(indices))
    # NOTE(review): column 0 ranks snippets by the score of label 0. If label 1 in
    # SU4_labels marks summary-worthy sentences, this should use column 1 — confirm.
    indexes = indices[:,0].tolist()
    each_summ = ' '.join(str(test_df['sentences'][i][ind]) for ind in indexes)
    topsentences_summary.append(each_summ)
    print(each_summ)

    # Move logits to CPU
    logits = outputs.logits.detach().cpu().numpy()
    print("Logits:{}".format(logits))

print("summary:{}".format(topsentences_summary))


In [None]:
#Function to create answer lists required
def create_answer_df(test_df,summaries):
  """Return `(test_df['id'], test_df['type'], summaries)`; `summaries` is passed through unchanged."""
  return test_df['id'], test_df['type'], summaries

In [None]:
# Question ids and types come from test_df; the summaries are the strings built by the inference loop.
qid_test_df, type_test_df, summaries_test_df = create_answer_df(test_df, topsentences_summary)

In [None]:
#Join the answers in required format of BioASQ
# One record per question id; "exact_answer" is the fixed string "yes" for every question.
question_details = [
  {"id" : qid_test_df[position], "ideal_answer" : summaries_test_df[position], "exact_answer" : "yes"}
  for position in range(len(qid_test_df))
]

In [None]:
#Converting the data into a JSON file as needed.
import json

# Top-level object has a single "questions" key holding the per-question records.
x = {"questions" : question_details}

# Serializing json 
json_object = json.dumps(x, indent = 2)

# Write the serialized results to Drive.
with open('/content/drive/MyDrive/Thesis_B/BIOASQ_8b_batch1_bertbase_results_final.json', 'w') as outfile:
    outfile.write(json_object)