In [2]:
import json
import os
import pandas as pd
#!pip install simpletransformers

import time 
from simpletransformers.question_answering import QuestionAnsweringModel


In [3]:
# Train on SQuAD data so that we can get a QA model from BERT.
# The file is decoded explicitly as UTF-8: the paragraphs contain non-ASCII
# characters (e.g. en dashes in year ranges), and without an explicit encoding
# open() falls back to the platform's locale encoding.
with open('/home/jupyter/squad_data/train-v1.1.json', 'r', encoding='utf-8') as f:
    squad_json = json.load(f)

# Flatten topics -> paragraphs; each paragraph dict carries a 'context' and its 'qas'.
train_data = [item for topic in squad_json['data'] for item in topic['paragraphs']]

print(len(train_data))
print(train_data[10])

18896
{'context': "Father Joseph Carrier, C.S.C. was Director of the Science Museum and the Library and Professor of Chemistry and Physics until 1874. Carrier taught that scientific research and its promise for progress were not antagonistic to the ideals of intellectual and moral culture endorsed by the Church. One of Carrier's students was Father John Augustine Zahm (1851–1921) who was made Professor and Co-Director of the Science Department at age 23 and by 1900 was a nationally prominent scientist and naturalist. Zahm was active in the Catholic Summer School movement, which introduced Catholic laity to contemporary intellectual issues. His book Evolution and Dogma (1896) defended certain aspects of evolutionary theory as true, and argued, moreover, that even the great Church teachers Thomas Aquinas and Augustine taught something like it. The intervention of Irish American Catholics in Rome prevented Zahm's censure by the Vatican. In 1913, Zahm and former President Theodore Roosevel

In [4]:
# Flatten the SQuAD paragraphs into one row per paragraph.
# Only the first question of each paragraph, and that question's first answer, is kept.
first_qas = [paragraph['qas'][0] for paragraph in train_data]
context = [paragraph['context'] for paragraph in train_data]
question = [qa['question'] for qa in first_qas]
answer = [qa['answers'][0]['text'] for qa in first_qas]

df = pd.DataFrame({'context': context, 'question': question, 'answer': answer})
df.to_csv('/mnt/disks/mount/data/squad.csv', index=False)

# Show the first extracted answer, then preview the table.
print(answer[0])
display(df.head())

Saint Bernadette Soubirous


Unnamed: 0,context,question,answer
0,"Architecturally, the school has a Catholic cha...",To whom did the Virgin Mary allegedly appear i...,Saint Bernadette Soubirous
1,"As at most other universities, Notre Dame's st...",When did the Scholastic Magazine of Notre dame...,September 1876
2,The university is the major seat of the Congre...,Where is the headquarters of the Congregation ...,Rome
3,The College of Engineering was established in ...,How many BS level degrees are offered in the C...,eight
4,All of Notre Dame's undergraduate students are...,What entity provides help with the management ...,Learning Resource Center


In [7]:
# Options passed to QuestionAnsweringModel for the SQuAD fine-tuning run.
train_args = dict(
    # Where the trained model is written.
    output_dir='/home/jupyter/outputs_bert_base/',
    overwrite_output_dir=True,
    # Optimisation settings.
    learning_rate=3e-5,
    num_train_epochs=1,  # only 1 epoch for now; should be increased for real training
    train_batch_size=8,
    gradient_accumulation_steps=8,
    fp16=False,
    # Input / tokenisation settings.
    max_seq_length=512,
    doc_stride=128,
    do_lower_case=True,
    reprocess_input_data=False,
    lazy_loading=False,
    # Hardware and logging settings.
    n_gpu=8,
    process_count=30,
    silent=False,
    # Checkpointing settings.
    save_eval_checkpoints=False,
    save_model_every_epoch=False,
)

In [8]:
# Instantiate the model to fine-tune; after training it is stored in the output dir.
# Other (model_type, model_name) pairs that can be swapped in below:
#   ('bert', 'bert-large-uncased')
#   ('distilbert', 'distilbert-base-uncased')
#   ('roberta', 'roberta-base')
#   ('xlnet', 'xlnet-base-cased')
model = QuestionAnsweringModel(
    'bert',
    'bert-base-uncased',
    args=train_args,
    use_cuda=True,
)

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForQuestionAnswering: ['cls.predictions.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.decoder.weight', 'cls.seq_relationship.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias']
- This IS expected if you are initializing BertForQuestionAnswering from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForQuestionAnswering from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForQuestionAnswering were not initialized from the model checkpoint at bert-base-uncased a

In [9]:

# Fine-tune on the SQuAD paragraphs, printing a wall-clock timestamp before and after.
print(time.asctime())
t1 = time.time()
model.train_model(train_data)
print(time.asctime())

# Report how long the run took, in minutes.
t2 = time.time()
elapsed_minutes = (t2 - t1) / 60
print('Time Elapsed = ', elapsed_minutes, 'minutes')


Tue May 11 04:34:26 2021


convert squad examples to features: 100%|██████████| 87599/87599 [01:09<00:00, 1262.45it/s]  
add example index and unique id: 100%|██████████| 87599/87599 [00:00<00:00, 773167.31it/s]


HBox(children=(FloatProgress(value=0.0, description='Epoch', max=1.0, style=ProgressStyle(description_width='i…

HBox(children=(FloatProgress(value=0.0, description='Running Epoch 0 of 1', max=10969.0, style=ProgressStyle(d…





Tue May 11 06:43:22 2021
Time Elapsed =  128.92298378944398 minutes


In [10]:
def answering_machine(model, question, passage):
    """Ask ``model`` one question about ``passage`` and return its top answer.

    The question and passage are wrapped in a single-element list of
    ``{'context': ..., 'qas': [...]}`` dicts (with a placeholder id) and sent to
    ``model.predict`` with ``n_best_size=1``.

    Args:
        model: Object exposing ``predict(to_predict, n_best_size=...)`` whose
            result is indexable as ``(answers, probabilities)``.
        question: The question text.
        passage: The context text in which to look for the answer.

    Returns:
        A ``(score, answer)`` tuple: the first probability and the first answer
        reported for the single submitted question.
    """
    payload = [
        {
            'context': passage,
            'qas': [{'question': question, 'id': 'dummy_id'}],
        }
    ]

    prediction = model.predict(payload, n_best_size=1)
    answer_entries, probability_entries = prediction[0], prediction[1]

    # Only one question was submitted, so entry 0 is ours; take its first candidate.
    best_answer = answer_entries[0]['answer'][0]
    best_score = probability_entries[0]['probability'][0]
    return best_score, best_answer



In [23]:
# Sanity-check the SQuAD-trained model on a tiny hand-written passage.
p = 'my name is john. john lives in california'
q = 'what state I live'
answering_machine(model, question=q, passage=p)

convert squad examples to features: 100%|██████████| 1/1 [00:00<00:00, 219.82it/s]
add example index and unique id: 100%|██████████| 1/1 [00:00<00:00, 4144.57it/s]


HBox(children=(FloatProgress(value=0.0, description='Running Prediction', max=1.0, style=ProgressStyle(descrip…




(0.999999972737221, 'california')

In [12]:
# Training with custom data so that we can get a domain fine-tuned QA model from BERT.
# Sample domain-specific data as (context, question, answer) records; replace with your own.
custom_examples = [
    (
        "John is a 10 year old boy. He is the son of Robert Smith.  Elizabeth Davis is Robert's wife. She teaches at UC Berkeley. Sophia Smith is Elizabeth's daughter. She studies at UC Davis",
        "Which college does John's sister attend",
        "UC Davis",
    ),
    (
        " Apple has told employees it'll provide them with paid time off to vote in the US presidential election on Nov. 3, according to a report. Workers, who wish to vote that Tuesday will be given up to four hours of pay, Bloomberg reported Friday citing an internal Apple memo.",
        "On what date we have Election Day",
        "Nov. 3",
    ),
    (
        "BERT, which is an acronym for Bi-directional Encoder Representation from Transformer, is a state of the art language model which can be used for various natural language processing (NLP) tasks. My objective is to introduce BERT at a high level, and enable you to create practical applications using BERT. You need to have basic knowledge of Python as well as a basic idea of machine learning.",
        "What is full form of BERT",
        "Bi-directional Encoder Representation from Transformer",
    ),
]

# Split the records back into the three parallel columns of the frame.
contexts, questions, answers = (list(column) for column in zip(*custom_examples))

df = pd.DataFrame({'context': contexts, 'question': questions, 'answer': answers})
df.head()

Unnamed: 0,context,question,answer
0,John is a 10 year old boy. He is the son of Ro...,Which college does John's sister attend,UC Davis
1,Apple has told employees it'll provide them w...,On what date we have Election Day,Nov. 3
2,"BERT, which is an acronym for Bi-directional E...",What is full form of BERT,Bi-directional Encoder Representation from Tra...


In [13]:
def _find_answer_start(row):
    """Return the character offset of the row's answer in its context (-1 if absent)."""
    return str(row.context).find(str(row.answer))


df['answer_start'] = df.apply(_find_answer_start, axis=1)

# Drop rows whose answer text does not occur in the context (offset -1).
found_mask = df['answer_start'] != -1
df = df[found_mask]
print(df.shape)

# Renumber the surviving rows 0..n-1.
df = df.reset_index(drop=True)
df.head()

(3, 4)


Unnamed: 0,context,question,answer,answer_start
0,John is a 10 year old boy. He is the son of Ro...,Which college does John's sister attend,UC Davis,174
1,Apple has told employees it'll provide them w...,On what date we have Election Day,Nov. 3,107
2,"BERT, which is an acronym for Bi-directional E...",What is full form of BERT,Bi-directional Encoder Representation from Tra...,30


In [14]:
# Convert each row into the nested record format fed to the model for training:
# one context holding a single question with a single answer span.
# - Rows are walked with itertuples/enumerate, so ids are the row positions and
#   nothing depends on the frame's index labels being exactly 0..n-1.
# - answer_start is cast to a built-in int: values pulled out of a DataFrame may
#   be NumPy scalars, which the json module cannot serialise and which differ
#   from the plain ints found in the SQuAD JSON.
df_list = [
    {
        'context': row.context,
        'qas': [
            {
                'answers': [{'answer_start': int(row.answer_start), 'text': row.answer}],
                'question': row.question,
                'id': position,
            }
        ],
    }
    for position, row in enumerate(df.itertuples(index=False))
]

print(df_list[0])

{'context': "John is a 10 year old boy. He is the son of Robert Smith.  Elizabeth Davis is Robert's wife. She teaches at UC Berkeley. Sophia Smith is Elizabeth's daughter. She studies at UC Davis", 'qas': [{'answers': [{'answer_start': 174, 'text': 'UC Davis'}], 'question': "Which college does John's sister attend", 'id': 0}]}


In [18]:
# Options for the second fine-tuning pass on the small custom dataset.
train_args = {
    'output_dir': 'outputs_bert_custom/',
    'learning_rate': 3e-5,
    'num_train_epochs': 1,
    'max_seq_length': 512,
    'doc_stride': 128,
    'overwrite_output_dir': True,
    # Always rebuild features for the custom data. With False, a previously
    # cached feature file may be reused, so edits to the sample contexts,
    # questions or answers could be silently ignored on a re-run. Reprocessing
    # a handful of examples costs next to nothing.
    'reprocess_input_data': True,
    'train_batch_size': 1,
    'gradient_accumulation_steps': 1,
    'do_lower_case': True,
    'silent': False,
    'n_gpu': 4,
    'process_count': 10,
    'fp16': False,
    'save_eval_checkpoints': False,
    'save_model_every_epoch': False,
    'lazy_loading': False,
}

In [19]:
# Start from the checkpoint directory that the SQuAD training run wrote to.
model_custom = QuestionAnsweringModel(
    'bert',
    '/home/jupyter/outputs_bert_base/',
    args=train_args,
    use_cuda=True,
)

In [20]:
# Fine-tune on the custom records, printing a wall-clock timestamp before and after.
print(time.asctime())
t1 = time.time()
model_custom.train_model(df_list)
print(time.asctime())

# Report how long the run took, in minutes.
t2 = time.time()
elapsed_minutes = (t2 - t1) / 60
print('Time Elapsed = ', elapsed_minutes, 'minutes')

Tue May 11 14:17:20 2021


convert squad examples to features: 100%|██████████| 3/3 [00:00<00:00, 117.50it/s]
add example index and unique id: 100%|██████████| 3/3 [00:00<00:00, 13329.36it/s]


HBox(children=(FloatProgress(value=0.0, description='Epoch', max=1.0, style=ProgressStyle(description_width='i…

HBox(children=(FloatProgress(value=0.0, description='Running Epoch 0 of 1', max=3.0, style=ProgressStyle(descr…





Tue May 11 14:17:28 2021
Time Elapsed =  0.12594921986262003 minutes


In [24]:
# Sanity-check the model again after training on the custom data.
p = 'my name is john. john lives in california'
q = 'what state I live'
answering_machine(model_custom, question=q, passage=p)

convert squad examples to features: 100%|██████████| 1/1 [00:00<00:00, 235.13it/s]
add example index and unique id: 100%|██████████| 1/1 [00:00<00:00, 4490.69it/s]


HBox(children=(FloatProgress(value=0.0, description='Running Prediction', max=1.0, style=ProgressStyle(descrip…




(0.9999999742049577, 'california')