In [None]:
# Notebook setup cell: install / upgrade packages in the current environment.
# The leading "!" is IPython shell-escape syntax, so these lines only run
# inside a notebook (IPython/Jupyter/Colab), not as plain Python.
!pip install pymysql
!pip install --upgrade sagemaker
!pip install urllib3 --upgrade
!pip install boto3 --upgrade
# Disabled alternative: uninstall and reinstall botocore instead of upgrading.
#!pip uninstall botocore
#!pip install botocore
!pip install botocore --upgrade

#file name :QnaModel.py
import pymysql 
import pandas as pd
from os import path
import json
from sklearn.model_selection import train_test_split
import os
from DataFetchServices import *
from modelServices import *
from modelPredict import *
class QnAModel():
    """Thin orchestration layer around the BERT question-answering helpers.

    The heavy lifting is delegated to functions that the file star-imports
    (``create_datasets``, ``fine_tune_qna_bert``, ``QnA1``, ``pickle_save``);
    this class wires them together and prepares SQuAD-style JSON data.
    """

    def __init__(self):
        """Make sure the ``squad`` working directory exists."""
        # exist_ok replaces the former bare ``except`` that reported *every*
        # failure (permissions, invalid path, ...) as "Folder already exists".
        os.makedirs('squad', exist_ok=True)

    def fetch_data_from_db(self):
        """Run ``create_datasets()``.

        NOTE(review): presumably this writes the train/test JSON files that
        ``data_prep_for_model`` reads -- confirm against DataFetchServices.
        """
        create_datasets()

    @staticmethod
    def _read_squad(path):
        """Flatten one SQuAD-style JSON file into parallel lists.

        Returns a dict with keys ``contexts``, ``questions`` and ``answers``;
        one entry is emitted per answer, so a context/question is repeated
        for every answer attached to it.
        """
        with open(path, 'rb') as f:
            squad_dict = json.load(f)

        contexts = []
        questions = []
        answers = []
        for group in squad_dict['data']:
            for passage in group['paragraphs']:
                context = passage["context"]
                for qa in passage['qas']:
                    question = qa["question"]
                    for answer in qa["answers"]:
                        contexts.append(context)
                        questions.append(question)
                        answers.append(answer)
        return {"contexts": contexts, "questions": questions, "answers": answers}

    @staticmethod
    def _add_end_index(answers, contexts):
        """Add ``answer_end`` to each answer dict, in place.

        Each answer looks like ``{"answer_start": 42, "text": "senco"}``.
        If the text is not found exactly at ``answer_start``, the start is
        tried one and then two characters earlier and ``answer_start`` is
        corrected to the first (closest) position that matches.

        Answers that match at none of the three positions are left
        unchanged, i.e. without an ``answer_end`` key.
        """
        for answer, context in zip(answers, contexts):
            gold_text = answer['text']
            start_idx = answer['answer_start']
            end_idx = start_idx + len(gold_text)

            for shift in (0, 1, 2):
                start = start_idx - shift
                if start < 0:
                    # A negative start would make the slice wrap around to
                    # the end of the context; there is nothing left to try.
                    break
                if context[start:end_idx - shift] == gold_text:
                    answer['answer_start'] = start
                    answer['answer_end'] = end_idx - shift
                    # Stop at the closest match instead of letting a later,
                    # larger shift overwrite it.
                    break

    # prepare train and test data to train bert
    def data_prep_for_model(self, train_path="./squad/training_data1.json", test_path="./squad/test_data1.json"):
        """Load the train and test SQuAD JSON files and add end indices.

        Args:
            train_path: path of the training JSON file.
            test_path: path of the test JSON file.

        Returns:
            ``(train_dataset, test_dataset)``; each is a dict with the keys
            ``contexts``, ``questions`` and ``answers``.
        """
        train_dataset = self._read_squad(train_path)
        test_dataset = self._read_squad(test_path)

        self._add_end_index(train_dataset["answers"], train_dataset["contexts"])
        self._add_end_index(test_dataset["answers"], test_dataset["contexts"])
        return train_dataset, test_dataset

    # fine tune bert on the train data
    def fine_tune_train(self, train_dataset, test_dataset, model_name='bert-base-uncased', tokenizer_name='bert-base-uncased', epochs=10, number_of_rows_data=2000):
        """Fine-tune via ``fine_tune_qna_bert`` and return its result.

        The arguments are now forwarded; previously they were ignored and
        fixed values (``'bert-base-uncased'``, ``epochs=3``, ``2000`` rows)
        were always used.

        NOTE(review): the first two positional arguments of
        ``fine_tune_qna_bert`` are assumed to be (model name, tokenizer
        name) in that order -- confirm against modelServices.

        Returns:
            Whatever ``fine_tune_qna_bert`` returns; ``load_model_on_the_fly``
            unpacks it as ``(model, test_dataset, device, tokenizer)``.
        """
        return fine_tune_qna_bert(
            model_name,
            tokenizer_name,
            epochs=epochs,
            train_dataset=train_dataset,
            test_dataset=test_dataset,
            number_of_rows_data=number_of_rows_data,
        )

    # predict the company name from the daily IPO data by reading title and text
    def predict_on_dataframe(self, input_dir, output_dir, tokenizer, device, myModel):
        """Run prediction by delegating to ``QnA1`` and return its result."""
        return QnA1(input_dir, output_dir, tokenizer, device, myModel)

    # save the pickle model
    def save(self, model, model_path, model_name):
        """Persist ``model`` by delegating to ``pickle_save``."""
        return pickle_save(model, model_path, model_name)

    # load the pickle model
    def load(self, model_path, device):
        """Unpickle a model from ``model_path`` and move it to ``device``.

        SECURITY: ``pickle.load`` executes arbitrary code contained in the
        file; only load model files from a trusted source.
        """
        # Imported here because the file does not import pickle at module
        # level (it was only imported inside another method).
        import pickle

        with open(model_path, "rb") as newFile:
            myModel = pickle.load(newFile)
        myModel.to(device)
        return myModel

    @staticmethod
    def _pip_install(package):
        """Install ``package`` with pip for the running interpreter.

        Stands in for the notebook-only ``!pip install`` shell escape; like
        that escape, a failed install does not raise.
        """
        import subprocess
        import sys

        subprocess.run([sys.executable, '-m', 'pip', 'install', package], check=False)

    def load_model_on_the_fly(self):
        """Create the datasets, fine-tune BERT for one epoch, then predict."""
        self.fetch_data_from_db()

        # prepare the data for bert
        train_dataset, test_dataset = self.data_prep_for_model()

        # Use ``self`` rather than the module-level name ``bert``, which only
        # exists when the file is run as a script.
        model, test_dataset_for_model, device, tokenizer = self.fine_tune_train(
            train_dataset=train_dataset,
            test_dataset=test_dataset,
            model_name='bert-base-uncased',
            tokenizer_name='bert-base-uncased',
            epochs=1,
            number_of_rows_data=2000,
        )

        self.predict_on_dataframe("", "", tokenizer, device, model)

    def use_pretrained_model(self):
        """Load the pickled model from Google Drive (Colab) and predict.

        Requires a Colab runtime: mounts Drive at ``/content/drive`` and
        copies ``My Drive/bert-base-uncased.pkl`` into the working directory.
        """
        import shutil

        from google.colab import drive
        drive.mount("/content/drive")

        # The pkl file must be reachable from "My Drive" (e.g. via
        # "Add shortcut to Drive"). shutil.copy replaces the ``!cp`` shell
        # escape so the module is valid Python; a missing file now raises
        # instead of only printing a shell error.
        filename = 'bert-base-uncased.pkl'
        shutil.copy('/content/drive/My Drive/bert-base-uncased.pkl', filename)

        self._pip_install('torch')
        # The original installed "tdqm", a misspelling of tqdm.
        self._pip_install('tqdm')

        import torch
        from transformers import BertTokenizerFast

        tokenizer_name = 'bert-base-uncased'
        tokenizer = BertTokenizerFast.from_pretrained(tokenizer_name)
        device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')

        model = self.load(filename, device)

        self.predict_on_dataframe("", "", tokenizer, device, model)


if __name__ == "__main__":
    # Script entry point. Constructing QnAModel runs __init__, which prepares
    # the 'squad' directory.
    # NOTE(review): methods of QnAModel may look this instance up through the
    # module-level name `bert`; keep the name unless they are confirmed to
    # use `self` only.
    bert = QnAModel()

    # Option 1 (disabled): train the model first and then predict.

    #bert.load_model_on_the_fly()
    
    # Option 2: use the pretrained model and predict.
    bert.use_pretrained_model()

        
    


In [None]:
# Separate notebook cell: quietly install kora ("!" is an IPython shell
# escape), then call its drive.link_nbs() helper.
# NOTE(review): presumably this makes notebooks stored on Drive importable
# from this notebook -- confirm against kora's documentation.
!pip install kora -q
from kora import drive
drive.link_nbs()