In [1]:
import os, sys
from google.colab import drive
# Mount Google Drive and expose the "Colab Notebooks" folder under a
# space-free path that can be put on the import path.
drive.mount('/content/drive')
nb_path = '/content/notebooks'
# os.symlink raises FileExistsError when the link is already there, which
# made this cell fail on every re-run within the same runtime.
if not os.path.lexists(nb_path):
  os.symlink('/content/drive/My Drive/Colab Notebooks', nb_path)
# Avoid stacking duplicate entries on sys.path when the cell is re-executed.
if nb_path not in sys.path:
  sys.path.insert(0,nb_path)

Mounted at /content/drive


In [None]:
# Install transformers into nb_path (the directory symlinked to the Drive
# "Colab Notebooks" folder) rather than into the runtime's site-packages.
!pip install --target=$nb_path transformers

In [1]:
import os, sys
# Put the notebooks directory at the front of the import path so packages
# installed there with `pip --target` are importable.
# NOTE(review): presumably run after a runtime restart instead of the
# mount/symlink cell - confirm the Drive mount is still required first.
nb_path = '/content/notebooks'
sys.path.insert(0,nb_path)

In [2]:
import pandas as pd
import os
import re
import nltk
import string
from transformers import AutoTokenizer, AutoModelForQuestionAnswering
from transformers import pipeline
import warnings
warnings.filterwarnings('ignore')

In [57]:
class QnA:
  """Extractive question answering built on a Hugging Face pipeline.

  The instance keeps the tokenizer, the model and the ready-to-call
  ``question-answering`` pipeline (``self.qna``). Helper methods load a
  context from disk, normalise its whitespace, answer questions against
  it and save the tokenizer/model to disk.
  """

  # Checkpoint used when no explicit model name is given.
  DEFAULT_MODEL = "bert-large-uncased-whole-word-masking-finetuned-squad"

  def __init__(self, model_name=DEFAULT_MODEL):
    """Load tokenizer, model and pipeline for ``model_name``.

    Loading is best-effort: on failure the error is printed and
    ``tokenizer``, ``model`` and ``qna`` are left as ``None`` so the
    object is always in a well-defined state.

    Args:
      model_name: Hugging Face model id or local path of a
        question-answering checkpoint.
    """
    self.tokenizer = None
    self.model = None
    self.qna = None
    self.context = None
    self.temp = None

    try:
      self.tokenizer = AutoTokenizer.from_pretrained(model_name)
      self.model = AutoModelForQuestionAnswering.from_pretrained(model_name)
      self.qna = pipeline('question-answering', model=self.model, tokenizer=self.tokenizer)
      print("Model Loaded")
    except Exception as exc:
      # Catch Exception (not a bare except) so KeyboardInterrupt/SystemExit
      # still propagate, and report the cause instead of hiding it.
      self.tokenizer = self.model = self.qna = None
      print(f"Error Loading Model: {exc!r}")

  def data_load(self,path:str):
    """Read a context from ``path`` and remember it in ``self.context``.

    Supported extensions (case-insensitive): ``.csv``, ``.xls``/``.xlsx``,
    ``.json`` (returned as pandas objects) and ``.txt`` (returned as a
    string).

    Args:
      path: Location of the file to read.

    Returns:
      The loaded content, also stored in ``self.context``.

    Raises:
      ValueError: If the file extension is not supported.
    """
    lowered = path.lower()

    if lowered.endswith(".csv"):
      self.context = pd.read_csv(path,encoding='latin-1')

    elif lowered.endswith((".xls", ".xlsx")):
      # read_excel has no `encoding` keyword; passing one raises TypeError.
      self.context = pd.read_excel(path)

    elif lowered.endswith(".json"):
      self.context = pd.read_json(path)

    elif lowered.endswith(".txt"):
      # Context manager guarantees the file handle is closed.
      with open(path) as handle:
        self.context = handle.read()

    else:
      raise ValueError(
          f"Unsupported file type for {path!r}; expected .csv, .xls, .xlsx, .json or .txt")

    return self.context

  def cleaning_text(self,text:str)->str:
    """Strip surrounding whitespace and collapse newline runs to one space.

    Args:
      text: Raw passage.

    Returns:
      The cleaned passage, also stored in ``self.temp``.
    """
    self.temp = re.sub("\n+"," ",text.strip())

    return self.temp

  def predict_sol(self,context,ques):
    """Answer one question or a list/tuple of questions against ``context``.

    Args:
      context: Passage handed to the pipeline as the ``context`` field.
      ques: A single question, or a list/tuple of questions.

    Returns:
      A list with one ``{'ques': ..., 'ans': ...}`` dict per question, or
      ``None`` when the pipeline call fails (the error is printed).
    """
    questions = list(ques) if isinstance(ques, (list, tuple)) else [ques]

    try:
      return [
          {'ques': item, 'ans': self.qna({'question': item, 'context': context})['answer']}
          for item in questions
      ]
    except Exception as exc:
      # Keep the historical "return None on failure" contract, but make the
      # failure visible instead of swallowing it silently.
      print(f"Error answering question: {exc!r}")
      return None

  def save_model(self,
                 tokenizer_dir='/content/drive/MyDrive/tokenizer/',
                 model_dir='/content/drive/MyDrive/QA/'):
    """Save the tokenizer and the model with ``save_pretrained``.

    Requires a successfully loaded model.

    Args:
      tokenizer_dir: Destination directory for the tokenizer files.
      model_dir: Destination directory for the model files.
    """
    self.tokenizer.save_pretrained(tokenizer_dir)
    self.model.save_pretrained(model_dir)

In [58]:
# Instantiating QnA loads the tokenizer, model and pipeline in __init__ and
# prints "Model Loaded" on success.
model = QnA()

Model Loaded


In [59]:
# Each question is wrapped in its own one-element list, so predict_sol takes
# its list branch for every entry.
question = [["What is present in the orbitals of atoms?"],
            ["Of the two countries that produce soybeans, which country is clearing rain forest in order to increase production?"]]
            
# Passages to answer from; context[k] is paired with question[k] by the zip
# in the prediction cell.
context = ["""
This combination of cancellations and σ and π overlaps results in dioxygen’s double bond character and reactivity, and a triplet electronic 
ground state. An electron configuration with two unpaired electrons as found in dioxygen (see the filled π* orbitals in the diagram), 
orbitals that are of equal energy—i.e., degenerate—is a configuration termed a spin triplet state. Hence, the ground state of the O2 
molecule is referred to as triplet oxygen.[b] The highest energy, partially filled orbitals are antibonding, and so their filling weakens 
the bond order from three to two. Because of its unpaired electrons, triplet oxygen reacts only slowly with most organic molecules, which 
have paired electron spins; this prevents spontaneous combustion.
""","""
Between 1991 and 2000, the total area of forest lost in the Amazon rose from 415,000 to 587,000 square kilometres (160,000 to 227,000 sq mi), 
with most of the lost forest becoming pasture for cattle. Seventy percent of formerly forested land in the Amazon, and 91% of land deforested 
since 1970, is used for livestock pasture. Currently, Brazil is the second-largest global producer of soybeans after the United States. 
New research however, conducted by Leydimere Oliveira et al., has shown that the more rainforest is logged in the Amazon, the less 
precipitation reaches the area and so the lower the yield per hectare becomes. So despite the popular perception, there has been no 
economical advantage for Brazil from logging rainforest zones and converting these to pastoral fields."""]

In [60]:
# Run every raw passage through the model's whitespace clean-up, keeping order.
context_cleaned = [model.cleaning_text(passage) for passage in context]
context_cleaned

['This combination of cancellations and σ and π overlaps results in dioxygen’s double bond character and reactivity, and a triplet electronic  ground state. An electron configuration with two unpaired electrons as found in dioxygen (see the filled π* orbitals in the diagram),  orbitals that are of equal energy—i.e., degenerate—is a configuration termed a spin triplet state. Hence, the ground state of the O2  molecule is referred to as triplet oxygen.[b] The highest energy, partially filled orbitals are antibonding, and so their filling weakens  the bond order from three to two. Because of its unpaired electrons, triplet oxygen reacts only slowly with most organic molecules, which  have paired electron spins; this prevents spontaneous combustion.',
 'Between 1991 and 2000, the total area of forest lost in the Amazon rose from 415,000 to 587,000 square kilometres (160,000 to 227,000 sq mi),  with most of the lost forest becoming pasture for cattle. Seventy percent of formerly forested la

In [61]:
# Answer each question group against the passage at the same position.
ans = [
  model.predict_sol(passage, ques)
  for passage, ques in zip(context_cleaned, question)
]

In [62]:
# Display the collected answers: one list of {'ques', 'ans'} dicts per passage.
ans

[[{'ans': 'two unpaired electrons',
   'ques': 'What is present in the orbitals of atoms?'}],
 [{'ans': 'Brazil',
   'ques': 'Of the two countries that produce soybeans, which country is clearing rain forest in order to increase production?'}]]

In [63]:
# Save the tokenizer and model via save_pretrained to the Drive folders
# hard-coded in QnA.save_model.
model.save_model()