In [1]:
!pip install tokenizers
!pip install transformers
!pip install tensorflow

Collecting protobuf<3.20,>=3.9.2
  Downloading protobuf-3.19.6-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.1 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.1/1.1 MB[0m [31m22.1 MB/s[0m eta [36m0:00:00[0m00:01[0m
Installing collected packages: protobuf
  Attempting uninstall: protobuf
    Found existing installation: protobuf 3.20.3
    Uninstalling protobuf-3.20.3:
      Successfully uninstalled protobuf-3.20.3
[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
onnx 1.13.1 requires protobuf<4,>=3.20.2, but you have protobuf 3.19.6 which is incompatible.
kfp 1.8.20 requires google-api-python-client<2,>=1.7.8, but you have google-api-python-client 2.86.0 which is incompatible.
kfp 1.8.20 requires PyYAML<6,>=5.3, but you have pyyaml 6.0 which is incompatible.
gcsfs 2023.3.0 requires fsspec==2023.3.0, but you have fs

In [2]:
import os
import re
import json
import string
import numpy as np
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
from tokenizers import BertWordPieceTokenizer
from transformers import BertTokenizer, TFBertModel, BertConfig
from tqdm import tqdm as tqdm

max_len = 384
configuration = BertConfig()



In [3]:
# calling and saving tokenizer
called_tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
save_path = "bert_base_uncased/"
if not os.path.exists(save_path):
    os.makedirs(save_path)
called_tokenizer.save_pretrained(save_path)


tokenizer = BertWordPieceTokenizer("bert_base_uncased/vocab.txt", lowercase=True)

Downloading (…)solve/main/vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading (…)okenizer_config.json:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

In [4]:
def create_model():
    # Load pre-trained BERT model
    encoder = TFBertModel.from_pretrained("bert-base-uncased")

    # Exclude pooler layer from optimization
    encoder.layers[-1].pooler.trainable = False

    # Define input layers
    input_ids = layers.Input(shape=(max_len,), dtype=tf.int32)
    token_type_ids = layers.Input(shape=(max_len,), dtype=tf.int32)
    attention_mask = layers.Input(shape=(max_len,), dtype=tf.int32)

    # Obtain BERT embeddings
    embedding = encoder(
        input_ids=input_ids,
        token_type_ids=token_type_ids,
        attention_mask=attention_mask
    )[0]

    # Define output layers
    start_logits = layers.Dense(1, name="start_logit")(embedding)
    end_logits = layers.Dense(1, name="end_logit")(embedding)

    # Flatten output layers
    start_logits = layers.Flatten()(start_logits)
    end_logits = layers.Flatten()(end_logits)

    # Apply activation functions to output layers
    start_probs = layers.Activation(keras.activations.softmax, name="start_prob")(start_logits)
    end_probs = layers.Activation(keras.activations.softmax, name="end_prob")(end_logits)

    # Define and compile model
    model = keras.Model(
        inputs=[input_ids, token_type_ids, attention_mask],
        outputs=[start_probs, end_probs]
    )
    loss = keras.losses.SparseCategoricalCrossentropy(from_logits=False)
    optimizer = keras.optimizers.Adam(learning_rate=5e-5)
    model.compile(optimizer=optimizer, loss=[loss, loss])

    return model

In [5]:
model = create_model()

Downloading tf_model.h5:   0%|          | 0.00/536M [00:00<?, ?B/s]

Some layers from the model checkpoint at bert-base-uncased were not used when initializing TFBertModel: ['mlm___cls', 'nsp___cls']
- This IS expected if you are initializing TFBertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFBertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
All the layers of TFBertModel were initialized from the model checkpoint at bert-base-uncased.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFBertModel for predictions without further training.


#Have uploaded the weights from previous notebook to google drive
from google.colab import drive
drive.mount('/content/drive')
#copying 
!cp "/content/drive/My Drive/reader/weights.h5" "weights.h5"



In [6]:
#for kaggle (make sure to upload the weights in kaggle before hand )
print(os.listdir("../input"))

['document', 'constitutionofindiasquadformat', 'weights']


In [7]:
#for kaggle
model.load_weights('/kaggle/input/weights/finetuned_epoch10_squad.h5')

# for google collab
model.load_weights("weights.h5")

In [8]:
def create_input(question, context):

    context = " ".join(str(context).split())
    question = " ".join(str(question).split())

    tokenized_context = tokenizer.encode(context)
    tokenized_question = tokenizer.encode(question)


    input_ids = tokenized_context.ids + tokenized_question.ids[1:]
    token_type_ids = [0] * len(tokenized_context.ids) + [1] * len(tokenized_question.ids[1:])

    attention_mask = [1] * len(input_ids)

    padding_length = max_len - len(input_ids)

    if padding_length > 0:  
        input_ids = input_ids + ([0] * padding_length)
        attention_mask = attention_mask + ([0] * padding_length)
        token_type_ids = token_type_ids + ([0] * padding_length)

    return [np.array([input_ids]), np.array([token_type_ids]), np.array([attention_mask])] , tokenized_context.offsets 

In [9]:
def get_answer(model_output, offsets, context):

    start_pred, end_pred = model_output

    start = np.argmax(start_pred)
    end = np.argmax(end_pred)
    

    if(start > end):
        end = np.argmax(end_pred[:,start:]) + start

    pred_char_start = offsets[start][0]
    pred_char_end = offsets[end][1]
    pred_ans = context[pred_char_start:pred_char_end]

    return pred_ans

In [10]:
def get_json_file(path):
    f = open(path)
    return json.load(f)

In [11]:
#loading json files into the notebook
raw_train_data = get_json_file('/kaggle/input/constitutionofindiasquadformat/constitution_train.json')
raw_eval_data = get_json_file('/kaggle/input/constitutionofindiasquadformat/constitution_eval.json')

In [12]:
raw_eval_data

{'data': [{'paragraphs': [{'qas': [{'question': 'On what basis elections to the House of the People and to the Legislative Assembly of every State shall be?\n',
       'id': 918148,
       'answers': [{'answer_id': 806311,
         'document_id': 1447678,
         'question_id': 918148,
         'text': 'adult suffrage;',
         'answer_start': 118,
         'answer_end': 133,
         'answer_category': None}],
       'is_impossible': False},
      {'question': 'After what age is a person is eligible to vole?',
       'id': 918149,
       'answers': [{'answer_id': 806312,
         'document_id': 1447678,
         'question_id': 918149,
         'text': 'eighteen years',
         'answer_start': 216,
         'answer_end': 230,
         'answer_category': None}],
       'is_impossible': False}],
     'context': '326. The elections to the House of the People and to the Legislative Assembly of every State shall be on the basis of adult suffrage; that is to say, every person who is a ci

In [13]:
corpus = []

for i in range( len(raw_eval_data["data"]) ):
    for j in range( len(raw_eval_data["data"][i]["paragraphs"])):

        corpus.append(raw_eval_data["data"][i]["paragraphs"][j]["context"])
        

In [14]:
for i in range( len(raw_train_data["data"]) ):
    for j in range( len(raw_train_data["data"][i]["paragraphs"])):

        corpus.append(raw_train_data["data"][i]["paragraphs"][j]["context"])

In [15]:
import spacy
import re

nlp = spacy.load('en_core_web_sm')

def lowercase(string):
    return string.lower()

def remove_non_alpha(string):
    return re.sub('[^a-z]', ' ', string)

def remove_extra_spaces(string):
    return re.sub(' +', ' ', string)

def tokenize(string):
    doc = nlp(string)
    return [token for token in doc]

def lemmatize(tokens):
    return [token.lemma_.lower() for token in tokens]

def preprocess(string):
    string = lowercase(string)
    string = remove_non_alpha(string)
    string = remove_extra_spaces(string)
    tokens = tokenize(string)
    tokens = lemmatize(tokens)
    return tokens

In [16]:
processed = []

for i in corpus:
    processed.append(preprocess(i))

In [17]:
!pip install rank_bm25

from rank_bm25 import BM25Okapi

bm25 = BM25Okapi(processed)

Collecting rank_bm25
  Downloading rank_bm25-0.2.2-py3-none-any.whl (8.6 kB)
Installing collected packages: rank_bm25
Successfully installed rank_bm25-0.2.2
[0m

In [18]:
def read_file(filename):
    with open(filename, 'r') as f:
        return f.read()

def split_documents(string, separator):
    return string.split(separator)

def remove_newlines(document):
    return re.sub('\n', ' ', document).strip()

def preprocess_corpus(corp, separator):
    preprocessed_corpus = corpus
    for document in corp:
        document = remove_newlines(document)
        if document.startswith(' '):
            document = document[1:]
        preprocessed_corpus.append(document)
    return preprocessed_corpus

def create_corpus_from_file(filename, separator='/----/'):
    string = read_file(filename)
    corp = split_documents(string, separator)
    preprocessed_corpus = preprocess_corpus(corp, separator)
    return preprocessed_corpus

In [19]:
new_corpus=create_corpus_from_file('/kaggle/input/document/documentCOI2.txt')

In [24]:
processed = []

for i in new_corpus:
    processed.append(preprocess(i))

In [25]:
bm25 = BM25Okapi(processed)

In [26]:
def get_top_context(query):
    return bm25.get_top_n(query, new_corpus, n=1)[0]

In [27]:
def ask_question(question):
    query = preprocess(question)
    context = get_top_context(query)

    input, offsets = create_input(question, context)
    output = model.predict(input)

    answer = get_answer(output, offsets, context)

    print("answer:", answer)
    print("context from corpus:", context)

In [28]:
ask_question("Lunguanges in which member can express themselves in parliament")

answer:  Chairman of the Council of States or Speaker of the House of the People, or person acting as such, as the case may be, may permit any member who cannot adequately express himself in Hindi or in English to address the House in his mother-tongu
context from corpus: 120. (1) Notwithstanding anything in  Part XVII, but subject to the provisions of article 348, business in Parliament shall be transacted in Hindi or in English: Provided that the Chairman of the Council of States or Speaker of the House of the People, or person acting as such, as the case may be, may permit any member who cannot adequately express himself in Hindi or in English to address the House in his mother-tongue. (2) Unless Parliament by law otherwise provides, this article shall, after the expiration of a period of fifteen years from the commencement of this Constitution, have effect as if the words "or in English" were omitted therefrom.
