In [1]:
!pip install tokenizers
!pip install transformers
!pip install tensorflow

Collecting tokenizers
  Downloading tokenizers-0.13.3-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (7.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.8/7.8 MB[0m [31m45.3 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
[?25hInstalling collected packages: tokenizers
Successfully installed tokenizers-0.13.3
You should consider upgrading via the '/usr/local/bin/python -m pip install --upgrade pip' command.[0m[33m
[0mCollecting transformers
  Downloading transformers-4.28.0-py3-none-any.whl (7.0 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.0/7.0 MB[0m [31m44.6 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
Collecting regex!=2019.12.17
  Downloading regex-2023.3.23-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (771 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m771.9/771.9 KB[0m [31m43.4 MB/s[0m eta [36m0:00:00[0m
Collecting huggingface-hub<1.0,>=0.11.0
  Downloading huggingface_hub-0.13

In [2]:
import os
import re
import json
import string
import numpy as np
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
from tokenizers import BertWordPieceTokenizer
from transformers import BertTokenizer, TFBertModel, BertConfig
from tqdm import tqdm as tqdm

# Fixed sequence length: the width of every model input layer and the length
# tokenized inputs are zero-padded to.
max_len = 384
# Default BERT configuration object. Nothing else visible in this notebook reads
# it — NOTE(review): confirm whether it is still needed.
configuration = BertConfig()

D0414 06:21:06.039469008      14 config.cc:119]                        gRPC EXPERIMENT tcp_frame_size_tuning               OFF (default:OFF)
D0414 06:21:06.039524513      14 config.cc:119]                        gRPC EXPERIMENT tcp_rcv_lowat                       OFF (default:OFF)
D0414 06:21:06.039528171      14 config.cc:119]                        gRPC EXPERIMENT peer_state_based_framing            OFF (default:OFF)
D0414 06:21:06.039531038      14 config.cc:119]                        gRPC EXPERIMENT flow_control_fixes                  ON  (default:ON)
D0414 06:21:06.039533291      14 config.cc:119]                        gRPC EXPERIMENT memory_pressure_controller          OFF (default:OFF)
D0414 06:21:06.039535748      14 config.cc:119]                        gRPC EXPERIMENT unconstrained_max_quota_buffer_size OFF (default:OFF)
D0414 06:21:06.039538096      14 config.cc:119]                        gRPC EXPERIMENT new_hpack_huffman_decoder           ON  (default:ON)
D0414 06:21:06.

In [3]:
# Download the pre-trained tokenizer and save its files locally; the fast
# WordPiece tokenizer below is built from the vocab.txt expected in that folder.
called_tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
save_path = "bert_base_uncased/"
# exist_ok=True keeps the cell safe to re-run without a separate existence check.
os.makedirs(save_path, exist_ok=True)
called_tokenizer.save_pretrained(save_path)

# Derive the vocabulary path from save_path so the two locations cannot drift apart.
tokenizer = BertWordPieceTokenizer(os.path.join(save_path, "vocab.txt"), lowercase=True)

Downloading (…)solve/main/vocab.txt: 100%|██████████| 232k/232k [00:00<00:00, 5.41MB/s]
Downloading (…)okenizer_config.json: 100%|██████████| 28.0/28.0 [00:00<00:00, 4.22kB/s]
Downloading (…)lve/main/config.json: 100%|██████████| 570/570 [00:00<00:00, 325kB/s]


In [4]:
def create_model():
    """Assemble and compile the BERT span-prediction model.

    A pre-trained ``bert-base-uncased`` encoder receives three int32 inputs of
    length ``max_len`` (token ids, token type ids, attention mask). The first
    element of the encoder's output feeds two single-unit dense heads; each
    head's result is flattened and passed through a softmax, giving one start
    and one end distribution per example.

    Returns:
        A ``keras.Model`` taking ``[input_ids, token_type_ids, attention_mask]``
        and producing ``[start_probs, end_probs]``, compiled with Adam
        (learning rate 5e-5) and sparse categorical cross-entropy on both
        outputs.
    """
    bert = TFBertModel.from_pretrained("bert-base-uncased")

    # The pooler is excluded from optimization.
    bert.layers[-1].pooler.trainable = False

    # Inputs are created in the same order they are passed to the model:
    # token ids, token type ids, attention mask.
    ids_in, type_ids_in, mask_in = (
        layers.Input(shape=(max_len,), dtype=tf.int32) for _ in range(3)
    )

    encoder_outputs = bert(
        input_ids=ids_in,
        token_type_ids=type_ids_in,
        attention_mask=mask_in,
    )
    hidden_states = encoder_outputs[0]

    # One score per position for the span start and for the span end.
    start_scores = layers.Dense(1, name="start_logit")(hidden_states)
    end_scores = layers.Dense(1, name="end_logit")(hidden_states)

    # Drop the trailing unit axis so the softmax runs across positions.
    start_scores = layers.Flatten()(start_scores)
    end_scores = layers.Flatten()(end_scores)

    softmax = keras.activations.softmax
    start_dist = layers.Activation(softmax, name="start_prob")(start_scores)
    end_dist = layers.Activation(softmax, name="end_prob")(end_scores)

    qa_model = keras.Model(
        inputs=[ids_in, type_ids_in, mask_in],
        outputs=[start_dist, end_dist],
    )

    # The outputs are already probabilities, hence from_logits=False.
    span_loss = keras.losses.SparseCategoricalCrossentropy(from_logits=False)
    qa_model.compile(
        optimizer=keras.optimizers.Adam(learning_rate=5e-5),
        loss=[span_loss, span_loss],
    )

    return qa_model

In [5]:
model = create_model()

Downloading tf_model.h5: 100%|██████████| 536M/536M [00:02<00:00, 195MB/s]  
Some layers from the model checkpoint at bert-base-uncased were not used when initializing TFBertModel: ['nsp___cls', 'mlm___cls']
- This IS expected if you are initializing TFBertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFBertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
All the layers of TFBertModel were initialized from the model checkpoint at bert-base-uncased.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFBertModel for predictions without further training.


#Have uploaded the weights from previous notebook to google drive
from google.colab import drive
drive.mount('/content/drive')
#copying 
!cp "/content/drive/My Drive/reader/weights.h5" "weights.h5"



In [6]:
# Kaggle only: list the attached input datasets to confirm the weights were
# uploaded beforehand.
print(os.listdir("../input"))

['weights']


In [7]:
# Load the fine-tuned weights from whichever environment this notebook runs in.
# Loading both paths unconditionally fails wherever one of the files is missing,
# so only the first existing candidate is loaded. The local Colab copy is tried
# first because it was the one that took effect when both files were present.
weight_candidates = [
    "weights.h5",  # Google Colab (copied from Drive)
    '/kaggle/input/weights/finetuned_epoch1.h5',  # Kaggle dataset
]
for weights_path in weight_candidates:
    if os.path.exists(weights_path):
        model.load_weights(weights_path)
        break
else:
    raise FileNotFoundError(
        "No fine-tuned weights found; looked in: " + ", ".join(weight_candidates)
    )

In [9]:
def create_input(question, context):
    """Tokenize a question/context pair into zero-padded model inputs.

    Whitespace runs in both strings are collapsed to single spaces, each string
    is encoded with the module-level ``tokenizer``, and the encodings are
    concatenated context first, followed by the question without its first
    token. The result is right-padded with zeros up to ``max_len``.

    Args:
        question: Question text (converted with ``str``).
        context: Passage to search for the answer (converted with ``str``).

    Returns:
        A tuple ``(inputs, offsets)``. ``inputs`` is the list
        ``[input_ids, token_type_ids, attention_mask]`` of arrays with shape
        ``(1, max_len)``; ``offsets`` are the tokenizer's offsets for the
        whitespace-normalized context.

    Raises:
        ValueError: If context and question together need more than
            ``max_len`` tokens. Previously such inputs were returned unpadded
            and over-long, and only failed later inside the model.
    """
    context = " ".join(str(context).split())
    question = " ".join(str(question).split())

    tokenized_context = tokenizer.encode(context)
    tokenized_question = tokenizer.encode(question)

    context_ids = tokenized_context.ids
    # Skip the question's first token (presumably the duplicate [CLS] marker).
    question_ids = tokenized_question.ids[1:]

    input_ids = context_ids + question_ids
    if len(input_ids) > max_len:
        raise ValueError(
            f"Tokenized context and question need {len(input_ids)} tokens, "
            f"but the model accepts at most {max_len}; shorten the context."
        )

    # Segment 0 marks context tokens, segment 1 marks question tokens.
    token_type_ids = [0] * len(context_ids) + [1] * len(question_ids)
    attention_mask = [1] * len(input_ids)

    # Zero-pad all three sequences to the fixed model length.
    padding = [0] * (max_len - len(input_ids))
    input_ids = input_ids + padding
    attention_mask = attention_mask + padding
    token_type_ids = token_type_ids + padding

    inputs = [
        np.array([input_ids]),
        np.array([token_type_ids]),
        np.array([attention_mask]),
    ]
    return inputs, tokenized_context.offsets

In [10]:
# Passage shared by the first three questions.
context = "Born and raised in a Hindu family in coastal Gujarat, Gandhi trained in the law at the Inner Temple, London, and was called to the bar at age 22 in June 1891. After two uncertain years in India, where he was unable to start a successful law practice, he moved to South Africa in 1893 to represent an Indian merchant in a lawsuit."
# Pass `context` rather than a second copy of the literal, so the text that is
# tokenized and the text the answer is later sliced from cannot diverge.
a, offsets = create_input("where did he moved to?", context)

In [11]:
# Second question on the shared passage; reuse `context` instead of repeating the literal.
b, offsets_b = create_input("where was he raised?", context)

In [12]:
# Third question on the shared passage; reuse `context` instead of repeating the literal.
c, offsets_c = create_input("why he moved to south Africa?", context)

In [27]:
# Second passage: a legal text, used for the fourth question.
context_d = "Right to Freedom 19. (1) All citizens shall have the right— (a) to freedom of speech and expression; (b) to assemble peaceably and without arms; (c) to form associations or unions; (d) to move freely throughout the territory of India; (e) to reside and settle in any part of the territory(g) to practise any profession, or to carry on any occupation, trade or business.3[(2) Nothing in sub-clause (a) of clause (1) shall affect the operation of any existing law, or prevent the State from making any law, in so far as such law imposes reasonable restrictions on the exercise of the right conferred by the said sub-clause in the interests of 4[the sovereignty and integrity of India,] the security of the State, friendly relations with foreign States, public order, decency or morality, or in relation to contempt of court, defamation or incitement to an offence.]"

# Pass `context_d` rather than a second copy of the literal, so the text that is
# tokenized and the text the answer is later sliced from cannot diverge.
d, offsets_d = create_input("are we allowed to assemble?", context_d)

In [14]:
# Start/end probabilities for question `a`.
model_output = model.predict(a)



In [15]:
# Start/end probabilities for question `b`.
model_output_b=model.predict(b)



In [16]:
# Start/end probabilities for question `c`.
model_output_c=model.predict(c)



In [28]:
# Start/end probabilities for question `d` (second passage).
model_output_d=model.predict(d)



In [17]:
def get_answer(model_output, offsets, context):
    """Extract the predicted answer text from the model's start/end scores.

    Args:
        model_output: Pair ``(start_pred, end_pred)`` of per-position scores
            for a single example (e.g. arrays of shape ``(1, seq_len)``).
        offsets: Sequence of ``(char_start, char_end)`` pairs, one per context
            token, indexing into ``context``.
        context: Text the offsets refer to.

    Returns:
        The slice of ``context`` from the best-scoring start token through the
        best-scoring end token. If the best end lies before the start, the best
        end at or after the start is used instead. Returns an empty string when
        ``offsets`` is empty.
    """
    start_pred, end_pred = model_output

    # Only positions with an offset entry can be mapped back to text. Limiting
    # the search to them keeps a peak on a question or padding position from
    # producing an index past the end of `offsets` (an IndexError before).
    num_tokens = len(offsets)
    if num_tokens == 0:
        return ""
    start_scores = np.asarray(start_pred).reshape(-1)[:num_tokens]
    end_scores = np.asarray(end_pred).reshape(-1)[:num_tokens]

    start = int(np.argmax(start_scores))
    end = int(np.argmax(end_scores))

    # An end before the start is not a valid span; retry from the start onward.
    if start > end:
        end = int(np.argmax(end_scores[start:])) + start

    pred_char_start = offsets[start][0]
    pred_char_end = offsets[end][1]
    return context[pred_char_start:pred_char_end]

In [18]:
# Map the prediction for question `a` back to text in the first passage.
answer=get_answer(model_output,offsets,context)

In [19]:
# Show the predicted answer for question `a`.
print(answer)

South Africa


In [20]:
# Map the prediction for question `b` back to text in the first passage.
answer=get_answer(model_output_b,offsets_b,context)

In [21]:
# Show the predicted answer for question `b`.
print(answer)

coastal Gujarat


In [22]:
# Map the prediction for question `c` back to text in the first passage.
answer=get_answer(model_output_c,offsets_c,context)

In [23]:
# Show the predicted answer for question `c`.
print(answer)

to represent an Indian merchant in a lawsuit


In [29]:
# Map the prediction for question `d` back to text in the second passage.
answer=get_answer(model_output_d,offsets_d,context_d)

In [30]:
# Show the predicted answer for question `d`.
print(answer)

to assemble peaceably and without arms
