In [11]:
# Download the Stanford Question Answering Dataset
!kaggle datasets download -d stanfordu/stanford-question-answering-dataset

# Extract the downloaded dataset
!unzip stanford-question-answering-dataset.zip


Dataset URL: https://www.kaggle.com/datasets/stanfordu/stanford-question-answering-dataset
License(s): CC-BY-SA-4.0
Downloading stanford-question-answering-dataset.zip to /content
 80% 7.00M/8.73M [00:01<00:00, 8.48MB/s]
100% 8.73M/8.73M [00:01<00:00, 6.16MB/s]
Archive:  stanford-question-answering-dataset.zip
  inflating: dev-v1.1.json           
  inflating: train-v1.1.json         


In [8]:
!pip install -q simpletransformers

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m42.4/42.4 kB[0m [31m3.2 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m43.6/43.6 kB[0m [31m2.8 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m44.3/44.3 kB[0m [31m3.6 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m316.3/316.3 kB[0m [31m24.6 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m480.6/480.6 kB[0m [31m31.4 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m9.1/9.1 MB[0m [31m113.1 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m101.7/101.7 kB[0m [31m9.2 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m10.2 MB/s[0m e

In [9]:
import pandas as pd
import torch
from datasets import load_dataset
from torch.utils.data import DataLoader, Subset
import json

In [12]:
with open('/content/train-v1.1.json', 'r', encoding='utf-8') as f:
    train_data = json.load(f)

In [13]:
def preprocess_data(data):
    contexts = []

    for item in data['data']:
        context = item['paragraphs'][0]['context']
        qas = item['paragraphs'][0]['qas']

        qa_list = []

        for qa in qas:
            question = qa['question']
            answers = qa['answers']
            if answers:
                answer_text = answers[0]['text']
                answer_start = answers[0]['answer_start']
                qa_id = qa['id']
                qa_dict = {
                    'id': qa_id,
                    'is_impossible': False,
                    'question': question,
                    'answers': [
                        {
                            'text': answer_text,
                            'answer_start': answer_start,
                        }
                    ],
                }

                qa_list.append(qa_dict)

        context_dict = {
            'context': context,
            'qas': qa_list,
        }

        contexts.append(context_dict)

    return contexts


In [14]:
train_dataset=preprocess_data(train_data)

In [15]:
import random
num_samples = 100
validate_dataset = random.sample(train_dataset, num_samples)

print(f"Training Data Size: {len(train_dataset)}")
print(f"Validation Data Size: {len(validate_dataset)}")

Training Data Size: 442
Validation Data Size: 100


In [16]:
from simpletransformers.question_answering import QuestionAnsweringModel, QuestionAnsweringArgs

model_name= "bert-base-cased"
model_type= "bert"

In [17]:
train_args = {
    "overwrite_output_dir": True,
    "use_cached_eval_features": True,
    "output_dir": f"outputs/bert",
    "evaluate_during_training": True,
    "max_seq_length": 256,
    'num_train_epochs': 30,
    'learning_rate': 1e-5,
    "evaluate_during_training_steps": 1000,
    "n_best_size": 3,
    "train_batch_size": 64,
    "eval_batch_size": 64,
    "use_gpu": True,
    "gradient_accumulation_steps": 2,
    "save_eval_checkpoints": False,
    "save_steps": -1,
    "save_model_every_epoch": False,
}


In [18]:
model = QuestionAnsweringModel(
    model_type,model_name, args=train_args,use_cuda=True,
)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/436M [00:00<?, ?B/s]

Some weights of BertForQuestionAnswering were not initialized from the model checkpoint at bert-base-cased and are newly initialized: ['qa_outputs.bias', 'qa_outputs.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


tokenizer_config.json:   0%|          | 0.00/49.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/213k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/436k [00:00<?, ?B/s]

In [19]:
model.train_model(train_dataset, eval_data=validate_dataset)

convert squad examples to features: 100%|██████████| 2596/2596 [00:17<00:00, 151.25it/s]
add example index and unique id: 100%|██████████| 2596/2596 [00:00<00:00, 693176.29it/s]


Epoch:   0%|          | 0/30 [00:00<?, ?it/s]

  scaler = amp.GradScaler()


Running Epoch 1 of 30:   0%|          | 0/41 [00:00<?, ?it/s]

  with amp.autocast():

convert squad examples to features:   0%|          | 0/544 [00:00<?, ?it/s][A
convert squad examples to features:   0%|          | 1/544 [00:02<21:50,  2.41s/it][A
convert squad examples to features: 100%|██████████| 544/544 [00:02<00:00, 204.85it/s]

add example index and unique id: 100%|██████████| 544/544 [00:00<00:00, 486296.12it/s]


Running Evaluation:   0%|          | 0/9 [00:00<?, ?it/s]

  with amp.autocast():


Running Epoch 2 of 30:   0%|          | 0/41 [00:00<?, ?it/s]

  features = torch.load(cached_features_file)


Running Evaluation:   0%|          | 0/9 [00:00<?, ?it/s]

Running Epoch 3 of 30:   0%|          | 0/41 [00:00<?, ?it/s]

Running Evaluation:   0%|          | 0/9 [00:00<?, ?it/s]

Running Epoch 4 of 30:   0%|          | 0/41 [00:00<?, ?it/s]

Running Evaluation:   0%|          | 0/9 [00:00<?, ?it/s]

Running Epoch 5 of 30:   0%|          | 0/41 [00:00<?, ?it/s]

Running Evaluation:   0%|          | 0/9 [00:00<?, ?it/s]

Running Epoch 6 of 30:   0%|          | 0/41 [00:00<?, ?it/s]

Running Evaluation:   0%|          | 0/9 [00:00<?, ?it/s]

Running Epoch 7 of 30:   0%|          | 0/41 [00:00<?, ?it/s]

Running Evaluation:   0%|          | 0/9 [00:00<?, ?it/s]

Running Epoch 8 of 30:   0%|          | 0/41 [00:00<?, ?it/s]

Running Evaluation:   0%|          | 0/9 [00:00<?, ?it/s]

Running Epoch 9 of 30:   0%|          | 0/41 [00:00<?, ?it/s]

Running Evaluation:   0%|          | 0/9 [00:00<?, ?it/s]

Running Epoch 10 of 30:   0%|          | 0/41 [00:00<?, ?it/s]

Running Evaluation:   0%|          | 0/9 [00:00<?, ?it/s]

Running Epoch 11 of 30:   0%|          | 0/41 [00:00<?, ?it/s]

Running Evaluation:   0%|          | 0/9 [00:00<?, ?it/s]

Running Epoch 12 of 30:   0%|          | 0/41 [00:00<?, ?it/s]

Running Evaluation:   0%|          | 0/9 [00:00<?, ?it/s]

Running Epoch 13 of 30:   0%|          | 0/41 [00:00<?, ?it/s]

Running Evaluation:   0%|          | 0/9 [00:00<?, ?it/s]

Running Epoch 14 of 30:   0%|          | 0/41 [00:00<?, ?it/s]

Running Evaluation:   0%|          | 0/9 [00:00<?, ?it/s]

Running Epoch 15 of 30:   0%|          | 0/41 [00:00<?, ?it/s]

Running Evaluation:   0%|          | 0/9 [00:00<?, ?it/s]

Running Epoch 16 of 30:   0%|          | 0/41 [00:00<?, ?it/s]

Running Evaluation:   0%|          | 0/9 [00:00<?, ?it/s]

Running Epoch 17 of 30:   0%|          | 0/41 [00:00<?, ?it/s]

Running Evaluation:   0%|          | 0/9 [00:00<?, ?it/s]

Running Epoch 18 of 30:   0%|          | 0/41 [00:00<?, ?it/s]

Running Evaluation:   0%|          | 0/9 [00:00<?, ?it/s]

Running Epoch 19 of 30:   0%|          | 0/41 [00:00<?, ?it/s]

Running Evaluation:   0%|          | 0/9 [00:00<?, ?it/s]

Running Epoch 20 of 30:   0%|          | 0/41 [00:00<?, ?it/s]

Running Evaluation:   0%|          | 0/9 [00:00<?, ?it/s]

Running Epoch 21 of 30:   0%|          | 0/41 [00:00<?, ?it/s]

Running Evaluation:   0%|          | 0/9 [00:00<?, ?it/s]

Running Epoch 22 of 30:   0%|          | 0/41 [00:00<?, ?it/s]

Running Evaluation:   0%|          | 0/9 [00:00<?, ?it/s]

Running Epoch 23 of 30:   0%|          | 0/41 [00:00<?, ?it/s]

Running Evaluation:   0%|          | 0/9 [00:00<?, ?it/s]

Running Epoch 24 of 30:   0%|          | 0/41 [00:00<?, ?it/s]

Running Evaluation:   0%|          | 0/9 [00:00<?, ?it/s]

Running Epoch 25 of 30:   0%|          | 0/41 [00:00<?, ?it/s]

Running Evaluation:   0%|          | 0/9 [00:00<?, ?it/s]

Running Epoch 26 of 30:   0%|          | 0/41 [00:00<?, ?it/s]

Running Evaluation:   0%|          | 0/9 [00:00<?, ?it/s]

Running Epoch 27 of 30:   0%|          | 0/41 [00:00<?, ?it/s]

Running Evaluation:   0%|          | 0/9 [00:00<?, ?it/s]

Running Epoch 28 of 30:   0%|          | 0/41 [00:00<?, ?it/s]

Running Evaluation:   0%|          | 0/9 [00:00<?, ?it/s]

Running Epoch 29 of 30:   0%|          | 0/41 [00:00<?, ?it/s]

Running Evaluation:   0%|          | 0/9 [00:00<?, ?it/s]

Running Epoch 30 of 30:   0%|          | 0/41 [00:00<?, ?it/s]

Running Evaluation:   0%|          | 0/9 [00:00<?, ?it/s]

(600,
 {'global_step': [20,
   40,
   60,
   80,
   100,
   120,
   140,
   160,
   180,
   200,
   220,
   240,
   260,
   280,
   300,
   320,
   340,
   360,
   380,
   400,
   420,
   440,
   460,
   480,
   500,
   520,
   540,
   560,
   580,
   600],
  'correct': [1,
   10,
   27,
   71,
   142,
   187,
   214,
   234,
   268,
   281,
   301,
   322,
   334,
   345,
   358,
   373,
   383,
   388,
   400,
   397,
   405,
   412,
   421,
   425,
   429,
   432,
   429,
   428,
   433,
   432],
  'similar': [177,
   153,
   177,
   177,
   192,
   184,
   183,
   187,
   176,
   177,
   176,
   161,
   155,
   149,
   141,
   135,
   124,
   121,
   114,
   116,
   110,
   107,
   101,
   98,
   98,
   94,
   98,
   98,
   94,
   94],
  'incorrect': [366,
   381,
   340,
   296,
   210,
   173,
   147,
   123,
   100,
   86,
   67,
   61,
   55,
   50,
   45,
   36,
   37,
   35,
   30,
   31,
   29,
   25,
   22,
   21,
   17,
   18,
   17,
   18,
   17,
   18],
  'train_loss': [

In [20]:
import torch
from simpletransformers.question_answering import QuestionAnsweringModel
from transformers import BertTokenizer
# use model after train
saved_model_dir = "outputs/bert" # Change to your output directory
model = QuestionAnsweringModel("bert", saved_model_dir)
tokenizer = BertTokenizer.from_pretrained("bert-base-cased")
def predict_answer(context: str, question: str, model) -> str:

    # Tokenize the input
    inputs = tokenizer(question, context, return_tensors="pt")

    # Generate predictions
    outputs = model.model(**inputs)

    # Decode the predicted answer
    answer_start = torch.argmax(outputs.start_logits)
    answer_end = torch.argmax(outputs.end_logits) + 1
    answer = tokenizer.decode(inputs["input_ids"][0][answer_start:answer_end])

    return answer

In [21]:
context="""My Name Is Zain Habib and My semester is 05 and i am studing at comsats"""
question = "what is the name of student?"
predict_answer(context,question,model)

'Zain Habib'