In [None]:
!pip install -q transformers datasets accelerate sentencepiece faiss-cpu sentence-transformers


[2K   [90m‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ[0m [32m23.6/23.6 MB[0m [31m108.0 MB/s[0m eta [36m0:00:00[0m
[?25h

<h2>Set Paths</h2>

In [None]:
# Project root on the mounted drive; every path below is derived from it.
PROJECT_DIR = "/content/drive/MyDrive/Project"

# SQuAD-style JSON files read by load_squad_like.
TRAIN_PATH = PROJECT_DIR + "/policy_data/train.json"
TEST_PATH = PROJECT_DIR + "/policy_data/test.json"

# Directory the fine-tuned model and its tokenizer are written to.
MODEL_SAVE_PATH = PROJECT_DIR + "/bert-large"


<h2>Load &amp; Prepare Dataset</h2>

In [None]:
import json
from datasets import Dataset

def load_squad_like(path):
    """Read a SQuAD-style JSON file and flatten it into a `datasets.Dataset`.

    Args:
        path: Location of a UTF-8 JSON file laid out as
            ``{"data": [{"paragraphs": [{"context": ..., "qas": [...]}]}]}``.

    Returns:
        A Dataset with one row per question and the columns ``id``, ``context``,
        ``question`` and ``answers``. ``answers`` is a dict holding the parallel
        lists ``text`` and ``answer_start``.
    """
    with open(path, "r", encoding="utf-8") as handle:
        raw = json.load(handle)

    def to_row(passage, qa):
        # A question without an "answers" key produces two empty lists.
        gold = qa.get("answers", [])
        return {
            "id": qa.get("id", ""),
            "context": passage,
            "question": qa["question"],
            "answers": {
                "text": [g["text"] for g in gold],
                "answer_start": [g["answer_start"] for g in gold],
            },
        }

    rows = []
    for article in raw["data"]:
        for paragraph in article["paragraphs"]:
            passage = paragraph["context"]
            rows.extend(to_row(passage, qa) for qa in paragraph["qas"])
    return Dataset.from_list(rows)

# Load both splits with the same loader.
# NOTE(review): the validation split is read from TEST_PATH, so the evaluation
# loss reported during training is measured on the test file.
train_ds = load_squad_like(path=TRAIN_PATH)
val_ds = load_squad_like(path=TEST_PATH)

# Last expression of the cell: display both dataset summaries.
(train_ds, val_ds)


(Dataset({
     features: ['id', 'context', 'question', 'answers'],
     num_rows: 17056
 }),
 Dataset({
     features: ['id', 'context', 'question', 'answers'],
     num_rows: 4152
 }))

<h2>Tokenize Data</h2>

In [None]:
from transformers import AutoTokenizer

# Checkpoint to fine-tune. prepare_features relies on tokenizer.cls_token_id, so the
# tokenizer must define a CLS token; the recorded training output of this notebook
# shows bert-large-uncased-whole-word-masking being loaded.
MODEL_NAME = "bert-large-uncased-whole-word-masking"
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)

MAX_LEN = 384     # passed to the tokenizer as max_length (question + context window)
DOC_STRIDE = 128  # passed to the tokenizer as stride for overflowing contexts

def prepare_features(examples):
    """Tokenize a batch of QA examples and label every window with its answer span.

    Long contexts overflow into several windows (``return_overflowing_tokens``),
    so the returned batch can hold more rows than ``examples``.

    Args:
        examples: Batched columns with the keys ``question``, ``context`` and
            ``answers`` (a dict with ``text`` and ``answer_start`` lists).

    Returns:
        The tokenizer output (without ``overflow_to_sample_mapping`` and
        ``offset_mapping``) extended with ``start_positions`` and
        ``end_positions``. A window is labelled with the CLS index at both
        positions when the example has no answer, when the window holds no
        context tokens, or when the first answer is not fully inside the window.
    """
    tokenized = tokenizer(
        examples["question"],
        examples["context"],
        truncation="only_second",
        max_length=MAX_LEN,
        stride=DOC_STRIDE,
        return_overflowing_tokens=True,
        return_offsets_mapping=True,
        padding="max_length",
    )

    sample_map = tokenized.pop("overflow_to_sample_mapping")
    offset_map = tokenized.pop("offset_mapping")

    start_positions, end_positions = [], []

    for feature_idx, offsets in enumerate(offset_map):
        input_ids = tokenized["input_ids"][feature_idx]
        cls_index = input_ids.index(tokenizer.cls_token_id)

        # Default label: both positions on CLS, i.e. "no answer in this window".
        start_token = end_token = cls_index

        answers = examples["answers"][sample_map[feature_idx]]
        sequence_ids = tokenized.sequence_ids(feature_idx)
        # Token positions that belong to the second sequence (the context).
        context_positions = [k for k, seq_id in enumerate(sequence_ids) if seq_id == 1]

        if answers["answer_start"] and context_positions:
            # Only the first gold answer is used for training labels.
            start_char = answers["answer_start"][0]
            end_char = start_char + len(answers["text"][0])
            context_start, context_end = context_positions[0], context_positions[-1]

            # Both ends of the answer must fall inside this window; checking only
            # the start would label a truncated answer with the trailing separator.
            if offsets[context_start][0] <= start_char and end_char <= offsets[context_end][1]:
                # Last context token whose start offset is <= start_char.
                start_token = context_start
                while start_token <= context_end and offsets[start_token][0] <= start_char:
                    start_token += 1
                start_token -= 1

                # First context token (scanning from the right) whose end offset is >= end_char.
                end_token = context_end
                while end_token >= context_start and offsets[end_token][1] >= end_char:
                    end_token -= 1
                end_token += 1

        start_positions.append(start_token)
        end_positions.append(end_token)

    tokenized["start_positions"] = start_positions
    tokenized["end_positions"] = end_positions
    return tokenized

# One example can yield several windows, so the original per-example columns are
# removed and only the tokenizer outputs plus span labels are kept.
tokenized_train, tokenized_val = (
    split.map(prepare_features, batched=True, remove_columns=split.column_names)
    for split in (train_ds, val_ds)
)


Map:   0%|          | 0/17056 [00:00<?, ? examples/s]

Map:   0%|          | 0/4152 [00:00<?, ? examples/s]

In [None]:
import os
# Set WANDB_DISABLED for this process (presumably to keep Weights & Biases
# logging off during training; verify against the installed transformers version).
os.environ.update({"WANDB_DISABLED": "true"})


In [None]:
import torch
from transformers import AutoModelForQuestionAnswering, TrainingArguments, Trainer

# Pick the accelerator once; the mixed-precision flag below follows the same choice.
device = "cuda" if torch.cuda.is_available() else "cpu"
print("Using:", device)

model = AutoModelForQuestionAnswering.from_pretrained(MODEL_NAME).to(device)

args = TrainingArguments(
    output_dir=MODEL_SAVE_PATH,
    eval_strategy="steps",
    eval_steps=200,
    logging_steps=100,
    save_steps=500,
    per_device_train_batch_size=1,
    per_device_eval_batch_size=1,
    # Batch size 1 with 16 accumulation steps: 16 windows per optimizer step per device.
    gradient_accumulation_steps=16,
    learning_rate=3e-5,
    num_train_epochs=2,
    # fp16 mixed precision is only enabled on CUDA so the CPU fallback above still runs.
    fp16=(device == "cuda"),
    report_to="none",
)

trainer = Trainer(
    model=model,
    args=args,
    train_dataset=tokenized_train,
    eval_dataset=tokenized_val
)

trainer.train()

# Persist the fine-tuned weights and the tokenizer side by side so the
# evaluation section can reload both from a single directory.
trainer.save_model(MODEL_SAVE_PATH)
tokenizer.save_pretrained(MODEL_SAVE_PATH)


Using: cuda


Some weights of BertForQuestionAnswering were not initialized from the model checkpoint at bert-large-uncased-whole-word-masking and are newly initialized: ['qa_outputs.bias', 'qa_outputs.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Step,Training Loss,Validation Loss
200,3.2106,3.027174
400,2.6649,2.658758
600,2.5515,2.644403
800,2.4751,2.606359
1000,2.3741,2.475771
1200,2.113,2.56204
1400,1.9334,2.452502
1600,1.99,2.427937
1800,1.9452,2.401627
2000,1.9058,2.380503


('/content/drive/MyDrive/Project/bert-large/tokenizer_config.json',
 '/content/drive/MyDrive/Project/bert-large/special_tokens_map.json',
 '/content/drive/MyDrive/Project/bert-large/vocab.txt',
 '/content/drive/MyDrive/Project/bert-large/added_tokens.json',
 '/content/drive/MyDrive/Project/bert-large/tokenizer.json')

In [None]:
!pip install -q evaluate


[?25l   [90m‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ[0m [32m0.0/84.1 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [91m‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ[0m[91m‚ï∏[0m[90m‚îÅ[0m [32m81.9/84.1 kB[0m [31m116.7 MB/s[0m eta [36m0:00:01[0m[2K   [90m‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ[0m [32m84.1/84.1 kB[0m [31m2.1 MB/s[0m eta [36m0:00:00[0m
[?25h

In [None]:
import json
import torch
import numpy as np
from collections import defaultdict

from datasets import Dataset
from transformers import (
    AutoTokenizer,
    AutoModelForQuestionAnswering,
    default_data_collator,
)
import evaluate
from torch.utils.data import DataLoader
from tqdm.auto import tqdm

DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
print("Device:", DEVICE)

# Must be the directory the training section saved to (MODEL_SAVE_PATH);
# nothing in this notebook writes a model anywhere else.
MODEL_DIR = "/content/drive/MyDrive/Project/bert-large"  # <-- change if needed
TEST_PATH = "/content/drive/MyDrive/Project/policy_data/test.json"  # <-- change if needed


Device: cuda


In [None]:
def load_squad_like(path):
    """Flatten a SQuAD-style JSON file into a `datasets.Dataset`.

    Args:
        path: Location of a UTF-8 JSON file whose top-level ``data`` list holds
            articles, each with ``paragraphs`` that carry a ``context`` string
            and a ``qas`` list.

    Returns:
        A Dataset with one row per question: ``id`` (empty string when the
        question has none), ``context``, ``question`` and ``answers`` (a dict
        with the parallel lists ``text`` and ``answer_start``).
    """

    def iter_rows(payload):
        # Walk article -> paragraph -> question and emit one flat record each.
        for article in payload["data"]:
            for paragraph in article["paragraphs"]:
                passage = paragraph["context"]
                for qa in paragraph["qas"]:
                    gold_answers = qa.get("answers", [])
                    yield {
                        "id": qa.get("id", ""),
                        "context": passage,
                        "question": qa["question"],
                        "answers": {
                            "text": [ans["text"] for ans in gold_answers],
                            "answer_start": [ans["answer_start"] for ans in gold_answers],
                        },
                    }

    with open(path, "r", encoding="utf-8") as fh:
        payload = json.load(fh)
    return Dataset.from_list(list(iter_rows(payload)))

test_ds = load_squad_like(path=TEST_PATH)

# Sanity check: show the first flattened record and the number of questions.
first_example = test_ds[0]
print(first_example)
num_test_examples = len(test_ds)
print("Test examples:", num_test_examples)


{'id': '3f23wv3kh9cmvjio', 'context': 'Last Updated on May 22, 2015', 'question': "Do you take the user's opinion before or after making changes in policy?", 'answers': {'answer_start': [0], 'text': ['Last Updated on May 22, 2015']}}
Test examples: 4152


In [None]:
# Reload the tokenizer and the QA model from the saved directory, move the model
# to the selected device and put it in evaluation mode.
tokenizer = AutoTokenizer.from_pretrained(MODEL_DIR)
model = AutoModelForQuestionAnswering.from_pretrained(MODEL_DIR)
model = model.to(DEVICE)
model.eval()


BertForQuestionAnswering(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 1024, padding_idx=0)
      (position_embeddings): Embedding(512, 1024)
      (token_type_embeddings): Embedding(2, 1024)
      (LayerNorm): LayerNorm((1024,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-23): 24 x BertLayer(
          (attention): BertAttention(
            (self): BertSdpaSelfAttention(
              (query): Linear(in_features=1024, out_features=1024, bias=True)
              (key): Linear(in_features=1024, out_features=1024, bias=True)
              (value): Linear(in_features=1024, out_features=1024, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=1024, out_features=1024, bias=True)
              (LayerNorm): LayerNorm((1024,)

In [None]:
# Windowing settings for evaluation; same values as the constants used when
# tokenizing the training data.
MAX_LEN = 384  # passed to the tokenizer as max_length
DOC_STRIDE = 128  # passed to the tokenizer as stride

def prepare_test_features(examples):
    """Tokenize a batch of test examples into windows and tag each with its source id.

    Args:
        examples: Batched columns with the keys ``question``, ``context`` and ``id``.

    Returns:
        The tokenizer output, with ``offset_mapping`` kept and
        ``overflow_to_sample_mapping`` replaced by ``example_id``: for every
        window, the ``id`` of the example it was cut from.
    """
    encoded = tokenizer(
        examples["question"],
        examples["context"],
        truncation="only_second",
        max_length=MAX_LEN,
        stride=DOC_STRIDE,
        return_overflowing_tokens=True,
        return_offsets_mapping=True,
        padding="max_length",
    )

    # One entry per window: index of the example (within this batch) it came from.
    window_to_example = encoded.pop("overflow_to_sample_mapping")
    encoded["example_id"] = [examples["id"][src] for src in window_to_example]

    return encoded

# Window the test set; the raw columns are dropped because the number of
# windows can exceed the number of examples.
test_features = test_ds.map(
    prepare_test_features,
    remove_columns=test_ds.column_names,
    batched=True,
)

print(test_features.column_names)
num_features = len(test_features)
print("Features:", num_features)


Map:   0%|          | 0/4152 [00:00<?, ? examples/s]

['input_ids', 'token_type_ids', 'attention_mask', 'offset_mapping', 'example_id']
Features: 4164


In [None]:
# example_id and offset_mapping are bookkeeping columns, not model inputs,
# so they are dropped before batching.
test_for_loader = test_features.remove_columns(["example_id", "offset_mapping"])

dataloader = DataLoader(
    dataset=test_for_loader,
    collate_fn=default_data_collator,  # ensures dict batch, not list
    batch_size=8,
    shuffle=False,  # keep batches in the same order as test_features
)

# Peek at a single batch to confirm its container type and keys.
batch_example = next(iter(dataloader))
print(type(batch_example))
print(batch_example.keys())


<class 'dict'>
dict_keys(['input_ids', 'token_type_ids', 'attention_mask'])


In [None]:
# Per-batch logit arrays, concatenated once the loop has finished.
start_chunks, end_chunks = [], []

with torch.no_grad():
    for batch in tqdm(dataloader):
        batch = {name: tensor.to(DEVICE) for name, tensor in batch.items()}
        outputs = model(**batch)

        start_chunks.append(outputs.start_logits.cpu().numpy())
        end_chunks.append(outputs.end_logits.cpu().numpy())

# One row of logits per feature, in dataloader order.
all_start_logits = np.concatenate(start_chunks, axis=0)
all_end_logits = np.concatenate(end_chunks, axis=0)

print(all_start_logits.shape, all_end_logits.shape)


  0%|          | 0/521 [00:00<?, ?it/s]

(4164, 384) (4164, 384)


In [None]:
metric = evaluate.load("squad")

# Not read in this cell; kept in case other cells look examples up by id.
example_id_to_index = {k: i for i, k in enumerate(test_ds["id"])}

# Group feature (window) indices by the example they were cut from.
features_per_example = defaultdict(list)
for feat_idx, feat_id in enumerate(test_features["example_id"]):
    features_per_example[feat_id].append(feat_idx)

# Read the column once. Indexing test_features["input_ids"] inside the search
# loop repeats the column lookup for every candidate span (and, depending on the
# installed `datasets` version, may rebuild the whole column each time).
all_input_ids = test_features["input_ids"]

n_best = 5           # number of top start / end logits combined per feature
max_answer_len = 30  # maximum answer length in tokens
predictions = []

for example in test_ds:
    example_id = example["id"]

    best_span = None  # (feature index, start token, end token) of the best candidate
    best_score = -1e9

    for idx in features_per_example[example_id]:
        start_logits = all_start_logits[idx]
        end_logits = all_end_logits[idx]

        # Indices of the n_best highest logits, best first.
        start_indexes = np.argsort(start_logits)[-n_best:][::-1]
        end_indexes = np.argsort(end_logits)[-n_best:][::-1]

        for s in start_indexes:
            for e in end_indexes:
                # Skip reversed spans and spans longer than max_answer_len tokens.
                if e < s or e - s + 1 > max_answer_len:
                    continue
                score = start_logits[s] + end_logits[e]
                if score > best_score:
                    best_score = score
                    best_span = (idx, s, e)

    # Decode only the winning span instead of every intermediate improvement.
    # NOTE(review): candidates are not restricted to context tokens, so a span
    # may start in the question; confirm whether that is intended.
    if best_span is None:
        best_answer = ""
    else:
        best_idx, best_start, best_end = best_span
        best_answer = tokenizer.decode(
            all_input_ids[best_idx][best_start:best_end + 1],
            skip_special_tokens=True,
        )

    predictions.append({"id": example_id, "prediction_text": best_answer})

references = [
    {"id": ex["id"], "answers": ex["answers"]} for ex in test_ds
]

results = metric.compute(predictions=predictions, references=references)
results


{'exact_match': 26.32466281310212, 'f1': 54.45164446045573}

In [None]:
def ask_question(question, context):
    """Answer a single question from a single context with the loaded QA model.

    Args:
        question: Question text.
        context: Passage expected to contain the answer. Only the context is
            truncated when the pair exceeds 384 tokens; later parts of a long
            passage are not searched.

    Returns:
        The decoded text between the highest-scoring start token and the
        highest-scoring end token, or an empty string when the predicted end
        comes before the predicted start.
    """
    inputs = tokenizer(
        question,
        context,
        return_tensors="pt",
        truncation="only_second",  # never truncate the question, matching the feature builders
        max_length=384,
    ).to(model.device)  # follow the model's device rather than a global that may be undefined

    with torch.no_grad():
        outputs = model(**inputs)

    start = int(torch.argmax(outputs.start_logits))
    end = int(torch.argmax(outputs.end_logits))

    # Start and end are chosen independently, so they can describe an empty span.
    if end < start:
        return ""

    answer_ids = inputs["input_ids"][0][start : end + 1]
    return tokenizer.decode(answer_ids, skip_special_tokens=True)


In [None]:
# Hand-written policy passage used as the context for the ad-hoc question demo.
context = """
The cancellation policy allows a customer to cancel their insurance policy
within 15 days of purchase for a full refund, provided that no claims have
been submitted during this period. If a claim has been made, the customer
is not eligible for a refund.
"""


In [None]:
# Smoke test: ask one question against the hand-written passage.
question = "Within how many days can a customer cancel the policy for a full refund?"
answer = ask_question(question=question, context=context)

print(f"Question: {question}")
print(f"Answer: {answer}")


Question: Within how many days can a customer cancel the policy for a full refund?
Answer: 15 days of purchase for a full refund,
