In [None]:
import json
# Absolute paths of the two JSON files that are loaded as training and test data.
# NOTE(review): the file names look like the SQuAD v1.1 train/dev splits — confirm.
train_file_path = "/content/train-v1.1.json"
test_file_path = "/content/dev-v1.1.json"
def load_data(file_path):
    """Parse the UTF-8 encoded JSON file at ``file_path``.

    Args:
        file_path: Path of the JSON file to read.

    Returns:
        The decoded top-level JSON value (whatever type the file contains).
    """
    with open(file_path, mode='r', encoding='utf-8') as handle:
        parsed = json.loads(handle.read())
    return parsed

# Parse both JSON files into memory; each name holds whatever top-level
# object its file contains.
train_data = load_data(train_file_path)
test_data = load_data(test_file_path)

In [None]:
# Inspect the top-level container type of each loaded file. The recorded
# output below shows both are dicts (not lists), so any code that iterates
# them directly walks over the dict keys rather than over examples.
print(type(train_data))  # <class 'dict'>
print(type(test_data))    # <class 'dict'>

<class 'dict'>
<class 'dict'>


In [None]:
# import pandas as pd
# from datasets import Dataset

# def json_to_dataframe(json_data):
#     df = pd.DataFrame(json_data)
#     return Dataset.from_pandas(df)

# train_dataset = json_to_dataframe(train_data)
# test_dataset = json_to_dataframe(test_data)

In [None]:
from transformers import AutoTokenizer

# Tokenizer belonging to the 'bert-base-uncased' checkpoint; the same object is
# used by the preprocessing code and by the inference helper.
tokenizer = AutoTokenizer.from_pretrained('bert-base-uncased')

# Preprocessing for extractive QA: one feature dict per question/context pair.
def _iter_qa_examples(data):
    """Yield flat records with 'question', 'context' and (optionally) 'answers'.

    A dict is treated as a SQuAD-style document
    (``data['data']`` -> articles -> ``'paragraphs'`` -> ``'qas'``); anything
    else is assumed to already be an iterable of flat records and is passed
    through unchanged.
    """
    if isinstance(data, dict):
        # Indexing (not .get) so a dict without the expected layout fails loudly.
        for article in data['data']:
            for paragraph in article['paragraphs']:
                context = paragraph['context']
                for qa in paragraph['qas']:
                    yield {
                        'question': qa['question'],
                        'context': context,
                        'answers': qa.get('answers', []),
                    }
    else:
        yield from data


def _answer_token_span(encoding, answers):
    """Map the first answer's character span to (start, end) token indices.

    Returns None when there is no usable answer list, and (0, 0) when the
    answer cannot be located in the encoded context (for example because it
    was truncated away).
    """
    if not isinstance(answers, list) or not answers:
        return None
    first = answers[0]
    start_char = first['answer_start']
    # Inclusive index of the last answer character (at least one character wide).
    end_char = start_char + max(len(first['text']), 1) - 1
    # sequence_index=1: the context is the second sequence of the encoded pair.
    start_tok = encoding.char_to_token(0, start_char, sequence_index=1)
    end_tok = encoding.char_to_token(0, end_char, sequence_index=1)
    if start_tok is None or end_tok is None:
        return 0, 0
    return start_tok, end_tok


def preprocess(data, tok=None, max_length=None):
    """Tokenize question/context pairs into model-ready feature dicts.

    Fixes over the previous version:
      * a SQuAD-style dict is unpacked into its individual questions instead
        of being iterated key by key;
      * 'attention_mask' is the mask returned by the tokenizer, not a second
        copy of the token ids;
      * the context is truncated so the pair fits the maximum input length;
      * when answers are available, 'start_positions' / 'end_positions' are
        attached so that the QA model can compute a training loss.

    Args:
        data: SQuAD-style dict, or an iterable of dicts with 'question' and
            'context' keys (and optionally an 'answers' list of
            ``{'text', 'answer_start'}`` dicts).
        tok: Tokenizer callable to use; defaults to the module-level
            ``tokenizer``.
        max_length: Maximum number of tokens per pair. ``None`` leaves the
            limit to the tokenizer's own configured maximum.

    Returns:
        A list with one dict per question. Each dict holds 'input_ids' and
        'attention_mask' tensors of shape (1, sequence_length) and, for
        labelled examples, 'start_positions' and 'end_positions' tensors of
        shape (1,).
    """
    import torch  # local import: only needed to build the label tensors

    if tok is None:
        tok = tokenizer

    features = []
    for item in _iter_qa_examples(data):
        encoding = tok(
            item['question'],
            item['context'],
            truncation='only_second',  # shorten the context, never the question
            max_length=max_length,
            return_tensors='pt',
        )
        feature = {
            'input_ids': encoding['input_ids'],
            'attention_mask': encoding['attention_mask'],
        }
        span = _answer_token_span(encoding, item.get('answers'))
        if span is not None:
            feature['start_positions'] = torch.tensor([span[0]])
            feature['end_positions'] = torch.tensor([span[1]])
        features.append(feature)
    return features

# Tokenize both splits up front: train_preprocessed feeds the training
# DataLoader and test_preprocessed is walked by the evaluation loop.
# NOTE(review): the type check earlier in the notebook shows both inputs are
# dicts, so preprocess has to unpack the nested records itself — confirm.
train_preprocessed = preprocess(train_data)
test_preprocessed = preprocess(test_data)

In [None]:
from transformers import BertForQuestionAnswering

# Load BERT with a question-answering head on top. As the warning recorded
# below states, the head's parameters (qa_outputs.weight / qa_outputs.bias) are
# not in the 'bert-base-uncased' checkpoint and are newly initialised, so the
# model has to be trained before its predictions are usable.
model = BertForQuestionAnswering.from_pretrained('bert-base-uncased')

model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of BertForQuestionAnswering were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['qa_outputs.bias', 'qa_outputs.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
import numpy as np

def token_level_iou(predictions, references):
    """Intersection-over-union of two token collections, compared as sets.

    Duplicates and ordering are ignored because both inputs are converted to
    sets first.

    Args:
        predictions: Iterable of hashable predicted tokens.
        references: Iterable of hashable reference tokens.

    Returns:
        ``|P & R| / |P | R|`` as a float, or ``0`` when both inputs are empty.
    """
    predicted = set(predictions)
    expected = set(references)

    combined = predicted | expected
    if not combined:
        return 0

    shared = predicted & expected
    return len(shared) / len(combined)

In [None]:
import torch
from torch.nn.utils.rnn import pad_sequence
# AdamW now comes from PyTorch: the transformers implementation was deprecated
# in favour of torch.optim.AdamW and cannot be imported from recent releases.
from torch.optim import AdamW
from torch.utils.data import DataLoader

NUM_EPOCHS = 3
# Per-token tensors that must be padded to a common length within a batch.
PADDED_KEYS = ('input_ids', 'attention_mask', 'token_type_ids')


def _collate_features(features):
    """Merge per-example feature dicts into one batch of 2-D / 1-D tensors.

    Each example stores its tensors with a leading batch dimension of 1 and its
    own sequence length, which the default collate function cannot stack. Here
    every tensor is flattened, per-token tensors are right-padded to the longest
    sequence in the batch, and all remaining tensors (e.g. start/end labels)
    are concatenated into a 1-D tensor with one entry per example.
    """
    pad_id = model.config.pad_token_id or 0
    batch = {}
    for key in features[0]:
        rows = [feature[key].reshape(-1) for feature in features]
        if key in PADDED_KEYS:
            # Padded positions get attention_mask == 0, so the model ignores them.
            fill = pad_id if key == 'input_ids' else 0
            batch[key] = pad_sequence(rows, batch_first=True, padding_value=fill)
        else:
            batch[key] = torch.cat(rows)
    return batch


# shuffle=True: visit the examples in a new random order every epoch.
train_loader = DataLoader(
    train_preprocessed,
    batch_size=8,
    shuffle=True,
    collate_fn=_collate_features,
)

# eps / weight_decay are set explicitly to the defaults of the transformers
# optimizer this replaces, so the update rule stays the same.
optimizer = AdamW(model.parameters(), lr=5e-5, eps=1e-6, weight_decay=0.0)

model.train()
for epoch in range(NUM_EPOCHS):
    for batch in train_loader:
        optimizer.zero_grad()
        outputs = model(**batch)
        loss = outputs.loss
        if loss is None:
            # The model only returns a loss when it is given answer labels.
            raise ValueError(
                "model returned no loss: each training feature must include "
                "'start_positions' and 'end_positions'"
            )
        loss.backward()
        optimizer.step()

In [None]:
# Imported here so this cell does not depend on another cell having bound the
# name `torch` (a `from torch.x import y` statement does not bind it).
import torch

model.eval()
iou_scores = []

# Gradients are not needed for evaluation.
with torch.no_grad():
    for item in test_preprocessed:
        # 'reference_tokens' is evaluation metadata; forwarding it to the model
        # as a keyword argument would be rejected, so it is split off first.
        model_inputs = {
            key: value for key, value in item.items() if key != 'reference_tokens'
        }
        outputs = model(**model_inputs)
        # NOTE(review): extract_predictions is not defined in the visible
        # cells, and the visible preprocessing does not attach
        # 'reference_tokens' to its items — both must be provided for this
        # loop to run.
        predicted_tokens = extract_predictions(outputs)
        reference_tokens = item['reference_tokens']  # True tokens
        iou_scores.append(token_level_iou(predicted_tokens, reference_tokens))

# np.mean([]) emits a RuntimeWarning before returning nan; report the
# "nothing evaluated" case as nan without the warning.
average_iou = np.mean(iou_scores) if iou_scores else float('nan')
print(f'Average Token-Level IoU: {average_iou}')

In [None]:
def inference(question, context):
    """Run the QA model on a single question/context pair.

    Args:
        question: Question text.
        context: Passage in which the answer is searched.

    Returns:
        Whatever ``extract_predictions`` returns for the model output.
        NOTE(review): ``extract_predictions`` is not defined in the visible
        cells and must be provided elsewhere.
    """
    # Local import so the function does not rely on a cell having bound the
    # name `torch` (`from torch.x import y` does not bind it).
    import torch

    # truncation='only_second' shortens the context (never the question) so the
    # pair cannot exceed the tokenizer's configured maximum length.
    inputs = tokenizer.encode(
        question,
        context,
        truncation='only_second',
        return_tensors='pt',
    )
    with torch.no_grad():
        outputs = model(inputs)
        return extract_predictions(outputs)