# Preprocess the SQuAD Training Dataset

In [None]:
import json

# Flatten the nested SQuAD file (articles -> paragraphs -> questions) into a
# flat list with one record per question, each carrying its own copy of the
# paragraph context.

# Load the SQuAD dataset.
# The encoding is given explicitly: JSON files are UTF-8 by convention
# (RFC 8259), and the platform default encoding is not UTF-8 everywhere, which
# would break on the non-ASCII characters present in the passages.
with open('train-v2.0 (1).json', 'r', encoding='utf-8') as f:
    squad_data = json.load(f)

# Initialize the list to store transformed data
train_questions_answers = []

# Iterate through the SQuAD dataset
for article in squad_data['data']:
    for paragraph in article['paragraphs']:
        context = paragraph['context']
        for qa in paragraph['qas']:
            question_id = qa['id']
            # Entries without the flag are treated as answerable.
            is_impossible = qa.get('is_impossible', False)
            question_text = qa['question']
            # Entries without an 'answers' key get an empty answer list.
            answers = qa.get('answers', [])

            # One record per question: the paragraph context plus a
            # single-element "qas" list holding just this question.
            qa_pair = {
                "context": context,
                "qas": [
                    {
                        "id": question_id,
                        "is_impossible": is_impossible,
                        "question": question_text,
                        "answers": answers,
                    }
                ],
            }
            train_questions_answers.append(qa_pair)

# Save the transformed data into a JSON file.
# json.dump keeps its default ASCII-escaped output, so the file stays readable
# under any encoding; the explicit encoding only removes platform dependence.
output_file_path = 'train_questions_answers_with_context.json'
with open(output_file_path, 'w', encoding='utf-8') as outfile:
    json.dump(train_questions_answers, outfile, indent=4)

print("Data saved to:", output_file_path)

# Train the model

In [None]:
# Install simpletransformers, which provides the QuestionAnsweringModel
# imported by the training cell.
!pip install simpletransformers

In [None]:
import json
import random
import os
import shutil
from sklearn.model_selection import train_test_split
from simpletransformers.question_answering import QuestionAnsweringModel, QuestionAnsweringArgs
from transformers import BertTokenizer

# Fine-tune a question-answering model on the flattened dataset and archive
# the best checkpoint as 'final.zip'.

# Load the entire dataset (explicit encoding avoids depending on the platform
# default when decoding the JSON file).
print("Loading dataset...")
with open(r"train_questions_answers_with_context.json", "r", encoding="utf-8") as read_file:
    data = json.load(read_file)
print("Dataset loaded successfully.")


# Split the dataset into train and test sets.
# A fixed random_state makes the 80/20 split reproducible across runs; this
# presumably also keeps the cached evaluation features
# ("use_cached_eval_features" below) consistent with the evaluation set.
# NOTE(review): the split is over individual records, so if several records
# share the same context, that context can land in both sets — confirm this is
# acceptable for the reported evaluation numbers.
print("Splitting dataset into train and test sets...")
train_data, test_data = train_test_split(data, test_size=0.2, random_state=42)
print(f"Dataset split complete. Train set size: {len(train_data)}, Test set size: {len(test_data)}")

model_type = "bert"
# NOTE(review): "base-model" is presumably a local directory holding a
# pretrained checkpoint — confirm it exists, or replace it with a model name.
model_name = "base-model"


# Define training arguments
print("Defining training arguments...")
train_args = {
    "reprocess_input_data": True,
    "overwrite_output_dir": True,
    "use_cached_eval_features": True,
    "output_dir": f"outputs/{model_type}",
    "best_model_dir": f"outputs/{model_type}/best_model",
    "evaluate_during_training": True,
    "evaluate_during_training_steps": 1000,
    "save_model_every_epoch": False,
    "save_eval_checkpoints": False,
    "n_best_size": 3,
    "eval_batch_size": 18,
    # NOTE(review): "eval_metrics" does not look like a documented
    # simpletransformers option — confirm it has any effect.
    "eval_metrics": ['exact_match'],
    "num_train_epochs": 15,
    # The patience setting only takes effect when early stopping is switched
    # on; without "use_early_stopping" all epochs always run.
    "use_early_stopping": True,
    "early_stopping_metric": "eval_loss",
    "early_stopping_metric_minimize": True,
    "early_stopping_patience": 3,
    "evaluate_during_training_verbose": True,
}
print("Training arguments defined.")

# Initialize the model
print(f"Initializing the model with type: {model_type}, name: {model_name}...")
model = QuestionAnsweringModel(model_type, model_name, args=train_args)
print("Model initialized.")

# Train the model on the new train set, evaluating on the held-out set
print("Starting model training...")
model.train_model(train_data, eval_data=test_data)
print("Model training complete.")

# Save the best model directory as a zip file (same path as "best_model_dir")
print("Saving the best model directory as a zip file...")
shutil.make_archive('final', 'zip', root_dir=os.path.join('outputs', model_type, 'best_model'))
print("Model saved as 'final.zip'.")
