# Phase II: Preprocessing and Tokenization

This notebook demonstrates the preprocessing pipeline that converts raw SQuAD text into tokenized features labeled with answer start and end positions.

In [1]:
import json
import os

import pandas as pd
from datasets import load_dataset

from src.preprocessing import get_tokenizer, prepare_train_features

  from .autonotebook import tqdm as notebook_tqdm


## 1. Load Data and Tokenizer

In [None]:
# Only the first ten training examples are requested for this walkthrough.
sample_split = "train[:10]"
dataset = load_dataset("squad", split=sample_split)

# Tokenizer comes from the project's preprocessing module.
tokenizer = get_tokenizer()

print(f"Loaded {len(dataset)} samples.")

## 2. Apply Preprocessing

We use the `prepare_train_features` function which handles:
- Truncation of long contexts.
- Mapping character-based answer start/end to token-based positions.
- Applying the sliding-window (stride) approach.

In [None]:
# Turn the loaded samples into tokenized features; the counts printed below
# show how many features were produced from how many samples.
features = prepare_train_features(
    dataset,
    tokenizer,
)
print(f"Generated {len(features['input_ids'])} features from {len(dataset)} samples.")

## 3. Verify Results

Let's decode the labeled answer spans (the tokens between each feature's start and end positions) so they can be checked against the original answers.

In [None]:
def _span_row(idx):
    """Summarize feature `idx`: its labeled token span and the decoded text."""
    first_tok = features['start_positions'][idx]
    last_tok = features['end_positions'][idx]
    # The slice bound is last_tok + 1 so the token at the end position is included.
    span_ids = features['input_ids'][idx][first_tok:last_tok + 1]
    return {
        "Feature Index": idx,
        "Start Position": first_tok,
        "End Position": last_tok,
        "Decoded Span": tokenizer.decode(span_ids),
    }


# Inspect at most the first five features.
preview_count = min(5, len(features['input_ids']))
results = [_span_row(idx) for idx in range(preview_count)]

pd.DataFrame(results)

## 4. Save Preprocessed Sample

In [None]:
# Persist the first feature as a small JSON sample file.
# `json` and `os` are imported in the notebook's import cell at the top.
sample_path = "data/preprocessed_sample.json"

# Create the target directory on first run; a no-op when it already exists.
os.makedirs(os.path.dirname(sample_path), exist_ok=True)

# NOTE(review): assumes these values are plain Python lists/ints that
# json can serialize as-is — confirm against prepare_train_features.
sample_feature = {
    "input_ids": features["input_ids"][0],
    "start_positions": features["start_positions"][0],
    "end_positions": features["end_positions"][0]
}

with open(sample_path, "w", encoding="utf-8") as f:
    json.dump(sample_feature, f)

print(f"Saved preprocessed sample to {sample_path}")