# Explore Wikipedia Bio Dataset

This notebook explores the processed Wikipedia dataset with category labels.

In [1]:
from datasets import load_from_disk
import pandas as pd
import tiktoken
from collections import Counter
import numpy as np

## Load Datasets

In [3]:
BASE_DIR = "data/datasets/wiki_bio_sample"

forget_ds = load_from_disk(f"{BASE_DIR}/forget")
adjacent_ds = load_from_disk(f"{BASE_DIR}/adjacent")
retain_ds = load_from_disk(f"{BASE_DIR}/retain")

print("Forget dataset:")
print(forget_ds)
print("\nAdjacent dataset:")
print(adjacent_ds)
print("\nRetain dataset:")
print(retain_ds)

Forget dataset:
DatasetDict({
    train: Dataset({
        features: ['category', 'input_ids', 'attention_mask'],
        num_rows: 2630
    })
    test: Dataset({
        features: ['category', 'input_ids', 'attention_mask'],
        num_rows: 627
    })
})

Adjacent dataset:
DatasetDict({
    train: Dataset({
        features: ['category', 'input_ids', 'attention_mask'],
        num_rows: 2346
    })
    test: Dataset({
        features: ['category', 'input_ids', 'attention_mask'],
        num_rows: 1627
    })
})

Retain dataset:
DatasetDict({
    train: Dataset({
        features: ['category', 'input_ids', 'attention_mask'],
        num_rows: 87883
    })
    test: Dataset({
        features: ['category', 'input_ids', 'attention_mask'],
        num_rows: 2336
    })
})


## Dataset Statistics

In [4]:
# Count samples
stats = {
    "Split": [],
    "Train Samples": [],
    "Test Samples": [],
    "Total Samples": [],
    "Train Tokens": [],
    "Test Tokens": [],
    "Total Tokens": [],
}

for name, ds in [("Forget", forget_ds), ("Adjacent", adjacent_ds), ("Retain", retain_ds)]:
    train_samples = len(ds["train"])
    test_samples = len(ds["test"])
    
    # Count tokens (each sample is already tokenized with fixed chunk size)
    chunk_size = len(ds["train"][0]["input_ids"])
    train_tokens = train_samples * chunk_size
    test_tokens = test_samples * chunk_size
    
    stats["Split"].append(name)
    stats["Train Samples"].append(train_samples)
    stats["Test Samples"].append(test_samples)
    stats["Total Samples"].append(train_samples + test_samples)
    stats["Train Tokens"].append(f"{train_tokens:,}")
    stats["Test Tokens"].append(f"{test_tokens:,}")
    stats["Total Tokens"].append(f"{train_tokens + test_tokens:,}")

df_stats = pd.DataFrame(stats)
df_stats

Unnamed: 0,Split,Train Samples,Test Samples,Total Samples,Train Tokens,Test Tokens,Total Tokens
0,Forget,2630,627,3257,2693120,642048,3335168
1,Adjacent,2346,1627,3973,2402304,1666048,4068352
2,Retain,87883,2336,90219,89992192,2392064,92384256


## Category Distribution

In [5]:
def show_category_distribution(ds, name):
    print(f"\n{name} Dataset - Category Distribution:")
    print("=" * 60)
    
    for split_name in ["train", "test"]:
        categories = ds[split_name]["category"]
        counter = Counter(categories)
        
        print(f"\n{split_name.upper()}:")
        for category, count in counter.most_common():
            percentage = (count / len(categories)) * 100
            print(f"  {category:40s}: {count:6d} samples ({percentage:5.1f}%)")

show_category_distribution(forget_ds, "Forget")
show_category_distribution(adjacent_ds, "Adjacent")
show_category_distribution(retain_ds, "Retain")


Forget Dataset - Category Distribution:

TRAIN:
  STEM.Biology                            :   2630 samples (100.0%)

TEST:
  STEM.Biology                            :    627 samples (100.0%)

Adjacent Dataset - Category Distribution:

TRAIN:
  STEM.Medicine_&_Health                  :   2346 samples (100.0%)

TEST:
  STEM.Medicine_&_Health                  :    566 samples ( 34.8%)
  STEM.Earth_and_environment              :    553 samples ( 34.0%)
  STEM.Chemistry                          :    508 samples ( 31.2%)

Retain Dataset - Category Distribution:

TRAIN:
  Culture                                 :  39463 samples ( 44.9%)
  Geography                               :  30527 samples ( 34.7%)
  History_and_Society                     :  13313 samples ( 15.1%)
  STEM                                    :   4580 samples (  5.2%)

TEST:
  STEM                                    :    697 samples ( 29.8%)
  History_and_Society                     :    606 samples ( 25.9%)
  Culture     

## Sample Examples

In [7]:
# Initialize tokenizer to decode examples
tokenizer = tiktoken.get_encoding("gpt2")

def show_examples(ds, name, n_examples=10):
    print(f"\n{'=' * 80}")
    print(f"{name} Dataset - Sample Examples")
    print(f"{'=' * 80}")
    
    for i in range(n_examples):
        example = ds["train"][i]
        tokens = example["input_ids"]
        category = example["category"]
        text = tokenizer.decode(tokens[:200])  # Show first 200 tokens
        
        print(f"\nExample {i+1} - Category: {category}")
        print("-" * 80)
        print(text + "...")
        print()

show_examples(forget_ds, "Forget")
show_examples(adjacent_ds, "Adjacent")
show_examples(retain_ds, "Retain")


Forget Dataset - Sample Examples

Example 1 - Category: STEM.Biology
--------------------------------------------------------------------------------
John Richard Krebs, Baron Krebs, FRS (born 11 April 1945) is an English zoologist researching in the field of behavioural ecology of birds. He was the principal of Jesus College, Oxford, from 2005 until 2015. Lord Krebs was President of the British Science Association from 2012 to 2013.

Early life and education
John Krebs is the son of Sir Hans Adolf Krebs, the German biochemist who described the uptake and release of energy in cells (the Krebs cycle). He was educated at the City of Oxford High School, and Pembroke College, Oxford, where he obtained a BA degree in 1966, upgraded to an MA degree in 1970, and received a DPhil degree in 1970.

Career
He held posts at the University of British Columbia and the University College of North Wales, before returning to Oxford as a University Lecturer in Zoology, with a fellowship at Wolfson Coll

## Verify Chunk Sizes

In [8]:
# Verify all chunks have the same size
for name, ds in [("Forget", forget_ds), ("Adjacent", adjacent_ds), ("Retain", retain_ds)]:
    train_lengths = [len(x["input_ids"]) for x in ds["train"]]
    test_lengths = [len(x["input_ids"]) for x in ds["test"]]
    
    print(f"\n{name}:")
    print(f"  Train chunk sizes: min={min(train_lengths)}, max={max(train_lengths)}, mean={np.mean(train_lengths):.1f}")
    print(f"  Test chunk sizes:  min={min(test_lengths)}, max={max(test_lengths)}, mean={np.mean(test_lengths):.1f}")


Forget:
  Train chunk sizes: min=1024, max=1024, mean=1024.0
  Test chunk sizes:  min=1024, max=1024, mean=1024.0

Adjacent:
  Train chunk sizes: min=1024, max=1024, mean=1024.0
  Test chunk sizes:  min=1024, max=1024, mean=1024.0

Retain:
  Train chunk sizes: min=1024, max=1024, mean=1024.0
  Test chunk sizes:  min=1024, max=1024, mean=1024.0


## Category Overlap Analysis

In [9]:
# Check that categories don't overlap between splits
forget_categories = set(forget_ds["train"]["category"])
adjacent_categories = set(adjacent_ds["train"]["category"])
retain_categories = set(retain_ds["train"]["category"])

print("Category Overlap Analysis:")
print("=" * 60)
print(f"\nForget categories: {sorted(forget_categories)}")
print(f"Adjacent categories: {sorted(adjacent_categories)}")
print(f"Retain categories (top-level): {sorted(retain_categories)}")

overlap_forget_adjacent = forget_categories & adjacent_categories
overlap_forget_retain = forget_categories & retain_categories
overlap_adjacent_retain = adjacent_categories & retain_categories

print(f"\nOverlap between Forget and Adjacent: {overlap_forget_adjacent if overlap_forget_adjacent else 'None'}")
print(f"Overlap between Forget and Retain: {overlap_forget_retain if overlap_forget_retain else 'None'}")
print(f"Overlap between Adjacent and Retain: {overlap_adjacent_retain if overlap_adjacent_retain else 'None'}")

Category Overlap Analysis:

Forget categories: ['STEM.Biology']
Adjacent categories: ['STEM.Medicine_&_Health']
Retain categories (top-level): ['Culture', 'Geography', 'History_and_Society', 'STEM']

Overlap between Forget and Adjacent: None
Overlap between Forget and Retain: None
Overlap between Adjacent and Retain: None
