# 1. Dataset Setup

In [13]:
from datasets import load_dataset
import re
# Load dataset
dataset = load_dataset("Salesforce/wikitext", "wikitext-2-v1")
print(f"Dataset splits: {dataset.keys()}")
print(f"Train set size: {len(dataset['train'])}")
print(f"Validation set size: {len(dataset['validation'])}")
print(f"Test set size: {len(dataset['test'])}")
train_texts = dataset['train']['text']
validation_texts = dataset['validation']['text'] 
test_texts = dataset['test']['text']

Dataset splits: dict_keys(['test', 'train', 'validation'])
Train set size: 36718
Validation set size: 3760
Test set size: 4358


In [3]:
# Find first non-empty sample
for i in range(10):
    if train_texts[i].strip():  # Check if not empty after stripping whitespace
        print(f"\nFirst non-empty sample at index {i}:")
        print(f"Text: {train_texts[i][:200]}...")
        


First non-empty sample at index 1:
Text:  = Valkyria Chronicles III = 
...

First non-empty sample at index 3:
Text:  Senjō no Valkyria 3 : <unk> Chronicles ( Japanese : 戦場のヴァルキュリア3 , lit . Valkyria of the Battlefield 3 ) , commonly referred to as Valkyria Chronicles III outside Japan , is a tactical role @-@ playin...

First non-empty sample at index 4:
Text:  The game began development in 2010 , carrying over a large portion of the work done on Valkyria Chronicles II . While it retained the standard features of the series , it also underwent multiple adju...

First non-empty sample at index 5:
Text:  It met with positive sales in Japan , and was praised by both Japanese and western critics . After release , it received downloadable content , along with an expanded edition in November of that year...

First non-empty sample at index 7:
Text:  = = Gameplay = = 
...

First non-empty sample at index 9:
Text:  As with previous <unk> Chronicles games , Valkyria Chronicles III is a tactica

# 2. Data Preprocessing

In [None]:
# 2.1 Remove Duplicates
unique_train = set(train_texts)

print(f"Original training samples: {len(train_texts)}")
print(f"After deduplication: {len(unique_train)}")


Original training samples: 36718
After deduplication: 21338


In [11]:
print("First 5 unique training samples:")
list(unique_train)[:5]


First 5 unique training samples:


['',
 ' The " off @-@ the @-@ cuff " conclusion to production was one of the reasons the album was titled Loose . It was named partly after the spontaneous decisions she made when creating the album . The album is also called Loose because it is " the opposite of calculated " and came naturally to Furtado and Timbaland ; she called him her " distant musical cousin because he was always pushing boundaries and always carving out his own path " , which she believed she was doing with Loose . " I think you have to keep surprising people as an artist , and I like that — I love doing that " , she said . Loose was also named partly for the R & B girl group TLC , who Furtado said she <unk> for " taking back their sexuality , showing they were complete women . " She said she wanted the album to be " assertive and cool " and " sexy but fun " , like TLC , MC <unk> , Queen <unk> and Janet Jackson , who inspired Furtado because , as she put it , she was " comfortable in her sexuality and womanhood 

In [8]:
""" 
NOTES: 
- It seems that each entry in the dataset is a text segment which is not necessairly structured. 
- In this project we are 'training' a BPE tokenizer, which learns subword patterns from the given text data. 
- Perhaps keeping the dataset in the format as it is is fine? But what if we were training a LLM, could we use each entry in the dataset as a training example?
"""

" \nNOTES: \n- It seems that each entry in the dataset is a text segment which is not necessairly structured. \n- In this project we are 'training' a BPE tokenizer, which learns subword patterns from the given text data. \n- Perhaps keeping the dataset in the format as it is is fine? But what if we were training a LLM, could we use each entry in the dataset as a training example?\n"

In [12]:

# 2.2 Filter Empty Samples (and very short ones)
MIN_LENGTH = 10 
unique_train = list(unique_train)
unique_train = [example for example in unique_train if len(example) > MIN_LENGTH]
print("Length of training data after removing short examples: ", len(unique_train))

Length of training data after removing short examples:  21271


**Purpose of Removing <unk> Tokens**

The <unk> token serves as a placeholder for unknown/out-of-vocabulary words in the original WikiText-2 dataset. Here's why we remove them:

1. They're Not Real Words
<unk> is a special marker, not actual text content
It represents words that were replaced during the original dataset creation
Including them would teach the tokenizer to treat <unk> as a meaningful word

2. Tokenizer Training Contamination
If we keep <unk> tokens, the BPE algorithm might learn to split them as "<", "unk", ">"
This creates artificial vocabulary entries that don't represent real language patterns
The tokenizer should learn to handle unknown words naturally, not through explicit <unk> markers

3. Real-World Usage
In production, you want the tokenizer to break down unknown words into subwords
Having <unk> in training teaches the model to use this placeholder instead of learning proper subword segmentation
A well-trained BPE tokenizer should rarely need <unk> tokens

4. Clean Learning Signal
Removing <unk> forces the tokenizer to learn from actual text patterns
The BPE algorithm will naturally create subword units that can handle new words
This leads to better generalization to unseen vocabulary

In [None]:
# 2.3 Clean the Data
def clean_text(text):
    # Remove <unk> tokens
    text = re.sub(r'<unk>', '', text)
    # Replace all sequences of whitespace (spaces, tabs, newlines) with a single space, and remove leading/trailing spaces
    text = re.sub(r'\s+', ' ', text).strip()
    return text

unique_train = [clean_text(text) for text in unique_train]



In [15]:
# Verify cleaning worked
print(f"Final training samples: {len(unique_train)}")
print(f"Sample cleaned text: {unique_train[0][:200]}...")

# Check for any remaining issues
empty_count = sum(1 for text in unique_train if not text.strip())
print(f"Empty samples remaining: {empty_count}")

Final training samples: 21271
Sample cleaned text: The " off @-@ the @-@ cuff " conclusion to production was one of the reasons the album was titled Loose . It was named partly after the spontaneous decisions she made when creating the album . The alb...
Empty samples remaining: 1


In [17]:
# Remove again empty samples, we may have gotten one now do do removing <unk>
unique_train = [text for text in unique_train if len(text)> MIN_LENGTH]
empty_count = sum(1 for text in unique_train if not text.strip())
print(f"Empty samples remaining: {empty_count}")

Empty samples remaining: 0


In [18]:
# For training a BPE tokenizer, you should save your training set as a plain text file,
# with one document (string) per line. This is the standard format expected by most
# tokenizers, including Hugging Face's tokenizers library.
with open("wikitext2_train_cleaned.txt", "w", encoding="utf-8") as f:
    for doc in unique_train:
        f.write(doc + "\n")