**INSTALL THE REQUIREMENTS**

In [1]:
!pip install transformers datasets torch



**MOUNT GOOGLE DRIVE & IMPORT LIBRARIES**

In [3]:
# Attach Google Drive to the Colab filesystem at a named mount point.
from google.colab import drive

DRIVE_MOUNT_POINT = '/content/drive'
drive.mount(DRIVE_MOUNT_POINT)

# Import libraries
from datasets import load_from_disk
from transformers import AutoTokenizer
import torch

# Prefer a CUDA device when PyTorch can see one; otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')
print(f"Using device: {device}")

Mounted at /content/drive
Using device: cpu


**LOAD PREPROCESSED DATA**

In [4]:
print("\nLoading datasets from Google Drive...")
drive_path = "/content/drive/MyDrive/AG_News_Project"

# On-disk location of each previously saved split under the project folder.
split_dirs = {
    "train": f"{drive_path}/ag_news_train",
    "val": f"{drive_path}/ag_news_val",
    "test": f"{drive_path}/ag_news_test",
}

train_dataset = load_from_disk(split_dirs["train"])
val_dataset = load_from_disk(split_dirs["val"])
test_dataset = load_from_disk(split_dirs["test"])

# Report the size of every split once all three have loaded.
for split_label, split_data in (
    ("train", train_dataset),
    ("validation", val_dataset),
    ("test", test_dataset),
):
    print(f"✓ Loaded {split_label}: {len(split_data)} samples")


Loading datasets from Google Drive...
✓ Loaded train: 108000 samples
✓ Loaded validation: 12000 samples
✓ Loaded test: 7600 samples


**LOAD DISTILBERT TOKENIZER**

In [5]:
print("\nLoading DistilBERT tokenizer...")
model_name = "distilbert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(model_name)

print(f"✓ Tokenizer loaded: {model_name}")
print(f"Vocab size: {tokenizer.vocab_size}")
print(f"Max length: {tokenizer.model_max_length}")

# Tabulate the special tokens (text, vocabulary ID) and print one line per entry.
special_tokens = {
    "CLS": (tokenizer.cls_token, tokenizer.cls_token_id),
    "SEP": (tokenizer.sep_token, tokenizer.sep_token_id),
    "PAD": (tokenizer.pad_token, tokenizer.pad_token_id),
}
print("\nSpecial tokens:")
for token_label, (token_text, token_id) in special_tokens.items():
    print(f"{token_label} token: {token_text} (ID: {token_id})")


Loading DistilBERT tokenizer...


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/483 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

✓ Tokenizer loaded: distilbert-base-uncased
Vocab size: 30522
Max length: 512

Special tokens:
CLS token: [CLS] (ID: 101)
SEP token: [SEP] (ID: 102)
PAD token: [PAD] (ID: 0)


**TESTING TOKENIZATION**

In [7]:
section_rule = "=" * 50
print(f"\n{section_rule}")
print("TESTING TOKENIZATION ON SAMPLE TEXT")
print(section_rule)

# Use the first training example as the probe text.
sample_text = train_dataset[0]['text']
print(f"\nOriginal text: {sample_text[:200]}...")

# Encode the sample with truncation and padding to 128 tokens, as PyTorch tensors.
encoded = tokenizer(
    sample_text,
    truncation=True,
    padding='max_length',
    max_length=128,
    return_tensors='pt',
)
input_ids = encoded['input_ids']
attention_mask = encoded['attention_mask']

print("\nTokenized output:")
print(f"Input IDs shape: {input_ids.shape}")
print(f"Attention mask shape: {attention_mask.shape}")
print(f"\nFirst 20 token IDs: {input_ids[0][:20].tolist()}")
print(f"First 20 attention masks: {attention_mask[0][:20].tolist()}")

# Round-trip the IDs back to text to inspect what the tokenizer produced.
decoded = tokenizer.decode(input_ids[0])
print(f"\nDecoded text: {decoded[:200]}...")


TESTING TOKENIZATION ON SAMPLE TEXT

Original text: Despair and Anger in Small Russian Town After Siege.  BESLAN, Russia (Reuters) - The killing of more than 320  children, parents and teachers during the bloody end to a  53-hour school siege left bare...

Tokenized output:
Input IDs shape: torch.Size([1, 128])
Attention mask shape: torch.Size([1, 128])

First 20 token IDs: [101, 13905, 1998, 4963, 1999, 2235, 2845, 2237, 2044, 6859, 1012, 2022, 14540, 2319, 1010, 3607, 1006, 26665, 1007, 1011]
First 20 attention masks: [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]

Decoded text: [CLS] despair and anger in small russian town after siege. beslan, russia ( reuters ) - the killing of more than 320 children, parents and teachers during the bloody end to a 53 - hour school siege le...


**TOKENIZE ALL DATASETS**

In [8]:
# Section banner for the bulk-tokenization step.
banner_rule = "=" * 50
print(f"\n{banner_rule}")
print("TOKENIZING ALL DATASETS")
print(banner_rule)

def tokenize_function(examples, max_length=128):
    """Tokenize a batch of texts with truncation and fixed-length padding.

    Args:
        examples: Mapping whose 'text' entry holds the text(s) to tokenize,
            e.g. the batch that ``Dataset.map(..., batched=True)`` passes in.
        max_length: Length passed to the tokenizer for both truncation and
            'max_length' padding. Defaults to 128 to save memory; it can be
            raised to 256 or 512, for example via
            ``dataset.map(tokenize_function, batched=True,
            fn_kwargs={"max_length": 256})``.

    Returns:
        Whatever the module-level ``tokenizer`` returns for the batch;
        ``return_tensors=None`` is passed, so no tensor conversion is requested.
    """
    return tokenizer(
        examples['text'],
        truncation=True,
        padding='max_length',  # pad every example to max_length, not to the batch maximum
        max_length=max_length,
        return_tensors=None,
    )

def _tokenize_split(split):
    """Apply ``tokenize_function`` to ``split`` in batches and drop its text columns."""
    return split.map(
        tokenize_function,
        batched=True,
        remove_columns=['text', 'title', 'description'],  # Remove text columns to save memory
    )


print("\nTokenizing training set...")
tokenized_train = _tokenize_split(train_dataset)

print("Tokenizing validation set...")
tokenized_val = _tokenize_split(val_dataset)

print("Tokenizing test set...")
tokenized_test = _tokenize_split(test_dataset)

# Confirm which columns and feature types remain after tokenization.
print("\n✓ Tokenization complete!")
print(f"Tokenized train columns: {tokenized_train.column_names}")
print(f"Tokenized train features: {tokenized_train.features}")


TOKENIZING ALL DATASETS

Tokenizing training set...


Map:   0%|          | 0/108000 [00:00<?, ? examples/s]

Tokenizing validation set...


Map:   0%|          | 0/12000 [00:00<?, ? examples/s]

Tokenizing test set...


Map:   0%|          | 0/7600 [00:00<?, ? examples/s]


✓ Tokenization complete!
Tokenized train columns: ['label', 'input_ids', 'attention_mask']
Tokenized train features: {'label': Value('int64'), 'input_ids': List(Value('int32')), 'attention_mask': List(Value('int8'))}


**SET FORMAT FOR PYTORCH**

In [9]:
print("\nSetting format for PyTorch...")
# Columns that should be returned in 'torch' format when a split is indexed.
torch_columns = ['input_ids', 'attention_mask', 'label']
for tokenized_split in (tokenized_train, tokenized_val, tokenized_test):
    tokenized_split.set_format('torch', columns=torch_columns)

print("✓ Format set to PyTorch tensors")

# Show sample: fetch the first training example once and inspect its fields.
first_example = tokenized_train[0]
print("\nSample tokenized data:")
print(f"Input IDs: {first_example['input_ids'].shape}")
print(f"Attention mask: {first_example['attention_mask'].shape}")
print(f"Label: {first_example['label']}")


Setting format for PyTorch...
✓ Format set to PyTorch tensors

Sample tokenized data:
Input IDs: torch.Size([128])
Attention mask: torch.Size([128])
Label: 0


**SAVE TOKENIZED DATASETS & TOKENIZER**

In [10]:
# ========== SAVE TOKENIZED DATASETS ==========
save_rule = "=" * 50
print(f"\n{save_rule}")
print("SAVING TOKENIZED DATASETS TO GOOGLE DRIVE")
print(save_rule)

# Destination directory under drive_path for each tokenized split.
save_targets = {
    f"{drive_path}/tokenized_train": tokenized_train,
    f"{drive_path}/tokenized_val": tokenized_val,
    f"{drive_path}/tokenized_test": tokenized_test,
}
for target_dir, split_data in save_targets.items():
    split_data.save_to_disk(target_dir)

# Report only after every split has been written.
print()
for split_label, target_dir in zip(("train", "val", "test"), save_targets):
    print(f"✓ Saved tokenized {split_label} to: {target_dir}")

# ========== SAVE TOKENIZER ==========
tokenizer_dir = f"{drive_path}/tokenizer"
print("\nSaving tokenizer...")
tokenizer.save_pretrained(tokenizer_dir)
print(f"✓ Saved tokenizer to: {tokenizer_dir}")

print(f"\n{save_rule}")
print("TOKENIZATION COMPLETE! Ready for model training.")
print(save_rule)


SAVING TOKENIZED DATASETS TO GOOGLE DRIVE


Saving the dataset (0/1 shards):   0%|          | 0/108000 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/12000 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/7600 [00:00<?, ? examples/s]


✓ Saved tokenized train to: /content/drive/MyDrive/AG_News_Project/tokenized_train
✓ Saved tokenized val to: /content/drive/MyDrive/AG_News_Project/tokenized_val
✓ Saved tokenized test to: /content/drive/MyDrive/AG_News_Project/tokenized_test

Saving tokenizer...
✓ Saved tokenizer to: /content/drive/MyDrive/AG_News_Project/tokenizer

TOKENIZATION COMPLETE! Ready for model training.
