In [None]:
# Load sorted_sentences from file
with open('/content/curriculum_sorted.txt', 'r', encoding='utf-8') as f:
    sorted_sentences = [line.strip() for line in f.readlines()]
print(f"Loaded {len(sorted_sentences)} sorted sentences.")

curriculum_sentences = sorted_sentences

Loaded 447213 sorted sentences.


In [None]:
# 02_telugu_babylm_training.ipynb
import pandas as pd

import torch
import torch.nn as nn
import torch.optim as optim
import random
import math

# Load datasets
curriculum_sentences = sorted_sentences  # from previous notebook
random_sentences = pd.read_csv('/content/telugu_books.csv')['text'].tolist()
child_sentences = pd.read_csv('/content/child_directed_telugu_1Lakh.csv')['sentence'].tolist()

print(f"Curriculum: {len(curriculum_sentences)} sentences")
print(f"Random: {len(random_sentences)} sentences")
print(f"Child-directed: {len(child_sentences)} sentences")

# Basic tokenizer (whitespace split)
# Basic tokenizer (whitespace split)
def tokenize(sentence):
    if not isinstance(sentence, str):
        sentence = str(sentence)
    return sentence.split()

# Clean sentences properly
def clean_sentences(sent_list):
    clean = []
    for s in sent_list:
        if isinstance(s, str) and s.strip() != '':
            clean.append(s)
    return clean

# Clean all datasets
curriculum_sentences = clean_sentences(curriculum_sentences)
random_sentences = clean_sentences(random_sentences)
child_sentences = clean_sentences(child_sentences)

# Now build vocab
# New code (replace here)

import re
from collections import Counter

# Tokenize sentences better (split by Telugu words properly)
def tokenize(sentence):
    if not isinstance(sentence, str):
        sentence = str(sentence)
    return re.findall(r'\w+', sentence)

# Clean sentences first
def clean_sentences(sent_list):
    clean = []
    for s in sent_list:
        if isinstance(s, str) and s.strip() != '':
            clean.append(s)
    return clean

# Clean all datasets
# Subsample datasets for fast training (for testing)
curriculum_sentences = curriculum_sentences[:500]
random_sentences = random_sentences[:100]
child_sentences = child_sentences[:500]

print(f"New dataset sizes: Curriculum {len(curriculum_sentences)}, Random {len(random_sentences)}, Child-directed {len(child_sentences)}")


# Build vocab smarter
word_counter = Counter()
for s in (curriculum_sentences + random_sentences + child_sentences):
    word_counter.update(tokenize(s))

# Keep only frequent words
MIN_FREQ = 5
vocab = [word for word, freq in word_counter.items() if freq >= MIN_FREQ]

word2idx = {w: i+1 for i, w in enumerate(vocab)}  # 0 = padding
idx2word = {i: w for w, i in word2idx.items()}
vocab_size = len(word2idx) + 1
print(f"Final Vocab size after filtering: {vocab_size}")


# Encode sentences
def encode(sent):
    return [word2idx[w] for w in tokenize(sent) if w in word2idx]

# Model: simple Transformer
class SimpleTransformer(nn.Module):
    def __init__(self, vocab_size, embed_dim=256, heads=4, layers=6):
        super().__init__()
        self.embed = nn.Embedding(vocab_size, embed_dim)
        encoder_layer = nn.TransformerEncoderLayer(d_model=embed_dim, nhead=heads)
        self.transformer = nn.TransformerEncoder(encoder_layer, num_layers=layers)
        self.fc = nn.Linear(embed_dim, vocab_size)

    def forward(self, x):
        x = self.embed(x)
        x = self.transformer(x)
        x = self.fc(x)
        return x

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(f"Using device: {device}")

# Train function
def train_model(train_sentences, epochs=2):
    model = SimpleTransformer(vocab_size).to(device)
    optimizer = optim.Adam(model.parameters(), lr=0.001)
    criterion = nn.CrossEntropyLoss(ignore_index=0)
    model.train()

    for epoch in range(epochs):
        total_loss = 0
        random.shuffle(train_sentences)
        for sent in train_sentences:
            ids = encode(sent)
            if len(ids) < 2:
                continue
            input_ids = torch.tensor([ids[:-1]], device=device)
            target_ids = torch.tensor([ids[1:]], device=device)
            optimizer.zero_grad()
            output = model(input_ids)
            loss = criterion(output.view(-1, vocab_size), target_ids.view(-1))
            loss.backward()
            optimizer.step()
            total_loss += loss.item()
        print(f"Epoch {epoch+1}, Loss: {total_loss/len(train_sentences):.4f}")
    return model

# Perplexity function
def calculate_perplexity(model, sentences):
    model.eval()
    total_loss = 0
    total_count = 0
    criterion = nn.CrossEntropyLoss(ignore_index=0, reduction='sum')
    with torch.no_grad():
        for sent in sentences:
            ids = encode(sent)
            if len(ids) < 2:
                continue
            input_ids = torch.tensor([ids[:-1]], device=device)
            target_ids = torch.tensor([ids[1:]], device=device)
            output = model(input_ids)
            loss = criterion(output.view(-1, vocab_size), target_ids.view(-1))
            total_loss += loss.item()
            total_count += len(ids) - 1
    ppl = math.exp(total_loss / total_count)
    return ppl

# Train three models
print("Training on Curriculum...")
curr_model = train_model(curriculum_sentences)

print("Training on Random...")
rand_model = train_model(random_sentences)

print("Training on Child-directed...")
child_model = train_model(child_sentences)

# Evaluate
print("Evaluating...")
curr_ppl = calculate_perplexity(curr_model, curriculum_sentences[:1000])
rand_ppl = calculate_perplexity(rand_model, random_sentences[:1000])
child_ppl = calculate_perplexity(child_model, child_sentences[:1000])

print(f"Curriculum Perplexity: {curr_ppl:.2f}")
print(f"Random Perplexity: {rand_ppl:.2f}")
print(f"Child-directed Perplexity: {child_ppl:.2f}")

# Minimal pair testing
minimal_pairs = [
    ("అమ్మాయిలు బడికి వెళ్తున్నారు.", "అమ్మాయిలు బడికి వెళ్తుంది."),
    ("అతను పాలు తాగాడు.", "అతను పాలు తాగింది."),
]

for correct, wrong in minimal_pairs:
    c_ids = encode(correct)
    w_ids = encode(wrong)
    if len(c_ids) < 2 or len(w_ids) < 2:
        continue
    input_c = torch.tensor([c_ids[:-1]], device=device)
    target_c = torch.tensor([c_ids[1:]], device=device)
    input_w = torch.tensor([w_ids[:-1]], device=device)
    target_w = torch.tensor([w_ids[1:]], device=device)
    c_loss = nn.CrossEntropyLoss(ignore_index=0)(curr_model(input_c).view(-1, vocab_size), target_c.view(-1))
    w_loss = nn.CrossEntropyLoss(ignore_index=0)(curr_model(input_w).view(-1, vocab_size), target_w.view(-1))
    print(f"Minimal pair: Correct loss={c_loss.item():.4f}, Wrong loss={w_loss.item():.4f}")


Curriculum: 447213 sentences
Random: 25793 sentences
Child-directed: 100000 sentences
New dataset sizes: Curriculum 500, Random 100, Child-directed 500
Final Vocab size after filtering: 790
Using device: cpu
Training on Curriculum...




Epoch 1, Loss: 2.4102
Epoch 2, Loss: 2.6049
Training on Random...
Epoch 1, Loss: 4.2493
Epoch 2, Loss: 4.1448
Training on Child-directed...
Epoch 1, Loss: 3.0251
Epoch 2, Loss: 2.9251
Evaluating...
Curriculum Perplexity: 13.19
Random Perplexity: 62.69
Child-directed Perplexity: 17.75
Minimal pair: Correct loss=7.2830, Wrong loss=7.9648
Minimal pair: Correct loss=4.8156, Wrong loss=4.7088
