# Problem 2:

Do train-dev-test split and compute NLL loss on the three sets while training only on train set

### Step 1: Train-Dev-Test split (80-10-10)

In [2]:
import torch
from torch.utils.data import Dataset, random_split
import torch.nn.functional as F

In [None]:
import torch
from torch.utils.data import TensorDataset, random_split

def split_tensors(xs, ys, train_size=0.8, val_size=0.1, test_size=0.1, random_seed=42):
    """
    Split PyTorch tensors into train, validation, and test sets
    """
    xs = torch.Tensor(xs).long()
    ys = torch.Tensor(ys).long()

    # Create TensorDataset
    dataset = TensorDataset(xs, ys)

    # Calculate lengths for splits
    total_size = len(dataset)
    train_length = int(total_size * train_size)
    val_length = int(total_size * val_size)
    test_length = total_size - train_length - val_length

    # Split dataset
    train_dataset, val_dataset, test_dataset = random_split(
        dataset, 
        [train_length, val_length, test_length],
        generator=torch.Generator().manual_seed(random_seed)
    )

    # Return the splits
    return train_dataset, val_dataset, test_dataset

### Step 2.1: Create the bigram data and split it

In [53]:
with open('../names.txt', 'r') as file:
    names = file.read().splitlines()

len(names)

32033

In [54]:
chars = sorted(list(set("".join(name for name in names))))
chars

['a',
 'b',
 'c',
 'd',
 'e',
 'f',
 'g',
 'h',
 'i',
 'j',
 'k',
 'l',
 'm',
 'n',
 'o',
 'p',
 'q',
 'r',
 's',
 't',
 'u',
 'v',
 'w',
 'x',
 'y',
 'z']

In [55]:
stoi, itos = {}, {}

stoi['.'] = 0
itos[0] = '.'

for i, ch in enumerate(chars):
    stoi[ch] = i + 1
    itos[i + 1] = ch

In [57]:
xs_bigram, ys_bigram, n = [], [], 0

for name in names:
    name = ['.'] + list(name) + ['.']

    for ch1, ch2 in zip(name, name[1:]):
        i1, i2 = stoi[ch1], stoi[ch2]
        xs_bigram.append(i1)
        ys_bigram.append(i2)
        n += 1

print(f"Number of bigrams: {n}")

Number of bigrams: 228146


In [59]:
train_bigram, val_bigram, test_bigram = split_tensors(xs_bigram, ys_bigram)

X_train_bigram = torch.stack([x for x, _ in train_bigram])
y_train_bigram = torch.stack([y for _, y in train_bigram])

X_val_bigram = torch.stack([x for x, _ in val_bigram])
y_val_bigram = torch.stack([y for _, y in val_bigram])

X_test_bigram = torch.stack([x for x, _ in test_bigram])
y_test_bigram = torch.stack([y for _, y in test_bigram])

In [60]:
X_train_bigram.shape, y_train_bigram.shape

(torch.Size([182516]), torch.Size([182516]))

In [61]:
X_train_bigram_enc = F.one_hot(X_train_bigram, num_classes=len(stoi)).float()
X_val_bigram_enc = F.one_hot(X_val_bigram, num_classes=len(stoi)).float()
X_test_bigram_enc = F.one_hot(X_test_bigram, num_classes=len(stoi)).float()

X_train_bigram_enc.shape, X_val_bigram_enc.shape, X_test_bigram_enc.shape

(torch.Size([182516, 27]), torch.Size([22814, 27]), torch.Size([22816, 27]))

### Step 2.2: Train bigram model on train

In [62]:
g = torch.Generator().manual_seed(2147483647)

bigramW = torch.randn((len(stoi), len(stoi)), generator=g, dtype=torch.float32, requires_grad=True)
bigramW.shape

torch.Size([27, 27])

In [63]:
for i in range(100):
    bigramW.grad = None

    logits = X_train_bigram_enc @ bigramW
    counts = torch.exp(logits)
    probs = counts / (counts.sum(dim=1, keepdim=True))

    pred = probs[torch.arange(0, len(y_train_bigram)), y_train_bigram]

    bigramNll = -torch.log(pred).mean()

    bigramNll.backward()
    with torch.no_grad():
        bigramW -= 50*bigramW.grad

    print(f"Epoch {i+1} NLL: {bigramNll.item()}")

Epoch 1 NLL: 3.758269786834717
Epoch 2 NLL: 3.3710391521453857
Epoch 3 NLL: 3.1540277004241943
Epoch 4 NLL: 3.020277976989746
Epoch 5 NLL: 2.9275436401367188
Epoch 6 NLL: 2.860156774520874
Epoch 7 NLL: 2.809420585632324
Epoch 8 NLL: 2.769749879837036
Epoch 9 NLL: 2.7376906871795654
Epoch 10 NLL: 2.7110939025878906
Epoch 11 NLL: 2.688584089279175
Epoch 12 NLL: 2.6692535877227783
Epoch 13 NLL: 2.6524770259857178
Epoch 14 NLL: 2.637803554534912
Epoch 15 NLL: 2.624891519546509
Epoch 16 NLL: 2.613471508026123
Epoch 17 NLL: 2.6033220291137695
Epoch 18 NLL: 2.5942602157592773
Epoch 19 NLL: 2.586132764816284
Epoch 20 NLL: 2.5788097381591797
Epoch 21 NLL: 2.5721826553344727
Epoch 22 NLL: 2.566159725189209
Epoch 23 NLL: 2.5606629848480225
Epoch 24 NLL: 2.555626630783081
Epoch 25 NLL: 2.5509958267211914
Epoch 26 NLL: 2.546722888946533
Epoch 27 NLL: 2.5427677631378174
Epoch 28 NLL: 2.5390961170196533
Epoch 29 NLL: 2.535680055618286
Epoch 30 NLL: 2.532493829727173
Epoch 31 NLL: 2.529515504837036
Ep

### Step 2.3: Compute NLL for val and test

In [64]:
def compute_nll_bigram(xenc, ys, W):
    logits = xenc @ W
    counts = torch.exp(logits)
    probs = counts / (counts.sum(dim=1, keepdim=True))

    pred = probs[torch.arange(0, len(ys)), ys]

    nll = -torch.log(pred).mean()

    return nll.item()

In [66]:
val_nll, test_nll = compute_nll_bigram(X_val_bigram_enc, y_val_bigram, bigramW),\
      compute_nll_bigram(X_test_bigram_enc, y_test_bigram, bigramW)

val_nll, test_nll

(2.4763457775115967, 2.476706027984619)

### Step 3.1: Split trigram dataset

In [112]:
with open('../names.txt', 'r') as file:
    names = file.read().splitlines()

len(names)

32033

In [113]:
chars = sorted(list(set("".join(name for name in names))))
chars

['a',
 'b',
 'c',
 'd',
 'e',
 'f',
 'g',
 'h',
 'i',
 'j',
 'k',
 'l',
 'm',
 'n',
 'o',
 'p',
 'q',
 'r',
 's',
 't',
 'u',
 'v',
 'w',
 'x',
 'y',
 'z']

In [114]:
stoi, itos, btoi, itob = {}, {}, {}, {}

stoi['.'] = 0
itos[0] = '.'

for i, char in enumerate(chars):
    stoi[char] = i + 1
    itos[i + 1] = char

cnt = 0
btoi['..'] = 0
itob[0] = '..'
cnt += 1

for ch1 in (['.'] + chars):
    for ch2 in chars:
        bigram = ch1 + ch2
        btoi[bigram] = cnt
        itob[cnt] = bigram
        cnt += 1

In [115]:
xs_trigram, ys_trigram, n = [], [], 0

for name in names:
    name = ['.'] + list(name) + ['.']

    for ch1, ch2, ch3 in zip(name, name[1:], name[2:]):
        bigram = ch1 + ch2
        i1, i2 = btoi[bigram], stoi[ch3]
        xs_trigram.append(i1)
        ys_trigram.append(i2)
        n += 1

print(f"No of trigrams: {n}")    

No of trigrams: 196113


In [116]:
train_dataset_trigram, val_dataset_trigram, test_dataset_trigram = split_tensors(xs_trigram, ys_trigram)

X_train_trigram = torch.stack([x for x, _ in train_dataset_trigram])
y_train_trigram = torch.stack([y for _, y in train_dataset_trigram])

X_val_trigram = torch.stack([x for x, _ in val_dataset_trigram])
y_val_trigram = torch.stack([y for _, y in val_dataset_trigram])

X_test_trigram = torch.stack([x for x, _ in test_dataset_trigram])
y_test_trigram = torch.stack([y for _, y in test_dataset_trigram])

In [117]:
X_train_trigram.shape, y_train_trigram.shape

(torch.Size([156890]), torch.Size([156890]))

In [118]:
X_val_trigram.shape, y_val_trigram.shape

(torch.Size([19611]), torch.Size([19611]))

In [119]:
X_test_trigram.shape, y_test_trigram.shape

(torch.Size([19612]), torch.Size([19612]))

In [120]:
X_train_trigram_enc = F.one_hot(X_train_trigram, num_classes=len(btoi)).float()
X_val_trigram_enc = F.one_hot(X_val_trigram, num_classes=len(btoi)).float()
X_test_trigram_enc = F.one_hot(X_test_trigram, num_classes=len(btoi)).float()

X_train_trigram_enc.shape, X_val_trigram_enc.shape, X_test_trigram_enc.shape

(torch.Size([156890, 703]), torch.Size([19611, 703]), torch.Size([19612, 703]))

### Step 3.2: Train trigram model on `train_data`

In [121]:
g = torch.Generator().manual_seed(2147483647)

trigramW = torch.randn((len(btoi), len(stoi)), generator=g, dtype=torch.float32, requires_grad=True)
trigramW.shape

torch.Size([703, 27])

In [122]:
for i in range(200):
    trigramW.grad = None

    logits = X_train_trigram_enc @ trigramW
    counts = torch.exp(logits)
    probs = counts / (counts.sum(dim=1, keepdim=True))

    pred = probs[torch.arange(0, len(y_train_trigram)), y_train_trigram]

    trigramNll = -torch.log(pred).mean()

    trigramNll.backward()
    with torch.no_grad():
        trigramW -= 75*trigramW.grad

    print(f"Epoch {i+1} NLL: {trigramNll.item()}")

Epoch 1 NLL: 3.7393720149993896
Epoch 2 NLL: 3.6250696182250977
Epoch 3 NLL: 3.521294355392456
Epoch 4 NLL: 3.428551435470581
Epoch 5 NLL: 3.3469762802124023
Epoch 6 NLL: 3.2757656574249268
Epoch 7 NLL: 3.2133870124816895
Epoch 8 NLL: 3.158212661743164
Epoch 9 NLL: 3.108926773071289
Epoch 10 NLL: 3.064568281173706
Epoch 11 NLL: 3.0244274139404297
Epoch 12 NLL: 2.987946033477783
Epoch 13 NLL: 2.954658269882202
Epoch 14 NLL: 2.924161195755005
Epoch 15 NLL: 2.8961007595062256
Epoch 16 NLL: 2.8701677322387695
Epoch 17 NLL: 2.846097469329834
Epoch 18 NLL: 2.8236639499664307
Epoch 19 NLL: 2.8026764392852783
Epoch 20 NLL: 2.782975196838379
Epoch 21 NLL: 2.764425277709961
Epoch 22 NLL: 2.7469122409820557
Epoch 23 NLL: 2.730339288711548
Epoch 24 NLL: 2.7146220207214355
Epoch 25 NLL: 2.6996874809265137
Epoch 26 NLL: 2.6854729652404785
Epoch 27 NLL: 2.671922206878662
Epoch 28 NLL: 2.6589865684509277
Epoch 29 NLL: 2.6466221809387207
Epoch 30 NLL: 2.6347899436950684
Epoch 31 NLL: 2.6234548091888428

### Step 3.3 : Get NLL for `val` and `test` split

In [123]:
def get_nll(xenc, ys, W):
    logits = xenc @ W
    counts = torch.exp(logits)
    probs = counts / (counts.sum(dim=1, keepdim=True))

    pred = probs[torch.arange(0, len(ys)), ys]

    nll = -torch.log(pred).mean().item()

    return nll

In [124]:
val_nll = get_nll(X_val_trigram_enc, y_val_trigram, trigramW)
test_nll = get_nll(X_test_trigram_enc, y_test_trigram, trigramW)
val_nll, test_nll

(2.2296581268310547, 2.2140655517578125)

# Problem 3

Analyse the regularization hyperparameter and choose the best one for bigram and trigram model

### Step 1.1: Create the loop for L2 normalization parameter

In [125]:
for reg_param in [1, 0.5, 0.1, 0.05, 0.01, 0.005, 0.001]:
    g = torch.Generator().manual_seed(2147483647)
    bigramW = torch.randn((len(stoi), len(stoi)), generator=g, dtype=torch.float32, requires_grad=True)
  
    ## adjust lr since there is no convergence in the higher reg_param for very high lr
    lr = 1/reg_param
    if lr > 100:
        lr = 100
    elif lr < 0.01:
        lr = 0.01
        
    for epoch in range(100):
        bigramW.grad = None

        ## Train 
        logits = X_train_bigram_enc @ bigramW
        counts = torch.exp(logits)
        probs = counts / (counts.sum(dim=1, keepdim=True))

        pred = probs[torch.arange(0, len(y_train_bigram)), y_train_bigram]

        bigramNll = -torch.log(pred).mean() + reg_param * (((bigramW.data)**2).mean())

        bigramNll.backward()

        with torch.no_grad():
            bigramW -= lr*bigramW.grad

    ## Train and Val losses
    with torch.no_grad():
        logits = X_train_bigram_enc @ bigramW
        counts = torch.exp(logits)
        probs = counts / (counts.sum(dim=1, keepdim=True))

        pred = probs[torch.arange(0, len(y_train_bigram)), y_train_bigram]

        bigramNll_train = -torch.log(pred).mean()

        logits = X_val_bigram_enc @ bigramW
        counts = torch.exp(logits)
        probs = counts / (counts.sum(dim=1, keepdim=True))

        pred = probs[torch.arange(0, len(y_val_bigram)), y_val_bigram]

        bigramNll_val = -torch.log(pred).mean()

    print(f"Regularization param: {reg_param}, Train loss: {bigramNll_train}, Val loss: {bigramNll_val}")

Regularization param: 1, Train loss: 3.1852142810821533, Val loss: 3.1824824810028076
Regularization param: 0.5, Train loss: 2.947974681854248, Val loss: 2.944690704345703
Regularization param: 0.1, Train loss: 2.5745139122009277, Val loss: 2.5758018493652344
Regularization param: 0.05, Train loss: 2.508540153503418, Val loss: 2.5116376876831055
Regularization param: 0.01, Train loss: 2.4907517433166504, Val loss: 2.495285987854004
Regularization param: 0.005, Train loss: 2.4907517433166504, Val loss: 2.495285987854004
Regularization param: 0.001, Train loss: 2.4907517433166504, Val loss: 2.495285987854004


### Step 1.2: Take best L2 parameters and train

In [126]:
g = torch.Generator().manual_seed(2147483647)
bigramW_best = torch.randn((len(stoi), len(stoi)), generator=g, dtype=torch.float32, requires_grad=True)

reg_param = 0.01
lr = 100  

for epoch in range(100):
    bigramW_best.grad = None

    ## Train 
    logits = X_train_bigram_enc @ bigramW_best
    counts = torch.exp(logits)
    probs = counts / (counts.sum(dim=1, keepdim=True))

    pred = probs[torch.arange(0, len(y_train_bigram)), y_train_bigram]

    bigramNll = -torch.log(pred).mean() + reg_param * (((bigramW_best.data)**2).mean())

    bigramNll.backward()

    with torch.no_grad():
        bigramW_best -= lr*bigramW_best.grad

    ## Train and Val losses
    with torch.no_grad():
        logits = X_train_bigram_enc @ bigramW_best
        counts = torch.exp(logits)
        probs = counts / (counts.sum(dim=1, keepdim=True))

        pred = probs[torch.arange(0, len(y_train_bigram)), y_train_bigram]

        bigramNll_train = -torch.log(pred).mean().item()

        logits = X_val_bigram_enc @ bigramW_best
        counts = torch.exp(logits)
        probs = counts / (counts.sum(dim=1, keepdim=True))

        pred = probs[torch.arange(0, len(y_val_bigram)), y_val_bigram]

        bigramNll_val = -torch.log(pred).mean().item()

    print(f"Epoch {epoch+1}, Train NLL: {bigramNll_train}, Val NLL: {bigramNll_val}")

Epoch 1, Train NLL: 3.1295087337493896, Val NLL: 3.1260931491851807
Epoch 2, Train NLL: 2.9141311645507812, Val NLL: 2.910109758377075
Epoch 3, Train NLL: 2.7997078895568848, Val NLL: 2.7955095767974854
Epoch 4, Train NLL: 2.7334985733032227, Val NLL: 2.7308766841888428
Epoch 5, Train NLL: 2.686508893966675, Val NLL: 2.684903144836426
Epoch 6, Train NLL: 2.6544017791748047, Val NLL: 2.653481960296631
Epoch 7, Train NLL: 2.6291565895080566, Val NLL: 2.6294138431549072
Epoch 8, Train NLL: 2.617499351501465, Val NLL: 2.617274045944214
Epoch 9, Train NLL: 2.605008840560913, Val NLL: 2.606895685195923
Epoch 10, Train NLL: 2.614349842071533, Val NLL: 2.6142373085021973
Epoch 11, Train NLL: 2.582610845565796, Val NLL: 2.5848169326782227
Epoch 12, Train NLL: 2.577045202255249, Val NLL: 2.577923059463501
Epoch 13, Train NLL: 2.5659255981445312, Val NLL: 2.5690176486968994
Epoch 14, Train NLL: 2.578768014907837, Val NLL: 2.5796923637390137
Epoch 15, Train NLL: 2.551987648010254, Val NLL: 2.55506

### Step 1.3: Get the test NLL

In [127]:
with torch.no_grad():
    logits = X_test_bigram_enc @ bigramW_best
    counts = torch.exp(logits)
    probs = counts / (counts.sum(dim=1, keepdim=True))

    pred = probs[torch.arange(0, len(y_test_bigram)), y_test_bigram]

    bigramNll_test = -torch.log(pred).mean().item()

bigramNll_test

2.4967715740203857

### Step 2.1: Create the loop for L2 normalization parameter trigram

In [128]:
for reg_param in [1, 0.5, 0.1, 0.05, 0.01, 0.005, 0.001]:
    g = torch.Generator().manual_seed(2147483647)
    trigramW = torch.randn((len(btoi), len(stoi)), generator=g, dtype=torch.float32, requires_grad=True)
  
    ## adjust lr since there is no convergence in the higher reg_param for very high lr
    lr = 1/reg_param
    if lr > 100:
        lr = 100
    
    for epoch in range(100):
        trigramW.grad = None

        ## Train 
        logits = X_train_trigram_enc @ trigramW
        counts = torch.exp(logits)
        probs = counts / (counts.sum(dim=1, keepdim=True))

        pred = probs[torch.arange(0, len(y_train_trigram)), y_train_trigram]

        trigramNll = -torch.log(pred).mean() + reg_param * (((trigramW.data)**2).mean())

        trigramNll.backward()

        with torch.no_grad():
            trigramW -= lr*trigramW.grad

    ## Train and Val losses
    with torch.no_grad():
        logits = X_train_trigram_enc @ trigramW
        counts = torch.exp(logits)
        probs = counts / (counts.sum(dim=1, keepdim=True))

        pred = probs[torch.arange(0, len(y_train_trigram)), y_train_trigram]

        trigramNll_train = -torch.log(pred).mean()

        logits = X_val_trigram_enc @ trigramW
        counts = torch.exp(logits)
        probs = counts / (counts.sum(dim=1, keepdim=True))

        pred = probs[torch.arange(0, len(y_val_trigram)), y_val_trigram]

        trigramNll_val = -torch.log(pred).mean()

    print(f"Regularization param: {reg_param}, Train loss: {trigramNll_train}, Val loss: {trigramNll_val}")

Regularization param: 1, Train loss: 3.5922605991363525, Val loss: 3.6011605262756348
Regularization param: 0.5, Train loss: 3.4635202884674072, Val loss: 3.472970724105835
Regularization param: 0.1, Train loss: 2.9193384647369385, Val loss: 2.9296774864196777
Regularization param: 0.05, Train loss: 2.6655161380767822, Val loss: 2.67805814743042
Regularization param: 0.01, Train loss: 2.2568893432617188, Val loss: 2.2825212478637695
Regularization param: 0.005, Train loss: 2.2568893432617188, Val loss: 2.2825212478637695
Regularization param: 0.001, Train loss: 2.2568893432617188, Val loss: 2.2825212478637695


### Step 1.2: Take best L2 parameters and train

In [129]:
g = torch.Generator().manual_seed(2147483647)
trigramW_best = torch.randn((len(btoi), len(stoi)), generator=g, dtype=torch.float32, requires_grad=True)

reg_param = 0.01
lr = 100  

for epoch in range(100):
    trigramW_best.grad = None

    ## Train 
    logits = X_train_trigram_enc @ trigramW_best
    counts = torch.exp(logits)
    probs = counts / (counts.sum(dim=1, keepdim=True))

    pred = probs[torch.arange(0, len(y_train_trigram)), y_train_trigram]

    trigramNll = -torch.log(pred).mean() + reg_param * (((trigramW_best.data)**2).mean())

    trigramNll.backward()

    with torch.no_grad():
        trigramW_best -= lr*trigramW_best.grad


    ## Train and Val losses
    with torch.no_grad():
        logits = X_train_trigram_enc @ trigramW_best
        counts = torch.exp(logits)
        probs = counts / (counts.sum(dim=1, keepdim=True))

        pred = probs[torch.arange(0, len(y_train_trigram)), y_train_trigram]

        trigramNll_train = -torch.log(pred).mean().item()

        logits = X_val_trigram_enc @ trigramW_best
        counts = torch.exp(logits)
        probs = counts / (counts.sum(dim=1, keepdim=True))

        pred = probs[torch.arange(0, len(y_val_trigram)), y_val_trigram]

        trigramNll_val = -torch.log(pred).mean().item()

    print(f"Epoch {epoch+1}, Train NLL: {trigramNll_train}, Val NLL: {trigramNll_val}")

Epoch 1, Train NLL: 3.5882608890533447, Val NLL: 3.5972273349761963
Epoch 2, Train NLL: 3.4563732147216797, Val NLL: 3.4659299850463867
Epoch 3, Train NLL: 3.3446061611175537, Val NLL: 3.3545033931732178
Epoch 4, Train NLL: 3.2515594959259033, Val NLL: 3.2616055011749268
Epoch 5, Train NLL: 3.173459768295288, Val NLL: 3.1835429668426514
Epoch 6, Train NLL: 3.106626510620117, Val NLL: 3.1167030334472656
Epoch 7, Train NLL: 3.048572301864624, Val NLL: 3.058645725250244
Epoch 8, Train NLL: 2.997666835784912, Val NLL: 3.007766008377075
Epoch 9, Train NLL: 2.9527089595794678, Val NLL: 2.9628734588623047
Epoch 10, Train NLL: 2.912720203399658, Val NLL: 2.9229929447174072
Epoch 11, Train NLL: 2.876875638961792, Val NLL: 2.887296438217163
Epoch 12, Train NLL: 2.8444905281066895, Val NLL: 2.855093240737915
Epoch 13, Train NLL: 2.8150100708007812, Val NLL: 2.8258211612701416
Epoch 14, Train NLL: 2.7879936695098877, Val NLL: 2.799031972885132
Epoch 15, Train NLL: 2.763092517852783, Val NLL: 2.774

### Step 1.3: Get the test NLL

In [130]:
with torch.no_grad():
    logits = X_test_trigram_enc @ trigramW_best
    counts = torch.exp(logits)
    probs = counts / (counts.sum(dim=1, keepdim=True))

    pred = probs[torch.arange(0, len(y_test_trigram)), y_test_trigram]

    trigramNll_test = -torch.log(pred).mean().item()

trigramNll_test

2.2652206420898438