# Assignment 2

In this assignment, you will continue with the Bigram Language Model from the Lecture. Make the training loop and inference for the model.

## Importing Libraries

In [4]:
import os
import math
from dataclasses import dataclass
import torch
from torch.nn import functional as F
from utils import load_text, set_seed

## Configuration

In [9]:
@dataclass
class BigramConfig:
    root_dir: str = os.getcwd() + "/../../"
    dataset_path: str = "names.txt"

    # Tokenizer
    vocab_size: int = 0  # Set later

    seed: int = 101

config = BigramConfig()

## Reproducibility

In [6]:
set_seed(config.seed)

Random seed set to 101


## Dataset

In [10]:
names = load_text(config.dataset_path).splitlines()

Loaded text data from names.txt (length: 228145 characters).


## Preprocessing

In [11]:
# Add special token
names = ["." + name + "." for name in names]

## Tokenizer

In [12]:
chars = [chr(i) for i in range(97, 123)]  # all alphabet characters
chars.insert(0, ".")  # Add special token
config.vocab_size = len(chars)
str2idx = {char: idx for idx, char in enumerate(chars)}
idx2str = {idx: char for char, idx in str2idx.items()}

## Model

In [13]:
# Initialize weights
W = torch.randn(config.vocab_size, config.vocab_size, requires_grad=True)
b = torch.randn(config.vocab_size, requires_grad=True)
params = [W, b]

## Training

#### Task 1: Train Bigram Language Model (Neural Network Approach)

Make the training loop for the Bigram Language Model.

In [14]:
# Set of Input, Target pairs
inputs, targets = [], []
for name in names:
    for char1, char2 in zip(name, name[1:]):
        input = str2idx[char1]
        target = str2idx[char2]
        inputs.append(input)
        targets.append(target)

# Convert to tensor
inputs = torch.tensor(inputs, dtype=torch.long)
targets = torch.tensor(targets, dtype=torch.long)

In [15]:
print(f"Number of Input, Target pairs: {len(inputs)}")
print(f"Input shape: {inputs.shape}")
print(f"Target shape: {targets.shape}")
print(f"First (Input, Target): ({inputs[0]}, {targets[0]})")
print(f"Second (Input, Target): ({inputs[1]}, {targets[1]})")

Number of Input, Target pairs: 228146
Input shape: torch.Size([228146])
Target shape: torch.Size([228146])
First (Input, Target): (0, 5)
Second (Input, Target): (5, 13)


In [18]:
################################################################################
# TODO:                                                                        #
# One-hot encode the input tensor.                                             #
################################################################################
# *****START OF YOUR CODE (DO NOT DELETE/MODIFY THIS LINE)*****
inputs_encoded = F.one_hot(inputs, num_classes=config.vocab_size)
# *****END OF YOUR CODE (DO NOT DELETE/MODIFY THIS LINE)*****

# Convert data type to float
inputs_encoded = inputs_encoded.float()

In [19]:
# Training Loop
steps = 100
lr = 10

for step in range(1, steps + 1):
    # Forward pass
    ################################################################################
    # TODO:                                                                        #
    # Implement the forward pass.                                                  #
    ################################################################################
    # *****START OF YOUR CODE (DO NOT DELETE/MODIFY THIS LINE)*****
    logits = inputs_encoded @ W + b
    probs = F.softmax(logits, dim=1)
    # *****END OF YOUR CODE (DO NOT DELETE/MODIFY THIS LINE)*****

    # loss
    log_probs = torch.log(probs + 1e-9)  # Add small value to prevent log(0)
    loss = -log_probs[torch.arange(len(targets)), targets].mean()

    # Backward pass
    ################################################################################
    # TODO:                                                                        #
    # Implement the backward pass.                                                 #
    ################################################################################
    # *****START OF YOUR CODE (DO NOT DELETE/MODIFY THIS LINE)*****
    for param in params:
        if param.grad is not None:
            param.grad.zero_()
    loss.backward()
    # *****END OF YOUR CODE (DO NOT DELETE/MODIFY THIS LINE)*****

    # Update weights
    ################################################################################
    # TODO:                                                                        #
    # Update the weights of the model using the gradients.                         #
    ################################################################################
    # *****START OF YOUR CODE (DO NOT DELETE/MODIFY THIS LINE)*****
    with torch.no_grad():
        for param in params:
            param -= lr * param.grad
    # *****END OF YOUR CODE (DO NOT DELETE/MODIFY THIS LINE)*****

    if step % 10 == 0:
        print(f"Step {step}, Loss {loss.item():.4f}")

Step 10, Loss 2.8777
Step 20, Loss 2.7156
Step 30, Loss 2.6413
Step 40, Loss 2.5989
Step 50, Loss 2.5718
Step 60, Loss 2.5532
Step 70, Loss 2.5397
Step 80, Loss 2.5294
Step 90, Loss 2.5214
Step 100, Loss 2.5150


## Inference

#### Task 2: Generate a Name

Create a function to generate a name using the trained Bigram Language Model.

In [20]:
# Create a function to generate a name
def generate_name():
    new_name = []
    start_idx = str2idx["."]

    while True:
        ################################################################################
        # TODO:                                                                        #
        # 1. Forward pass                                                              #
        # 2. Sample the next token                                                     #
        # 3. Decode the token                                                          #
        # 4. Update the start_idx                                                      #
        # 5. Break if the next character is "."                                        #
        ################################################################################
        # *****START OF YOUR CODE (DO NOT DELETE/MODIFY THIS LINE)*****
        # Forward pass
        x = F.one_hot(torch.tensor([start_idx]), num_classes=config.vocab_size).float()
        logits = x @ W + b
        probs = F.softmax(logits, dim=1)

        # Sample
        next_idx = torch.multinomial(probs, num_samples=1).item()

        # Decode
        next_char = idx2str[next_idx]

        # Update
        if next_char == ".":
            break
        new_name.append(next_char)

        # Break if "."
        start_idx = next_idx

        # *****END OF YOUR CODE (DO NOT DELETE/MODIFY THIS LINE)*****
    return ''.join(new_name)

# Generate 5 names
for _ in range(5):
    print(generate_name())

aymienzeta
dhchatt
rneara
gn
todeleanion


## Extra Credit

We have already made our own custom auto-grad Tensor class. Let's use it!

Train the Bigram Language Model using our custom auto-grad Tensor class.

**Do not use any built-in PyTorch functions.** (other deep learning libraries are also prohibited)

In [21]:
class Tensor:
    def __init__(self, data, _children=(), _operation=''):
        self.data = data
        self._prev = set(_children)
        self.gradient = 0
        self._backward = lambda: None

    def __repr__(self):
        return f"tensor=({self.data})"

    def __add__(self, other):  # self + other
        output = Tensor(self.data + other.data, (self, other), '+')
        def _backward():
            self.gradient = 1 * output.gradient
            other.gradient = 1 * output.gradient
        output._backward = _backward
        return output

    def __mul__(self, other):  # self * other
        output = Tensor(self.data * other.data, (self, other), '*')
        def _backward():
            self.gradient = other.data * output.gradient
            other.gradient = self.data * output.gradient
        output._backward = _backward
        return output

    def tanh(self):  # tanh(self)
        output = Tensor(math.tanh(self.data), (self,), 'tanh')
        def _backward():
            self.gradient = (1.0 - math.tanh(self.data) ** 2) * output.gradient
        output._backward = _backward
        return output

    def __pow__(self, power):  # self ** power
        assert isinstance(power, (int, float)), "Power must be an int or a float"
        output = Tensor(self.data ** power, (self,), f'**{power}')
        def _backward():
            self.gradient = power * (self.data ** (power - 1)) * output.gradient
        output._backward = _backward
        return output

    def backward(self):
        topo = []
        visited = set()
        def build_topo(v):
            if v not in visited:
                visited.add(v)
                for child in v._prev:
                    build_topo(child)
                topo.append(v)
        build_topo(self)
        self.gradient = 1
        for node in reversed(topo):
            node._backward()

    def __neg__(self): # -self
        return self * Tensor(-1.0)

    def __sub__(self, other): # self - other
        return self + (-other)

In [39]:
################################################################################
# TODO:                                                                        #
################################################################################
# *****START OF YOUR CODE (DO NOT DELETE/MODIFY THIS LINE)*****
import random
import math

# 데이터셋: 문자 사전 생성 (a-z + ".")
chars = ['.'] + [chr(i) for i in range(97, 123)]
vocab_size = len(chars)

# 문자 → 인덱스 변환
str2idx = {char: idx for idx, char in enumerate(chars)}
idx2str = {idx: char for char, idx in str2idx.items()}

# 훈련 데이터 준비
names = ["hello", "world", "chat", "bot"]
names = ["." + name + "." for name in names]  # 시작과 끝을 '.'으로 감싸기

# 빅램 (Bigram) 데이터셋 생성
inputs, targets = [], []
for name in names:
    for ch1, ch2 in zip(name, name[1:]):
        inputs.append(str2idx[ch1])
        targets.append(str2idx[ch2])

# 커스텀 Tensor 클래스를 사용한 가중치 초기화
W = [[Tensor(random.uniform(-1, 1)) for _ in range(vocab_size)] for _ in range(vocab_size)]
b = [Tensor(random.uniform(-1, 1)) for _ in range(vocab_size)]

# 확률 계산을 위한 소프트맥스 함수
def softmax(logits):
    exps = [math.exp(logit.data) for logit in logits]  # e^x
    total = sum(exps)  # Tensor 덧셈 사용
    return [Tensor(exp / total) for exp in exps]

# 하이퍼파라미터 설정
epochs = 500
lr = 0.1

# 훈련 루프
for epoch in range(epochs):
    # 순전파 (Forward)
    loss = Tensor(0.0)
    for i in range(len(inputs)):
        x = inputs[i]
        y = targets[i]

        # 선형 변환: logits = W[x] + b
        logits = [W[x][j] + b[j] for j in range(vocab_size)]

        probs = softmax(logits)

        # Cross-Entropy Loss 대체 (로그 없이)
        loss = loss + -(probs[y].tanh())  # 확률 값 자체를 사용하여 손실 계산

    # 🎯 역전파 (Backward)
    loss.backward()

    # 🔄 SGD 업데이트
    for i in range(vocab_size):
        for j in range(vocab_size):
            W[i][j] = W[i][j] - Tensor(lr) * Tensor(W[i][j].gradient)
        b[i] = b[i] - Tensor(lr) * Tensor(b[i].gradient)

    # 🖥️ 학습 진행 출력
    if epoch % 50 == 0:
        print(f"Epoch {epoch}, Loss: {loss.data:.4f}")

# *****END OF YOUR CODE (DO NOT DELETE/MODIFY THIS LINE)*****

Epoch 0, Loss: -0.7923
Epoch 50, Loss: -0.7923
Epoch 100, Loss: -0.7923
Epoch 150, Loss: -0.7923
Epoch 200, Loss: -0.7923
Epoch 250, Loss: -0.7923
Epoch 300, Loss: -0.7923
Epoch 350, Loss: -0.7923
Epoch 400, Loss: -0.7923
Epoch 450, Loss: -0.7923
