In [1]:
import pandas as pd
import re
import numpy as np

df = pd.read_csv("/Users/seanwoo/CSC311_final_project/cleaned_data/train_clean.csv")

In [2]:
for col in ["tasks_use_model", "suboptimal_example"]:
    df[col] = df[col].fillna("").astype(str).str.strip()

df["text"] = df["tasks_use_model"] + " [SEP] " + df["suboptimal_example"]

In [3]:
def tokenize(text):
    text = text.lower()
    text = re.sub(r"[^a-z0-9\s]", " ", text)
    tokens = text.split()
    return tokens

tokenized = df["text"].apply(tokenize)

In [4]:
from collections import Counter

counter = Counter()
for toks in tokenized:
    counter.update(toks)

max_vocab = 10000  # or whatever
most_common = counter.most_common(max_vocab - 2)

word2id = {"<PAD>": 0, "<UNK>": 1}
for i, (w, c) in enumerate(most_common, start=2):
    word2id[w] = i

V = len(word2id)

In [5]:
def encode(tokens):
    return [word2id.get(t, 1) for t in tokens]  # 1 = <UNK>

encoded = tokenized.apply(encode)

In [6]:
max_len = 80  # pick based on length distribution

def pad(seq, max_len=max_len):
    seq = seq[:max_len]
    return seq + [0] * (max_len - len(seq))  # 0 = <PAD>

X_ids = np.array([pad(seq) for seq in encoded], dtype=np.int64)  # shape (N, T)

In [7]:
labels, y = np.unique(df["label"], return_inverse=True)
C = len(labels)      # number of classes
N, T = X_ids.shape   # N samples, T tokens per sample

In [8]:
rng = np.random.default_rng(0)

D = 50   # embedding dimension
H = 64   # hidden units

E  = rng.normal(0, 0.1, size=(V, D))     # embeddings
W1 = rng.normal(0, 0.1, size=(D, H))
b1 = np.zeros(H)
W2 = rng.normal(0, 0.1, size=(H, C))
b2 = np.zeros(C)

In [9]:
def forward(X_ids):
    # X_ids: (N, T)
    emb = E[X_ids]                    # (N, T, D)
    mask = (X_ids != 0)[..., None]    # (N, T, 1), 0 for PAD

    # sum embeddings of non-pad tokens
    sum_emb = (emb * mask).sum(axis=1)            # (N, D)
    lengths = np.clip(mask.sum(axis=1), 1, None)  # (N, 1) avoid /0
    pooled = sum_emb / lengths                    # (N, D)

    # hidden layer
    h_preact = pooled @ W1 + b1       # (N, H)
    h = np.maximum(h_preact, 0.0)     # ReLU

    # output logits
    logits = h @ W2 + b2              # (N, C)
    return pooled, h, logits

In [10]:
def softmax(logits):
    logits = logits - logits.max(axis=1, keepdims=True)
    exps = np.exp(logits)
    return exps / exps.sum(axis=1, keepdims=True)

def cross_entropy(probs, y):
    # y: (N,) integer class labels
    N = probs.shape[0]
    return -np.log(probs[np.arange(N), y] + 1e-12).mean()


In [11]:
def loss_and_grads(X_ids, y):
    global E, W1, b1, W2, b2

    N, T = X_ids.shape
    pooled, h, logits = forward(X_ids)
    probs = softmax(logits)

    # loss
    loss = cross_entropy(probs, y)

    # one-hot labels
    y_onehot = np.zeros_like(probs)
    y_onehot[np.arange(N), y] = 1

    # dL/dlogits
    dlogits = (probs - y_onehot) / N    # (N, C)

    # grads for W2, b2
    dW2 = h.T @ dlogits                 # (H, C)
    db2 = dlogits.sum(axis=0)           # (C,)

    # back to hidden
    dh = dlogits @ W2.T                 # (N, H)
    dh_preact = dh * (h > 0)            # ReLU derivative

    # grads W1, b1
    dW1 = pooled.T @ dh_preact          # (D, H)
    db1 = dh_preact.sum(axis=0)         # (H,)

    # grad wrt pooled embeddings
    dpooled = dh_preact @ W1.T          # (N, D)

    # back through mean pooling to token embeddings
    emb = E[X_ids]                      # (N, T, D)
    mask = (X_ids != 0)[..., None]                                  # (N, T, 1)
    lengths = np.clip((X_ids != 0).sum(axis=1, keepdims=True), 1, None)  # (N, 1)
    lengths = lengths[..., None]                                    # (N, 1, 1)
    demb = (dpooled[:, None, :] * mask) / lengths                   # (N, T, D)


    # accumulate into dE (embedding matrix)
    dE = np.zeros_like(E)
    # vectorized "scatter add"
    np.add.at(dE, X_ids.ravel(), demb.reshape(-1, D))
    dE[0] = 0.0  # ignore PAD row

    return loss, (dE, dW1, db1, dW2, db2)


In [12]:
lr = 0.01
num_epochs = 50

for epoch in range(num_epochs):
    loss, (dE, dW1, db1, dW2, db2) = loss_and_grads(X_ids, y)

    E  -= lr * dE
    W1 -= lr * dW1
    b1 -= lr * db1
    W2 -= lr * dW2
    b2 -= lr * db2

    if epoch % 5 == 0:
        print(f"epoch {epoch:02d}, loss {loss:.4f}")


epoch 00, loss 1.0986
epoch 05, loss 1.0986
epoch 10, loss 1.0986
epoch 15, loss 1.0986
epoch 20, loss 1.0985
epoch 25, loss 1.0985
epoch 30, loss 1.0985
epoch 35, loss 1.0985
epoch 40, loss 1.0985
epoch 45, loss 1.0985
