In [21]:
import os
import re
from collections import Counter

import numpy as np
import pandas as pd

# Location of the cleaned training CSV. The default is the path this notebook
# was originally written against; set the CSC311_TRAIN_CSV environment variable
# to run it from a different checkout or machine.
TRAIN_CSV_PATH = os.environ.get(
    "CSC311_TRAIN_CSV",
    "/Users/seanwoo/CSC311_final_project/cleaned_data/train_clean.csv",
)
df = pd.read_csv(TRAIN_CSV_PATH)

# Free-text answer columns: missing values become empty strings, everything is
# coerced to str, and surrounding whitespace is trimmed.
for col in ["tasks_use_model", "suboptimal_example"]:
    df[col] = df[col].fillna("").astype(str).str.strip()

def tokenize(s: str):
    """Split free text into lower-case alphanumeric tokens.

    The text is lower-cased, every character that is not ``a-z``, ``0-9`` or
    whitespace is replaced by a space, and the result is split on whitespace.

    Returns a list of token strings (empty for empty / punctuation-only input).
    """
    cleaned = re.sub(r"[^a-z0-9\s]", " ", s.lower())
    return cleaned.split()

# Tokenise each free-text column independently (one token list per row).
tokens_tasks, tokens_sub = (
    df[text_col].apply(tokenize)
    for text_col in ("tasks_use_model", "suboptimal_example")
)

# ---- build separate vocabularies ----
def build_vocab(token_series, max_vocab=10000):
    """Build a word -> integer-id mapping from an iterable of token lists.

    Ids 0 and 1 are reserved for ``"<PAD>"`` and ``"<UNK>"``. The remaining
    ``max_vocab - 2`` slots go to the most frequent tokens, most frequent
    first, numbered contiguously from 2.

    Parameters
    ----------
    token_series : iterable of iterables of str
        One token sequence per document.
    max_vocab : int, default 10000
        Upper bound on the vocabulary size, reserved tokens included. Values
        below 2 yield a vocabulary holding only the reserved tokens.

    Returns
    -------
    dict[str, int]
        Mapping from token to id.
    """
    reserved = ("<PAD>", "<UNK>")

    counter = Counter()
    for toks in token_series:
        counter.update(toks)

    # A corpus token that happens to equal a reserved token must not steal its
    # id (that would leave id 0 or 1 unmapped), so drop those from the counts.
    for tok in reserved:
        counter.pop(tok, None)

    word2id = {tok: idx for idx, tok in enumerate(reserved)}
    n_words = max(max_vocab - len(reserved), 0)
    for w, _ in counter.most_common(n_words):
        word2id[w] = len(word2id)
    return word2id

# One vocabulary per text field, both capped at 8000 entries.
word2id_tasks, word2id_sub = (
    build_vocab(field_tokens, max_vocab=8000)
    for field_tokens in (tokens_tasks, tokens_sub)
)

# Vocabulary sizes (these become the row counts of the embedding tables).
V_tasks, V_sub = len(word2id_tasks), len(word2id_sub)

def encode(tokens, word2id):
    """Translate a token sequence into a list of integer ids.

    Tokens missing from ``word2id`` are mapped to 1, the id reserved for
    ``<UNK>``.
    """
    unk_id = 1  # 1 = <UNK>
    ids = []
    for tok in tokens:
        ids.append(word2id.get(tok, unk_id))
    return ids

# Convert every token list to ids using the vocabulary of its own field.
encoded_tasks = tokens_tasks.apply(encode, args=(word2id_tasks,))
encoded_sub = tokens_sub.apply(encode, args=(word2id_sub,))

# ---- fixed sequence length per field (the two may differ) ----
max_len_tasks, max_len_sub = 40, 80

def pad(seq, max_len):
    """Return ``seq`` cut to at most ``max_len`` ids and right-filled with 0.

    0 is the ``<PAD>`` id. A new list is returned; ``seq`` is not modified.
    """
    clipped = seq[:max_len]
    n_missing = max_len - len(clipped)
    return clipped + [0] * n_missing   # 0 = <PAD>

# Truncate / right-pad every encoded row, then stack into int64 id matrices.
X_tasks_ids = np.array(
    list(map(lambda ids: pad(ids, max_len_tasks), encoded_tasks)),
    dtype=np.int64,
)
X_sub_ids = np.array(
    list(map(lambda ids: pad(ids, max_len_sub), encoded_sub)),
    dtype=np.int64,
)

N = df.shape[0]
print(X_tasks_ids.shape, X_sub_ids.shape)  # (N, T1), (N, T2)

# Integer-encode the targets: `labels` holds the sorted unique label values and
# `y` holds, for each row, the index of its label within `labels`.
labels, y = np.unique(df["label"], return_inverse=True)
C = labels.size


(576, 40) (576, 80)


In [22]:
rng = np.random.default_rng(0)

D = 50   # embedding width, shared by both text fields
H = 64   # number of hidden units


def _small_normal(*shape):
    """Draw an array of the given shape from a normal(mean 0, std 0.1) using `rng`."""
    return rng.normal(0, 0.1, size=shape)


# One embedding table per field. The draw order below is kept fixed so that the
# seeded generator produces the same initial values.
E_tasks = _small_normal(V_tasks, D)
E_sub = _small_normal(V_sub, D)

# pooled_tasks (D) + pooled_sub (D) -> concat (2D) -> hidden (H) -> classes (C)
W1 = _small_normal(2 * D, H)
b1 = np.zeros(H)
W2 = _small_normal(H, C)
b2 = np.zeros(C)

In [23]:
def _masked_mean_pool(table, ids):
    """Average the embeddings of the non-PAD ids in each row of ``ids``.

    Positions whose id is 0 (PAD) are excluded from both the sum and the count;
    the count is floored at 1 so an all-PAD row pools to zeros.
    """
    vectors = table[ids]                         # (N, T, D)
    keep = (ids != 0)[..., None]                 # (N, T, 1)
    total = (vectors * keep).sum(axis=1)         # (N, D)
    count = np.clip(keep.sum(axis=1), 1, None)   # (N, 1)
    return total / count                         # (N, D)


def forward(X_tasks_ids, X_sub_ids):
    """Run the two-stream bag-of-embeddings classifier.

    Each id matrix is mean-pooled over its non-PAD positions using its own
    embedding table, the two pooled vectors are concatenated, and the result
    goes through one ReLU hidden layer and a linear output layer. Parameters
    are read from the module-level ``E_tasks``, ``E_sub``, ``W1``, ``b1``,
    ``W2`` and ``b2``.

    Returns
    -------
    tuple
        ``(pooled_t, pooled_s, pooled, h, logits)``.
    """
    # one pooled vector per stream
    pooled_t = _masked_mean_pool(E_tasks, X_tasks_ids)      # (N, D)
    pooled_s = _masked_mean_pool(E_sub, X_sub_ids)          # (N, D)

    # concat pooled representations
    pooled = np.concatenate([pooled_t, pooled_s], axis=1)   # (N, 2D)

    # hidden -> logits
    h = np.maximum(pooled @ W1 + b1, 0.0)                   # (N, H), ReLU
    logits = h @ W2 + b2                                    # (N, C)

    return pooled_t, pooled_s, pooled, h, logits

def softmax(logits):
    """Row-wise softmax of a 2-D array of logits.

    The per-row maximum is subtracted before exponentiating so that large
    logits do not overflow.
    """
    row_max = logits.max(axis=1, keepdims=True)
    exps = np.exp(logits - row_max)
    exps /= exps.sum(axis=1, keepdims=True)
    return exps

def cross_entropy(probs, y):
    """Mean negative log-probability assigned to the true classes.

    ``probs`` holds one row of class probabilities per example and ``y`` the
    integer class index of each example. 1e-12 is added inside the log so a
    zero probability gives a large finite loss instead of ``inf``.
    """
    rows = np.arange(probs.shape[0])
    picked = probs[rows, y]
    return -np.mean(np.log(picked + 1e-12))


In [24]:
def _scatter_pool_grad(dpooled, ids, table):
    """Back-propagate through a masked mean-pool into its embedding table.

    Parameters
    ----------
    dpooled : ndarray, (N, D)
        Gradient of the loss w.r.t. the pooled vector of each example.
    ids : ndarray of int, (N, T)
        Token ids that were pooled; 0 marks PAD positions.
    table : ndarray, (V, D)
        Embedding table (used only for the shape / dtype of the result).

    Returns
    -------
    ndarray, (V, D)
        Gradient w.r.t. ``table``; row 0 (PAD) is always zero.
    """
    is_token = ids != 0                                                  # (N, T)
    n_tokens = np.clip(is_token.sum(axis=1, keepdims=True), 1, None)     # (N, 1)

    # In the mean, every real token contributed with weight 1 / n_tokens;
    # PAD positions contributed nothing.
    demb = dpooled[:, None, :] * is_token[..., None] / n_tokens[..., None]  # (N, T, D)

    dtable = np.zeros_like(table)
    # np.add.at accumulates correctly when the same id occurs more than once.
    np.add.at(dtable, ids.ravel(), demb.reshape(-1, demb.shape[-1]))
    dtable[0] = 0.0   # don't update PAD
    return dtable


def loss_and_grads(X_tasks_ids, X_sub_ids, y):
    """Compute the mean cross-entropy loss and its gradients for one batch.

    Runs ``forward`` / ``softmax`` / ``cross_entropy`` and back-propagates by
    hand through the output layer, the ReLU hidden layer, the concatenation
    and both mean-pooled embedding streams. The module-level parameters
    ``E_tasks``, ``E_sub``, ``W1`` and ``W2`` are read but not modified.

    Parameters
    ----------
    X_tasks_ids, X_sub_ids : ndarray of int, (N, T1) / (N, T2)
        Padded token ids for the two text fields (0 = PAD).
    y : ndarray of int, (N,)
        Class index of each example.

    Returns
    -------
    loss : float
    grads : tuple
        ``(dE_tasks, dE_sub, dW1, db1, dW2, db2)``, each shaped like the
        parameter it belongs to.
    """
    N = X_tasks_ids.shape[0]

    pooled_t, pooled_s, pooled, h, logits = forward(X_tasks_ids, X_sub_ids)
    probs = softmax(logits)
    loss = cross_entropy(probs, y)

    # dL/dlogits for softmax + mean cross-entropy: (probs - onehot(y)) / N
    dlogits = probs.copy()                    # (N, C)
    dlogits[np.arange(N), y] -= 1.0
    dlogits /= N

    # output layer
    dW2 = h.T @ dlogits                       # (H, C)
    db2 = dlogits.sum(axis=0)                 # (C,)

    # through the ReLU: gradient flows only where the unit was active
    dh_preact = (dlogits @ W2.T) * (h > 0)    # (N, H)

    # hidden layer
    dW1 = pooled.T @ dh_preact                # (2D, H)
    db1 = dh_preact.sum(axis=0)               # (H,)

    # Undo the concatenation. The split point is taken from the forward pass
    # rather than from a module-level constant so it always matches `pooled`.
    dpooled = dh_preact @ W1.T                # (N, 2D)
    split = pooled_t.shape[1]

    dE_tasks = _scatter_pool_grad(dpooled[:, :split], X_tasks_ids, E_tasks)
    dE_sub = _scatter_pool_grad(dpooled[:, split:], X_sub_ids, E_sub)

    return loss, (dE_tasks, dE_sub, dW1, db1, dW2, db2)


In [25]:
def accuracy(X_tasks, X_sub, y_true):
    """Fraction of examples whose most probable class equals ``y_true``.

    Predictions come from ``forward`` followed by ``softmax`` and a row-wise
    argmax.
    """
    logits = forward(X_tasks, X_sub)[-1]   # last element of the tuple is the logits
    y_pred = np.argmax(softmax(logits), axis=1)
    return np.mean(y_pred == y_true)

In [26]:
lr = 0.01         # step size for plain gradient descent
num_epochs = 50   # number of full passes over the training arrays

# NOTE(review): in the run recorded below this cell the loss barely moves
# (1.0998 -> 1.0995) and accuracy stays near 1/3, so these two settings
# probably need tuning.
for epoch in range(num_epochs):
    # full-batch pass: loss plus a gradient for every parameter
    loss_train, (dE_tasks, dE_sub, dW1, db1, dW2, db2) = loss_and_grads(
        X_tasks_ids, X_sub_ids, y
    )

    # in-place gradient-descent step on each parameter array
    for param, grad in (
        (E_tasks, dE_tasks),
        (E_sub, dE_sub),
        (W1, dW1),
        (b1, db1),
        (W2, dW2),
        (b2, db2),
    ):
        param -= lr * grad

    # only report every fifth epoch
    if epoch % 5 != 0:
        continue

    # accuracy is measured after the update; loss_train is from before it
    train_acc = accuracy(X_tasks_ids, X_sub_ids, y)
    print(f"epoch {epoch:02d} | train loss {loss_train:.4f}, train acc {train_acc:.3f} | ")


epoch 00 | train loss 1.0998, train acc 0.333 | 
epoch 05 | train loss 1.0998, train acc 0.333 | 
epoch 10 | train loss 1.0997, train acc 0.337 | 
epoch 15 | train loss 1.0997, train acc 0.337 | 
epoch 20 | train loss 1.0997, train acc 0.339 | 
epoch 25 | train loss 1.0996, train acc 0.344 | 
epoch 30 | train loss 1.0996, train acc 0.344 | 
epoch 35 | train loss 1.0995, train acc 0.344 | 
epoch 40 | train loss 1.0995, train acc 0.345 | 
epoch 45 | train loss 1.0995, train acc 0.340 | 


In [27]:
# NOTE: despite the `_test` suffix, these are predictions on the same
# X_tasks_ids / X_sub_ids arrays that the training loop used.
logits_test = forward(X_tasks_ids, X_sub_ids)[-1]
probs_test = softmax(logits_test)
y_pred = np.argmax(probs_test, axis=1)

# translate class indices back into the original label values
predicted_labels = labels[y_pred]
print(predicted_labels)

[1 2 0 1 0 1 1 2 2 2 1 1 1 1 0 1 1 1 1 2 0 1 0 0 1 0 1 0 2 2 1 1 0 0 2 2 2
 0 1 1 2 0 1 1 1 1 1 1 1 0 1 1 0 0 0 1 1 1 1 1 1 1 0 1 0 1 1 0 0 0 0 2 1 0
 0 0 0 0 1 0 2 2 1 2 2 1 2 1 0 0 0 0 2 1 1 0 1 1 2 2 1 0 0 1 0 1 1 0 1 0 2
 1 0 1 0 0 1 0 1 2 0 1 0 0 1 0 1 1 1 0 1 1 1 0 1 0 1 0 1 1 0 1 0 1 1 0 1 0
 0 0 0 0 1 0 0 0 1 1 1 0 0 1 0 1 1 2 1 1 1 0 0 0 1 0 0 1 1 0 2 0 1 1 2 0 2
 0 1 0 2 1 1 0 1 1 1 1 0 2 0 0 1 1 1 1 1 2 1 1 2 0 1 1 0 2 0 0 0 1 1 0 1 0
 0 2 0 1 0 1 1 1 0 1 1 0 0 0 1 1 2 1 1 2 0 0 0 0 0 0 1 2 0 0 0 2 2 1 0 0 0
 1 0 1 2 2 0 2 0 1 1 0 1 2 2 1 1 1 1 2 1 1 0 1 0 1 1 0 0 1 1 1 0 1 0 0 1 1
 1 0 0 1 0 1 0 1 2 2 0 1 1 1 0 0 0 1 2 0 0 1 1 0 0 0 0 1 0 1 1 0 1 1 0 0 0
 0 1 1 0 1 1 1 0 0 1 1 2 0 1 0 2 0 0 0 1 2 1 0 0 0 1 0 0 1 2 0 1 1 1 1 1 0
 0 0 1 0 1 1 0 0 1 0 1 1 1 2 0 1 1 1 2 0 0 0 1 0 0 1 0 2 1 0 1 1 0 0 1 1 2
 2 0 0 1 0 0 0 1 0 2 0 1 0 0 0 0 1 1 1 2 0 2 0 1 0 1 0 1 2 2 1 0 0 1 1 1 2
 1 0 1 1 1 0 1 0 1 1 2 1 1 1 0 1 1 0 0 1 0 1 0 1 1 2 1 0 1 0 0 1 0 0 1 1 2
 1 0 1 1 0 0 0 1 1 1 1 1 