# Clean Dataset and Feature Engineering

## Import 

In [1]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

import torch
from torch import nn
from torch.utils.data import TensorDataset, DataLoader

import os
import re
from collections import Counter

In [2]:
curr_dir = os.getcwd()
data_dir = 'cleaned_data'
train_file = 'train_clean.csv'
valid_file = 'validation_clean.csv'
path_to_train = os.path.join(curr_dir, data_dir, train_file)
path_to_valid = os.path.join(curr_dir, data_dir, valid_file)
train_df = pd.read_csv(path_to_train)
valid_df = pd.read_csv(path_to_valid)
train_df = pd.DataFrame(train_df)
valid_df = pd.DataFrame(valid_df)

In [3]:
train_df.head()
train_df.dtypes

student_id                                                            int64
tasks_use_model                                                      object
academic_use_likelihood                                               int64
suboptimal_frequency                                                float64
suboptimal_example                                                   object
reference_expectation                                               float64
verify_frequency                                                    float64
verify_method                                                        object
label                                                                 int64
best_task_types_brainstorming_or_generating_creative_ideas            int64
best_task_types_converting_content_between_formats                    int64
best_task_types_data_processing_or_analysis                           int64
best_task_types_drafting_professional_text                            int64
best_task_ty

In [34]:
for col in ["tasks_use_model", "suboptimal_example"]:
    train_df[col] = train_df[col].fillna("").astype(str).str.strip()
    valid_df[col] = valid_df[col].fillna("").astype(str).str.strip()

In [35]:
def tokenize(s):
    s = s.lower()
    s = re.sub(r"[^a-z0-9\s]", " ", s)
    return s.split()

In [36]:
tokens_tasks_train = train_df["tasks_use_model"].apply(tokenize)
tokens_sub_train   = train_df["suboptimal_example"].apply(tokenize)
tokens_tasks_val = valid_df["tasks_use_model"].apply(tokenize)
tokens_sub_val   = valid_df["suboptimal_example"].apply(tokenize)


In [37]:
def build_vocab(token_series, max_vocab=10000):
    counter = Counter()
    for toks in token_series:
        counter.update(toks)
    most_common = counter.most_common(max_vocab - 2)
    word2id = {"<PAD>": 0, "<UNK>": 1}
    for i, (w, _) in enumerate(most_common, start=2):
        word2id[w] = i
    return word2id

word2id_tasks = build_vocab(tokens_tasks_train, max_vocab=8000)
word2id_sub   = build_vocab(tokens_sub_train,   max_vocab=8000)

In [38]:
V_tasks = len(word2id_tasks)
V_sub   = len(word2id_sub)

def encode(tokens, word2id):
    return [word2id.get(t, 1) for t in tokens]  # 1 = <UNK>

encoded_tasks_train = tokens_tasks_train.apply(lambda ts: encode(ts, word2id_tasks))
encoded_sub_train   = tokens_sub_train.apply(lambda ts: encode(ts, word2id_sub))
encoded_tasks_val = tokens_tasks_val.apply(lambda ts: [word2id_tasks.get(t, 1) for t in ts])
encoded_sub_val   = tokens_sub_val.apply(lambda ts: [word2id_sub.get(t, 1) for t in ts])

In [39]:
# ---- pad to fixed lengths (can be different) ----
max_len_tasks = 40
max_len_sub   = 80

def pad(seq, max_len):
    seq = seq[:max_len]
    return seq + [0] * (max_len - len(seq))   # 0 = <PAD>

X_tasks_ids_train = np.array([pad(s, max_len_tasks) for s in encoded_tasks_train], dtype=np.int64)
X_sub_ids_train  = np.array([pad(s, max_len_sub)   for s in encoded_sub_train],   dtype=np.int64)
X_tasks_ids_val = np.array([pad(s, max_len_tasks) for s in encoded_tasks_val], dtype=np.int64)
X_sub_ids_val   = np.array([pad(s, max_len_sub)   for s in encoded_sub_val],   dtype=np.int64)

N = len(train_df)
print(X_tasks_ids_train.shape, X_sub_ids_train.shape)  # (N, T1), (N, T2)

# labels, y = np.unique(train_df["label"], return_inverse=True)
# C = len(labels)

(576, 40) (576, 80)


In [40]:
target_col = "label"

text_cols = ["tasks_use_model", "suboptimal_example"]
cat_cols = ["verify_method"]        # simple categorical weâ€™ll encode
drop_cols = text_cols + [target_col]

num_cols = [c for c in train_df.columns 
            if c not in drop_cols + cat_cols]

In [41]:
df = train_df.copy()

for c in cat_cols:
    df[c] = df[c].astype("category")
    
# save mappings for later use in final numpy-only script
cat_mapping = {
    c: dict(enumerate(df[c].cat.categories))  # index -> category
    for c in cat_cols
}
# and reverse mapping (category string -> code)
cat_inverse_mapping = {
    c: {v: k for k, v in mapping.items()}
    for c, mapping in cat_mapping.items()
}

df[cat_cols] = df[cat_cols].apply(lambda col: col.cat.codes)

In [42]:
# Split into features and target
X_train = df[num_cols + cat_cols].copy()
y_train = df[target_col].values

In [43]:
scaler = StandardScaler()
X_train_num = scaler.fit_transform(X_train[num_cols])

# replace numeric columns with scaled values
X_train[num_cols] = X_train_num

# final numpy arrays
X_train = X_train.to_numpy().astype(np.float32)
y_train = y_train.astype(np.int64)

input_dim = X_train.shape[1]
num_classes = len(np.unique(y_train))
input_dim, num_classes

(22, 3)

In [44]:
class CombinedDataset(torch.utils.data.Dataset):
    def __init__(self, X_num, tasks_ids, sub_ids, y):
        self.X_num = torch.from_numpy(X_num).float()
        self.tasks = torch.from_numpy(tasks_ids).long()
        self.sub = torch.from_numpy(sub_ids).long()
        self.y = torch.from_numpy(y).long()
    def __len__(self):
        return len(self.y)
    def __getitem__(self, idx):
        return self.X_num[idx], self.tasks[idx], self.sub[idx], self.y[idx]

In [45]:
class MLP(nn.Module):
    def __init__(self, num_input_dim, V_tasks, V_sub, emb_dim=32, hidden_dim=64, num_classes=3):
        super().__init__()
        self.emb_tasks = nn.Embedding(V_tasks, emb_dim, padding_idx=0)
        self.emb_sub   = nn.Embedding(V_sub,   emb_dim, padding_idx=0)
        self.num_proj  = nn.Linear(num_input_dim, hidden_dim)
        # after concatenation: hidden_dim + emb_dim*2
        self.fc1 = nn.Linear(hidden_dim + emb_dim*2, hidden_dim)
        self.fc2 = nn.Linear(hidden_dim, hidden_dim)
        self.out = nn.Linear(hidden_dim, num_classes)
        self.relu = nn.ReLU()

    def forward(self, x_num, tasks_ids, sub_ids):
        # x_num: (B, num_input_dim)
        # tasks_ids: (B, T1), sub_ids: (B, T2)
        emb_t = self.emb_tasks(tasks_ids)  # (B, T1, d)
        emb_s = self.emb_sub(sub_ids)      # (B, T2, d)

        # mask PAD (id==0) when averaging
        mask_t = (tasks_ids != 0).unsqueeze(-1).float()
        mask_s = (sub_ids != 0).unsqueeze(-1).float()

        # safe mean: sum / count (clamp to avoid div0)
        sum_t = (emb_t * mask_t).sum(dim=1)
        denom_t = mask_t.sum(dim=1).clamp(min=1.0)
        mean_t = sum_t / denom_t

        sum_s = (emb_s * mask_s).sum(dim=1)
        denom_s = mask_s.sum(dim=1).clamp(min=1.0)
        mean_s = sum_s / denom_s

        x_num_proj = self.relu(self.num_proj(x_num))
        x = torch.cat([x_num_proj, mean_t, mean_s], dim=1)
        x = self.relu(self.fc1(x))
        x = self.relu(self.fc2(x))
        return self.out(x)


In [46]:
batch_size = 32

train_ds = TensorDataset(
    torch.from_numpy(X_train), torch.from_numpy(y_train)
)

train_loader = DataLoader(train_ds, batch_size=batch_size, shuffle=True)

In [47]:
emb_dim = 32
hidden_dim = 64

# ensure X_train, X_tasks_ids, X_sub_ids, y_train already exist as numpy arrays
ds = CombinedDataset(X_train, X_tasks_ids_train, X_sub_ids_train, y_train)
train_loader = DataLoader(ds, batch_size=32, shuffle=True)

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = MLP(num_input_dim=input_dim, V_tasks=V_tasks, V_sub=V_sub,
                    emb_dim=emb_dim, hidden_dim=hidden_dim, num_classes=num_classes).to(device)

criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=1e-3)

# prepare validation features / labels (use same preprocessing and mappings as train)
# ensure text cols cleaned / tokenized / encoded / padded already (X_tasks_ids_val, X_sub_ids_val exist)
# map categorical columns using the same mapping (fallback to 0 if unseen)
for c in cat_cols:
    if c in valid_df.columns:
        mapping = cat_inverse_mapping[c]  # category string -> code
        valid_df[c] = valid_df[c].map(lambda v: mapping.get(v, 0)).astype(int)
    else:
        valid_df[c] = 0

# Ensure numeric columns exist in validation
for nc in num_cols:
    if nc not in valid_df.columns:
        valid_df[nc] = 0.0

X_val_df = valid_df[num_cols + cat_cols].copy()
# scale numeric columns using previously fitted scaler
X_val_df[num_cols] = scaler.transform(X_val_df[num_cols])
X_val = X_val_df.to_numpy().astype(np.float32)

y_val = valid_df[target_col].values.astype(np.int64)

val_ds = CombinedDataset(X_val, X_tasks_ids_val, X_sub_ids_val, y_val)
val_loader = DataLoader(val_ds, batch_size=64, shuffle=False)

def evaluate(loader):
    model.eval()
    correct, total = 0, 0
    with torch.no_grad():
        for xb_num, xb_tasks, xb_sub, yb in loader:
            xb_num, xb_tasks, xb_sub, yb = xb_num.to(device), xb_tasks.to(device), xb_sub.to(device), yb.to(device)
            preds = model(xb_num, xb_tasks, xb_sub).argmax(dim=1)
            correct += (preds == yb).sum().item()
            total += yb.size(0)
    return correct / total if total > 0 else 0.0

num_epochs = 50
for epoch in range(1, num_epochs + 1):
    model.train()
    running_loss = 0.0

    for xb_num, xb_tasks, xb_sub, yb in train_loader:
        xb_num, xb_tasks, xb_sub, yb = xb_num.to(device), xb_tasks.to(device), xb_sub.to(device), yb.to(device)

        optimizer.zero_grad()
        logits = model(xb_num, xb_tasks, xb_sub)
        loss = criterion(logits, yb)
        loss.backward()
        optimizer.step()

        running_loss += loss.item() * xb_num.size(0)

    train_loss = running_loss / len(train_loader.dataset)
    # compute train accuracy
    train_acc = evaluate(train_loader)
    # compute validation accuracy
    val_acc = evaluate(val_loader)

    if epoch % 5 == 0 or epoch == 1:
        print(f"Epoch {epoch:3d} | train_loss={train_loss:.4f} | train_acc={train_acc:.4f} | val_acc={val_acc:.4f}")

Epoch   1 | train_loss=1.1626 | train_acc=0.3524 | val_acc=0.3333
Epoch   5 | train_loss=1.0216 | train_acc=0.5208 | val_acc=0.5610
Epoch  10 | train_loss=0.8248 | train_acc=0.5955 | val_acc=0.5610
Epoch  15 | train_loss=0.7273 | train_acc=0.7396 | val_acc=0.6179
Epoch  20 | train_loss=0.6452 | train_acc=0.7083 | val_acc=0.5772
Epoch  25 | train_loss=0.4139 | train_acc=0.8281 | val_acc=0.5772
Epoch  30 | train_loss=0.3053 | train_acc=0.8889 | val_acc=0.5610
Epoch  35 | train_loss=0.2873 | train_acc=0.9045 | val_acc=0.5691
Epoch  40 | train_loss=0.3935 | train_acc=0.8229 | val_acc=0.5772
Epoch  45 | train_loss=0.1016 | train_acc=0.9670 | val_acc=0.5772
Epoch  50 | train_loss=0.0765 | train_acc=0.9844 | val_acc=0.5772
