# Clean Dataset and Feature Engineering

## Import and Settings

In [13]:
import os
import re
import copy
from collections import Counter
import json
import random
import itertools

import numpy as np
import pandas as pd

from sklearn.preprocessing import StandardScaler
from sklearn.feature_extraction.text import TfidfVectorizer

import torch
from torch import nn
import torch.optim as optim
from torch.utils.data import TensorDataset, DataLoader

# Set seeds for reproducibility
torch.manual_seed(42)
np.random.seed(42)

# Neural Network Hyperparameters
BATCH_SIZE = 32
LEARNING_RATE = 0.0005
EPOCHS = 80
VOCAB_SIZE = 500  # Prevent overfitting

In [14]:
curr_dir = os.getcwd()
data_dir = 'cleaned_data'
train_file = 'train_clean.csv'
valid_file = 'validation_clean.csv'
test_file = 'test_clean.csv'
path_to_train = os.path.join(curr_dir, data_dir, train_file)
path_to_valid = os.path.join(curr_dir, data_dir, valid_file)
path_to_test = os.path.join(curr_dir, data_dir, test_file)
train_df = pd.read_csv(path_to_train)
valid_df = pd.read_csv(path_to_valid)
test_df = pd.read_csv(path_to_test)
train_df = pd.DataFrame(train_df)
valid_df = pd.DataFrame(valid_df)
test_df = pd.DataFrame(test_df)

## Data Cleaning

In [15]:
import re

# suboptimal_example is too noisy will thus be dropped
text_cols = ["tasks_use_model", "verify_method"]
numeric_cols = ["academic_use_likelihood", "suboptimal_frequency", 
                "reference_expectation", "verify_frequency"]
binary_cols = [c for c in train_df.columns if "task_types" in c]

best_task_cols = [c for c in train_df.columns if "best_task_types" in c]
suboptimal_task_cols = [c for c in train_df.columns if "suboptimal_task_types" in c]

def add_task_sum(df):
    df['best_task_count'] = df[best_task_cols].sum(axis=1)
    df['suboptimal_task_count'] = df[suboptimal_task_cols].sum(axis=1)
    return df

train_df = add_task_sum(train_df)
valid_df = add_task_sum(valid_df)
test_df = add_task_sum(test_df)

def clean_text(s):
    # Convert input to string, handling NaNs, floats, etc.
    if s is None:
        s = ""
    s = str(s)  
    
    # Your original cleaning logic
    s = s.lower()
    s = re.sub(r"[^a-z0-9\s]", " ", s)
    s = re.sub(r"\s+", " ", s)
    return s.strip()

for col in text_cols:
    # Use .astype(str) on the column before applying for extra safety, 
    # or just use the improved clean_text function.
    train_df[col] = train_df[col].apply(clean_text) 
    valid_df[col] = valid_df[col].apply(clean_text)
    test_df[col] = test_df[col].apply(clean_text)

In [16]:
def combine_text(df):
    df["full_text"] = df[text_cols].fillna("").agg(" ".join, axis=1)
    return df

train_df = combine_text(train_df)
valid_df = combine_text(valid_df)
test_df = combine_text(test_df)

# combined_text = train_df["full_text"] + test_df["full_text"]

In [17]:
# Create Vocabulary (Top 300 words only)
word_counts = Counter(train_df["full_text"].str.cat(sep=" ").split())
vocab_list = sorted([w for w, c in word_counts.most_common(VOCAB_SIZE)])
vocab_map = {w: i for i, w in enumerate(vocab_list)}

In [18]:
def get_features_labels(df, text_series, vocab_map, is_train=True):
    
    # Extract original numeric cols
    orig_nums = df[["academic_use_likelihood", "suboptimal_frequency", 
                    "reference_expectation", "verify_frequency"]].values
    scaled_orig = (orig_nums - 3.0) / 1.2
    
    # Scale the new feature separately
    task_count = df[["best_task_count", "suboptimal_task_count"]].values
    scaled_count = (task_count - 2.0) / 1.5 

    X_num = np.hstack([scaled_orig, scaled_count])
    
    # 2. Binary Features
    X_bin = df[binary_cols].values
    
    # 3. Bag of Words (Log Scaled)
    X_bow = np.zeros((len(df), len(vocab_map)), dtype=np.float32)
    for i, text in enumerate(text_series):
        words = text.split()
        for w in words:
            if w in vocab_map:
                X_bow[i, vocab_map[w]] += 1
    X_bow = np.log1p(X_bow)
    
    # Combine
    X = np.hstack([X_num, X_bin, X_bow]).astype(np.float32)
    
    if is_train or 'label' in df.columns:
        y = df['label'].values.astype(np.int64)
        return X, y
    return X, None

X_train, y_train = get_features_labels(train_df, train_df["full_text"], vocab_map)
X_valid, y_valid = get_features_labels(valid_df, valid_df["full_text"], vocab_map)
X_test, y_test = get_features_labels(test_df, test_df["full_text"], vocab_map)

# Loaders
train_tensor = torch.utils.data.TensorDataset(torch.tensor(X_train), torch.tensor(y_train))
train_loader = torch.utils.data.DataLoader(train_tensor, batch_size=BATCH_SIZE, shuffle=True)
valid_tensor = torch.utils.data.TensorDataset(torch.tensor(X_valid), torch.tensor(y_valid))
valid_loader = torch.utils.data.DataLoader(valid_tensor, batch_size=BATCH_SIZE, shuffle=True)


In [19]:
class MLP(nn.Module):
    def __init__(self, input_dim):
        super().__init__()
        self.fc1 = nn.Linear(input_dim, 128)
        self.fc2 = nn.Linear(128, 64)
        self.fc3 = nn.Linear(64, 3)
        self.relu = nn.ReLU()
        self.dropout = nn.Dropout(0.4) # High dropout for regularization

    def forward(self, x):
        x = self.relu(self.fc1(x))
        x = self.dropout(x)
        x = self.relu(self.fc2(x))
        x = self.dropout(x)
        x = self.fc3(x)
        return x

model = MLP(input_dim=X_train.shape[1])
criterion = nn.CrossEntropyLoss()
optimizer = optim.AdamW(model.parameters(), lr=LEARNING_RATE, weight_decay=1e-3)

## Grid Search for best learning rate

In [20]:
# def train_model_search(lr):
#     # Re-initialize model for each run to start fresh
#     model = MLP(input_dim=X_train.shape[1]) 
#     criterion = nn.CrossEntropyLoss()
    
#     # Using AdamW is generally better for stabilization
#     optimizer = optim.AdamW(model.parameters(), lr=lr, weight_decay=1e-3)
    
#     # Scheduler (optional for search, but good to keep if used in final model)
#     # scheduler = optim.lr_scheduler.StepLR(optimizer, step_size=20, gamma=0.5)

#     best_acc = 0.0
#     best_model_wts = copy.deepcopy(model.state_dict())

#     for epoch in range(EPOCHS):
#         # --- TRAIN PHASE ---
#         model.train()
#         for X_batch, y_batch in train_loader:
#             optimizer.zero_grad()
#             outputs = model(X_batch)
#             loss = criterion(outputs, y_batch)
#             loss.backward()
#             optimizer.step()
        
#         # --- VALIDATION PHASE ---
#         model.eval()
#         val_correct = 0
#         val_total = 0
#         with torch.no_grad():
#             for X_valid, y_valid in valid_loader:
#                 outputs = model(X_valid)
#                 preds = torch.argmax(outputs, dim=1)
#                 val_correct += (preds == y_valid).sum().item()
#                 val_total += y_valid.size(0)
                
#         val_acc = val_correct / val_total

#         # Checkpoint based on Validation Accuracy
#         if val_acc > best_acc:
#             best_acc = val_acc
#             best_model_wts = copy.deepcopy(model.state_dict())

#     # Load best weights to evaluate on Test Set
#     model.load_state_dict(best_model_wts)
#     model.eval()
#     with torch.no_grad():
#         # Ensure X_test is a tensor
#         test_tensor = torch.tensor(X_test) if not isinstance(X_test, torch.Tensor) else X_test
#         logits = model(test_tensor)
#         preds = torch.argmax(logits, dim=1)
#         # Calculate final test accuracy
#         final_acc = (preds == torch.tensor(y_test)).float().mean().item()
    
#     return final_acc, best_model_wts

# # --- GRID SEARCH LOOP ---
# learning_rates = [0.005, 0.002, 0.001, 0.0005, 0.0001]
# best_overall_acc = 0.0
# best_lr = 0.0

# print(f"{'Learning Rate':<15} | {'Test Accuracy':<15}")
# print("-" * 35)

# for lr in learning_rates:
#     acc, wts = train_model_search(lr)
#     print(f"{lr:<15} | {acc:.4f}")
    
#     if acc > best_overall_acc:
#         best_overall_acc = acc
#         best_lr = lr
#         best_wts = wts

# print("-" * 35)
# print(f"Best Accuracy: {best_overall_acc:.4f} with Learning Rate: {best_lr}")

## Find Best Validation Accuracy:

In [None]:
best_acc = 0.0
best_model_wts = copy.deepcopy(model.state_dict())

print(f"{'Epoch':<6} | {'Train Acc':<10} | {'Val Acc':<10}")
print("-" * 30)

for epoch in range(EPOCHS):
    # Train
    model.train()
    correct, total = 0, 0
    for X_b, y_b in train_loader:
        optimizer.zero_grad()
        outputs = model(X_b)
        loss = criterion(outputs, y_b)
        loss.backward()
        optimizer.step()
        
        preds = torch.argmax(outputs, dim=1)
        correct += (preds == y_b).sum().item()
        total += y_b.size(0)
    train_acc = correct / total
    
    # Validate
    model.eval()
    v_correct, v_total = 0, 0
    with torch.no_grad():
        for X_v, y_v in valid_loader:
            outputs = model(X_v)
            preds = torch.argmax(outputs, dim=1)
            v_correct += (preds == y_v).sum().item()
            v_total += y_v.size(0)
    val_acc = v_correct / v_total
    
    # Checkpoint
    if val_acc > best_acc:
        best_acc = val_acc
        best_model_wts = copy.deepcopy(model.state_dict())
        print(f"{epoch+1:<6} | {train_acc:<10.4f} | {val_acc:<10.4f} *")
    elif (epoch+1) % 10 == 0:
        print(f"{epoch+1:<6} | {train_acc:<10.4f} | {val_acc:<10.4f}")

model.load_state_dict(best_model_wts)

Epoch  | Train Acc  | Val Acc   
------------------------------
1      | 0.3420     | 0.3740     *
2      | 0.4792     | 0.5285     *
3      | 0.5312     | 0.5366     *
4      | 0.5764     | 0.6016     *
5      | 0.6510     | 0.6748     *
6      | 0.6858     | 0.7154     *
9      | 0.7847     | 0.7236     *
10     | 0.8090     | 0.6829    
20     | 0.9132     | 0.6992    
30     | 0.9653     | 0.6585    
40     | 0.9809     | 0.6504    
50     | 0.9861     | 0.6341    
60     | 0.9861     | 0.6423    
70     | 0.9948     | 0.6423    
80     | 0.9913     | 0.6423    


In [29]:
model.eval()
with torch.no_grad():
    # Ensure X_test is a tensor
    test_tensor = torch.tensor(X_test) if not isinstance(X_test, torch.Tensor) else X_test
    logits = model(test_tensor)
    preds = torch.argmax(logits, dim=1)
    # Calculate final test accuracy
    final_acc = (preds == torch.tensor(y_test)).float().mean().item()
    print(f"Test Accuracy: {final_acc}")

Test Accuracy: 0.7142857313156128


Load Model

In [26]:
# model = MLP(input_dim=X_train.shape[1])
# model.load_state_dict(best_wts)

# Extract weights (Transposed for x @ W + b shape)
# We convert numpy arrays to lists so they can be serialized to JSON
artifacts = {
    "W1": model.fc1.weight.detach().numpy().T.tolist(),
    "b1": model.fc1.bias.detach().numpy().tolist(),
    "W2": model.fc2.weight.detach().numpy().T.tolist(),
    "b2": model.fc2.bias.detach().numpy().tolist(),
    "W3": model.fc3.weight.detach().numpy().T.tolist(),
    "b3": model.fc3.bias.detach().numpy().tolist(),
    "vocab_list": vocab_list,
    "binary_cols": binary_cols,
    "best_task_cols": best_task_cols,
    "suboptimal_task_cols": suboptimal_task_cols
}

# Save to JSON file
with open('model_artifacts.json', 'w') as f:
    json.dump(artifacts, f)

print("Successfully saved 'model_artifacts.json'.")

Successfully saved 'model_artifacts.json'.
