# Clean Dataset and Feature Engineering

## Import and Settings

In [1]:
import os
import re
import copy
from collections import Counter
import json
import random
import itertools

import numpy as np
import pandas as pd

from sklearn.preprocessing import StandardScaler
from sklearn.feature_extraction.text import TfidfVectorizer

import torch
from torch import nn
import torch.optim as optim
from torch.utils.data import TensorDataset, DataLoader

# Set seeds for reproducibility
torch.manual_seed(42)
np.random.seed(42)

# Neural Network Hyperparameters
BATCH_SIZE = 32
LEARNING_RATE = 0.0005
EPOCHS = 80
VOCAB_SIZE = 500  # Prevent overfitting

In [None]:
curr_dir = os.getcwd()
data_dir = 'cleaned_data'
train_file = 'train_clean.csv'
valid_file = 'validation_clean.csv'
test_file = 'test_clean.csv'
path_to_train = os.path.join(curr_dir, data_dir, train_file)
path_to_valid = os.path.join(curr_dir, data_dir, valid_file)
path_to_test = os.path.join(curr_dir, data_dir, test_file)
train_df = pd.read_csv(path_to_train)
valid_df = pd.read_csv(path_to_valid)
test_df = pd.read_csv(path_to_test)
train_df = pd.DataFrame(train_df)
valid_df = pd.DataFrame(valid_df)
test_df = pd.DataFrame(test_df)

## Data Cleaning

In [3]:
import re

# suboptimal_example is too noisy will thus be dropped
text_cols = ["tasks_use_model", "verify_method"]
numeric_cols = ["academic_use_likelihood", "suboptimal_frequency", 
                "reference_expectation", "verify_frequency"]
binary_cols = [c for c in train_df.columns if "task_types" in c]

best_task_cols = [c for c in train_df.columns if "best_task_types" in c]
suboptimal_task_cols = [c for c in train_df.columns if "suboptimal_task_types" in c]

def add_task_sum(df):
    df['best_task_count'] = df[best_task_cols].sum(axis=1)
    df['suboptimal_task_count'] = df[suboptimal_task_cols].sum(axis=1)
    return df

train_df = add_task_sum(train_df)
valid_df = add_task_sum(valid_df)
test_df = add_task_sum(test_df)

def clean_text(s):
    # Convert input to string, handling NaNs, floats, etc.
    if s is None:
        s = ""
    s = str(s)  
    
    # Your original cleaning logic
    s = s.lower()
    s = re.sub(r"[^a-z0-9\s]", " ", s)
    s = re.sub(r"\s+", " ", s)
    return s.strip()

for col in text_cols:
    # Use .astype(str) on the column before applying for extra safety, 
    # or just use the improved clean_text function.
    train_df[col] = train_df[col].apply(clean_text) 
    valid_df[col] = valid_df[col].apply(clean_text)
    test_df[col] = test_df[col].apply(clean_text)

In [4]:
def combine_text(df):
    df["full_text"] = df[text_cols].fillna("").agg(" ".join, axis=1)
    return df

train_df = combine_text(train_df)
valid_df = combine_text(valid_df)
test_df = combine_text(test_df)

# combined_text = train_df["full_text"] + test_df["full_text"]

In [5]:
# Create Vocabulary (Top 300 words only)
word_counts = Counter(train_df["full_text"].str.cat(sep=" ").split())
vocab_list = sorted([w for w, c in word_counts.most_common(VOCAB_SIZE)])
vocab_map = {w: i for i, w in enumerate(vocab_list)}

In [6]:
# def get_features_labels(df, text_series, vocab_map, is_train=True):
    
#     # Extract original numeric cols
#     orig_nums = df[["academic_use_likelihood", "suboptimal_frequency", 
#                     "reference_expectation", "verify_frequency"]].values
#     scaled_orig = (orig_nums - 3.0) / 1.2
    
#     # Scale the new feature separately
#     task_count = df[["best_task_count", "suboptimal_task_count"]].values
#     scaled_count = (task_count - 2.0) / 1.5 

#     X_num = np.hstack([scaled_orig, scaled_count])
    
#     # 2. Binary Features
#     X_bin = df[binary_cols].values
    
#     # 3. Bag of Words (Log Scaled)
#     X_bow = np.zeros((len(df), len(vocab_map)), dtype=np.float32)
#     for i, text in enumerate(text_series):
#         words = text.split()
#         for w in words:
#             if w in vocab_map:
#                 X_bow[i, vocab_map[w]] += 1
#     X_bow = np.log1p(X_bow)
    
#     # Combine
#     X = np.hstack([X_num, X_bin, X_bow]).astype(np.float32)
    
#     if is_train or 'label' in df.columns:
#         y = df['label'].values.astype(np.int64)
#         return X, y
#     return X, None

# X_train, y_train = get_features_labels(train_df, train_df["full_text"], vocab_map)
# X_valid, y_valid = get_features_labels(valid_df, valid_df["full_text"], vocab_map)
# X_test, y_test = get_features_labels(test_df, test_df["full_text"], vocab_map)

In [7]:
# Initialize and Fit Scalers on TRAIN data only
scaler_num = StandardScaler()
scaler_count = StandardScaler()

# Extract raw data first to fit scalers
train_orig_nums = train_df[["academic_use_likelihood", "suboptimal_frequency", 
                            "reference_expectation", "verify_frequency"]].values
train_task_count = train_df[["best_task_count", "suboptimal_task_count"]].values

scaler_num.fit(train_orig_nums)
scaler_count.fit(train_task_count)

def get_features_labels_robust(df, text_series, vocab_map, scaler_num, scaler_count):
    # 1. Numeric Features (Use the fitted scalers)
    orig_nums = df[["academic_use_likelihood", "suboptimal_frequency", 
                    "reference_expectation", "verify_frequency"]].values
    # Fill NaNs with 0 or mean before scaling to prevent errors
    orig_nums = np.nan_to_num(orig_nums) 
    scaled_orig = scaler_num.transform(orig_nums)
    
    task_count = df[["best_task_count", "suboptimal_task_count"]].values
    task_count = np.nan_to_num(task_count)
    scaled_count = scaler_count.transform(task_count)

    X_num = np.hstack([scaled_orig, scaled_count])
    
    # 2. Binary Features (Ensure columns exist, fill missing with 0)
    # Uses the global 'binary_cols' list from the training setup
    try:
        X_bin = df[binary_cols].fillna(0).values
    except KeyError:
        # Create missing columns if they don't exist in test
        for c in binary_cols:
            if c not in df.columns:
                df[c] = 0
        X_bin = df[binary_cols].values
    
    # 3. Bag of Words (Same as before)
    X_bow = np.zeros((len(df), len(vocab_map)), dtype=np.float32)
    for i, text in enumerate(text_series):
        words = str(text).split() # Ensure string
        for w in words:
            if w in vocab_map:
                X_bow[i, vocab_map[w]] += 1
    X_bow = np.log1p(X_bow)
    
    X = np.hstack([X_num, X_bin, X_bow]).astype(np.float32)
    
    y = None
    if 'label' in df.columns:
        y = df['label'].values.astype(np.int64)
        
    return X, y

# Re-generate datasets using the robust function
X_train, y_train = get_features_labels_robust(train_df, train_df["full_text"], vocab_map, scaler_num, scaler_count)
X_valid, y_valid = get_features_labels_robust(valid_df, valid_df["full_text"], vocab_map, scaler_num, scaler_count)
X_test, y_test = get_features_labels_robust(test_df, test_df["full_text"], vocab_map, scaler_num, scaler_count)

In [8]:
# Loaders
train_tensor = torch.utils.data.TensorDataset(torch.tensor(X_train), torch.tensor(y_train))
train_loader = torch.utils.data.DataLoader(train_tensor, batch_size=BATCH_SIZE, shuffle=True)
valid_tensor = torch.utils.data.TensorDataset(torch.tensor(X_valid), torch.tensor(y_valid))
valid_loader = torch.utils.data.DataLoader(valid_tensor, batch_size=BATCH_SIZE, shuffle=True)

In [9]:
# class MLP(nn.Module):
#     def __init__(self, input_dim):
#         super().__init__()
#         self.fc1 = nn.Linear(input_dim, 128)
#         self.fc2 = nn.Linear(128, 64)
#         self.fc3 = nn.Linear(64, 3)
#         self.relu = nn.ReLU()
#         self.dropout = nn.Dropout(0.4) # High dropout for regularization

#     def forward(self, x):
#         x = self.relu(self.fc1(x))
#         x = self.dropout(x)
#         x = self.relu(self.fc2(x))
#         x = self.dropout(x)
#         x = self.fc3(x)
#         return x

# model = MLP(input_dim=X_train.shape[1])
# criterion = nn.CrossEntropyLoss()
# optimizer = optim.AdamW(model.parameters(), lr=LEARNING_RATE, weight_decay=1e-3)

## Grid Search for Hyperparameter Tuning

In [10]:
class MLP(nn.Module):
    def __init__(self, input_dim, hidden_layers=[128, 64], dropout_rate=0.4, output_dim=3):
        super().__init__()
        layers = []
        in_dim = input_dim
        
        for h_dim in hidden_layers:
            layers.append(nn.Linear(in_dim, h_dim))
            layers.append(nn.ReLU())
            layers.append(nn.Dropout(dropout_rate))
            in_dim = h_dim
            
        layers.append(nn.Linear(in_dim, output_dim))
        self.net = nn.Sequential(*layers)

    def forward(self, x):
        return self.net(x)

In [11]:
def grid_search(params, train_loader, valid_loader, input_dim, patience=8):
    model = MLP(
        input_dim=input_dim,
        hidden_layers=params['hidden_layers'],
        dropout_rate=params['dropout']
    )
    
    criterion = nn.CrossEntropyLoss()
    optimizer = optim.AdamW(model.parameters(), lr=params['lr'], weight_decay=params['weight_decay'])
    
    best_val_acc = 0.0
    best_model_state = None
    patience_counter = 0
    
    for epoch in range(params['epochs']):
        # Train
        model.train()
        for X_b, y_b in train_loader:
            optimizer.zero_grad()
            outputs = model(X_b)
            loss = criterion(outputs, y_b)
            loss.backward()
            optimizer.step()
        
        # Validate
        model.eval()
        correct, total = 0, 0
        with torch.no_grad():
            for X_v, y_v in valid_loader:
                outputs = model(X_v)
                preds = torch.argmax(outputs, dim=1)
                correct += (preds == y_v).sum().item()
                total += y_v.size(0)
        
        val_acc = correct / total
        
        # Early Stopping Logic
        if val_acc > best_val_acc:
            best_val_acc = val_acc
            best_model_state = copy.deepcopy(model.state_dict())
            patience_counter = 0 
        else:
            patience_counter += 1
            
        if patience_counter >= patience:
            break
            
    return best_val_acc, best_model_state

In [12]:
# Hyperparameter Search Space
param_grid = {
    'lr': [0.001, 0.0005, 0.0001],
    'hidden_layers': [
        [128, 64],       # Original structure
        [256, 128],      # Wider
        [128, 64, 32],   # Deeper
        [64]             # Simpler
    ],
    'dropout': [0.3, 0.5],
    'weight_decay': [1e-3, 1e-4],
    'epochs': [80] # High cap, controlled by early stopping
}

best_overall_acc = 0.0
best_config = {}
best_weights = None
input_dim = X_train.shape[1]

# Generate combinations
keys, values = zip(*param_grid.items())
combinations = [dict(zip(keys, v)) for v in itertools.product(*values)]
print(f"Testing {len(combinations)} configurations...")
print(f"{'LR':<8} | {'Layers':<15} | {'Drop':<5} | {'WD':<8} | {'Val Acc':<8}")
print("-" * 60)

for params in combinations:
    val_acc, weights = grid_search(params, train_loader, valid_loader, input_dim)
    
    layer_str = str(params['hidden_layers'])
    print(f"{params['lr']:<8} | {layer_str:<15} | {params['dropout']:<5} | {params['weight_decay']:<8} | {val_acc:.4f}")
    
    if val_acc > best_overall_acc:
        best_overall_acc = val_acc
        best_config = params
        best_weights = weights

print("-" * 60)
print(f"Best Accuracy: {best_overall_acc:.4f}")
print(f"Best Config: {best_config}")

# Load best weights into a model instance for final testing
model = MLP(input_dim, best_config['hidden_layers'], best_config['dropout'])
model.load_state_dict(best_weights)

Testing 48 configurations...
LR       | Layers          | Drop  | WD       | Val Acc 
------------------------------------------------------------
0.001    | [128, 64]       | 0.3   | 0.001    | 0.7317
0.001    | [128, 64]       | 0.3   | 0.0001   | 0.7317
0.001    | [128, 64]       | 0.5   | 0.001    | 0.7236
0.001    | [128, 64]       | 0.5   | 0.0001   | 0.7236
0.001    | [256, 128]      | 0.3   | 0.001    | 0.7154
0.001    | [256, 128]      | 0.3   | 0.0001   | 0.7317
0.001    | [256, 128]      | 0.5   | 0.001    | 0.7236
0.001    | [256, 128]      | 0.5   | 0.0001   | 0.7317
0.001    | [128, 64, 32]   | 0.3   | 0.001    | 0.7154
0.001    | [128, 64, 32]   | 0.3   | 0.0001   | 0.6992
0.001    | [128, 64, 32]   | 0.5   | 0.001    | 0.7480
0.001    | [128, 64, 32]   | 0.5   | 0.0001   | 0.7317
0.001    | [64]            | 0.3   | 0.001    | 0.7561
0.001    | [64]            | 0.3   | 0.0001   | 0.7561
0.001    | [64]            | 0.5   | 0.001    | 0.7480
0.001    | [64]            |

<All keys matched successfully>

In [13]:
model.eval()
with torch.no_grad():
    # Ensure X_test is a tensor
    test_tensor = torch.tensor(X_test) if not isinstance(X_test, torch.Tensor) else X_test
    logits = model(test_tensor)
    preds = torch.argmax(logits, dim=1)
    # Calculate final test accuracy
    final_acc = (preds == torch.tensor(y_test)).float().mean().item()
    print(f"Test Accuracy: {final_acc}")

Test Accuracy: 0.7063491940498352


## Find Best Validation Accuracy:

In [14]:
# best_acc = 0.0
# best_model_wts = copy.deepcopy(model.state_dict())

# print(f"{'Epoch':<6} | {'Train Acc':<10} | {'Val Acc':<10}")
# print("-" * 30)

# for epoch in range(EPOCHS):
#     # Train
#     model.train()
#     correct, total = 0, 0
#     for X_b, y_b in train_loader:
#         optimizer.zero_grad()
#         outputs = model(X_b)
#         loss = criterion(outputs, y_b)
#         loss.backward()
#         optimizer.step()
        
#         preds = torch.argmax(outputs, dim=1)
#         correct += (preds == y_b).sum().item()
#         total += y_b.size(0)
#     train_acc = correct / total
    
#     # Validate
#     model.eval()
#     v_correct, v_total = 0, 0
#     with torch.no_grad():
#         for X_v, y_v in valid_loader:
#             outputs = model(X_v)
#             preds = torch.argmax(outputs, dim=1)
#             v_correct += (preds == y_v).sum().item()
#             v_total += y_v.size(0)
#     val_acc = v_correct / v_total
    
#     # Checkpoint
#     if val_acc > best_acc:
#         best_acc = val_acc
#         best_model_wts = copy.deepcopy(model.state_dict())
#         print(f"{epoch+1:<6} | {train_acc:<10.4f} | {val_acc:<10.4f} *")
#     elif (epoch+1) % 10 == 0:
#         print(f"{epoch+1:<6} | {train_acc:<10.4f} | {val_acc:<10.4f}")

# model.load_state_dict(best_model_wts)

In [15]:
# model.eval()
# with torch.no_grad():
#     # Ensure X_test is a tensor
#     test_tensor = torch.tensor(X_test) if not isinstance(X_test, torch.Tensor) else X_test
#     logits = model(test_tensor)
#     preds = torch.argmax(logits, dim=1)
#     # Calculate final test accuracy
#     final_acc = (preds == torch.tensor(y_test)).float().mean().item()
#     print(f"Test Accuracy: {final_acc}")

## Export Model

In [17]:
# # model = MLP(input_dim=X_train.shape[1])
# # model.load_state_dict(best_wts)

# # Extract weights (Transposed for x @ W + b shape)
# # We convert numpy arrays to lists so they can be serialized to JSON
# artifacts = {
#     "W1": model.fc1.weight.detach().numpy().T.tolist(),
#     "b1": model.fc1.bias.detach().numpy().tolist(),
#     "W2": model.fc2.weight.detach().numpy().T.tolist(),
#     "b2": model.fc2.bias.detach().numpy().tolist(),
#     "W3": model.fc3.weight.detach().numpy().T.tolist(),
#     "b3": model.fc3.bias.detach().numpy().tolist(),
#     "vocab_list": vocab_list,
#     "binary_cols": binary_cols,
#     "best_task_cols": best_task_cols,
#     "suboptimal_task_cols": suboptimal_task_cols
# }

# # Save to JSON file
# with open('model_artifacts.json', 'w') as f:
#     json.dump(artifacts, f)

# print("Successfully saved 'model_artifacts.json'.")

In [18]:
import json

# 1. Extract Model Weights & Biases
# We transpose weights (.T) so the math becomes (input @ weights + bias)
# This is standard for NumPy inference.
weights = []
biases = []

for key, param in model.state_dict().items():
    if 'weight' in key:
        weights.append(param.cpu().detach().numpy().T.tolist())
    elif 'bias' in key:
        biases.append(param.cpu().detach().numpy().tolist())

# 2. Extract Scaler Statistics (Critical for correct input scaling)
# Check if scalers are fitted; if using the manual method, you might need to hardcode these
scaler_data = {
    "num_mean": scaler_num.mean_.tolist(),
    "num_scale": scaler_num.scale_.tolist(),
    "count_mean": scaler_count.mean_.tolist(),
    "count_scale": scaler_count.scale_.tolist()
}

# 3. Bundle Everything
artifacts = {
    "weights": weights,
    "biases": biases,
    "vocab_map": vocab_map,
    "binary_cols": binary_cols, # List of binary column names
    "scalers": scaler_data
}

with open('model_artifacts.json', 'w') as f:
    json.dump(artifacts, f)

print("Export complete: model_artifacts.json")

Export complete: model_artifacts.json
