# Clean Dataset and Feature Engineering

## Import and Settings

In [82]:
import os
import re
import copy
from collections import Counter

import numpy as np
import pandas as pd

from sklearn.preprocessing import StandardScaler

import torch
from torch import nn
import torch.optim as optim
from torch.utils.data import TensorDataset, DataLoader

import json

# Set seeds for reproducibility
torch.manual_seed(42)
np.random.seed(42)

# Neural Network Hyperparameters
BATCH_SIZE = 32
LEARNING_RATE = 0.0005
EPOCHS = 80
VOCAB_SIZE = 500  # Prevent overfitting

In [83]:
curr_dir = os.getcwd()
data_dir = 'cleaned_data'
train_file = 'train_clean.csv'
valid_file = 'validation_clean.csv'
test_file = 'test_clean.csv'
path_to_train = os.path.join(curr_dir, data_dir, train_file)
path_to_valid = os.path.join(curr_dir, data_dir, valid_file)
path_to_test = os.path.join(curr_dir, data_dir, test_file)
train_df = pd.read_csv(path_to_train)
valid_df = pd.read_csv(path_to_valid)
test_df = pd.read_csv(path_to_test)
train_df = pd.DataFrame(train_df)
valid_df = pd.DataFrame(valid_df)
test_df = pd.DataFrame(test_df)

## Data Cleaning

In [84]:
import re

# suboptimal_example is too noisy will thus be dropped
text_cols = ["tasks_use_model", "verify_method"]
numeric_cols = ["academic_use_likelihood", "suboptimal_frequency", 
                "reference_expectation", "verify_frequency"]
binary_cols = [c for c in train_df.columns if "task_types" in c]

best_task_cols = [c for c in train_df.columns if "best_task_types" in c]
suboptimal_task_cols = [c for c in train_df.columns if "suboptimal_task_types" in c]

def add_task_sum(df):
    df['best_task_count'] = df[best_task_cols].sum(axis=1)
    df['suboptimal_task_count'] = df[suboptimal_task_cols].sum(axis=1)
    return df

train_df = add_task_sum(train_df)
valid_df = add_task_sum(valid_df)
test_df = add_task_sum(test_df)

def clean_text(s):
    # Convert input to string, handling NaNs, floats, etc.
    if s is None:
        s = ""
    s = str(s)  
    
    # Your original cleaning logic
    s = s.lower()
    s = re.sub(r"[^a-z0-9\s]", " ", s)
    s = re.sub(r"\s+", " ", s)
    return s.strip()

for col in text_cols:
    # Use .astype(str) on the column before applying for extra safety, 
    # or just use the improved clean_text function.
    train_df[col] = train_df[col].apply(clean_text) 
    valid_df[col] = valid_df[col].apply(clean_text)
    test_df[col] = test_df[col].apply(clean_text)

In [85]:
def combine_text(df):
    df["full_text"] = df[text_cols].fillna("").agg(" ".join, axis=1)
    return df

train_df = combine_text(train_df)
valid_df = combine_text(valid_df)
test_df = combine_text(test_df)

# combined_text = train_df["full_text"] + test_df["full_text"]

In [86]:
# Create Vocabulary (Top 300 words only)
word_counts = Counter(train_df["full_text"].str.cat(sep=" ").split())
vocab_list = sorted([w for w, c in word_counts.most_common(VOCAB_SIZE)])
vocab_map = {w: i for i, w in enumerate(vocab_list)}

## SKIP THESE CODE BLOCKS. KEEPING IT STILL JUST FOR REFERENCE

In [29]:
# def encode_text_to_bow(text_series, vocab_vector):
#     """
#     Converts a pandas Series of text into a raw Bag-of-Words count NumPy array
#     based on a provided vocabulary.
#     """
    
#     # 1. Create a dictionary map for fast vocabulary lookup
#     # This maps the word to its column index in the final matrix
#     vocab_map = {word: i for i, word in enumerate(vocab_vector)}
#     vocab_size = len(vocab_vector)
#     num_documents = len(text_series)
    
#     # Initialize the count matrix (BoW)
#     # Using integer type for simple counts
#     X_bow = np.zeros((num_documents, vocab_size), dtype=np.int32)
    
#     # 2. Fill the BoW count matrix
#     for doc_index, document in enumerate(text_series):
#         # The text is assumed to be cleaned and lowercased already
#         words = document.split()
        
#         for word in words:
#             if word in vocab_map:
#                 word_index = vocab_map[word]
#                 # Increment the count for this word in this document
#                 X_bow[doc_index, word_index] += 1
                
#     return X_bow

# X_train_bow = encode_text_to_bow(train_df['full_text'], vocab_vector)
# X_valid_bow = encode_text_to_bow(valid_df['full_text'], vocab_vector)
# X_test_bow = encode_text_to_bow(test_df['full_text'], vocab_vector)

# print(f"Shape of Training BoW Matrix: {X_train_bow.shape}")
# print(f"Example of first row (document counts): {X_train_bow[0, :5]}")

In [30]:
# train_encoded = pd.DataFrame([train_df['academic_use_likelihood'], 
#                         train_df['suboptimal_frequency'], 
#                         train_df['reference_expectation'],
#                         train_df['verify_frequency']]).transpose()

# valid_encoded = pd.DataFrame([valid_df['academic_use_likelihood'], 
#                         valid_df['suboptimal_frequency'], 
#                         valid_df['reference_expectation'],
#                         valid_df['verify_frequency']]).transpose()

# test_encoded = pd.DataFrame([test_df['academic_use_likelihood'], 
#                         test_df['suboptimal_frequency'], 
#                         test_df['reference_expectation'],
#                         test_df['verify_frequency']]).transpose()

In [31]:
# train_encoded = pd.concat([train_encoded, pd.DataFrame(X_train_bow)],ignore_index=True, sort=False, axis=1)
# valid_encoded = pd.concat([valid_encoded, pd.DataFrame(X_valid_bow)],ignore_index=True, sort=False, axis=1)
# test_encoded = pd.concat([test_encoded, pd.DataFrame(X_test_bow)],ignore_index=True, sort=False, axis=1)

In [32]:
# train_t = np.stack([train_df['label']], axis=1).reshape(-1)
# valid_t = np.stack([valid_df['label']], axis=1).reshape(-1)
# test_t = np.stack([test_df['label']], axis=1).reshape(-1)

In [33]:
# # Separate features and target
# X_num_cat = train_df[num_cols + cat_cols].copy()
# y = train_df[target_col].values

# # Scale numeric columns (on full dataset)
# scaler = StandardScaler()
# if num_cols:
#     X_num_scaled = scaler.fit_transform(X_num_cat[num_cols])
#     X_num_cat[num_cols] = X_num_scaled

# X_num_cat = X_num_cat.to_numpy().astype(np.float32)  # numeric + cat
# X_bow = bow_matrix.astype(np.float32)

# # Final feature matrix: concat [numeric+cat, BoW]
# X = np.concatenate([X_num_cat, X_bow], axis=1)
# y = y.astype(np.int64)

# N, input_dim = X.shape
# num_classes = len(np.unique(y))

# print("X shape:", X.shape)
# print("y shape:", y.shape)
# print("num_classes:", num_classes)

In [34]:
# X_val_num_cat = valid_df[num_cols + cat_cols].copy()

# # Scale numeric columns with the same scaler
# if num_cols:
#     X_val_num = scaler.transform(X_val_num_cat[num_cols])
#     X_val_num_cat[num_cols] = X_val_num

# X_val_num_cat = X_val_num_cat.to_numpy().astype(np.float32)
# X_val = np.concatenate([X_val_num_cat, bow_matrix_val], axis=1)

# y_val = valid_df[target_col].astype(np.int64).values

In [35]:
# batch_size = 32

# train_ds = TensorDataset(
#     torch.from_numpy(train_encoded.to_numpy().astype(np.float32)),
#     torch.from_numpy(train_t.astype(np.int64))
# )

# train_loader = DataLoader(train_ds, batch_size=batch_size, shuffle=True)

In [36]:
# batch_size = 32

# val_ds = TensorDataset(
#     torch.from_numpy(valid_encoded.to_numpy().astype(np.float32)),
#     torch.from_numpy(valid_t.astype(np.int64))
# )

# val_loader = DataLoader(val_ds, batch_size=batch_size, shuffle=False)

In [37]:
# batch_size = 32

# test_ds = TensorDataset(
#     torch.from_numpy(test_encoded.to_numpy().astype(np.float32)),
#     torch.from_numpy(test_t.astype(np.int64))
# )

# test_loader = DataLoader(test_ds, batch_size=batch_size, shuffle=False)

In [38]:
# input_dim=train_encoded.to_numpy().shape[1]
# num_classes=3

In [39]:
# next(iter(train_loader))[0].shape

In [40]:
# class MLPBoW(nn.Module):
#     def __init__(self, input_dim, hidden_dim=64, num_classes=3, dropout_p=0.3):
#         super().__init__()
#         self.fc1 = nn.Linear(input_dim, hidden_dim)
#         # self.fc2 = nn.Linear(hidden_dim, hidden_dim)
#         self.out = nn.Linear(hidden_dim, num_classes)
#         self.relu = nn.ReLU()
#         # self.dropout = nn.Dropout(dropout_p)

#     def forward(self, x):
#         x = self.relu(self.fc1(x))
#         # x = self.relu(self.fc2(x))
#         # x = self.dropout(x)
#         x = self.out(x)
#         return x

# device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# model = MLPBoW(input_dim=input_dim, hidden_dim=64, num_classes=num_classes, dropout_p=0.3).to(device)
# model

In [41]:
# criterion = nn.CrossEntropyLoss()
# optimizer = torch.optim.Adam(model.parameters(), lr=1e-3)

# def evaluate(loader):
#     model.eval()
#     correct, total, running_loss = 0, 0, 0.0
#     with torch.no_grad():
#         for xb, yb in loader:
#             xb = xb.to(device)
#             yb = yb.to(device)
#             logits = model(xb)
#             loss = criterion(logits, yb)
#             running_loss += loss.item() * xb.size(0)
#             preds = logits.argmax(dim=1)
#             correct += (preds == yb).sum().item()
#             total += yb.size(0)
#     return running_loss / total, correct / total

In [42]:
# import copy

# num_epochs = 100
# patience = 10
# min_delta = 0.0

# best_val_loss = float("inf")
# best_epoch = 0
# epochs_no_improve = 0
# best_state_dict = copy.deepcopy(model.state_dict())

# history = {"train_loss": [], "train_acc": [], "val_loss": [], "val_acc": []}

# for epoch in range(1, num_epochs + 1):
#     model.train()
#     running_loss = 0.0

#     for xb, yb in train_loader:
#         xb = xb.to(device)
#         yb = yb.to(device)

#         optimizer.zero_grad()
#         logits = model(xb)
#         loss = criterion(logits, yb)
#         loss.backward()
#         optimizer.step()

#         running_loss += loss.item() * xb.size(0)

#     # === After epoch: evaluate ===
#     train_loss, train_acc = evaluate(train_loader)
#     val_loss, val_acc = evaluate(val_loader)

#     history["train_loss"].append(train_loss)
#     history["train_acc"].append(train_acc)
#     history["val_loss"].append(val_loss)
#     history["val_acc"].append(val_acc)

#     if epoch % 5 == 0 or epoch == 1:
#         print(
#             f"Epoch {epoch:3d} | "
#             f"train_loss={train_loss:.4f} | train_acc={train_acc:.4f} | "
#             f"val_loss={val_loss:.4f} | val_acc={val_acc:.4f}"
#         )

# # restore best model
# model.load_state_dict(best_state_dict)

## END SKIP

In [87]:
def get_features_labels(df, text_series, vocab_map, is_train=True):
    
    # Extract original numeric cols
    orig_nums = df[["academic_use_likelihood", "suboptimal_frequency", 
                    "reference_expectation", "verify_frequency"]].values
    scaled_orig = (orig_nums - 3.0) / 1.2
    
    # Scale the new feature separately
    task_count = df[["best_task_count", "suboptimal_task_count"]].values
    scaled_count = (task_count - 2.0) / 1.5 

    X_num = np.hstack([scaled_orig, scaled_count])
    
    # 2. Binary Features
    X_bin = df[binary_cols].values
    
    # 3. Bag of Words (Log Scaled)
    X_bow = np.zeros((len(df), len(vocab_map)), dtype=np.float32)
    for i, text in enumerate(text_series):
        words = text.split()
        for w in words:
            if w in vocab_map:
                X_bow[i, vocab_map[w]] += 1
    X_bow = np.log1p(X_bow)
    
    # Combine
    X = np.hstack([X_num, X_bin, X_bow]).astype(np.float32)
    
    if is_train or 'label' in df.columns:
        y = df['label'].values.astype(np.int64)
        return X, y
    return X, None

X_train, y_train = get_features_labels(train_df, train_df["full_text"], vocab_map)
X_valid, y_valid = get_features_labels(valid_df, valid_df["full_text"], vocab_map)
X_test, y_test = get_features_labels(test_df, test_df["full_text"], vocab_map)

# Loaders
train_tensor = torch.utils.data.TensorDataset(torch.tensor(X_train), torch.tensor(y_train))
train_loader = torch.utils.data.DataLoader(train_tensor, batch_size=BATCH_SIZE, shuffle=True)
valid_tensor = torch.utils.data.TensorDataset(torch.tensor(X_valid), torch.tensor(y_valid))
valid_loader = torch.utils.data.DataLoader(valid_tensor, batch_size=BATCH_SIZE, shuffle=True)


In [88]:
class MLP(nn.Module):
    def __init__(self, input_dim):
        super().__init__()
        self.fc1 = nn.Linear(input_dim, 128)
        self.fc2 = nn.Linear(128, 64)
        self.fc3 = nn.Linear(64, 3)
        self.relu = nn.ReLU()
        self.dropout = nn.Dropout(0.4) # High dropout for regularization

    def forward(self, x):
        x = self.relu(self.fc1(x))
        x = self.dropout(x)
        x = self.relu(self.fc2(x))
        x = self.dropout(x)
        x = self.fc3(x)
        return x

In [81]:
def train_model(lr, step_size):
    model = MLP(input_dim=X_train.shape[1])
    criterion = nn.CrossEntropyLoss()
    optimizer = optim.Adam(model.parameters(), lr=lr, weight_decay=1e-3)
    # scheduler = optim.lr_scheduler.StepLR(optimizer, step_size=step_size, gamma=0.5)
    

    # 1. Setup tracking for the best model
    best_acc = 0.0
    best_model_wts = copy.deepcopy(model.state_dict()) # Initialize with current weights

    for epoch in range(EPOCHS):
        # --- TRAIN PHASE ---
        model.train() # Set model to training mode
        train_loss = 0.0
        train_correct = 0
        train_total = 0
        
        for X_batch, y_batch in train_loader:
            optimizer.zero_grad()
            
            # Forward pass
            outputs = model(X_batch)
            loss = criterion(outputs, y_batch)
            
            # Backward pass and optimization
            loss.backward()
            optimizer.step()
            
            # Track training metrics
            train_loss += loss.item() * X_batch.size(0)
            preds = torch.argmax(outputs, dim=1)
            train_correct += (preds == y_batch).sum().item()
            train_total += y_batch.size(0)
            
        # Calculate average training loss and accuracy
        epoch_train_loss = train_loss / train_total
        epoch_train_acc = train_correct / train_total

        # --- VALIDATION PHASE ---
        model.eval() # Set model to evaluation mode (critical for Dropout/BatchNorm)
        val_loss = 0.0
        val_correct = 0
        val_total = 0
        
        with torch.no_grad(): # Disable gradient calculation for speed
            for X_valid, y_valid in valid_loader:
                # Forward pass only
                outputs = model(X_valid)
                loss = criterion(outputs, y_valid)
                
                # Track validation metrics
                val_loss += loss.item() * X_valid.size(0)
                preds = torch.argmax(outputs, dim=1)
                val_correct += (preds == y_valid).sum().item()
                val_total += y_valid.size(0)
                
        epoch_val_loss = val_loss / val_total
        epoch_val_acc = val_correct / val_total
        
        # # Step the scheduler (if you are using one)
        # if 'scheduler' in locals():
        #     scheduler.step()

        # --- CHECKPOINTING ---
        # If this epoch's validation accuracy is better than the best we've seen, save it!
        if epoch_val_acc > best_acc:
            best_acc = epoch_val_acc
            best_model_wts = copy.deepcopy(model.state_dict()) # Save a deep copy of weights
            # print(f"Epoch {epoch+1:2d} | Train Loss: {epoch_train_loss:.4f} Acc: {epoch_train_acc:.4f} | Val Loss: {epoch_val_loss:.4f} Acc: {epoch_val_acc:.4f} *NEW BEST*")
    #     elif (epoch+1) % 5 == 0:
    #         # Print status every 5 epochs even if not a new best
    #         print(f"Epoch {epoch+1:2d} | Train Loss: {epoch_train_loss:.4f} Acc: {epoch_train_acc:.4f} | Val Loss: {epoch_val_loss:.4f} Acc: {epoch_val_acc:.4f}")

        # Final Evaluation
        model.load_state_dict(best_model_wts)
        model.eval()
        with torch.no_grad():
            logits = model(torch.tensor(X_test))
            preds = torch.argmax(logits, dim=1)
            acc = (preds == torch.tensor(y_test)).float().mean()
            # print(f"\nFinal Test Accuracy: {acc.item():.4f}")
    return acc, lr

best_acc = 0
best_lr = 0
learning_rates = [0.002, 0.0005, 0.000003]
for lr in learning_rates:
    acc, lr = train_model(lr, 20)
    if acc > best_acc:
        best_acc = acc
        best_lr = lr
print(f"Best Test Accuracy: {best_acc} | Best Learning Rate: {lr}")

Best Test Accuracy: 0.6904761791229248 | Best Learning Rate: 3e-06


## Test Code

In [None]:
# # Final Evaluation
# model.eval()
# with torch.no_grad():
#     logits = model(torch.tensor(X_test))
#     preds = torch.argmax(logits, dim=1)
#     acc = (preds == torch.tensor(y_test)).float().mean()
#     print(f"\nFinal Test Accuracy: {acc.item():.4f}")


Final Test Accuracy: 0.6587
