# Clean Dataset and Feature Engineering

## Import and Settings

In [133]:
import os
import re
from collections import Counter

import numpy as np
import pandas as pd

from sklearn.preprocessing import StandardScaler

import torch
from torch import nn
import torch.optim as optim
from torch.utils.data import TensorDataset, DataLoader

import json

# Set seeds for reproducibility
torch.manual_seed(42)
np.random.seed(42)

# Neural Network Hyperparameters
BATCH_SIZE = 16
LEARNING_RATE = 0.002
EPOCHS = 80
VOCAB_SIZE = 300  # Prevent overfitting

In [98]:
curr_dir = os.getcwd()
data_dir = 'cleaned_data'
train_file = 'train_clean.csv'
valid_file = 'validation_clean.csv'
test_file = 'test_clean.csv'
path_to_train = os.path.join(curr_dir, data_dir, train_file)
path_to_valid = os.path.join(curr_dir, data_dir, valid_file)
path_to_test = os.path.join(curr_dir, data_dir, test_file)
train_df = pd.read_csv(path_to_train)
valid_df = pd.read_csv(path_to_valid)
test_df = pd.read_csv(path_to_test)
train_df = pd.DataFrame(train_df)
valid_df = pd.DataFrame(valid_df)
test_df = pd.DataFrame(test_df)

## Data Cleaning

In [99]:
train_df = pd.concat([train_df, valid_df], ignore_index=True)

In [134]:
import re

# suboptimal_example is too noisy will thus be dropped
text_cols = ["tasks_use_model", "verify_method"]
numeric_cols = ["academic_use_likelihood", "suboptimal_frequency", 
                "reference_expectation", "verify_frequency"]
binary_cols = [c for c in train_df.columns if "task_types" in c]

best_task_cols = [c for c in train_df.columns if "best_task_types" in c]
suboptimal_task_cols = [c for c in train_df.columns if "suboptimal_task_types" in c]

def add_task_sum(df):
    df['best_task_count'] = df[best_task_cols].sum(axis=1)
    df['suboptimal_task_count'] = df[suboptimal_task_cols].sum(axis=1)
    return df

train_df = add_task_sum(train_df)
test_df = add_task_sum(test_df)

def clean_text(s):
    # Convert input to string, handling NaNs, floats, etc.
    if s is None:
        s = ""
    s = str(s)  
    
    # Your original cleaning logic
    s = s.lower()
    s = re.sub(r"[^a-z0-9\s]", " ", s)
    s = re.sub(r"\s+", " ", s)
    return s.strip()

for col in text_cols:
    # Use .astype(str) on the column before applying for extra safety, 
    # or just use the improved clean_text function.
    train_df[col] = train_df[col].apply(clean_text) 
    test_df[col] = test_df[col].apply(clean_text)

In [124]:
def combine_text(df):
    df["full_text"] = df[text_cols].fillna("").agg(" ".join, axis=1)
    return df

train_df = combine_text(train_df)
test_df = combine_text(test_df)

# combined_text = train_df["full_text"] + test_df["full_text"]

In [102]:
# Create Vocabulary (Top 300 words only)
word_counts = Counter(train_df["full_text"].str.cat(sep=" ").split())
vocab_list = sorted([w for w, c in word_counts.most_common(VOCAB_SIZE)])
vocab_map = {w: i for i, w in enumerate(vocab_list)}

## SKIP THESE CODE BLOCKS. KEEPING IT STILL JUST FOR REFERENCE

In [None]:
# def encode_text_to_bow(text_series, vocab_vector):
#     """
#     Converts a pandas Series of text into a raw Bag-of-Words count NumPy array
#     based on a provided vocabulary.
#     """
    
#     # 1. Create a dictionary map for fast vocabulary lookup
#     # This maps the word to its column index in the final matrix
#     vocab_map = {word: i for i, word in enumerate(vocab_vector)}
#     vocab_size = len(vocab_vector)
#     num_documents = len(text_series)
    
#     # Initialize the count matrix (BoW)
#     # Using integer type for simple counts
#     X_bow = np.zeros((num_documents, vocab_size), dtype=np.int32)
    
#     # 2. Fill the BoW count matrix
#     for doc_index, document in enumerate(text_series):
#         # The text is assumed to be cleaned and lowercased already
#         words = document.split()
        
#         for word in words:
#             if word in vocab_map:
#                 word_index = vocab_map[word]
#                 # Increment the count for this word in this document
#                 X_bow[doc_index, word_index] += 1
                
#     return X_bow

# X_train_bow = encode_text_to_bow(train_df['full_text'], vocab_vector)
# X_valid_bow = encode_text_to_bow(valid_df['full_text'], vocab_vector)
# X_test_bow = encode_text_to_bow(test_df['full_text'], vocab_vector)

# print(f"Shape of Training BoW Matrix: {X_train_bow.shape}")
# print(f"Example of first row (document counts): {X_train_bow[0, :5]}")

Shape of Training BoW Matrix: (576, 500)
Example of first row (document counts): [0 1 1 0 0]


In [None]:
# train_encoded = pd.DataFrame([train_df['academic_use_likelihood'], 
#                         train_df['suboptimal_frequency'], 
#                         train_df['reference_expectation'],
#                         train_df['verify_frequency']]).transpose()

# valid_encoded = pd.DataFrame([valid_df['academic_use_likelihood'], 
#                         valid_df['suboptimal_frequency'], 
#                         valid_df['reference_expectation'],
#                         valid_df['verify_frequency']]).transpose()

# test_encoded = pd.DataFrame([test_df['academic_use_likelihood'], 
#                         test_df['suboptimal_frequency'], 
#                         test_df['reference_expectation'],
#                         test_df['verify_frequency']]).transpose()

In [None]:
# train_encoded = pd.concat([train_encoded, pd.DataFrame(X_train_bow)],ignore_index=True, sort=False, axis=1)
# valid_encoded = pd.concat([valid_encoded, pd.DataFrame(X_valid_bow)],ignore_index=True, sort=False, axis=1)
# test_encoded = pd.concat([test_encoded, pd.DataFrame(X_test_bow)],ignore_index=True, sort=False, axis=1)

In [None]:
# train_t = np.stack([train_df['label']], axis=1).reshape(-1)
# valid_t = np.stack([valid_df['label']], axis=1).reshape(-1)
# test_t = np.stack([test_df['label']], axis=1).reshape(-1)

In [21]:
# # Separate features and target
# X_num_cat = train_df[num_cols + cat_cols].copy()
# y = train_df[target_col].values

# # Scale numeric columns (on full dataset)
# scaler = StandardScaler()
# if num_cols:
#     X_num_scaled = scaler.fit_transform(X_num_cat[num_cols])
#     X_num_cat[num_cols] = X_num_scaled

# X_num_cat = X_num_cat.to_numpy().astype(np.float32)  # numeric + cat
# X_bow = bow_matrix.astype(np.float32)

# # Final feature matrix: concat [numeric+cat, BoW]
# X = np.concatenate([X_num_cat, X_bow], axis=1)
# y = y.astype(np.int64)

# N, input_dim = X.shape
# num_classes = len(np.unique(y))

# print("X shape:", X.shape)
# print("y shape:", y.shape)
# print("num_classes:", num_classes)

In [22]:
# X_val_num_cat = valid_df[num_cols + cat_cols].copy()

# # Scale numeric columns with the same scaler
# if num_cols:
#     X_val_num = scaler.transform(X_val_num_cat[num_cols])
#     X_val_num_cat[num_cols] = X_val_num

# X_val_num_cat = X_val_num_cat.to_numpy().astype(np.float32)
# X_val = np.concatenate([X_val_num_cat, bow_matrix_val], axis=1)

# y_val = valid_df[target_col].astype(np.int64).values

In [None]:
# batch_size = 32

# train_ds = TensorDataset(
#     torch.from_numpy(train_encoded.to_numpy().astype(np.float32)),
#     torch.from_numpy(train_t.astype(np.int64))
# )

# train_loader = DataLoader(train_ds, batch_size=batch_size, shuffle=True)

In [None]:
# batch_size = 32

# val_ds = TensorDataset(
#     torch.from_numpy(valid_encoded.to_numpy().astype(np.float32)),
#     torch.from_numpy(valid_t.astype(np.int64))
# )

# val_loader = DataLoader(val_ds, batch_size=batch_size, shuffle=False)

In [None]:
# batch_size = 32

# test_ds = TensorDataset(
#     torch.from_numpy(test_encoded.to_numpy().astype(np.float32)),
#     torch.from_numpy(test_t.astype(np.int64))
# )

# test_loader = DataLoader(test_ds, batch_size=batch_size, shuffle=False)

In [None]:
# input_dim=train_encoded.to_numpy().shape[1]
# num_classes=3

In [None]:
# next(iter(train_loader))[0].shape

torch.Size([32, 504])

In [None]:
# class MLPBoW(nn.Module):
#     def __init__(self, input_dim, hidden_dim=64, num_classes=3, dropout_p=0.3):
#         super().__init__()
#         self.fc1 = nn.Linear(input_dim, hidden_dim)
#         # self.fc2 = nn.Linear(hidden_dim, hidden_dim)
#         self.out = nn.Linear(hidden_dim, num_classes)
#         self.relu = nn.ReLU()
#         # self.dropout = nn.Dropout(dropout_p)

#     def forward(self, x):
#         x = self.relu(self.fc1(x))
#         # x = self.relu(self.fc2(x))
#         # x = self.dropout(x)
#         x = self.out(x)
#         return x

# device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# model = MLPBoW(input_dim=input_dim, hidden_dim=64, num_classes=num_classes, dropout_p=0.3).to(device)
# model

MLPBoW(
  (fc1): Linear(in_features=504, out_features=64, bias=True)
  (out): Linear(in_features=64, out_features=3, bias=True)
  (relu): ReLU()
)

In [None]:
# criterion = nn.CrossEntropyLoss()
# optimizer = torch.optim.Adam(model.parameters(), lr=1e-3)

# def evaluate(loader):
#     model.eval()
#     correct, total, running_loss = 0, 0, 0.0
#     with torch.no_grad():
#         for xb, yb in loader:
#             xb = xb.to(device)
#             yb = yb.to(device)
#             logits = model(xb)
#             loss = criterion(logits, yb)
#             running_loss += loss.item() * xb.size(0)
#             preds = logits.argmax(dim=1)
#             correct += (preds == yb).sum().item()
#             total += yb.size(0)
#     return running_loss / total, correct / total

In [None]:
# import copy

# num_epochs = 100
# patience = 10
# min_delta = 0.0

# best_val_loss = float("inf")
# best_epoch = 0
# epochs_no_improve = 0
# best_state_dict = copy.deepcopy(model.state_dict())

# history = {"train_loss": [], "train_acc": [], "val_loss": [], "val_acc": []}

# for epoch in range(1, num_epochs + 1):
#     model.train()
#     running_loss = 0.0

#     for xb, yb in train_loader:
#         xb = xb.to(device)
#         yb = yb.to(device)

#         optimizer.zero_grad()
#         logits = model(xb)
#         loss = criterion(logits, yb)
#         loss.backward()
#         optimizer.step()

#         running_loss += loss.item() * xb.size(0)

#     # === After epoch: evaluate ===
#     train_loss, train_acc = evaluate(train_loader)
#     val_loss, val_acc = evaluate(val_loader)

#     history["train_loss"].append(train_loss)
#     history["train_acc"].append(train_acc)
#     history["val_loss"].append(val_loss)
#     history["val_acc"].append(val_acc)

#     if epoch % 5 == 0 or epoch == 1:
#         print(
#             f"Epoch {epoch:3d} | "
#             f"train_loss={train_loss:.4f} | train_acc={train_acc:.4f} | "
#             f"val_loss={val_loss:.4f} | val_acc={val_acc:.4f}"
#         )

# # restore best model
# model.load_state_dict(best_state_dict)

Epoch   1 | train_loss=1.0124 | train_acc=0.6545 | val_loss=1.0461 | val_acc=0.5772
Epoch   5 | train_loss=0.5773 | train_acc=0.8368 | val_loss=0.8682 | val_acc=0.5935
Epoch  10 | train_loss=0.2974 | train_acc=0.9288 | val_loss=0.9481 | val_acc=0.5772
Epoch  15 | train_loss=0.1709 | train_acc=0.9705 | val_loss=1.2010 | val_acc=0.5691
Epoch  20 | train_loss=0.1095 | train_acc=0.9792 | val_loss=1.4643 | val_acc=0.5691
Epoch  25 | train_loss=0.0768 | train_acc=0.9809 | val_loss=1.7373 | val_acc=0.5447
Epoch  30 | train_loss=0.0583 | train_acc=0.9878 | val_loss=1.9237 | val_acc=0.5366
Epoch  35 | train_loss=0.0472 | train_acc=0.9896 | val_loss=2.1184 | val_acc=0.5041
Epoch  40 | train_loss=0.0398 | train_acc=0.9878 | val_loss=2.2762 | val_acc=0.4959
Epoch  45 | train_loss=0.0367 | train_acc=0.9896 | val_loss=2.3879 | val_acc=0.4878
Epoch  50 | train_loss=0.0307 | train_acc=0.9931 | val_loss=2.5347 | val_acc=0.5041
Epoch  55 | train_loss=0.0279 | train_acc=0.9913 | val_loss=2.6296 | val_acc

<All keys matched successfully>

## END SKIP

In [135]:
def get_features_labels(df, text_series, vocab_map, is_train=True):
    
    # Extract original numeric cols
    orig_nums = df[["academic_use_likelihood", "suboptimal_frequency", 
                    "reference_expectation", "verify_frequency"]].values
    scaled_orig = (orig_nums - 3.0) / 1.2
    
    # Scale the new feature separately
    task_count = df[["best_task_count", "suboptimal_task_count"]].values
    scaled_count = (task_count - 2.0) / 1.5 
    
    X_num = np.hstack([scaled_orig, scaled_count])
    
    # 2. Binary Features
    X_bin = df[binary_cols].values
    
    # 3. Bag of Words (Log Scaled)
    X_bow = np.zeros((len(df), len(vocab_map)), dtype=np.float32)
    for i, text in enumerate(text_series):
        words = text.split()
        for w in words:
            if w in vocab_map:
                X_bow[i, vocab_map[w]] += 1
    X_bow = np.log1p(X_bow)
    
    # Combine
    X = np.hstack([X_num, X_bin, X_bow]).astype(np.float32)
    
    if is_train or 'label' in df.columns:
        y = df['label'].values.astype(np.int64)
        return X, y
    return X, None

X_train, y_train = get_features_labels(train_df, train_df["full_text"], vocab_map)
X_test, y_test = get_features_labels(test_df, test_df["full_text"], vocab_map)

# Loaders
train_tensor = torch.utils.data.TensorDataset(torch.tensor(X_train), torch.tensor(y_train))
train_loader = torch.utils.data.DataLoader(train_tensor, batch_size=BATCH_SIZE, shuffle=True)

In [136]:
class MLP(nn.Module):
    def __init__(self, input_dim):
        super().__init__()
        self.fc1 = nn.Linear(input_dim, 128)
        self.fc2 = nn.Linear(128, 64)
        self.fc3 = nn.Linear(64, 3)
        self.relu = nn.ReLU()
        self.dropout = nn.Dropout(0.4) # High dropout for regularization

    def forward(self, x):
        x = self.relu(self.fc1(x))
        x = self.dropout(x)
        x = self.relu(self.fc2(x))
        x = self.dropout(x)
        x = self.fc3(x)
        return x

model = MLP(input_dim=X_train.shape[1])
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=LEARNING_RATE, weight_decay=1e-3)
scheduler = optim.lr_scheduler.StepLR(optimizer, step_size=15, gamma=0.5)

In [137]:
model.train()
for epoch in range(EPOCHS):
    total_loss = 0
    correct = 0
    total = 0
    
    for X_batch, y_batch in train_loader:
        optimizer.zero_grad()
        outputs = model(X_batch)
        loss = criterion(outputs, y_batch)
        loss.backward()
        optimizer.step()
        
        total_loss += loss.item()
        preds = torch.argmax(outputs, dim=1)
        correct += (preds == y_batch).sum().item()
        total += y_batch.size(0)
        
    scheduler.step()
    
    if (epoch+1) % 5 == 0:
        print(f"Epoch {epoch+1:2d} | Loss: {total_loss/len(train_loader):.4f} | Train Acc: {correct/total:.4f}")

# Final Evaluation
model.eval()
with torch.no_grad():
    logits = model(torch.tensor(X_test))
    preds = torch.argmax(logits, dim=1)
    acc = (preds == torch.tensor(y_test)).float().mean()
    print(f"\nFinal Test Accuracy: {acc.item():.4f}")

Epoch  5 | Loss: 0.5885 | Train Acc: 0.7582
Epoch 10 | Loss: 0.4186 | Train Acc: 0.8484
Epoch 15 | Loss: 0.2755 | Train Acc: 0.9070
Epoch 20 | Loss: 0.1981 | Train Acc: 0.9328
Epoch 25 | Loss: 0.1431 | Train Acc: 0.9557
Epoch 30 | Loss: 0.1151 | Train Acc: 0.9642
Epoch 35 | Loss: 0.0918 | Train Acc: 0.9714
Epoch 40 | Loss: 0.0714 | Train Acc: 0.9814
Epoch 45 | Loss: 0.0731 | Train Acc: 0.9757
Epoch 50 | Loss: 0.0796 | Train Acc: 0.9757
Epoch 55 | Loss: 0.0728 | Train Acc: 0.9814
Epoch 60 | Loss: 0.0557 | Train Acc: 0.9857
Epoch 65 | Loss: 0.0521 | Train Acc: 0.9814
Epoch 70 | Loss: 0.0443 | Train Acc: 0.9914
Epoch 75 | Loss: 0.0567 | Train Acc: 0.9800
Epoch 80 | Loss: 0.0530 | Train Acc: 0.9814

Final Test Accuracy: 0.6349
