# Clean Dataset and Feature Engineering

## Import 

In [1]:
import os
import re
from collections import Counter

import numpy as np
import pandas as pd

from sklearn.preprocessing import StandardScaler

import torch
from torch import nn
from torch.utils.data import TensorDataset, DataLoader

In [2]:
curr_dir = os.getcwd()
data_dir = 'cleaned_data'
train_file = 'train_clean.csv'
valid_file = 'validation_clean.csv'
test_file = 'test_clean.csv'
path_to_train = os.path.join(curr_dir, data_dir, train_file)
path_to_valid = os.path.join(curr_dir, data_dir, valid_file)
path_to_test = os.path.join(curr_dir, data_dir, test_file)
train_df = pd.read_csv(path_to_train)
valid_df = pd.read_csv(path_to_valid)
test_df = pd.read_csv(path_to_test)
train_df = pd.DataFrame(train_df)
valid_df = pd.DataFrame(valid_df)
test_df = pd.DataFrame(test_df)

In [3]:
train_df.head()
train_df.dtypes

student_id                                                            int64
tasks_use_model                                                      object
academic_use_likelihood                                               int64
suboptimal_frequency                                                float64
suboptimal_example                                                   object
reference_expectation                                               float64
verify_frequency                                                    float64
verify_method                                                        object
label                                                                 int64
best_task_types_brainstorming_or_generating_creative_ideas            int64
best_task_types_converting_content_between_formats                    int64
best_task_types_data_processing_or_analysis                           int64
best_task_types_drafting_professional_text                            int64
best_task_ty

In [4]:
import re

text_cols = ["tasks_use_model", "suboptimal_example", "verify_method"]

def clean_text(s):
    # Convert input to string, handling NaNs, floats, etc.
    if s is None:
        s = ""
    s = str(s)  
    
    # Your original cleaning logic
    s = s.lower()
    s = re.sub(r"[^a-z0-9\s]", " ", s)
    s = re.sub(r"\s+", " ", s)
    return s.strip()

for col in text_cols:
    # Use .astype(str) on the column before applying for extra safety, 
    # or just use the improved clean_text function.
    train_df[col] = train_df[col].apply(clean_text) 
    valid_df[col] = valid_df[col].apply(clean_text)
    test_df[col] = test_df[col].apply(clean_text)

In [5]:
text_cols = ["tasks_use_model", "suboptimal_example", "verify_method"]

def combine_text(df):
    df["full_text"] = df[text_cols].fillna("").agg(" ".join, axis=1)
    return df

train_df = combine_text(train_df)
valid_df = combine_text(valid_df)
test_df = combine_text(test_df)

combined_text = train_df["full_text"] + valid_df["full_text"] + test_df["full_text"]

In [6]:
full_corpus = combined_text.str.cat(sep=' ')

all_words = full_corpus.split()

vocab_vector = np.unique(all_words)

In [7]:
import numpy as np

def encode_text_to_bow(text_series, vocab_vector):
    """
    Converts a pandas Series of text into a raw Bag-of-Words count NumPy array
    based on a provided vocabulary.
    """
    
    # 1. Create a dictionary map for fast vocabulary lookup
    # This maps the word to its column index in the final matrix
    vocab_map = {word: i for i, word in enumerate(vocab_vector)}
    vocab_size = len(vocab_vector)
    num_documents = len(text_series)
    
    # Initialize the count matrix (BoW)
    # Using integer type for simple counts
    X_bow = np.zeros((num_documents, vocab_size), dtype=np.int32)
    
    # 2. Fill the BoW count matrix
    for doc_index, document in enumerate(text_series):
        # The text is assumed to be cleaned and lowercased already
        words = document.split()
        
        for word in words:
            if word in vocab_map:
                word_index = vocab_map[word]
                # Increment the count for this word in this document
                X_bow[doc_index, word_index] += 1
                
    return X_bow

X_train_bow = encode_text_to_bow(train_df['full_text'], vocab_vector)
X_valid_bow = encode_text_to_bow(valid_df['full_text'], vocab_vector)
X_test_bow = encode_text_to_bow(test_df['full_text'], vocab_vector)

print(f"Shape of Training BoW Matrix: {X_train_bow.shape}")
print(f"Example of first row (document counts): {X_train_bow[0, :5]}")

Shape of Training BoW Matrix: (576, 2738)
Example of first row (document counts): [0 0 0 0 0]


In [8]:
train_encoded = pd.DataFrame([train_df['academic_use_likelihood'], 
                        train_df['suboptimal_frequency'], 
                        train_df['reference_expectation'],
                        train_df['verify_frequency']]).transpose()

valid_encoded = pd.DataFrame([valid_df['academic_use_likelihood'], 
                        valid_df['suboptimal_frequency'], 
                        valid_df['reference_expectation'],
                        valid_df['verify_frequency']]).transpose()

test_encoded = pd.DataFrame([test_df['academic_use_likelihood'], 
                        test_df['suboptimal_frequency'], 
                        test_df['reference_expectation'],
                        test_df['verify_frequency']]).transpose()

In [9]:
train_encoded = pd.concat([train_encoded, pd.DataFrame(X_train_bow)],ignore_index=True, sort=False, axis=1)
valid_encoded = pd.concat([valid_encoded, pd.DataFrame(X_valid_bow)],ignore_index=True, sort=False, axis=1)
test_encoded = pd.concat([test_encoded, pd.DataFrame(X_test_bow)],ignore_index=True, sort=False, axis=1)

In [10]:
train_t = np.stack([train_df['label']], axis=1).reshape(-1)
valid_t = np.stack([valid_df['label']], axis=1).reshape(-1)
test_t = np.stack([test_df['label']], axis=1).reshape(-1)

SKIP THESE CODE BLOCKS. KEEPING IT STILL JUST FOR REFERENCE

In [None]:
# Separate features and target
X_num_cat = train_df[num_cols + cat_cols].copy()
y = train_df[target_col].values

# Scale numeric columns (on full dataset)
scaler = StandardScaler()
if num_cols:
    X_num_scaled = scaler.fit_transform(X_num_cat[num_cols])
    X_num_cat[num_cols] = X_num_scaled

X_num_cat = X_num_cat.to_numpy().astype(np.float32)  # numeric + cat
X_bow = bow_matrix.astype(np.float32)

# Final feature matrix: concat [numeric+cat, BoW]
X = np.concatenate([X_num_cat, X_bow], axis=1)
y = y.astype(np.int64)

N, input_dim = X.shape
num_classes = len(np.unique(y))

print("X shape:", X.shape)
print("y shape:", y.shape)
print("num_classes:", num_classes)

X shape: (576, 1620)
y shape: (576,)
num_classes: 3


In [None]:
X_val_num_cat = valid_df[num_cols + cat_cols].copy()

# Scale numeric columns with the same scaler
if num_cols:
    X_val_num = scaler.transform(X_val_num_cat[num_cols])
    X_val_num_cat[num_cols] = X_val_num

X_val_num_cat = X_val_num_cat.to_numpy().astype(np.float32)
X_val = np.concatenate([X_val_num_cat, bow_matrix_val], axis=1)

y_val = valid_df[target_col].astype(np.int64).values

END SKIP

In [33]:
batch_size = 32

train_ds = TensorDataset(
    torch.from_numpy(train_encoded.to_numpy().astype(np.float32)),
    torch.from_numpy(train_t.astype(np.int64))
)

train_loader = DataLoader(train_ds, batch_size=batch_size, shuffle=True)

In [34]:
batch_size = 32

val_ds = TensorDataset(
    torch.from_numpy(valid_encoded.to_numpy().astype(np.float32)),
    torch.from_numpy(valid_t.astype(np.int64))
)

val_loader = DataLoader(val_ds, batch_size=batch_size, shuffle=False)

In [35]:
batch_size = 32

test_ds = TensorDataset(
    torch.from_numpy(test_encoded.to_numpy().astype(np.float32)),
    torch.from_numpy(test_t.astype(np.int64))
)

test_loader = DataLoader(test_ds, batch_size=batch_size, shuffle=False)

In [36]:
input_dim=train_encoded.to_numpy().shape[1]
num_classes=3

In [37]:
next(iter(train_loader))[0].shape

torch.Size([32, 2742])

In [48]:
class MLPBoW(nn.Module):
    def __init__(self, input_dim, hidden_dim=64, num_classes=3, dropout_p=0.3):
        super().__init__()
        self.fc1 = nn.Linear(input_dim, hidden_dim)
        # self.fc2 = nn.Linear(hidden_dim, hidden_dim)
        self.out = nn.Linear(hidden_dim, num_classes)
        self.relu = nn.ReLU()
        # self.dropout = nn.Dropout(dropout_p)

    def forward(self, x):
        x = self.relu(self.fc1(x))
        # x = self.relu(self.fc2(x))
        # x = self.dropout(x)
        x = self.out(x)
        return x

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = MLPBoW(input_dim=input_dim, hidden_dim=64, num_classes=num_classes, dropout_p=0.3).to(device)
model

MLPBoW(
  (fc1): Linear(in_features=2742, out_features=64, bias=True)
  (out): Linear(in_features=64, out_features=3, bias=True)
  (relu): ReLU()
)

In [45]:
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=1e-3)

def evaluate(loader):
    model.eval()
    correct, total, running_loss = 0, 0, 0.0
    with torch.no_grad():
        for xb, yb in loader:
            xb = xb.to(device)
            yb = yb.to(device)
            logits = model(xb)
            loss = criterion(logits, yb)
            running_loss += loss.item() * xb.size(0)
            preds = logits.argmax(dim=1)
            correct += (preds == yb).sum().item()
            total += yb.size(0)
    return running_loss / total, correct / total

In [49]:
import copy

num_epochs = 100
patience = 10
min_delta = 0.0

best_val_loss = float("inf")
best_epoch = 0
epochs_no_improve = 0
best_state_dict = copy.deepcopy(model.state_dict())

history = {"train_loss": [], "train_acc": [], "val_loss": [], "val_acc": []}

for epoch in range(1, num_epochs + 1):
    model.train()
    running_loss = 0.0

    for xb, yb in train_loader:
        xb = xb.to(device)
        yb = yb.to(device)

        optimizer.zero_grad()
        logits = model(xb)
        loss = criterion(logits, yb)
        loss.backward()
        optimizer.step()

        running_loss += loss.item() * xb.size(0)

    # === After epoch: evaluate ===
    train_loss, train_acc = evaluate(train_loader)
    val_loss, val_acc = evaluate(val_loader)

    history["train_loss"].append(train_loss)
    history["train_acc"].append(train_acc)
    history["val_loss"].append(val_loss)
    history["val_acc"].append(val_acc)

    if epoch % 5 == 0 or epoch == 1:
        print(
            f"Epoch {epoch:3d} | "
            f"train_loss={train_loss:.4f} | train_acc={train_acc:.4f} | "
            f"val_loss={val_loss:.4f} | val_acc={val_acc:.4f}"
        )

# restore best model
model.load_state_dict(best_state_dict)

Epoch   1 | train_loss=1.0984 | train_acc=0.3524 | val_loss=1.0993 | val_acc=0.3333
Epoch   5 | train_loss=1.0984 | train_acc=0.3524 | val_loss=1.0993 | val_acc=0.3333
Epoch  10 | train_loss=1.0984 | train_acc=0.3524 | val_loss=1.0993 | val_acc=0.3333
Epoch  15 | train_loss=1.0984 | train_acc=0.3524 | val_loss=1.0993 | val_acc=0.3333
Epoch  20 | train_loss=1.0984 | train_acc=0.3524 | val_loss=1.0993 | val_acc=0.3333
Epoch  25 | train_loss=1.0984 | train_acc=0.3524 | val_loss=1.0993 | val_acc=0.3333
Epoch  30 | train_loss=1.0984 | train_acc=0.3524 | val_loss=1.0993 | val_acc=0.3333
Epoch  35 | train_loss=1.0984 | train_acc=0.3524 | val_loss=1.0993 | val_acc=0.3333
Epoch  40 | train_loss=1.0984 | train_acc=0.3524 | val_loss=1.0993 | val_acc=0.3333
Epoch  45 | train_loss=1.0984 | train_acc=0.3524 | val_loss=1.0993 | val_acc=0.3333
Epoch  50 | train_loss=1.0984 | train_acc=0.3524 | val_loss=1.0993 | val_acc=0.3333
Epoch  55 | train_loss=1.0984 | train_acc=0.3524 | val_loss=1.0993 | val_acc

<All keys matched successfully>

In [50]:
model.eval()
correct = 0
total = 0
with torch.no_grad():
    for xb, yb in test_loader:
        xb = xb.to(device)
        yb = yb.to(device)
        logits = model(xb)
        preds = logits.argmax(dim=1) if logits.dim() > 1 else (logits.view(-1) > 0.5).long()
        correct += (preds == yb).sum().item()
        total += yb.size(0)
test_acc = float(correct) / float(total) if total else 0.0

print(f"Test accuracy: {test_acc:.6f}")

Test accuracy: 0.293651
