In [249]:
import torch
import torch.nn as nn
import torch.optim as optim
import numpy as np
import pandas as pd
from sklearn.preprocessing import MinMaxScaler
from sklearn.cluster import KMeans
from torch.utils.data import DataLoader, TensorDataset
import torch
import pandas as pd
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split

import numpy as np
from torch.utils.data import Dataset, DataLoader
import torch.nn as nn
from tqdm import tqdm
from torch.nn import functional as F
import matplotlib.pyplot as plt

from torch.optim.lr_scheduler import LambdaLR


# 設定設備 (使用 GPU 如果可用)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")


In [250]:
### 1️⃣ 數據處理: K-Means 量化 + Min-Max Normalization
class TabularPreprocessor:
    def __init__(self, num_clusters=10):
        self.num_clusters = num_clusters
        self.scalers = {}  # Min-Max Scalers
        self.kmeans_models = {}  # K-Means 模型
    
    def fit_transform(self, df, num_cols, cat_cols):
        """對數值型數據進行 K-Means 量化 & Min-Max Normalization"""
        # df_transformed = df.copy()
        df_transformed = pd.DataFrame()

        # 數值型數據：K-Means 量化 + Min-Max Normalization
        for col in num_cols:
            kmeans = KMeans(n_clusters=self.num_clusters, random_state=42)
            cluster_labels = kmeans.fit_predict(df[[col]].values)
            self.kmeans_models[col] = kmeans
            
            # Min-Max Normalization
            scaler = MinMaxScaler()
            normalized_values = scaler.fit_transform(cluster_labels.reshape(-1, 1))
            df_transformed[col] = normalized_values.flatten()
            self.scalers[col] = scaler
        

        return df_transformed
        # return df_transformed.astype(np.float32)

    def transform(self, df, num_cols, cat_cols):
        """對新數據進行同樣的轉換"""
        # df_transformed = df.copy()
        df_transformed = pd.DataFrame()

        for col in num_cols:
            cluster_labels = self.kmeans_models[col].predict(df[[col]].values)
            df_transformed[col] = self.scalers[col].transform(cluster_labels.reshape(-1, 1)).flatten()


        return df_transformed
        # return df_transformed.astype(np.float32)
        
    def inverse_transform(self, df_transformed, num_cols):
        """將標準化的數據轉回原始數據格式"""
        df_reversed = pd.DataFrame()

        for col in num_cols:
            # 反向 Min-Max Normalization
            cluster_labels = self.scalers[col].inverse_transform(df_transformed[[col]].values)

            # 反向 K-Means 找回最近的數值
            cluster_centers = self.kmeans_models[col].cluster_centers_.flatten()

            # 確保索引不會超過範圍
            original_values = np.array([cluster_centers[int(np.clip(round(label), 0, len(cluster_centers)-1))] 
                                        for label in cluster_labels.flatten()])

            df_reversed[col] = original_values

        return df_reversed


In [251]:
### 2️⃣ TabMT 模型 (Masked Transformer)
class TabMT(nn.Module):
    def __init__(self, input_dim, d_model=128, num_heads=8, num_layers=4, dropout=0.1):
        super(TabMT, self).__init__()
        self.input_dim = input_dim
        self.d_model = d_model

        # Embedding for each field
        self.embedding = nn.Linear(input_dim, d_model)
        
        # Transformer Encoder
        encoder_layer = nn.TransformerEncoderLayer(d_model=d_model, nhead=num_heads, dropout=dropout)
        self.transformer = nn.TransformerEncoder(encoder_layer, num_layers=num_layers)
        
        # Output layer
        self.output_layer = nn.Linear(d_model, input_dim)  

    def forward(self, x):
        """
        x: (batch_size, input_dim)
        mask: (batch_size, input_dim), 1 = masked, 0 = unmasked
        """
        x = self.embedding(x)  # (batch_size, input_dim, d_model)
        x = self.transformer(x)  # Transformer Encoder
        x = self.output_layer(x)  # (batch_size, input_dim)
        return x


In [252]:
### 3️⃣ Masking 機制
def random_masking(batch, mask_prob=0.3, training = True):
    """
    對 batch 進行隨機遮罩 (Random Masking)
    """
    
    if training:
        seed = torch.randint(0, 2, (1,))
        torch.manual_seed(seed)
        if torch.cuda.is_available():
            torch.cuda.manual_seed_all(seed)
    else:
        torch.manual_seed(42)
        if torch.cuda.is_available():
            torch.cuda.manual_seed_all(42)
    
    
    mask = torch.rand(batch.shape, device=batch.device) < mask_prob
    masked_batch = batch.clone()
    masked_batch[mask] = 0  # 用 0 來代表 MASK
    return masked_batch, mask

In [253]:
def evaluate_tabmt(model, valid_loader):
    model.eval()
    mes_losses = []

    with torch.no_grad():
        for batch in valid_loader:
            batch = batch[0].to(device).float()
            masked_batch, mask = random_masking(batch, 0.1, False)

            outputs = model(masked_batch)
            mse_per_feature = ((outputs[mask] - batch[mask]) ** 2).mean(dim=0)  # 計算每個 feature 的 MSE
            mes_losses.append(mse_per_feature.item())

    print("Valid  MSE:", np.mean(mes_losses))
    print()

In [254]:


### 4️⃣ 訓練函數
def train_tabmt(model, data_loader, data_loader_valid, preprocessor, numerical_score_vars, epochs=50, lr=1e-3):
    optimizer = optim.AdamW(model.parameters(), lr=lr)
    criterion = nn.MSELoss()  # 只對 Masked tokens 計算 Loss
    model.train()

    for epoch in range(epochs):
        total_loss = 0
        
        mse_losses = []
        
        model.train()

        for batch in data_loader:
            batch = batch[0].to(device).float()  # 轉為 Tensor
            
            # 生成 Masked Input
            masked_batch, mask = random_masking(batch, mask_prob=0.1)

            # 預測
            outputs = model(masked_batch)
            
            batch_df = pd.DataFrame(batch.detach().cpu().numpy(), columns=numerical_score_vars)
            outputs_df = pd.DataFrame(outputs.detach().cpu().numpy(), columns=numerical_score_vars)

            original_values = preprocessor.inverse_transform(batch_df, numerical_score_vars)
            predicted_values = preprocessor.inverse_transform(outputs_df, numerical_score_vars)

            
            # 計算 Loss (只對 Masked 部分計算)
            loss = criterion(outputs[mask], batch[mask])

            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

            total_loss += loss.item()
            
            mse_per_feature = ((outputs[mask] - batch[mask]) ** 2).mean(dim=0)  # 計算每個 feature 的 MSE
            mse_losses.append(mse_per_feature.item())
            

        print("Training Per-feature MSE:", np.mean(mse_losses))
        
        # print(f"Epoch [{epoch+1}/{epochs}], Loss: {total_loss / len(data_loader):.4f}")
        evaluate_tabmt(model, data_loader_valid)
    
    print(f"ori: {type(original_values)}")    
    print(f"pred: {type(predicted_values)}")    

    print()
    
    print(f"ori: {original_values.iloc[0, :]}")
    print('---------')  
    print(f"pred: {predicted_values.iloc[0, :]}")    
    
    return model


In [255]:
df = pd.read_csv("./Students_Grading_Dataset.csv")

# remove useless attribute
unimportant_attribute = ['Student_ID', 'First_Name', 'Last_Name', 'Email']
filtered_df = df.drop(unimportant_attribute, axis=1)

# define vars type
category_vars = ['Gender', 'Department', 'Grade', 'Extracurricular_Activities', 'Internet_Access_at_Home', 'Parent_Education_Level', 'Family_Income_Level']
numerical_score_vars = ['Attendance (%)', 'Midterm_Score', 'Final_Score', 'Assignments_Avg', 'Quizzes_Avg', 'Participation_Score', 'Projects_Score', 'Total_Score', 'Stress_Level (1-10)']
numerical_scalar_vars = list(set(filtered_df.columns) - set(category_vars) - set(numerical_score_vars))

nan_rows = filtered_df.isna().any(axis=1)

# Nan rows
df_nan = filtered_df[nan_rows]
print(f"row with Nan: {df_nan.shape}")
# Complete rows
df_complete = filtered_df[~nan_rows]
print(f"row without Nan: {df_complete.shape}")

df_train, df_valid, _, _ = train_test_split(df_complete, df_complete, test_size=0.3, random_state=0)

print(f"df_train: {df_train.shape}")
print(f"df_valid: {df_valid.shape}")

row with Nan: (2419, 19)
row without Nan: (2581, 19)
df_train: (1806, 19)
df_valid: (775, 19)


In [256]:
# 預處理數據
preprocessor = TabularPreprocessor(num_clusters=50)
df_transformed = preprocessor.fit_transform(df_train, num_cols=numerical_score_vars, cat_cols=category_vars)
df_transformed_valid = preprocessor.transform(df_valid, num_cols=numerical_score_vars, cat_cols=category_vars)

# 轉換為 PyTorch Tensor
dataset = TensorDataset(torch.tensor(df_transformed.values))
data_loader = DataLoader(dataset, batch_size=256, shuffle=True)

dataset = TensorDataset(torch.tensor(df_transformed_valid.values))
data_loader_valid = DataLoader(dataset, batch_size=256, shuffle=True)


# 訓練模型
tabmt = TabMT(input_dim=df_transformed.shape[1]).to(device)
tabmt = train_tabmt(tabmt, data_loader, data_loader_valid, preprocessor, numerical_score_vars, epochs=3000)

  return fit_method(estimator, *args, **kwargs)


Training Per-feature MSE: 0.5120200607925653
Valid  MSE: 0.17832684516906738

Training Per-feature MSE: 0.1355664897710085
Valid  MSE: 0.1274215690791607

Training Per-feature MSE: 0.11242064088582993
Valid  MSE: 0.09414102137088776

Training Per-feature MSE: 0.09561207890510559
Valid  MSE: 0.09521926380693913

Training Per-feature MSE: 0.08841738989576697
Valid  MSE: 0.08279376849532127

Training Per-feature MSE: 0.07924612192437053
Valid  MSE: 0.09146656095981598

Training Per-feature MSE: 0.07624122151173651
Valid  MSE: 0.08536043204367161

Training Per-feature MSE: 0.07367256545694545
Valid  MSE: 0.0875404067337513

Training Per-feature MSE: 0.0710652353009209
Valid  MSE: 0.08697316981852055

Training Per-feature MSE: 0.07038846382056363
Valid  MSE: 0.08702417276799679

Training Per-feature MSE: 0.06948159565945389
Valid  MSE: 0.0878992136567831

Training Per-feature MSE: 0.0689416930617881
Valid  MSE: 0.08848796226084232

Training Per-feature MSE: 0.06853015855995181
Valid  MSE: 0