In [None]:
import torch
import pandas as pd
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split

import numpy as np
from torch.utils.data import Dataset, DataLoader
import torch.nn as nn
from tqdm import tqdm
from torch.nn import functional as F
import matplotlib.pyplot as plt

from torch.optim.lr_scheduler import LambdaLR

### Load Data

In [None]:
# Load the raw table from a CSV file located next to the notebook.
df = pd.read_csv("./Students_Grading_Dataset.csv")
# df

# Disabled debug helper: print the first value of every column.
# for att in df:
#     print(f"{att}: {df[att][0]}")

## Manually select columns (attributes)

In [None]:

# Columns excluded from modelling.
unimportant_attribute = [
    'Student_ID',
    'First_Name',
    'Last_Name',
    'Email',
    'Participation_Score',
]

# Drop them by column label; `df` itself is left untouched.
filtered_df = df.drop(columns=unimportant_attribute)
filtered_df

In [None]:
# Columns holding categorical labels; they are label-encoded in `preprocessing`.
category_vars = ['Gender', 'Department', 'Grade', 'Extracurricular_Activities', 'Internet_Access_at_Home', 'Parent_Education_Level', 'Family_Income_Level']
# Earlier selection, kept for reference:
# numerical_score_vars = ['Attendance (%)', 'Midterm_Score', 'Final_Score', 'Assignments_Avg', 'Quizzes_Avg', 'Participation_Score', 'Projects_Score', 'Total_Score', 'Stress_Level (1-10)']
# Columns scaled with the fixed 0-100 bounds of `max_min_norm_score`.
numerical_score_vars = ['Attendance (%)', 'Midterm_Score', 'Final_Score', 'Assignments_Avg', 'Quizzes_Avg', 'Projects_Score', 'Total_Score']

# Every remaining column is treated as a "scalar" numerical variable.
# The list is built in DataFrame column order instead of via set arithmetic:
# the iteration order of a set of strings changes between interpreter runs
# (hash randomisation), which would silently permute the model's input columns.
_assigned_vars = set(category_vars) | set(numerical_score_vars)
numerical_scalar_vars = [col for col in filtered_df.columns if col not in _assigned_vars]

## Separate rows with and without NaN values

In [None]:
# Boolean Series: True for every row that has a missing value in any column.
nan_rows = filtered_df.isnull().any(axis='columns')

# Rows containing at least one NaN
df_nan = filtered_df.loc[nan_rows]
print(f"row with Nan: {df_nan.shape}")
# Rows with no missing value
df_complete = filtered_df.loc[~nan_rows]
print(f"row without Nan: {df_complete.shape}")

In [None]:
# Random 70/30 split of the complete rows into training and validation sets.
# A fixed random_state keeps the partition identical between runs.
# (The earlier positional 80/20 split via iloc was replaced by this call.)
df_train, df_valid = train_test_split(
    df_complete,
    test_size=0.3,
    random_state=0,
)

print(f"df_train: {df_train.shape}")
print(f"df_valid: {df_valid.shape}")

## Preprocessing
1. Encode categorical variables as integer labels
2. Min-max normalize numerical variables (scalar variables are log-transformed first)

In [None]:
def category_to_numerical(data):
    """Label-encode a categorical column.

    Args:
        data: array-like of category labels.

    Returns:
        Tuple ``(codes, encoder)``: the integer codes for ``data`` and the
        fitted ``LabelEncoder``, so the same mapping can be re-applied later.
    """
    encoder = LabelEncoder().fit(data)  # fit() returns the encoder itself
    return encoder.transform(data), encoder

# def max_min_norm(data, train_params = None, process_type = 'train'):
    
#     if process_type == 'train':
#         data_max = np.max(data)
#         data_min = np.min(data)
#     else:
#         data_max = train_params['Age'][0]
#         data_min = train_params['Age'][1]
        
#     norm_data = (data - data_min) / (data_max - data_min + 1e-3)    
    
#     if process_type == 'train':
#         return norm_data, data_max, data_min
#     else:
#         return norm_data
    

def max_min_norm_score(data, train_params = None, process_type = 'train'):
    """Min-max scale score values using the fixed bounds 0 and 100.

    Args:
        data: number, array or Series of scores.
        train_params: accepted for signature compatibility; not used.
        process_type: ``'train'`` also returns the bounds; any other value
            returns only the scaled data.

    Returns:
        ``(scaled, 100, 0)`` when ``process_type == 'train'``, else ``scaled``.
    """
    # The bounds are constants, identical for every process_type.
    upper, lower = 100, 0
    scaled = (data - lower) / (upper - lower)

    return (scaled, upper, lower) if process_type == 'train' else scaled
    
def max_min_norm_scalar(data, train_params = None, process_type = 'train'):
    """Min-max scale scalar values using the fixed bounds 0 and 10.

    Args:
        data: number, array or Series of values.
        train_params: accepted for signature compatibility; not used.
        process_type: ``'train'`` also returns the bounds; any other value
            returns only the scaled data.

    Returns:
        ``(scaled, 10, 0)`` when ``process_type == 'train'``, else ``scaled``.
    """
    # The bounds are constants, identical for every process_type.
    upper, lower = 10, 0
    scaled = (data - lower) / (upper - lower)

    return (scaled, upper, lower) if process_type == 'train' else scaled

    
def preprocessing(df, train_params = None, process_type = 'train'):
    """Turn a raw DataFrame into the numeric table fed to the model.

    Output columns are ordered as ``category_vars``, then
    ``numerical_score_vars``, then ``numerical_scalar_vars``:
      * categorical columns are label-encoded,
      * score columns are scaled by ``max_min_norm_score``,
      * scalar columns are log-transformed, then scaled by ``max_min_norm_scalar``.

    Args:
        df: DataFrame containing all of the columns listed above.
        train_params: dict produced by a previous ``'train'`` call; read only
            when ``process_type`` is not ``'train'``.
        process_type: ``'train'`` fits the label encoders and records the
            parameters; any other value re-uses ``train_params``.

    Returns:
        ``(new_df, train_params, category_var_len)`` in train mode, where
        ``category_var_len`` maps each categorical column to its number of
        distinct codes; otherwise only ``new_df``.
    """
    is_train = process_type == 'train'
    if is_train:
        train_params = {}
        category_var_len = {}

    # Collected as name -> array; the DataFrame is assembled once at the end.
    columns = {}

    # Categorical columns -> integer codes
    for cat_name in category_vars:
        raw = df[cat_name]
        encoder_key = f'{cat_name}_le'
        if is_train:
            codes, encoder = category_to_numerical(raw)
            train_params[encoder_key] = encoder
            category_var_len[f'{cat_name}'] = len(np.unique(codes))
        else:
            codes = train_params[encoder_key].transform(raw)
        columns[f'{cat_name}'] = codes

    # Score columns
    for num_name in numerical_score_vars:
        raw = df[num_name]
        if is_train:
            scaled, data_max, data_min = max_min_norm_score(raw, process_type = 'train')
            train_params[num_name] = [data_max, data_min]
        else:
            scaled = max_min_norm_score(raw, train_params, process_type = 'valid')
        columns[num_name] = scaled.values

    # Scalar columns: natural log first, then scaling
    for num_name in numerical_scalar_vars:
        logged = np.log(df[num_name])
        if is_train:
            scaled, data_max, data_min = max_min_norm_scalar(logged, process_type = 'train')
            train_params[num_name] = [data_max, data_min]
        else:
            scaled = max_min_norm_scalar(logged, train_params, process_type = 'valid')
        columns[num_name] = scaled.values

    new_df = pd.DataFrame(columns)

    if is_train:
        return new_df, train_params, category_var_len
    return new_df


In [None]:
# Preprocess the training split in 'train' mode, which also returns the fitted
# parameters and the number of distinct codes per categorical column.
processed_df_train, train_params, category_var_len = preprocessing(df_train, process_type = 'train')
# train_params
print(f"category_var_len: {category_var_len}")
print(f"processed_df_train: {processed_df_train.shape}")
processed_df_train.head()

In [None]:
# Preprocess the validation split with the parameters fitted on the training split.
processed_df_valid = preprocessing(df_valid, train_params, process_type = 'valid')
print(f"processed_df_valid: {processed_df_valid.shape}")
processed_df_valid.head()

In [None]:
class TableDataset(Dataset):
    """Dataset that serves the rows of a 2-D table as tensors."""

    def __init__(self, data):
        # Materialise the input (e.g. a DataFrame) as a single NumPy array.
        self.data = np.array(data)

    def __len__(self):
        # Number of rows in the table.
        return len(self.data)

    def __getitem__(self, id):
        # Select one row and wrap it as a tensor.
        return torch.from_numpy(self.data[id, :])

In [None]:
# Global hyper-parameters read by the model definition below.
BATCH_SIZE = 256
DROPOUT = 0.3

# Use the first GPU when available, otherwise fall back to the CPU.
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")


train_dataset = TableDataset(processed_df_train)
train_dataloader = DataLoader(train_dataset, batch_size=BATCH_SIZE, num_workers=4, shuffle=True)

valid_dataset = TableDataset(processed_df_valid)
# Validation data is iterated in a fixed order: shuffling adds nothing to
# evaluation and makes per-epoch validation metrics harder to compare.
valid_dataloader = DataLoader(valid_dataset, batch_size=BATCH_SIZE, num_workers=4, shuffle=False)

In [None]:
# category_var_len: {'Gender': 2, 'Department': 4, 'Grade': 5, 
                #    'Extracurricular_Activities': 2, 'Internet_Access_at_Home': 2, 
                #    'Parent_Education_Level': 4, 'Family_Income_Level': 3}


In [None]:
class tableModel(nn.Module):
    """Masked-value reconstruction model for tabular rows.

    Each input row is laid out as ``[categorical codes | numerical values]``.
    In the forward pass a random subset of the numerical cells is overwritten
    with a sentinel value and a residual MLP predicts the numerical values.
    Categorical columns are currently passed through unchanged.

    Args:
        category_var_len: mapping from categorical column name to its number
            of distinct codes; its length fixes the number of categorical columns.
        num_numerical_var: number of numerical columns at the end of each row.
            NOTE(review): the default of 12 is kept for backward compatibility.
            It must equal the number of numerical columns actually produced by
            preprocessing, otherwise the trailing slice overlaps categorical
            columns - verify against the processed DataFrame.
    """

    def __init__(self, category_var_len, num_numerical_var=12):
        super().__init__()
        self.num_category_var = len(category_var_len)
        self.num_numerical_var = num_numerical_var
        self.category_emb_size = 1
        self.category_dict = category_var_len
        self.batch_size = BATCH_SIZE

        # Probability that any single cell is selected for masking.
        self.mask_prob = 0.05

        # Transformer hyper-parameters; stored, but no layer defined below uses them.
        self.transformer_layer = 4
        self.transformer_emb_size = 32
        self.num_head = 4
        self.seq_len = (self.num_category_var * self.category_emb_size) + self.num_numerical_var

        # Bias-free MLP over the numerical columns: n -> 32 -> 64 -> 32 -> n.
        self.linear_numerical1 = nn.Linear(self.num_numerical_var, 32, bias = False)
        self.linear_numerical2 = nn.Linear(32, 64 , bias = False)
        self.linear_numerical3 = nn.Linear(64, 32 , bias = False)
        self.linear_numerical4 = nn.Linear(32, self.num_numerical_var , bias = False)

        self.relu = nn.ReLU()

        self.dropout = nn.Dropout(DROPOUT)

    def masking_table(self, x, seed=42, training = True):
        """Randomly mask numerical cells of a batch.

        Args:
            x: tensor of shape (batch_size, num_category_var + num_numerical_var).
            seed: seed of the private generator used when ``training`` is False.
            training: when True the masks come from the global RNG and differ on
                every call; when False they come from a generator seeded with
                ``seed`` and are therefore reproducible.

        Returns:
            Tuple ``(masked_x, masking_position)``. ``masked_x`` concatenates the
            unmodified categorical codes with the masked numerical values.
            ``masking_position`` holds the boolean masks under the keys
            ``'masking_category'`` and ``'masking_numerical'``.
        """
        self.masking_prob = self.mask_prob
        device = x.device  # keep every new tensor on the input's device

        if training:
            # Use the global RNG as-is. It must not be re-seeded here: that would
            # restrict the masks to a handful of patterns and also disturb the
            # randomness of dropout and data shuffling.
            generator = None
        else:
            # A private generator gives reproducible evaluation masks without
            # touching the global RNG state.
            generator = torch.Generator(device=device)
            generator.manual_seed(int(seed))

        # Categorical part: positions are drawn and reported, but the values are
        # returned unmasked (categorical masking is currently disabled).
        category_var = x[:, :self.num_category_var].long()
        random_cat = torch.rand(category_var.shape, generator=generator, device=device)
        masking_cat = random_cat < self.masking_prob

        # Numerical part
        numerical_var = x[:, -self.num_numerical_var:].float()
        random_numerical = torch.rand(numerical_var.shape, generator=generator, device=device)
        masking_numerical = random_numerical < self.masking_prob

        masked_numerical_var = numerical_var.clone()  # clone so the input is not modified
        # Masked cells get the sentinel 1.5 (presumably chosen to lie outside
        # the range of the normalised inputs - TODO confirm).
        masked_numerical_var[masking_numerical] = 1.5

        masking_position = {
            'masking_category': masking_cat,
            'masking_numerical': masking_numerical,
        }

        return torch.cat([category_var, masked_numerical_var], dim=1), masking_position

    def forward(self, x, training):
        """Mask the batch and reconstruct its numerical columns.

        Args:
            x: tensor of shape (batch_size, num_category_var + num_numerical_var).
            training: forwarded to ``masking_table`` to select random (True) or
                reproducible (False) masks.

        Returns:
            Tuple ``(num_vars, cat_vars, masking_position)``: the predicted
            numerical values, the categorical columns of the masked input, and
            the mask dictionary from ``masking_table``.
        """
        # Pass `training` by keyword: the second positional parameter of
        # masking_table is `seed`, not `training`.
        x, masking_position = self.masking_table(x, training=training)

        cat_vars = x[:, :self.num_category_var * self.category_emb_size]
        num_vars_ori = x[:, - self.num_numerical_var:]

        hidden = self.dropout(self.relu(self.linear_numerical1(num_vars_ori)))
        hidden = self.dropout(self.relu(self.linear_numerical2(hidden)))
        hidden = self.dropout(self.relu(self.linear_numerical3(hidden)))
        # Residual connection: the MLP predicts a correction to the masked input.
        num_vars = self.linear_numerical4(hidden) + num_vars_ori

        return num_vars, cat_vars, masking_position
    
    # def inference(self, x):
    #     '''
    #     DO NOT mask during inference
    #     '''
    #     ''' masking'''
    #     x = self.masking_table(x)
        
    #     # category vars
    #     cat_vars = []
    #     for c_id, encode_fn in zip(range(self.num_category_var), self.encoders):
    #         emb_c = encode_fn(x[:,c_id].long())
    #         emb_c = self.encode_dropout(emb_c)
    #         cat_vars.append(emb_c)
    #     cat_vars = torch.cat(cat_vars, dim = 1).float()

    #     # numerical vars
    #     num_vars = x[:, - self.num_numerical_var:].float()

    #     # combine category and numerical vars        
    #     x = torch.cat([cat_vars, num_vars], dim = 1)

    #     '''
    #     Transformer
    #     '''
    #     x = torch.unsqueeze(x, dim = 2)
    #     x = self.gpt(x)
    #     x = torch.squeeze(x, dim = 2)
        
        
    #     ''' Decode category ''' 
    #     # split numerical and category
    #     num_vars = x[:, - self.num_numerical_var:]
    #     cat_vars = x[:, :self.num_category_var * 2]
        
    #     # category vars
    #     decoded_cat_vars = []
    #     for c_id, decode_fn in zip(range(self.num_category_var), self.decoders):
    #         emb_c = cat_vars[:, c_id * self.emb_size: (c_id + 1) * self.emb_size]
    #         c_var = decode_fn(emb_c)
    #         c_var = self.decode_dropout(c_var)
    #         pred_c = torch.argmax(c_var, dim = -1)
    #         decoded_cat_vars.append(pred_c)
        
    #     decoded_cat_vars = torch.stack(decoded_cat_vars, dim = 1)

    #     pred = torch.cat([decoded_cat_vars, num_vars], dim = 1)

    #     return pred

In [None]:

# Rebuild the datasets and loaders (this repeats the construction done in the
# configuration cell earlier in the notebook).
train_dataset = TableDataset(processed_df_train)
train_dataloader = DataLoader(train_dataset, batch_size=BATCH_SIZE, num_workers=4, shuffle=True)

valid_dataset = TableDataset(processed_df_valid)
# Validation data is iterated in a fixed order: shuffling adds nothing to
# evaluation and makes per-epoch validation metrics harder to compare.
valid_dataloader = DataLoader(valid_dataset, batch_size=BATCH_SIZE, num_workers=4, shuffle=False)

In [None]:
def linear_warmup_decay_lr(lr_init, lr_final, num_warmup_steps, num_training_steps):
    """Build the multiplier function for ``LambdaLR``: linear warm-up, then linear decay.

    Args:
        lr_init: initial (peak) learning rate, i.e. the optimizer's base lr.
        lr_final: learning rate reached at the end of training (not 0).
        num_warmup_steps: number of warm-up steps.
        num_training_steps: total number of training steps.

    Returns:
        A function ``lr_lambda(current_step)`` returning the factor applied to
        ``lr_init``. It rises linearly from 0 to 1 during warm-up, then falls
        linearly to ``lr_final / lr_init`` at ``num_training_steps`` and stays
        there for any later step.
    """
    final_ratio = lr_final / lr_init
    decay_steps = num_training_steps - num_warmup_steps

    def lr_lambda(current_step):
        if current_step < num_warmup_steps:
            return current_step / num_warmup_steps  # linear warm-up
        if decay_steps <= 0:
            # No decay phase configured: avoid dividing by zero and hold the final rate.
            return final_ratio
        # Clamp so that steps beyond num_training_steps never push the rate
        # below lr_final (or negative).
        progress = min(1.0, (current_step - num_warmup_steps) / decay_steps)
        return (1 - progress) * (1 - final_ratio) + final_ratio  # linear decay to lr_final
    return lr_lambda

In [None]:
LEARNING_RATE = 2e-3
EPOCHS = 5000


model = tableModel(category_var_len).to(device)
# optimizer = torch.optim.AdamW(model.parameters(), lr = 1e-3)
optimizer = torch.optim.AdamW(
    model.parameters(),
    lr=LEARNING_RATE,
    betas=(0.9, 0.95),
    eps=1e-6,
    weight_decay=1e-3,
)

# The training loop steps the scheduler once per epoch, so the "steps" below
# are epochs: 100 warm-up epochs, then decay to 1% of the base rate by EPOCHS.
scheduler = LambdaLR(
    optimizer,
    lr_lambda=linear_warmup_decay_lr(
        lr_init = LEARNING_RATE,
        lr_final = LEARNING_RATE * 1e-2,
        num_warmup_steps = 100,
        num_training_steps = EPOCHS,
    ),
)


In [None]:
# MSE_loss_fn = nn.MSELoss()
# CE_loss_fn = nn.CrossEntropyLoss()

# def loss_fn(pred_numerical, pred_category, label, masking_position):
    
#     num_numerical = 12
#     num_category = 7
#     ratio_numerical = num_numerical / (num_numerical + num_｀category)
#     ratio_category = 1 / (num_numerical + num_category)
    
#     label_category = label[:, :num_category]
#     label_numerical = label[:, -num_numerical:]
    
#     total_loss = torch.zeros(1).to(device)
    
#     mse_loss = MSE_loss_fn(pred_numerical, label_numerical)
#     total_loss += (mse_loss * ratio_numerical)
    
#     for i in range(num_category):
#         pred = pred_category[i]
#         loss = CE_loss_fn(pred, label_category[:, i].long())
#         total_loss += (loss * ratio_category)
        
#     return total_loss, mse_loss, (total_loss - mse_loss)

MSE_loss_fn = nn.MSELoss(reduction='none')  # element-wise MSE
CE_loss_fn = nn.CrossEntropyLoss(reduction='none')  # element-wise cross-entropy

def loss_fn(pred_numerical, pred_category, label, mask_position):
    """Mean squared error over the masked numerical cells only.

    Args:
        pred_numerical: predicted numerical values, shape (batch_size, num_numerical).
        pred_category: categorical predictions; currently unused because the
            cross-entropy term is disabled.
        label: unmasked input rows; the numerical columns are its last
            ``num_numerical`` columns.
        mask_position: dict with the boolean mask ``'masking_numerical'`` of
            shape (batch_size, num_numerical).

    Returns:
        Tuple ``(total_loss, mse_loss, ce_loss)``: ``total_loss`` has shape (1,),
        ``mse_loss`` is a scalar tensor and ``ce_loss`` is a zero placeholder of
        shape (1,).
    """
    device = label.device

    masking_numerical = mask_position['masking_numerical']  # shape: (batch_size, num_numerical)
    # Take the column count from the mask instead of hard-coding it.
    num_numerical = masking_numerical.shape[1]
    label_numerical = label[:, -num_numerical:]

    # === 1. MSE loss ===
    # Keep only the masked positions of the prediction and the target.
    pred_numerical_masked = pred_numerical[masking_numerical]
    label_numerical_masked = label_numerical[masking_numerical]

    if pred_numerical_masked.numel() > 0:  # at least one masked position
        mse_loss = MSE_loss_fn(pred_numerical_masked, label_numerical_masked).mean()
    else:
        # Nothing was masked in this batch. Return a zero that is still attached
        # to the graph so loss.backward() works and yields zero gradients.
        mse_loss = pred_numerical.sum() * 0.0

    total_loss = mse_loss.reshape(1)

    # === 2. Cross-entropy loss (disabled) ===
    # for i in range(num_category):
    #     category_mask = masking_category[:, i]  # shape: (batch_size,)
    #     pred = pred_category[i]  # shape: (batch_size, num_classes)
    #     label_cat = label_category[:, i].long()  # shape: (batch_size,)
    #
    #     # keep only the masked positions
    #     pred_masked = pred[category_mask]
    #     label_masked = label_cat[category_mask]
    #
    #     if pred_masked.shape[0] > 0:  # at least one masked position
    #         ce_loss = CE_loss_fn(pred_masked, label_masked).mean()
    #         total_loss += ce_loss * ratio_category

    return total_loss, mse_loss, torch.zeros(1, device=device)


In [None]:

# Per-epoch mean losses, plotted after training.
train_LOSS = []
valid_LOSS = []

for epoch in tqdm(range(EPOCHS), desc="iterate epoch"):
    # Per-batch metrics collected during this epoch.
    batch_losses, batch_mse, batch_ce = [], [], []
    val_batch_losses, val_batch_mse, val_batch_ce = [], [], []

    # Progress is reported every 100 epochs.
    report = epoch % 100 == 0

    # ---- training pass ----
    model.train()
    for data in train_dataloader:
        data = data.float().to(device)

        # The unmasked batch itself is the reconstruction target.
        pred_numerical, pred_category, masking_position = model(data, training = True)
        loss, mse_loss, ce_loss = loss_fn(pred_numerical, pred_category, data, masking_position)
        batch_losses.append(loss.item())
        batch_mse.append(mse_loss.item())
        batch_ce.append(ce_loss.item())

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

    # The learning-rate schedule advances once per epoch.
    scheduler.step()

    losses = np.mean(batch_losses)
    mse_losses = np.mean(batch_mse)
    ce_losses = np.mean(batch_ce)
    train_LOSS.append(losses)

    if report:
        print(f"epoch: {epoch}, loss: {losses}, mse: {mse_losses}, ce: {ce_losses}")

    # ---- validation pass (no gradients) ----
    model.eval()
    with torch.no_grad():
        for data in valid_dataloader:
            data = data.float().to(device)

            pred_numerical, pred_category, masking_position = model(data, training = False)
            loss, mse_loss, ce_loss = loss_fn(pred_numerical, pred_category, data, masking_position)
            val_batch_losses.append(loss.item())
            val_batch_mse.append(mse_loss.item())
            val_batch_ce.append(ce_loss.item())

    val_losses = np.mean(val_batch_losses)
    val_mse_losses = np.mean(val_batch_mse)
    val_ce_losses = np.mean(val_batch_ce)
    valid_LOSS.append(val_losses)

    if report:
        print(f"epoch: {epoch}, val_loss: {val_losses}, val_mse: {val_mse_losses}, val_ce_losses: {val_ce_losses}")
        print()
        
        
        
        # pred = model.inference(data)
        
        
    

In [None]:


# Per-epoch training (blue) and validation (red) loss curves.
# Each line carries a label; without labels plt.legend() has nothing to show.
plt.plot(range(len(train_LOSS)), train_LOSS, color = 'blue', label = 'train')
plt.plot(range(len(valid_LOSS)), valid_LOSS, color = 'red', label = 'valid')

plt.xlabel('Epochs')
plt.ylabel('Loss')
plt.legend()
plt.show()

In [None]:
# Scratch: open a single image file.
# NOTE(review): this import sits outside the notebook's top import cell, and
# the relative path below depends on the directory the notebook is run from.
from PIL import Image

img = Image.open('../../dataset/train/0.png')

In [None]:
# Convert the opened image into a NumPy array.
img_np = np.array(img)

In [None]:
# Display the shape of the image array.
img_np.shape

In [None]:
# Scratch tensor of uniform random values with shape (128, 128).
a = torch.rand((128,128))
a.shape

In [None]:
# Presumably the element count of the 128 x 128 tensor `a`.
128 * 128

In [None]:
# Band of one standard deviation around the mean of `a`.
mean = a.mean()
std = a.std()

outlier_upper = mean + 1 * std
outlier_down = mean - 1 * std

# Boolean mask of the entries lying strictly inside the band.
(a > outlier_down) & (a < outlier_upper)

In [None]:
# Select the entries of `a` strictly inside the band (returned as a 1-D tensor).
k = a[(a < outlier_upper) & (a > outlier_down)]
k

In [None]:
# NOTE(review): this cell repeats the previous one verbatim.
k = a[(a < outlier_upper) & (a > outlier_down)]
k

In [None]:
# Count the entries of `a` strictly inside the band.
torch.sum(((a < outlier_upper) & (a > outlier_down)))

In [None]:
# Lower-triangular 8 x 8 matrix of ones (zeros above the diagonal).
mask = torch.ones((8,8))
causal_mask = torch.tril(mask)
# causal_mask[:8, :8] = float('-inf')
causal_mask

In [None]:
# Replace the zeros of the mask with -inf; all other entries are kept.
causal_mask = torch.where(causal_mask == 0, float('-inf'), causal_mask)

In [None]:
# Display the mask.
causal_mask

In [None]:
# Scratch batch of 64 random 8 x 8 matrices.
c = torch.rand((64, 8,8))
c.shape

In [None]:
# Apply the causal mask by filling the blocked positions with -inf.
# Multiplying by a mask that contains -inf is unsafe: 0 * -inf is NaN and a
# negative score times -inf becomes +inf. `causal_mask != 1` marks every
# blocked position whether the mask still holds 0 or already holds -inf.
d = c.masked_fill(causal_mask != 1, float('-inf'))
d[0]

In [None]:
# Softmax over the last dimension of `d`.
e = torch.softmax(d, dim=-1)
e

In [None]:
# Scratch literal; the cell only displays this value.
0.04

In [None]:
# Scratch: integer tensor holding 0 and 1.
torch.arange(2)

In [None]:
# torch.rand() needs a size argument; create a one-element learnable parameter.
nn.Parameter(torch.rand(1))