V1-V22 use the LightGBM model; the best version is V11.

From V23 onward, try other models.

In [None]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import OrdinalEncoder, StandardScaler
from sklearn.model_selection import KFold
from sklearn.metrics import mean_squared_error

import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import TensorDataset, DataLoader

# ==============================
# Custom L4 loss function
# ==============================
class L4Loss(nn.Module):
    """Quartic loss: the fourth power of the prediction error.

    ``reduction`` selects how the per-element losses are combined:
    ``'mean'`` averages them, ``'sum'`` adds them up, and any other value
    returns the unreduced per-element tensor.
    """

    def __init__(self, reduction='mean'):
        super().__init__()
        self.reduction = reduction

    def forward(self, y_pred, y_true):
        """Return ``(y_pred - y_true) ** 4`` reduced according to ``self.reduction``."""
        per_element = (y_pred - y_true).pow(4)
        if self.reduction == 'sum':
            return per_element.sum()
        if self.reduction == 'mean':
            return per_element.mean()
        # Any other setting leaves the loss unreduced.
        return per_element

# ==============================
# Device setup
# ==============================
# Prefer the GPU when one is visible to PyTorch, otherwise run on the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
print(f"Using device: {device}")

# ==============================
# Data loading
# ==============================
train_file = "/kaggle/input/playground-series-s6e1/train.csv"
test_file = "/kaggle/input/playground-series-s6e1/test.csv"
original_file = "/kaggle/input/exam-score-prediction-dataset/Exam_Score_Prediction.csv"

# Competition train/test splits plus the original dataset used for augmentation.
train_df, test_df, original_df = (
    pd.read_csv(path) for path in (train_file, test_file, original_file)
)
submission_df = pd.read_csv("/kaggle/input/playground-series-s6e1/sample_submission.csv")
TARGET = 'exam_score'

num_features = ['study_hours', 'class_attendance', 'sleep_hours']
# Every training column except the target and the row id is a base feature.
base_features = [col for col in train_df.columns if col != TARGET and col != 'id']
# All base features (including the numeric ones) are treated as categorical
# inputs; CATS is the same list object as base_features.
CATS = base_features
NUMS = num_features  # only these are truly numerical

# ==============================
# Feature engineering (operates on numerical features only)
# ==============================
def add_engineered_features(df):
    """Return a copy of ``df`` with engineered numeric columns added.

    Added columns, in this order:
      * ``_study_hours_sin`` / ``_class_attendance_sin``: float32 sine of the
        raw value with a period of 12.
      * ``log_<col>`` and ``<col>_sq`` for each column in ``num_features``
        that is present in ``df``.
      * ``feature_formula``: a fixed linear combination of the three raw
        numeric features.

    Afterwards every column listed in ``CATS`` is cast to ``str`` so it can
    be ordinal-encoded. The input frame itself is left untouched.
    """
    out = df.copy()

    # Sine features (computed before the raw columns are cast to strings).
    for raw in ('study_hours', 'class_attendance'):
        out[f'_{raw}_sin'] = np.sin(2 * np.pi * out[raw] / 12).astype('float32')

    # Log and square transforms of each available numeric feature.
    for col in num_features:
        if col not in out.columns:
            continue
        values = out[col]
        out[f'log_{col}'] = np.log1p(values)
        out[f'{col}_sq'] = values ** 2

    # Linear combo feature
    study, attendance, sleep = out['study_hours'], out['class_attendance'], out['sleep_hours']
    out['feature_formula'] = (
        5.9051154511950499 * study
        + 0.34540967058057986 * attendance
        + 1.423461171860262 * sleep
        + 4.7819
    )

    # Categorical inputs are kept as strings for the encoder.
    for cat_col in CATS:
        out[cat_col] = out[cat_col].astype(str)

    return out


# ==============================
# Preprocess numerical and categorical features separately
# ==============================
train_eng = add_engineered_features(train_df)

all_cat_cols = CATS
# All numerical columns: whatever is left once the categorical inputs, the
# target and the id are removed (i.e. the engineered columns).
all_num_cols = [col for col in train_eng.columns if col not in (*CATS, TARGET, 'id')]

# Standardise the numerical columns using training-set statistics.
scaler = StandardScaler().fit(train_eng[all_num_cols])

# Integer-encode the categorical columns; values unseen during fit become -1.
encoder = OrdinalEncoder(handle_unknown='use_encoded_value', unknown_value=-1).fit(
    train_eng[all_cat_cols]
)


def preprocess_pipeline_separate(df):
    """Turn a raw frame into ``(scaled_numerics, encoded_categoricals)``.

    The frame is first passed through ``add_engineered_features``. The
    ``all_num_cols`` columns are then standardised with the fitted module-level
    ``scaler`` and the ``all_cat_cols`` columns are mapped to int64 codes with
    the fitted module-level ``encoder``.
    """
    engineered = add_engineered_features(df)
    category_codes = encoder.transform(engineered[all_cat_cols]).astype(np.int64)
    numeric_block = scaler.transform(engineered[all_num_cols])
    return numeric_block, category_codes


# Model inputs for the training, test and original (augmentation) frames.
X_num, X_cat = preprocess_pipeline_separate(train_df)
X_test_num, X_test_cat = preprocess_pipeline_separate(test_df)
X_orig_num, X_orig_cat = preprocess_pipeline_separate(original_df)

# Regression targets.
y = train_df[TARGET].values
y_original = original_df[TARGET].values

# ==============================
# Number of distinct values per categorical feature (used to size embeddings)
# ==============================
# encoder.categories_ holds one array of known categories per encoded column.
cat_unique_counts = [int(levels.size) for levels in encoder.categories_]

print("Categorical feature cardinalities:", cat_unique_counts)


# ==============================
# SE Block (Squeeze-and-Excitation)
# ==============================
class SEBlock(nn.Module):
    """Squeeze-and-Excitation style feature gate for ``(batch, channels)`` input.

    Each row is passed through a bottleneck MLP (``channels -> channels //
    reduction -> channels``) followed by a sigmoid, and the result rescales
    that same row channel by channel.

    The gate is computed per sample. Averaging over ``dim=0`` (the batch axis)
    instead would produce one gate shared by the whole batch, so a sample's
    output would change with whichever other samples happen to be batched
    with it, and predictions would depend on the inference batch size.

    Args:
        channels: number of input (and output) features.
        reduction: bottleneck shrink factor; the hidden width is
            ``channels // reduction`` but never smaller than 1.
    """

    def __init__(self, channels, reduction=4):
        super().__init__()
        # Guard against a zero-width bottleneck when channels < reduction.
        bottleneck = max(1, channels // reduction)
        self.fc1 = nn.Linear(channels, bottleneck, bias=False)
        self.relu = nn.ReLU()
        self.fc2 = nn.Linear(bottleneck, channels, bias=False)
        self.sigmoid = nn.Sigmoid()

    def forward(self, x):
        """Return ``x`` scaled by its own per-sample gate; shape is unchanged."""
        # x: (batch, channels). There is no spatial axis to pool over for
        # tabular input, so the "squeeze" step is the feature vector itself.
        gate = self.fc1(x)
        gate = self.relu(gate)
        gate = self.fc2(gate)
        gate = self.sigmoid(gate)  # (batch, channels), values in (0, 1)
        return x * gate


# ==============================
# Residual Block with SE
# ==============================
class ResidualBlock(nn.Module):
    """Pre-norm two-layer residual block with an SE gate on the branch.

    The branch computes
    ``LayerNorm -> Linear -> ReLU -> Dropout -> LayerNorm -> Linear -> Dropout -> SE``
    and its result is added back onto the block input. Input and output both
    have ``dim`` features.

    Args:
        dim: feature width of the block.
        dropout: dropout probability used after each linear layer.
        reduction: bottleneck factor forwarded to ``SEBlock``.
    """

    def __init__(self, dim, dropout=0.1, reduction=4):
        super().__init__()
        # Submodule names and creation order are kept stable so checkpoints
        # and random initialisation are unaffected.
        self.linear1 = nn.Linear(dim, dim)
        self.linear2 = nn.Linear(dim, dim)
        self.dropout = nn.Dropout(dropout)
        self.norm1 = nn.LayerNorm(dim)
        self.norm2 = nn.LayerNorm(dim)
        self.se = SEBlock(dim, reduction=reduction)
        self.relu = nn.ReLU()

    def forward(self, x):
        """Apply the gated branch to ``x`` and add the skip connection."""
        # First sub-block: normalise, project, activate, regularise.
        branch = self.dropout(self.relu(self.linear1(self.norm1(x))))
        # Second sub-block: normalise, project, regularise (no activation).
        branch = self.dropout(self.linear2(self.norm2(branch)))
        # Channel gating, then the residual connection.
        return self.se(branch) + x


# ==============================
# Full model: Embedding + Concat + ResNet + Head
# ==============================
class TabularResNetWithEmbedding(nn.Module):
    """Tabular regressor: per-feature embeddings + residual trunk + MLP head.

    Each categorical feature gets its own embedding table. The embeddings are
    concatenated with the numerical inputs, projected to ``hidden_dim``, passed
    through ``n_blocks`` ``ResidualBlock`` layers and reduced to one scalar per
    row by the head.

    Args:
        num_numerical: number of numerical input columns.
        cat_unique_counts: iterable with the number of known categories of each
            categorical feature, in column order. May be empty.
        embedding_dim: width of every categorical embedding.
        hidden_dim: width of the residual trunk.
        n_blocks: number of residual blocks.
        dropout: dropout probability used after the projection, inside the
            blocks and in the head.
        head_dims: hidden layer widths of the prediction head.
    """

    def __init__(
            self,
            num_numerical,
            cat_unique_counts,
            embedding_dim=8,
            hidden_dim=256,
            n_blocks=4,
            dropout=0.1,
            head_dims=(64, 16)  # immutable default; any iterable of ints works
    ):
        super().__init__()
        self.num_numerical = num_numerical
        self.embedding_dim = embedding_dim

        # One table per categorical feature with one extra row for unknown
        # values. padding_idx=-1 resolves to that last row, so it is
        # initialised to zeros and receives no gradient updates.
        self.embeddings = nn.ModuleList([
            nn.Embedding(n_cat + 1, embedding_dim, padding_idx=-1)
            for n_cat in cat_unique_counts
        ])

        total_cat_dim = len(self.embeddings) * embedding_dim
        input_dim = num_numerical + total_cat_dim

        # Projection to hidden_dim
        self.proj = nn.Linear(input_dim, hidden_dim)
        self.dropout_in = nn.Dropout(dropout)

        # Residual blocks
        self.blocks = nn.Sequential(
            *[ResidualBlock(hidden_dim, dropout=dropout) for _ in range(n_blocks)]
        )

        # Prediction head: (Linear -> ReLU -> Dropout) per entry of head_dims,
        # followed by a final Linear producing a single output.
        layers = []
        prev = hidden_dim
        for h in head_dims:
            layers.extend([
                nn.Linear(prev, h),
                nn.ReLU(),
                nn.Dropout(dropout)
            ])
            prev = h
        layers.append(nn.Linear(prev, 1))
        self.head = nn.Sequential(*layers)

    def forward(self, x_num, x_cat):
        """Return one prediction per row.

        Args:
            x_num: float tensor of shape ``(B, num_numerical)``.
            x_cat: integer tensor of shape ``(B, n_cats)`` holding ordinal
                codes, with ``-1`` marking an unknown category.

        Returns:
            Tensor of shape ``(B,)``.
        """
        features = [x_num]
        for i, emb in enumerate(self.embeddings):
            codes = x_cat[:, i]  # (B,)
            # Unknown (-1) is redirected to the reserved last embedding row.
            codes = codes.masked_fill(codes == -1, emb.num_embeddings - 1)
            features.append(emb(codes))  # (B, embedding_dim)

        # Numerical inputs followed by the embeddings in column order; a single
        # concat also works when there are no categorical features at all.
        x = torch.cat(features, dim=1)  # (B, input_dim)

        # Project to hidden space
        x = self.proj(x)
        x = self.dropout_in(x)

        # Residual blocks
        x = self.blocks(x)

        # Prediction head
        return self.head(x).squeeze(1)


# ==============================
# ËÆ≠ÁªÉÂáΩÊï∞Ôºà‰ΩøÁî® L4 LossÔºå‰ΩÜÈ™åËØÅÁî® RMSEÔºâ
# ==============================
def train_model(model, train_loader, val_loader, epochs=200, lr=1e-3, weight_decay=1e-5, patience=20, factor=0.5,
                min_lr=1e-6):
    """Train ``model`` with the L4 loss, early-stopping on validation RMSE.

    Both loaders must yield ``(x_num, x_cat, y)`` batches; each batch is moved
    to the module-level ``device``. Optimisation uses AdamW with gradient-norm
    clipping, and ``ReduceLROnPlateau`` lowers the learning rate when the
    validation RMSE stalls for ``patience // 2`` epochs.

    Args:
        model: network called as ``model(x_num, x_cat)``.
        train_loader: iterable of training batches.
        val_loader: iterable of validation batches.
        epochs: maximum number of epochs.
        lr: initial learning rate.
        weight_decay: AdamW weight decay.
        patience: epochs without improvement before training stops.
        factor: multiplicative learning-rate decay applied by the scheduler.
        min_lr: lower bound for the learning rate.

    Returns:
        ``(model, best_val_rmse)`` where ``model`` has been restored to the
        weights of its best validation epoch.

    Raises:
        ValueError: if ``val_loader`` yields no samples.
    """
    optimizer = optim.AdamW(model.parameters(), lr=lr, weight_decay=weight_decay)
    scheduler = optim.lr_scheduler.ReduceLROnPlateau(
        optimizer, mode='min', factor=factor, patience=patience // 2, min_lr=min_lr
    )
    train_criterion = L4Loss()  # training objective: L4
    # Summed squared error, so the epoch MSE can be computed over the exact
    # number of validation samples rather than as a mean of batch means
    # (which over-weights a short final batch).
    val_criterion = nn.MSELoss(reduction='sum')

    best_val_rmse = float('inf')
    patience_counter = 0
    best_weights = None

    for epoch in range(epochs):
        model.train()
        for xb_num, xb_cat, yb in train_loader:
            xb_num, xb_cat, yb = xb_num.to(device), xb_cat.to(device), yb.to(device)
            optimizer.zero_grad()
            pred = model(xb_num, xb_cat)
            loss = train_criterion(pred, yb)
            loss.backward()
            # Clip gradients: the L4 loss produces very large gradients.
            torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)
            optimizer.step()

        model.eval()
        sq_err_sum = 0.0
        n_val = 0
        with torch.no_grad():
            for xb_num, xb_cat, yb in val_loader:
                xb_num, xb_cat, yb = xb_num.to(device), xb_cat.to(device), yb.to(device)
                pred = model(xb_num, xb_cat)
                sq_err_sum += val_criterion(pred, yb).item()
                n_val += yb.numel()
        if n_val == 0:
            raise ValueError("val_loader yielded no samples; cannot compute validation RMSE")
        # RMSE is the square root of the MSE.
        val_rmse = (sq_err_sum / n_val) ** 0.5
        scheduler.step(val_rmse)  # the scheduler tracks validation RMSE

        if val_rmse < best_val_rmse:
            best_val_rmse = val_rmse
            patience_counter = 0
            # state_dict() returns references to the live parameter tensors, so
            # clone them; otherwise the "best" snapshot would keep changing as
            # training continues.
            best_weights = {
                name: tensor.detach().clone()
                for name, tensor in model.state_dict().items()
            }
        else:
            patience_counter += 1
            if patience_counter >= patience:
                break

        if (epoch + 1) % 5 == 0:
            print(f"Epoch {epoch + 1}/{epochs} | Val RMSE: {val_rmse:.5f}")

    if best_weights is not None:
        model.load_state_dict(best_weights)
    return model, best_val_rmse


# ==============================
# K-fold training
# ==============================
n_splits = 5
kf = KFold(n_splits=n_splits, shuffle=True, random_state=42)

test_predictions = []  # one array of test predictions per fold
oof_predictions = np.zeros(len(y))  # out-of-fold predictions, filled fold by fold

# The test inputs are identical for every fold, so convert them to tensors
# once instead of rebuilding them inside the loop.
X_test_num_t = torch.tensor(X_test_num, dtype=torch.float32)
X_test_cat_t = torch.tensor(X_test_cat, dtype=torch.int64)

print(f"Starting {n_splits}-fold CV with L4 Loss...")

for fold, (train_idx, val_idx) in enumerate(kf.split(X_num, y)):
    print(f"\n--- Fold {fold + 1}/{n_splits} ---")

    # Split
    X_num_train, X_cat_train = X_num[train_idx], X_cat[train_idx]
    y_train = y[train_idx]
    X_num_val, X_cat_val = X_num[val_idx], X_cat[val_idx]
    y_val = y[val_idx]

    # Augment the training part of the fold with the original dataset; the
    # validation part stays competition-only.
    X_num_combined = np.vstack([X_num_train, X_orig_num])
    X_cat_combined = np.vstack([X_cat_train, X_orig_cat])
    y_combined = np.concatenate([y_train, y_original])

    # Tensors
    X_num_train_t = torch.tensor(X_num_combined, dtype=torch.float32)
    X_cat_train_t = torch.tensor(X_cat_combined, dtype=torch.int64)
    y_train_t = torch.tensor(y_combined, dtype=torch.float32)

    X_num_val_t = torch.tensor(X_num_val, dtype=torch.float32)
    X_cat_val_t = torch.tensor(X_cat_val, dtype=torch.int64)
    y_val_t = torch.tensor(y_val, dtype=torch.float32)

    # Datasets & Loaders
    train_ds = TensorDataset(X_num_train_t, X_cat_train_t, y_train_t)
    val_ds = TensorDataset(X_num_val_t, X_cat_val_t, y_val_t)
    train_loader = DataLoader(train_ds, batch_size=256, shuffle=True)
    val_loader = DataLoader(val_ds, batch_size=1024, shuffle=False)

    # A fresh model for every fold
    model = TabularResNetWithEmbedding(
        num_numerical=X_num.shape[1],
        cat_unique_counts=cat_unique_counts,
        embedding_dim=8,
        hidden_dim=256,
        n_blocks=3,
        dropout=0.11,
        head_dims=[64, 16]
    ).to(device)

    # Train
    model, best_rmse = train_model(
        model,
        train_loader,
        val_loader,
        epochs=300,
        lr=1e-3,
        weight_decay=1e-4,
        patience=20,
        factor=0.5,
        min_lr=1e-6
    )

    # Predict on the held-out part of the fold and on the test set
    model.eval()
    with torch.no_grad():
        val_pred = model(X_num_val_t.to(device), X_cat_val_t.to(device)).cpu().numpy()
        test_pred = model(X_test_num_t.to(device), X_test_cat_t.to(device)).cpu().numpy()

    oof_predictions[val_idx] = val_pred
    test_predictions.append(test_pred)

    print(f"Fold {fold + 1} RMSE: {best_rmse:.5f}")

# ==============================
# Final Evaluation & Submission
# ==============================
# Overall out-of-fold score across all folds.
oof_mse = mean_squared_error(y, oof_predictions)
oof_rmse = np.sqrt(oof_mse)
print("\n" + "=" * 50)
print(f"Final OOF RMSE: {oof_rmse:.6f}")
print("=" * 50)

# Persist the out-of-fold predictions keyed by training id.
oof_df = pd.DataFrame({'id': train_df['id'], TARGET: oof_predictions})
oof_df.to_csv('nn_oof.csv', index=False)

# The submission is the element-wise mean of the per-fold test predictions.
submission_df[TARGET] = np.stack(test_predictions).mean(axis=0)
submission_df.to_csv('submission.csv', index=False)
print("\nSubmission saved!")