# Imports

In [1]:
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import KFold
import xgboost as xgb
import pandas as pd
import numpy as np
import warnings
import os

# Suppress every Python warning for the rest of the session (no category or
# module filter is given, so nothing is exempt).
warnings.filterwarnings("ignore")

# Seed NumPy's legacy global RNG. KFold and the XGBoost parameters further
# down pass their own explicit random_state=42.
np.random.seed(42) 

# Data Loading and Preprocessing

In [2]:
# Kaggle input locations: the competition train/test splits plus the original
# dataset, which is appended to the training part of every CV fold later on.
train_file = "/kaggle/input/playground-series-s6e1/train.csv"
test_file = "/kaggle/input/playground-series-s6e1/test.csv"
original_file = "/kaggle/input/exam-score-prediction-dataset/Exam_Score_Prediction.csv"

train_df, test_df, original_df = (
    pd.read_csv(csv_path) for csv_path in (train_file, test_file, original_file)
)
submission_df = pd.read_csv("/kaggle/input/playground-series-s6e1/sample_submission.csv")

TARGET = 'exam_score'
num_features = ['study_hours', 'class_attendance', 'sleep_hours']

# Every train column except the label and the row id is used as a model input.
excluded_cols = {TARGET, 'id'}
base_features = [col for col in train_df.columns if col not in excluded_cols]

def preprocess(df, numeric_cols=None, categorical_cols=None):
    """Build the model's feature frame from a raw data frame.

    The returned frame holds, in this order:

    * every column in ``categorical_cols``, cast to ``str`` (the notebook
      converts them to pandas categoricals after concatenating all sources),
    * ``log_<col>`` = ``log1p(col)`` for each column in ``numeric_cols``,
    * ``<col>_sq`` = ``col ** 2`` for each column in ``numeric_cols``,
    * ``feature_formula``, a fixed linear combination of ``study_hours``,
      ``class_attendance`` and ``sleep_hours``.

    Parameters
    ----------
    df : pandas.DataFrame
        Raw input. It is copied; the caller's frame is never modified.
    numeric_cols : sequence of str, optional
        Columns expanded into log and square terms. Defaults to the
        notebook-level ``num_features`` list.
    categorical_cols : sequence of str, optional
        Columns passed through as strings. Defaults to the notebook-level
        ``base_features`` list.

    Returns
    -------
    pandas.DataFrame
        A new frame containing only the columns listed above.

    Raises
    ------
    KeyError
        If ``df`` lacks any column needed by the transformations.
    """
    # Fall back to the notebook globals so existing `preprocess(df)` calls
    # behave exactly as before.
    numeric_cols = list(num_features if numeric_cols is None else numeric_cols)
    categorical_cols = list(base_features if categorical_cols is None else categorical_cols)
    formula_cols = ['study_hours', 'class_attendance', 'sleep_hours']

    # Fail early with one clear message instead of a bare KeyError raised
    # half-way through the transformations.
    required = dict.fromkeys(numeric_cols + categorical_cols + formula_cols)
    missing = [col for col in required if col not in df.columns]
    if missing:
        raise KeyError(f"preprocess: input frame is missing required columns: {missing}")

    df_temp = df.copy()

    # Engineered terms are computed before the string cast below, because the
    # cast may cover the very same columns.
    for col in numeric_cols:
        df_temp[f'log_{col}'] = np.log1p(df_temp[col])
        df_temp[f'{col}_sq'] = df_temp[col] ** 2

    # Thanks to Spiritmilk for the following feature
    df_temp['feature_formula'] = (
        5.9051154511950499 * df_temp['study_hours'] +
        0.34540967058057986 * df_temp['class_attendance'] +
        1.423461171860262 * df_temp['sleep_hours'] + 4.7819
    )

    for col in categorical_cols:
        df_temp[col] = df_temp[col].astype(str)

    log_cols = [f'log_{col}' for col in numeric_cols]
    sq_cols = [f'{col}_sq' for col in numeric_cols]

    return df_temp[categorical_cols + log_cols + sq_cols + ['feature_formula']]

# Engineer features for each source, then pull out the matching targets.
X_raw, X_test_raw, X_orig_raw = (
    preprocess(frame) for frame in (train_df, test_df, original_df)
)
y = train_df[TARGET].reset_index(drop=True)
y_orig = original_df[TARGET].reset_index(drop=True)

# Stack all three sources so each categorical column is built from the values
# of the combined frame rather than from one source alone.
full_data = pd.concat([X_raw, X_test_raw, X_orig_raw], axis=0)

engineered_cols = (
    ['feature_formula']
    + [f'log_{col}' for col in num_features]
    + [f'{col}_sq' for col in num_features]
)

# One dtype mapping: raw inputs become categoricals, engineered terms floats.
dtype_map = {col: 'category' for col in base_features}
dtype_map.update({col: float for col in engineered_cols if col in full_data.columns})
full_data = full_data.astype(dtype_map)

# Split the stacked frame back into its sources by position.
n_train, n_test = len(train_df), len(test_df)
X = full_data.iloc[:n_train].copy()
X_test = full_data.iloc[n_train:n_train + n_test].copy()
X_original = full_data.iloc[n_train + n_test:].copy()

# Model Training

In [3]:
%%time

# Keyword arguments unpacked into xgb.XGBRegressor for every fold.
xgb_params = dict(
    n_estimators=10000,
    learning_rate=0.007,
    max_depth=7,
    subsample=0.8,
    num_parallel_tree=2,
    reg_lambda=3,
    colsample_bytree=0.6,
    colsample_bynode=0.7,
    tree_method='hist',
    random_state=42,
    early_stopping_rounds=100,
    eval_metric='rmse',
    enable_categorical=True,
)

# 5-fold CV: each fold trains on its training split plus the full original
# dataset, and is scored on the held-out competition rows only.
kf = KFold(n_splits=5, shuffle=True, random_state=42)
oof_predictions = np.zeros(len(X))
test_predictions = []

for fold, (train_index, val_index) in enumerate(kf.split(X, y)):
    print(f"\n--- Fold {fold+1} ---")

    X_val, y_val = X.iloc[val_index], y.iloc[val_index]

    # The original dataset is appended to the training part of the fold.
    X_train_combined = pd.concat([X.iloc[train_index], X_original], axis=0)
    y_train_combined = pd.concat([y.iloc[train_index], y_orig], axis=0)

    model = xgb.XGBRegressor(**xgb_params)
    model.fit(X_train_combined, y_train_combined, eval_set=[(X_val, y_val)], verbose=200)

    # Store this fold's held-out predictions in their original row positions.
    val_preds = model.predict(X_val)
    oof_predictions[val_index] = val_preds

    rmse = np.sqrt(mean_squared_error(y_val, val_preds))
    print(f"RMSE score on validation set for Fold {fold+1}: {rmse:.5f}")

    # Each fold's model also scores the test set; the folds are averaged later.
    test_predictions.append(model.predict(X_test))

oof_rmse = np.sqrt(mean_squared_error(y, oof_predictions))

print("-----------------------")
print(f"Overall Out-of-Fold (OOF) RMSE Score: {oof_rmse:.5f}")


--- Fold 1 ---
[0]	validation_0-rmse:18.77206
[200]	validation_0-rmse:10.13511
[400]	validation_0-rmse:8.88479
[600]	validation_0-rmse:8.69390
[800]	validation_0-rmse:8.65150
[1000]	validation_0-rmse:8.64004
[1200]	validation_0-rmse:8.63663
[1400]	validation_0-rmse:8.63542
[1600]	validation_0-rmse:8.63484
[1794]	validation_0-rmse:8.63479
RMSE score on validation set for Fold 1: 8.63462

--- Fold 2 ---
[0]	validation_0-rmse:18.81617
[200]	validation_0-rmse:10.13433
[400]	validation_0-rmse:8.89381
[600]	validation_0-rmse:8.70024
[800]	validation_0-rmse:8.65787
[1000]	validation_0-rmse:8.64675
[1200]	validation_0-rmse:8.64330
[1400]	validation_0-rmse:8.64183
[1600]	validation_0-rmse:8.64138
[1800]	validation_0-rmse:8.64076
[2000]	validation_0-rmse:8.64048
[2200]	validation_0-rmse:8.64037
[2207]	validation_0-rmse:8.64038
RMSE score on validation set for Fold 2: 8.64026

--- Fold 3 ---
[0]	validation_0-rmse:18.82160
[200]	validation_0-rmse:10.13015
[400]	validation_0-rmse:8.88649
[600]	val

# Submission

In [4]:
# Save the out-of-fold predictions next to their training ids.
oof_df = train_df[['id']].assign(**{TARGET: oof_predictions})
oof_df.to_csv('oof.csv', index=False)

# The submitted prediction is the element-wise mean over the fold models.
fold_mean = np.asarray(test_predictions).mean(axis=0)
submission_df[TARGET] = fold_mean
submission_df.to_csv('submission.csv', index=False)
submission_df.head()

Unnamed: 0,id,exam_score
0,630000,69.162926
1,630001,70.590294
2,630002,90.387497
3,630003,56.545555
4,630004,45.547356
