<a href="https://www.kaggle.com/code/samithsachidanandan/ps-s6e1-ridge-xgb-fe?scriptVersionId=289818351" target="_blank"><img align="left" alt="Kaggle" title="Open in Kaggle" src="https://kaggle.com/static/images/open-in-kaggle.svg"></a>

Acknowledgement: [https://www.kaggle.com/code/mdevian/ps-s6e1-clean-strong-baseline-ridge-xgb-fe](https://www.kaggle.com/code/mdevian/ps-s6e1-clean-strong-baseline-ridge-xgb-fe)

### Importing Libraries and Loading the Data 

In [1]:
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import KFold
import xgboost as xgb
import pandas as pd
import numpy as np

from sklearn.linear_model import RidgeCV
from sklearn.metrics import root_mean_squared_error
from sklearn.preprocessing import TargetEncoder

from sklearn.preprocessing import StandardScaler, RobustScaler
from sklearn.model_selection import StratifiedKFold



import warnings
warnings.filterwarnings("ignore")

np.random.seed(42)

train_file = "/kaggle/input/playground-series-s6e1/train.csv"
test_file = "/kaggle/input/playground-series-s6e1/test.csv"
original_file = "/kaggle/input/exam-score-prediction-dataset/Exam_Score_Prediction.csv"

train_df = pd.read_csv(train_file)
test_df = pd.read_csv(test_file)
original_df = pd.read_csv(original_file)

submission_df = pd.read_csv("/kaggle/input/playground-series-s6e1/sample_submission.csv")

TARGET = "exam_score"
ID_COL = "id"

train_df.shape, test_df.shape, original_df.shape

((630000, 13), (270000, 12), (20000, 13))

### Base features

In [2]:
base_features = [col for col in train_df.columns if col not in [TARGET, ID_COL]]


CATS = train_df.select_dtypes("object").columns.to_list()
print("CATS:", CATS)

CATS: ['gender', 'course', 'internet_access', 'sleep_quality', 'study_method', 'facility_rating', 'exam_difficulty']


### Feature Engineering

In [3]:
def preprocess(df):

    df_temp = df.copy()
    eps = 1e-5
    
    sh_pos = df_temp['study_hours'].clip(lower=0)
    ca_pos = df_temp['class_attendance'].clip(lower=0)
    sl_pos = df_temp['sleep_hours'].clip(lower=0)
    ag_pos = df_temp['age'].clip(lower=0)
    
    df_temp['study_hours_squared'] = df_temp['study_hours'] ** 2
    df_temp['study_hours_cubed'] = df_temp['study_hours'] ** 3
    df_temp['study_hours_quartic'] = df_temp['study_hours'] ** 4
    df_temp['class_attendance_squared'] = df_temp['class_attendance'] ** 2
    df_temp['class_attendance_cubed'] = df_temp['class_attendance'] ** 3
    df_temp['sleep_hours_squared'] = df_temp['sleep_hours'] ** 2
    df_temp['sleep_hours_cubed'] = df_temp['sleep_hours'] ** 3
    df_temp['age_squared'] = df_temp['age'] ** 2
    df_temp['age_cubed'] = df_temp['age'] ** 3
    
    df_temp['log_study_hours'] = np.log1p(sh_pos)
    df_temp['log_class_attendance'] = np.log1p(ca_pos)
    df_temp['log_sleep_hours'] = np.log1p(sl_pos)
    df_temp['sqrt_study_hours'] = np.sqrt(sh_pos)
    df_temp['sqrt_class_attendance'] = np.sqrt(ca_pos)
    
    df_temp['inv_sleep'] = 1.0 / (sl_pos + 1.0)
    df_temp['inv_study'] = 1.0 / (sh_pos + 1.0)
    df_temp['inv_attendance'] = 1.0 / (ca_pos + 1.0)
    
    df_temp['study_tanh'] = np.tanh(df_temp['study_hours'] / 10.0)
    df_temp['sleep_tanh'] = np.tanh(df_temp['sleep_hours'] / 10.0)
    df_temp['attendance_tanh'] = np.tanh(df_temp['class_attendance'] / 100.0)
    
    df_temp['study_sigmoid'] = 1.0 / (1.0 + np.exp(-(df_temp['study_hours'] - 5.0)))
    df_temp['sleep_sigmoid'] = 1.0 / (1.0 + np.exp(-(df_temp['sleep_hours'] - 7.0)))
    df_temp['attendance_sigmoid'] = 1.0 / (1.0 + np.exp(-(df_temp['class_attendance'] - 85.0) / 8.0))
    
    df_temp['study_hours_times_attendance'] = df_temp['study_hours'] * df_temp['class_attendance']
    df_temp['study_hours_times_sleep'] = df_temp['study_hours'] * df_temp['sleep_hours']
    df_temp['attendance_times_sleep'] = df_temp['class_attendance'] * df_temp['sleep_hours']
    df_temp['age_times_study_hours'] = df_temp['age'] * df_temp['study_hours']
    df_temp['age_times_attendance'] = df_temp['age'] * df_temp['class_attendance']
    df_temp['age_times_sleep_hours'] = df_temp['age'] * df_temp['sleep_hours']
    
    df_temp['study_center_5'] = df_temp['study_hours'] - 5.0
    df_temp['sleep_center_7'] = df_temp['sleep_hours'] - 7.0
    df_temp['att_center_85'] = df_temp['class_attendance'] - 85.0
    df_temp['study_center_sq'] = df_temp['study_center_5'] ** 2
    df_temp['sleep_center_sq'] = df_temp['sleep_center_7'] ** 2
    df_temp['att_center_sq'] = df_temp['att_center_85'] ** 2
    
    df_temp['study_hours_over_sleep'] = df_temp['study_hours'] / (df_temp['sleep_hours'] + eps)
    df_temp['attendance_over_sleep'] = df_temp['class_attendance'] / (df_temp['sleep_hours'] + eps)
    df_temp['attendance_over_study'] = df_temp['class_attendance'] / (df_temp['study_hours'] + eps)
    df_temp['sleep_over_study'] = df_temp['sleep_hours'] / (df_temp['study_hours'] + eps)
    df_temp['study_over_age'] = df_temp['study_hours'] / (df_temp['age'] + eps)
    df_temp['attendance_over_age'] = df_temp['class_attendance'] / (df_temp['age'] + eps)
    
    df_temp['study_hours_clip'] = df_temp['study_hours'].clip(0, 12)
    df_temp['sleep_hours_clip'] = df_temp['sleep_hours'].clip(0, 12)
    df_temp['attendance_clip'] = df_temp['class_attendance'].clip(0, 100)
    
    df_temp['sleep_gap_8'] = (df_temp['sleep_hours'] - 8.0).abs()
    df_temp['sleep_gap_7'] = (df_temp['sleep_hours'] - 7.0).abs()
    df_temp['attendance_gap_100'] = (df_temp['class_attendance'] - 100.0).abs()
    df_temp['attendance_gap_90'] = (df_temp['class_attendance'] - 90.0).abs()
    df_temp['study_gap_6'] = (df_temp['study_hours'] - 6.0).abs()
    df_temp['study_gap_8'] = (df_temp['study_hours'] - 8.0).abs()
    
    df_temp['age_bin_num'] = pd.cut(df_temp['age'], bins=[0, 17, 19, 21, 23, 100], labels=[0, 1, 2, 3, 4]).astype(float)
    df_temp['study_bin_num'] = pd.cut(df_temp['study_hours'], bins=[-1, 2, 4, 6, 8, 100], labels=[0, 1, 2, 3, 4]).astype(float)
    df_temp['sleep_bin_num'] = pd.cut(df_temp['sleep_hours'], bins=[-1, 5, 6, 7, 8, 100], labels=[0, 1, 2, 3, 4]).astype(float)
    df_temp['attendance_bin_num'] = pd.cut(df_temp['class_attendance'], bins=[-1, 60, 75, 85, 95, 101], labels=[0, 1, 2, 3, 4]).astype(float)
    
    sleep_quality_map = {'poor': 0, 'average': 1, 'good': 2}
    facility_rating_map = {'low': 0, 'medium': 1, 'high': 2}
    exam_difficulty_map = {'easy': 0, 'moderate': 1, 'hard': 2}
    
    df_temp['sleep_quality_numeric'] = df_temp['sleep_quality'].map(sleep_quality_map).fillna(1).astype(int)
    df_temp['facility_rating_numeric'] = df_temp['facility_rating'].map(facility_rating_map).fillna(1).astype(int)
    df_temp['exam_difficulty_numeric'] = df_temp['exam_difficulty'].map(exam_difficulty_map).fillna(1).astype(int)
    
    df_temp['study_hours_times_sleep_quality'] = df_temp['study_hours'] * df_temp['sleep_quality_numeric']
    df_temp['attendance_times_facility'] = df_temp['class_attendance'] * df_temp['facility_rating_numeric']
    df_temp['sleep_hours_times_difficulty'] = df_temp['sleep_hours'] * df_temp['exam_difficulty_numeric']
    
    df_temp['facility_x_sleepq'] = df_temp['facility_rating_numeric'] * df_temp['sleep_quality_numeric']
    df_temp['difficulty_x_facility'] = df_temp['exam_difficulty_numeric'] * df_temp['facility_rating_numeric']
    df_temp['difficulty_x_sleepq'] = df_temp['exam_difficulty_numeric'] * df_temp['sleep_quality_numeric']
    
    df_temp['high_att_low_sleep'] = ((df_temp['class_attendance'] >= 90) & (df_temp['sleep_hours'] <= 6)).astype(int)
    df_temp['high_att_high_study'] = ((df_temp['class_attendance'] >= 90) & (df_temp['study_hours'] >= 6)).astype(int)
    df_temp['low_att_high_study'] = ((df_temp['class_attendance'] <= 60) & (df_temp['study_hours'] >= 7)).astype(int)
    df_temp['ideal_sleep_flag'] = ((df_temp['sleep_hours'] >= 7) & (df_temp['sleep_hours'] <= 9)).astype(int)
    df_temp['short_sleep_flag'] = (df_temp['sleep_hours'] <= 5.5).astype(int)
    df_temp['high_study_flag'] = (df_temp['study_hours'] >= 7).astype(int)
    
    df_temp['efficiency'] = (df_temp['study_hours'] * df_temp['class_attendance']) / (df_temp['sleep_hours'] + 1)
    df_temp['efficiency2'] = (df_temp['study_hours_clip'] * df_temp['attendance_clip']) / (df_temp['sleep_hours_clip'] + 1)
    df_temp['weighted_sum'] = (0.06 * df_temp['class_attendance'] + 2.0 * df_temp['study_hours'] + 1.2 * df_temp['sleep_hours'])
    df_temp['weighted_sum_x_difficulty'] = df_temp['weighted_sum'] * (1.0 + 0.2 * df_temp['exam_difficulty_numeric'])

  
    df_temp['study_rank'] = sh_pos.rank(pct=True)
    df_temp['attendance_rank'] = ca_pos.rank(pct=True)
    df_temp['sleep_rank'] = sl_pos.rank(pct=True)
    df_temp['age_rank'] = ag_pos.rank(pct=True)


    df_temp['study_z'] = (sh_pos - sh_pos.mean()) / (sh_pos.std() + eps)
    df_temp['attendance_z'] = (ca_pos - ca_pos.mean()) / (ca_pos.std() + eps)
    df_temp['sleep_z'] = (sl_pos - sl_pos.mean()) / (sl_pos.std() + eps)

 
    df_temp['harmonic_effort'] = 3 / (
        (1 / (sh_pos + eps)) +
        (1 / (ca_pos + eps)) +
        (1 / (sl_pos + eps))
        )

    df_temp['geo_effort'] = (
        (sh_pos + 1) *
        (ca_pos + 1) *
        (sl_pos + 1)
        ) ** (1 / 3)

 
    df_temp['study_above_6'] = np.maximum(0, sh_pos - 6)
    df_temp['study_above_8'] = np.maximum(0, sh_pos - 8)
    df_temp['sleep_below_6'] = np.maximum(0, 6 - sl_pos)
    df_temp['attendance_below_75'] = np.maximum(0, 75 - ca_pos)

    df_temp['log_study_sleep_ratio'] = np.log1p(sh_pos) - np.log1p(sl_pos)
    df_temp['log_att_study_ratio'] = np.log1p(ca_pos) - np.log1p(sh_pos)
    
    numeric_features = [
        'study_hours_squared', 'study_hours_cubed', 'study_hours_quartic',
        'class_attendance_squared', 'class_attendance_cubed',
        'sleep_hours_squared', 'sleep_hours_cubed',
        'age_squared', 'age_cubed',
        'log_study_hours', 'log_class_attendance', 'log_sleep_hours',
        'sqrt_study_hours', 'sqrt_class_attendance',
        'inv_sleep', 'inv_study', 'inv_attendance',
        'study_tanh', 'sleep_tanh', 'attendance_tanh',
        'study_sigmoid', 'sleep_sigmoid', 'attendance_sigmoid',
        'study_hours_times_attendance', 'study_hours_times_sleep', 'attendance_times_sleep',
        'age_times_study_hours', 'age_times_attendance', 'age_times_sleep_hours',
        'study_center_5', 'sleep_center_7', 'att_center_85',
        'study_center_sq', 'sleep_center_sq', 'att_center_sq',
        'study_hours_over_sleep', 'attendance_over_sleep',
        'attendance_over_study', 'sleep_over_study',
        'study_over_age', 'attendance_over_age',
        'study_hours_clip', 'sleep_hours_clip', 'attendance_clip',
        'sleep_gap_8', 'sleep_gap_7',
        'attendance_gap_100', 'attendance_gap_90',
        'study_gap_6', 'study_gap_8',
        'age_bin_num', 'study_bin_num', 'sleep_bin_num', 'attendance_bin_num',
        'sleep_quality_numeric', 'facility_rating_numeric', 'exam_difficulty_numeric',
        'study_hours_times_sleep_quality', 'attendance_times_facility', 'sleep_hours_times_difficulty',
        'facility_x_sleepq', 'difficulty_x_facility', 'difficulty_x_sleepq',
        'high_att_low_sleep', 'high_att_high_study', 'low_att_high_study',
        'ideal_sleep_flag', 'short_sleep_flag', 'high_study_flag',
        'efficiency', 'efficiency2',
        'weighted_sum', 'weighted_sum_x_difficulty',
        'study_rank', 'attendance_rank', 'sleep_rank', 'age_rank',
        'study_z', 'attendance_z', 'sleep_z',
        'harmonic_effort', 'geo_effort',
        'study_above_6', 'study_above_8',
        'sleep_below_6', 'attendance_below_75',
        'log_study_sleep_ratio', 'log_att_study_ratio',

        
    ]
    return df_temp[base_features + numeric_features], numeric_features

### Preprocessing and Preparing the Data

In [4]:
X_raw, numeric_cols = preprocess(train_df)
y = train_df[TARGET].reset_index(drop=True)

X_test_raw, _ = preprocess(test_df)
X_orig_raw, _ = preprocess(original_df)
y_orig = original_df[TARGET].reset_index(drop=True)


y = y.clip(0, 100)
y_orig = y_orig.clip(0, 100)

full_data = pd.concat([X_raw, X_test_raw, X_orig_raw], axis=0, ignore_index=True)


for col in numeric_cols:
    full_data[col] = full_data[col].astype(float)


X = full_data.iloc[:len(train_df)].copy()
X_test = full_data.iloc[len(train_df):len(train_df) + len(test_df)].copy()
X_original = full_data.iloc[len(train_df) + len(test_df):].copy()

print(f"Feature shapes - X: {X.shape}, X_test: {X_test.shape}, X_original: {X_original.shape}")

Feature shapes - X: (630000, 99), X_test: (270000, 99), X_original: (20000, 99)


### Ridge Regression 

In [5]:
FOLDS = 10


y_bins = pd.qcut(y, q=10, labels=False, duplicates='drop').astype(int)
kf = StratifiedKFold(n_splits=FOLDS, shuffle=True, random_state=1003)

scalers_ridge = []

N_SAMPLES_TRAIN = X.shape[0]
N_SAMPLES_TEST = X_test.shape[0]

oof_pred_lr = np.zeros(N_SAMPLES_TRAIN)
test_preds_lr = np.zeros((N_SAMPLES_TEST, FOLDS))
orig_preds_lr = np.zeros(X_original.shape[0])

fold_rmse_lr = []
lr_models = []
target_encoders = []

print("\n" + "="*50)
print("TRAINING RIDGE REGRESSION WITH SCALING")
print("="*50)

for fold, (train_index, val_index) in enumerate(kf.split(X, y_bins), start=1):
    print(f"Training fold {fold} (Ridge) ...")
    
    X_train_fold, X_val = X.iloc[train_index], X.iloc[val_index]
    y_train_fold, y_val = y.iloc[train_index], y.iloc[val_index]
    
    X_train_combined = pd.concat([X_train_fold, X_original], axis=0)
    y_train_combined = pd.concat([y_train_fold, y_orig], axis=0)
    

    target_encoder = TargetEncoder(smooth='auto', target_type='continuous')
    
    X_train_encoded = X_train_combined.copy()
    X_val_encoded = X_val.copy()
    X_test_encoded = X_test.copy()
    
    X_train_encoded[CATS] = target_encoder.fit_transform(X_train_combined[CATS], y_train_combined)
    X_val_encoded[CATS] = target_encoder.transform(X_val[CATS])
    X_test_encoded[CATS] = target_encoder.transform(X_test[CATS])
    
    
    scaler = RobustScaler()
    
    X_train_scaled = X_train_encoded.copy()
    X_val_scaled = X_val_encoded.copy()
    X_test_scaled = X_test_encoded.copy()
    
    X_train_scaled[:] = scaler.fit_transform(X_train_encoded)
    X_val_scaled[:] = scaler.transform(X_val_encoded)
    X_test_scaled[:] = scaler.transform(X_test_encoded)
    
    scalers_ridge.append(scaler)
    
    alphas = np.logspace(-2, 4, 30)  
    lr_model = RidgeCV(alphas=alphas, cv=5, scoring='neg_root_mean_squared_error')
    lr_model.fit(X_train_scaled, y_train_combined.to_numpy().ravel())
    lr_models.append(lr_model)
    target_encoders.append(target_encoder)
    
    lr_val_pred = lr_model.predict(X_val_scaled)
    lr_test_pred = lr_model.predict(X_test_scaled)
    lr_orig_pred = lr_model.predict(X_train_scaled.iloc[-X_original.shape[0]:])
    
    lr_val_pred = np.clip(lr_val_pred, 0, 100)
    lr_test_pred = np.clip(lr_test_pred, 0, 100)
    lr_orig_pred = np.clip(lr_orig_pred, 0, 100)
    
    oof_pred_lr[val_index] = lr_val_pred
    test_preds_lr[:, fold - 1] = lr_test_pred
    orig_preds_lr += lr_orig_pred / FOLDS
    
    rmse_lr = root_mean_squared_error(y_val, lr_val_pred)
    fold_rmse_lr.append(rmse_lr)
    print(f"Fold {fold} RMSE (Ridge): {rmse_lr:.6f}, Alpha: {lr_model.alpha_:.6f}")

ridge_oof_rmse = root_mean_squared_error(y, oof_pred_lr)
print(f"\nRidge OOF RMSE: {ridge_oof_rmse:.6f}")
print(f"Ridge Fold RMSE Mean: {np.mean(fold_rmse_lr):.6f} ± {np.std(fold_rmse_lr):.6f}")


TRAINING RIDGE REGRESSION WITH SCALING
Training fold 1 (Ridge) ...
Fold 1 RMSE (Ridge): 8.834381, Alpha: 1.172102
Training fold 2 (Ridge) ...
Fold 2 RMSE (Ridge): 8.902961, Alpha: 0.727895
Training fold 3 (Ridge) ...
Fold 3 RMSE (Ridge): 8.922053, Alpha: 1.172102
Training fold 4 (Ridge) ...
Fold 4 RMSE (Ridge): 8.850362, Alpha: 0.727895
Training fold 5 (Ridge) ...
Fold 5 RMSE (Ridge): 8.933412, Alpha: 1.172102
Training fold 6 (Ridge) ...
Fold 6 RMSE (Ridge): 8.846895, Alpha: 1.172102
Training fold 7 (Ridge) ...
Fold 7 RMSE (Ridge): 8.874743, Alpha: 0.727895
Training fold 8 (Ridge) ...
Fold 8 RMSE (Ridge): 8.927922, Alpha: 0.727895
Training fold 9 (Ridge) ...
Fold 9 RMSE (Ridge): 8.901229, Alpha: 1.172102
Training fold 10 (Ridge) ...
Fold 10 RMSE (Ridge): 8.848843, Alpha: 1.172102

Ridge OOF RMSE: 8.884352
Ridge Fold RMSE Mean: 8.884280 ± 0.035730


### Feature Selection

In [6]:
print("\n" + "="*50)
print("FEATURE SELECTION")
print("="*50)


feature_importance = pd.DataFrame({
    'feature': X_train_encoded.columns,
    'importance': np.abs(lr_models[0].coef_)
}).sort_values('importance', ascending=False)

print("\nTop 30 Features by Ridge importance:")
print(feature_importance.head(30))


n_features_to_keep = 65
top_features = feature_importance.head(n_features_to_keep)['feature'].tolist()
print(f"\nKeeping top {n_features_to_keep} features out of {len(feature_importance)}")


X = X[top_features]
X_test = X_test[top_features]
X_original = X_original[top_features]

print(f"XGB feature shapes - X: {X.shape}, X_test: {X_test.shape}, X_original: {X_original.shape}")


FEATURE SELECTION

Top 30 Features by Ridge importance:
                         feature  importance
23              sqrt_study_hours   60.670410
12             study_hours_cubed   28.683802
13           study_hours_quartic   28.414850
88                       study_z   22.759564
15        class_attendance_cubed   19.341877
31                 study_sigmoid   18.457358
89                  attendance_z   17.291389
43               study_center_sq   16.618647
92                    geo_effort   16.214379
26                     inv_study   15.502622
28                    study_tanh   15.473489
85               attendance_rank   15.309747
86                    sleep_rank   12.099436
50                study_over_age   11.520386
32                 sleep_sigmoid   10.992198
37         age_times_study_hours   10.863083
87                      age_rank   10.691062
97         log_study_sleep_ratio   10.172571
20               log_study_hours    9.886292
7                  sleep_quality    9.66731

### Preparing the Data with Categorical 

In [7]:
print("\n" + "="*50)
print("PREPARING XGB DATA")
print("="*50)


for col in CATS:
    if col in X.columns:
        X[col] = X[col].astype(str).astype("category")
        X_test[col] = X_test[col].astype(str).astype("category")
        X_original[col] = X_original[col].astype(str).astype("category")


X["ridge_pred"] = oof_pred_lr
X_test["ridge_pred"] = test_preds_lr.mean(axis=1)
X_original["ridge_pred"] = orig_preds_lr

print(f"Final XGB shapes with Ridge feature - X: {X.shape}, X_test: {X_test.shape}, X_original: {X_original.shape}")



PREPARING XGB DATA
Final XGB shapes with Ridge feature - X: (630000, 66), X_test: (270000, 66), X_original: (20000, 66)


### XGBoost Training 

In [8]:
print("\n" + "="*50)
print("TRAINING XGBOOST")
print("="*50)

xgb_params = {
    "n_estimators": 15000,
    "learning_rate": 0.005,
    "max_depth": 9,
    "subsample": 0.75,
    "reg_lambda": 5,
    "reg_alpha": 0.1,
    "gamma": 0.1,
    "colsample_bytree": 0.5,
    "colsample_bynode": 0.6,
    "min_child_weight": 5,
    "tree_method": "hist",
    "random_state": 42,
    "early_stopping_rounds": 80,
    "eval_metric": "rmse",
    "enable_categorical": True,
    "device": "cuda",
}

test_predictions = []
oof_predictions = np.zeros(len(X), dtype=float)
xgb_models = []


for fold, (train_index, val_index) in enumerate(kf.split(X, y_bins), start=1):
    print(f"\n--- Fold {fold} (XGB) ---")
    
    X_train_fold, X_val = X.iloc[train_index], X.iloc[val_index]
    y_train_fold, y_val = y.iloc[train_index], y.iloc[val_index]
    
    X_train_combined = pd.concat([X_train_fold, X_original], axis=0)
    y_train_combined = pd.concat([y_train_fold, y_orig], axis=0)
    
    model = xgb.XGBRegressor(**xgb_params)
    model.fit(
        X_train_combined, 
        y_train_combined, 
        eval_set=[(X_val, y_val)], 
        verbose=500
    )
    
    xgb_models.append(model)
    
    val_preds = np.clip(model.predict(X_val), 0, 100)
    oof_predictions[val_index] = val_preds
    
    rmse_fold = np.sqrt(mean_squared_error(y_val, val_preds))
    print(f"Fold {fold} XGB RMSE: {rmse_fold:.5f}")
    
    test_pred = np.clip(model.predict(X_test), 0, 100)
    test_predictions.append(test_pred)

xgb_oof_rmse = np.sqrt(mean_squared_error(y, oof_predictions))
print("\n" + "-"*50)
print(f"XGB OOF RMSE: {xgb_oof_rmse:.5f}")
print("-"*50)


TRAINING XGBOOST

--- Fold 1 (XGB) ---
[0]	validation_0-rmse:18.85448
[500]	validation_0-rmse:8.90254
[1000]	validation_0-rmse:8.71722
[1500]	validation_0-rmse:8.69441
[2000]	validation_0-rmse:8.67938
[2500]	validation_0-rmse:8.66752
[3000]	validation_0-rmse:8.65948
[3500]	validation_0-rmse:8.65351
[4000]	validation_0-rmse:8.64888
[4500]	validation_0-rmse:8.64533
[5000]	validation_0-rmse:8.64199
[5500]	validation_0-rmse:8.63975
[6000]	validation_0-rmse:8.63815
[6500]	validation_0-rmse:8.63702
[7000]	validation_0-rmse:8.63635
[7500]	validation_0-rmse:8.63567
[7755]	validation_0-rmse:8.63548
Fold 1 XGB RMSE: 8.63543

--- Fold 2 (XGB) ---
[0]	validation_0-rmse:18.84380
[500]	validation_0-rmse:8.97119
[1000]	validation_0-rmse:8.78462
[1500]	validation_0-rmse:8.76171
[2000]	validation_0-rmse:8.74662
[2500]	validation_0-rmse:8.73416
[3000]	validation_0-rmse:8.72546
[3500]	validation_0-rmse:8.71939
[4000]	validation_0-rmse:8.71477
[4500]	validation_0-rmse:8.71150
[5000]	validation_0-rmse:8.7

### Ensemble Blending 

In [9]:
print("\n" + "="*50)
print("ENSEMBLE BLENDING")
print("="*50)

alpha = 0.25  

final_oof = alpha * oof_pred_lr + (1 - alpha) * oof_predictions
final_test = alpha * test_preds_lr.mean(axis=1) + (1 - alpha) * np.mean(test_predictions, axis=0)


final_oof = np.clip(final_oof, 0, 100)
final_test = np.clip(final_test, 0, 100)

final_oof_rmse = np.sqrt(mean_squared_error(y, final_oof))

print(f"\nModel Performance:")
print(f"  Ridge OOF RMSE:     {ridge_oof_rmse:.5f}")
print(f"  XGB OOF RMSE:       {xgb_oof_rmse:.5f}")
print(f"  Blended OOF RMSE:   {final_oof_rmse:.5f} (alpha={alpha})")
print(f"\n  Improvement over Ridge: {ridge_oof_rmse - final_oof_rmse:.5f}")
print(f"  Improvement over XGB:   {xgb_oof_rmse - final_oof_rmse:.5f}")


ENSEMBLE BLENDING

Model Performance:
  Ridge OOF RMSE:     8.88435
  XGB OOF RMSE:       8.68503
  Blended OOF RMSE:   8.70126 (alpha=0.25)

  Improvement over Ridge: 0.18309
  Improvement over XGB:   -0.01623


### Submission

In [10]:
print("\n" + "="*50)
print("SAVING RESULTS")
print("="*50)


oof_df = pd.DataFrame({
    "id": train_df[ID_COL], 
    TARGET: final_oof
})
oof_df.to_csv("xgb.csv", index=False)


submission_df[TARGET] = final_test
submission_df.to_csv("submission.csv", index=False)



SAVING RESULTS


In [11]:
submission_df.head()

Unnamed: 0,id,exam_score
0,630000,70.851162
1,630001,69.959517
2,630002,88.350468
3,630003,56.297857
4,630004,46.894351


Acknowledgement: [https://www.kaggle.com/code/mdevian/ps-s6e1-clean-strong-baseline-ridge-xgb-fe](https://www.kaggle.com/code/mdevian/ps-s6e1-clean-strong-baseline-ridge-xgb-fe)