<a href="https://www.kaggle.com/code/samithsachidanandan/ps-s6e1-ridge-xgb?scriptVersionId=290405706" target="_blank"><img align="left" alt="Kaggle" title="Open in Kaggle" src="https://kaggle.com/static/images/open-in-kaggle.svg"></a>

Acknowledgement: 

[https://www.kaggle.com/code/mdevian/ps-s6e1-clean-strong-baseline-ridge-xgb-fe](https://www.kaggle.com/code/mdevian/ps-s6e1-clean-strong-baseline-ridge-xgb-fe)
[https://www.kaggle.com/code/act18l/s6e1-single-xgb-add-categorymean](https://www.kaggle.com/code/act18l/s6e1-single-xgb-add-categorymean)

### Importing Libraries and Loading the Data 

In [12]:
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import KFold
import xgboost as xgb
import pandas as pd
import numpy as np

from sklearn.linear_model import Ridge
from sklearn.metrics import root_mean_squared_error,mean_absolute_error
from sklearn.preprocessing import TargetEncoder

from sklearn.preprocessing import StandardScaler, RobustScaler
from sklearn.model_selection import StratifiedKFold

from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.linear_model import RidgeCV


import warnings
warnings.filterwarnings("ignore")

np.random.seed(42)

train_file = "/kaggle/input/playground-series-s6e1/train.csv"
test_file = "/kaggle/input/playground-series-s6e1/test.csv"
original_file = "/kaggle/input/exam-score-prediction-dataset/Exam_Score_Prediction.csv"

train_df = pd.read_csv(train_file)
test_df = pd.read_csv(test_file)
original_df = pd.read_csv(original_file)

submission_df = pd.read_csv("/kaggle/input/playground-series-s6e1/sample_submission.csv")

TARGET = "exam_score"
ID_COL = "id"

train_df.shape, test_df.shape, original_df.shape

((630000, 13), (270000, 12), (20000, 13))

### Base features

In [2]:
base_features = [col for col in train_df.columns if col not in [TARGET, ID_COL]]


CATS = train_df.select_dtypes("object").columns.to_list()
print("CATS:", CATS)

CATS: ['gender', 'course', 'internet_access', 'sleep_quality', 'study_method', 'facility_rating', 'exam_difficulty']


### Feature Engineering

In [3]:


class CategoryMeanTransformer(BaseEstimator, TransformerMixin):
    def __init__(self, cat_cols=None):
        self.cat_cols = cat_cols
        self.mappings_ = {}
    
    def fit(self, X, y):
        X = X.copy()
        if self.cat_cols is None:
            self.cat_cols = X.select_dtypes(include=['category', 'object']).columns.tolist()
        self.mappings_ = {}
        for col in self.cat_cols:
            df_temp = pd.DataFrame({col: X[col], 'y': y})
            group_means = df_temp.groupby(col, dropna=False)['y'].mean()
            sorted_categories = group_means.sort_values().index
            self.mappings_[col] = {cat: i for i, cat in enumerate(sorted_categories)}
        return self

    def transform(self, X, y=None):
        X = X.copy()
        for col, mapping in self.mappings_.items():
            if col in X.columns:
                X[col] = X[col].map(mapping).astype(np.float32)
        return X

In [4]:
def preprocess(df):

    df_temp = df.copy()
    eps = 1e-5
    
    sh_pos = df_temp['study_hours'].clip(lower=0)
    ca_pos = df_temp['class_attendance'].clip(lower=0)
    sl_pos = df_temp['sleep_hours'].clip(lower=0)
    ag_pos = df_temp['age'].clip(lower=0)
    
    df_temp['study_hours_squared'] = df_temp['study_hours'] ** 2
    df_temp['study_hours_cubed'] = df_temp['study_hours'] ** 3
    df_temp['study_hours_quartic'] = df_temp['study_hours'] ** 4
    df_temp['class_attendance_squared'] = df_temp['class_attendance'] ** 2
    df_temp['class_attendance_cubed'] = df_temp['class_attendance'] ** 3
    df_temp['sleep_hours_squared'] = df_temp['sleep_hours'] ** 2
    df_temp['sleep_hours_cubed'] = df_temp['sleep_hours'] ** 3
    df_temp['age_squared'] = df_temp['age'] ** 2
    df_temp['age_cubed'] = df_temp['age'] ** 3
    
    df_temp['log_study_hours'] = np.log1p(sh_pos)
    df_temp['log_class_attendance'] = np.log1p(ca_pos)
    df_temp['log_sleep_hours'] = np.log1p(sl_pos)
    df_temp['sqrt_study_hours'] = np.sqrt(sh_pos)
    df_temp['sqrt_class_attendance'] = np.sqrt(ca_pos)
    
    df_temp['inv_sleep'] = 1.0 / (sl_pos + 1.0)
    df_temp['inv_study'] = 1.0 / (sh_pos + 1.0)
    df_temp['inv_attendance'] = 1.0 / (ca_pos + 1.0)
    
    df_temp['study_tanh'] = np.tanh(df_temp['study_hours'] / 10.0)
    df_temp['sleep_tanh'] = np.tanh(df_temp['sleep_hours'] / 10.0)
    df_temp['attendance_tanh'] = np.tanh(df_temp['class_attendance'] / 100.0)
    
    df_temp['study_sigmoid'] = 1.0 / (1.0 + np.exp(-(df_temp['study_hours'] - 5.0)))
    df_temp['sleep_sigmoid'] = 1.0 / (1.0 + np.exp(-(df_temp['sleep_hours'] - 7.0)))
    df_temp['attendance_sigmoid'] = 1.0 / (1.0 + np.exp(-(df_temp['class_attendance'] - 85.0) / 8.0))
    
    df_temp['study_hours_times_attendance'] = df_temp['study_hours'] * df_temp['class_attendance']
    df_temp['study_hours_times_sleep'] = df_temp['study_hours'] * df_temp['sleep_hours']
    df_temp['attendance_times_sleep'] = df_temp['class_attendance'] * df_temp['sleep_hours']
    df_temp['age_times_study_hours'] = df_temp['age'] * df_temp['study_hours']
    df_temp['age_times_attendance'] = df_temp['age'] * df_temp['class_attendance']
    df_temp['age_times_sleep_hours'] = df_temp['age'] * df_temp['sleep_hours']
    
    df_temp['study_center_5'] = df_temp['study_hours'] - 5.0
    df_temp['sleep_center_7'] = df_temp['sleep_hours'] - 7.0
    df_temp['att_center_85'] = df_temp['class_attendance'] - 85.0
    df_temp['study_center_sq'] = df_temp['study_center_5'] ** 2
    df_temp['sleep_center_sq'] = df_temp['sleep_center_7'] ** 2
    df_temp['att_center_sq'] = df_temp['att_center_85'] ** 2
    
    df_temp['study_hours_over_sleep'] = df_temp['study_hours'] / (df_temp['sleep_hours'] + eps)
    df_temp['attendance_over_sleep'] = df_temp['class_attendance'] / (df_temp['sleep_hours'] + eps)
    df_temp['attendance_over_study'] = df_temp['class_attendance'] / (df_temp['study_hours'] + eps)
    df_temp['sleep_over_study'] = df_temp['sleep_hours'] / (df_temp['study_hours'] + eps)
    df_temp['study_over_age'] = df_temp['study_hours'] / (df_temp['age'] + eps)
    df_temp['attendance_over_age'] = df_temp['class_attendance'] / (df_temp['age'] + eps)
    
    df_temp['study_hours_clip'] = df_temp['study_hours'].clip(0, 12)
    df_temp['sleep_hours_clip'] = df_temp['sleep_hours'].clip(0, 12)
    df_temp['attendance_clip'] = df_temp['class_attendance'].clip(0, 100)
    
    df_temp['sleep_gap_8'] = (df_temp['sleep_hours'] - 8.0).abs()
    df_temp['sleep_gap_7'] = (df_temp['sleep_hours'] - 7.0).abs()
    df_temp['attendance_gap_100'] = (df_temp['class_attendance'] - 100.0).abs()
    df_temp['attendance_gap_90'] = (df_temp['class_attendance'] - 90.0).abs()
    df_temp['study_gap_6'] = (df_temp['study_hours'] - 6.0).abs()
    df_temp['study_gap_8'] = (df_temp['study_hours'] - 8.0).abs()
    
    df_temp['age_bin_num'] = pd.cut(df_temp['age'], bins=[0, 17, 19, 21, 23, 100], labels=[0, 1, 2, 3, 4]).astype(float)
    df_temp['study_bin_num'] = pd.cut(df_temp['study_hours'], bins=[-1, 2, 4, 6, 8, 100], labels=[0, 1, 2, 3, 4]).astype(float)
    df_temp['sleep_bin_num'] = pd.cut(df_temp['sleep_hours'], bins=[-1, 5, 6, 7, 8, 100], labels=[0, 1, 2, 3, 4]).astype(float)
    df_temp['attendance_bin_num'] = pd.cut(df_temp['class_attendance'], bins=[-1, 60, 75, 85, 95, 101], labels=[0, 1, 2, 3, 4]).astype(float)
    

    sleep_quality_map = {'poor': 0, 'average': 1, 'good': 2}
    facility_rating_map = {'low': 0, 'medium': 1, 'high': 2}
    exam_difficulty_map = {'easy': 0, 'moderate': 1, 'hard': 2}
    gender_map = {'male': 0, 'female': 1}
    internet_access_map = {'no': 0, 'yes': 1}
    

    df_temp['sleep_quality_numeric'] = df_temp['sleep_quality'].map(sleep_quality_map).fillna(1).astype(int)
    df_temp['facility_rating_numeric'] = df_temp['facility_rating'].map(facility_rating_map).fillna(1).astype(int)
    df_temp['exam_difficulty_numeric'] = df_temp['exam_difficulty'].map(exam_difficulty_map).fillna(1).astype(int)
    df_temp['gender_numeric'] = df_temp['gender'].map(gender_map).fillna(0).astype(int) if 'gender' in df_temp.columns else 0
    df_temp['internet_access_numeric'] = df_temp['internet_access'].map(internet_access_map).fillna(0).astype(int) if 'internet_access' in df_temp.columns else 0
    
  
    if 'study_method' in df_temp.columns:
        study_methods = df_temp['study_method'].unique()
        study_method_map = {method: i for i, method in enumerate(sorted(study_methods))}
        df_temp['study_method_numeric'] = df_temp['study_method'].map(study_method_map).fillna(0).astype(int)
    else:
        df_temp['study_method_numeric'] = 0
    
    if 'course' in df_temp.columns:
        courses = df_temp['course'].unique()
        course_map = {course: i for i, course in enumerate(sorted(courses))}
        df_temp['course_numeric'] = df_temp['course'].map(course_map).fillna(0).astype(int)
    else:
        df_temp['course_numeric'] = 0
    

    df_temp['study_hours_times_sleep_quality'] = df_temp['study_hours'] * df_temp['sleep_quality_numeric']
    df_temp['attendance_times_facility'] = df_temp['class_attendance'] * df_temp['facility_rating_numeric']
    df_temp['sleep_hours_times_difficulty'] = df_temp['sleep_hours'] * df_temp['exam_difficulty_numeric']
    
    df_temp['facility_x_sleepq'] = df_temp['facility_rating_numeric'] * df_temp['sleep_quality_numeric']
    df_temp['difficulty_x_facility'] = df_temp['exam_difficulty_numeric'] * df_temp['facility_rating_numeric']
    df_temp['difficulty_x_sleepq'] = df_temp['exam_difficulty_numeric'] * df_temp['sleep_quality_numeric']
    
    df_temp['high_att_low_sleep'] = ((df_temp['class_attendance'] >= 90) & (df_temp['sleep_hours'] <= 6)).astype(int)
    df_temp['high_att_high_study'] = ((df_temp['class_attendance'] >= 90) & (df_temp['study_hours'] >= 6)).astype(int)
    df_temp['low_att_high_study'] = ((df_temp['class_attendance'] <= 60) & (df_temp['study_hours'] >= 7)).astype(int)
    df_temp['ideal_sleep_flag'] = ((df_temp['sleep_hours'] >= 7) & (df_temp['sleep_hours'] <= 9)).astype(int)
    df_temp['short_sleep_flag'] = (df_temp['sleep_hours'] <= 5.5).astype(int)
    df_temp['high_study_flag'] = (df_temp['study_hours'] >= 7).astype(int)
    
    df_temp['efficiency'] = (df_temp['study_hours'] * df_temp['class_attendance']) / (df_temp['sleep_hours'] + 1)
    df_temp['efficiency2'] = (df_temp['study_hours_clip'] * df_temp['attendance_clip']) / (df_temp['sleep_hours_clip'] + 1)
    df_temp['weighted_sum'] = (0.06 * df_temp['class_attendance'] + 2.0 * df_temp['study_hours'] + 1.2 * df_temp['sleep_hours'])
    df_temp['weighted_sum_x_difficulty'] = df_temp['weighted_sum'] * (1.0 + 0.2 * df_temp['exam_difficulty_numeric'])

    df_temp['study_rank'] = sh_pos.rank(pct=True)
    df_temp['attendance_rank'] = ca_pos.rank(pct=True)
    df_temp['sleep_rank'] = sl_pos.rank(pct=True)
    df_temp['age_rank'] = ag_pos.rank(pct=True)

    df_temp['study_z'] = (sh_pos - sh_pos.mean()) / (sh_pos.std() + eps)
    df_temp['attendance_z'] = (ca_pos - ca_pos.mean()) / (ca_pos.std() + eps)
    df_temp['sleep_z'] = (sl_pos - sl_pos.mean()) / (sl_pos.std() + eps)

    df_temp['harmonic_effort'] = 3 / (
        (1 / (sh_pos + eps)) +
        (1 / (ca_pos + eps)) +
        (1 / (sl_pos + eps))
    )

    df_temp['geo_effort'] = (
        (sh_pos + 1) *
        (ca_pos + 1) *
        (sl_pos + 1)
    ) ** (1 / 3)

    df_temp['study_above_6'] = np.maximum(0, sh_pos - 6)
    df_temp['study_above_8'] = np.maximum(0, sh_pos - 8)
    df_temp['sleep_below_6'] = np.maximum(0, 6 - sl_pos)
    df_temp['attendance_below_75'] = np.maximum(0, 75 - ca_pos)

    df_temp['log_study_sleep_ratio'] = np.log1p(sh_pos) - np.log1p(sl_pos)
    df_temp['log_att_study_ratio'] = np.log1p(ca_pos) - np.log1p(sh_pos)
    

    df_temp['study_method_x_study_hours'] = df_temp['study_hours'] * df_temp['study_method_numeric']
    df_temp['course_difficulty'] = df_temp['course_numeric'] * df_temp['exam_difficulty_numeric']
    df_temp['internet_x_efficiency'] = (df_temp['study_hours'] * df_temp['class_attendance'] / 
                                        (df_temp['sleep_hours'] + 1)) * df_temp['internet_access_numeric']
    

    df_temp['sqrt_study_hours_x_attendance'] = df_temp['sqrt_study_hours'] * df_temp['class_attendance']
    df_temp['efficiency_cubed'] = ((df_temp['study_hours'] * df_temp['class_attendance'] / 
                                    (df_temp['sleep_hours'] + 1)) ** 1.5).fillna(0)
    

    df_temp['sleep_quality_x_study_z'] = df_temp['sleep_quality_numeric'] * df_temp['study_z']
    df_temp['facility_x_attendance'] = df_temp['facility_rating_numeric'] * df_temp['class_attendance']
    

    df_temp['study_hours_x_sleep_quality'] = df_temp['study_hours'] * df_temp['sleep_quality_numeric']
    df_temp['attendance_x_internet'] = df_temp['class_attendance'] * df_temp['internet_access_numeric']
    df_temp['course_x_attendance'] = df_temp['course_numeric'] * df_temp['class_attendance']
    df_temp['study_method_x_efficiency'] = df_temp['study_method_numeric'] * df_temp['efficiency']
    

    
    numeric_features = [
        'study_hours_squared', 'study_hours_cubed', 'study_hours_quartic',
        'class_attendance_squared', 'class_attendance_cubed',
        'sleep_hours_squared', 'sleep_hours_cubed',
        'age_squared', 'age_cubed',
        'log_study_hours', 'log_class_attendance', 'log_sleep_hours',
        'sqrt_study_hours', 'sqrt_class_attendance',
        'inv_sleep', 'inv_study', 'inv_attendance',
        'study_tanh', 'sleep_tanh', 'attendance_tanh',
        'study_sigmoid', 'sleep_sigmoid', 'attendance_sigmoid',
        'study_hours_times_attendance', 'study_hours_times_sleep', 'attendance_times_sleep',
        'age_times_study_hours', 'age_times_attendance', 'age_times_sleep_hours',
        'study_center_5', 'sleep_center_7', 'att_center_85',
        'study_center_sq', 'sleep_center_sq', 'att_center_sq',
        'study_hours_over_sleep', 'attendance_over_sleep',
        'attendance_over_study', 'sleep_over_study',
        'study_over_age', 'attendance_over_age',
        'study_hours_clip', 'sleep_hours_clip', 'attendance_clip',
        'sleep_gap_8', 'sleep_gap_7',
        'attendance_gap_100', 'attendance_gap_90',
        'study_gap_6', 'study_gap_8',
        'age_bin_num', 'study_bin_num', 'sleep_bin_num', 'attendance_bin_num',
        'sleep_quality_numeric', 'facility_rating_numeric', 'exam_difficulty_numeric',
        'study_hours_times_sleep_quality', 'attendance_times_facility', 'sleep_hours_times_difficulty',
        'facility_x_sleepq', 'difficulty_x_facility', 'difficulty_x_sleepq',
        'high_att_low_sleep', 'high_att_high_study', 'low_att_high_study',
        'ideal_sleep_flag', 'short_sleep_flag', 'high_study_flag',
        'efficiency', 'efficiency2',
        'weighted_sum', 'weighted_sum_x_difficulty',
        'study_rank', 'attendance_rank', 'sleep_rank', 'age_rank',
        'study_z', 'attendance_z', 'sleep_z',
        'harmonic_effort', 'geo_effort',
        'study_above_6', 'study_above_8',
        'sleep_below_6', 'attendance_below_75',
        'log_study_sleep_ratio', 'log_att_study_ratio',
        'study_method_x_study_hours', 'course_difficulty', 'internet_x_efficiency',
        'sqrt_study_hours_x_attendance', 'efficiency_cubed',
        'sleep_quality_x_study_z', 'facility_x_attendance',
        'study_hours_x_sleep_quality', 'attendance_x_internet',
        'course_x_attendance', 'study_method_x_efficiency',
    ]
    
    return df_temp[base_features + numeric_features], numeric_features

### Preprocessing and Preparing the Data

In [5]:
X_raw, numeric_cols = preprocess(train_df)
y = train_df[TARGET].reset_index(drop=True)

X_test_raw, _ = preprocess(test_df)
X_orig_raw, _ = preprocess(original_df)
y_orig = original_df[TARGET].reset_index(drop=True)


y = y.clip(0, 100)
y_orig = y_orig.clip(0, 100)

full_data = pd.concat([X_raw, X_test_raw, X_orig_raw], axis=0, ignore_index=True)


for col in numeric_cols:
    full_data[col] = full_data[col].astype(float)


X = full_data.iloc[:len(train_df)].copy()
X_test = full_data.iloc[len(train_df):len(train_df) + len(test_df)].copy()
X_original = full_data.iloc[len(train_df) + len(test_df):].copy()

print(f"Feature shapes - X: {X.shape}, X_test: {X_test.shape}, X_original: {X_original.shape}")

Feature shapes - X: (630000, 110), X_test: (270000, 110), X_original: (20000, 110)


### Preparing the Data with Categorical 

In [6]:
print(f"Shapes before encoding - X: {X.shape}, X_test: {X_test.shape}, X_original: {X_original.shape}")

cat_cols = X.select_dtypes(include=["category", "object"]).columns.tolist()
print(f"Categorical columns to encode: {cat_cols}")

cat_transformer = CategoryMeanTransformer(cat_cols=cat_cols)
cat_transformer.fit(X, y)

X = cat_transformer.transform(X)
X_test = cat_transformer.transform(X_test)
X_original = cat_transformer.transform(X_original)

X = X.astype(np.float32)
X_test = X_test.astype(np.float32)
X_original = X_original.astype(np.float32)
y_float = y.values.astype(np.float32)

print(f"Final shapes - X: {X.shape}, X_test: {X_test.shape}, X_original: {X_original.shape}")

Shapes before encoding - X: (630000, 110), X_test: (270000, 110), X_original: (20000, 110)
Categorical columns to encode: ['gender', 'course', 'internet_access', 'sleep_quality', 'study_method', 'facility_rating', 'exam_difficulty']
Final shapes - X: (630000, 110), X_test: (270000, 110), X_original: (20000, 110)


### Training 

In [13]:
print("\nTraining Ridge Regression on full training data...")


scaler_ridge = StandardScaler()
X_ridge_scaled = scaler_ridge.fit_transform(X)
X_test_ridge_scaled = scaler_ridge.transform(X_test)




alphas = np.logspace(-2, 5, 100)
ridge_cv = RidgeCV(alphas=alphas, cv=5, scoring='neg_mean_squared_error')
ridge_cv.fit(X_ridge_scaled, y_float)




ridge_oof = ridge_cv.predict(X_ridge_scaled)
ridge_test = ridge_cv.predict(X_test_ridge_scaled)


ridge_oof = np.clip(ridge_oof, 0, 100)
ridge_test = np.clip(ridge_test, 0, 100)

ridge_rmse = np.sqrt(mean_squared_error(y_float, ridge_oof))
ridge_mae = mean_absolute_error(y_float, ridge_oof)

print(f"\nRidge Regression Results:")
print(f"  OOF RMSE: {ridge_rmse:.5f}")
print(f"  OOF MAE:  {ridge_mae:.5f}")


residuals = y_float - ridge_oof

print("\nTraining XGBoost to predict residuals using 10-fold CV...\n")


residual_bins = pd.qcut(residuals, q=10, labels=False, duplicates='drop').astype(int)

dtest_residual = xgb.DMatrix(X_test)


xgb_residual_params = {
    "objective": "reg:squarederror",
    "learning_rate": 0.05,
    "max_depth": 8,
    "subsample": 0.85,
    "colsample_bytree": 0.75,
    "colsample_bynode": 0.8,
    "min_child_weight": 2,
    "gamma": 0.05,
    "lambda": 0.8,
    "alpha": 0.02,
    "eval_metric": "rmse",
    "tree_method": "hist",
    "verbosity": 0,
    "seed": 42,
}

oof_residual_pred = np.zeros(len(X), dtype=np.float32)
test_residual_preds = []
residual_fold_metrics = []

kf = StratifiedKFold(n_splits=10, shuffle=True, random_state=42)

for fold, (train_idx, val_idx) in enumerate(kf.split(X, residual_bins), start=1):
    print(f"Fold {fold}/10 - Training XGBoost on residuals...")
    
    X_train, X_val = X.iloc[train_idx], X.iloc[val_idx]
    residual_train = residuals[train_idx]
    residual_val = residuals[val_idx]
    
    dtrain = xgb.DMatrix(X_train, label=residual_train)
    dval = xgb.DMatrix(X_val, label=residual_val)
    evals = [(dtrain, "train"), (dval, "valid")]
    
    xgb_residual_model = xgb.train(
        params=xgb_residual_params,
        dtrain=dtrain,
        num_boost_round=3000,
        evals=evals,
        early_stopping_rounds=50,
        verbose_eval=False,
    )
    

    residual_val_pred = xgb_residual_model.predict(dval)
    oof_residual_pred[val_idx] = residual_val_pred
    
    residual_test_pred = xgb_residual_model.predict(dtest_residual)
    test_residual_preds.append(residual_test_pred)
    
    residual_rmse = np.sqrt(mean_squared_error(residual_val, residual_val_pred))
    residual_mae = mean_absolute_error(residual_val, residual_val_pred)
    
    print(f"  RMSE (residuals): {residual_rmse:.5f} | MAE: {residual_mae:.5f}")
    print(f"  Best iteration: {xgb_residual_model.best_iteration}")
    
    residual_fold_metrics.append({
        "fold": fold,
        "rmse": residual_rmse,
        "mae": residual_mae,
    })


test_residual_preds = np.mean(test_residual_preds, axis=0)


xgb_residual_rmse = np.sqrt(mean_squared_error(residuals, oof_residual_pred))
xgb_residual_mae = mean_absolute_error(residuals, oof_residual_pred)

print(f"\n{'='*70}")
print("RESIDUAL MODEL PERFORMANCE")
print(f"{'='*70}")
print(f"XGBoost Residual OOF RMSE: {xgb_residual_rmse:.5f}")
print(f"XGBoost Residual OOF MAE:  {xgb_residual_mae:.5f}")


Training Ridge Regression on full training data...

Ridge Regression Results:
  OOF RMSE: 8.93853
  OOF MAE:  7.14107

Training XGBoost to predict residuals using 10-fold CV...

Fold 1/10 - Training XGBoost on residuals...
  RMSE (residuals): 8.70950 | MAE: 6.94075
  Best iteration: 657
Fold 2/10 - Training XGBoost on residuals...
  RMSE (residuals): 8.71449 | MAE: 6.94469
  Best iteration: 583
Fold 3/10 - Training XGBoost on residuals...
  RMSE (residuals): 8.70459 | MAE: 6.93785
  Best iteration: 578
Fold 4/10 - Training XGBoost on residuals...
  RMSE (residuals): 8.70577 | MAE: 6.93693
  Best iteration: 650
Fold 5/10 - Training XGBoost on residuals...
  RMSE (residuals): 8.70871 | MAE: 6.93639
  Best iteration: 846
Fold 6/10 - Training XGBoost on residuals...
  RMSE (residuals): 8.74528 | MAE: 6.96089
  Best iteration: 748
Fold 7/10 - Training XGBoost on residuals...
  RMSE (residuals): 8.72031 | MAE: 6.93952
  Best iteration: 721
Fold 8/10 - Training XGBoost on residuals...
  RMSE

In [14]:
print("\n" + "="*70)
print("STEP 4: STACKING - COMBINE RIDGE + RESIDUAL PREDICTIONS")
print("="*70)



best_stacking_rmse = float('inf')
best_correction_weight = None


correction_weights = [0.0, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1.0]

for corr_w in correction_weights:

    stacking_oof = ridge_oof + corr_w * oof_residual_pred
    stacking_oof = np.clip(stacking_oof, 0, 100)
    
    stacking_rmse = np.sqrt(mean_squared_error(y_float, stacking_oof))
    
    print(f"Correction weight: {corr_w:.1f} → Stacking RMSE: {stacking_rmse:.5f}")
    
    if stacking_rmse < best_stacking_rmse:
        best_stacking_rmse = stacking_rmse
        best_correction_weight = corr_w

print(f"\n Best correction weight: {best_correction_weight:.2f}")
print(f"   Stacking RMSE: {best_stacking_rmse:.5f}\n")


print("="*70)
print("FINAL PREDICTIONS")
print("="*70)


final_oof = ridge_oof + best_correction_weight * oof_residual_pred
final_oof = np.clip(final_oof, 0, 100)


final_test = ridge_test + best_correction_weight * test_residual_preds
final_test = np.clip(final_test, 0, 100)

final_stacking_rmse = np.sqrt(mean_squared_error(y_float, final_oof))
final_stacking_mae = mean_absolute_error(y_float, final_oof)


print("\n" + "="*70)
print("PERFORMANCE SUMMARY")
print("="*70)

print(f"\n1. Base Model (Ridge):")
print(f"   OOF RMSE: {ridge_rmse:.5f}")
print(f"   OOF MAE:  {ridge_mae:.5f}")

print(f"\n2. Meta Model (XGBoost on Residuals):")
print(f"   Residual RMSE: {xgb_residual_rmse:.5f}")
print(f"   Residual MAE:  {xgb_residual_mae:.5f}")

print(f"\n3. Stacking Ensemble (Ridge + {best_correction_weight:.2f}×XGB Residual):")
print(f"   OOF RMSE: {final_stacking_rmse:.5f}")
print(f"   OOF MAE:  {final_stacking_mae:.5f}")

print(f"\n4. Improvements:")
print(f"   vs Ridge:     {ridge_rmse - final_stacking_rmse:.5f} RMSE improvement")
print(f"   vs XGB+LGB ensemble (if available): {8.72268 - final_stacking_rmse:.5f} improvement")

print(f"\n5. Prediction Ranges:")
print(f"   Ridge:    [{ridge_oof.min():.2f}, {ridge_oof.max():.2f}]")
print(f"   Stacking: [{final_oof.min():.2f}, {final_oof.max():.2f}]")



STEP 4: STACKING - COMBINE RIDGE + RESIDUAL PREDICTIONS
Correction weight: 0.0 → Stacking RMSE: 8.93853
Correction weight: 0.1 → Stacking RMSE: 8.89862
Correction weight: 0.2 → Stacking RMSE: 8.86261
Correction weight: 0.3 → Stacking RMSE: 8.83056
Correction weight: 0.4 → Stacking RMSE: 8.80251
Correction weight: 0.5 → Stacking RMSE: 8.77850
Correction weight: 0.6 → Stacking RMSE: 8.75855
Correction weight: 0.7 → Stacking RMSE: 8.74271
Correction weight: 0.8 → Stacking RMSE: 8.73098
Correction weight: 0.9 → Stacking RMSE: 8.72338
Correction weight: 1.0 → Stacking RMSE: 8.71993

 Best correction weight: 1.00
   Stacking RMSE: 8.71993

FINAL PREDICTIONS

PERFORMANCE SUMMARY

1. Base Model (Ridge):
   OOF RMSE: 8.93853
   OOF MAE:  7.14107

2. Meta Model (XGBoost on Residuals):
   Residual RMSE: 8.72000
   Residual MAE:  6.94558

3. Stacking Ensemble (Ridge + 1.00×XGB Residual):
   OOF RMSE: 8.71993
   OOF MAE:  6.94524

4. Improvements:
   vs Ridge:     0.21861 RMSE improvement
   vs XG

### Submission

In [15]:
print("\n" + "=" * 50)
print("SAVING RESULTS")
print("=" * 50)

oof_df = pd.DataFrame({
    "id": train_df[ID_COL], 
    TARGET: final_oof  
})

oof_df.to_csv("oof_ridge_xgb_stacking.csv", index=False)

submission_df[TARGET] = final_test
submission_df.to_csv("submission.csv", index=False)


SAVING RESULTS


In [16]:
submission_df.head()

Unnamed: 0,id,exam_score
0,630000,71.309814
1,630001,70.848984
2,630002,87.977707
3,630003,55.544342
4,630004,47.045559


Acknowledgement: [https://www.kaggle.com/code/mdevian/ps-s6e1-clean-strong-baseline-ridge-xgb-fe](https://www.kaggle.com/code/mdevian/ps-s6e1-clean-strong-baseline-ridge-xgb-fe)