<a href="https://www.kaggle.com/code/samithsachidanandan/ps-s6e1-ridge-xgb-fe?scriptVersionId=289994921" target="_blank"><img align="left" alt="Kaggle" title="Open in Kaggle" src="https://kaggle.com/static/images/open-in-kaggle.svg"></a>

Acknowledgement: 

[https://www.kaggle.com/code/mdevian/ps-s6e1-clean-strong-baseline-ridge-xgb-fe](https://www.kaggle.com/code/mdevian/ps-s6e1-clean-strong-baseline-ridge-xgb-fe)
[https://www.kaggle.com/code/act18l/s6e1-single-xgb-add-categorymean](https://www.kaggle.com/code/act18l/s6e1-single-xgb-add-categorymean)

### Importing Libraries and Loading the Data 

In [1]:
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import KFold
import xgboost as xgb
import pandas as pd
import numpy as np

from sklearn.linear_model import RidgeCV
from sklearn.metrics import root_mean_squared_error,mean_absolute_error
from sklearn.preprocessing import TargetEncoder

from sklearn.preprocessing import StandardScaler, RobustScaler
from sklearn.model_selection import StratifiedKFold

from sklearn.base import BaseEstimator, TransformerMixin
import lightgbm as lgb
from sklearn.linear_model import Ridge, ElasticNetCV
from sklearn.preprocessing import StandardScaler



import warnings
# Silence library warnings so the training logs below stay readable.
warnings.filterwarnings("ignore")

# Seed NumPy's global random state.
np.random.seed(42)

# Column roles shared by every later cell.
TARGET = "exam_score"
ID_COL = "id"

# Input CSV locations (competition train/test plus the original dataset).
train_file = "/kaggle/input/playground-series-s6e1/train.csv"
test_file = "/kaggle/input/playground-series-s6e1/test.csv"
original_file = "/kaggle/input/exam-score-prediction-dataset/Exam_Score_Prediction.csv"

train_df, test_df, original_df = [
    pd.read_csv(csv_path) for csv_path in (train_file, test_file, original_file)
]

submission_df = pd.read_csv("/kaggle/input/playground-series-s6e1/sample_submission.csv")

# Cell output: (rows, columns) of each loaded frame.
tuple(frame.shape for frame in (train_df, test_df, original_df))

  if entities is not ():


((630000, 13), (270000, 12), (20000, 13))

### Base features

In [2]:
# Raw model inputs: every training column except the target and the row id.
base_features = [name for name in train_df.columns if name not in (TARGET, ID_COL)]

# Columns stored with object dtype are treated as categorical.
CATS = list(train_df.select_dtypes("object").columns)
print("CATS:", CATS)

CATS: ['gender', 'course', 'internet_access', 'sleep_quality', 'study_method', 'facility_rating', 'exam_difficulty']


### Feature Engineering

In [3]:


class CategoryMeanTransformer(BaseEstimator, TransformerMixin):
    """Ordinal-encode categorical columns by the rank of their mean target.

    During ``fit`` the categories of each column are sorted by the mean of
    ``y`` within the category and replaced by their position in that ordering
    (0 = lowest mean target). ``transform`` applies the learned mapping and
    returns the encoded columns as float32; values that were not present
    during ``fit`` have no mapping entry and therefore become NaN.

    Parameters
    ----------
    cat_cols : list of str or None
        Columns to encode. When None, the object/category columns of the
        frame passed to ``fit`` are used.
    """

    def __init__(self, cat_cols=None):
        self.cat_cols = cat_cols
        # Kept in __init__ so that transform() before fit() is a no-op rather
        # than an AttributeError (original behaviour).
        self.mappings_ = {}

    def fit(self, X, y):
        """Learn, per column, the category -> rank-of-mean-target mapping.

        ``X`` and ``y`` are paired by position, not by index label, so a
        Series ``y`` whose index differs from ``X``'s is handled correctly.
        """
        if self.cat_cols is None:
            # Resolved lazily; stored on the instance as before so callers can
            # inspect which columns were picked up.
            self.cat_cols = X.select_dtypes(include=['category', 'object']).columns.tolist()

        # Strip any index from y: building a DataFrame from two Series aligns
        # on index labels, which silently mismatches rows when X and y are
        # indexed differently.
        target = np.asarray(y).ravel()

        self.mappings_ = {}
        for col in self.cat_cols:
            df_temp = pd.DataFrame({col: X[col].reset_index(drop=True), 'y': target})
            # dropna=False keeps missing values as their own group.
            group_means = df_temp.groupby(col, dropna=False)['y'].mean()
            sorted_categories = group_means.sort_values().index
            self.mappings_[col] = {cat: i for i, cat in enumerate(sorted_categories)}
        return self

    def transform(self, X, y=None):
        """Return a copy of ``X`` with the fitted columns replaced by their ranks."""
        X = X.copy()
        for col, mapping in self.mappings_.items():
            if col in X.columns:
                X[col] = X[col].map(mapping).astype(np.float32)
        return X

In [4]:
def preprocess(df):
    """Build the engineered feature matrix for one raw data frame.

    Works on a copy of ``df`` and derives features from ``study_hours``,
    ``class_attendance``, ``sleep_hours``, ``age`` and the categorical
    columns: powers, log/sqrt/inverse transforms, tanh/sigmoid squashing,
    pairwise products and ratios, clipped values, distances to fixed
    thresholds, bins, integer codes for the categoricals, boolean flags,
    percentile ranks, z-scores and a few composite "effort" scores.

    Notes
    -----
    * Ranks, z-scores and the ``study_method`` / ``course`` integer codes are
      computed from the frame passed in, so they are relative to that frame.
    * The returned frame selects the module-level ``base_features`` columns,
      which must therefore be defined and present in ``df``.

    Parameters
    ----------
    df : pandas.DataFrame
        Raw rows containing the base feature columns.

    Returns
    -------
    tuple
        ``(features, numeric_features)`` where ``features`` is
        ``df_temp[base_features + numeric_features]`` and ``numeric_features``
        is the list of engineered column names.
    """

    df_temp = df.copy()
    eps = 1e-5  # guards divisions against zero denominators

    # Non-negative versions of the numeric inputs, used wherever the transform
    # is only defined for values >= 0 (log, sqrt, ...).
    sh_pos = df_temp['study_hours'].clip(lower=0)
    ca_pos = df_temp['class_attendance'].clip(lower=0)
    sl_pos = df_temp['sleep_hours'].clip(lower=0)
    ag_pos = df_temp['age'].clip(lower=0)

    # --- Polynomial terms ---
    df_temp['study_hours_squared'] = df_temp['study_hours'] ** 2
    df_temp['study_hours_cubed'] = df_temp['study_hours'] ** 3
    df_temp['study_hours_quartic'] = df_temp['study_hours'] ** 4
    df_temp['class_attendance_squared'] = df_temp['class_attendance'] ** 2
    df_temp['class_attendance_cubed'] = df_temp['class_attendance'] ** 3
    df_temp['sleep_hours_squared'] = df_temp['sleep_hours'] ** 2
    df_temp['sleep_hours_cubed'] = df_temp['sleep_hours'] ** 3
    df_temp['age_squared'] = df_temp['age'] ** 2
    df_temp['age_cubed'] = df_temp['age'] ** 3

    # --- Log / sqrt transforms ---
    df_temp['log_study_hours'] = np.log1p(sh_pos)
    df_temp['log_class_attendance'] = np.log1p(ca_pos)
    df_temp['log_sleep_hours'] = np.log1p(sl_pos)
    df_temp['sqrt_study_hours'] = np.sqrt(sh_pos)
    df_temp['sqrt_class_attendance'] = np.sqrt(ca_pos)

    # --- Inverse transforms (the +1 keeps the denominator >= 1) ---
    df_temp['inv_sleep'] = 1.0 / (sl_pos + 1.0)
    df_temp['inv_study'] = 1.0 / (sh_pos + 1.0)
    df_temp['inv_attendance'] = 1.0 / (ca_pos + 1.0)

    # --- Saturating transforms ---
    df_temp['study_tanh'] = np.tanh(df_temp['study_hours'] / 10.0)
    df_temp['sleep_tanh'] = np.tanh(df_temp['sleep_hours'] / 10.0)
    df_temp['attendance_tanh'] = np.tanh(df_temp['class_attendance'] / 100.0)

    df_temp['study_sigmoid'] = 1.0 / (1.0 + np.exp(-(df_temp['study_hours'] - 5.0)))
    df_temp['sleep_sigmoid'] = 1.0 / (1.0 + np.exp(-(df_temp['sleep_hours'] - 7.0)))
    df_temp['attendance_sigmoid'] = 1.0 / (1.0 + np.exp(-(df_temp['class_attendance'] - 85.0) / 8.0))

    # --- Pairwise products ---
    df_temp['study_hours_times_attendance'] = df_temp['study_hours'] * df_temp['class_attendance']
    df_temp['study_hours_times_sleep'] = df_temp['study_hours'] * df_temp['sleep_hours']
    df_temp['attendance_times_sleep'] = df_temp['class_attendance'] * df_temp['sleep_hours']
    df_temp['age_times_study_hours'] = df_temp['age'] * df_temp['study_hours']
    df_temp['age_times_attendance'] = df_temp['age'] * df_temp['class_attendance']
    df_temp['age_times_sleep_hours'] = df_temp['age'] * df_temp['sleep_hours']

    # --- Centered values and their squares ---
    df_temp['study_center_5'] = df_temp['study_hours'] - 5.0
    df_temp['sleep_center_7'] = df_temp['sleep_hours'] - 7.0
    df_temp['att_center_85'] = df_temp['class_attendance'] - 85.0
    df_temp['study_center_sq'] = df_temp['study_center_5'] ** 2
    df_temp['sleep_center_sq'] = df_temp['sleep_center_7'] ** 2
    df_temp['att_center_sq'] = df_temp['att_center_85'] ** 2

    # --- Ratios ---
    df_temp['study_hours_over_sleep'] = df_temp['study_hours'] / (df_temp['sleep_hours'] + eps)
    df_temp['attendance_over_sleep'] = df_temp['class_attendance'] / (df_temp['sleep_hours'] + eps)
    df_temp['attendance_over_study'] = df_temp['class_attendance'] / (df_temp['study_hours'] + eps)
    df_temp['sleep_over_study'] = df_temp['sleep_hours'] / (df_temp['study_hours'] + eps)
    df_temp['study_over_age'] = df_temp['study_hours'] / (df_temp['age'] + eps)
    df_temp['attendance_over_age'] = df_temp['class_attendance'] / (df_temp['age'] + eps)

    # --- Clipped values ---
    df_temp['study_hours_clip'] = df_temp['study_hours'].clip(0, 12)
    df_temp['sleep_hours_clip'] = df_temp['sleep_hours'].clip(0, 12)
    df_temp['attendance_clip'] = df_temp['class_attendance'].clip(0, 100)

    # --- Absolute distance to fixed reference values ---
    df_temp['sleep_gap_8'] = (df_temp['sleep_hours'] - 8.0).abs()
    df_temp['sleep_gap_7'] = (df_temp['sleep_hours'] - 7.0).abs()
    df_temp['attendance_gap_100'] = (df_temp['class_attendance'] - 100.0).abs()
    df_temp['attendance_gap_90'] = (df_temp['class_attendance'] - 90.0).abs()
    df_temp['study_gap_6'] = (df_temp['study_hours'] - 6.0).abs()
    df_temp['study_gap_8'] = (df_temp['study_hours'] - 8.0).abs()

    # --- Bins (values outside the bin edges become NaN) ---
    df_temp['age_bin_num'] = pd.cut(df_temp['age'], bins=[0, 17, 19, 21, 23, 100], labels=[0, 1, 2, 3, 4]).astype(float)
    df_temp['study_bin_num'] = pd.cut(df_temp['study_hours'], bins=[-1, 2, 4, 6, 8, 100], labels=[0, 1, 2, 3, 4]).astype(float)
    df_temp['sleep_bin_num'] = pd.cut(df_temp['sleep_hours'], bins=[-1, 5, 6, 7, 8, 100], labels=[0, 1, 2, 3, 4]).astype(float)
    df_temp['attendance_bin_num'] = pd.cut(df_temp['class_attendance'], bins=[-1, 60, 75, 85, 95, 101], labels=[0, 1, 2, 3, 4]).astype(float)

    # --- Integer codes for the categorical columns ---
    sleep_quality_map = {'poor': 0, 'average': 1, 'good': 2}
    facility_rating_map = {'low': 0, 'medium': 1, 'high': 2}
    exam_difficulty_map = {'easy': 0, 'moderate': 1, 'hard': 2}
    gender_map = {'male': 0, 'female': 1}
    internet_access_map = {'no': 0, 'yes': 1}

    # Unmapped or missing values fall back to the middle level (1) for the
    # three-level ordinals and to 0 for the binary columns.
    df_temp['sleep_quality_numeric'] = df_temp['sleep_quality'].map(sleep_quality_map).fillna(1).astype(int)
    df_temp['facility_rating_numeric'] = df_temp['facility_rating'].map(facility_rating_map).fillna(1).astype(int)
    df_temp['exam_difficulty_numeric'] = df_temp['exam_difficulty'].map(exam_difficulty_map).fillna(1).astype(int)
    df_temp['gender_numeric'] = df_temp['gender'].map(gender_map).fillna(0).astype(int) if 'gender' in df_temp.columns else 0
    df_temp['internet_access_numeric'] = df_temp['internet_access'].map(internet_access_map).fillna(0).astype(int) if 'internet_access' in df_temp.columns else 0

    if 'study_method' in df_temp.columns:
        # Drop NaN before sorting: sorted() cannot compare NaN (float) with
        # strings, and missing values are meant to fall through to fillna(0).
        study_methods = df_temp['study_method'].dropna().unique()
        study_method_map = {method: i for i, method in enumerate(sorted(study_methods))}
        df_temp['study_method_numeric'] = df_temp['study_method'].map(study_method_map).fillna(0).astype(int)
    else:
        df_temp['study_method_numeric'] = 0

    if 'course' in df_temp.columns:
        # Same NaN guard as for study_method.
        courses = df_temp['course'].dropna().unique()
        course_map = {course: i for i, course in enumerate(sorted(courses))}
        df_temp['course_numeric'] = df_temp['course'].map(course_map).fillna(0).astype(int)
    else:
        df_temp['course_numeric'] = 0

    # --- Numeric x categorical-code interactions ---
    df_temp['study_hours_times_sleep_quality'] = df_temp['study_hours'] * df_temp['sleep_quality_numeric']
    df_temp['attendance_times_facility'] = df_temp['class_attendance'] * df_temp['facility_rating_numeric']
    df_temp['sleep_hours_times_difficulty'] = df_temp['sleep_hours'] * df_temp['exam_difficulty_numeric']

    df_temp['facility_x_sleepq'] = df_temp['facility_rating_numeric'] * df_temp['sleep_quality_numeric']
    df_temp['difficulty_x_facility'] = df_temp['exam_difficulty_numeric'] * df_temp['facility_rating_numeric']
    df_temp['difficulty_x_sleepq'] = df_temp['exam_difficulty_numeric'] * df_temp['sleep_quality_numeric']

    # --- Threshold flags ---
    df_temp['high_att_low_sleep'] = ((df_temp['class_attendance'] >= 90) & (df_temp['sleep_hours'] <= 6)).astype(int)
    df_temp['high_att_high_study'] = ((df_temp['class_attendance'] >= 90) & (df_temp['study_hours'] >= 6)).astype(int)
    df_temp['low_att_high_study'] = ((df_temp['class_attendance'] <= 60) & (df_temp['study_hours'] >= 7)).astype(int)
    df_temp['ideal_sleep_flag'] = ((df_temp['sleep_hours'] >= 7) & (df_temp['sleep_hours'] <= 9)).astype(int)
    df_temp['short_sleep_flag'] = (df_temp['sleep_hours'] <= 5.5).astype(int)
    df_temp['high_study_flag'] = (df_temp['study_hours'] >= 7).astype(int)

    # --- Composite scores ---
    df_temp['efficiency'] = (df_temp['study_hours'] * df_temp['class_attendance']) / (df_temp['sleep_hours'] + 1)
    df_temp['efficiency2'] = (df_temp['study_hours_clip'] * df_temp['attendance_clip']) / (df_temp['sleep_hours_clip'] + 1)
    df_temp['weighted_sum'] = (0.06 * df_temp['class_attendance'] + 2.0 * df_temp['study_hours'] + 1.2 * df_temp['sleep_hours'])
    df_temp['weighted_sum_x_difficulty'] = df_temp['weighted_sum'] * (1.0 + 0.2 * df_temp['exam_difficulty_numeric'])

    # --- Percentile ranks and z-scores (relative to the frame passed in) ---
    df_temp['study_rank'] = sh_pos.rank(pct=True)
    df_temp['attendance_rank'] = ca_pos.rank(pct=True)
    df_temp['sleep_rank'] = sl_pos.rank(pct=True)
    df_temp['age_rank'] = ag_pos.rank(pct=True)

    df_temp['study_z'] = (sh_pos - sh_pos.mean()) / (sh_pos.std() + eps)
    df_temp['attendance_z'] = (ca_pos - ca_pos.mean()) / (ca_pos.std() + eps)
    df_temp['sleep_z'] = (sl_pos - sl_pos.mean()) / (sl_pos.std() + eps)

    # Harmonic and geometric means of the three effort inputs.
    df_temp['harmonic_effort'] = 3 / (
        (1 / (sh_pos + eps)) +
        (1 / (ca_pos + eps)) +
        (1 / (sl_pos + eps))
    )

    df_temp['geo_effort'] = (
        (sh_pos + 1) *
        (ca_pos + 1) *
        (sl_pos + 1)
    ) ** (1 / 3)

    # --- Hinge features ---
    df_temp['study_above_6'] = np.maximum(0, sh_pos - 6)
    df_temp['study_above_8'] = np.maximum(0, sh_pos - 8)
    df_temp['sleep_below_6'] = np.maximum(0, 6 - sl_pos)
    df_temp['attendance_below_75'] = np.maximum(0, 75 - ca_pos)

    df_temp['log_study_sleep_ratio'] = np.log1p(sh_pos) - np.log1p(sl_pos)
    df_temp['log_att_study_ratio'] = np.log1p(ca_pos) - np.log1p(sh_pos)

    # --- Further interactions ---
    df_temp['study_method_x_study_hours'] = df_temp['study_hours'] * df_temp['study_method_numeric']
    df_temp['course_difficulty'] = df_temp['course_numeric'] * df_temp['exam_difficulty_numeric']
    df_temp['internet_x_efficiency'] = (df_temp['study_hours'] * df_temp['class_attendance'] /
                                        (df_temp['sleep_hours'] + 1)) * df_temp['internet_access_numeric']

    df_temp['sqrt_study_hours_x_attendance'] = df_temp['sqrt_study_hours'] * df_temp['class_attendance']
    # Despite the name, the exponent is 1.5; NaN results are replaced by 0.
    df_temp['efficiency_cubed'] = ((df_temp['study_hours'] * df_temp['class_attendance'] /
                                    (df_temp['sleep_hours'] + 1)) ** 1.5).fillna(0)

    df_temp['sleep_quality_x_study_z'] = df_temp['sleep_quality_numeric'] * df_temp['study_z']
    # Same formula as 'attendance_times_facility' above.
    df_temp['facility_x_attendance'] = df_temp['facility_rating_numeric'] * df_temp['class_attendance']

    # Same formula as 'study_hours_times_sleep_quality' above.
    df_temp['study_hours_x_sleep_quality'] = df_temp['study_hours'] * df_temp['sleep_quality_numeric']
    df_temp['attendance_x_internet'] = df_temp['class_attendance'] * df_temp['internet_access_numeric']
    df_temp['course_x_attendance'] = df_temp['course_numeric'] * df_temp['class_attendance']
    df_temp['study_method_x_efficiency'] = df_temp['study_method_numeric'] * df_temp['efficiency']

    # Engineered columns returned to the caller, in output order.
    numeric_features = [
        'study_hours_squared', 'study_hours_cubed', 'study_hours_quartic',
        'class_attendance_squared', 'class_attendance_cubed',
        'sleep_hours_squared', 'sleep_hours_cubed',
        'age_squared', 'age_cubed',
        'log_study_hours', 'log_class_attendance', 'log_sleep_hours',
        'sqrt_study_hours', 'sqrt_class_attendance',
        'inv_sleep', 'inv_study', 'inv_attendance',
        'study_tanh', 'sleep_tanh', 'attendance_tanh',
        'study_sigmoid', 'sleep_sigmoid', 'attendance_sigmoid',
        'study_hours_times_attendance', 'study_hours_times_sleep', 'attendance_times_sleep',
        'age_times_study_hours', 'age_times_attendance', 'age_times_sleep_hours',
        'study_center_5', 'sleep_center_7', 'att_center_85',
        'study_center_sq', 'sleep_center_sq', 'att_center_sq',
        'study_hours_over_sleep', 'attendance_over_sleep',
        'attendance_over_study', 'sleep_over_study',
        'study_over_age', 'attendance_over_age',
        'study_hours_clip', 'sleep_hours_clip', 'attendance_clip',
        'sleep_gap_8', 'sleep_gap_7',
        'attendance_gap_100', 'attendance_gap_90',
        'study_gap_6', 'study_gap_8',
        'age_bin_num', 'study_bin_num', 'sleep_bin_num', 'attendance_bin_num',
        'sleep_quality_numeric', 'facility_rating_numeric', 'exam_difficulty_numeric',
        'study_hours_times_sleep_quality', 'attendance_times_facility', 'sleep_hours_times_difficulty',
        'facility_x_sleepq', 'difficulty_x_facility', 'difficulty_x_sleepq',
        'high_att_low_sleep', 'high_att_high_study', 'low_att_high_study',
        'ideal_sleep_flag', 'short_sleep_flag', 'high_study_flag',
        'efficiency', 'efficiency2',
        'weighted_sum', 'weighted_sum_x_difficulty',
        'study_rank', 'attendance_rank', 'sleep_rank', 'age_rank',
        'study_z', 'attendance_z', 'sleep_z',
        'harmonic_effort', 'geo_effort',
        'study_above_6', 'study_above_8',
        'sleep_below_6', 'attendance_below_75',
        'log_study_sleep_ratio', 'log_att_study_ratio',
        'study_method_x_study_hours', 'course_difficulty', 'internet_x_efficiency',
        'sqrt_study_hours_x_attendance', 'efficiency_cubed',
        'sleep_quality_x_study_z', 'facility_x_attendance',
        'study_hours_x_sleep_quality', 'attendance_x_internet',
        'course_x_attendance', 'study_method_x_efficiency',
    ]

    return df_temp[base_features + numeric_features], numeric_features

### Preprocessing and Preparing the Data

In [5]:
# Engineer features for each data source; the engineered column list is the
# same for all three, so it is kept only from the training call.
X_raw, numeric_cols = preprocess(train_df)
X_test_raw, _ = preprocess(test_df)
X_orig_raw, _ = preprocess(original_df)

# Targets, re-indexed from 0 and limited to the [0, 100] range.
y = train_df[TARGET].reset_index(drop=True).clip(0, 100)
y_orig = original_df[TARGET].reset_index(drop=True).clip(0, 100)

# Stack the three frames so the engineered columns get one common float dtype,
# then cut the stack back into its parts by row position.
full_data = pd.concat(
    [X_raw, X_test_raw, X_orig_raw], axis=0, ignore_index=True
).astype({name: float for name in numeric_cols})

train_end = len(train_df)
test_end = train_end + len(test_df)

X = full_data.iloc[:train_end].copy()
X_test = full_data.iloc[train_end:test_end].copy()
X_original = full_data.iloc[test_end:].copy()

print(f"Feature shapes - X: {X.shape}, X_test: {X_test.shape}, X_original: {X_original.shape}")

Feature shapes - X: (630000, 110), X_test: (270000, 110), X_original: (20000, 110)


### Ridge Regression and ElasticNet Comparison

In [None]:
print("\n" + "="*50)
print("IMPROVED RIDGE REGRESSION WITH OPTIMIZATION")
print("="*50)

FOLDS = 10
# Stratify the folds on target quantile bins.
y_bins = pd.qcut(y, q=10, labels=False, duplicates='drop').astype(int)
kf = StratifiedKFold(n_splits=FOLDS, shuffle=True, random_state=1003)

scalers_ridge = []
N_SAMPLES_TRAIN = X.shape[0]
N_SAMPLES_TEST = X_test.shape[0]
N_SAMPLES_ORIG = X_original.shape[0]

oof_pred_lr = np.zeros(N_SAMPLES_TRAIN)
test_preds_lr = np.zeros((N_SAMPLES_TEST, FOLDS))
orig_preds_lr = np.zeros(N_SAMPLES_ORIG)

fold_rmse_lr = []
lr_models = []
target_encoders = []

# Candidate regularisation strengths: three log-spaced segments from 1e-3 to 1e3.
alphas = np.concatenate([
    np.logspace(-3, -1, 20),
    np.logspace(-1, 1, 30),
    np.logspace(1, 3, 20),
])


def _prepare_linear_fold(train_index, val_index):
    """Target-encode and standardise one CV fold for the linear models.

    Reads the module-level ``X``, ``y``, ``X_original``, ``y_orig``,
    ``X_test`` and ``CATS``. The training part of the fold is the selected
    competition rows followed by ALL rows of the original dataset, so the
    last ``len(X_original)`` training rows are always the original data.
    Encoder and scaler are fitted on that training part only and then
    applied to the validation and test rows.

    Returns
    -------
    tuple
        ``(X_train_encoded, X_train_scaled, X_val_scaled, X_test_scaled,
        y_train_values, target_encoder, scaler)``; the scaled objects are
        DataFrames that keep the feature names.
    """
    X_val = X.iloc[val_index]
    X_train_combined = pd.concat([X.iloc[train_index], X_original], axis=0)
    y_train_combined = pd.concat([y.iloc[train_index], y_orig], axis=0)

    target_encoder = TargetEncoder(smooth='auto', target_type='continuous')

    X_train_encoded = X_train_combined.copy()
    X_val_encoded = X_val.copy()
    X_test_encoded = X_test.copy()

    X_train_encoded[CATS] = target_encoder.fit_transform(X_train_combined[CATS], y_train_combined)
    X_val_encoded[CATS] = target_encoder.transform(X_val[CATS])
    X_test_encoded[CATS] = target_encoder.transform(X_test[CATS])

    scaler = StandardScaler()

    X_train_scaled = X_train_encoded.copy()
    X_val_scaled = X_val_encoded.copy()
    X_test_scaled = X_test_encoded.copy()

    # Slice-assignment keeps the DataFrame (and its column names) around the
    # scaled values.
    X_train_scaled[:] = scaler.fit_transform(X_train_encoded)
    X_val_scaled[:] = scaler.transform(X_val_encoded)
    X_test_scaled[:] = scaler.transform(X_test_encoded)

    y_train_values = y_train_combined.to_numpy().ravel()
    return (X_train_encoded, X_train_scaled, X_val_scaled, X_test_scaled,
            y_train_values, target_encoder, scaler)


for fold, (train_index, val_index) in enumerate(kf.split(X, y_bins), start=1):
    print(f"\nTraining fold {fold} (Ridge)...")

    # X_train_encoded stays a module-level name on purpose: the feature
    # selection cell reads its columns.
    (X_train_encoded, X_train_scaled, X_val_scaled, X_test_scaled,
     y_train_values, target_encoder, scaler) = _prepare_linear_fold(train_index, val_index)
    y_val = y.iloc[val_index]

    scalers_ridge.append(scaler)

    lr_model = RidgeCV(
        alphas=alphas,
        cv=10,
        scoring='neg_mean_squared_error',
        alpha_per_target=False
    )
    lr_model.fit(X_train_scaled, y_train_values)
    lr_models.append(lr_model)
    target_encoders.append(target_encoder)

    lr_val_pred = np.clip(lr_model.predict(X_val_scaled), 0, 100)
    lr_test_pred = np.clip(lr_model.predict(X_test_scaled), 0, 100)
    # The original-dataset rows are the tail of the training frame, so these
    # are in-sample predictions.
    lr_orig_pred = np.clip(lr_model.predict(X_train_scaled.iloc[-N_SAMPLES_ORIG:]), 0, 100)

    oof_pred_lr[val_index] = lr_val_pred
    test_preds_lr[:, fold - 1] = lr_test_pred
    orig_preds_lr += lr_orig_pred / FOLDS

    rmse_lr = root_mean_squared_error(y_val, lr_val_pred)
    fold_rmse_lr.append(rmse_lr)

    print(f"  Fold {fold} RMSE: {rmse_lr:.6f}")
    print(f"  Alpha selected: {lr_model.alpha_:.6f}")


ridge_oof_rmse = root_mean_squared_error(y, oof_pred_lr)

print(f"\n" + "="*50)
print("RIDGE REGRESSION RESULTS")
print("="*50)
print(f"Ridge OOF RMSE: {ridge_oof_rmse:.6f}")
print(f"Ridge Fold RMSE Mean: {np.mean(fold_rmse_lr):.6f} ± {np.std(fold_rmse_lr):.6f}")
print(f"Min fold RMSE: {np.min(fold_rmse_lr):.6f}")
print(f"Max fold RMSE: {np.max(fold_rmse_lr):.6f}")


print(f"\n" + "="*50)
print(" ELASTICNET COMPARISON")
print("="*50)

oof_pred_elastic = np.zeros(N_SAMPLES_TRAIN)
test_preds_elastic = np.zeros((N_SAMPLES_TEST, FOLDS))
# Tracked so that, if ElasticNet wins, the predictions for the original
# dataset come from the same model family as the OOF/test predictions.
orig_preds_elastic = np.zeros(N_SAMPLES_ORIG)
fold_rmse_elastic = []

# Same splitter and random_state as above, so the folds match the Ridge run.
for fold, (train_index, val_index) in enumerate(kf.split(X, y_bins), start=1):
    (X_train_encoded, X_train_scaled, X_val_scaled, X_test_scaled,
     y_train_values, target_encoder, scaler) = _prepare_linear_fold(train_index, val_index)
    y_val = y.iloc[val_index]

    elastic_model = ElasticNetCV(
        l1_ratio=[0.1, 0.5, 0.7, 0.9, 0.95, 0.99],
        alphas=np.logspace(-3, 2, 100),
        cv=10,
        max_iter=10000,
        random_state=42
    )
    elastic_model.fit(X_train_scaled, y_train_values)

    elastic_val_pred = np.clip(elastic_model.predict(X_val_scaled), 0, 100)
    elastic_test_pred = np.clip(elastic_model.predict(X_test_scaled), 0, 100)
    elastic_orig_pred = np.clip(elastic_model.predict(X_train_scaled.iloc[-N_SAMPLES_ORIG:]), 0, 100)

    oof_pred_elastic[val_index] = elastic_val_pred
    test_preds_elastic[:, fold - 1] = elastic_test_pred
    orig_preds_elastic += elastic_orig_pred / FOLDS

    rmse_elastic = root_mean_squared_error(y_val, elastic_val_pred)
    fold_rmse_elastic.append(rmse_elastic)

elastic_oof_rmse = root_mean_squared_error(y, oof_pred_elastic)

print(f"ElasticNet OOF RMSE: {elastic_oof_rmse:.6f}")
print(f"ElasticNet Fold RMSE Mean: {np.mean(fold_rmse_elastic):.6f} ± {np.std(fold_rmse_elastic):.6f}")


print(f"\n" + "="*50)
print("MODEL COMPARISON")
print("="*50)

if elastic_oof_rmse < ridge_oof_rmse:
    print(f"   ElasticNet is better!")
    print(f"   Ridge: {ridge_oof_rmse:.6f}")
    print(f"   ElasticNet: {elastic_oof_rmse:.6f}")
    print(f"   Improvement: {ridge_oof_rmse - elastic_oof_rmse:.6f}")

    # Downstream cells consume the *_lr arrays as "the linear model", so all
    # three prediction sets are switched together.
    oof_pred_lr = oof_pred_elastic
    test_preds_lr = test_preds_elastic
    orig_preds_lr = orig_preds_elastic
    linear_oof_rmse = elastic_oof_rmse
else:
    print(f"   Ridge is better!")
    print(f"   Ridge: {ridge_oof_rmse:.6f}")
    print(f"   ElasticNet: {elastic_oof_rmse:.6f}")

    linear_oof_rmse = ridge_oof_rmse

print(f"\n  Final Linear Model RMSE: {linear_oof_rmse:.6f}")


IMPROVED RIDGE REGRESSION WITH OPTIMIZATION

Training fold 1 (Ridge)...
  Fold 1 RMSE: 8.833441
  Alpha selected: 2.807216

Training fold 2 (Ridge)...
  Fold 2 RMSE: 8.899728
  Alpha selected: 0.001000

Training fold 3 (Ridge)...
  Fold 3 RMSE: 8.921972
  Alpha selected: 2.807216

Training fold 4 (Ridge)...
  Fold 4 RMSE: 8.849183
  Alpha selected: 2.807216

Training fold 5 (Ridge)...
  Fold 5 RMSE: 8.929922
  Alpha selected: 0.001000

Training fold 6 (Ridge)...
  Fold 6 RMSE: 8.846485
  Alpha selected: 2.807216

Training fold 7 (Ridge)...
  Fold 7 RMSE: 8.874272
  Alpha selected: 2.807216

Training fold 8 (Ridge)...
  Fold 8 RMSE: 8.925887
  Alpha selected: 0.001000

Training fold 9 (Ridge)...
  Fold 9 RMSE: 8.898398
  Alpha selected: 0.001000

Training fold 10 (Ridge)...
  Fold 10 RMSE: 8.848074
  Alpha selected: 2.807216

RIDGE REGRESSION RESULTS
Ridge OOF RMSE: 8.882806
Ridge Fold RMSE Mean: 8.882736 ± 0.035078
Min fold RMSE: 8.833441
Max fold RMSE: 8.929922

 ELASTICNET COMPARISO

### Feature Selection

In [None]:
print("\n" + "="*50)
print("FEATURE SELECTION")
print("="*50)

# Rank features by the absolute coefficient of the first-fold Ridge model.
# The names come from the fitted model itself (it was trained on a DataFrame)
# rather than from a loop variable left over by the training cell, so names
# and coefficients are guaranteed to line up.
ridge_fold_model = lr_models[0]
feature_importance = pd.DataFrame({
    'feature': ridge_fold_model.feature_names_in_,
    'importance': np.abs(ridge_fold_model.coef_)
}).sort_values('importance', ascending=False)

print("\nTop 30 Features by Ridge importance:")
print(feature_importance.head(30))


n_features_to_keep = 95
top_features = feature_importance.head(n_features_to_keep)['feature'].tolist()
print(f"\nKeeping top {n_features_to_keep} features out of {len(feature_importance)}")

# Explicit copies: the next cell adds a column to each of these frames, which
# should not write into a slice of the wider frame.
X = X[top_features].copy()
X_test = X_test[top_features].copy()
X_original = X_original[top_features].copy()

print(f"XGB feature shapes - X: {X.shape}, X_test: {X_test.shape}, X_original: {X_original.shape}")

### Preparing the Data with Category Mean Encoding

In [None]:
print("\n" + "="*50)
print("PREPARING XGB DATA WITH CATEGORY MEAN ENCODING")
print("="*50)

# Stack the linear model's predictions onto each frame as an extra feature:
# out-of-fold values for train, the fold average for test, and the
# accumulated per-fold average for the original dataset.
X = X.assign(ridge_pred=oof_pred_lr)
X_test = X_test.assign(ridge_pred=test_preds_lr.mean(axis=1))
X_original = X_original.assign(ridge_pred=orig_preds_lr)

print(f"Shapes before encoding - X: {X.shape}, X_test: {X_test.shape}, X_original: {X_original.shape}")

# Whatever is still object/category typed gets the mean-target rank encoding.
cat_cols = X.select_dtypes(include=["category", "object"]).columns.tolist()
print(f"Categorical columns to encode: {cat_cols}")

cat_transformer = CategoryMeanTransformer(cat_cols=cat_cols)
cat_transformer.fit(X, y)

# Encode with the mapping learned on the training frame, then store
# everything as float32.
X, X_test, X_original = [
    cat_transformer.transform(frame).astype(np.float32)
    for frame in (X, X_test, X_original)
]
y_float = np.asarray(y.values, dtype=np.float32)

print(f"Final XGB shapes with Ridge feature - X: {X.shape}, X_test: {X_test.shape}, X_original: {X_original.shape}")



### XGBoost and LightGBM Training

In [None]:
print("\n" + "=" * 50)
print("TRAINING XGBOOST AND LIGHTGBM")
print("=" * 50)


dtest = xgb.DMatrix(X_test)

xgb_params = {
    "objective": "reg:squarederror",
    "learning_rate": 0.05,
    "max_depth": 8,
    "subsample": 0.85,
    "colsample_bytree": 0.75,
    "colsample_bynode": 0.8,
    "min_child_weight": 2,
    "gamma": 0.05,
    "lambda": 0.8,
    "alpha": 0.02,
    "eval_metric": "rmse",
    "tree_method": "hist",
    "verbosity": 0,
    "seed": 42,
}


lgb_params = {
    'objective': 'regression',
    'metric': 'rmse',
    'learning_rate': 0.05,
    'num_leaves': 31,
    'max_depth': 8,
    'feature_fraction': 0.75,
    'bagging_fraction': 0.85,
    'bagging_freq': 5,
    'min_child_samples': 20,
    'lambda_l1': 0.1,
    'lambda_l2': 0.1,
    'verbose': -1,
}


oof_predictions_xgb = np.zeros(len(X), dtype=np.float32)
oof_predictions_lgb = np.zeros(len(X), dtype=np.float32)

test_predictions_xgb = []
test_predictions_lgb = []

fold_metrics = []

# Single source of truth for the fold count, used by the splitter and by the
# progress message (which previously printed a hard-coded "/5").
N_GBM_FOLDS = 10
kf = StratifiedKFold(n_splits=N_GBM_FOLDS, shuffle=True, random_state=42)

for fold, (train_idx, val_idx) in enumerate(kf.split(X, y_bins), start=1):
    print(f"\n{'='*50}")
    print(f"Fold {fold}/{N_GBM_FOLDS}")
    print(f"{'='*50}")

    X_train, X_val = X.iloc[train_idx], X.iloc[val_idx]
    y_train, y_val = y_float[train_idx], y_float[val_idx]

    print("Training XGBoost...")
    dtrain = xgb.DMatrix(X_train, label=y_train)
    dval = xgb.DMatrix(X_val, label=y_val)
    # Early stopping watches the LAST entry of `evals`, i.e. the validation set.
    evals = [(dtrain, "train"), (dval, "valid")]

    xgb_model = xgb.train(
        params=xgb_params,
        dtrain=dtrain,
        num_boost_round=3000,
        evals=evals,
        early_stopping_rounds=50,
        verbose_eval=False,
    )

    # xgb.train returns the booster from the last round, not the best one, and
    # the native Booster.predict uses every tree unless told otherwise.
    # Restrict prediction to the trees up to the best iteration.
    best_rounds = (0, xgb_model.best_iteration + 1)

    xgb_val_preds = np.clip(xgb_model.predict(dval, iteration_range=best_rounds), 0, 100)
    oof_predictions_xgb[val_idx] = xgb_val_preds

    xgb_test_pred = np.clip(xgb_model.predict(dtest, iteration_range=best_rounds), 0, 100)
    test_predictions_xgb.append(xgb_test_pred)

    xgb_rmse = np.sqrt(mean_squared_error(y_val, xgb_val_preds))
    xgb_mae = mean_absolute_error(y_val, xgb_val_preds)

    print(f"  XGBoost RMSE: {xgb_rmse:.5f} | MAE: {xgb_mae:.5f}")
    print(f"  Best iteration: {xgb_model.best_iteration}")

    print("Training LightGBM...")

    lgb_train = lgb.Dataset(X_train, label=y_train)
    lgb_val = lgb.Dataset(X_val, label=y_val, reference=lgb_train)

    lgb_model = lgb.train(
        params=lgb_params,
        train_set=lgb_train,
        num_boost_round=3000,
        valid_sets=[lgb_train, lgb_val],
        valid_names=['train', 'valid'],
        callbacks=[
            lgb.early_stopping(50),
            lgb.log_evaluation(period=0)
        ]
    )

    lgb_val_preds = np.clip(lgb_model.predict(X_val), 0, 100)
    oof_predictions_lgb[val_idx] = lgb_val_preds

    lgb_test_pred = np.clip(lgb_model.predict(X_test), 0, 100)
    test_predictions_lgb.append(lgb_test_pred)

    lgb_rmse = np.sqrt(mean_squared_error(y_val, lgb_val_preds))
    lgb_mae = mean_absolute_error(y_val, lgb_val_preds)

    print(f"  LightGBM RMSE: {lgb_rmse:.5f} | MAE: {lgb_mae:.5f}")
    print(f"  Best iteration: {lgb_model.best_iteration}")

    fold_metrics.append({
        "fold": fold,
        "xgb_rmse": xgb_rmse,
        "xgb_mae": xgb_mae,
        "lgb_rmse": lgb_rmse,
        "lgb_mae": lgb_mae,
    })


print(f"\n{'='*50}")
print("AGGREGATING PREDICTIONS")
print(f"{'='*50}\n")

# Average the per-fold test predictions into one vector per model.
test_predictions_xgb = np.mean(test_predictions_xgb, axis=0)
test_predictions_lgb = np.mean(test_predictions_lgb, axis=0)


xgb_oof_rmse = np.sqrt(mean_squared_error(y_float, oof_predictions_xgb))
xgb_oof_mae = mean_absolute_error(y_float, oof_predictions_xgb)

lgb_oof_rmse = np.sqrt(mean_squared_error(y_float, oof_predictions_lgb))
lgb_oof_mae = mean_absolute_error(y_float, oof_predictions_lgb)

print(f"XGBoost OOF RMSE: {xgb_oof_rmse:.5f} | MAE: {xgb_oof_mae:.5f}")
print(f"LightGBM OOF RMSE: {lgb_oof_rmse:.5f} | MAE: {lgb_oof_mae:.5f}")


### Ensemble Blending 

In [None]:
print("\n" + "="*50)
print("ENSEMBLE BLENDING WITH SOFT CLIPPING")
print("="*50)


print("\nTesting different ensemble weights...\n")

best_rmse = float('inf')
best_weights = None

# Candidate (ridge, xgb, lgb) weights; each triple sums to 1.
weight_combinations = [
    (0.0, 0.5, 0.5),
    (0.0, 0.6, 0.4),
    (0.0, 0.4, 0.6),
    (0.05, 0.5, 0.45),
    (0.1, 0.45, 0.45),
    (0.0, 0.7, 0.3),
    (0.0, 0.3, 0.7),
]

for ridge_w, xgb_w, lgb_w in weight_combinations:
    ensemble_oof = (ridge_w * oof_pred_lr +
                    xgb_w * oof_predictions_xgb +
                    lgb_w * oof_predictions_lgb)

    # Each candidate is scored AFTER clipping to the valid score range.
    ensemble_oof = np.clip(ensemble_oof, 0, 100)
    ensemble_rmse = np.sqrt(mean_squared_error(y, ensemble_oof))

    print(f"Ridge: {ridge_w:.2f} | XGB: {xgb_w:.2f} | LGB: {lgb_w:.2f} → RMSE: {ensemble_rmse:.5f}")

    if ensemble_rmse < best_rmse:
        best_rmse = ensemble_rmse
        best_weights = (ridge_w, xgb_w, lgb_w)

ridge_w, xgb_w, lgb_w = best_weights

print(f"\n  Best ensemble weights:")
print(f"   Ridge: {ridge_w:.2f} | XGB: {xgb_w:.2f} | LGB: {lgb_w:.2f}")
# best_rmse was measured on clipped blends, so label it accordingly.
print(f"   Ensemble OOF RMSE (clipped to [0, 100]): {best_rmse:.5f}\n")

# Unclipped blends with the winning weights.
final_oof = (ridge_w * oof_pred_lr +
             xgb_w * oof_predictions_xgb +
             lgb_w * oof_predictions_lgb)

final_test = (ridge_w * test_preds_lr.mean(axis=1) +
              xgb_w * test_predictions_xgb +
              lgb_w * test_predictions_lgb)


def soft_clip(pred, lower=0, upper=100):
    """Squash ``pred`` into the open interval (lower, upper) with a logistic curve.

    The curve is centred on 50 (an input of 50 maps to the midpoint of
    ``lower`` and ``upper``) and its steepness is fixed by the constants
    below. Unlike ``np.clip`` it changes every value, not only those outside
    the bounds. Accepts scalars or NumPy arrays.
    """
    span = upper - lower
    exponent = -10 * (pred - 50) / 50
    return lower + span / (1 + np.exp(exponent))


# Informational only: the 1% / 99% target quantiles are printed but not used
# as clipping bounds below.
lower_q = y.quantile(0.01)
upper_q = y.quantile(0.99)

print(f"Quantile bounds: [{lower_q:.2f}, {upper_q:.2f}]")

# Score the blend BEFORE the hard clip so the "before clipping" line in the
# report reflects unclipped predictions.
pre_clip_oof_rmse = np.sqrt(mean_squared_error(y, final_oof))

final_oof = np.clip(final_oof, 0, 100)
final_test = np.clip(final_test, 0, 100)


final_oof_rmse = np.sqrt(mean_squared_error(y, final_oof))

print(f"\n{'='*50}")
print("FINAL MODEL PERFORMANCE")
print(f"{'='*50}\n")

print(f"Individual Models:")
print(f"  Ridge OOF RMSE:    {ridge_oof_rmse:.5f}")
print(f"  XGBoost OOF RMSE:  {xgb_oof_rmse:.5f}")
print(f"  LightGBM OOF RMSE: {lgb_oof_rmse:.5f}")

print(f"\nEnsemble (before clipping):")
print(f"  Ensemble RMSE: {pre_clip_oof_rmse:.5f}")

print(f"\nFinal (after clipping):")
print(f"  Final OOF RMSE: {final_oof_rmse:.5f}")
print(f"  Prediction range: [{final_test.min():.2f}, {final_test.max():.2f}]")

# Positive numbers mean the final blend beats the individual model.
print(f"\n{'='*50}")
print("IMPROVEMENTS")
print(f"{'='*50}")
print(f"  vs Ridge:    {ridge_oof_rmse - final_oof_rmse:.5f}")
print(f"  vs XGBoost:  {xgb_oof_rmse - final_oof_rmse:.5f}")
print(f"  vs LightGBM: {lgb_oof_rmse - final_oof_rmse:.5f}\n")

### Submission

In [None]:
banner = "=" * 50
print("\n" + banner)
print("SAVING RESULTS")
print(banner)

# Out-of-fold predictions of the final blend, keyed by training row id.
oof_df = pd.DataFrame({"id": train_df[ID_COL]})
oof_df[TARGET] = final_oof
oof_df.to_csv("oof_df.csv", index=False)

# Fill the sample submission by position and write it out.
submission_df[TARGET] = final_test
submission_df.to_csv("submission.csv", index=False)


In [None]:
# Show the first five rows of the submission as a final sanity check.
submission_df.head(n=5)

Acknowledgement: [https://www.kaggle.com/code/mdevian/ps-s6e1-clean-strong-baseline-ridge-xgb-fe](https://www.kaggle.com/code/mdevian/ps-s6e1-clean-strong-baseline-ridge-xgb-fe)