In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file in it.
for dirname, _, filenames in os.walk('/kaggle/input'):
    for full_path in (os.path.join(dirname, filename) for filename in filenames):
        print(full_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

ModuleNotFoundError: No module named 'numpy'

In [None]:
# Imports
import pandas as pd
import numpy as np
import warnings
from sklearn.model_selection import KFold
from sklearn.metrics import mean_squared_error
import xgboost as xgb
import lightgbm as lgb

# Silence every warning category for the rest of the kernel session; this
# applies to all libraries, not only the ones imported in this cell.
warnings.filterwarnings('ignore')
# Seed NumPy's legacy global random generator.
# NOTE(review): KFold and the XGBoost parameters further down pass their own
# random_state, so confirm whether anything actually draws from this generator.
np.random.seed(42)

In [None]:
# Load the competition files: training set, test set and submission template.
# The three CSVs are read in the order listed and unpacked positionally.
train_df, test_df, submission_df = (
    pd.read_csv(csv_path)
    for csv_path in (
        '/kaggle/input/playground-series-s6e1/train.csv',
        '/kaggle/input/playground-series-s6e1/test.csv',
        '/kaggle/input/playground-series-s6e1/sample_submission.csv',
    )
)

In [None]:
# Column the models are trained to predict.
TARGET = 'exam_score'
# Raw numeric columns that receive log / square transforms in `preprocess`.
num_features = ['study_hours', 'class_attendance', 'sleep_hours']
# Every training column except the target and the row id, in file order.
base_features = [col for col in train_df.columns if col != TARGET and col != 'id']

def preprocess(df, is_train=True, target_mean=None):
    """Return the model feature matrix built from a raw competition frame.

    The input is copied first, so ``df`` itself is left untouched. On top of
    the columns named in the module-level ``base_features`` list, the result
    carries:

    * ``log_<col>`` (``np.log1p``) and ``<col>_sq`` for each name in
      ``num_features``,
    * ``feature_formula``, a fixed linear blend of study hours, class
      attendance and sleep hours,
    * three pairwise products and two ratios of those same columns.

    Base features whose dtype is ``object`` are cast to ``str``.

    Args:
        df: Frame holding at least the ``base_features`` columns.
        is_train: Accepted for call compatibility; not read by this function.
        target_mean: Accepted for call compatibility; not read by this function.

    Returns:
        A new DataFrame whose columns are ordered as: base features, log
        features, squared features, ``feature_formula``, interactions, ratios.
    """
    frame = df.copy()

    # Element-wise transforms applied to every numeric feature. Each pass
    # re-checks column presence against the live frame, so a name missing
    # from the frame is skipped here.
    transforms = (
        ('log_{}', np.log1p),
        ('{}_sq', lambda values: values ** 2),
    )
    for template, transform in transforms:
        for name in num_features:
            if name in frame.columns:
                frame[template.format(name)] = transform(frame[name])

    study = frame['study_hours']
    attendance = frame['class_attendance']
    sleep = frame['sleep_hours']
    # Small offset keeps the ratio denominators away from exactly zero.
    safe_sleep = sleep + 1e-8

    # Insertion order matters: it is reused for the output column order.
    derived = {
        # Learned formula feature (from top notebooks)
        'feature_formula': (
            5.9051154511950499 * study
            + 0.34540967058057986 * attendance
            + 1.423461171860262 * sleep
            + 4.7819
        ),
        'study_attendance_interaction': study * attendance,
        'study_sleep_interaction': study * sleep,
        'attendance_sleep_interaction': attendance * sleep,
        'study_sleep_ratio': study / safe_sleep,
        'attendance_sleep_ratio': attendance / safe_sleep,
    }
    for name, values in derived.items():
        frame[name] = values

    # Object-typed base columns are normalised to plain strings so they can
    # later be turned into categoricals.
    for name in base_features:
        column = frame[name]
        if column.dtype == 'object':
            frame[name] = column.astype(str)

    ordered_columns = (
        list(base_features)
        + [f'log_{name}' for name in num_features]
        + [f'{name}_sq' for name in num_features]
        + list(derived)
    )
    return frame[ordered_columns]

In [None]:
# Build the feature matrices for both splits; the target is re-indexed from zero
X_raw = preprocess(train_df)
X_test_raw = preprocess(test_df)
y = train_df[TARGET].reset_index(drop=True)

# Stack train on top of test so each categorical column is built from the
# values seen in both splits
full_data = pd.concat([X_raw, X_test_raw], axis=0)

# Object-typed base features become pandas categoricals
categorical_cols = [col for col in base_features if full_data[col].dtype == 'object']
full_data = full_data.astype({col: 'category' for col in categorical_cols})

# Engineered features are forced to float
engineered_cols = [
    'feature_formula',
    *(f'log_{col}' for col in num_features),
    *(f'{col}_sq' for col in num_features),
    'study_attendance_interaction',
    'study_sleep_interaction',
    'attendance_sleep_interaction',
    'study_sleep_ratio',
    'attendance_sleep_ratio',
]
float_cols = [col for col in engineered_cols if col in full_data.columns]
full_data = full_data.astype({col: float for col in float_cols})

# Split back by position: the first len(train_df) rows are the training rows
n_train = len(train_df)
X = full_data.iloc[:n_train].copy()
X_test = full_data.iloc[n_train:].copy()

print(f"Training features: {X.shape}")
print(f"Test features: {X_test.shape}")

In [None]:
# XGBoost parameters (optimized from top notebooks)
# NOTE(review): the values are used as given; their tuning provenance is not
# shown in this notebook.
xgb_params = {
    # Upper bound on boosting rounds; early stopping (below) can end training sooner.
    'n_estimators': 10000,
    'learning_rate': 0.007,
    'max_depth': 7,
    'subsample': 0.8,
    'num_parallel_tree': 2,
    'reg_lambda': 3,
    'colsample_bytree': 0.6,
    'colsample_bynode': 0.7,
    'tree_method': 'hist',
    'random_state': 42,
    # Early stopping is monitored on the eval_set passed to fit() in the loop.
    'early_stopping_rounds': 100,
    'eval_metric': 'rmse',
    # Needed because X / X_test carry pandas `category` columns.
    'enable_categorical': True
}

# Cross-validation
# One array of test-set predictions is appended per fold.
test_predictions_xgb = []
# Out-of-fold predictions: one slot per training row, written when that row
# falls into the validation split of a fold.
oof_predictions_xgb = np.zeros(len(X))
kf = KFold(n_splits=5, shuffle=True, random_state=42)

print("Training XGBoost with 5-fold CV...\n")

# kf.split yields positional row indices, hence the .iloc lookups below.
for fold, (train_index, val_index) in enumerate(kf.split(X, y)):
    print(f"--- Fold {fold+1} ---")
    
    X_train_fold, X_val = X.iloc[train_index], X.iloc[val_index]
    y_train_fold, y_val = y.iloc[train_index], y.iloc[val_index]
    
    # Plain aliases of the fold's training split; no extra data is added here.
    X_train_combined = X_train_fold
    y_train_combined = y_train_fold
    
    # A fresh model is created for every fold.
    model = xgb.XGBRegressor(**xgb_params)
    
    # NOTE(review): the held-out fold is also the early-stopping monitor, so the
    # fold RMSE printed below is measured on the data that chose the stopping
    # point and may be slightly optimistic.
    model.fit(
        X_train_combined,
        y_train_combined,
        eval_set=[(X_val, y_val)],
        verbose=200
    )
    
    val_preds = model.predict(X_val)
    oof_predictions_xgb[val_index] = val_preds
    rmse = np.sqrt(mean_squared_error(y_val, val_preds))
    print(f"Fold {fold+1} RMSE: {rmse:.5f}\n")
    
    # Keep this fold's test predictions; the submission cell averages them.
    test_preds = model.predict(X_test)
    test_predictions_xgb.append(test_preds)

# Overall score across all out-of-fold predictions.
oof_rmse_xgb = np.sqrt(mean_squared_error(y, oof_predictions_xgb))
print(f"XGBoost OOF RMSE: {oof_rmse_xgb:.5f}")

## Competition Information

**Playground Series - Season 6, Episode 1: Predicting Student Test Scores**

- **Evaluation Metric**: Root Mean Squared Error (RMSE)
- **Target**: Predict `exam_score` for each student in the test set
- **Best Public LB Scores**: ~8.56-8.64 RMSE
- **Timeline**: January 1-31, 2026

### Key Insights from Top Notebooks:
1. **Feature Engineering is Critical**:
   - Log transforms on numerical features
   - Squared features
   - Learned formula feature (linear combination of study_hours, class_attendance, sleep_hours)
   - Interaction features (study×attendance, study×sleep, attendance×sleep)
   - Ratio features (study/sleep, attendance/sleep)

2. **Model Configuration**:
   - XGBoost with categorical support (`enable_categorical=True`)
   - 5-fold cross-validation
   - Learning rate: 0.007
   - Max depth: 7
   - Early stopping: 100 rounds
   - Many estimators (10000) with early stopping

3. **Performance Expectations**:
   - OOF RMSE: ~8.64-8.74 (without original data)
   - Public LB: ~8.56-8.64 (with ensemble/blending)
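   - Training time: ~10-15 minutes per fold on Kaggle GPU (note: `xgb_params` in this notebook sets `tree_method='hist'` with no GPU device option, so as written it trains on CPU)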

In [None]:
# Generate submission
# Requires the XGBoost training cell (Cell 4) to have run in this kernel session.
try:
    # len() doubles as the existence probe: an undefined name raises NameError
    # here. Probing with `_ = ...` is avoided because assigning `_` shadows
    # IPython's last-output variable.
    n_fold_predictions = len(test_predictions_xgb)
except NameError:
    # `from None` suppresses the chained traceback so only the guidance is shown.
    raise NameError(
        "❌ ERROR: test_predictions_xgb not found!\n"
        "Please run Cell 4 (XGBoost model training) first.\n"
        "The training cell creates the predictions needed for submission.\n"
        "Execution order: Cell 0 → Cell 1 → Cell 2 → Cell 3 → Cell 4 → Cell 6"
    ) from None

if n_fold_predictions == 0:
    raise ValueError("test_predictions_xgb is empty!")

# Average the per-fold test predictions into one prediction per test row.
mean_predictions = np.mean(test_predictions_xgb, axis=0)
if len(mean_predictions) != len(submission_df):
    raise ValueError(
        f"Got {len(mean_predictions)} averaged predictions for "
        f"{len(submission_df)} submission rows; re-run the training cell on the current test data."
    )

# Clip predictions to valid range (0-100 for exam scores)
# NOTE(review): predictions are assigned positionally, which assumes the rows of
# sample_submission.csv are in the same order as test.csv — confirm.
submission_df[TARGET] = np.clip(mean_predictions, 0, 100)

# Save submission
submission_df.to_csv('submission.csv', index=False)
print(f"\nSubmission saved! Shape: {submission_df.shape}")
print(f"Prediction range: [{submission_df[TARGET].min():.2f}, {submission_df[TARGET].max():.2f}]")
print(f"Mean prediction: {submission_df[TARGET].mean():.2f}")
submission_df.head()

In [None]:
# Save OOF predictions for analysis
# Requires the XGBoost training cell (Cell 4) to have run in this kernel session.
try:
    # len() doubles as the existence probe: an undefined name raises NameError
    # here. Probing with `_ = ...` is avoided because assigning `_` shadows
    # IPython's last-output variable.
    n_oof_rows = len(oof_predictions_xgb)
except NameError:
    # `from None` suppresses the chained traceback so only the guidance is shown.
    raise NameError(
        "❌ ERROR: oof_predictions_xgb not found!\n"
        "Please run Cell 4 (XGBoost model training) first.\n"
        "The training cell creates the OOF predictions needed for analysis.\n"
        "Execution order: Cell 0 → Cell 1 → Cell 2 → Cell 3 → Cell 4 → Cell 7"
    ) from None

# Predictions are paired with ids by position, so the row counts must agree.
if n_oof_rows != len(train_df):
    raise ValueError(
        f"oof_predictions_xgb has {n_oof_rows} rows but train_df has {len(train_df)}; "
        "re-run the training cell on the current training data."
    )

oof_df = pd.DataFrame({
    'id': train_df['id'],
    TARGET: oof_predictions_xgb
})
oof_df.to_csv('oof_predictions.csv', index=False)
print("OOF predictions saved to oof_predictions.csv")