In [139]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [140]:
# Importing necessary libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import StandardScaler, LabelEncoder

print("Libraries imported successfully.")

Libraries imported successfully.


In [141]:
# https://www.kaggle.com/competitions/playground-series-s6e1/data?select=train.csv

# 1. Load the datasets
folder_path_local = "../input/playground-series-s6e1/"
folder_path_remote = "https://kagglecsv.netlify.app/input/playground-series-s6e1/"
folder_path = folder_path_local if os.path.exists(folder_path_local) else folder_path_remote  # choose local if available, else remote
train_data = pd.read_csv(folder_path + 'train.csv')
test_data = pd.read_csv(folder_path + 'test.csv')
print("Datasets loaded successfully.")

Datasets loaded successfully.


In [142]:
# 2. Check column names for both train and test data
print("Train Data Columns:", train_data.columns)
print("Test Data Columns:", test_data.columns)

Train Data Columns: Index(['id', 'age', 'gender', 'course', 'study_hours', 'class_attendance',
       'internet_access', 'sleep_hours', 'sleep_quality', 'study_method',
       'facility_rating', 'exam_difficulty', 'exam_score'],
      dtype='object')
Test Data Columns: Index(['id', 'age', 'gender', 'course', 'study_hours', 'class_attendance',
       'internet_access', 'sleep_hours', 'sleep_quality', 'study_method',
       'facility_rating', 'exam_difficulty'],
      dtype='object')


In [143]:
# 3. Identify categorical columns (ignore 'id' column)
categorical_columns = train_data.select_dtypes(include=['object']).columns
categorical_columns = categorical_columns.drop('id', errors='ignore')  # Drop 'id' if it exists, otherwise ignore
print(f"Categorical columns: {categorical_columns}")

Categorical columns: Index(['gender', 'course', 'internet_access', 'sleep_quality', 'study_method',
       'facility_rating', 'exam_difficulty'],
      dtype='object')


In [144]:
# 4. Convert categorical features to pandas 'category' dtype (preferred for tree models)
categorical_columns = train_data.select_dtypes(include=['object']).columns.tolist()
if 'id' in categorical_columns:
    categorical_columns.remove('id')
for col in categorical_columns:
    train_data[col] = train_data[col].astype('category')
    test_data[col] = test_data[col].astype('category')
print(f"Categorical columns converted: {categorical_columns}")

Categorical columns converted: ['gender', 'course', 'internet_access', 'sleep_quality', 'study_method', 'facility_rating', 'exam_difficulty']


In [145]:
# 5. Split data into features (X) and target (y)
if 'id' in train_data.columns:  # Check if 'id' exists before dropping it
    X = train_data.drop(columns=['id', 'exam_score'])  # Drop 'id' and 'exam_score' columns
else:
    X = train_data.drop(columns=['exam_score'])  # Only drop 'exam_score' if 'id' is not present
y = train_data['exam_score']

In [146]:
# 6. Simple imputation for numeric features (no scaling for tree models)
from sklearn.impute import SimpleImputer
num_cols = X.select_dtypes(include=[np.number]).columns.tolist()
imputer = SimpleImputer(strategy='median')
X[num_cols] = imputer.fit_transform(X[num_cols])
# Prepare a version of the test features for later
if 'id' in test_data.columns:
    X_test = test_data.drop(columns=['id']).copy()
else:
    X_test = test_data.copy()
X_test[num_cols] = imputer.transform(X_test[num_cols])
print('Imputed numeric columns:', num_cols)

Imputed numeric columns: ['age', 'study_hours', 'class_attendance', 'sleep_hours']


In [147]:
# 7. Train-test split (for evaluation purposes)
# NOTE: the earlier pipeline removed scaling; the actual split is performed later using the imputed DataFrame `X`
# (Old split referencing `X_scaled_df` has been removed to avoid NameError)

In [148]:
# Models: fast, well-performing tree ensembles with early stopping capability
from sklearn.metrics import mean_absolute_error
from lightgbm import LGBMRegressor
from catboost import CatBoostRegressor
from xgboost import XGBRegressor

models = {
    'LightGBM': LGBMRegressor(n_estimators=1000, learning_rate=0.03, max_depth=7, random_state=42),
    # 'CatBoost': CatBoostRegressor(iterations=1000, learning_rate=0.03, depth=6, random_state=42, verbose=0),
    # 'XGBoost': XGBRegressor(n_estimators=1000, learning_rate=0.03, max_depth=6, random_state=42, tree_method='hist', verbosity=0)
}

# Determine categorical feature names (for LightGBM/CatBoost)
categorical_features = [col for col in X.columns if str(X[col].dtype) == 'category']
print('Categorical features to pass to models:', categorical_features)

Categorical features to pass to models: ['gender', 'course', 'internet_access', 'sleep_quality', 'study_method', 'facility_rating', 'exam_difficulty']


In [149]:
# 9. Train models and evaluate them using MAE with early stopping on a validation set
from sklearn.model_selection import train_test_split

X_train, X_valid, y_train, y_valid = train_test_split(X, y, test_size=0.2, random_state=42)

trained_models = {}
best_mae = float('inf')
best_model_name = None

for name, model in models.items():
    print(f"Training {name}...")
    try:
        if name == 'LightGBM':
            # Some lightgbm versions don't accept early_stopping_rounds in sklearn API; omit it for compatibility
            try:
                model.fit(X_train, y_train, eval_set=[(X_valid, y_valid)], eval_metric='l1',
                          categorical_feature=categorical_features, verbose=False)
            except TypeError:
                model.fit(X_train, y_train)
        elif name == 'CatBoost':
            # CatBoost expects cat_features list when pandas categorical dtypes are present
            try:
                model.fit(X_train, y_train, eval_set=(X_valid, y_valid), cat_features=categorical_features,
                          use_best_model=True, verbose=False)
            except TypeError:
                model.fit(X_train, y_train)
        elif name == 'XGBoost':
            # XGBoost sklearn wrapper may not accept eval_metric in some versions; try with early stopping, else fallback
            try:
                model.fit(X_train, y_train, eval_set=[(X_valid, y_valid)], early_stopping_rounds=50, verbose=False)
            except TypeError:
                try:
                    model.fit(X_train, y_train, eval_set=[(X_valid, y_valid)], verbose=False)
                except TypeError:
                    model.fit(X_train, y_train)
        else:
            model.fit(X_train, y_train)
    except Exception as e:
        print(f"Error training {name}:", e)
        continue

    y_pred = model.predict(X_valid)
    mae = mean_absolute_error(y_valid, y_pred)
    print(f"{name} MAE: {mae}")
    trained_models[name] = model
    if mae < best_mae:
        best_mae = mae
        best_model_name = name
    print('\n')

print('Best model:', best_model_name, 'with MAE:', best_mae)


Training LightGBM...
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.016162 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 602
[LightGBM] [Info] Number of data points in the train set: 504000, number of used features: 11
[LightGBM] [Info] Start training from score 62.482335
LightGBM MAE: 6.98679511074426


Best model: LightGBM with MAE: 6.98679511074426


In [150]:
# 10. Select best model object
if not trained_models:
    raise RuntimeError('No models were successfully trained. Check training logs above for errors.')
if best_model_name is None:
    # If best_model_name wasn't set (e.g. training skipped/failed for all), pick the first trained model
    best_model_name = list(trained_models.keys())[0]
    print('Warning: best_model_name was None; defaulting to', best_model_name)
best_model = trained_models[best_model_name]
best_model_name

'LightGBM'

In [151]:
# 11. Retrain best model on full training data (X, y)
print('Retraining best model on full data:', best_model_name)
if best_model_name == 'LightGBM':
    best_model.set_params(n_estimators=2000)
    # Some lightgbm sklearn wrappers don't accept 'verbose' in fit(); try safely
    try:
        best_model.fit(X, y, categorical_feature=categorical_features, verbose=False)
    except TypeError:
        try:
            best_model.fit(X, y, categorical_feature=categorical_features)
        except TypeError:
            best_model.fit(X, y)
elif best_model_name == 'CatBoost':
    best_model.set_params(iterations=2000)
    try:
        best_model.fit(X, y, cat_features=categorical_features, verbose=False)
    except TypeError:
        best_model.fit(X, y)
elif best_model_name == 'XGBoost':
    best_model.set_params(n_estimators=2000)
    try:
        best_model.fit(X, y, verbose=False)
    except TypeError:
        best_model.fit(X, y)
else:
    best_model.fit(X, y)

Retraining best model on full data: LightGBM
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.032414 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 602
[LightGBM] [Info] Number of data points in the train set: 630000, number of used features: 11
[LightGBM] [Info] Start training from score 62.506672


In [152]:
# 12. Ensure categorical dtypes preserved in test set
for col in categorical_features:
    if col in X_test.columns:
        X_test[col] = X_test[col].astype('category')
X_test.shape

(270000, 11)

In [153]:
# 13. Generate predictions using ensemble of trained models (average)
preds = []
for name, model in trained_models.items():
    try:
        preds.append(model.predict(X_test))
    except Exception as e:
        print(f'Error predicting with {name}:', e)
import numpy as np
if preds:
    ensemble_preds = np.mean(preds, axis=0)
else:
    ensemble_preds = best_model.predict(X_test)

test_predictions = ensemble_preds

In [154]:
# 14. Prepare the submission file
submission = pd.DataFrame({
    'id': test_data['id'],
    'exam_score': test_predictions
})
submission.head()

Unnamed: 0,id,exam_score
0,630000,71.748949
1,630001,69.879723
2,630002,87.721867
3,630003,56.195579
4,630004,46.743615


In [155]:
# 15. Save the submission file
submission.to_csv('submission.csv', index=False)

print('Submission file created successfully!')

Submission file created successfully!
