In [1]:
# ==============================================
# Import Necessary Libraries
# ==============================================
import os
import numpy as np
import pandas as pd
from tqdm import tqdm
from IPython.display import clear_output
import warnings

from sklearn.base import clone
from sklearn.metrics import cohen_kappa_score
from sklearn.model_selection import StratifiedKFold
from scipy.optimize import minimize
from sklearn.impute import KNNImputer

from lightgbm import LGBMRegressor
from xgboost import XGBRegressor
from catboost import CatBoostRegressor

from scipy.stats import mode  # For majority voting

# Suppress warnings and set display options
warnings.filterwarnings('ignore')
pd.options.display.max_columns = None

# Set random seed and number of folds for cross-validation
SEED = 42
n_splits = 5

# ==============================================
# Load and Merge Data
# ==============================================
# Load main datasets
train = pd.read_csv('/kaggle/input/child-mind-institute-problematic-internet-use/train.csv')
test = pd.read_csv('/kaggle/input/child-mind-institute-problematic-internet-use/test.csv')
sample = pd.read_csv('/kaggle/input/child-mind-institute-problematic-internet-use/sample_submission.csv')

# ==============================================
# Feature Engineering
# ==============================================
def engineer_features(df):
    """
    Create interaction features.
    """
    df['BMI_Age'] = df['Physical-BMI'] * df['Basic_Demos-Age']
    df['Internet_Hours_Age'] = df['PreInt_EduHx-computerinternet_hoursday'] * df['Basic_Demos-Age']
    df['BMI_Internet_Hours'] = df['Physical-BMI'] * df['PreInt_EduHx-computerinternet_hoursday']
    df['Pulse_Pressure'] = df['Physical-Systolic_BP'] - df['Physical-Diastolic_BP']
    df['HeartRate_Age'] = df['Physical-HeartRate'] * df['Basic_Demos-Age']
    df['Fitness_Score'] = df['Fitness_Endurance-Max_Stage'] * (
        df['Fitness_Endurance-Time_Mins'] * 60 + df['Fitness_Endurance-Time_Sec']
    )
    df['FMI_FFMI_Ratio'] = df['BIA-BIA_FMI'] / (df['BIA-BIA_FFMI'] + 1e-6)
    df['Sleep_Internet_Hours'] = df['SDS-SDS_Total_T'] * df['PreInt_EduHx-computerinternet_hoursday']
    df['Waist_Height_Ratio'] = df['Physical-Waist_Circumference'] / (df['Physical-Height'] + 1e-6)
    return df

# # Apply feature engineering to both train and test datasets
# train = engineer_features(train)
# test = engineer_features(test)

# ==============================================
# Define Feature Columns
# ==============================================
# Define featuresCols as the common columns between train and test
# featuresCols = list(set(train.columns).intersection(set(test.columns)))
featuresCols = [
                'Physical-Height', 'Basic_Demos-Age',
                'PreInt_EduHx-computerinternet_hoursday', 'Physical-Weight',
                'FGC-FGC_CU', 'SDS-SDS_Total_T', 'SDS-SDS_Total_Raw', 'BIA-BIA_BMI',
                'Physical-BMI', 'FGC-FGC_PU', 'BIA-BIA_Frame_num',
                'Physical-Systolic_BP', 'FGC-FGC_SRL_Zone', 'FGC-FGC_TL',
                'BIA-BIA_FFMI', 'Basic_Demos-Sex', 'CGAS-CGAS_Score', 'BIA-BIA_FMI',
                'FGC-FGC_SRR_Zone', 'BIA-BIA_LST'
]
# Select the features from train and test datasets
train = train[featuresCols + ['sii']]  # Include 'sii' in train features
test = test[featuresCols]

# Drop 'id' from train and test data if present
train = train.drop('id', axis=1, errors='ignore')
test = test.drop('id', axis=1, errors='ignore')

# Drop rows with missing target variable 'sii' in train data
train = train.dropna(subset=['sii'])

# ==============================================
# Handle Missing Values with KNN Imputer
# ==============================================
# Impute missing values using KNNImputer
imputer = KNNImputer(n_neighbors=3)
numeric_cols = train.select_dtypes(include=['float64', 'int64']).columns.tolist()

# Remove 'sii' from numeric_cols when applying to test data
if 'sii' in numeric_cols:
    numeric_cols.remove('sii')

# Impute train data
train[numeric_cols] = imputer.fit_transform(train[numeric_cols])

# Impute test data
# Ensure that all columns in numeric_cols exist in test data
numeric_cols_test = [col for col in numeric_cols if col in test.columns]
test[numeric_cols_test] = imputer.transform(test[numeric_cols_test])

# Ensure 'sii' remains integer
train['sii'] = train['sii'].round().astype(int)

# ==============================================
# Model Training and Evaluation Functions
# ==============================================
def quadratic_weighted_kappa(y_true, y_pred):
    """
    Calculate Quadratic Weighted Kappa.
    """
    return cohen_kappa_score(y_true, y_pred, weights='quadratic')

def threshold_Rounder(oof_non_rounded, thresholds):
    """
    Apply thresholds to continuous predictions.
    """
    return np.where(oof_non_rounded < thresholds[0], 0,
                    np.where(oof_non_rounded < thresholds[1], 1,
                             np.where(oof_non_rounded < thresholds[2], 2, 3)))

def evaluate_predictions(thresholds, y_true, oof_non_rounded):
    """
    Objective function for optimizing thresholds.
    """
    rounded_p = threshold_Rounder(oof_non_rounded, thresholds)
    return -quadratic_weighted_kappa(y_true, rounded_p)

def TrainModelWithBootstrap(model_class, test_data, model_name):
    """
    Train the model using bootstrap sampling, stratified K-Fold CV,
    and return tuned predictions.
    """
    print(f"\nTraining {model_name}...")

    # Bootstrap sampling from the original training data
    train_data_bootstrap = train.sample(n=len(train), replace=True, random_state=SEED)
    X_bootstrap = train_data_bootstrap.drop(['sii'], axis=1)
    y_bootstrap = train_data_bootstrap['sii']
    
    # Ensure all data is numeric
    X_bootstrap = X_bootstrap.apply(pd.to_numeric, errors='coerce')
    test_data = test_data.apply(pd.to_numeric, errors='coerce')

    SKF = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=SEED)
    
    oof_non_rounded = np.zeros(len(X_bootstrap), dtype=float)
    oof_rounded = np.zeros(len(X_bootstrap), dtype=int)
    test_preds = np.zeros((len(test_data), n_splits))

    train_S = []
    val_S = []

    for fold, (train_idx, val_idx) in enumerate(tqdm(SKF.split(X_bootstrap, y_bootstrap), desc=f"Training Folds for {model_name}", total=n_splits)):
        X_train_fold, X_val_fold = X_bootstrap.iloc[train_idx], X_bootstrap.iloc[val_idx]
        y_train_fold, y_val_fold = y_bootstrap.iloc[train_idx], y_bootstrap.iloc[val_idx]
        
        model = clone(model_class)
        model.fit(X_train_fold, y_train_fold)
        
        y_train_pred = model.predict(X_train_fold)
        y_val_pred = model.predict(X_val_fold)
        y_val_pred_rounded = y_val_pred.round().astype(int)
        y_train_pred_rounded = y_train_pred.round().astype(int)

        oof_non_rounded[val_idx] = y_val_pred

        train_kappa = quadratic_weighted_kappa(y_train_fold, y_train_pred_rounded)
        val_kappa = quadratic_weighted_kappa(y_val_fold, y_val_pred_rounded)

        train_S.append(train_kappa)
        val_S.append(val_kappa)

        test_preds[:, fold] = model.predict(test_data)

        # Print fold results (Removed clear_output)
        print(f"Fold {fold+1} - Train QWK: {train_kappa:.4f}, Validation QWK: {val_kappa:.4f}")

    mean_train_kappa = np.mean(train_S)
    mean_val_kappa = np.mean(val_S)

    print(f"{model_name} Mean Train QWK: {mean_train_kappa:.4f}")
    print(f"{model_name} Mean Validation QWK: {mean_val_kappa:.4f}")

    # Optimize thresholds
    KappaOPtimizer = minimize(evaluate_predictions,
                              x0=[0.5, 1.5, 2.5], args=(y_bootstrap, oof_non_rounded), 
                              method='Nelder-Mead')
    thresholds = KappaOPtimizer.x

    oof_tuned = threshold_Rounder(oof_non_rounded, thresholds)
    tuned_kappa = quadratic_weighted_kappa(y_bootstrap, oof_tuned)

    print(f"{model_name} Optimized (Tuned) QWK: {tuned_kappa:.4f}")

    # Apply thresholds to test predictions
    test_preds_mean = test_preds.mean(axis=1)
    test_preds_tuned = threshold_Rounder(test_preds_mean, thresholds)

    return test_preds_tuned

# ==============================================
# Model Parameters and Instantiation
# ==============================================
# Define model parameters
LGBMParams = {
    'learning_rate': 0.05,
    'max_depth': 7,
    'num_leaves': 100,
    'min_data_in_leaf': 13,
    'feature_fraction': 0.893,
    'bagging_fraction': 0.784,
    'bagging_freq': 4,
    'lambda_l1': 10,
    'lambda_l2': 1,
    'random_state': SEED,
    'verbose': -1,
    'n_estimators': 100
}

XGB_Params = {
    'learning_rate': 0.05,
    'max_depth': 5,
    'n_estimators': 100,
    'subsample': 0.8,
    'colsample_bytree': 0.8,
    'reg_alpha': 1,
    'reg_lambda': 10,
    'random_state': SEED
}

CatBoost_Params = {
    'learning_rate': 0.05,
    'depth': 7,
    'iterations': 100,
    'random_seed': SEED,
    'verbose': 0,
    'l2_leaf_reg': 10
}

# Instantiate models
Light = LGBMRegressor(**LGBMParams)
XGB_Model = XGBRegressor(**XGB_Params)
CatBoost_Model = CatBoostRegressor(**CatBoost_Params)

# ==============================================
# Train Models and Generate Submission
# ==============================================
# Get predictions from each model
test_preds_lightgbm = TrainModelWithBootstrap(Light, test, 'LightGBM')
test_preds_xgboost = TrainModelWithBootstrap(XGB_Model, test, 'XGBoost')
test_preds_catboost = TrainModelWithBootstrap(CatBoost_Model, test, 'CatBoost')


Training LightGBM...


Training Folds for LightGBM:  40%|████      | 2/5 [00:00<00:00,  4.36it/s]

Fold 1 - Train QWK: 0.6778, Validation QWK: 0.5027
Fold 2 - Train QWK: 0.6684, Validation QWK: 0.5068


Training Folds for LightGBM:  80%|████████  | 4/5 [00:00<00:00,  5.88it/s]

Fold 3 - Train QWK: 0.6543, Validation QWK: 0.5020
Fold 4 - Train QWK: 0.6831, Validation QWK: 0.5044


Training Folds for LightGBM: 100%|██████████| 5/5 [00:00<00:00,  5.44it/s]


Fold 5 - Train QWK: 0.6535, Validation QWK: 0.5404
LightGBM Mean Train QWK: 0.6674
LightGBM Mean Validation QWK: 0.5113
LightGBM Optimized (Tuned) QWK: 0.6014

Training XGBoost...


Training Folds for XGBoost:  20%|██        | 1/5 [00:00<00:00,  4.76it/s]

Fold 1 - Train QWK: 0.7392, Validation QWK: 0.5355


Training Folds for XGBoost:  60%|██████    | 3/5 [00:00<00:00,  4.94it/s]

Fold 2 - Train QWK: 0.7201, Validation QWK: 0.5471
Fold 3 - Train QWK: 0.7249, Validation QWK: 0.5576


Training Folds for XGBoost:  80%|████████  | 4/5 [00:00<00:00,  4.69it/s]

Fold 4 - Train QWK: 0.7240, Validation QWK: 0.5281


Training Folds for XGBoost: 100%|██████████| 5/5 [00:01<00:00,  4.70it/s]


Fold 5 - Train QWK: 0.7334, Validation QWK: 0.5913
XGBoost Mean Train QWK: 0.7283
XGBoost Mean Validation QWK: 0.5519
XGBoost Optimized (Tuned) QWK: 0.6370

Training CatBoost...


Training Folds for CatBoost:  20%|██        | 1/5 [00:00<00:01,  2.43it/s]

Fold 1 - Train QWK: 0.6254, Validation QWK: 0.4869


Training Folds for CatBoost:  40%|████      | 2/5 [00:00<00:01,  2.45it/s]

Fold 2 - Train QWK: 0.6238, Validation QWK: 0.5164


Training Folds for CatBoost:  60%|██████    | 3/5 [00:01<00:00,  2.65it/s]

Fold 3 - Train QWK: 0.6095, Validation QWK: 0.4933


Training Folds for CatBoost:  80%|████████  | 4/5 [00:01<00:00,  2.65it/s]

Fold 4 - Train QWK: 0.6241, Validation QWK: 0.4824


Training Folds for CatBoost: 100%|██████████| 5/5 [00:01<00:00,  2.59it/s]

Fold 5 - Train QWK: 0.5976, Validation QWK: 0.4972
CatBoost Mean Train QWK: 0.6161
CatBoost Mean Validation QWK: 0.4952
CatBoost Optimized (Tuned) QWK: 0.5924





In [2]:
test_preds_lightgbm

array([0, 0, 1, 0, 2, 1, 1, 0, 1, 1, 1, 1, 1, 1, 2, 1, 0, 0, 0, 2])

In [3]:
test_preds_xgboost

array([0, 0, 1, 0, 2, 1, 0, 0, 1, 2, 2, 1, 1, 2, 2, 0, 0, 0, 0, 1])

In [4]:
test_preds_catboost

array([0, 0, 1, 0, 2, 1, 1, 0, 2, 2, 1, 1, 1, 2, 2, 0, 0, 0, 0, 2])

In [5]:
# Create submission DataFrame
submission4 = pd.DataFrame({
    'id': sample['id'],
    'sii': test_preds_catboost
})

# Save submission
submission4.to_csv('submission.csv', index=False)
print("Submission file 'submission.csv' has been created.")

Submission file 'submission.csv' has been created.


In [6]:
# # Combine predictions via majority voting
# all_test_preds = np.vstack([test_preds_lightgbm, test_preds_xgboost, test_preds_catboost])

# # Majority voting
# final_test_preds, _ = mode(all_test_preds, axis=0)

# # Convert to 1D array
# final_test_preds = final_test_preds.flatten().astype(int)

# # Create submission DataFrame
# submission4 = pd.DataFrame({
#     'id': sample['id'],
#     'sii': final_test_preds
# })

# # Save submission
# submission4.to_csv('submission.csv', index=False)
# print("Submission file 'submission.csv' has been created.")

In [7]:
submission4

Unnamed: 0,id,sii
0,00008ff9,0
1,000fd460,0
2,00105258,1
3,00115b9f,0
4,0016bb22,2
5,001f3379,1
6,0038ba98,1
7,0068a485,0
8,0069fbed,1
9,0083e397,1
