In [2]:
# ==============================================
# Import Necessary Libraries
# ==============================================
import os
import numpy as np
import pandas as pd
from tqdm import tqdm
from IPython.display import clear_output
import warnings

from sklearn.base import clone
from sklearn.metrics import cohen_kappa_score
from sklearn.model_selection import StratifiedKFold
from scipy.optimize import minimize
from sklearn.impute import KNNImputer

from lightgbm import LGBMRegressor
from xgboost import XGBRegressor
from catboost import CatBoostRegressor
from sklearn.ensemble import VotingRegressor

# Suppress warnings and set display options
warnings.filterwarnings('ignore')
pd.options.display.max_columns = None

# Set random seed and number of folds for cross-validation
SEED = 42
n_splits = 10

# ==============================================
# Load and Merge Data
# ==============================================
# Load main datasets
train = pd.read_csv('/kaggle/input/child-mind-institute-problematic-internet-use/train.csv')
test = pd.read_csv('/kaggle/input/child-mind-institute-problematic-internet-use/test.csv')
sample = pd.read_csv('/kaggle/input/child-mind-institute-problematic-internet-use/sample_submission.csv')

# ==============================================
# Feature Engineering
# ==============================================
def engineer_features(df):
    """
    Create interaction features.
    """
    df['BMI_Age'] = df['Physical-BMI'] * df['Basic_Demos-Age']
    df['Internet_Hours_Age'] = df['PreInt_EduHx-computerinternet_hoursday'] * df['Basic_Demos-Age']
    df['BMI_Internet_Hours'] = df['Physical-BMI'] * df['PreInt_EduHx-computerinternet_hoursday']
    df['Pulse_Pressure'] = df['Physical-Systolic_BP'] - df['Physical-Diastolic_BP']
    df['HeartRate_Age'] = df['Physical-HeartRate'] * df['Basic_Demos-Age']
    df['Fitness_Score'] = df['Fitness_Endurance-Max_Stage'] * (
        df['Fitness_Endurance-Time_Mins'] * 60 + df['Fitness_Endurance-Time_Sec']
    )
    df['FMI_FFMI_Ratio'] = df['BIA-BIA_FMI'] / (df['BIA-BIA_FFMI'] + 1e-6)
    df['Sleep_Internet_Hours'] = df['SDS-SDS_Total_T'] * df['PreInt_EduHx-computerinternet_hoursday']
    df['Waist_Height_Ratio'] = df['Physical-Waist_Circumference'] / (df['Physical-Height'] + 1e-6)
    return df

# Apply feature engineering to both train and test datasets
train = engineer_features(train)
test = engineer_features(test)

# ==============================================
# Define Feature Columns
# ==============================================
# Define featuresCols as the common columns between train and test
featuresCols = list(set(train.columns).intersection(set(test.columns)))

# Select the features from train and test datasets
train = train[featuresCols + ['sii']]  # Include 'sii' in train features
test = test[featuresCols]

# Drop 'id' from train and test data if present
train = train.drop('id', axis=1, errors='ignore')
test = test.drop('id', axis=1, errors='ignore')

# Drop rows with missing target variable 'sii' in train data
train = train.dropna(subset=['sii'])

# ==============================================
# Handle Categorical Variables
# ==============================================
# List of categorical columns
cat_c = [
    'Basic_Demos-Enroll_Season', 'CGAS-Season', 'Physical-Season', 
    'Fitness_Endurance-Season', 'FGC-Season', 'BIA-Season', 
    'PAQ_A-Season', 'PAQ_C-Season', 'SDS-Season', 'PreInt_EduHx-Season'
]

def update_categorical(df):
    """
    Fill missing values and convert columns to categorical.
    """
    for c in cat_c: 
        df[c] = df[c].fillna('Missing')
        df[c] = df[c].astype('category')
    return df

# Update categorical columns in both datasets
train = update_categorical(train)
test = update_categorical(test)

def create_mapping(column, dataset):
    """
    Create a mapping for categorical variables.
    """
    unique_values = dataset[column].unique()
    return {value: idx for idx, value in enumerate(unique_values)}

# Map categorical variables to integers
for col in cat_c:
    mapping_train = create_mapping(col, train)
    mapping_test = create_mapping(col, test)
    
    train[col] = train[col].replace(mapping_train).astype(int)
    test[col] = test[col].replace(mapping_test).astype(int)

# ==============================================
# Handle Missing Values with KNN Imputer
# ==============================================
# Impute missing values using KNNImputer
imputer = KNNImputer(n_neighbors=5)
numeric_cols = train.select_dtypes(include=['float64', 'int64']).columns.tolist()

# Remove 'sii' from numeric_cols when applying to test data
if 'sii' in numeric_cols:
    numeric_cols.remove('sii')

# Impute train data
train[numeric_cols] = imputer.fit_transform(train[numeric_cols])

# Impute test data
# Ensure that all columns in numeric_cols exist in test data
numeric_cols_test = [col for col in numeric_cols if col in test.columns]
test[numeric_cols_test] = imputer.transform(test[numeric_cols_test])

# Ensure 'sii' remains integer
train['sii'] = train['sii'].round().astype(int)

# ==============================================
# Model Training and Evaluation Functions
# ==============================================
def quadratic_weighted_kappa(y_true, y_pred):
    """
    Calculate Quadratic Weighted Kappa.
    """
    return cohen_kappa_score(y_true, y_pred, weights='quadratic')

def threshold_Rounder(oof_non_rounded, thresholds):
    """
    Apply thresholds to continuous predictions.
    """
    return np.where(oof_non_rounded < thresholds[0], 0,
                    np.where(oof_non_rounded < thresholds[1], 1,
                             np.where(oof_non_rounded < thresholds[2], 2, 3)))

def evaluate_predictions(thresholds, y_true, oof_non_rounded):
    """
    Objective function for optimizing thresholds.
    """
    rounded_p = threshold_Rounder(oof_non_rounded, thresholds)
    return -quadratic_weighted_kappa(y_true, rounded_p)

def TrainML(model_class, test_data):
    """
    Train the model using Stratified K-Fold cross-validation and evaluate it.
    """
    X = train.drop(['sii'], axis=1)
    y = train['sii']

    # Ensure all data is numeric
    X = X.apply(pd.to_numeric, errors='coerce')
    test_data = test_data.apply(pd.to_numeric, errors='coerce')

    SKF = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=SEED)
    
    train_S = []
    test_S = []
    
    oof_non_rounded = np.zeros(len(y), dtype=float) 
    oof_rounded = np.zeros(len(y), dtype=int) 
    test_preds = np.zeros((len(test_data), n_splits))

    for fold, (train_idx, test_idx) in enumerate(tqdm(SKF.split(X, y), desc="Training Folds", total=n_splits)):
        X_train, X_val = X.iloc[train_idx], X.iloc[test_idx]
        y_train, y_val = y.iloc[train_idx], y.iloc[test_idx]

        model = clone(model_class)
        model.fit(X_train, y_train)

        y_train_pred = model.predict(X_train)
        y_val_pred = model.predict(X_val)

        oof_non_rounded[test_idx] = y_val_pred
        y_val_pred_rounded = y_val_pred.round(0).astype(int)
        oof_rounded[test_idx] = y_val_pred_rounded

        train_kappa = quadratic_weighted_kappa(y_train, y_train_pred.round(0).astype(int))
        val_kappa = quadratic_weighted_kappa(y_val, y_val_pred_rounded)

        train_S.append(train_kappa)
        test_S.append(val_kappa)
        
        test_preds[:, fold] = model.predict(test_data)
        
        print(f"Fold {fold+1} - Train QWK: {train_kappa:.4f}, Validation QWK: {val_kappa:.4f}")
        clear_output(wait=True)

    print(f"Mean Train QWK --> {np.mean(train_S):.4f}")
    print(f"Mean Validation QWK ---> {np.mean(test_S):.4f}")

    # Optimize thresholds
    KappaOPtimizer = minimize(evaluate_predictions,
                              x0=[0.5, 1.5, 2.5], args=(y, oof_non_rounded), 
                              method='Nelder-Mead')
    assert KappaOPtimizer.success, "Optimization did not converge."
    
    oof_tuned = threshold_Rounder(oof_non_rounded, KappaOPtimizer.x)
    tKappa = quadratic_weighted_kappa(y, oof_tuned)

    print(f"----> || Optimized QWK SCORE :: {tKappa:.3f}")

    tpm = test_preds.mean(axis=1)
    tpTuned = threshold_Rounder(tpm, KappaOPtimizer.x)
    
    submission = pd.DataFrame({
        'id': sample['id'],
        'sii': tpTuned
    })

    return submission

# ==============================================
# Model Parameters and Instantiation
# ==============================================
# Define model parameters
LGBMParams = {
    'learning_rate': 0.05,
    'max_depth': 5,
    'num_leaves': 100,
    'min_data_in_leaf': 13,
    'feature_fraction': 0.893,
    'bagging_fraction': 0.784,
    'bagging_freq': 4,
    'lambda_l1': 10,
    'lambda_l2': 1,
    'random_seed': SEED,
    'verbose': -1,
    'n_estimators': 100
}

XGB_Params = {
    'learning_rate': 0.05,
    'max_depth': 5,
    'n_estimators': 100,
    'subsample': 0.8,
    'colsample_bytree': 0.8,
    'reg_alpha': 1,
    'reg_lambda': 10,
    'random_seed': SEED
}

CatBoost_Params = {
    'learning_rate': 0.05,
    'depth': 5,
    'iterations': 100,
    'random_seed': SEED,
    'verbose': 0,
    'l2_leaf_reg': 10
}

# Instantiate models
Light = LGBMRegressor(**LGBMParams)
XGB_Model = XGBRegressor(**XGB_Params)
CatBoost_Model = CatBoostRegressor(**CatBoost_Params)

# Combine models using Voting Regressor
voting_model = VotingRegressor(estimators=[
    ('lightgbm', Light),
    ('xgboost', XGB_Model),
    ('catboost', CatBoost_Model)
])

# ==============================================
# Train Model and Generate Submission
# ==============================================
# Train the ensemble model
submission5 = TrainML(voting_model, test)

# Save submission
# submission5.to_csv('submission.csv', index=False)
# print("Submission file 'submission.csv' has been created.")


Training Folds: 100%|██████████| 10/10 [00:14<00:00,  1.44s/it]

Mean Train QWK --> 0.5710
Mean Validation QWK ---> 0.3647
----> || Optimized QWK SCORE :: 0.453
Submission file 'submission.csv' has been created.



