In [None]:
# Standard library
import os
from concurrent.futures import ThreadPoolExecutor

# Data manipulation and analysis
import pandas as pd

# Visualization
import matplotlib.pyplot as plt
from matplotlib.ticker import MaxNLocator, FormatStrFormatter, PercentFormatter
import seaborn as sns

# Machine learning and preprocessing
from sklearn.base import BaseEstimator, RegressorMixin, clone
from sklearn.metrics import cohen_kappa_score
from sklearn.model_selection import StratifiedKFold, train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.impute import SimpleImputer, KNNImputer
from sklearn.ensemble import VotingRegressor, RandomForestRegressor, GradientBoostingRegressor
from sklearn.pipeline import Pipeline

# Specialized ML models
from lightgbm import LGBMRegressor
from xgboost import XGBRegressor
from catboost import CatBoostRegressor
from pytorch_tabnet.tab_model import TabNetRegressor
from pytorch_tabnet.callbacks import Callback

# Deep learning
from keras.models import Model
from keras.layers import Input, Dense
from keras.optimizers import Adam
import torch
import torch.nn as nn
import torch.optim as optim

# Utility
from colorama import Fore, Style
from IPython.display import clear_output
from tqdm import tqdm


In [None]:
!pip -q install /kaggle/input/tabnet/pytorch/v1/1/pytorch_tabnet-4.1.0-py3-none-any.whl

In [None]:
def process_file(filename, dirname):
    """Summarise one time-series parquet file as a flat feature vector.

    Reads ``<dirname>/<filename>/part-0.parquet``, discards its ``step``
    column and flattens the ``describe()`` table of the remaining columns
    into a 1-D array.

    Returns:
        tuple: ``(features, identifier)`` where ``identifier`` is the second
        ``=``-separated field of ``filename``.
    """
    parquet_path = os.path.join(dirname, filename, 'part-0.parquet')
    series = pd.read_parquet(parquet_path).drop(columns='step')
    summary = series.describe().to_numpy().reshape(-1)
    identifier = filename.split('=')[1]
    return summary, identifier

def load_data_parquet(dirname) -> pd.DataFrame:
    """Build one row of summary statistics per entry of ``dirname``.

    Every directory entry is handed to ``process_file`` on a thread pool. The
    returned feature vectors become the columns ``stat_0 .. stat_{k-1}`` and
    the returned identifiers become the ``id`` column.

    Args:
        dirname: Directory whose entries are passed to ``process_file``.

    Returns:
        pd.DataFrame: One row per directory entry. For an empty directory an
        empty frame that only has the ``id`` column is returned, so callers
        can still merge on ``id`` (previously the ``zip(*results)`` unpacking
        raised ``ValueError`` in that case).
    """
    ids = os.listdir(dirname)
    if not ids:
        # Nothing to summarise: skip the thread pool and return an id-only frame.
        return pd.DataFrame({'id': pd.Series(dtype=object)})

    with ThreadPoolExecutor() as executor:
        results = list(tqdm(executor.map(lambda fname: process_file(fname, dirname), ids), total=len(ids)))
    stats, indexes = zip(*results)
    # The width of the first feature vector decides how many stat_* columns exist.
    df = pd.DataFrame(stats, columns=[f"stat_{i}" for i in range(len(stats[0]))])
    df['id'] = indexes
    return df

In [None]:
from sklearn.preprocessing import OneHotEncoder
# Tabular competition files.
train = pd.read_csv('/kaggle/input/child-mind-institute-problematic-internet-use/train.csv')
test = pd.read_csv('/kaggle/input/child-mind-institute-problematic-internet-use/test.csv')
sample = pd.read_csv('/kaggle/input/child-mind-institute-problematic-internet-use/sample_submission.csv')

# Columns kept from train.csv, grouped by their name prefix; 'sii' comes last.
featuresCols = [
    # Basic_Demos
    'Basic_Demos-Enroll_Season', 'Basic_Demos-Age', 'Basic_Demos-Sex',
    # CGAS
    'CGAS-Season', 'CGAS-CGAS_Score',
    # Physical
    'Physical-Season', 'Physical-BMI', 'Physical-Height', 'Physical-Weight',
    'Physical-Waist_Circumference', 'Physical-Diastolic_BP',
    'Physical-HeartRate', 'Physical-Systolic_BP',
    # Fitness_Endurance
    'Fitness_Endurance-Season', 'Fitness_Endurance-Max_Stage',
    'Fitness_Endurance-Time_Mins', 'Fitness_Endurance-Time_Sec',
    # FGC
    'FGC-Season',
    'FGC-FGC_CU', 'FGC-FGC_CU_Zone',
    'FGC-FGC_GSND', 'FGC-FGC_GSND_Zone',
    'FGC-FGC_GSD', 'FGC-FGC_GSD_Zone',
    'FGC-FGC_PU', 'FGC-FGC_PU_Zone',
    'FGC-FGC_SRL', 'FGC-FGC_SRL_Zone',
    'FGC-FGC_SRR', 'FGC-FGC_SRR_Zone',
    'FGC-FGC_TL', 'FGC-FGC_TL_Zone',
    # BIA
    'BIA-Season', 'BIA-BIA_Activity_Level_num', 'BIA-BIA_BMC', 'BIA-BIA_BMI',
    'BIA-BIA_BMR', 'BIA-BIA_DEE', 'BIA-BIA_ECW', 'BIA-BIA_FFM',
    'BIA-BIA_FFMI', 'BIA-BIA_FMI', 'BIA-BIA_Fat', 'BIA-BIA_Frame_num',
    'BIA-BIA_ICW', 'BIA-BIA_LDM', 'BIA-BIA_LST', 'BIA-BIA_SMM',
    'BIA-BIA_TBW',
    # PAQ_A / PAQ_C
    'PAQ_A-Season', 'PAQ_A-PAQ_A_Total',
    'PAQ_C-Season', 'PAQ_C-PAQ_C_Total',
    # SDS
    'SDS-Season', 'SDS-SDS_Total_Raw', 'SDS-SDS_Total_T',
    # PreInt_EduHx
    'PreInt_EduHx-Season', 'PreInt_EduHx-computerinternet_hoursday',
    # Column that train_model later uses as the target.
    'sii',
]

# The season-valued columns among the features above.
cat_c = [
    'Basic_Demos-Enroll_Season',
    'CGAS-Season',
    'Physical-Season',
    'Fitness_Endurance-Season',
    'FGC-Season',
    'BIA-Season',
    'PAQ_A-Season',
    'PAQ_C-Season',
    'SDS-Season',
    'PreInt_EduHx-Season',
]

# Per-id summary statistics of the time-series parquet folders.
train_ts = load_data_parquet("/kaggle/input/child-mind-institute-problematic-internet-use/series_train.parquet")
test_ts = load_data_parquet("/kaggle/input/child-mind-institute-problematic-internet-use/series_test.parquet")

# Every statistics column, i.e. everything in the time-series frame except the key.
time_series_cols = [col for col in train_ts.columns if col != "id"]

# Left joins keep rows without time-series data; the key is dropped afterwards.
train = train.merge(train_ts, how="left", on='id').drop(columns='id')
test = test.merge(test_ts, how="left", on='id').drop(columns='id')

# Restrict the training frame to the listed features plus the statistics columns.
featuresCols = featuresCols + time_series_cols
train = train[featuresCols]


In [None]:
def fill_nan_values(data, n_neighbors=71):
    """Impute missing values in the float64/int64 columns with a KNN imputer.

    Args:
        data: Input frame; it is not modified.
        n_neighbors: Number of neighbours handed to ``KNNImputer``
            (defaults to the previously hard-coded 71).

    Returns:
        pd.DataFrame: A new frame that keeps ``data``'s index. The imputed
        float64/int64 columns come first, followed by every other column
        copied over unchanged. If an imputed ``sii`` column is present it is
        rounded and cast to ``int``.
    """
    numeric_cols = data.select_dtypes(include=['float64', 'int64']).columns

    # keep_empty_features=True keeps columns that are entirely NaN in the
    # output; without it the imputer drops them and the column labels below
    # no longer match the width of the imputed array.
    imputer = KNNImputer(n_neighbors=n_neighbors, keep_empty_features=True)
    imputed_data = imputer.fit_transform(data[numeric_cols])

    # Reuse the caller's index so the column copies below align row by row
    # even when `data` does not carry a default RangeIndex.
    data_imputed = pd.DataFrame(imputed_data, columns=numeric_cols, index=data.index)

    if 'sii' in data_imputed.columns:
        data_imputed['sii'] = data_imputed['sii'].round().astype(int)

    # Columns that were not imputed are appended untouched.
    for col in data.columns:
        if col not in numeric_cols:
            data_imputed[col] = data[col]

    return data_imputed

def handle_category_data(data):
    """One-hot encode every object/category column against the four seasons.

    Each such column is removed and replaced by four integer indicator
    columns named ``<column>_Spring``, ``<column>_Summer``, ``<column>_Fall``
    and ``<column>_Winter``. The indicator columns are appended after the
    remaining columns, in the order the encoded columns appeared. The input
    frame is not modified.
    """
    seasons = ['Spring', 'Summer', 'Fall', 'Winter']
    season_encoder = OneHotEncoder(categories=[seasons], sparse_output=False, handle_unknown='ignore', dtype=int)

    categorical_cols = list(data.select_dtypes(include=['object', 'category']).columns)

    # Start from the untouched columns, then add one indicator frame per encoded column.
    pieces = [data.drop(columns=categorical_cols)]
    for name in categorical_cols:
        indicators = season_encoder.fit_transform(data[[name]])
        indicator_names = [f"{name}_{season}" for season in seasons]
        pieces.append(pd.DataFrame(indicators, columns=indicator_names, index=data.index))

    return pd.concat(pieces, axis=1)

# Impute first, then one-hot encode the season columns - same steps for both frames.
train = handle_category_data(fill_nan_values(train))
test = handle_category_data(fill_nan_values(test))

In [None]:
import numpy as np
from sklearn.metrics import cohen_kappa_score

# Map +inf / -inf to NaN in the training frame; the frame is left as-is when
# it holds no infinite values.
if np.isinf(train).to_numpy().any():
    train = train.replace([np.inf, -np.inf], np.nan)

def quadratic_weighted_kappa(y_true, y_pred):
    """Return Cohen's kappa between ``y_true`` and ``y_pred`` with quadratic weights."""
    score = cohen_kappa_score(y_true, y_pred, weights='quadratic')
    return score

def threshold_rounder(oof_non_rounded, thresholds):
    """Convert continuous predictions into class indices using cut points.

    A prediction ``p`` gets class ``i`` when ``t[i-1] < p <= t[i]`` for the
    ascending cut points ``t``; values above the last cut point get
    ``len(thresholds)``.

    Args:
        oof_non_rounded: Array-like of continuous predictions.
        thresholds: Cut points in any order.

    Returns:
        np.ndarray: Integer class index for every prediction.
    """
    # np.digitize rejects non-monotonic bins and reverses the labelling for
    # decreasing ones. An optimiser searching over the cut points can propose
    # either, so the cut points are put in ascending order first.
    ordered = np.sort(np.asarray(thresholds, dtype=float))
    return np.digitize(oof_non_rounded, bins=ordered, right=True)

def evaluate_predictions(thresholds, y_true, oof_non_rounded):
    """Objective for the threshold search: the negated quadratic weighted kappa.

    The continuous predictions are discretised with ``thresholds`` and scored
    against ``y_true``; the sign is flipped so that a minimiser maximises
    the kappa.
    """
    discretised = threshold_rounder(oof_non_rounded, thresholds)
    kappa = quadratic_weighted_kappa(y_true, discretised)
    return -kappa

In [None]:
import numpy as np
import pandas as pd
from sklearn.model_selection import StratifiedKFold
from sklearn.base import clone
from scipy.optimize import minimize
from tqdm import tqdm
from IPython.display import clear_output
from colorama import Fore, Style

def train_model(model, test_data, train_data, sample, n_splits=5, seed=42):
    """Cross-validate ``model``, tune rounding thresholds and build a submission.

    For every stratified fold a fresh clone of ``model`` is fitted and used to
    predict the training part, the validation part and ``test_data``. The
    out-of-fold predictions are then used to search (Nelder-Mead, starting at
    ``[0.5, 1.5, 2.5]``) for the cut points that maximise the quadratic
    weighted kappa, and those cut points are applied to the fold-averaged
    test predictions.

    Args:
        model: Estimator that works with ``sklearn.base.clone`` and offers
            ``fit`` / ``predict``.
        test_data: Feature frame to predict for the submission.
        train_data: Training frame; its ``sii`` column is the target and all
            other columns are features.
        sample: Frame whose ``id`` column supplies the submission ids.
        n_splits: Number of stratified folds.
        seed: Random state of the fold shuffling.

    Returns:
        pd.DataFrame: Columns ``id`` and ``sii`` (thresholded test predictions).

    Raises:
        AssertionError: If the threshold optimisation reports no convergence.
    """
    X = train_data.drop(['sii'], axis=1)
    y = train_data['sii']

    skf = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=seed)

    train_scores = []
    val_scores = []

    # Continuous out-of-fold predictions; each row is written by exactly one fold.
    oof_predictions = np.zeros(len(y), dtype=float)
    # One column of test predictions per fold, averaged after the loop.
    test_predictions = np.zeros((len(test_data), n_splits))

    for fold, (train_idx, val_idx) in enumerate(tqdm(skf.split(X, y), desc="Training Folds", total=n_splits)):
        X_train, X_val = X.iloc[train_idx], X.iloc[val_idx]
        y_train, y_val = y.iloc[train_idx], y.iloc[val_idx]

        # Clone so that no fitted state is shared between folds.
        model_instance = clone(model)
        model_instance.fit(X_train, y_train)

        y_train_pred = model_instance.predict(X_train)
        y_val_pred = model_instance.predict(X_val)

        oof_predictions[val_idx] = y_val_pred

        # Per-fold scores use plain rounding; tuned thresholds come after the loop.
        train_kappa = quadratic_weighted_kappa(y_train, y_train_pred.round(0).astype(int))
        val_kappa = quadratic_weighted_kappa(y_val, y_val_pred.round(0).astype(int))

        train_scores.append(train_kappa)
        val_scores.append(val_kappa)

        test_predictions[:, fold] = model_instance.predict(test_data)

        print(f"Fold {fold+1} - Train QWK: {train_kappa:.4f}, Validation QWK: {val_kappa:.4f}")
        clear_output(wait=True)

    print(f"Mean Train QWK --> {np.mean(train_scores):.4f}")
    print(f"Mean Validation QWK ---> {np.mean(val_scores):.4f}")

    kappa_optimizer = minimize(evaluate_predictions,
                               x0=[0.5, 1.5, 2.5], args=(y, oof_predictions),
                               method='Nelder-Mead')
    if not kappa_optimizer.success:
        # Explicit raise instead of an `assert` statement so the check is not
        # stripped under `python -O`; the exception type is kept for callers.
        raise AssertionError("Optimization did not converge.")

    oof_tuned = threshold_rounder(oof_predictions, kappa_optimizer.x)
    tuned_kappa = quadratic_weighted_kappa(y, oof_tuned)

    print(f"----> || Optimized QWK SCORE :: {Fore.CYAN}{Style.BRIGHT} {tuned_kappa:.3f}{Style.RESET_ALL}")

    # Average the per-fold test predictions before applying the tuned cut points.
    test_predictions_mean = test_predictions.mean(axis=1)
    test_predictions_tuned = threshold_rounder(test_predictions_mean, kappa_optimizer.x)

    submission = pd.DataFrame({
        'id': sample['id'],
        'sii': test_predictions_tuned
    })

    return submission

# Example usage:
# SEED = 2004
# n_splits = 5
# model = VotingRegressor(estimators=[...])
# submission = train_model(model, test, train, sample, n_splits=n_splits, seed=SEED)

In [None]:
# Shared run configuration.
SEED = 2004
n_splits = 5

# LightGBM hyper-parameters.
# NOTE(review): subsample/bagging_fraction and colsample_bytree/feature_fraction
# look like LightGBM aliases of each other - confirm which value takes effect.
Params = dict(
    learning_rate=0.09970294901245966,
    max_depth=3,
    subsample=0.9651688449975022,
    colsample_bytree=0.616732288405486,
    num_leaves=34,
    min_data_in_leaf=68,
    feature_fraction=0.6476169754611282,
    bagging_fraction=0.9184091064527949,
    bagging_freq=10,
    reg_alpha=0.015879148435808108,
    reg_lambda=0.0036854044260839643,
)

# XGBoost hyper-parameters.
# NOTE(review): `use_gpu` does not look like a documented XGBoost parameter -
# confirm whether it has any effect.
XGB_Params = dict(
    learning_rate=0.29682190417298865,
    max_depth=4,
    n_estimators=796,
    subsample=0.7542484622989069,
    colsample_bytree=0.886399359731497,
    reg_alpha=0.014681067600657996,
    reg_lambda=9.209859894025579,
    gamma=0.06495942878096272,
    min_child_weight=13,
    use_gpu=True,
)

# CatBoost hyper-parameters.
CatBoost_Params = dict(
    learning_rate=0.026392650714515364,
    depth=15,
    l2_leaf_reg=0.0018692968691208557,
    iterations=637,
    bagging_temperature=0.45636037003578794,
    random_strength=7.2357605130667455,
    border_count=135,
    grow_policy='Lossguide',
)

# Tuned model instances built from the dictionaries above.
# NOTE(review): the ensemble cell visible below builds its own default models
# and does not reference these three - confirm whether they are used elsewhere.
Light = LGBMRegressor(n_estimators=300, random_state=SEED, verbose=-1, **Params)
XGB_Model = XGBRegressor(**XGB_Params)
CatBoost_Model = CatBoostRegressor(**CatBoost_Params)


In [None]:
# Voting ensemble of five regressors, each with default settings apart from the
# seed (and silent=True for CatBoost), each wrapped in a single-step Pipeline
# under the step name 'regressor'.
ensemble = VotingRegressor(estimators=[
    (label, Pipeline(steps=[('regressor', regressor)]))
    for label, regressor in [
        ('lgb', LGBMRegressor(random_state=SEED)),
        ('xgb', XGBRegressor(random_state=SEED)),
        ('cat', CatBoostRegressor(random_state=SEED, silent=True)),
        ('rf', RandomForestRegressor(random_state=SEED)),
        ('gb', GradientBoostingRegressor(random_state=SEED)),
    ]
])

# Cross-validate the ensemble and build the submission frame; the bare name on
# the last line displays it as the cell output.
Submission3 = train_model(ensemble, test, train, sample, n_splits=n_splits, seed=SEED)
Submission3