In [None]:
import numpy as np
import pandas as pd
import os
from sklearn.model_selection import cross_val_score
import xgboost as xgb
from catboost import CatBoostRegressor
from lightgbm import LGBMRegressor
from sklearn.ensemble import VotingRegressor
import plotly.express as px
import seaborn as sns
import matplotlib.pyplot as plt
sns.set_style('darkgrid')
from sklearn.metrics import make_scorer, cohen_kappa_score
import warnings
warnings.simplefilter('ignore')
from concurrent.futures import ThreadPoolExecutor
from tqdm import tqdm
from sklearn.model_selection import StratifiedKFold
from sklearn.base import clone
from IPython.display import clear_output
from scipy.optimize import minimize
from colorama import Fore, Style
from xgboost import XGBRegressor
from sklearn.model_selection import GridSearchCV
from xgboost import XGBRegressor
from sklearn.model_selection import RandomizedSearchCV
import numpy as np
from sklearn.ensemble import RandomForestRegressor
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler

In [None]:
# Summarise one recording: read its parquet file, discard the 'step' column,
# and flatten the describe() table into a single feature vector.
def process_file(filename, dirname):
    """Return ``(stats, identifier)`` for one time-series directory.

    ``filename`` names a sub-directory of ``dirname`` that contains
    ``part-0.parquet``. ``stats`` is the ``DataFrame.describe()`` table of
    every column except ``step``, flattened into a 1-D array.
    ``identifier`` is the second ``=``-separated field of ``filename``.
    """
    parquet_path = os.path.join(dirname, filename, 'part-0.parquet')
    readings = pd.read_parquet(parquet_path).drop(columns='step')
    flat_stats = readings.describe().values.reshape(-1)
    identifier = filename.split('=')[1]
    return flat_stats, identifier

# Build one feature row per entry of `dirname` by summarising each recording
# with process_file on a thread pool.
def load_time_series(dirname) -> pd.DataFrame:
    """Summarise every recording under ``dirname`` into a feature table.

    Each directory entry is passed to ``process_file``. The returned frame
    has one row per entry, columns ``stat_0`` .. ``stat_{k-1}`` holding the
    flattened statistics, and an ``id`` column holding the identifier that
    ``process_file`` extracted from the entry name.
    """
    entries = os.listdir(dirname)

    def summarise(entry):
        return process_file(entry, dirname)

    with ThreadPoolExecutor() as pool:
        # tqdm wraps the lazy map iterator so progress advances as results arrive.
        progress = tqdm(pool.map(summarise, entries), total=len(entries))
        summaries = list(progress)

    feature_rows, identifiers = zip(*summaries)
    n_stats = len(feature_rows[0])
    frame = pd.DataFrame(feature_rows, columns=[f'stat_{k}' for k in range(n_stats)])
    frame['id'] = identifiers
    return frame

In [None]:
# Root directory of the competition data; every input below is read from it,
# so pointing the notebook at another copy of the data is a one-line change.
DATA_DIR = '/kaggle/input/child-mind-institute-problematic-internet-use'

# Load static data
train = pd.read_csv(f'{DATA_DIR}/train.csv')
test = pd.read_csv(f'{DATA_DIR}/test.csv')
sample = pd.read_csv(f'{DATA_DIR}/sample_submission.csv')

# Load time series data for train and test
train_ts = load_time_series(f'{DATA_DIR}/series_train.parquet')
test_ts = load_time_series(f'{DATA_DIR}/series_test.parquet')

In [None]:
# Names of the time-series statistic columns: every column of train_ts except
# the 'id' join key.
time_series_cols = train_ts.columns.tolist()
del time_series_cols[time_series_cols.index("id")]
time_series_cols

In [None]:
# Left-join the time-series statistics onto the tabular rows by 'id'; rows
# with no match in the time-series table get NaN in the joined columns.
train = train.merge(train_ts, on='id', how='left')
test = test.merge(test_ts, on='id', how='left')

In [None]:
# The join key is no longer needed once the tables are merged; remove it from
# both datasets.
train = train.drop(columns='id')
test = test.drop(columns='id')

In [None]:
# Columns kept from the tabular data: 58 feature columns (no 'id') followed by
# the target 'sii', i.e. 59 names in total. Grouped by column-name prefix.
featuresCols = [
    # Basic_Demos-*
    'Basic_Demos-Enroll_Season', 'Basic_Demos-Age', 'Basic_Demos-Sex',
    # CGAS-*
    'CGAS-Season', 'CGAS-CGAS_Score',
    # Physical-*
    'Physical-Season', 'Physical-BMI', 'Physical-Height',
    'Physical-Weight', 'Physical-Waist_Circumference',
    'Physical-Diastolic_BP', 'Physical-HeartRate', 'Physical-Systolic_BP',
    # Fitness_Endurance-*
    'Fitness_Endurance-Season', 'Fitness_Endurance-Max_Stage',
    'Fitness_Endurance-Time_Mins', 'Fitness_Endurance-Time_Sec',
    # FGC-*
    'FGC-Season',
    'FGC-FGC_CU', 'FGC-FGC_CU_Zone',
    'FGC-FGC_GSND', 'FGC-FGC_GSND_Zone',
    'FGC-FGC_GSD', 'FGC-FGC_GSD_Zone',
    'FGC-FGC_PU', 'FGC-FGC_PU_Zone',
    'FGC-FGC_SRL', 'FGC-FGC_SRL_Zone',
    'FGC-FGC_SRR', 'FGC-FGC_SRR_Zone',
    'FGC-FGC_TL', 'FGC-FGC_TL_Zone',
    # BIA-*
    'BIA-Season',
    'BIA-BIA_Activity_Level_num', 'BIA-BIA_BMC', 'BIA-BIA_BMI',
    'BIA-BIA_BMR', 'BIA-BIA_DEE', 'BIA-BIA_ECW', 'BIA-BIA_FFM',
    'BIA-BIA_FFMI', 'BIA-BIA_FMI', 'BIA-BIA_Fat', 'BIA-BIA_Frame_num',
    'BIA-BIA_ICW', 'BIA-BIA_LDM', 'BIA-BIA_LST', 'BIA-BIA_SMM', 'BIA-BIA_TBW',
    # PAQ_A-* / PAQ_C-*
    'PAQ_A-Season', 'PAQ_A-PAQ_A_Total',
    'PAQ_C-Season', 'PAQ_C-PAQ_C_Total',
    # SDS-*
    'SDS-Season', 'SDS-SDS_Total_Raw', 'SDS-SDS_Total_T',
    # PreInt_EduHx-*
    'PreInt_EduHx-Season', 'PreInt_EduHx-computerinternet_hoursday',
    # Target
    'sii',
]
len(featuresCols)

In [None]:
# Add the time series feature columns to the list of features. Only names not
# already in the list are appended, so re-running this cell does not add the
# stat_* names a second time (which would make the selection below return
# duplicated columns).
featuresCols.extend([col for col in time_series_cols if col not in featuresCols])

# Select the specified features from the train dataset, in list order
train = train[featuresCols]

In [None]:
# Keep only labelled rows: those whose target 'sii' is present.
train = train[train['sii'].notna()]

After the above steps, the train dataset contains 2,736 rows (down from 3,960) because rows with a missing 'sii' value were removed. It now has 155 columns: the 58 features that also appear in the test set, the 'sii' column (the target we want to predict), and 96 statistical features (`stat_0` to `stat_95`, derived from the time series data). The test dataset has 20 rows and 154 columns: the same feature columns as the train dataset, without 'sii' (since this is the target to predict).

In [None]:
# Categorical columns holding season values, one per instrument prefix.
season_cat = [
    'Basic_Demos-Enroll_Season',
    'CGAS-Season',
    'Physical-Season',
    'Fitness_Endurance-Season',
    'FGC-Season',
    'BIA-Season',
    'PAQ_A-Season',
    'PAQ_C-Season',
    'SDS-Season',
    'PreInt_EduHx-Season',
]

len(season_cat)

In [None]:
# Fill missing season values with the label 'Missing' and store the season
# columns as pandas categoricals.
def update(df):
    """Clean the season columns of ``df`` in place and return ``df``.

    For every column named in the module-level ``season_cat`` list, NaN is
    replaced by the string ``'Missing'`` and the column is converted to the
    ``category`` dtype. Other columns are left untouched.
    """
    for season_col in season_cat:
        df[season_col] = df[season_col].fillna('Missing').astype('category')
    return df

In [None]:
# Clean the season columns of both datasets (NaN -> 'Missing', category dtype).
train, test = update(train), update(test)

In [None]:
# Number the distinct values of one column in order of first appearance.
def create_mapping(column, dataset):
    """Return ``{value: code}`` for the distinct values of ``dataset[column]``.

    Codes are consecutive integers starting at 0, assigned in the order in
    which ``Series.unique()`` yields the values.
    """
    distinct = dataset[column].unique()
    return dict(zip(distinct, range(len(distinct))))

# Label encoding for season columns, using ONE mapping per column shared by
# the train and test datasets. A mapping numbers values by order of first
# appearance, so building it separately for each dataset could give the same
# season different codes in train and test, and the model would then read the
# test rows with the wrong categories.
for cat in season_cat:
    # Codes are defined by the train dataset ...
    mapping = create_mapping(cat, train)
    # ... and any value that occurs only in test gets the next unused code.
    for value in test[cat].unique():
        mapping.setdefault(value, len(mapping))

    # Go through object dtype so the lookup is a plain per-value dict lookup
    # rather than a categorical-specific operation, then store integer codes.
    train[cat] = train[cat].astype(object).map(mapping).astype(int)
    test[cat] = test[cat].astype(object).map(mapping).astype(int)

In [None]:
# Agreement metric used throughout the notebook: Cohen's kappa with quadratic
# weights.
def quadratic_weighted_kappa(y_true, y_pred):
    """Return ``cohen_kappa_score(y_true, y_pred)`` with ``weights='quadratic'``."""
    kappa = cohen_kappa_score(y_true, y_pred, weights='quadratic')
    return kappa

In [None]:
# Turn continuous predictions into the classes 0..3 using three cut points.
def threshold_rounder(oof_non_rounded, thresholds):
    """Classify each value by the first threshold it falls below.

    A value below ``thresholds[0]`` becomes 0; otherwise a value below
    ``thresholds[1]`` becomes 1; otherwise a value below ``thresholds[2]``
    becomes 2; everything else becomes 3. The conditions are tested in that
    order, so the thresholds do not have to be sorted.
    """
    below_first, below_second, below_third = (
        oof_non_rounded < thresholds[0],
        oof_non_rounded < thresholds[1],
        oof_non_rounded < thresholds[2],
    )
    # np.select takes the choice of the first condition that holds.
    return np.select([below_first, below_second, below_third], [0, 1, 2], default=3)

In [None]:
# Objective for the threshold search: the negated QWK of thresholded predictions.
def _negative_qwk(thresholds, y_true, oof_non_rounded):
    """Return minus the QWK obtained by cutting ``oof_non_rounded`` at ``thresholds``.

    ``scipy.optimize.minimize`` minimises its objective, so the kappa is
    negated to make the search maximise agreement with ``y_true``.
    """
    rounded = threshold_rounder(oof_non_rounded, thresholds)
    return -quadratic_weighted_kappa(y_true, rounded)


# Define a function to train the model, perform cross-validation, and evaluate using QWK
def TrainML(model_class, test_data, n_splits=5):
    """Cross-validate an estimator on the global ``train`` frame and build a submission.

    Parameters
    ----------
    model_class : estimator
        Unfitted scikit-learn-style regressor (an instance, despite the
        name). It is cloned for every fold and is never fitted itself.
    test_data : pandas.DataFrame
        Rows to predict; passed unchanged to each fold model's ``predict``.
    n_splits : int, default 5
        Number of stratified folds; also the number of fold models whose test
        predictions are averaged.

    Returns
    -------
    pandas.DataFrame
        Columns ``'id'`` (taken from the global ``sample`` frame) and
        ``'sii'`` (averaged fold predictions cut at the tuned thresholds).

    Notes
    -----
    Reads the module-level ``train`` frame (features plus the ``'sii'``
    target) and ``sample`` frame. Raises ``AssertionError`` if the threshold
    optimisation reports failure.
    """
    # Split data into features (X) and target (y)
    X = train.drop(['sii'], axis=1)
    y = train['sii']

    # Define StratifiedKFold for cross-validation
    SFK = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=12)

    train_S = []  # QWK of each fold model on its own training rows
    test_S = []   # QWK of each fold model on its held-out rows

    oof_non_rounded = np.zeros(len(y), dtype=float)      # Out-of-fold predictions (non-rounded)
    test_preds = np.zeros((len(test_data), n_splits))    # One column of test predictions per fold

    # Loop over each fold for cross-validation
    folds = tqdm(SFK.split(X, y), desc="Training Folds", total=n_splits)
    for fold, (train_idx, val_idx) in enumerate(folds):
        X_train, X_val = X.iloc[train_idx], X.iloc[val_idx]
        y_train, y_val = y.iloc[train_idx], y.iloc[val_idx]

        # Clone so every fold starts from an unfitted copy of the estimator
        model = clone(model_class)
        model.fit(X_train, y_train)

        # Make predictions on training and validation data
        y_train_pred = model.predict(X_train)
        y_val_pred = model.predict(X_val)

        # Keep the raw held-out predictions for the threshold search below
        oof_non_rounded[val_idx] = y_val_pred
        y_val_pred_rounded = y_val_pred.round(0).astype(int)

        # Calculate QWK for training and validation using plain rounding
        train_kappa = quadratic_weighted_kappa(y_train, y_train_pred.round(0).astype(int))
        val_kappa = quadratic_weighted_kappa(y_val, y_val_pred_rounded)

        train_S.append(train_kappa)
        test_S.append(val_kappa)

        # Store this fold model's test predictions
        test_preds[:, fold] = model.predict(test_data)

        # Print fold results; wait=True delays the clear until the next output
        # arrives, so each fold's line is replaced by the following one.
        print(f"Fold {fold+1} - Train QWK: {train_kappa:.4f}, Validation QWK: {val_kappa:.4f}")
        clear_output(wait=True)

    # Print average QWK scores
    print(f"Mean Train QWK --> {np.mean(train_S):.4f}")
    print(f"Mean Validation QWK ---> {np.mean(test_S):.4f}")

    # Tune the three cut points on the out-of-fold predictions, starting from
    # plain rounding (0.5, 1.5, 2.5).
    KappaOPtimizer = minimize(_negative_qwk,
                              x0=[0.5, 1.5, 2.5], args=(y, oof_non_rounded),
                              method='Nelder-Mead')

    assert KappaOPtimizer.success, "Optimization did not converge."

    # Apply optimized thresholds to the out-of-fold predictions
    oof_tuned = threshold_rounder(oof_non_rounded, KappaOPtimizer.x)
    tKappa = quadratic_weighted_kappa(y, oof_tuned)

    print(f"----> || Optimized QWK SCORE :: {Fore.CYAN}{Style.BRIGHT} {tKappa:.3f}{Style.RESET_ALL}")

    # Average test predictions across folds and apply the tuned thresholds
    tpm = test_preds.mean(axis=1)
    tpTuned = threshold_rounder(tpm, KappaOPtimizer.x)

    # Create submission DataFrame
    submission = pd.DataFrame({
        'id': sample['id'],
        'sii': tpTuned
    })

    return submission

In [None]:
# Hyper-parameters passed to XGBRegressor
XGB_Params2 = dict(
    max_depth=10,
    learning_rate=0.05,
    n_estimators=200,
    subsample=0.6,
    colsample_bytree=1.0,
    reg_lambda=1,
    reg_alpha=5,
)

# Hyper-parameters passed to CatBoostRegressor
CatBoost_Params1 = dict(
    depth=5,
    learning_rate=0.05,
    iterations=100,
    subsample=0.6,
    l2_leaf_reg=1,
    random_strength=0,
)

# Hyper-parameters passed to LGBMRegressor
LGBM_Params1 = dict(
    learning_rate=0.05,
    max_depth=8,
    num_leaves=100,
    min_data_in_leaf=5,
    feature_fraction=0.8,
    bagging_fraction=0.7,
    bagging_freq=6,
    lambda_l1=10,
    lambda_l2=0.1,
)

In [None]:
# Median imputer used as the template for the first step of every pipeline
imputer = SimpleImputer(strategy='median')

# One regressor per library, each configured from its parameter dict
optimized_XGB_model = XGBRegressor(random_state=12, **XGB_Params2)
optimized_CatBoost_model = CatBoostRegressor(random_state=12, silent=True, **CatBoost_Params1)
optimized_LGBM_model = LGBMRegressor(random_state=12, **LGBM_Params1)

# Every pipeline gets its own unfitted copies (via clone) of the imputer and of
# its regressor. The XGBoost and LightGBM pipelines standardise the features
# after imputation; the CatBoost pipeline does not.
xgb_pipeline = Pipeline(steps=[
    ('imputer', clone(imputer)),
    ('scaler', StandardScaler()),
    ('regressor', clone(optimized_XGB_model)),
])
cat_pipeline = Pipeline(steps=[
    ('imputer', clone(imputer)),
    ('regressor', clone(optimized_CatBoost_model)),
])
lgb_pipeline = Pipeline(steps=[
    ('imputer', clone(imputer)),
    ('scaler', StandardScaler()),
    ('regressor', clone(optimized_LGBM_model)),
])

# Ensemble of the three pipelines combined by VotingRegressor
voting_model = VotingRegressor(estimators=[
    ('xgb', xgb_pipeline),
    ('cat', cat_pipeline),
    ('lgb', lgb_pipeline),
])

In [None]:
# Cross-validate the voting ensemble and build the submission frame from its
# predictions on the test dataset.
submission = TrainML(model_class=voting_model, test_data=test)

In [None]:
# Write the submission to the working directory without the index column.
submission.to_csv(path_or_buf='submission.csv', index=False)