# Libraries

In [1]:
import gc
import pandas as pd
import numpy as np
import datetime as dt

import matplotlib.pyplot as plt
import matplotlib.cm as cm
import seaborn as sns

import plotly.graph_objects as go
from plotly.subplots import make_subplots
import plotly.express as px
import plotly.offline

from colorama import Fore, Style, init
from pprint import pprint

# 🚫 Suppressing warnings 🚫
import warnings
warnings.filterwarnings('ignore')

In [2]:
import numpy as np
import pandas as pd
import lightgbm as lgb
import optuna
from sklearn.metrics import cohen_kappa_score, make_scorer, confusion_matrix
from sklearn.model_selection import StratifiedKFold, cross_val_score
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.neural_network import MLPClassifier
from imblearn.over_sampling import SMOTE
import seaborn as sns
import matplotlib.pyplot as plt
from scipy.optimize import minimize
import xgboost as xgb

In [3]:
SEED = 42
n_splits = 5

# Load data

In [4]:
test = pd.read_csv('/kaggle/input/child-mind-institute-problematic-internet-use/test.csv')
len(test)

20

In [5]:
test.head(5)

Unnamed: 0,id,Basic_Demos-Enroll_Season,Basic_Demos-Age,Basic_Demos-Sex,CGAS-Season,CGAS-CGAS_Score,Physical-Season,Physical-BMI,Physical-Height,Physical-Weight,...,BIA-BIA_TBW,PAQ_A-Season,PAQ_A-PAQ_A_Total,PAQ_C-Season,PAQ_C-PAQ_C_Total,SDS-Season,SDS-SDS_Total_Raw,SDS-SDS_Total_T,PreInt_EduHx-Season,PreInt_EduHx-computerinternet_hoursday
0,00008ff9,Fall,5,0,Winter,51.0,Fall,16.877316,46.0,50.8,...,32.6909,,,,,,,,Fall,3.0
1,000fd460,Summer,9,0,,,Fall,14.03559,48.0,46.0,...,27.0552,,,Fall,2.34,Fall,46.0,64.0,Summer,0.0
2,00105258,Summer,10,1,Fall,71.0,Fall,16.648696,56.5,75.6,...,,,,Summer,2.17,Fall,38.0,54.0,Summer,2.0
3,00115b9f,Winter,9,0,Fall,71.0,Summer,18.292347,56.0,81.6,...,45.9966,,,Winter,2.451,Summer,31.0,45.0,Winter,0.0
4,0016bb22,Spring,18,1,Summer,,,,,,...,,Summer,1.04,,,,,,,


In [6]:
df = pd.read_csv('/kaggle/input/colombian-frenchteam-problematicinternetusage/Dataset_problematic_internet_usage.csv')
len(df)

3960

In [7]:
df.head(5)

Unnamed: 0.1,Unnamed: 0,id,Basic_Demos-Enroll_Season,Basic_Demos-Age,Basic_Demos-Sex,CGAS-Season,CGAS-CGAS_Score,Physical-Season,Physical-BMI,Physical-Height,...,PCIAT-PCIAT_20,PCIAT-PCIAT_Total,SDS-Season,SDS-SDS_Total_Raw,SDS-SDS_Total_T,PreInt_EduHx-Season,PreInt_EduHx-computerinternet_hoursday,sii,Has_actigraphy_data,Train_Test_Label
0,0,00008ff9,Fall,5,0,Winter,51.0,Fall,16.877316,46.0,...,4.0,55.0,,,,Fall,3.0,2.0,No,isnotintrainset
1,1,000fd460,Summer,9,0,,,Fall,14.03559,48.0,...,0.0,0.0,Fall,46.0,64.0,Summer,0.0,0.0,No,isnotintrainset
2,2,00105258,Summer,10,1,Fall,71.0,Fall,16.648696,56.5,...,1.0,28.0,Fall,38.0,54.0,Summer,2.0,0.0,No,isnotintrainset
3,3,00115b9f,Winter,9,0,Fall,71.0,Summer,18.292347,56.0,...,1.0,44.0,Summer,31.0,45.0,Winter,0.0,1.0,Yes,train
4,4,0016bb22,Spring,18,1,Summer,,,,,...,,,,,,,,,No,isnotintrainset


In [8]:
sii_counts = df['sii'].value_counts(dropna=False)
sii_percentages = (sii_counts / sii_counts.sum()) * 100
sii_summary = pd.DataFrame({'Count': sii_counts,'Percentage': sii_percentages})

print(sii_summary)

     Count  Percentage
sii                   
0.0   1594   40.252525
NaN   1224   30.909091
1.0    730   18.434343
2.0    378    9.545455
3.0     34    0.858586


# Processing the data

In [9]:
featuresCols = ['Basic_Demos-Enroll_Season', 'Basic_Demos-Age', 'Basic_Demos-Sex',
                'CGAS-Season', 'CGAS-CGAS_Score', 'Physical-Season', 'Physical-BMI',
                'Physical-Height', 'Physical-Weight', 'Physical-Waist_Circumference',
                'Physical-Diastolic_BP', 'Physical-HeartRate', 'Physical-Systolic_BP',
                'Fitness_Endurance-Season', 'Fitness_Endurance-Max_Stage',
                'Fitness_Endurance-Time_Mins', 'Fitness_Endurance-Time_Sec',
                'FGC-Season', 'FGC-FGC_CU', 'FGC-FGC_CU_Zone', 'FGC-FGC_GSND',
                'FGC-FGC_GSND_Zone', 'FGC-FGC_GSD', 'FGC-FGC_GSD_Zone', 'FGC-FGC_PU',
                'FGC-FGC_PU_Zone', 'FGC-FGC_SRL', 'FGC-FGC_SRL_Zone', 'FGC-FGC_SRR',
                'FGC-FGC_SRR_Zone', 'FGC-FGC_TL', 'FGC-FGC_TL_Zone', 'BIA-Season',
                'BIA-BIA_Activity_Level_num', 'BIA-BIA_BMC', 'BIA-BIA_BMI',
                'BIA-BIA_BMR', 'BIA-BIA_DEE', 'BIA-BIA_ECW', 'BIA-BIA_FFM',
                'BIA-BIA_FFMI', 'BIA-BIA_FMI', 'BIA-BIA_Fat', 'BIA-BIA_Frame_num',
                'BIA-BIA_ICW', 'BIA-BIA_LDM', 'BIA-BIA_LST', 'BIA-BIA_SMM',
                'BIA-BIA_TBW', 'PAQ_A-Season', 'PAQ_A-PAQ_A_Total', 'PAQ_C-Season',
                'PAQ_C-PAQ_C_Total', 'SDS-Season', 'SDS-SDS_Total_Raw',
                'SDS-SDS_Total_T', 'PreInt_EduHx-Season',
                'PreInt_EduHx-computerinternet_hoursday', 'sii']

In [10]:
df = df[featuresCols]
df = df.dropna(subset='sii')
len(df)

2736

In [11]:
df_unchanged = df.copy()

In [12]:
sii_counts = df['sii'].value_counts(dropna=False)
sii_percentages = (sii_counts / sii_counts.sum()) * 100
sii_summary = pd.DataFrame({'Count': sii_counts,'Percentage': sii_percentages})

print(sii_summary)

     Count  Percentage
sii                   
0.0   1594   58.260234
1.0    730   26.681287
2.0    378   13.815789
3.0     34    1.242690


In [13]:
def update(df):
    for c in cat_c: 
        df[c] = df[c].fillna('Missing')
        df[c] = df[c].astype('category')
    return df

In [14]:
cat_c = ['Basic_Demos-Enroll_Season', 'CGAS-Season', 'Physical-Season', 'Fitness_Endurance-Season', 
          'FGC-Season', 'BIA-Season', 'PAQ_A-Season', 'PAQ_C-Season', 'SDS-Season', 'PreInt_EduHx-Season']

df = update(df)

In [15]:
def create_mapping(column, dataset):
    unique_values = dataset[column].unique()
    return {value: idx for idx, value in enumerate(unique_values)}

In [16]:
for col in cat_c:
    mapping_df = create_mapping(col, df)
    df[col] = df[col].replace(mapping_df).astype(int)

print(f'df Shape : {df.shape}')

df Shape : (2736, 59)


In [17]:
from sklearn.impute import SimpleImputer

imputer = SimpleImputer(strategy='mean')
numeric_columns = df.select_dtypes(include=['float64', 'int64']).columns
df[numeric_columns] = imputer.fit_transform(df[numeric_columns])

In [18]:
df.head()

Unnamed: 0,Basic_Demos-Enroll_Season,Basic_Demos-Age,Basic_Demos-Sex,CGAS-Season,CGAS-CGAS_Score,Physical-Season,Physical-BMI,Physical-Height,Physical-Weight,Physical-Waist_Circumference,...,PAQ_A-Season,PAQ_A-PAQ_A_Total,PAQ_C-Season,PAQ_C-PAQ_C_Total,SDS-Season,SDS-SDS_Total_Raw,SDS-SDS_Total_T,PreInt_EduHx-Season,PreInt_EduHx-computerinternet_hoursday,sii
0,0.0,5.0,0.0,0.0,51.0,0.0,16.877316,46.0,50.8,26.625259,...,0.0,2.185758,0.0,2.590608,0.0,40.977839,57.647525,0.0,3.0,2.0
1,1.0,9.0,0.0,1.0,65.159266,0.0,14.03559,48.0,46.0,22.0,...,0.0,2.185758,1.0,2.34,1.0,46.0,64.0,1.0,0.0,0.0
2,1.0,10.0,1.0,2.0,71.0,0.0,16.648696,56.5,75.6,26.625259,...,0.0,2.185758,2.0,2.17,1.0,38.0,54.0,1.0,2.0,0.0
3,2.0,9.0,0.0,2.0,71.0,1.0,18.292347,56.0,81.6,26.625259,...,0.0,2.185758,3.0,2.451,2.0,31.0,45.0,2.0,0.0,1.0
5,3.0,13.0,1.0,0.0,50.0,1.0,22.279952,59.5,112.2,26.625259,...,0.0,2.185758,4.0,4.11,2.0,40.0,56.0,3.0,0.0,1.0


In [19]:
df_unchanged.head()

Unnamed: 0,Basic_Demos-Enroll_Season,Basic_Demos-Age,Basic_Demos-Sex,CGAS-Season,CGAS-CGAS_Score,Physical-Season,Physical-BMI,Physical-Height,Physical-Weight,Physical-Waist_Circumference,...,PAQ_A-Season,PAQ_A-PAQ_A_Total,PAQ_C-Season,PAQ_C-PAQ_C_Total,SDS-Season,SDS-SDS_Total_Raw,SDS-SDS_Total_T,PreInt_EduHx-Season,PreInt_EduHx-computerinternet_hoursday,sii
0,Fall,5,0,Winter,51.0,Fall,16.877316,46.0,50.8,,...,,,,,,,,Fall,3.0,2.0
1,Summer,9,0,,,Fall,14.03559,48.0,46.0,22.0,...,,,Fall,2.34,Fall,46.0,64.0,Summer,0.0,0.0
2,Summer,10,1,Fall,71.0,Fall,16.648696,56.5,75.6,,...,,,Summer,2.17,Fall,38.0,54.0,Summer,2.0,0.0
3,Winter,9,0,Fall,71.0,Summer,18.292347,56.0,81.6,,...,,,Winter,2.451,Summer,31.0,45.0,Winter,0.0,1.0
5,Spring,13,1,Winter,50.0,Summer,22.279952,59.5,112.2,,...,,,Spring,4.11,Summer,40.0,56.0,Spring,0.0,1.0


# Modelling 

In [20]:
import numpy as np
import pandas as pd
import lightgbm as lgb
import optuna
from sklearn.metrics import cohen_kappa_score, make_scorer, confusion_matrix
from sklearn.model_selection import StratifiedKFold, cross_val_score
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.neural_network import MLPClassifier
from imblearn.over_sampling import SMOTE
import seaborn as sns
import matplotlib.pyplot as plt
from scipy.optimize import minimize

In [21]:
from sklearn.ensemble import RandomForestRegressor
from xgboost import XGBRegressor
from sklearn.neural_network import MLPRegressor
import lightgbm as lgb
from scipy.optimize import minimize
from sklearn.metrics import mean_squared_error

In [22]:
# Custom Kappa scorer
KAPPA_SCORER = make_scorer(
    cohen_kappa_score, 
    greater_is_better=True, 
    weights='quadratic'
)

# Custom LGBM Regressor with threshold optimization
class CustomLGBMRegressor(lgb.LGBMRegressor):
    '''
    Custom LightGBM Regressor
    
    It optimizes threshold values during fitting.
    Main goal is preventing overfit on validation data.
    '''
    def fit(self, X, y, **kwargs):
        super().fit(X, y, **kwargs)
        y_pred = super().predict(X, **kwargs)
        
        self.optimizer = minimize(
            eval_preds, 
            x0=[0.5, 1.5, 2.5], 
            args=(y, y_pred), 
            method='Nelder-Mead',
        )
        
    def predict(self, X, **kwargs):
        y_pred = super().predict(X, **kwargs)
        y_pred = threshold_Rounder(y_pred, self.optimizer.x)
        return y_pred

class CustomXGBRegressor(XGBRegressor):
    def fit(self, X, y, **kwargs):
        super().fit(X, y, **kwargs)
        y_pred = super().predict(X, **kwargs)
        self.optimizer = minimize(
            eval_preds, 
            x0=[0.5, 1.5, 2.5], 
            args=(y, y_pred), 
            method='Nelder-Mead',
        )
        
    def predict(self, X, **kwargs):
        y_pred = super().predict(X, **kwargs)
        y_pred = threshold_Rounder(y_pred, self.optimizer.x)
        return y_pred

class CustomRFRegressor(RandomForestRegressor):
    def fit(self, X, y, **kwargs):
        super().fit(X, y, **kwargs)
        y_pred = super().predict(X, **kwargs)
        self.optimizer = minimize(
            eval_preds, 
            x0=[0.5, 1.5, 2.5], 
            args=(y, y_pred), 
            method='Nelder-Mead',
        )
        
    def predict(self, X, **kwargs):
        y_pred = super().predict(X, **kwargs)
        y_pred = threshold_Rounder(y_pred, self.optimizer.x)
        return y_pred

class CustomMLPRegressor(MLPRegressor):
    def fit(self, X, y, **kwargs):
        super().fit(X, y, **kwargs)
        y_pred = super().predict(X, **kwargs)
        self.optimizer = minimize(
            eval_preds, 
            x0=[0.5, 1.5, 2.5], 
            args=(y, y_pred), 
            method='Nelder-Mead',
        )
        
    def predict(self, X, **kwargs):
        y_pred = super().predict(X, **kwargs)
        y_pred = threshold_Rounder(y_pred, self.optimizer.x)
        return y_pred

In [23]:
# Helper functions
def quadratic_weighted_kappa(y_true, y_pred):
    return cohen_kappa_score(y_true, y_pred, weights='quadratic')

def threshold_Rounder(oof_non_rounded, thresholds):
    return np.where(oof_non_rounded < thresholds[0], 0,
                    np.where(oof_non_rounded < thresholds[1], 1,
                             np.where(oof_non_rounded < thresholds[2], 2, 3)))

def evaluate_predictions(thresholds, y_true, oof_non_rounded):
    rounded_p = threshold_Rounder(oof_non_rounded, thresholds)
    return -quadratic_weighted_kappa(y_true, rounded_p)

def eval_preds(thresholds, y_true, y_pred):
    y_pred = threshold_Rounder(y_pred, thresholds)
    score = cohen_kappa_score(y_true, y_pred, weights='quadratic')
    return -score

In [24]:
# Custom LGBM Objective with SMOTE
def lgb_objective(trial):
    params = {
        'objective': 'l2',
        'verbosity': -1,
        'n_iter': 200,
        'random_state': SEED,
        'boosting_type': 'gbdt',
        'lambda_l1': trial.suggest_float('lambda_l1', 1e-3, 10.0, log=True),
        'lambda_l2': trial.suggest_float('lambda_l2', 1e-3, 10.0, log=True),
        'learning_rate': trial.suggest_float('learning_rate', 1e-2, 1e-1, log=True),
        'max_depth': trial.suggest_int('max_depth', 4, 8),
        'num_leaves': trial.suggest_int('num_leaves', 16, 256),
        'colsample_bytree': trial.suggest_float('colsample_bytree', 0.4, 1.0),
        'colsample_bynode': trial.suggest_float('colsample_bynode', 0.4, 1.0),
        'bagging_fraction': trial.suggest_float('bagging_fraction', 0.4, 1.0),
        'bagging_freq': trial.suggest_int('bagging_freq', 1, 7),
        'min_data_in_leaf': trial.suggest_int('min_data_in_leaf', 5, 100),
    }

    X = df.drop(['sii'], axis=1)
    y = df['sii']
    
    # Apply SMOTE to the data
    smote = SMOTE(random_state=SEED)
    X_res, y_res = smote.fit_resample(X, y)
    
    cv = StratifiedKFold(5, shuffle=True, random_state=SEED)
    estimator = CustomLGBMRegressor(**params)

    val_scores = cross_val_score(
        estimator=estimator, 
        X=X_res, y=y_res, 
        cv=cv, 
        scoring=KAPPA_SCORER,
    )

    return np.mean(val_scores)

# Modify for XGB
def xgb_objective(trial):
    params = {
        'objective': 'reg:squarederror',
        'verbosity': 0,
        'n_estimators': 200,
        'random_state': SEED,
        'lambda': trial.suggest_float('lambda', 1e-3, 10.0, log=True),
        'alpha': trial.suggest_float('alpha', 1e-3, 10.0, log=True),
        'learning_rate': trial.suggest_float('learning_rate', 1e-2, 1e-1, log=True),
        'max_depth': trial.suggest_int('max_depth', 4, 8),
        'subsample': trial.suggest_float('subsample', 0.5, 1.0),
        'colsample_bytree': trial.suggest_float('colsample_bytree', 0.4, 1.0),
        'colsample_bylevel': trial.suggest_float('colsample_bylevel', 0.4, 1.0),
        'min_child_weight': trial.suggest_int('min_child_weight', 1, 10),
    }

    X = df.drop(['sii'], axis=1)
    y = df['sii']
    
    # Apply SMOTE to the data
    smote = SMOTE(random_state=SEED)
    X_res, y_res = smote.fit_resample(X, y)
    
    cv = StratifiedKFold(5, shuffle=True, random_state=SEED)
    estimator = CustomXGBRegressor(**params)

    val_scores = cross_val_score(
        estimator=estimator, 
        X=X_res, y=y_res, 
        cv=cv, 
        scoring=KAPPA_SCORER,
    )

    return np.mean(val_scores)

# Modify for RF
def rf_objective(trial):
    params = {
        'n_estimators': 200,
        'random_state': SEED,
        'max_depth': trial.suggest_int('max_depth', 4, 8),
        'min_samples_split': trial.suggest_int('min_samples_split', 2, 10),
        'min_samples_leaf': trial.suggest_int('min_samples_leaf', 1, 10),
        'max_features': trial.suggest_float('max_features', 0.5, 1.0),
        'bootstrap': trial.suggest_categorical('bootstrap', [True, False]),
    }

    X = df.drop(['sii'], axis=1)
    y = df['sii']
    
    # Apply SMOTE to the data
    smote = SMOTE(random_state=SEED)
    X_res, y_res = smote.fit_resample(X, y)
    
    cv = StratifiedKFold(5, shuffle=True, random_state=SEED)
    estimator = CustomRFRegressor(**params)

    val_scores = cross_val_score(
        estimator=estimator, 
        X=X_res, y=y_res, 
        cv=cv, 
        scoring=KAPPA_SCORER,
    )

    return np.mean(val_scores)

# Modify for MLP
def mlp_objective(trial):
    params = {
        'hidden_layer_sizes': trial.suggest_categorical('hidden_layer_sizes', [(50,), (100,), (200,), (100, 50)]),
        'activation': trial.suggest_categorical('activation', ['relu', 'tanh', 'logistic']),
        'solver': trial.suggest_categorical('solver', ['adam', 'sgd']),
        'alpha': trial.suggest_float('alpha', 1e-5, 1e-1, log=True),
        'learning_rate': trial.suggest_categorical('learning_rate', ['constant', 'invscaling', 'adaptive']),
        'random_state': SEED,
        'max_iter': 200,
    }

    X = df.drop(['sii'], axis=1)
    y = df['sii']
    
    # Apply SMOTE to the data
    smote = SMOTE(random_state=SEED)
    X_res, y_res = smote.fit_resample(X, y)
    
    cv = StratifiedKFold(5, shuffle=True, random_state=SEED)
    estimator = CustomMLPRegressor(**params)

    val_scores = cross_val_score(
        estimator=estimator, 
        X=X_res, y=y_res, 
        cv=cv, 
        scoring=KAPPA_SCORER,
    )

    return np.mean(val_scores)

In [25]:
# Create a study for each model and optimize
#def optimize_model(objective, study_name):
#    study = optuna.create_study(direction='maximize', study_name=study_name)
#    study.optimize(objective, n_trials=30, show_progress_bar=True)
#    return study.best_trial.params

# Optimizing for each model
#best_params_lgb = optimize_model(lgb_objective, 'LGBM_Regressor')
#best_params_xgb = optimize_model(xgb_objective, 'XGB_Regressor')
#best_params_rf = optimize_model(rf_objective, 'RF_Regressor')
#best_params_mlp = optimize_model(mlp_objective, 'MLP_Regressor')

# Print the best parameters for each model
#print("Best parameters for LGBM:", best_params_lgb)
#print("Best parameters for XGB:", best_params_xgb)
#print("Best parameters for RF:", best_params_rf)
#print("Best parameters for MLP:", best_params_mlp)

In [26]:
best_params_lgb = {'lambda_l1': 2.579402984154109, 'lambda_l2': 0.966337187970675, 'learning_rate': 0.07634780231724438, 'max_depth': 8, 'num_leaves': 215, 'colsample_bytree': 0.5128196184058356, 'colsample_bynode': 0.5118288317492637, 'bagging_fraction': 0.9249047823132005, 'bagging_freq': 2, 'min_data_in_leaf': 18}
best_params_xgb = {'lambda': 1.515523424629609, 'alpha': 0.1573807955436052, 'learning_rate': 0.06786761631770998, 'max_depth': 8, 'subsample': 0.9040820789652597, 'colsample_bytree': 0.5072241031673306, 'colsample_bylevel': 0.9660029907148966, 'min_child_weight': 3}
best_params_rf = {'max_depth': 8, 'min_samples_split': 4, 'min_samples_leaf': 1, 'max_features': 0.6449273837823946, 'bootstrap': True}
best_params_mlp = {'hidden_layer_sizes': (200,), 'activation': 'logistic', 'solver': 'adam', 'alpha': 0.0001635693635047117, 'learning_rate': 'adaptive'}

In [27]:
params_lgb = {
    'objective': 'l2',
    'verbosity': -1,
    'n_iter': 200,
    'random_state': SEED,
    'lambda_l1': 8.906831645766042,
     'lambda_l2': 0.004591584170840488,
     'learning_rate': 0.04874811806004515,
     'max_depth': 6,
     'num_leaves': 82,
     'colsample_bytree': 0.6403876590762936,
     'colsample_bynode': 0.49845692241164663,
     'bagging_fraction': 0.55635165528929,
     'bagging_freq': 1,
     'min_data_in_leaf': 70,
}

params_xgb = {
        'objective': 'reg:squarederror',
        'verbosity': 0,
        'n_estimators': 200,
        'random_state': SEED,
        'lambda': 1e-3,
        'alpha': 1e-3,
        'learning_rate': 1e-2,
        'max_depth': 4,
        'subsample': 0.5,
        'colsample_bytree': 0.4,
        'colsample_bylevel': 0.4,
        'min_child_weight': 1,
}

params_rf= {
        'n_estimators': 200,
        'random_state': SEED,
        'max_depth': 4,
        'min_samples_split': 2,
        'min_samples_leaf': 1,
        'max_features': 0.5,
        'bootstrap': True,
}

params_mlp = {
        'hidden_layer_sizes': (100, 50), 
        'activation': 'relu',
        'solver': 'adam',
        'alpha': 0.053958855282467966,
        'learning_rate': 'adaptive',
        'random_state': SEED,
        'max_iter': 200,
}

params_lgb.update(best_params_lgb)
params_xgb.update(best_params_xgb)
params_rf.update(best_params_rf)
params_mlp.update(best_params_mlp)

In [28]:
X = df.drop(['sii'], axis=1)
y = df['sii']
cv = StratifiedKFold(5, shuffle=True, random_state=SEED)

In [29]:
model_lgb = CustomLGBMRegressor(**params_lgb)
model_xgb = CustomXGBRegressor(**params_xgb)
model_rf = CustomRFRegressor(**params_rf)
model_mlp = CustomMLPRegressor(**params_mlp)

In [30]:
val_scores_lgb = cross_val_score(model_lgb, X, y, cv=cv, scoring=KAPPA_SCORER,)
val_scores_xgb = cross_val_score(model_xgb, X, y, cv=cv, scoring=KAPPA_SCORER,)
val_scores_rf = cross_val_score(model_rf, X, y, cv=cv, scoring=KAPPA_SCORER,)
val_scores_mlp = cross_val_score(model_mlp, X, y, cv=cv, scoring=KAPPA_SCORER,)

kappa_scores = {
    'LGBM': np.mean(val_scores_lgb),
    'XGB': np.mean(val_scores_xgb),
    'RF': np.mean(val_scores_rf),
    'MLP': np.mean(val_scores_mlp),
}

# Print all Kappa scores at once
for model_name, kappa_score in kappa_scores.items():
    print(f'{model_name} Kappa score: {kappa_score:.4f}')

LGBM Kappa score: 0.3989
XGB Kappa score: 0.4058
RF Kappa score: 0.4254
MLP Kappa score: 0.3314


In [31]:
model_lgb.fit(X, y)
model_xgb.fit(X, y)
model_rf.fit(X, y)
model_mlp.fit(X, y)

In [32]:
# Check individual model predictions
y_pred_lgb = model_lgb.predict(X)
y_pred_xgb = model_xgb.predict(X)
y_pred_rf = model_rf.predict(X)
y_pred_mlp = model_mlp.predict(X)

# Print the first few predictions for each model to check for variance
print("LGBM predictions:", y_pred_lgb[:10])
print("XGB predictions:", y_pred_xgb[:10])
print("RF predictions:", y_pred_rf[:10])
print("MLP predictions:", y_pred_mlp[:10])

LGBM predictions: [1 0 0 1 1 0 0 1 2 0]
XGB predictions: [2 0 0 1 1 0 0 1 2 0]
RF predictions: [1 0 0 0 1 0 0 1 1 0]
MLP predictions: [0 0 0 0 1 0 0 1 1 0]


In [33]:
all_preds = np.array([y_pred_lgb, y_pred_xgb, y_pred_rf, y_pred_mlp])
voted_preds = np.apply_along_axis(lambda x: np.bincount(x.astype(int)).argmax(), axis=0, arr=all_preds)
kappa_score = cohen_kappa_score(y, voted_preds)
print(f'Majority Voting Kappa score: {kappa_score:.4f}')

Majority Voting Kappa score: 0.7361


# Seems the best model is the LGBM so far with a predicted score of 0.4719

In [34]:
#model = model_lgb

# Submission 

In [35]:
test = update(test)

for col in cat_c:
    mapping_test = create_mapping(col, test)
    test[col] = test[col].replace(mapping_test).astype(int)

print(f'Test Shape : {test.shape}')

Test Shape : (20, 59)


In [36]:
numeric_columns = test.select_dtypes(include=['float64', 'int64']).columns
test[numeric_columns] = imputer.fit_transform(test[numeric_columns])

In [37]:
Submission = test[['id']]
test = test.drop('id', axis=1)

In [38]:
y_pred_lgb = model_lgb.predict(test)
y_pred_xgb = model_xgb.predict(test)
y_pred_rf = model_rf.predict(test)
y_pred_mlp = model_mlp.predict(test)

In [39]:
all_preds = np.array([y_pred_lgb, y_pred_xgb, y_pred_rf, y_pred_mlp])
voted_preds = np.apply_along_axis(lambda x: np.bincount(x.astype(int)).argmax(), axis=0, arr=all_preds)

In [40]:
voted_preds

array([1, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 1, 1, 0, 1, 0, 0, 0, 0])

In [41]:
#Submission['sii'] = model.predict(test)
Submission['sii'] = voted_preds

In [42]:
Submission

Unnamed: 0,id,sii
0,00008ff9,1
1,000fd460,0
2,00105258,0
3,00115b9f,0
4,0016bb22,0
5,001f3379,1
6,0038ba98,0
7,0068a485,0
8,0069fbed,1
9,0083e397,0


In [43]:
Submission.to_csv('submission.csv', index=False)
print(Submission['sii'].value_counts())

sii
0    14
1     6
Name: count, dtype: int64
