# Initial

In [1]:
# Standard library
import os
from concurrent.futures import ThreadPoolExecutor

# Data manipulation and analysis
import pandas as pd

# Visualization
import matplotlib.pyplot as plt
from matplotlib.ticker import MaxNLocator, FormatStrFormatter, PercentFormatter
import seaborn as sns

# Machine learning and preprocessing
from sklearn.base import BaseEstimator, RegressorMixin, clone
from sklearn.metrics import cohen_kappa_score
from sklearn.model_selection import StratifiedKFold, train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.impute import SimpleImputer, KNNImputer
from sklearn.ensemble import VotingRegressor, RandomForestRegressor, GradientBoostingRegressor
from sklearn.pipeline import Pipeline

# Specialized ML models
from lightgbm import LGBMRegressor
from xgboost import XGBRegressor
from catboost import CatBoostRegressor
from pytorch_tabnet.tab_model import TabNetRegressor
from pytorch_tabnet.callbacks import Callback

# Deep learning
from keras.models import Model
from keras.layers import Input, Dense
from keras.optimizers import Adam
import torch
import torch.nn as nn
import torch.optim as optim

# Utility
from colorama import Fore, Style
from IPython.display import clear_output
from tqdm import tqdm


In [2]:
!pip -q install /kaggle/input/tabnet/pytorch/v1/1/pytorch_tabnet-4.1.0-py3-none-any.whl

In addition, using fork() with Python in general is a recipe for mysterious
deadlocks and crashes.

The most likely reason you are seeing this error is because you are using the
multiprocessing module on Linux, which uses fork() by default. This will be
fixed in Python 3.14. Until then, you want to use the "spawn" context instead.

See https://docs.pola.rs/user-guide/misc/multiprocessing/ for details.

  pid, fd = os.forkpty()


# Process Data

In [None]:
def process_file(filename, dirname):
    df = pd.read_parquet(os.path.join(dirname, filename, 'part-0.parquet'))
    df.drop('step', axis=1, inplace=True)
    return df.describe().values.reshape(-1), filename.split('=')[1]

def load_data_parquet(dirname) -> pd.DataFrame:
    ids = os.listdir(dirname)
    with ThreadPoolExecutor() as executor:
        results = list(tqdm(executor.map(lambda fname: process_file(fname, dirname), ids), total=len(ids)))
    stats, indexes = zip(*results)
    df = pd.DataFrame(stats, columns=[f"stat_{i}" for i in range(len(stats[0]))])
    df['id'] = indexes
    return df

In [4]:
from sklearn.preprocessing import OneHotEncoder
train = pd.read_csv('/kaggle/input/child-mind-institute-problematic-internet-use/train.csv')
test = pd.read_csv('/kaggle/input/child-mind-institute-problematic-internet-use/test.csv')
sample = pd.read_csv('/kaggle/input/child-mind-institute-problematic-internet-use/sample_submission.csv')

featuresCols = ['Basic_Demos-Enroll_Season', 'Basic_Demos-Age', 'Basic_Demos-Sex',
                'CGAS-Season', 'CGAS-CGAS_Score', 'Physical-Season', 'Physical-BMI',
                'Physical-Height', 'Physical-Weight', 'Physical-Waist_Circumference',
                'Physical-Diastolic_BP', 'Physical-HeartRate', 'Physical-Systolic_BP',
                'Fitness_Endurance-Season', 'Fitness_Endurance-Max_Stage',
                'Fitness_Endurance-Time_Mins', 'Fitness_Endurance-Time_Sec',
                'FGC-Season', 'FGC-FGC_CU', 'FGC-FGC_CU_Zone', 'FGC-FGC_GSND',
                'FGC-FGC_GSND_Zone', 'FGC-FGC_GSD', 'FGC-FGC_GSD_Zone', 'FGC-FGC_PU',
                'FGC-FGC_PU_Zone', 'FGC-FGC_SRL', 'FGC-FGC_SRL_Zone', 'FGC-FGC_SRR',
                'FGC-FGC_SRR_Zone', 'FGC-FGC_TL', 'FGC-FGC_TL_Zone', 'BIA-Season',
                'BIA-BIA_Activity_Level_num', 'BIA-BIA_BMC', 'BIA-BIA_BMI',
                'BIA-BIA_BMR', 'BIA-BIA_DEE', 'BIA-BIA_ECW', 'BIA-BIA_FFM',
                'BIA-BIA_FFMI', 'BIA-BIA_FMI', 'BIA-BIA_Fat', 'BIA-BIA_Frame_num',
                'BIA-BIA_ICW', 'BIA-BIA_LDM', 'BIA-BIA_LST', 'BIA-BIA_SMM',
                'BIA-BIA_TBW', 'PAQ_A-Season', 'PAQ_A-PAQ_A_Total', 'PAQ_C-Season',
                'PAQ_C-PAQ_C_Total', 'SDS-Season', 'SDS-SDS_Total_Raw',
                'SDS-SDS_Total_T', 'PreInt_EduHx-Season',
                'PreInt_EduHx-computerinternet_hoursday', 'sii']

cat_c = ['Basic_Demos-Enroll_Season', 'CGAS-Season', 'Physical-Season', 
          'Fitness_Endurance-Season', 'FGC-Season', 'BIA-Season', 
          'PAQ_A-Season', 'PAQ_C-Season', 'SDS-Season', 'PreInt_EduHx-Season']

train_ts = load_data_parquet("/kaggle/input/child-mind-institute-problematic-internet-use/series_train.parquet")
test_ts = load_data_parquet("/kaggle/input/child-mind-institute-problematic-internet-use/series_test.parquet")

time_series_cols = train_ts.columns.tolist()
time_series_cols.remove("id")

train = pd.merge(train, train_ts, how="left", on='id')
test = pd.merge(test, test_ts, how="left", on='id')

train = train.drop('id', axis=1)
test = test.drop('id', axis=1)
featuresCols += time_series_cols

train = train[featuresCols]


100%|██████████| 996/996 [01:31<00:00, 10.87it/s]
100%|██████████| 2/2 [00:00<00:00,  7.03it/s]


In [5]:
def fill_nan_values(data):
    imputer = KNNImputer(n_neighbors=71)
    numeric_cols = data.select_dtypes(include=['float64', 'int64']).columns
    imputed_data = imputer.fit_transform(data[numeric_cols])
    data_imputed = pd.DataFrame(imputed_data, columns=numeric_cols)
    
    if 'sii' in data.columns:
        data_imputed['sii'] = data_imputed['sii'].round().astype(int)
    
    for col in data.columns:
        if col not in numeric_cols:
            data_imputed[col] = data[col]
    
    return data_imputed

def handle_category_data(data):
    all_categories = ['Spring', 'Summer', 'Fall', 'Winter']
    df_encoded = data.copy()
    encoder = OneHotEncoder(categories=[all_categories], sparse_output=False, handle_unknown='ignore', dtype=int)
    
    for column in data.select_dtypes(include=['object', 'category']).columns:
        encoded_array = encoder.fit_transform(df_encoded[[column]])
        encoded_columns = [f"{column}_{season}" for season in all_categories]
        encoded_df = pd.DataFrame(encoded_array, columns=encoded_columns, index=data.index)
        df_encoded = pd.concat([df_encoded.drop(column, axis=1), encoded_df], axis=1)
    
    return df_encoded

train = fill_nan_values(train)
train = handle_category_data(train)
test = fill_nan_values(test)
test = handle_category_data(test)

train_2 = train.copy()
test_2 = test.copy()

train_3 = train.copy()
test_3 = test.copy()

# Model 1 (Using normally with Voting Regressor)

In [7]:
import numpy as np
from sklearn.metrics import cohen_kappa_score

# Replace infinite values with NaN in the train dataset
if np.any(np.isinf(train)):
    train = train.replace([np.inf, -np.inf], np.nan)

def quadratic_weighted_kappa(y_true, y_pred):
    return cohen_kappa_score(y_true, y_pred, weights='quadratic')

def threshold_rounder(oof_non_rounded, thresholds):
    return np.digitize(oof_non_rounded, bins=thresholds, right=True)

def evaluate_predictions(thresholds, y_true, oof_non_rounded):
    rounded_predictions = threshold_rounder(oof_non_rounded, thresholds)
    return -quadratic_weighted_kappa(y_true, rounded_predictions)

In [8]:
import numpy as np
import pandas as pd
from sklearn.model_selection import StratifiedKFold
from sklearn.base import clone
from scipy.optimize import minimize
from tqdm import tqdm
from IPython.display import clear_output
from colorama import Fore, Style

def train_model(model, test_data, train_data, sample, n_splits=5, seed=42):
    X = train_data.drop(['sii'], axis=1)
    y = train_data['sii']

    skf = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=seed)
    
    train_scores = []
    val_scores = []
    
    oof_predictions = np.zeros(len(y), dtype=float) 
    oof_rounded = np.zeros(len(y), dtype=int) 
    test_predictions = np.zeros((len(test_data), n_splits))

    for fold, (train_idx, val_idx) in enumerate(tqdm(skf.split(X, y), desc="Training Folds", total=n_splits)):
        X_train, X_val = X.iloc[train_idx], X.iloc[val_idx]
        y_train, y_val = y.iloc[train_idx], y.iloc[val_idx]

        model_instance = clone(model)
        model_instance.fit(X_train, y_train)

        y_train_pred = model_instance.predict(X_train)
        y_val_pred = model_instance.predict(X_val)

        oof_predictions[val_idx] = y_val_pred
        y_val_pred_rounded = y_val_pred.round(0).astype(int)
        oof_rounded[val_idx] = y_val_pred_rounded

        train_kappa = quadratic_weighted_kappa(y_train, y_train_pred.round(0).astype(int))
        val_kappa = quadratic_weighted_kappa(y_val, y_val_pred_rounded)

        train_scores.append(train_kappa)
        val_scores.append(val_kappa)
        
        test_predictions[:, fold] = model_instance.predict(test_data)
        
        print(f"Fold {fold+1} - Train QWK: {train_kappa:.4f}, Validation QWK: {val_kappa:.4f}")
        clear_output(wait=True)

    print(f"Mean Train QWK --> {np.mean(train_scores):.4f}")
    print(f"Mean Validation QWK ---> {np.mean(val_scores):.4f}")

    kappa_optimizer = minimize(evaluate_predictions,
                               x0=[0.5, 1.5, 2.5], args=(y, oof_predictions), 
                               method='Nelder-Mead')
    assert kappa_optimizer.success, "Optimization did not converge."
    
    oof_tuned = threshold_rounder(oof_predictions, kappa_optimizer.x)
    tuned_kappa = quadratic_weighted_kappa(y, oof_tuned)

    print(f"----> || Optimized QWK SCORE :: {Fore.CYAN}{Style.BRIGHT} {tuned_kappa:.3f}{Style.RESET_ALL}")

    test_predictions_mean = test_predictions.mean(axis=1)
    test_predictions_tuned = threshold_rounder(test_predictions_mean, kappa_optimizer.x)
    
    submission = pd.DataFrame({
        'id': sample['id'],
        'sii': test_predictions_tuned
    })

    return submission

# Example usage:
# SEED = 2004
# n_splits = 5
# model = VotingRegressor(estimators=[...])
# submission = train_model(model, test, train, sample, n_splits=n_splits, seed=SEED)

In [9]:
SEED = 2004
n_splits = 5
# Model parameters for LightGBM
Params = {
    'learning_rate': 0.09970294901245966, 
    'max_depth': 3, 
    'subsample': 0.9651688449975022, 
    'colsample_bytree': 0.616732288405486, 
    'num_leaves': 34, 
    'min_data_in_leaf': 68, 
    'feature_fraction': 0.6476169754611282, 
    'bagging_fraction': 0.9184091064527949, 
    'bagging_freq': 10, 
    'reg_alpha': 0.015879148435808108, 
    'reg_lambda': 0.0036854044260839643
}


# XGBoost parameters
XGB_Params = {
    'learning_rate': 0.29682190417298865,
    'max_depth': 4, 'n_estimators': 796,
    'subsample': 0.7542484622989069,
    'colsample_bytree': 0.886399359731497,
    'reg_alpha': 0.014681067600657996,
    'reg_lambda': 9.209859894025579,
    'gamma': 0.06495942878096272, 
    'min_child_weight': 13,
    'use_gpu': True
}

CatBoost_Params = {
    'learning_rate': 0.026392650714515364, 
    'depth': 15, 'l2_leaf_reg': 0.0018692968691208557,
    'iterations': 637, 'bagging_temperature': 0.45636037003578794,
    'random_strength': 7.2357605130667455, 'border_count': 135, 
    'grow_policy': 'Lossguide'
}
Light = LGBMRegressor(**Params, random_state=SEED, verbose=-1, n_estimators=300)
XGB_Model = XGBRegressor(**XGB_Params)
CatBoost_Model = CatBoostRegressor(**CatBoost_Params)


In [10]:
ensemble = VotingRegressor(estimators=[
    ('lgb', Pipeline(steps=[ ('regressor', LGBMRegressor(random_state=SEED))])),
    ('xgb', Pipeline(steps=[ ('regressor', XGBRegressor(random_state=SEED))])),
    ('cat', Pipeline(steps=[ ('regressor', CatBoostRegressor(random_state=SEED, silent=True))])),
    ('rf', Pipeline(steps=[('regressor', RandomForestRegressor(random_state=SEED))])),
    ('gb', Pipeline(steps=[
    ('regressor', GradientBoostingRegressor(random_state=SEED))
]))
])

Submission3 = train_model(ensemble, test, train, sample, n_splits=n_splits, seed=SEED)
Submission3

Training Folds: 100%|██████████| 5/5 [03:17<00:00, 39.58s/it]

Mean Train QWK --> 0.9273
Mean Validation QWK ---> 0.3884
----> || Optimized QWK SCORE :: [36m[1m 0.436[0m





Unnamed: 0,id,sii
0,00008ff9,1
1,000fd460,0
2,00105258,1
3,00115b9f,1
4,0016bb22,1
5,001f3379,1
6,0038ba98,1
7,0068a485,1
8,0069fbed,1
9,0083e397,1


# Model 2 ( Using Tabnet)

In [11]:


class TabNetModelWrapper(BaseEstimator, RegressorMixin):
    def __init__(self, **kwargs):
        self.tabnet_model = TabNetRegressor(**kwargs)
        self.kwargs = kwargs
        self.imputer = SimpleImputer(strategy='median')
        self.best_model_path = 'best_tabnet_model.pt'
        
    def fit(self, X, y):
        # Impute missing values
        X_imputed = self.imputer.fit_transform(X)
        
        if hasattr(y, 'values'):
            y = y.values
            
        # Split data into training and validation sets
        X_train, X_valid, y_train, y_valid = train_test_split(
            X_imputed, 
            y, 
            test_size=0.2,
            random_state=42
        )
        
        # Train the TabNet model
        self.tabnet_model.fit(
            X_train=X_train,
            y_train=y_train.reshape(-1, 1),
            eval_set=[(X_valid, y_valid.reshape(-1, 1))],
            eval_name=['valid'],
            eval_metric=['mse'],
            max_epochs=200,
            patience=20,
            batch_size=1024,
            virtual_batch_size=128,
            num_workers=0,
            drop_last=False,
            callbacks=[
                ModelCheckpoint(
                    filepath=self.best_model_path,
                    monitor='valid_mse',
                    mode='min',
                    save_best_only=True,
                    verbose=True
                )
            ]
        )
        
        # Load the best model
        if os.path.exists(self.best_model_path):
            self.tabnet_model.load_model(self.best_model_path)
            os.remove(self.best_model_path)  # Remove the temporary file
        
        return self
    
    def predict(self, X):
        X_imputed = self.imputer.transform(X)
        return self.tabnet_model.predict(X_imputed).flatten()
    
    def __deepcopy__(self, memo):
        # Support for deepcopy in scikit-learn
        cls = self.__class__
        result = cls.__new__(cls)
        memo[id(self)] = result
        for k, v in self.__dict__.items():
            setattr(result, k, deepcopy(v, memo))
        return result

# TabNet hyperparameters
TabNet_Params = {
    'n_d': 47,              
    'n_a': 56,              
    'n_steps': 5,           
    'gamma': 1.5,           
    'n_independent': 2,     
    'n_shared': 2,          
    'lambda_sparse': 1e-4, 
    'optimizer_fn': torch.optim.Adam,
    'optimizer_params': dict(lr=2e-2, weight_decay=1e-5),
    'mask_type': 'entmax',
    'scheduler_params': dict(mode="min", patience=10, min_lr=1e-5, factor=0.5),
    'scheduler_fn': torch.optim.lr_scheduler.ReduceLROnPlateau,
    'verbose': -1,
    'device_name': 'cuda' if torch.cuda.is_available() else 'cpu'
}

class ModelCheckpoint(Callback):
    def __init__(self, filepath, monitor='val_loss', mode='min', 
                 save_best_only=True, verbose=1):
        super().__init__()  # Initialize parent class
        self.filepath = filepath
        self.monitor = monitor
        self.mode = mode
        self.save_best_only = save_best_only
        self.verbose = verbose
        self.best = float('inf') if mode == 'min' else -float('inf')
        
    def on_train_begin(self, logs=None):
        self.model = self.trainer  # Use trainer itself as model
        
    def on_epoch_end(self, epoch, logs=None):
        logs = logs or {}
        current = logs.get(self.monitor)
        if current is None:
            return

        if (self.mode == 'min' and current < self.best) or \
           (self.mode == 'max' and current > self.best):
            if self.verbose:
                print(f'\nEpoch {epoch}: {self.monitor} improved from {self.best:.4f} to {current:.4f}')
            self.best = current
            if self.save_best_only:
                self.model.save_model(self.filepath) 

In [12]:
Light = LGBMRegressor(**Params, random_state=SEED, verbose=-1, n_estimators=300)
XGB_Model = XGBRegressor(**XGB_Params)
CatBoost_Model = CatBoostRegressor(**CatBoost_Params)
TabNet_Model = TabNetModelWrapper(**TabNet_Params) 
voting_model = VotingRegressor(estimators=[
    ('lightgbm', Light),
    ('xgboost', XGB_Model),
    ('catboost', CatBoost_Model),
    ('tabnet', TabNet_Model)
])
Submission2 = train_model(voting_model, test, train, sample, n_splits=n_splits, seed=SEED)
Submission2

Training Folds: 100%|██████████| 5/5 [03:49<00:00, 45.81s/it]

Mean Train QWK --> 0.7567
Mean Validation QWK ---> 0.3413
----> || Optimized QWK SCORE :: [36m[1m 0.422[0m





Unnamed: 0,id,sii
0,00008ff9,1
1,000fd460,1
2,00105258,1
3,00115b9f,1
4,0016bb22,1
5,001f3379,0
6,0038ba98,1
7,0068a485,1
8,0069fbed,1
9,0083e397,1


# Model 3 (Idea about using AutoML to detect the best model, in this case, using H2O)

In [14]:
import h2o
from h2o.automl import H2OAutoML
def TrainML(best_model, test_data, train_data):
    X = train_data.drop(['sii'], axis=1)
    y = train_data['sii']

    SKF = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=SEED)
    
    oof_preds = np.zeros(len(y))
    oof_rounded = np.zeros(len(y), dtype=int)
    test_preds = np.zeros((len(test_data), n_splits))

    train_scores, val_scores = [], []

    for fold, (train_idx, val_idx) in enumerate(tqdm(SKF.split(X, y), desc="Training Folds")):
        X_train, X_val = X.iloc[train_idx], X.iloc[val_idx]
        y_train, y_val = y.iloc[train_idx], y.iloc[val_idx]

        train_preds = best_model.predict(h2o.H2OFrame(X_train)).as_data_frame().values.flatten()
        val_preds = best_model.predict(h2o.H2OFrame(X_val)).as_data_frame().values.flatten()

        oof_preds[val_idx] = val_preds
        oof_rounded[val_idx] = val_preds.round().astype(int)


        train_scores.append(quadratic_weighted_kappa(y_train, train_preds.round().astype(int)))
        val_scores.append(quadratic_weighted_kappa(y_val, val_preds.round().astype(int)))

        test_preds[:, fold] = best_model.predict(h2o.H2OFrame(test_data)).as_data_frame().values.flatten()

    print(f"Mean Train QWK: {np.mean(train_scores):.4f}")
    print(f"Mean Validation QWK: {np.mean(val_scores):.4f}")

    KappaOptimizer = minimize(evaluate_predictions, x0=[0.5, 1.5, 2.5], args=(y, oof_preds), method='Nelder-Mead')
    assert KappaOptimizer.success, "Threshold optimization failed"

    oof_tuned =threshold_rounder(oof_preds, KappaOptimizer.x)
    print(f"Optimized QWK Score: {quadratic_weighted_kappa(y, oof_tuned):.3f}")

    final_test_preds = threshold_rounder(test_preds.mean(axis=1), KappaOptimizer.x)

    submission = pd.DataFrame({
        'id': sample['id'],
        'sii': final_test_preds
    })

    return submission, KappaOptimizer


In [15]:

h2o.init()
train_data = h2o.H2OFrame(train)

aml = H2OAutoML(max_runtime_secs=600,seed=5)
aml.train(y='sii', training_frame=train_data)

Checking whether there is an H2O instance running at http://localhost:54321..... not found.
Attempting to start a local H2O server...
  Java Version: openjdk version "11.0.25" 2024-10-15; OpenJDK Runtime Environment (build 11.0.25+9-post-Ubuntu-1ubuntu120.04); OpenJDK 64-Bit Server VM (build 11.0.25+9-post-Ubuntu-1ubuntu120.04, mixed mode, sharing)
  Starting server from /opt/conda/lib/python3.10/site-packages/h2o/backend/bin/h2o.jar
  Ice root: /tmp/tmppwseoplw
  JVM stdout: /tmp/tmppwseoplw/h2o_unknownUser_started_from_python.out
  JVM stderr: /tmp/tmppwseoplw/h2o_unknownUser_started_from_python.err


In addition, using fork() with Python in general is a recipe for mysterious
deadlocks and crashes.

The most likely reason you are seeing this error is because you are using the
multiprocessing module on Linux, which uses fork() by default. This will be
fixed in Python 3.14. Until then, you want to use the "spawn" context instead.

See https://docs.pola.rs/user-guide/misc/multiprocessing/ for details.

  self.pid = _posixsubprocess.fork_exec(


  Server is running at http://127.0.0.1:54321
Connecting to H2O server at http://127.0.0.1:54321 ... successful.


0,1
H2O_cluster_uptime:,04 secs
H2O_cluster_timezone:,Etc/UTC
H2O_data_parsing_timezone:,UTC
H2O_cluster_version:,3.46.0.6
H2O_cluster_version_age:,1 month and 17 days
H2O_cluster_name:,H2O_from_python_unknownUser_ke3s4j
H2O_cluster_total_nodes:,1
H2O_cluster_free_memory:,7.500 Gb
H2O_cluster_total_cores:,4
H2O_cluster_allowed_cores:,4


Parse progress: |████████████████████████████████████████████████████████████████| (done) 100%
AutoML progress: |
17:22:03.588: _train param, Dropping bad and constant columns: [stat_42, stat_41]

████
17:22:24.20: _train param, Dropping bad and constant columns: [stat_42, stat_41]


17:22:27.817: _train param, Dropping bad and constant columns: [stat_42, stat_41]

██
17:22:43.175: _train param, Dropping unused columns: [stat_42, stat_41]
17:22:43.635: _train param, Dropping bad and constant columns: [stat_42, stat_41]

█
17:22:53.716: _train param, Dropping bad and constant columns: [stat_42, stat_41]

████
17:23:25.716: _train param, Dropping bad and constant columns: [stat_42, stat_41]

█
17:23:36.288: _train param, Dropping bad and constant columns: [stat_42, stat_41]

█
17:23:47.316: _train param, Dropping bad and constant columns: [stat_42, stat_41]

██
17:24:01.335: _train param, Dropping unused columns: [stat_42, stat_41]


17:24:01.718: _train param, Dropping unused columns: [

key,value
Stacking strategy,cross_validation
Number of base models (used / total),20/40
# GBM base models (used / total),8/14
# XGBoost base models (used / total),9/20
# DRF base models (used / total),2/2
# DeepLearning base models (used / total),1/3
# GLM base models (used / total),0/1
Metalearner algorithm,GLM
Metalearner fold assignment scheme,Random
Metalearner nfolds,5

Unnamed: 0,mean,sd,cv_1_valid,cv_2_valid,cv_3_valid,cv_4_valid,cv_5_valid
aic,1467.9686,115.78374,1468.7183,1607.0631,1493.2822,1485.3804,1285.3989
loglikelihood,0.0,0.0,0.0,0.0,0.0,0.0,0.0
mae,0.453819,0.0159249,0.4576773,0.4709561,0.4573653,0.4555504,0.4275459
mean_residual_deviance,0.3559041,0.0244244,0.3501201,0.3866423,0.3667285,0.3560523,0.3199775
mse,0.3559041,0.0244244,0.3501201,0.3866423,0.3667285,0.3560523,0.3199775
null_deviance,371.53046,26.049839,356.3925,394.16498,368.95016,400.33057,337.8141
r2,0.2397619,0.0495225,0.2136993,0.1869816,0.2136241,0.2835574,0.3009473
residual_deviance,282.46817,30.059025,280.09604,319.75317,290.08224,286.26605,236.1434
rmse,0.5962926,0.0205936,0.5917094,0.6218057,0.6055811,0.5967012,0.5656655
rmsle,0.3564295,0.0132471,0.3585249,0.3736555,0.3620692,0.3493102,0.3385877


In [16]:
best_model = aml.leader
Submission1,KappaOPtimizer = TrainML(best_model,test,train)
print(KappaOPtimizer.x)
print(Submission1)

Training Folds: 0it [00:00, ?it/s]

Parse progress: |████████████████████████████████████████████████████████████████| (done) 100%
stackedensemble prediction progress: |███████████████████████████████████████████| (done) 100%





Parse progress: |████████████████████████████████████████████████████████████████| (done) 100%
stackedensemble prediction progress: |███████████████████████████████████████████| (done) 100%
Parse progress: |




████████████████████████████████████████████████████████████████| (done) 100%
stackedensemble prediction progress: |


Training Folds: 1it [00:03,  3.20s/it]

███████████████████████████████████████████| (done) 100%
Parse progress: |████████████████████████████████████████████████████████████████| (done) 100%
stackedensemble prediction progress: |███████████████████████████████████████████| (done) 100%





Parse progress: |████████████████████████████████████████████████████████████████| (done) 100%
stackedensemble prediction progress: |███████████████████████████████████████████| (done) 100%
Parse progress: |




████████████████████████████████████████████████████████████████| (done) 100%
stackedensemble prediction progress: |


Training Folds: 2it [00:06,  3.10s/it]

███████████████████████████████████████████| (done) 100%
Parse progress: |████████████████████████████████████████████████████████████████| (done) 100%
stackedensemble prediction progress: |███████████████████████████████████████████| (done) 100%





Parse progress: |████████████████████████████████████████████████████████████████| (done) 100%
stackedensemble prediction progress: |███████████████████████████████████████████| (done) 100%
Parse progress: |




████████████████████████████████████████████████████████████████| (done) 100%
stackedensemble prediction progress: |


Training Folds: 3it [00:08,  2.89s/it]

███████████████████████████████████████████| (done) 100%
Parse progress: |████████████████████████████████████████████████████████████████| (done) 100%
stackedensemble prediction progress: |███████████████████████████████████████████| (done) 100%





Parse progress: |████████████████████████████████████████████████████████████████| (done) 100%
stackedensemble prediction progress: |███████████████████████████████████████████| (done) 100%
Parse progress: |




████████████████████████████████████████████████████████████████| (done) 100%
stackedensemble prediction progress: |


Training Folds: 4it [00:11,  2.84s/it]

███████████████████████████████████████████| (done) 100%
Parse progress: |████████████████████████████████████████████████████████████████| (done) 100%
stackedensemble prediction progress: |███████████████████████████████████████████| (done) 100%





Parse progress: |████████████████████████████████████████████████████████████████| (done) 100%
stackedensemble prediction progress: |███████████████████████████████████████████| (done) 100%
Parse progress: |




████████████████████████████████████████████████████████████████| (done) 100%
stackedensemble prediction progress: |


Training Folds: 5it [00:15,  3.02s/it]

███████████████████████████████████████████| (done) 100%
Mean Train QWK: 0.7675
Mean Validation QWK: 0.7675
Optimized QWK Score: 0.852
[0.55321595 1.17587285 2.62779895]
          id  sii
0   00008ff9    1
1   000fd460    0
2   00105258    1
3   00115b9f    1
4   0016bb22    1
5   001f3379    1
6   0038ba98    1
7   0068a485    1
8   0069fbed    1
9   0083e397    1
10  0087dd65    1
11  00abe655    1
12  00ae59c9    1
13  00af6387    1
14  00bd4359    1
15  00c0cd71    1
16  00d56d4b    0
17  00d9913d    1
18  00e6167c    1
19  00ebc35d    1





In [17]:
import pandas as pd

submission1 = Submission1.sort_values(by='id').reset_index(drop=True)
submission2 = Submission2.sort_values(by='id').reset_index(drop=True)
submission3 = Submission3.sort_values(by='id').reset_index(drop=True)

combined_submissions = pd.DataFrame({
    'id': submission1['id'],
    'sii_1': submission1['sii'],
    'sii_2': submission2['sii'],
    'sii_3': submission3['sii']
})


def majority_vote(row):
    return row.mode()[0]

combined_submissions['final_sii'] = combined_submissions[['sii_1', 'sii_2', 'sii_3']].apply(majority_vote, axis=1)


final_submission = combined_submissions[['id', 'final_sii']].rename(columns={'final_sii': 'sii'})


final_submission.to_csv('submission.csv', index=False)

print("Majority voting completed and saved to 'submission.csv'")

Majority voting completed and saved to 'Final_Submission.csv'


In [18]:
final_submission

Unnamed: 0,id,sii
0,00008ff9,1
1,000fd460,0
2,00105258,1
3,00115b9f,1
4,0016bb22,1
5,001f3379,1
6,0038ba98,1
7,0068a485,1
8,0069fbed,1
9,0083e397,1
