In [3]:
import numpy as np
import pandas as pd
import os
import re
from sklearn.base import clone
from sklearn.metrics import cohen_kappa_score
from sklearn.model_selection import StratifiedKFold
from scipy.optimize import minimize
from concurrent.futures import ThreadPoolExecutor
from tqdm import tqdm

from colorama import Fore, Style
from IPython.display import clear_output
import warnings
from lightgbm import LGBMRegressor
from xgboost import XGBRegressor
from catboost import CatBoostRegressor
from sklearn.ensemble import VotingRegressor, RandomForestRegressor, GradientBoostingRegressor
from sklearn.linear_model import ElasticNet
from sklearn.svm import SVR
from sklearn.neighbors import KNeighborsRegressor
from sklearn.ensemble import ExtraTreesRegressor
from sklearn.linear_model import Ridge

# Silence library warnings and let pandas print every column of wide frames.
warnings.filterwarnings('ignore')
pd.set_option('display.max_columns', None)

# Shared by the cross-validation splitter and every regressor.
SEED = 42
n_splits = 5

# Load the tabular train/test data and the submission template.
train, test, sample = (
    pd.read_csv(csv_path)
    for csv_path in (
        '/Users/ad53533/Desktop/Applied ML/Project/train.csv',
        '/Users/ad53533/Desktop/Applied ML/Project/test.csv',
        '/Users/ad53533/Desktop/Applied ML/Project/sample_submission.csv',
    )
)

print(sample)

def process_file(filename, dirname):
    """Summarise one recording as a flat vector of descriptive statistics.

    Reads ``<dirname>/<filename>/part-0.parquet``, discards its ``step``
    column and flattens the ``DataFrame.describe()`` table of what remains.

    Args:
        filename: Directory entry of the form ``<key>=<value>``.
        dirname: Directory that contains ``filename``.

    Returns:
        A tuple ``(stats, identifier)``: ``stats`` is the 1-D array of the
        flattened describe() values (row-major), ``identifier`` is the second
        ``=``-separated segment of ``filename``.
    """
    parquet_path = os.path.join(dirname, filename, 'part-0.parquet')
    recording = pd.read_parquet(parquet_path).drop(columns='step')
    flat_stats = recording.describe().to_numpy().ravel()
    identifier = filename.split('=')[1]
    return flat_stats, identifier

def load_time_series(dirname) -> pd.DataFrame:
    """Build one row of summary statistics per recording found in ``dirname``.

    Every ``<key>=<value>`` entry of ``dirname`` is handed to ``process_file``
    on a thread pool; the flattened statistics become columns ``stat_0``,
    ``stat_1``, ... and the value part of the entry name becomes ``id``.

    Args:
        dirname: Directory whose entries are per-recording sub-directories.

    Returns:
        DataFrame with the ``stat_i`` columns plus ``id``. If ``dirname`` holds
        no ``<key>=<value>`` entries, an empty frame with only an ``id``
        column is returned so that a later merge on ``id`` still works.
    """
    # Entries without '=' (e.g. a macOS .DS_Store file) are not recordings and
    # would make process_file fail on filename.split('=')[1].
    ids = [entry for entry in os.listdir(dirname) if '=' in entry]
    if not ids:
        # zip(*[]) cannot be unpacked into two names, so short-circuit here.
        return pd.DataFrame(columns=['id'])

    with ThreadPoolExecutor() as executor:
        results = list(tqdm(executor.map(lambda fname: process_file(fname, dirname), ids), total=len(ids)))
    stats, indexes = zip(*results)
    df = pd.DataFrame(stats, columns=[f"stat_{i}" for i in range(len(stats[0]))])
    df['id'] = indexes
    return df

# Collapse each recording directory into one row of summary statistics.
train_ts, test_ts = (
    load_time_series(parquet_dir)
    for parquet_dir in (
        "/Users/ad53533/Desktop/Applied ML/Project/series_train.parquet",
        "/Users/ad53533/Desktop/Applied ML/Project/series_test.parquet",
    )
)

# Every column except the join key is a time-series feature.
time_series_cols = [col for col in train_ts.columns if col != "id"]

# Left-join keeps rows that have no recording; the key is dropped afterwards
# because it is not a feature.
train = train.merge(train_ts, how="left", on="id").drop(columns="id")
test = test.merge(test_ts, how="left", on="id").drop(columns="id")


# Columns kept from the tabular training data, grouped by source instrument.
# 'sii' (last entry) is the target.
featuresCols = [
    # Demographics
    'Basic_Demos-Enroll_Season', 'Basic_Demos-Age', 'Basic_Demos-Sex',
    # CGAS
    'CGAS-Season', 'CGAS-CGAS_Score',
    # Physical measures
    'Physical-Season', 'Physical-BMI', 'Physical-Height', 'Physical-Weight',
    'Physical-Waist_Circumference', 'Physical-Diastolic_BP', 'Physical-HeartRate',
    'Physical-Systolic_BP',
    # Fitness endurance
    'Fitness_Endurance-Season', 'Fitness_Endurance-Max_Stage',
    'Fitness_Endurance-Time_Mins', 'Fitness_Endurance-Time_Sec',
    # FGC
    'FGC-Season', 'FGC-FGC_CU', 'FGC-FGC_CU_Zone', 'FGC-FGC_GSND', 'FGC-FGC_GSND_Zone',
    'FGC-FGC_GSD', 'FGC-FGC_GSD_Zone', 'FGC-FGC_PU', 'FGC-FGC_PU_Zone', 'FGC-FGC_SRL',
    'FGC-FGC_SRL_Zone', 'FGC-FGC_SRR', 'FGC-FGC_SRR_Zone', 'FGC-FGC_TL', 'FGC-FGC_TL_Zone',
    # BIA
    'BIA-Season', 'BIA-BIA_Activity_Level_num', 'BIA-BIA_BMC', 'BIA-BIA_BMI', 'BIA-BIA_BMR',
    'BIA-BIA_DEE', 'BIA-BIA_ECW', 'BIA-BIA_FFM', 'BIA-BIA_FFMI', 'BIA-BIA_FMI', 'BIA-BIA_Fat',
    'BIA-BIA_Frame_num', 'BIA-BIA_ICW', 'BIA-BIA_LDM', 'BIA-BIA_LST', 'BIA-BIA_SMM',
    'BIA-BIA_TBW',
    # PAQ (adolescent / child)
    'PAQ_A-Season', 'PAQ_A-PAQ_A_Total', 'PAQ_C-Season', 'PAQ_C-PAQ_C_Total',
    # SDS
    'SDS-Season', 'SDS-SDS_Total_Raw', 'SDS-SDS_Total_T',
    # Internet use
    'PreInt_EduHx-Season', 'PreInt_EduHx-computerinternet_hoursday',
    # Target
    'sii',
]

# Append the time-series summary columns after the tabular ones.
featuresCols.extend(time_series_cols)

# Restrict to the selected columns and discard rows with no target value.
train = train[featuresCols].dropna(subset='sii')


# Season columns: the only string-valued features, encoded further below.
cat_c = [
    'Basic_Demos-Enroll_Season',
    'CGAS-Season',
    'Physical-Season',
    'Fitness_Endurance-Season',
    'FGC-Season',
    'BIA-Season',
    'PAQ_A-Season',
    'PAQ_C-Season',
    'SDS-Season',
    'PreInt_EduHx-Season',
]

def update(df):
    """Prepare the columns listed in the module-level ``cat_c`` for encoding.

    In each of those columns, missing entries are replaced by the label
    ``'Missing'`` and the column is converted to pandas ``category`` dtype.

    Args:
        df: Frame containing every column named in ``cat_c``.

    Returns:
        The same ``df`` object, which is modified in place.
    """
    for season_col in cat_c:
        df[season_col] = df[season_col].fillna('Missing').astype('category')
    return df

# Update categorical variables
train = update(train)
test = update(test)

# Give each categorical column the SAME ordered level set in both frames.
# get_dummies(drop_first=True) drops the first level of each frame separately;
# if test lacks a level that train has, a different reference level would be
# dropped and an all-zero dummy row would mean different seasons in train and
# test. A shared CategoricalDtype makes both frames emit identical dummy
# columns and drop the same reference level.
for col in cat_c:
    shared_levels = sorted(set(train[col]) | set(test[col]))
    shared_dtype = pd.CategoricalDtype(categories=shared_levels)
    train[col] = train[col].astype(shared_dtype)
    test[col] = test[col].astype(shared_dtype)

# Perform one-hot encoding
train = pd.get_dummies(train, columns=cat_c, drop_first=True)
test = pd.get_dummies(test, columns=cat_c, drop_first=True)

# Align train and test datasets to have the same columns (and column order);
# the target is set aside first because test has no 'sii' column to align on.
train_sii = train['sii']
train, test = train.drop('sii', axis=1).align(test, join='outer', axis=1, fill_value=0)
train['sii'] = train_sii

def create_mapping(column, dataset):
    """Map each distinct value of ``dataset[column]`` to an integer code.

    Codes are assigned 0, 1, 2, ... in order of first appearance.

    Args:
        column: Name of the column to encode.
        dataset: Frame that contains ``column``.

    Returns:
        Dict from distinct value to its integer code.
    """
    distinct = dataset[column].unique()
    return dict(zip(distinct, range(len(distinct))))

# for col in cat_c:
#     mapping = create_mapping(col, train)
#     mappingTe = create_mapping(col, test)
#     
#     train[col] = train[col].replace(mapping).astype(int)
#     test[col] = test[col].replace(mappingTe).astype(int)

def quadratic_weighted_kappa(y_true, y_pred):
    """Return Cohen's kappa between two label sequences with quadratic weights.

    Args:
        y_true: Reference labels.
        y_pred: Predicted labels, same length as ``y_true``.
    """
    kappa = cohen_kappa_score(y_true, y_pred, weights='quadratic')
    return kappa

def threshold_Rounder(oof_non_rounded, thresholds):
    """Bucket continuous predictions into the integer classes 0-3.

    A value gets class 0 if it is below ``thresholds[0]``, otherwise class 1
    if below ``thresholds[1]``, otherwise class 2 if below ``thresholds[2]``,
    otherwise class 3. The cut points are tested in that order, so the first
    one a value falls below decides its class.

    Args:
        oof_non_rounded: Array of continuous predictions.
        thresholds: Sequence whose first three items are the cut points.

    Returns:
        Integer array with the same shape as ``oof_non_rounded``.
    """
    below_cut = [oof_non_rounded < thresholds[k] for k in range(3)]
    # np.select takes the first condition that holds, mirroring an
    # if / elif / elif / else chain; 3 is the "else" class.
    return np.select(below_cut, [0, 1, 2], default=3)

def evaluate_predictions(thresholds, y_true, oof_non_rounded):
    """Objective for threshold tuning: the negated quadratic weighted kappa.

    ``thresholds`` comes first so the function can be passed directly to
    ``scipy.optimize.minimize``; the sign is flipped because that routine
    minimises while a higher kappa is better.

    Args:
        thresholds: Three cut points handed to ``threshold_Rounder``.
        y_true: Reference labels.
        oof_non_rounded: Continuous predictions aligned with ``y_true``.

    Returns:
        Minus the kappa between ``y_true`` and the bucketed predictions.
    """
    bucketed = threshold_Rounder(oof_non_rounded, thresholds)
    score = quadratic_weighted_kappa(y_true, bucketed)
    return -score

def TrainML(model_class, test_data):
    """Cross-validate an estimator on the global ``train`` frame and label ``test_data``.

    For each of ``n_splits`` stratified folds a fresh clone of ``model_class``
    is fitted, its out-of-fold predictions are stored, and it also predicts
    ``test_data``. The cut points that turn continuous predictions into the
    classes 0-3 are then tuned on the out-of-fold predictions with
    Nelder-Mead (maximising quadratic weighted kappa) and applied to the
    fold-averaged test predictions.

    Relies on the module-level names ``train``, ``n_splits`` and ``SEED``.
    Prints per-fold and summary scores as a side effect.

    Args:
        model_class: An unfitted estimator instance (not a class, despite the
            name); it is passed to ``clone`` once per fold.
        test_data: Feature frame to predict with each fold's fitted model.

    Returns:
        Integer array of classes in {0, 1, 2, 3}, one per row of ``test_data``.

    Raises:
        AssertionError: If the threshold optimisation reports failure.
    """
    X = train.drop(['sii'], axis=1)
    y = train['sii']

    SKF = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=SEED)

    train_S = []
    test_S = []

    # Out-of-fold predictions: each row is filled by the one fold in which it
    # was held out, so the threshold tuning below never sees a fitted row.
    oof_non_rounded = np.zeros(len(y), dtype=float)
    # One column of test predictions per fold; averaged after the loop.
    test_preds = np.zeros((len(test_data), n_splits))

    for fold, (train_idx, test_idx) in enumerate(tqdm(SKF.split(X, y), desc="Training Folds", total=n_splits)):
        X_train, X_val = X.iloc[train_idx], X.iloc[test_idx]
        y_train, y_val = y.iloc[train_idx], y.iloc[test_idx]

        model = clone(model_class)
        model.fit(X_train, y_train)

        y_train_pred = model.predict(X_train)
        y_val_pred = model.predict(X_val)

        oof_non_rounded[test_idx] = y_val_pred

        # Per-fold scores use plain rounding; tuned cut points come later.
        train_kappa = quadratic_weighted_kappa(y_train, y_train_pred.round(0).astype(int))
        val_kappa = quadratic_weighted_kappa(y_val, y_val_pred.round(0).astype(int))

        train_S.append(train_kappa)
        test_S.append(val_kappa)

        test_preds[:, fold] = model.predict(test_data)

        print(f"Fold {fold+1} - Train QWK: {train_kappa:.4f}, Validation QWK: {val_kappa:.4f}")
        # wait=True defers the clear until the next output arrives.
        clear_output(wait=True)

    print(f"Mean Train QWK --> {np.mean(train_S):.4f}")
    print(f"Mean Validation QWK ---> {np.mean(test_S):.4f}")

    # Nelder-Mead is derivative-free, which suits this step-shaped objective.
    KappaOPtimizer = minimize(evaluate_predictions,
                              x0=[0.5, 1.5, 2.5], args=(y, oof_non_rounded),
                              method='Nelder-Mead')
    # Raised explicitly (same exception type as before) so the check is not
    # stripped when Python runs with -O.
    if not KappaOPtimizer.success:
        raise AssertionError("Optimization did not converge.")

    oof_tuned = threshold_Rounder(oof_non_rounded, KappaOPtimizer.x)
    tKappa = quadratic_weighted_kappa(y, oof_tuned)

    print(f"----> || Optimized QWK SCORE :: {Fore.CYAN}{Style.BRIGHT} {tKappa:.3f}{Style.RESET_ALL}")

    # Average the folds' test predictions, then bucket with the tuned cuts.
    tpm = test_preds.mean(axis=1)
    tp_rounded = threshold_Rounder(tpm, KappaOPtimizer.x)

    return tp_rounded

from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline

# Imputation step: fill missing values with the column median.
# NOTE(review): this module-level imputer is not referenced by any code visible
# here; the pipelines in run_with_parameters construct their own
# SimpleImputer instances. Confirm it is unused before removing it.
imputer = SimpleImputer(strategy='median')

          id  sii
0   00008ff9    0
1   000fd460    1
2   00105258    2
3   00115b9f    3
4   0016bb22    0
5   001f3379    1
6   0038ba98    2
7   0068a485    3
8   0069fbed    0
9   0083e397    1
10  0087dd65    2
11  00abe655    3
12  00ae59c9    0
13  00af6387    1
14  00bd4359    2
15  00c0cd71    3
16  00d56d4b    0
17  00d9913d    1
18  00e6167c    2
19  00ebc35d    3


100%|██████████| 996/996 [00:20<00:00, 47.59it/s]
100%|██████████| 2/2 [00:00<00:00, 16.36it/s]


In [2]:
from collections import Counter
import numpy as np
import pandas as pd

def _median_imputed(regressor):
    """Wrap ``regressor`` in a pipeline that median-imputes missing values first."""
    return Pipeline(steps=[
        ('imputer', SimpleImputer(strategy='median')),
        ('regressor', regressor),
    ])


def run_with_parameters(params_list, test_data):
    """
    Train one voting ensemble per parameter set and combine their labels
    by majority vote.

    Each ensemble averages five median-imputed regressors (LightGBM, XGBoost,
    CatBoost, random forest, gradient boosting) and is evaluated and applied
    through ``TrainML``.

    Args:
    - params_list: A list of dicts, one per run. Each dict may hold the keys
      'lgb', 'xgb', 'cat', 'rf' and 'gb', mapping to keyword arguments for the
      corresponding regressor; a missing key means library defaults.
    - test_data: The test dataset.

    Returns:
    - A list with one label per test row: the label predicted by the most
      runs. On a tie, the label from the earliest run among the tied ones wins.
    """
    labels_per_run = []

    for params in params_list:
        # Same five members in every run; only their hyper-parameters change.
        members = [
            ('lgb', LGBMRegressor(**params.get('lgb', {}), random_state=SEED)),
            ('xgb', XGBRegressor(**params.get('xgb', {}), random_state=SEED)),
            ('cat', CatBoostRegressor(**params.get('cat', {}), random_state=SEED, silent=True)),
            ('rf', RandomForestRegressor(**params.get('rf', {}), random_state=SEED)),
            ('gb', GradientBoostingRegressor(**params.get('gb', {}), random_state=SEED)),
        ]
        ensemble = VotingRegressor(
            estimators=[(name, _median_imputed(regressor)) for name, regressor in members]
        )

        labels_per_run.append(TrainML(ensemble, test_data))

    # Transpose so that each row holds every run's label for one test sample.
    votes_by_row = np.array(labels_per_run).T
    return [Counter(row_votes).most_common(1)[0][0] for row_votes in votes_by_row]

# Define parameter sets for multiple runs.
# Each dict configures one ensemble; the keys ('lgb', 'xgb', 'cat', 'rf', 'gb')
# are the ones run_with_parameters looks up with params.get(...), and the
# inner dicts are passed to the matching regressor as keyword arguments.
# From the first set to the third, learning rates fall while the number of
# estimators/iterations and the tree depths rise.
parameter_sets = [
    {
        'lgb': {'learning_rate': 0.1, 'n_estimators': 100},
        'xgb': {'learning_rate': 0.1, 'n_estimators': 100},
        'cat': {'depth': 6, 'iterations': 100},
        'rf': {'n_estimators': 100, 'max_depth': 10},
        'gb': {'n_estimators': 100, 'learning_rate': 0.1}
    },
    {
        'lgb': {'learning_rate': 0.05, 'n_estimators': 150},
        'xgb': {'learning_rate': 0.05, 'n_estimators': 150},
        'cat': {'depth': 8, 'iterations': 150},
        'rf': {'n_estimators': 150, 'max_depth': 12},
        'gb': {'n_estimators': 150, 'learning_rate': 0.05}
    },
    {
        'lgb': {'learning_rate': 0.01, 'n_estimators': 200},
        'xgb': {'learning_rate': 0.01, 'n_estimators': 200},
        'cat': {'depth': 10, 'iterations': 200},
        'rf': {'n_estimators': 200, 'max_depth': 15},
        'gb': {'n_estimators': 200, 'learning_rate': 0.01}
    }
]

# Run the process and get the final predictions (one majority-voted label per
# row of `test`).
final_predictions = run_with_parameters(parameter_sets, test)

# Save the final predictions to a CSV file.
# NOTE(review): the labels are assigned positionally, so this assumes the rows
# of `test` are in the same order as the rows of `sample` — confirm.
sample['sii'] = final_predictions
sample.to_csv('submission.csv', index=False)

print("Final predictions saved to submission.csv")

Training Folds: 100%|██████████| 5/5 [01:53<00:00, 22.69s/it]

Mean Train QWK --> 0.7618
Mean Validation QWK ---> 0.3532
----> || Optimized QWK SCORE :: [36m[1m 0.460[0m
Final predictions saved to submission_majority_vote.csv



