In [15]:
import numpy as np
import pandas as pd
import os
import re
from sklearn.base import clone
from sklearn.metrics import cohen_kappa_score
from sklearn.model_selection import StratifiedKFold
from scipy.optimize import minimize
from concurrent.futures import ThreadPoolExecutor
from tqdm import tqdm

from colorama import Fore, Style
from IPython.display import clear_output
import warnings
from lightgbm import LGBMRegressor
from xgboost import XGBRegressor
from catboost import CatBoostRegressor
from sklearn.ensemble import VotingRegressor, RandomForestRegressor, GradientBoostingRegressor
from sklearn.linear_model import ElasticNet
from sklearn.svm import SVR
from sklearn.neighbors import KNeighborsRegressor
from sklearn.ensemble import ExtraTreesRegressor
from sklearn.linear_model import Ridge
import matplotlib.pyplot as plt

warnings.filterwarnings('ignore')
pd.options.display.max_columns = None

SEED = 42
n_splits = 5

# Load datasets
train = pd.read_csv('/Users/ad53533/Desktop/Applied ML/Project/train.csv')
test = pd.read_csv('/Users/ad53533/Desktop/Applied ML/Project/test.csv')
sample = pd.read_csv('/Users/ad53533/Desktop/Applied ML/Project/sample_submission.csv')

print(sample)

def process_file(filename, dirname):
    df = pd.read_parquet(os.path.join(dirname, filename, 'part-0.parquet'))
    df.drop('step', axis=1, inplace=True)
    return df.describe().values.reshape(-1), filename.split('=')[1]

def load_time_series(dirname) -> pd.DataFrame:
    ids = os.listdir(dirname)
    with ThreadPoolExecutor() as executor:
        results = list(tqdm(executor.map(lambda fname: process_file(fname, dirname), ids), total=len(ids)))
    stats, indexes = zip(*results)
    df = pd.DataFrame(stats, columns=[f"stat_{i}" for i in range(len(stats[0]))])
    df['id'] = indexes
    return df

train_ts = load_time_series("/Users/ad53533/Desktop/Applied ML/Project/series_train.parquet")
test_ts = load_time_series("/Users/ad53533/Desktop/Applied ML/Project/series_test.parquet")

time_series_cols = train_ts.columns.tolist()
time_series_cols.remove("id")

train = pd.merge(train, train_ts, how="left", on='id')
test = pd.merge(test, test_ts, how="left", on='id')

train = train.drop('id', axis=1)
test = test.drop('id', axis=1)


featuresCols = ['Basic_Demos-Enroll_Season', 'Basic_Demos-Age', 'Basic_Demos-Sex', 'CGAS-Season',
                'Physical-Season', 'Physical-BMI', 'Physical-Height', 'Physical-Weight', 'Physical-Waist_Circumference',
                'Physical-Diastolic_BP', 'Physical-HeartRate', 'Physical-Systolic_BP', 'Fitness_Endurance-Season',
                'FGC-Season',
                'FGC-FGC_CU', 'FGC-FGC_GSND', 'FGC-FGC_GSND_Zone', 'FGC-FGC_GSD_Zone',
                'FGC-FGC_PU', 'FGC-FGC_SRL', 'FGC-FGC_SRL_Zone', 'FGC-FGC_SRR', 'FGC-FGC_SRR_Zone',
                'FGC-FGC_TL', 'FGC-FGC_TL_Zone', 'BIA-Season', 'BIA-BIA_Activity_Level_num', 'BIA-BIA_BMI',
                'BIA-BIA_FFM', 'BIA-BIA_FFMI', 'BIA-BIA_FMI',
                'BIA-BIA_Frame_num', 'PAQ_A-Season',
                'PAQ_A-PAQ_A_Total', 'PAQ_C-Season', 'PAQ_C-PAQ_C_Total', 'SDS-Season', 'SDS-SDS_Total_Raw',
                'PreInt_EduHx-Season', 'PreInt_EduHx-computerinternet_hoursday', 'sii']

featuresCols += time_series_cols

train = train[featuresCols]
train = train.dropna(subset='sii')


cat_c = ['Basic_Demos-Enroll_Season', 'CGAS-Season', 'Physical-Season', 'Fitness_Endurance-Season', 'FGC-Season', 'BIA-Season', 
         'PAQ_A-Season', 'PAQ_C-Season', 'SDS-Season', 'PreInt_EduHx-Season']

def update(df):
    global cat_c
    for c in cat_c: 
        df[c] = df[c].fillna('Missing')
        df[c] = df[c].astype('category')
    return df

# Update categorical variables
train = update(train)
test = update(test)

# Perform one-hot encoding
train = pd.get_dummies(train, columns=cat_c, drop_first=True)
test = pd.get_dummies(test, columns=cat_c, drop_first=True)

def feature_engineering(df):
    season_cols = [col for col in df.columns if 'Season' in col]
    df = df.drop(season_cols, axis=1) 
    df['BMI_Age'] = df['Physical-BMI'] * df['Basic_Demos-Age']
    df['Internet_Hours_Age'] = df['PreInt_EduHx-computerinternet_hoursday'] * df['Basic_Demos-Age']
    df['BMI_Internet_Hours'] = df['Physical-BMI'] * df['PreInt_EduHx-computerinternet_hoursday']
    df['BMI_Age'] = df['Physical-BMI'] * df['Basic_Demos-Age']
    df['Age_Height'] = df['Basic_Demos-Age'] * df['Physical-Height']
    df['Waist_Internet_Hours'] = df['Physical-Waist_Circumference'] * df['PreInt_EduHx-computerinternet_hoursday']
    df['BIA-BMI_BIA-Frame_Num'] = df['BIA-BIA_BMI'] * df['BIA-BIA_Frame_num']
    return df

train = feature_engineering(train)
train = train.dropna(thresh=10, axis=0)
test = feature_engineering(test)

# Align train and test datasets to have the same columns
train_sii = train['sii']
train, test = train.drop('sii', axis=1).align(test, join='outer', axis=1, fill_value=0)
train['sii'] = train_sii

def create_mapping(column, dataset):
    unique_values = dataset[column].unique()
    return {value: idx for idx, value in enumerate(unique_values)}

# for col in cat_c:
#     mapping = create_mapping(col, train)
#     mappingTe = create_mapping(col, test)
#     
#     train[col] = train[col].replace(mapping).astype(int)
#     test[col] = test[col].replace(mappingTe).astype(int)

def quadratic_weighted_kappa(y_true, y_pred):
    return cohen_kappa_score(y_true, y_pred, weights='quadratic')

def threshold_Rounder(oof_non_rounded, thresholds):
    return np.where(oof_non_rounded < thresholds[0], 0,
                    np.where(oof_non_rounded < thresholds[1], 1,
                             np.where(oof_non_rounded < thresholds[2], 2, 3)))

def evaluate_predictions(thresholds, y_true, oof_non_rounded):
    rounded_p = threshold_Rounder(oof_non_rounded, thresholds)
    return -quadratic_weighted_kappa(y_true, rounded_p)

def TrainML(model_class, test_data):
    X = train.drop(['sii'], axis=1)
    y = train['sii']

    SKF = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=SEED)
    
    train_S = []
    test_S = []
    
    oof_non_rounded = np.zeros(len(y), dtype=float) 
    oof_rounded = np.zeros(len(y), dtype=int) 
    test_preds = np.zeros((len(test_data), n_splits))

    for fold, (train_idx, test_idx) in enumerate(tqdm(SKF.split(X, y), desc="Training Folds", total=n_splits)):
        X_train, X_val = X.iloc[train_idx], X.iloc[test_idx]
        y_train, y_val = y.iloc[train_idx], y.iloc[test_idx]

        model = clone(model_class)
        model.fit(X_train, y_train)

        y_train_pred = model.predict(X_train)
        y_val_pred = model.predict(X_val)

        oof_non_rounded[test_idx] = y_val_pred
        y_val_pred_rounded = y_val_pred.round(0).astype(int)
        oof_rounded[test_idx] = y_val_pred_rounded

        train_kappa = quadratic_weighted_kappa(y_train, y_train_pred.round(0).astype(int))
        val_kappa = quadratic_weighted_kappa(y_val, y_val_pred_rounded)

        train_S.append(train_kappa)
        test_S.append(val_kappa)
        
        test_preds[:, fold] = model.predict(test_data)
        
        print(f"Fold {fold+1} - Train QWK: {train_kappa:.4f}, Validation QWK: {val_kappa:.4f}")
        clear_output(wait=True)

    print(f"Mean Train QWK --> {np.mean(train_S):.4f}")
    print(f"Mean Validation QWK ---> {np.mean(test_S):.4f}")

    KappaOPtimizer = minimize(evaluate_predictions,
                              x0=[0.5, 1.5, 2.5], args=(y, oof_non_rounded), 
                              method='Nelder-Mead')
    assert KappaOPtimizer.success, "Optimization did not converge."
    
    oof_tuned = threshold_Rounder(oof_non_rounded, KappaOPtimizer.x)
    tKappa = quadratic_weighted_kappa(y, oof_tuned)

    print(f"----> || Optimized QWK SCORE :: {Fore.CYAN}{Style.BRIGHT} {tKappa:.3f}{Style.RESET_ALL}")

    tpm = test_preds.mean(axis=1)
    tp_rounded = threshold_Rounder(tpm, KappaOPtimizer.x)

    return tp_rounded

from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline

# Imputation step: Filling missing values with the median
imputer = SimpleImputer(strategy='median')

ensemble = VotingRegressor(estimators=[
    # Existing methods
    ('lgb', Pipeline(steps=[('imputer', imputer), ('regressor', LGBMRegressor(random_state=SEED))])),
    ('xgb', Pipeline(steps=[('imputer', imputer), ('regressor', XGBRegressor(random_state=SEED))])),
    ('cat', Pipeline(steps=[('imputer', imputer), ('regressor', CatBoostRegressor(random_state=SEED, silent=True))])),
    ('rf', Pipeline(steps=[('imputer', imputer), ('regressor', RandomForestRegressor(random_state=SEED))])),
    ('gb', Pipeline(steps=[('imputer', imputer), ('regressor', GradientBoostingRegressor(random_state=SEED))])),
    # New method
    ('elasticnet', Pipeline(steps=[('imputer', imputer), ('regressor', ElasticNet(random_state=SEED))])),
    ('svr', Pipeline(steps=[('imputer', imputer), ('regressor', SVR())])),
    ('knn', Pipeline(steps=[('imputer', imputer), ('regressor', KNeighborsRegressor())])),
    ('extratrees', Pipeline(steps=[('imputer', imputer), ('regressor', ExtraTreesRegressor(random_state=SEED))])),
    ('ridge', Pipeline(steps=[('imputer', imputer), ('regressor', Ridge(alpha=1.0, random_state=SEED))]))
])

# Train the ensemble with the updated model pipeline
predictions = TrainML(ensemble, test)

# Save predictions to a CSV file
sample['sii'] = predictions
# sample.to_csv('submission.csv', index=False)

Training Folds: 100%|██████████| 5/5 [01:11<00:00, 14.22s/it]

Mean Train QWK --> 0.7973
Mean Validation QWK ---> 0.3710





----> || Optimized QWK SCORE :: [36m[1m 0.456[0m


In [13]:
sample.to_csv('submission.csv', index=False)

In [None]:
np.size(sample)

In [None]:
np.size(test)

In [22]:
time_series_cols

Unnamed: 0,BIA-BIA_Activity_Level_num,BIA-BIA_BMC,BIA-BIA_BMI,BIA-BIA_BMR,BIA-BIA_DEE,BIA-BIA_ECW,BIA-BIA_FFM,BIA-BIA_FFMI,BIA-BIA_FMI,BIA-BIA_Fat,BIA-BIA_Frame_num,BIA-BIA_ICW,BIA-BIA_LDM,BIA-BIA_LST,BIA-BIA_SMM,BIA-BIA_TBW,BIA-Season_Missing,BIA-Season_Spring,BIA-Season_Summer,BIA-Season_Winter,Basic_Demos-Age,Basic_Demos-Enroll_Season_Spring,Basic_Demos-Enroll_Season_Summer,Basic_Demos-Enroll_Season_Winter,Basic_Demos-Sex,CGAS-CGAS_Score,CGAS-Season_Missing,CGAS-Season_Spring,CGAS-Season_Summer,CGAS-Season_Winter,FGC-FGC_CU,FGC-FGC_CU_Zone,FGC-FGC_GSD,FGC-FGC_GSD_Zone,FGC-FGC_GSND,FGC-FGC_GSND_Zone,FGC-FGC_PU,FGC-FGC_PU_Zone,FGC-FGC_SRL,FGC-FGC_SRL_Zone,FGC-FGC_SRR,FGC-FGC_SRR_Zone,FGC-FGC_TL,FGC-FGC_TL_Zone,FGC-Season_Missing,FGC-Season_Spring,FGC-Season_Summer,FGC-Season_Winter,Fitness_Endurance-Max_Stage,Fitness_Endurance-Season_Missing,Fitness_Endurance-Season_Spring,Fitness_Endurance-Season_Summer,Fitness_Endurance-Season_Winter,Fitness_Endurance-Time_Mins,Fitness_Endurance-Time_Sec,PAQ_A-PAQ_A_Total,PAQ_A-Season_Missing,PAQ_A-Season_Spring,PAQ_A-Season_Summer,PAQ_A-Season_Winter,PAQ_C-PAQ_C_Total,PAQ_C-Season_Missing,PAQ_C-Season_Spring,PAQ_C-Season_Summer,PAQ_C-Season_Winter,Physical-BMI,Physical-Diastolic_BP,Physical-HeartRate,Physical-Height,Physical-Season_Missing,Physical-Season_Spring,Physical-Season_Summer,Physical-Season_Winter,Physical-Systolic_BP,Physical-Waist_Circumference,Physical-Weight,PreInt_EduHx-Season_Missing,PreInt_EduHx-Season_Spring,PreInt_EduHx-Season_Summer,PreInt_EduHx-Season_Winter,PreInt_EduHx-computerinternet_hoursday,SDS-SDS_Total_Raw,SDS-SDS_Total_T,SDS-Season_Missing,SDS-Season_Spring,SDS-Season_Summer,SDS-Season_Winter,light_max,light_mean,light_std,motion_intensity_max,motion_intensity_mean,motion_intensity_std,non_wear_percentage,stat_0,stat_1,stat_10,stat_100,stat_101,stat_102,stat_103,stat_11,stat_12,stat_13,stat_14,stat_15,stat_16,stat_17,stat_18,stat_19,stat_2,stat_20,stat_21,stat_22,stat_23,stat_24,stat_25,stat_26,stat_27,stat_28,stat_29,stat_3,stat_30,stat_31,stat_32,stat_33,stat_34,stat_35,stat_36,stat_37,stat_38,stat_39,stat_4,stat_40,stat_41,stat_42,stat_43,stat_44,stat_45,stat_46,stat_47,stat_48,stat_49,stat_5,stat_50,stat_51,stat_52,stat_53,stat_54,stat_55,stat_56,stat_57,stat_58,stat_59,stat_6,stat_60,stat_61,stat_62,stat_63,stat_64,stat_65,stat_66,stat_67,stat_68,stat_69,stat_7,stat_70,stat_71,stat_72,stat_73,stat_74,stat_75,stat_76,stat_77,stat_78,stat_79,stat_8,stat_80,stat_81,stat_82,stat_83,stat_84,stat_85,stat_86,stat_87,stat_88,stat_89,stat_9,stat_90,stat_91,stat_92,stat_93,stat_94,stat_95,stat_96,stat_97,stat_98,stat_99,sii
0,2.0,2.66855,16.8792,932.498,1492.0,8.25598,41.5862,13.8177,3.06143,9.21377,1.0,24.4349,8.89536,38.9177,19.5413,32.6909,False,False,False,False,5,False,False,False,0,51.0,False,False,False,True,0.0,0.0,,,,,0.0,0.0,7.0,0.0,6.0,0.0,6.0,1.0,False,False,False,False,,True,False,False,False,,,,True,False,False,False,,True,False,False,False,16.877316,,,46.0,False,False,False,False,,,50.8,False,False,False,False,3.0,,,True,False,False,False,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,2.0
1,2.0,2.57949,14.0371,936.656,1498.65,6.01993,42.0291,12.8254,1.21172,3.97085,1.0,21.0352,14.974,39.4497,15.4107,27.0552,False,False,False,True,9,False,True,False,0,,True,False,False,False,3.0,0.0,,,,,5.0,0.0,11.0,1.0,11.0,1.0,3.0,0.0,False,False,False,False,,True,False,False,False,,,,True,False,False,False,2.34,False,False,False,False,14.03559,75.0,70.0,48.0,False,False,False,False,122.0,22.0,46.0,False,False,True,False,0.0,46.0,64.0,False,False,False,False,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,0.0
2,,,,,,,,,,,,,,,,,True,False,False,False,10,False,True,False,1,71.0,False,False,False,False,20.0,1.0,14.7,2.0,10.2,1.0,7.0,1.0,10.0,1.0,10.0,1.0,5.0,0.0,False,False,False,False,5.0,False,False,False,False,7.0,33.0,,True,False,False,False,2.17,False,False,True,False,16.648696,65.0,94.0,56.5,False,False,False,False,117.0,,75.6,False,False,True,False,2.0,38.0,54.0,False,False,False,False,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,0.0
3,3.0,3.84191,18.2943,1131.43,1923.44,15.5925,62.7757,14.074,4.22033,18.8243,2.0,30.4041,16.779,58.9338,26.4798,45.9966,False,False,True,False,9,False,False,True,0,71.0,False,False,False,False,18.0,1.0,,,,,5.0,0.0,7.0,0.0,7.0,0.0,7.0,1.0,False,False,True,False,6.0,False,False,True,False,9.0,37.0,,True,False,False,False,2.451,False,False,False,True,18.292347,60.0,97.0,56.0,False,False,True,False,117.0,,81.6,False,False,False,True,0.0,31.0,45.0,False,False,True,False,0.320171,-0.150284,-0.106555,-0.250832,13.523217,-0.240042,50462.148396,43330.0,43330.0,43330.0,42.29631,208.168884,2633.25,-0.008399,43330.0,-0.316384,0.016009,-0.16789,0.047388,-10.580416,0.0,42.29631,4053.579102,43330.0,50462150000000.0,4.470182,3.0,53.201683,0.453665,0.502698,0.58571,0.106353,42.947163,0.0,43330.0,208.168884,112.401535,19428420000000.0,1.931421,0.0,14.245132,-1.746094,-2.905339,-1.048372,0.0,43330.0,-89.833092,0.0,0.0,3824.0,55000000000.0,1.0,3.0,41.0,-0.68418,-0.309863,43330.0,-0.649974,0.006432,-41.541863,0.0,2.392969,4028.666748,36890000000000.0,3.0,3.0,42.0,43330.0,-0.366849,0.024974,-0.245378,0.023637,-15.086617,0.0,6.926828,4070.0,53477500000000.0,5.0,43330.0,3.0,50.0,-0.010677,0.400677,0.204727,0.04142,12.220764,0.0,15.0,4147.0,43330.0,66408750000000.0,6.0,3.0,53.0,1.507865,1.666354,1.546979,4.004276,89.751656,0.0,43330.0,2633.25,4188.5,86110000000000.0,7.0,3.0,85.0,0.958392,0.107638,2.982514,,1.0
5,2.0,4.33036,30.1865,1330.97,1996.45,30.2124,84.0285,16.6877,13.4988,67.9715,2.0,32.9141,20.902,79.6982,35.3804,63.1265,False,False,True,False,13,True,False,False,1,50.0,False,False,False,True,12.0,0.0,17.9,2.0,16.5,2.0,6.0,0.0,10.0,1.0,11.0,1.0,8.0,0.0,False,False,True,False,,True,False,False,False,,,,True,False,False,False,4.11,False,True,False,False,22.279952,60.0,73.0,59.5,False,False,True,False,102.0,,112.2,False,True,False,False,0.0,40.0,56.0,False,False,True,False,0.261557,0.068371,-0.011114,0.002237,11.504102,-0.001337,43212.117201,396396.0,396396.0,396396.0,16.771982,95.304085,2597.800049,-0.002717,396396.0,-0.004272,0.016859,-0.631731,0.011926,-55.630768,0.655708,16.771982,3838.189453,396396.0,43212120000000.0,3.909848,3.0,79.435593,0.351582,0.303726,0.622458,0.024306,50.368,0.467727,396396.0,95.304085,155.542389,24972640000000.0,1.946892,0.0,6.63358,-1.038711,-1.52269,-1.018787,0.0,396396.0,-88.761833,0.0,0.0,3098.166748,0.0,1.0,3.0,68.0,-0.052803,-0.044517,396396.0,-1.009344,0.008622,-88.386049,0.0,0.5,3747.0,21540000000000.0,2.0,3.0,74.0,396396.0,-0.020622,-0.028179,-1.007728,0.009831,-86.119919,1.0,0.879005,3812.0,43310000000000.0,4.0,396396.0,3.0,79.0,-0.019081,0.020307,-0.294459,0.010668,-17.483364,1.0,6.141348,3951.187561,396396.0,64855000000000.0,6.0,3.0,85.0,1.034351,1.946303,1.146284,2.952888,89.476036,1.0,396396.0,2597.800049,4175.0,86395000000000.0,7.0,3.0,91.0,1.001227,0.039254,2.171842,,1.0


In [None]:
train.head()