In [1]:
import numpy as np
import pandas as pd
import os
import re
from sklearn.base import clone
from sklearn.metrics import cohen_kappa_score
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import KFold
from scipy.optimize import minimize
from concurrent.futures import ThreadPoolExecutor
from tqdm import tqdm

from colorama import Fore, Style
from IPython.display import clear_output
import warnings
from lightgbm import LGBMRegressor
from xgboost import XGBRegressor
from catboost import CatBoostRegressor
from sklearn.ensemble import VotingRegressor, RandomForestRegressor, GradientBoostingRegressor
from sklearn.linear_model import ElasticNet
from sklearn.svm import SVR
from sklearn.neighbors import KNeighborsRegressor
from sklearn.ensemble import ExtraTreesRegressor
from sklearn.linear_model import Ridge
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from scipy.stats import mode  # Add this import
import matplotlib.pyplot as plt


SEED = 42
n_splits = 5

# Load datasets
train = pd.read_csv('/Users/ad53533/Desktop/Applied ML/Project/train.csv')
test = pd.read_csv('/Users/ad53533/Desktop/Applied ML/Project/test.csv')
sample = pd.read_csv('/Users/ad53533/Desktop/Applied ML/Project/sample_submission.csv')


def process_file(filename, dirname):
    df = pd.read_parquet(os.path.join(dirname, filename, 'part-0.parquet'))
    df.drop('step', axis=1, inplace=True)
    return df.describe().values.reshape(-1), filename.split('=')[1]

def load_time_series(dirname) -> pd.DataFrame:
    ids = os.listdir(dirname)
    with ThreadPoolExecutor() as executor:
        results = list(tqdm(executor.map(lambda fname: process_file(fname, dirname), ids), total=len(ids)))
    stats, indexes = zip(*results)
    df = pd.DataFrame(stats, columns=[f"stat_{i}" for i in range(len(stats[0]))])
    df['id'] = indexes
    return df

train_ts = load_time_series("/Users/ad53533/Desktop/Applied ML/Project/series_train.parquet")
test_ts = load_time_series("/Users/ad53533/Desktop/Applied ML/Project/series_test.parquet")

time_series_cols = train_ts.columns.tolist()
time_series_cols.remove("id")

train = pd.merge(train, train_ts, how="left", on='id')
test = pd.merge(test, test_ts, how="left", on='id')

train = train.drop('id', axis=1)
test = test.drop('id', axis=1)

featuresCols = ['Basic_Demos-Enroll_Season', 'Basic_Demos-Age', 'Basic_Demos-Sex', 'CGAS-Season', 'CGAS-CGAS_Score',
                'Physical-Season', 'Physical-BMI', 'Physical-Height', 'Physical-Weight', 'Physical-Waist_Circumference',
                'Physical-Diastolic_BP', 'Physical-HeartRate', 'Physical-Systolic_BP', 'Fitness_Endurance-Season',
                'Fitness_Endurance-Max_Stage', 'Fitness_Endurance-Time_Mins', 'Fitness_Endurance-Time_Sec', 'FGC-Season',
                'FGC-FGC_CU', 'FGC-FGC_CU_Zone', 'FGC-FGC_GSND', 'FGC-FGC_GSND_Zone', 'FGC-FGC_GSD', 'FGC-FGC_GSD_Zone',
                'FGC-FGC_PU', 'FGC-FGC_PU_Zone', 'FGC-FGC_SRL', 'FGC-FGC_SRL_Zone', 'FGC-FGC_SRR', 'FGC-FGC_SRR_Zone',
                'FGC-FGC_TL', 'FGC-FGC_TL_Zone', 'BIA-Season', 'BIA-BIA_Activity_Level_num', 'BIA-BIA_BMC', 'BIA-BIA_BMI',
                'BIA-BIA_BMR', 'BIA-BIA_DEE', 'BIA-BIA_ECW', 'BIA-BIA_FFM', 'BIA-BIA_FFMI', 'BIA-BIA_FMI', 'BIA-BIA_Fat',
                'BIA-BIA_Frame_num', 'BIA-BIA_ICW', 'BIA-BIA_LDM', 'BIA-BIA_LST', 'BIA-BIA_SMM', 'BIA-BIA_TBW', 'PAQ_A-Season',
                'PAQ_A-PAQ_A_Total', 'PAQ_C-Season', 'PAQ_C-PAQ_C_Total', 'SDS-Season', 'SDS-SDS_Total_Raw', 'SDS-SDS_Total_T',
                'PreInt_EduHx-Season', 'PreInt_EduHx-computerinternet_hoursday', 'sii', 'PCIAT-PCIAT_Total']

featuresCols += time_series_cols


train_sii = train['sii']
train = train.dropna(subset='sii')
train = train[featuresCols]


cat_c = ['Basic_Demos-Enroll_Season', 'CGAS-Season', 'Physical-Season', 'Fitness_Endurance-Season', 'FGC-Season', 'BIA-Season', 
         'PAQ_A-Season', 'PAQ_C-Season', 'SDS-Season', 'PreInt_EduHx-Season']

def update(df):
    global cat_c
    for c in cat_c: 
        df[c] = df[c].fillna('Missing')
        df[c] = df[c].astype('category')
    return df

# Update categorical variables
train = update(train)
test = update(test)

# Perform one-hot encoding
train = pd.get_dummies(train, columns=cat_c, drop_first=True)
test = pd.get_dummies(test, columns=cat_c, drop_first=True)

# Align train and test datasets to have the same columns
train_sii = train['sii']
train_PCIAT_Total = train['PCIAT-PCIAT_Total']
train, test = train.drop(['sii', 'PCIAT-PCIAT_Total'], axis=1).align(test, join='outer', axis=1, fill_value=0)
train['sii'] = train_sii
train['PCIAT-PCIAT_Total'] = train_PCIAT_Total

def quadratic_weighted_kappa(y_true, y_pred):
    return cohen_kappa_score(y_true, y_pred, weights='quadratic')

def threshold_Rounder(oof_non_rounded, thresholds):
    return np.where(oof_non_rounded < thresholds[0], 0,
                    np.where(oof_non_rounded < thresholds[1], 1,
                             np.where(oof_non_rounded < thresholds[2], 2, 3)))

def evaluate_predictions(thresholds, y_true, oof_non_rounded):
    rounded_p = threshold_Rounder(oof_non_rounded, thresholds)
    return -quadratic_weighted_kappa(y_true, rounded_p)


def map_to_scaled_real(y_pred):
    """
    Map values from 0-100 to a smaller real number with scaling in specific ranges.

    Args:
    - y_pred (array-like): Input values ranging from 0 to 100.

    Returns:
    - Scaled values as per the mapping logic.
    """
    # Initialize the result array
    scaled = np.zeros_like(y_pred, dtype=float)

    # Apply the mapping logic
    scaled = np.where((y_pred >= 0) & (y_pred <= 30), 
                      y_pred / 30,  # Scale 0-30 to 0-1
                      scaled)
    
    scaled = np.where((y_pred > 30) & (y_pred <= 49),
                      1 + (y_pred - 31) / (49 - 31),  # Scale 31-49 to 1-2
                      scaled)
    
    scaled = np.where((y_pred > 50) & (y_pred <= 79),
                      2 + (y_pred - 51) / (79 - 51),  # Scale 51-79 to 2-3
                      scaled)
    
    scaled = np.where((y_pred > 80) & (y_pred <= 100),
                      3 + (y_pred - 81) / (100 - 81),  # Scale 81-100 to 3-4
                      scaled)

    return scaled

def TrainML(model_class, test_data):
    X = train.drop(['sii', 'PCIAT-PCIAT_Total'], axis=1)
    y = train['PCIAT-PCIAT_Total']
    y_sii = np.select([y <= 30, y <= 49, y <= 79], [0, 1, 2], default=3)

    SKF = KFold(n_splits=n_splits, shuffle=True, random_state=SEED)
    
    train_S = []
    test_S = []
    
    oof_non_rounded = np.zeros(len(y), dtype=float) 
    oof_rounded = np.zeros(len(y), dtype=int) 
    test_preds = np.zeros((len(test_data), n_splits))
    
    for fold, (train_idx, test_idx) in enumerate(tqdm(SKF.split(X, y), desc="Training Folds", total=n_splits)):
        X_train, X_val = X.iloc[train_idx], X.iloc[test_idx]
        y_train, y_val = y.iloc[train_idx], y.iloc[test_idx]
        y_train_mapped = map_to_scaled_real(y_train)
        y_val_mapped = map_to_scaled_real(y_val)
        
        
        model = clone(model_class)
        model.fit(X_train, y_train_mapped)

        y_train_pred = model.predict(X_train)
        # y_train_pred = map_to_scaled_real(y_train_pred)
        
        y_val_pred = model.predict(X_val)
        # y_val_pred = map_to_scaled_real(y_val_pred)

        oof_non_rounded[test_idx] = y_val_pred
        y_val_pred_rounded = y_val_pred.round(0).astype(int)
        oof_rounded[test_idx] = y_val_pred_rounded
        
        # # Calculate residuals
        # residuals = y_train_mapped - y_train_pred
        # # Plot the residuals
        # plt.figure(figsize=(10, 6))
        # plt.scatter(y_train_pred, residuals, alpha=0.5, edgecolor='k')
        # plt.axhline(0, color='red', linestyle='--', linewidth=1.5)
        # plt.title("Residuals of y_train_pred with y_train", fontsize=14)
        # plt.xlabel("Predicted Values (y_train_pred)", fontsize=12)
        # plt.ylabel("Residuals (y_train - y_train_pred)", fontsize=12)
        # plt.grid(True, linestyle='--', alpha=0.6)
        # plt.show()
        
        
        y_train_sii = np.select([y_train <= 30, y_train <= 49, y_train <= 79], [0, 1, 2], default=3)
        train_kappa = quadratic_weighted_kappa(y_train_sii, y_train_pred.round(0).astype(int))
        y_val_sii = np.select([y_val <= 30, y_val <= 49, y_val <= 79], [0, 1, 2], default=3)
        val_kappa = quadratic_weighted_kappa(y_val_sii, y_val_pred.round(0).astype(int))

        train_S.append(train_kappa)
        test_S.append(val_kappa)
        
        test_pred = model.predict(test_data)
        # test_pred = np.select([test_pred <= 0.5, test_pred <= 1.5, test_pred <= 2.5], [0, 1, 2], default=3)
        test_preds[:, fold] = test_pred
        
        print(f"Fold {fold+1} - Train QWK: {train_kappa:.4f}, Validation QWK: {val_kappa:.4f}")
        clear_output(wait=True)

    print(f"Mean Train QWK --> {np.mean(train_S):.4f}")
    print(f"Mean Validation QWK ---> {np.mean(test_S):.4f}")

    KappaOPtimizer = minimize(evaluate_predictions,
                              x0=[0.5, 1.5, 2.5], args=(y_sii, oof_non_rounded), 
                              method='Nelder-Mead')
    assert KappaOPtimizer.success, "Optimization did not converge."
    
    oof_tuned = threshold_Rounder(oof_non_rounded, KappaOPtimizer.x)
    tKappa = quadratic_weighted_kappa(y, oof_tuned)

    print(f"----> || Optimized QWK SCORE :: {Fore.CYAN}{Style.BRIGHT} {tKappa:.3f}{Style.RESET_ALL}")

    tpm = test_preds.mean(axis=1)
    tp_rounded = threshold_Rounder(tpm, KappaOPtimizer.x)
    # tp_rounded = mode(test_preds, axis=1)[0].astype(int)
    return tp_rounded


# Imputation step: Filling missing values with the median
imputer = SimpleImputer(strategy='median')

ensemble = VotingRegressor(estimators=[
    # Existing methods
    ('lgb', Pipeline(steps=[('imputer', imputer), ('regressor', LGBMRegressor(random_state=SEED))])),
    ('xgb', Pipeline(steps=[('imputer', imputer), ('regressor', XGBRegressor(random_state=SEED))])),
    ('cat', Pipeline(steps=[('imputer', imputer), ('regressor', CatBoostRegressor(random_state=SEED, silent=True))])),
    ('rf', Pipeline(steps=[('imputer', imputer), ('regressor', RandomForestRegressor(random_state=SEED))])),
    ('gb', Pipeline(steps=[('imputer', imputer), ('regressor', GradientBoostingRegressor(random_state=SEED))])),
    # New method
    ('elasticnet', Pipeline(steps=[('imputer', imputer), ('regressor', ElasticNet(random_state=SEED))])),
    ('svr', Pipeline(steps=[('imputer', imputer), ('regressor', SVR())])),
    ('knn', Pipeline(steps=[('imputer', imputer), ('regressor', KNeighborsRegressor())])),
    ('extratrees', Pipeline(steps=[('imputer', imputer), ('regressor', ExtraTreesRegressor(random_state=SEED))])),
    ('ridge', Pipeline(steps=[('imputer', imputer), ('regressor', Ridge(alpha=1.0, random_state=SEED))]))
])

# Train the ensemble with the updated model pipeline
predictions = TrainML(ensemble, test)

# Save predictions to a CSV file
sample['sii'] = predictions
# sample.to_csv('submission.csv', index=False)

Training Folds: 100%|██████████| 5/5 [01:42<00:00, 20.48s/it]

Mean Train QWK --> 0.4744
Mean Validation QWK ---> 0.1518
----> || Optimized QWK SCORE :: [36m[1m 0.013[0m





In [10]:
y_sii

NameError: name 'y_sii' is not defined

In [17]:
sample.to_csv('submission.csv', index=False)

In [31]:
import numpy as np
from sklearn.impute import SimpleImputer
from sklearn.ensemble import VotingRegressor
from sklearn.linear_model import ElasticNet, Ridge
from sklearn.svm import SVR
from sklearn.neighbors import KNeighborsRegressor
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor, ExtraTreesRegressor
from lightgbm import LGBMRegressor
from xgboost import XGBRegressor
from catboost import CatBoostRegressor
from sklearn.pipeline import Pipeline

SEED = 42
train = pd.read_csv('/Users/ad53533/Desktop/Applied ML/Project/train.csv')
test = pd.read_csv('/Users/ad53533/Desktop/Applied ML/Project/test.csv')
# Your train data here
train = train.dropna(subset=['sii'])
y_train = train['PCIAT-PCIAT_Total']
train = train.drop(columns=[col for col in train.columns if 'PCIAT' in col])
train = train.drop(columns=[col for col in train.columns if 'Season' in col])
train = train.drop(columns=['id'])
X_train = train.drop('sii', axis=1)


imputer = SimpleImputer(strategy='median')

ensemble = VotingRegressor(estimators=[
    ('lgb', Pipeline(steps=[('imputer', imputer), ('regressor', LGBMRegressor(random_state=SEED))])),
    ('xgb', Pipeline(steps=[('imputer', imputer), ('regressor', XGBRegressor(random_state=SEED))])),
    ('cat', Pipeline(steps=[('imputer', imputer), ('regressor', CatBoostRegressor(random_state=SEED, silent=True))])),
    ('rf', Pipeline(steps=[('imputer', imputer), ('regressor', RandomForestRegressor(random_state=SEED))])),
    ('gb', Pipeline(steps=[('imputer', imputer), ('regressor', GradientBoostingRegressor(random_state=SEED))])),
    ('elasticnet', Pipeline(steps=[('imputer', imputer), ('regressor', ElasticNet(random_state=SEED))])),
    ('svr', Pipeline(steps=[('imputer', imputer), ('regressor', SVR())])),
    ('knn', Pipeline(steps=[('imputer', imputer), ('regressor', KNeighborsRegressor())])),
    ('extratrees', Pipeline(steps=[('imputer', imputer), ('regressor', ExtraTreesRegressor(random_state=SEED))])),
    ('ridge', Pipeline(steps=[('imputer', imputer), ('regressor', Ridge(alpha=1.0, random_state=SEED))]))
])

ensemble.fit(X_train, y_train)
first_sample_prediction = ensemble.predict([X_train.iloc[1]])
print(first_sample_prediction)

[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000524 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 5900
[LightGBM] [Info] Number of data points in the train set: 2736, number of used features: 48
[LightGBM] [Info] Start training from score 27.896199
[11.81768039]


In [43]:
import numpy as np
import pandas as pd
from sklearn.impute import SimpleImputer
from sklearn.ensemble import VotingRegressor
from sklearn.linear_model import ElasticNet, Ridge
from sklearn.svm import SVR
from sklearn.neighbors import KNeighborsRegressor
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor, ExtraTreesRegressor
from sklearn.preprocessing import StandardScaler
from lightgbm import LGBMRegressor
from xgboost import XGBRegressor
from catboost import CatBoostRegressor
from sklearn.pipeline import Pipeline

SEED = 42
train = pd.read_csv('/Users/ad53533/Desktop/Applied ML/Project/train.csv')
test = pd.read_csv('/Users/ad53533/Desktop/Applied ML/Project/test.csv')
train = train.dropna(subset=['sii'])
y_train = train['PCIAT-PCIAT_Total']
train = train.drop(columns=[col for col in train.columns if 'PCIAT' in col])
train = train.drop(columns=[col for col in train.columns if 'Season' in col])
train = train.drop(columns=['id'])
X_train = train.drop('sii', axis=1)

imputer = SimpleImputer(strategy='median')
scaler = StandardScaler()

ensemble = VotingRegressor(estimators=[
    ('lgb', Pipeline(steps=[('imputer', imputer), ('scaler', scaler), ('regressor', LGBMRegressor(random_state=SEED))])),
    ('xgb', Pipeline(steps=[('imputer', imputer), ('scaler', scaler), ('regressor', XGBRegressor(random_state=SEED))])),
    ('cat', Pipeline(steps=[('imputer', imputer), ('scaler', scaler), ('regressor', CatBoostRegressor(random_state=SEED, silent=True))])),
    ('rf', Pipeline(steps=[('imputer', imputer), ('scaler', scaler), ('regressor', RandomForestRegressor(random_state=SEED))])),
    ('gb', Pipeline(steps=[('imputer', imputer), ('scaler', scaler), ('regressor', GradientBoostingRegressor(random_state=SEED))])),
    ('elasticnet', Pipeline(steps=[('imputer', imputer), ('scaler', scaler), ('regressor', ElasticNet(random_state=SEED))])),
    ('svr', Pipeline(steps=[('imputer', imputer), ('scaler', scaler), ('regressor', SVR())])),
    ('knn', Pipeline(steps=[('imputer', imputer), ('scaler', scaler), ('regressor', KNeighborsRegressor())])),
    ('extratrees', Pipeline(steps=[('imputer', imputer), ('scaler', scaler), ('regressor', ExtraTreesRegressor(random_state=SEED))])),
    ('ridge', Pipeline(steps=[('imputer', imputer), ('scaler', scaler), ('regressor', Ridge(alpha=1.0, random_state=SEED))]))
])

ensemble.fit(X_train, y_train)
first_sample_prediction = ensemble.predict([X_train.iloc[1]])
print(first_sample_prediction)

[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000521 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 5837
[LightGBM] [Info] Number of data points in the train set: 2736, number of used features: 48
[LightGBM] [Info] Start training from score 27.896199
[10.39443045]


In [44]:
y_train[0]

55.0

In [45]:
ensemble.predict([X_train.iloc[3]])

array([30.05859101])

In [46]:
test = test.drop(columns=[col for col in test.columns if 'PCIAT' in col])
test = test.drop(columns=[col for col in test.columns if 'Season' in col])
test = test.drop(columns=['id'])

predictions = ensemble.predict(test)

In [47]:
predictions

array([35.29778349, 10.39443045, 29.25355105, 30.05859101, 29.99468139,
       30.10793945, 25.45122911, 25.14797596, 43.20253518, 27.48883597,
       32.33671887, 24.48992905, 33.83388519, 34.1026294 , 39.6151773 ,
       39.96327622,  6.08904921, 23.13830905, 28.00432845, 35.72834761])

In [48]:
predictions = np.select([predictions <= 30, predictions <= 49, predictions <= 79], [0, 1, 2], default=3)

In [49]:
predictions

array([1, 0, 0, 1, 0, 1, 0, 0, 1, 0, 1, 0, 1, 1, 1, 1, 0, 0, 0, 1])