In [1]:
import numpy as np
import pandas as pd
import os
from sklearn.base import clone
from sklearn.metrics import cohen_kappa_score
from sklearn.model_selection import StratifiedKFold
from concurrent.futures import ThreadPoolExecutor
from tqdm import tqdm
from scipy.optimize import minimize
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.ensemble import VotingRegressor
from lightgbm import LGBMRegressor
from xgboost import XGBRegressor
from catboost import CatBoostRegressor
from sklearn.linear_model import ElasticNet
from sklearn.svm import SVR
from sklearn.neighbors import KNeighborsRegressor
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor, ExtraTreesRegressor
from sklearn.linear_model import Ridge
import warnings

warnings.filterwarnings("ignore")
pd.options.display.max_columns = None

SEED = 42
n_splits = 5

# Load datasets
train = pd.read_csv('/Users/ad53533/Desktop/Applied ML/Project/train.csv')
test = pd.read_csv('/Users/ad53533/Desktop/Applied ML/Project/test.csv')
sample = pd.read_csv('/Users/ad53533/Desktop/Applied ML/Project/sample_submission.csv')

# Utility function to process parquet files
def process_file(filename, dirname):
    df = pd.read_parquet(os.path.join(dirname, filename, 'part-0.parquet'))
    df.drop('step', axis=1, inplace=True)

    # Calculate descriptive statistics
    stats = df.describe().values.reshape(-1)
    return stats, filename.split('=')[1]

def load_time_series(dirname):
    ids = os.listdir(dirname)
    with ThreadPoolExecutor() as executor:
        results = list(tqdm(executor.map(lambda fname: process_file(fname, dirname), ids), total=len(ids)))
    stats, indexes = zip(*results)
    columns = [f"stat_{i}" for i in range(len(stats[0]))]
    df = pd.DataFrame(stats, columns=columns)
    df['id'] = indexes
    return df

# Load time series data
train_ts = load_time_series("/Users/ad53533/Desktop/Applied ML/Project/series_train.parquet")
test_ts = load_time_series("/Users/ad53533/Desktop/Applied ML/Project/series_test.parquet")

# Merge time series data with train and test datasets
train = pd.merge(train, train_ts, how="left", on="id")
test = pd.merge(test, test_ts, how="left", on="id")
train = train.drop("id", axis=1)
test = test.drop("id", axis=1)

# Imputation for missing values
imputer = SimpleImputer(strategy="median")
train = pd.DataFrame(imputer.fit_transform(train), columns=train.columns)
test = pd.DataFrame(imputer.transform(test), columns=test.columns)

# Train individual regressors for PCIAT01 to PCIAT20
def TrainML_Per_PCIAT(model_class, test_data):
    X = train.drop([f'PCIAT{i:02d}' for i in range(1, 21)] + ['sii'], axis=1)
    y = train[[f'PCIAT{i:02d}' for i in range(1, 21)]]

    pciat_predictions_train = np.zeros_like(y)
    pciat_predictions_test = np.zeros((len(test_data), 20))

    SKF = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=SEED)

    for pciat_idx, pciat_name in enumerate(y.columns):
        print(f"Training for {pciat_name}")

        oof_preds = np.zeros(len(y), dtype=float)
        test_preds = np.zeros((len(test_data), n_splits))

        for fold, (train_idx, val_idx) in enumerate(SKF.split(X, y.iloc[:, pciat_idx])):
            X_train, X_val = X.iloc[train_idx], X.iloc[val_idx]
            y_train, y_val = y.iloc[train_idx, pciat_idx], y.iloc[val_idx, pciat_idx]

            model = clone(model_class)
            model.fit(X_train, y_train)

            oof_preds[val_idx] = model.predict(X_val)
            test_preds[:, fold] = model.predict(test_data)

        pciat_predictions_train[:, pciat_idx] = oof_preds
        pciat_predictions_test[:, pciat_idx] = test_preds.mean(axis=1)

    total_pciat_train = pciat_predictions_train.sum(axis=1)
    total_pciat_test = pciat_predictions_test.sum(axis=1)

    def map_total_to_class(total_pciat):
        return np.where(total_pciat < 31, 0,
                        np.where(total_pciat < 50, 1,
                                 np.where(total_pciat < 80, 2, 3)))

    train_sii_predicted = map_total_to_class(total_pciat_train)
    test_sii_predicted = map_total_to_class(total_pciat_test)

    return test_sii_predicted

# Define features and categorical columns
featuresCols = [col for col in train.columns if col not in ['id', 'sii'] + [f'PCIAT{i:02d}' for i in range(1, 21)]]
categorical_cols = [col for col in featuresCols if train[col].dtype == 'object']

# One-hot encoding for categorical features
train = pd.get_dummies(train, columns=categorical_cols, drop_first=True)
test = pd.get_dummies(test, columns=categorical_cols, drop_first=True)

# Align train and test datasets
train_sii = train['sii']
train, test = train.drop('sii', axis=1).align(test, join="outer", axis=1, fill_value=0)
train['sii'] = train_sii

# Define an ensemble regressor
ensemble = VotingRegressor(estimators=[
    ('lgb', Pipeline(steps=[('imputer', SimpleImputer(strategy="median")), ('regressor', LGBMRegressor(random_state=SEED))])),
    ('xgb', Pipeline(steps=[('imputer', SimpleImputer(strategy="median")), ('regressor', XGBRegressor(random_state=SEED))])),
    ('cat', Pipeline(steps=[('imputer', SimpleImputer(strategy="median")), ('regressor', CatBoostRegressor(random_state=SEED, silent=True))])),
])

# Train the model and predict
predictions = TrainML_Per_PCIAT(ensemble, test)

# Save predictions
sample['sii'] = predictions
sample.to_csv('submission.csv', index=False)


100%|██████████| 996/996 [00:18<00:00, 52.54it/s]
100%|██████████| 2/2 [00:00<00:00, 16.87it/s]


ValueError: Cannot use median strategy with non-numeric data:
could not convert string to float: 'Fall'