## MLProject

Group 1

In [7]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import cohen_kappa_score
from scipy.optimize import minimize
from lightgbm import LGBMRegressor
from xgboost import XGBRegressor
from sklearn.ensemble import VotingRegressor
from sklearn.impute import KNNImputer
from sklearn.preprocessing import StandardScaler
import torch
import torch.nn as nn
import torch.optim as optim
from concurrent.futures import ThreadPoolExecutor
from tqdm import tqdm
import warnings
import os
warnings.filterwarnings('ignore')

In [6]:
# Set random seed for reproducibility
SEED = 42
N_SPLITS = 5

In [8]:
def process_file(filename, dirname):
    df = pd.read_parquet(os.path.join(dirname, filename, 'part-0.parquet'))
    df.drop('step', axis=1, inplace=True)
    return df.describe().values.reshape(-1), filename.split('=')[1]

In [9]:
def load_time_series(dirname) -> pd.DataFrame:
    ids = os.listdir(dirname)
    
    with ThreadPoolExecutor() as executor:
        results = list(tqdm(executor.map(lambda fname: process_file(fname, dirname), ids), total=len(ids)))
    
    stats, indexes = zip(*results)
    
    df = pd.DataFrame(stats, columns=[f"stat_{i}" for i in range(len(stats[0]))])
    df['id'] = indexes
    return df

In [10]:
def feature_engineering(df):
    season_cols = [col for col in df.columns if 'Season' in col]
    df = df.drop(season_cols, axis=1) 
    df['BMI_Age'] = df['Physical-BMI'] * df['Basic_Demos-Age']
    df['Internet_Hours_Age'] = df['PreInt_EduHx-computerinternet_hoursday'] * df['Basic_Demos-Age']
    df['BMI_Internet_Hours'] = df['Physical-BMI'] * df['PreInt_EduHx-computerinternet_hoursday']
    df['BFP_BMI'] = df['BIA-BIA_Fat'] / df['BIA-BIA_BMI']
    df['FFMI_BFP'] = df['BIA-BIA_FFMI'] / df['BIA-BIA_Fat']
    df['FMI_BFP'] = df['BIA-BIA_FMI'] / df['BIA-BIA_Fat']
    df['LST_TBW'] = df['BIA-BIA_LST'] / df['BIA-BIA_TBW']
    df['BFP_BMR'] = df['BIA-BIA_Fat'] * df['BIA-BIA_BMR']
    df['BFP_DEE'] = df['BIA-BIA_Fat'] * df['BIA-BIA_DEE']
    df['BMR_Weight'] = df['BIA-BIA_BMR'] / df['Physical-Weight']
    df['DEE_Weight'] = df['BIA-BIA_DEE'] / df['Physical-Weight']
    df['SMM_Height'] = df['BIA-BIA_SMM'] / df['Physical-Height']
    df['Muscle_to_Fat'] = df['BIA-BIA_SMM'] / df['BIA-BIA_FMI']
    df['Hydration_Status'] = df['BIA-BIA_TBW'] / df['Physical-Weight']
    df['ICW_TBW'] = df['BIA-BIA_ICW'] / df['BIA-BIA_TBW']
    
    return df

In [14]:
# Load train and test datasets
train_data = pd.read_csv('./data/train.csv')
test_data = pd.read_csv('./data/test.csv')

# Load time series data
train_time_series = load_time_series("./data/series_train.parquet")
test_time_series = load_time_series("./data/series_test.parquet")

# Merge time series data with main datasets
train_data = pd.merge(train_data, train_time_series, how="left", on='id')
test_data = pd.merge(test_data, test_time_series, how="left", on='id')

# Impute missing values
imputer = KNNImputer(n_neighbors=5)
numeric_cols = train_data.select_dtypes(include=['float64', 'int64']).columns
train_data[numeric_cols] = imputer.fit_transform(train_data[numeric_cols])

# Feature engineering
train_data = feature_engineering(train_data)
test_data = feature_engineering(test_data)

100%|██████████| 996/996 [00:23<00:00, 42.56it/s]
100%|██████████| 2/2 [00:00<00:00, 26.50it/s]


In [16]:
# Define AutoEncoder for dimensionality reduction
class AutoEncoder(nn.Module):
    def __init__(self, input_dim, encoding_dim):
        super(AutoEncoder, self).__init__()
        self.encoder = nn.Sequential(
            nn.Linear(input_dim, encoding_dim*3),
            nn.ReLU(),
            nn.Linear(encoding_dim*3, encoding_dim*2),
            nn.ReLU(),
            nn.Linear(encoding_dim*2, encoding_dim),
            nn.ReLU()
        )
        self.decoder = nn.Sequential(
            nn.Linear(encoding_dim, input_dim*2),
            nn.ReLU(),
            nn.Linear(input_dim*2, input_dim*3),
            nn.ReLU(),
            nn.Linear(input_dim*3, input_dim),
            nn.Sigmoid()
        )
    
    def forward(self, x):
        encoded = self.encoder(x)
        decoded = self.decoder(encoded)
        return decoded

In [17]:
# Evaluation metric: Quadratic Weighted Kappa
def quadratic_weighted_kappa(y_true, y_pred):
    return cohen_kappa_score(y_true, y_pred, weights='quadratic')

# Threshold rounding function
def threshold_rounder(predictions, thresholds):
    return np.where(predictions < thresholds[0], 0,
                    np.where(predictions < thresholds[1], 1,
                             np.where(predictions < thresholds[2], 2, 3)))

# Evaluate predictions with thresholds
def evaluate_predictions(thresholds, y_true, predictions):
    rounded_predictions = threshold_rounder(predictions, thresholds)
    return -quadratic_weighted_kappa(y_true, rounded_predictions)

In [18]:
# Train and evaluate model
def train_and_evaluate_model(model_class, X_train, y_train, X_test):
    skf = StratifiedKFold(n_splits=N_SPLITS, shuffle=True, random_state=SEED)
    
    oof_predictions = np.zeros(len(y_train))
    test_predictions = np.zeros(len(X_test))
    
    for fold, (train_idx, val_idx) in enumerate(skf.split(X_train, y_train), 1):
        print(f"Training fold {fold}")
        
        X_train_fold, X_val_fold = X_train.iloc[train_idx], X_train.iloc[val_idx]
        y_train_fold, y_val_fold = y_train.iloc[train_idx], y_train.iloc[val_idx]
        
        model = model_class(random_state=SEED)
        model.fit(X_train_fold, y_train_fold)
        
        oof_predictions[val_idx] = model.predict(X_val_fold)
        test_predictions += model.predict(X_test) / N_SPLITS
    
    # Optimize thresholds
    initial_thresholds = [0.5, 1.5, 2.5]
    optimized_thresholds = minimize(evaluate_predictions, initial_thresholds, 
                                    args=(y_train, oof_predictions), 
                                    method='nelder-mead').x
    
    # Calculate final score
    final_score = -evaluate_predictions(optimized_thresholds, y_train, oof_predictions)
    print(f"Final Quadratic Weighted Kappa score: {final_score}")
    
    return test_predictions, optimized_thresholds

In [26]:
def convert_to_classes(y):
    return np.digitize(y, bins=[0.5, 1.5, 2.5]) - 1

In [27]:
features = [col for col in train_data.columns if col not in ['id', 'sii']]
X_train = train_data[features]
y_train = convert_to_classes(train_data['sii'])
available_features = [col for col in features if col in test_data.columns]
X_test = test_data[available_features]

# Train and evaluate models
models = [LGBMRegressor, XGBRegressor]
model_predictions = []

for model_class in models:
    predictions, thresholds = train_and_evaluate_model(model_class, X_train, y_train, X_test)
    model_predictions.append(predictions)

# Ensemble predictions
final_predictions = np.mean(model_predictions, axis=0)

print(final_predictions)

Training fold 1


AttributeError: 'numpy.ndarray' object has no attribute 'iloc'