In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import StratifiedKFold
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import AdaBoostRegressor
import optuna
import shap
import joblib
from sklearn.metrics import mean_squared_error


In [2]:
# Read the train / test / validation partitions from the Kaggle input directory
train_df = pd.read_csv("/kaggle/input/parkinson/train.csv")
test_df = pd.read_csv("/kaggle/input/parkinson/test.csv")
val_df = pd.read_csv("/kaggle/input/parkinson/val.csv")

# Regression target, and every column excluded from the feature matrix
target_col = "motor_UPDRS"
target_and_related = ["motor_UPDRS", "subject#", "total_UPDRS"]


def _features_and_target(frame):
    """Return ``(features, target)`` for one partition.

    Features are all columns except those in ``target_and_related``;
    the target is the ``target_col`` column.
    """
    features = frame.drop(columns=target_and_related)
    return features, frame[target_col]


X_train, y_train = _features_and_target(train_df)
X_test, y_test = _features_and_target(test_df)
X_val, y_val = _features_and_target(val_df)

# Learn the scaling statistics from the training partition only, then apply
# the same transformation to every partition.
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)
X_val_scaled = scaler.transform(X_val)


In [3]:
# 5-fold cross-validation of a baseline AdaBoost regressor on the training set.
n_splits = 5
skf = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=42)

# StratifiedKFold needs discrete labels, so the continuous target is bucketed
# into (up to) 10 quantile bins. duplicates="drop" merges bins whose edges
# coincide instead of raising ValueError when the target has many tied values;
# when all edges are unique the bins are unchanged.
y_bins = pd.qcut(y_train, q=10, labels=False, duplicates="drop")

rmse_scores = []  # one validation RMSE per fold

# NOTE(review): X_train_scaled was standardized using the whole training set
# before splitting, so each validation fold has contributed to the scaling
# statistics. Presumably harmless for the default tree-based AdaBoost learner,
# but confirm before swapping in a scale-sensitive model.
for fold, (train_idx, val_idx) in enumerate(skf.split(X_train_scaled, y_bins)):
    # X_train_scaled is a NumPy array (positional indexing); y_train is a
    # pandas Series, hence .iloc for the same positional rows.
    X_train_fold, X_val_fold = X_train_scaled[train_idx], X_train_scaled[val_idx]
    y_train_fold, y_val_fold = y_train.iloc[train_idx], y_train.iloc[val_idx]

    model = AdaBoostRegressor(n_estimators=50, learning_rate=0.1, random_state=42)
    model.fit(X_train_fold, y_train_fold)

    y_val_pred = model.predict(X_val_fold)
    rmse = np.sqrt(mean_squared_error(y_val_fold, y_val_pred))
    rmse_scores.append(rmse)

    print(f"Fold {fold+1} RMSE: {rmse:.4f}")

# Summarize performance across folds
mean_rmse = np.mean(rmse_scores)
print(f"\nMean RMSE Across {n_splits} Folds: {mean_rmse:.4f}")


Fold 1 RMSE: 6.1682
Fold 2 RMSE: 6.2329
Fold 3 RMSE: 5.9539
Fold 4 RMSE: 6.1675
Fold 5 RMSE: 6.2082

Mean RMSE Across 5 Folds: 6.1461


In [4]:
def objective(trial):
    """Optuna objective: validation RMSE of an AdaBoost regressor.

    Samples ``n_estimators`` (50-500 in steps of 50) and ``learning_rate``
    (0.01-1.0 on a log scale) from ``trial``, fits the model on the scaled
    training data and returns its RMSE on the scaled validation data.
    """
    # Dict literals evaluate in order, so n_estimators is suggested first
    # and learning_rate second.
    sampled = {
        "n_estimators": trial.suggest_int("n_estimators", 50, 500, step=50),
        "learning_rate": trial.suggest_float("learning_rate", 0.01, 1.0, log=True),
    }

    regressor = AdaBoostRegressor(random_state=42, **sampled)
    regressor.fit(X_train_scaled, y_train)

    holdout_pred = regressor.predict(X_val_scaled)
    return np.sqrt(mean_squared_error(y_val, holdout_pred))

# Seed the sampler so the search proposes the same sequence of trials on every
# Restart & Run All; without it the selected hyperparameters (and every
# downstream metric) change from run to run.
sampler = optuna.samplers.TPESampler(seed=42)
study = optuna.create_study(direction="minimize", sampler=sampler)
study.optimize(objective, n_trials=20)

# Hyperparameters of the trial with the lowest validation RMSE
best_params = study.best_params
print("Best Hyperparameters:", best_params)


[I 2025-02-22 14:36:09,655] A new study created in memory with name: no-name-13256745-50bd-49e4-8cc3-175c6a4d805c
[I 2025-02-22 14:36:15,837] Trial 0 finished with value: 6.304523514374573 and parameters: {'n_estimators': 450, 'learning_rate': 0.07880016163600659}. Best is trial 0 with value: 6.304523514374573.
[I 2025-02-22 14:36:19,083] Trial 1 finished with value: 6.333483438577445 and parameters: {'n_estimators': 150, 'learning_rate': 0.1551579440875556}. Best is trial 0 with value: 6.304523514374573.
[I 2025-02-22 14:36:21,368] Trial 2 finished with value: 6.528720949189925 and parameters: {'n_estimators': 100, 'learning_rate': 0.013075544065591348}. Best is trial 0 with value: 6.304523514374573.
[I 2025-02-22 14:36:22,337] Trial 3 finished with value: 6.180074419020807 and parameters: {'n_estimators': 250, 'learning_rate': 0.6392963042247312}. Best is trial 3 with value: 6.180074419020807.
[I 2025-02-22 14:36:23,525] Trial 4 finished with value: 6.34037380028222 and parameters: {

Best Hyperparameters: {'n_estimators': 200, 'learning_rate': 0.591013328101066}


In [5]:
# Rebuild the regressor from the tuned hyperparameters and fit it on the
# scaled training data.
tuned_kwargs = {key: best_params[key] for key in ("n_estimators", "learning_rate")}
best_model = AdaBoostRegressor(random_state=42, **tuned_kwargs)
best_model.fit(X_train_scaled, y_train)


In [6]:
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
import numpy as np

# Score the tuned model on the held-out test partition
y_test_pred = best_model.predict(X_test_scaled)

# Error and goodness-of-fit metrics on the test predictions
final_rmse = np.sqrt(mean_squared_error(y_test, y_test_pred))
final_mae = mean_absolute_error(y_test, y_test_pred)
final_r2 = r2_score(y_test, y_test_pred)

# Emit the three report lines in a single write
report_lines = [
    f"\nFinal Test RMSE: {final_rmse:.4f}",
    f"Final Test MAE: {final_mae:.4f}",
    f"Final Test R² Score: {final_r2:.4f}",
]
print("\n".join(report_lines))



Final Test RMSE: 6.1353
Final Test MAE: 5.3492
Final Test R² Score: 0.4325


In [7]:
# Persist the fitted model together with the scaler it was trained with, so
# new data can be transformed the same way before prediction.
model_path = "/kaggle/working/best_adaboost_model.pkl"
scaler_path = "/kaggle/working/scaler.pkl"

joblib.dump(best_model, model_path)
joblib.dump(scaler, scaler_path)


['scaler.pkl']