# Imports 

In [63]:
import pandas as pd
import numpy as np
import joblib
import sys
import os
import itertools
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.impute import SimpleImputer
from sklearn.model_selection import train_test_split
from sklearn.feature_selection import r_regression
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler, OneHotEncoder, LabelEncoder
from sklearn.model_selection import GridSearchCV, cross_val_score, KFold, StratifiedKFold
from sklearn.pipeline import Pipeline 
from sklearn.pipeline import make_pipeline
from sklearn.metrics import make_scorer, f1_score, roc_auc_score
from sklearn.base import clone
from typing import Any, Callable, Dict, List, Tuple, Union
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from lightgbm import LGBMClassifier

In [87]:
PROJECT_ROOT = os.path.abspath("..")
if PROJECT_ROOT not in sys.path:
    sys.path.append(PROJECT_ROOT)

In [89]:
from src.nCV import NestedCrossVal
ncv=NestedCrossVal()

## Load the data

In [33]:
def load_data(path):
        if not os.path.isfile(path):
            raise FileNotFoundError(f"The file at {path} was not found.")
        return pd.read_csv(path)

path_to_data="/home/user_stel/Assignment-2/data/breast_cancer.csv"
data_df=load_data(path_to_data)

#print(data_df.head()) #it should display a 512x32 dataframe

## Preprocessing

In [34]:
def preprocess_data(df, columns_to_drop=[]):
    df=df.drop(columns=[col for col in columns_to_drop if col in df.columns])
    num_list=df.select_dtypes(include=[np.number]).columns.tolist()
    cat_list=df.select_dtypes(exclude=[np.number]).columns.tolist()

    for col in cat_list:
        df[col]=LabelEncoder().fit_transform(df[col])

    for col in num_list:
        df[col]=SimpleImputer(missing_values=np.nan, strategy='mean') \
            .fit_transform(df[[col]]).ravel()
    
    return df

data_new_df=preprocess_data(data_df, columns_to_drop=[])

In [35]:
def separate_features_target(df, target, columns_to_remove=None):
    if columns_to_remove is None:
        columns_to_remove=[]
    columns_to_remove=set(columns_to_remove + [target])
    X=df.drop(columns=[col for col in columns_to_remove if col in df.columns])
    y=df[target]
    return X, y

X, y=separate_features_target(data_new_df, target='diagnosis', columns_to_remove=None)
#print(X)
#print(y)

In [36]:
def select_features(X, y, threshold=0.1):
    correlations = pd.Series(r_regression(X, y), index=X.columns)
    selected_features = correlations[correlations.abs() >= threshold].index.tolist()
    print(f"The selected features of {X.shape[1]} were: {len(selected_features)}")
    return selected_features, correlations

selected_features, correlations=select_features(X, y, threshold=0.5)
print(selected_features)

# Creates a new dataset that contains only the selected features 
X_selected=X[selected_features]
#print(X_selected)

selected_feature_names = X_selected.columns.tolist()
target = 'diagnosis'
data_selected_df = data_new_df[selected_feature_names + [target]]
#print(data_selected_df) # the way this new dataframe is built the diagnosis column is last

The selected features of 31 were: 15
['radius_mean', 'perimeter_mean', 'area_mean', 'compactness_mean', 'concavity_mean', 'concave points_mean', 'radius_se', 'perimeter_se', 'area_se', 'radius_worst', 'perimeter_worst', 'area_worst', 'compactness_worst', 'concavity_worst', 'concave points_worst']


In [37]:
models = {
    'LogisticRegression-elasticnet': LogisticRegression(
        penalty='elasticnet', solver='saga', random_state=0, max_iter=10000
    ),
    'GaussianNB': GaussianNB(),
    'LDA': LinearDiscriminantAnalysis(),
    'SVC': SVC(random_state=0),
    'RandomForest': RandomForestClassifier(random_state=0),
    'LightGBM': LGBMClassifier(random_state=0)
}

param_grid = {
    'LogisticRegression-elasticnet': {
        'C': [0.01, 0.1, 1, 10],
        'l1_ratio': [0.0, 0.25, 0.5, 0.75, 1.0]
    },
    'GaussianNB': {
        'var_smoothing': np.logspace(-9, -1, 9)
    },
    'LDA': [
        {'solver': ['svd']},
        {'solver': ['lsqr', 'eigen'], 'shrinkage': [None, 'auto']}
    ],
    'SVC': [
        {'kernel': ['linear'], 'C': [0.1, 1, 10]},
        {'kernel': ['rbf'], 'C': [0.1, 1, 10], 'gamma': ['scale', 'auto', 0.01, 0.1]}
    ],
    'RandomForest': {
        'n_estimators': [100, 200],
        'max_depth': [None, 10, 20],
        'min_samples_split': [2, 5],
        'min_samples_leaf': [1, 2]
    },
    'LightGBM': {
        'n_estimators': [100, 200],
        'num_leaves': [31, 50, 100],
        'learning_rate': [0.01, 0.1, 0.2],
        'max_depth': [-1, 10, 20]
    }
}

In [95]:
# Slightly modified generate_param_combinations function from previous assignment because for this project the param_grid conains lists of dictionaries 
# for two separet models 
def generate_param_combinations(param_grid):
    model_combinations = {}
    for model, params in param_grid.items():
        
        if isinstance(params, list):
            param_combinations = []
            for param_set in params:
                param_combinations.extend(
                    [dict(zip(param_set.keys(), values)) for values in itertools.product(*param_set.values())]
                )
            model_combinations[model] = param_combinations
        else:
            model_combinations[model] = [
                dict(zip(params.keys(), values)) for values in itertools.product(*params.values())
            ]
    return model_combinations

param_combinations=generate_param_combinations(param_grid)
model_combo_df_summary = pd.DataFrame.from_dict(param_combinations, orient='index')
#print(model_combo_df_summary)

In [97]:
model_combinations=ncv.generate_param_combinations(param_grid=param_grid)
model_combo_df_summary = pd.DataFrame.from_dict(model_combinations, orient='index')
#print(model_combo_df_summary)

AttributeError: 'list' object has no attribute 'values'

In [39]:
print(data_selected_df)

     radius_mean  perimeter_mean    area_mean  compactness_mean  \
0          14.68           94.74   684.500000           0.07200   
1          11.50           73.28   407.400000           0.05991   
2          15.85          103.70   782.700000           0.10020   
3          18.82          123.70  1110.000000           0.13890   
4          12.95           83.14   513.700000           0.07943   
..           ...             ...          ...               ...   
507        13.00           82.61   520.200000           0.05073   
508        14.20           92.41   657.616929           0.11080   
509        13.86           90.96   578.900000           0.15170   
510        17.30          113.00   928.200000           0.10410   
511        23.27          152.10  1686.000000           0.11450   

     concavity_mean  concave points_mean  radius_se  perimeter_se  area_se  \
0          0.073950             0.052590     0.4727         3.195    45.40   
1          0.026380             0.02069

In [40]:
X, y=separate_features_target(data_selected_df, target='diagnosis', columns_to_remove=None)
#print(X)

X_train, x_test, y_train, y_test = train_test_split(
            X, y, test_size=0.2, random_state=42)

print(X_train)

     radius_mean  perimeter_mean  area_mean  compactness_mean  concavity_mean  \
320       12.880           84.45      493.1           0.16610         0.04825   
329       13.610           87.76      572.6           0.07862         0.05285   
173       11.130           71.49      378.4           0.08194         0.04824   
272       16.600          108.30      858.1           0.10230         0.09251   
491       14.580           94.29      658.8           0.08918         0.08222   
..           ...             ...        ...               ...             ...   
106       15.500          102.90      803.1           0.15710         0.15220   
270       14.190           92.87      610.7           0.13060         0.11150   
348       11.040           70.67      372.7           0.07079         0.03546   
435        9.397           59.75      268.8           0.06053         0.03735   
102       11.840           75.51      428.0           0.06900         0.02669   

     concave points_mean  r

### Nested Cross Validation

In [85]:
def model_tuning(model_key, X_train, y_train, inner_cv=3, random_state=42, n_jobs=1):
    splitter = StratifiedKFold(n_splits=inner_cv, shuffle=True, random_state=random_state)
    best_rmse = float('inf')
    best_model = None
    best_params = None

    grid = param_grid[model_key]

    param_combinations = generate_param_combinations({model_key: param_grid[model_key]})[model_key]

    for params in param_combinations:
        proto = models[model_key]
        proto_params = proto.get_params()
        proto_params.update(params)
        estimator = proto.__class__(**proto_params)

        pipeline = make_pipeline(StandardScaler(), estimator)
        scores = cross_val_score(
            pipeline, X_train, y_train,
            scoring='neg_root_mean_squared_error', cv=splitter, n_jobs=n_jobs
        )
        rmse = np.sqrt(-scores.mean())
        print(f"[{model_key}] Tested {params} -> RMSE {rmse:.4f}")
        if rmse < best_rmse:
            best_rmse = rmse
            best_model = pipeline
            best_params = params
            print(f"[{model_key}] New best RMSE {best_rmse:.4f}, params {best_params}")

    return best_model, best_params

# Inner CV function: extracts data and tunes model
def inner_loop(df, target, model_key, columns_to_remove=None, inner_cv=3):
    X, y = separate_features_target(df, target, columns_to_remove)
    return model_tuning(model_key, X.values, y.values, inner_cv)

# Outer CV function: nests inner_loop without leakage
def outer_loop(df, target, model_key, outer_cv=5, random_state=42, columns_to_remove=None, inner_cv=3):
    # Separate full features and target once
    X_full, y_full = separate_features_target(df, target, columns_to_remove)
    splitter = StratifiedKFold(n_splits=outer_cv, shuffle=True, random_state=random_state)
    scores = []
    params_per_fold = []

    for train_idx, test_idx in splitter.split(X_full.values, y_full.values):
        # Partition DataFrame
        df_tr = df.iloc[train_idx].reset_index(drop=True)
        df_te = df.iloc[test_idx].reset_index(drop=True)

        # Inner tuning on training partition
        best_pipe, best_params = inner_loop(df_tr, target, model_key, columns_to_remove, inner_cv)
        params_per_fold.append(best_params)

        # Extract train arrays for fitting
        X_tr, y_tr = separate_features_target(df_tr, target, columns_to_remove)
        X_tr_arr, y_tr_arr = X_tr.values, y_tr.values

        # Extract test arrays for evaluation
        X_te, y_te = separate_features_target(df_te, target, columns_to_remove)
        X_te_arr, y_te_arr = X_te.values, y_te.values

        # Fit pipeline on training data arrays and score on test arrays
        best_pipe.fit(X_tr_arr, y_tr_arr)
        score = best_pipe.score(X_te_arr, y_te_arr)
        scores.append(score)

    return scores, params_per_fold


### Test SVC model

In [86]:
scores, params_per_fold = outer_loop(
     data_selected_df, 'diagnosis', 'SVC',
     outer_cv=5, random_state=42,
     columns_to_remove=None, inner_cv=3
    )
print('Outer CV scores:', scores)
print('Best params per fold:', params_per_fold)

[SVC] Tested {'kernel': 'linear', 'C': 0.1} -> RMSE 0.5114
[SVC] New best RMSE 0.5114, params {'kernel': 'linear', 'C': 0.1}
[SVC] Tested {'kernel': 'linear', 'C': 1} -> RMSE 0.5202
[SVC] Tested {'kernel': 'linear', 'C': 10} -> RMSE 0.5324
[SVC] Tested {'kernel': 'rbf', 'C': 0.1, 'gamma': 'scale'} -> RMSE 0.5227
[SVC] Tested {'kernel': 'rbf', 'C': 0.1, 'gamma': 'auto'} -> RMSE 0.5227
[SVC] Tested {'kernel': 'rbf', 'C': 0.1, 'gamma': 0.01} -> RMSE 0.5547
[SVC] Tested {'kernel': 'rbf', 'C': 0.1, 'gamma': 0.1} -> RMSE 0.5190
[SVC] Tested {'kernel': 'rbf', 'C': 1, 'gamma': 'scale'} -> RMSE 0.5066
[SVC] New best RMSE 0.5066, params {'kernel': 'rbf', 'C': 1, 'gamma': 'scale'}
[SVC] Tested {'kernel': 'rbf', 'C': 1, 'gamma': 'auto'} -> RMSE 0.5066
[SVC] Tested {'kernel': 'rbf', 'C': 1, 'gamma': 0.01} -> RMSE 0.5237
[SVC] Tested {'kernel': 'rbf', 'C': 1, 'gamma': 0.1} -> RMSE 0.5157
[SVC] Tested {'kernel': 'rbf', 'C': 10, 'gamma': 'scale'} -> RMSE 0.5159
[SVC] Tested {'kernel': 'rbf', 'C': 10, 

### Test Logistic Regression using Elastic Net model

In [71]:
scores, params_per_fold = outer_loop(
     data_selected_df, 'diagnosis', 'LogisticRegression-elasticnet',
     outer_cv=5, random_state=42,
     columns_to_remove=None, inner_cv=3
    )
print('Outer CV scores:', scores)
print('Best params per fold:', params_per_fold)



[LogisticRegression-elasticnet] Tested {'C': 0.01, 'l1_ratio': 0.0} -> RMSE 0.5365
[LogisticRegression-elasticnet] New best RMSE 0.5365, params {'C': 0.01, 'l1_ratio': 0.0}
[LogisticRegression-elasticnet] Tested {'C': 0.01, 'l1_ratio': 0.25} -> RMSE 0.5365
[LogisticRegression-elasticnet] Tested {'C': 0.01, 'l1_ratio': 0.5} -> RMSE 0.5365
[LogisticRegression-elasticnet] Tested {'C': 0.01, 'l1_ratio': 0.75} -> RMSE 0.5365
[LogisticRegression-elasticnet] Tested {'C': 0.01, 'l1_ratio': 1.0} -> RMSE 0.5365
[LogisticRegression-elasticnet] Tested {'C': 0.1, 'l1_ratio': 0.0} -> RMSE 0.5243
[LogisticRegression-elasticnet] New best RMSE 0.5243, params {'C': 0.1, 'l1_ratio': 0.0}
[LogisticRegression-elasticnet] Tested {'C': 0.1, 'l1_ratio': 0.25} -> RMSE 0.5243
[LogisticRegression-elasticnet] Tested {'C': 0.1, 'l1_ratio': 0.5} -> RMSE 0.5243
[LogisticRegression-elasticnet] Tested {'C': 0.1, 'l1_ratio': 0.75} -> RMSE 0.5243
[LogisticRegression-elasticnet] Tested {'C': 0.1, 'l1_ratio': 1.0} -> RMSE



[LogisticRegression-elasticnet] Tested {'C': 0.01, 'l1_ratio': 0.0} -> RMSE 0.5190
[LogisticRegression-elasticnet] New best RMSE 0.5190, params {'C': 0.01, 'l1_ratio': 0.0}
[LogisticRegression-elasticnet] Tested {'C': 0.01, 'l1_ratio': 0.25} -> RMSE 0.5190
[LogisticRegression-elasticnet] Tested {'C': 0.01, 'l1_ratio': 0.5} -> RMSE 0.5190
[LogisticRegression-elasticnet] Tested {'C': 0.01, 'l1_ratio': 0.75} -> RMSE 0.5190
[LogisticRegression-elasticnet] Tested {'C': 0.01, 'l1_ratio': 1.0} -> RMSE 0.5190
[LogisticRegression-elasticnet] Tested {'C': 0.1, 'l1_ratio': 0.0} -> RMSE 0.5014
[LogisticRegression-elasticnet] New best RMSE 0.5014, params {'C': 0.1, 'l1_ratio': 0.0}
[LogisticRegression-elasticnet] Tested {'C': 0.1, 'l1_ratio': 0.25} -> RMSE 0.5014
[LogisticRegression-elasticnet] Tested {'C': 0.1, 'l1_ratio': 0.5} -> RMSE 0.5014
[LogisticRegression-elasticnet] Tested {'C': 0.1, 'l1_ratio': 0.75} -> RMSE 0.5014
[LogisticRegression-elasticnet] Tested {'C': 0.1, 'l1_ratio': 1.0} -> RMSE



[LogisticRegression-elasticnet] Tested {'C': 10, 'l1_ratio': 0.0} -> RMSE 0.4823
[LogisticRegression-elasticnet] New best RMSE 0.4823, params {'C': 10, 'l1_ratio': 0.0}
[LogisticRegression-elasticnet] Tested {'C': 10, 'l1_ratio': 0.25} -> RMSE 0.4823
[LogisticRegression-elasticnet] Tested {'C': 10, 'l1_ratio': 0.5} -> RMSE 0.4823
[LogisticRegression-elasticnet] Tested {'C': 10, 'l1_ratio': 0.75} -> RMSE 0.4823
[LogisticRegression-elasticnet] Tested {'C': 10, 'l1_ratio': 1.0} -> RMSE 0.4823
[LogisticRegression-elasticnet] Tested {'C': 0.01, 'l1_ratio': 0.0} -> RMSE 0.5194
[LogisticRegression-elasticnet] New best RMSE 0.5194, params {'C': 0.01, 'l1_ratio': 0.0}
[LogisticRegression-elasticnet] Tested {'C': 0.01, 'l1_ratio': 0.25} -> RMSE 0.5194
[LogisticRegression-elasticnet] Tested {'C': 0.01, 'l1_ratio': 0.5} -> RMSE 0.5194
[LogisticRegression-elasticnet] Tested {'C': 0.01, 'l1_ratio': 0.75} -> RMSE 0.5194
[LogisticRegression-elasticnet] Tested {'C': 0.01, 'l1_ratio': 1.0} -> RMSE 0.519



[LogisticRegression-elasticnet] Tested {'C': 1, 'l1_ratio': 0.25} -> RMSE 0.5119
[LogisticRegression-elasticnet] Tested {'C': 1, 'l1_ratio': 0.5} -> RMSE 0.5119
[LogisticRegression-elasticnet] Tested {'C': 1, 'l1_ratio': 0.75} -> RMSE 0.5119
[LogisticRegression-elasticnet] Tested {'C': 1, 'l1_ratio': 1.0} -> RMSE 0.5119
[LogisticRegression-elasticnet] Tested {'C': 10, 'l1_ratio': 0.0} -> RMSE 0.5003
[LogisticRegression-elasticnet] New best RMSE 0.5003, params {'C': 10, 'l1_ratio': 0.0}
[LogisticRegression-elasticnet] Tested {'C': 10, 'l1_ratio': 0.25} -> RMSE 0.5003
[LogisticRegression-elasticnet] Tested {'C': 10, 'l1_ratio': 0.5} -> RMSE 0.5003
[LogisticRegression-elasticnet] Tested {'C': 10, 'l1_ratio': 0.75} -> RMSE 0.5003
[LogisticRegression-elasticnet] Tested {'C': 10, 'l1_ratio': 1.0} -> RMSE 0.5003
[LogisticRegression-elasticnet] Tested {'C': 0.01, 'l1_ratio': 0.0} -> RMSE 0.5218
[LogisticRegression-elasticnet] New best RMSE 0.5218, params {'C': 0.01, 'l1_ratio': 0.0}
[LogisticR



[LogisticRegression-elasticnet] Tested {'C': 0.1, 'l1_ratio': 0.25} -> RMSE 0.5058
[LogisticRegression-elasticnet] Tested {'C': 0.1, 'l1_ratio': 0.5} -> RMSE 0.5058
[LogisticRegression-elasticnet] Tested {'C': 0.1, 'l1_ratio': 0.75} -> RMSE 0.5058
[LogisticRegression-elasticnet] Tested {'C': 0.1, 'l1_ratio': 1.0} -> RMSE 0.5058
[LogisticRegression-elasticnet] Tested {'C': 1, 'l1_ratio': 0.0} -> RMSE 0.4843
[LogisticRegression-elasticnet] New best RMSE 0.4843, params {'C': 1, 'l1_ratio': 0.0}
[LogisticRegression-elasticnet] Tested {'C': 1, 'l1_ratio': 0.25} -> RMSE 0.4843
[LogisticRegression-elasticnet] Tested {'C': 1, 'l1_ratio': 0.5} -> RMSE 0.4843
[LogisticRegression-elasticnet] Tested {'C': 1, 'l1_ratio': 0.75} -> RMSE 0.4843
[LogisticRegression-elasticnet] Tested {'C': 1, 'l1_ratio': 1.0} -> RMSE 0.4843
[LogisticRegression-elasticnet] Tested {'C': 10, 'l1_ratio': 0.0} -> RMSE 0.4893
[LogisticRegression-elasticnet] Tested {'C': 10, 'l1_ratio': 0.25} -> RMSE 0.4893
[LogisticRegressio



[LogisticRegression-elasticnet] Tested {'C': 0.01, 'l1_ratio': 0.0} -> RMSE 0.4999
[LogisticRegression-elasticnet] New best RMSE 0.4999, params {'C': 0.01, 'l1_ratio': 0.0}
[LogisticRegression-elasticnet] Tested {'C': 0.01, 'l1_ratio': 0.25} -> RMSE 0.4999
[LogisticRegression-elasticnet] Tested {'C': 0.01, 'l1_ratio': 0.5} -> RMSE 0.4999
[LogisticRegression-elasticnet] Tested {'C': 0.01, 'l1_ratio': 0.75} -> RMSE 0.4999
[LogisticRegression-elasticnet] Tested {'C': 0.01, 'l1_ratio': 1.0} -> RMSE 0.4999
[LogisticRegression-elasticnet] Tested {'C': 0.1, 'l1_ratio': 0.0} -> RMSE 0.4915
[LogisticRegression-elasticnet] New best RMSE 0.4915, params {'C': 0.1, 'l1_ratio': 0.0}
[LogisticRegression-elasticnet] Tested {'C': 0.1, 'l1_ratio': 0.25} -> RMSE 0.4915
[LogisticRegression-elasticnet] Tested {'C': 0.1, 'l1_ratio': 0.5} -> RMSE 0.4915
[LogisticRegression-elasticnet] Tested {'C': 0.1, 'l1_ratio': 0.75} -> RMSE 0.4915
[LogisticRegression-elasticnet] Tested {'C': 0.1, 'l1_ratio': 1.0} -> RMSE



### Test Gaussian Naive Bayes model

In [72]:
scores, params_per_fold = outer_loop(
     data_selected_df, 'diagnosis', 'GaussianNB',
     outer_cv=5, random_state=42,
     columns_to_remove=None, inner_cv=3
    )
print('Outer CV scores:', scores)
print('Best params per fold:', params_per_fold)

[GaussianNB] Tested {'var_smoothing': np.float64(1e-09)} -> RMSE 0.5286
[GaussianNB] New best RMSE 0.5286, params {'var_smoothing': np.float64(1e-09)}
[GaussianNB] Tested {'var_smoothing': np.float64(1e-08)} -> RMSE 0.5286
[GaussianNB] Tested {'var_smoothing': np.float64(1e-07)} -> RMSE 0.5286
[GaussianNB] Tested {'var_smoothing': np.float64(1e-06)} -> RMSE 0.5286
[GaussianNB] Tested {'var_smoothing': np.float64(1e-05)} -> RMSE 0.5286
[GaussianNB] Tested {'var_smoothing': np.float64(0.0001)} -> RMSE 0.5286
[GaussianNB] Tested {'var_smoothing': np.float64(0.001)} -> RMSE 0.5286
[GaussianNB] Tested {'var_smoothing': np.float64(0.01)} -> RMSE 0.5324
[GaussianNB] Tested {'var_smoothing': np.float64(0.1)} -> RMSE 0.5442
[GaussianNB] Tested {'var_smoothing': np.float64(1e-09)} -> RMSE 0.4971
[GaussianNB] New best RMSE 0.4971, params {'var_smoothing': np.float64(1e-09)}
[GaussianNB] Tested {'var_smoothing': np.float64(1e-08)} -> RMSE 0.4971
[GaussianNB] Tested {'var_smoothing': np.float64(1e-

### Test Linear Discriminant Analysis model

In [73]:
scores, params_per_fold = outer_loop(
     data_selected_df, 'diagnosis', 'LDA',
     outer_cv=5, random_state=42,
     columns_to_remove=None, inner_cv=3
    )
print('Outer CV scores:', scores)
print('Best params per fold:', params_per_fold)

[LDA] Tested {'solver': 'svd'} -> RMSE 0.5405
[LDA] New best RMSE 0.5405, params {'solver': 'svd'}
[LDA] Tested {'solver': 'lsqr', 'shrinkage': None} -> RMSE 0.5405
[LDA] Tested {'solver': 'lsqr', 'shrinkage': 'auto'} -> RMSE 0.5157
[LDA] New best RMSE 0.5157, params {'solver': 'lsqr', 'shrinkage': 'auto'}
[LDA] Tested {'solver': 'eigen', 'shrinkage': None} -> RMSE 0.5405
[LDA] Tested {'solver': 'eigen', 'shrinkage': 'auto'} -> RMSE 0.5157
[LDA] Tested {'solver': 'svd'} -> RMSE 0.5155
[LDA] New best RMSE 0.5155, params {'solver': 'svd'}
[LDA] Tested {'solver': 'lsqr', 'shrinkage': None} -> RMSE 0.5155
[LDA] Tested {'solver': 'lsqr', 'shrinkage': 'auto'} -> RMSE 0.5109
[LDA] New best RMSE 0.5109, params {'solver': 'lsqr', 'shrinkage': 'auto'}
[LDA] Tested {'solver': 'eigen', 'shrinkage': None} -> RMSE 0.5155
[LDA] Tested {'solver': 'eigen', 'shrinkage': 'auto'} -> RMSE 0.5109
[LDA] Tested {'solver': 'svd'} -> RMSE 0.5299
[LDA] New best RMSE 0.5299, params {'solver': 'svd'}
[LDA] Tested 

### Test Random Forests model

In [74]:
scores, params_per_fold = outer_loop(
     data_selected_df, 'diagnosis', 'RandomForest',
     outer_cv=5, random_state=42,
     columns_to_remove=None, inner_cv=3
    )
print('Outer CV scores:', scores)
print('Best params per fold:', params_per_fold)

[RandomForest] Tested {'n_estimators': 100, 'max_depth': None, 'min_samples_split': 2, 'min_samples_leaf': 1} -> RMSE 0.5280
[RandomForest] New best RMSE 0.5280, params {'n_estimators': 100, 'max_depth': None, 'min_samples_split': 2, 'min_samples_leaf': 1}
[RandomForest] Tested {'n_estimators': 100, 'max_depth': None, 'min_samples_split': 2, 'min_samples_leaf': 2} -> RMSE 0.5365
[RandomForest] Tested {'n_estimators': 100, 'max_depth': None, 'min_samples_split': 5, 'min_samples_leaf': 1} -> RMSE 0.5236
[RandomForest] New best RMSE 0.5236, params {'n_estimators': 100, 'max_depth': None, 'min_samples_split': 5, 'min_samples_leaf': 1}
[RandomForest] Tested {'n_estimators': 100, 'max_depth': None, 'min_samples_split': 5, 'min_samples_leaf': 2} -> RMSE 0.5286
[RandomForest] Tested {'n_estimators': 100, 'max_depth': 10, 'min_samples_split': 2, 'min_samples_leaf': 1} -> RMSE 0.5369
[RandomForest] Tested {'n_estimators': 100, 'max_depth': 10, 'min_samples_split': 2, 'min_samples_leaf': 2} -> RM

### Test LightGBM model

In [None]:
scores, params_per_fold = outer_loop(
     data_selected_df, 'diagnosis', 'LightGBM',
     outer_cv=5, random_state=42,
     columns_to_remove=None, inner_cv=3
    )
print('Outer CV scores:', scores)
print('Best params per fold:', params_per_fold)