In [31]:
import sklearn
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import scipy
import seaborn as sns

In [32]:
# load data
df_X = pd.read_csv('../data/X.csv', index_col=0)
df_y = pd.read_csv('../data/y.csv', index_col=0)

# keep selected columns
selected_features = [2056, 2058, 1559, 2083, 1065, 1581, 1583, 1087, 67, 74, 2130, 2662, 2154, 623, 1654, 1656, 1657, 1663, 2182, 2183, 1160, 2184, 2185, 1677, 1678, 1679, 2196, 2717, 2206, 671, 2207, 2208, 2209, 2723, 2212, 2213, 2211, 2725, 2214, 2724, 2218, 2210, 2732, 2221, 174, 2730, 2219, 2220, 2733, 2223, 2224, 2225, 2742, 2736, 2740, 189, 192, 2776, 1243, 734, 2275, 229, 230, 2789, 744, 2791, 742, 743, 745, 761, 1788, 765, 1789, 2817, 2818, 771, 261, 1289, 2828, 1293, 1302, 791, 1303, 1306, 800, 801, 803, 805, 669, 808, 814, 817, 818, 673, 674, 836, 838, 841, 842, 2379, 676, 2382, 849, 851, 853, 854, 855, 2205, 857, 679, 858, 863, 1889, 1890, 1891, 1892, 1894, 1895, 1896, 1897, 1898, 1899, 1902, 1905, 1907, 1910, 888, 385, 386, 2729, 2486, 2734, 2735, 455, 1994, 462, 2017, 487, 489, 2026, 2547, 2549]
new_df_X = df_X.iloc[:, selected_features]

X = new_df_X.to_numpy()
y = df_y.values.ravel()  # 0 is HER2+, 1 is HR+, 2 is Triple Negative

print(len(selected_features))

142


In [55]:
from sklearn.model_selection import RandomizedSearchCV, GridSearchCV, cross_val_score, KFold
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import make_scorer, f1_score
from scipy.stats import randint as sp_randint
import time
import numpy as np

def nested_cv_rfc(x_data, y_targets):
    # Random Forest classifier to be optimized
    rfc = RandomForestClassifier(random_state=2)

    # Define the parameter space for RandomizedSearchCV
    param_dist = {
        "n_estimators": sp_randint(10, 750),
        "max_leaf_nodes": sp_randint(20, 1000),
        "max_depth": sp_randint(20, 500),
        "min_samples_split": sp_randint(2, 250),
        "max_features": sp_randint(3, 100)
    }

    # Number of parameter settings that are sampled
    n_iter_search = 50
    inner_cv = KFold(n_splits=4, shuffle=True, random_state=1)
    outer_cv = KFold(n_splits=5, shuffle=True, random_state=2)

    # Execute RandomizedSearchCV
    random_search = RandomizedSearchCV(rfc, param_distributions=param_dist,
                                       n_iter=n_iter_search, scoring=make_scorer(f1_score, average='micro'), cv=inner_cv, verbose=1)

    # Using outer CV for assessing the performance
    outer_scores = cross_val_score(random_search, x_data, y_targets, cv=outer_cv, scoring=make_scorer(f1_score, average='micro'))

    print("Nested CV score (mean):", np.mean(outer_scores))

    # Refitting on the entire dataset (can be omitted if only interested in performance estimate)
    random_search.fit(x_data, y_targets)  # This line will be executed with the best parameters found in the inner CV
    best_params_random = random_search.best_params_
    
    # print("Best parameters from Random Search:", best_params_random)

    # # Defining a narrower search space for GridSearchCV around the best parameters found
    # param_grid = {
    #     "n_estimators": [best_params_random['n_estimators'] - 3, best_params_random['n_estimators'], best_params_random['n_estimators'] + 3],
    #     "max_leaf_nodes": [best_params_random['max_leaf_nodes'] - 5, best_params_random['max_leaf_nodes'], best_params_random['max_leaf_nodes'] + 5],
    #     "max_depth": [best_params_random['max_depth'] - 5, best_params_random['max_depth'], best_params_random['max_depth'] + 5],
    #     "min_samples_split": [best_params_random['min_samples_split'] - 3, best_params_random['min_samples_split'], best_params_random['min_samples_split'] + 3],
    #     "max_features": [max(1, best_params_random['max_features'] - 3), best_params_random['max_features'], best_params_random['max_features'] + 4]
    # }

    # # Ensure parameters are within the original bounds
    # for key in param_grid.keys():
    #     param_grid[key] = [np.max([x, param_dist[key].a]) for x in param_grid[key]]

    # grid_search = GridSearchCV(rfc, param_grid=param_grid, scoring=make_scorer(f1_score, average='micro'), cv=inner_cv, verbose=1)
    # grid_search.fit(x_data, y_targets)
    # best_params_grid = grid_search.best_params_

    # print("Best parameters from Grid Search:", best_params_grid)

    return best_params_random

# Usage: best_params = nested_cv_rfc(x_data, y_targets)


In [56]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=12345)

In [57]:
m_rf = RandomForestClassifier(n_estimators = 20, max_features = 3, random_state=42).fit(X_train, y_train)
y_pred = m_rf.predict(X_test)

print(classification_report(y_test, y_pred, zero_division=0))

              precision    recall  f1-score   support

           0       0.69      0.85      0.76        13
           1       0.50      0.50      0.50         8
           2       0.67      0.44      0.53         9

    accuracy                           0.63        30
   macro avg       0.62      0.60      0.60        30
weighted avg       0.63      0.63      0.62        30



In [58]:
best_params = nested_cv_rfc(X_train, y_train)

Fitting 4 folds for each of 50 candidates, totalling 200 fits
Fitting 4 folds for each of 50 candidates, totalling 200 fits
Fitting 4 folds for each of 50 candidates, totalling 200 fits
Fitting 4 folds for each of 50 candidates, totalling 200 fits
Fitting 4 folds for each of 50 candidates, totalling 200 fits
Nested CV score (mean): 0.9428571428571428
Fitting 4 folds for each of 50 candidates, totalling 200 fits


In [59]:
m_rf_optimized = RandomForestClassifier(random_state=42, n_estimators = int(best_params["n_estimators"]), max_depth = int(best_params["max_depth"]), max_leaf_nodes = int(best_params["max_leaf_nodes"]), min_samples_split = int(best_params["min_samples_split"]), max_features = int(best_params["max_features"])).fit(X_train, y_train)
y_pred = m_rf_optimized.predict(X_test)

print(classification_report(y_test, y_pred, zero_division=0))

              precision    recall  f1-score   support

           0       1.00      0.92      0.96        13
           1       0.33      0.38      0.35         8
           2       0.33      0.33      0.33         9

    accuracy                           0.60        30
   macro avg       0.56      0.54      0.55        30
weighted avg       0.62      0.60      0.61        30

