In [1]:
import sklearn
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import scipy
import seaborn as sns

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score, classification_report
from sklearn.model_selection import GridSearchCV, KFold, cross_val_score

In [2]:
# Read the feature table and the label table from disk.
# index_col=0 makes the first CSV column the row index of each frame.
df_X, df_y = (
    pd.read_csv(csv_path, index_col=0)
    for csv_path in ('../data/X.csv', '../data/y.csv')
)

# Positional (0-based) column indices of df_X that are kept as model inputs.
selected_features = [
    2056, 2058, 1559, 2083, 1065, 1581, 1583, 1087, 67, 74,
    2130, 2662, 2154, 623, 1654, 1656, 1657, 1663, 2182, 2183,
    1160, 2184, 2185, 1677, 1678, 1679, 2196, 2717, 2206, 671,
    2207, 2208, 2209, 2723, 2212, 2213, 2211, 2725, 2214, 2724,
    2218, 2210, 2732, 2221, 174, 2730, 2219, 2220, 2733, 2223,
    2224, 2225, 2742, 2736, 2740, 189, 192, 2776, 1243, 734,
    2275, 229, 230, 2789, 744, 2791, 742, 743, 745, 761,
    1788, 765, 1789, 2817, 2818, 771, 261, 1289, 2828, 1293,
    1302, 791, 1303, 1306, 800, 801, 803, 805, 669, 808,
    814, 817, 818, 673, 674, 836, 838, 841, 842, 2379,
    676, 2382, 849, 851, 853, 854, 855, 2205, 857, 679,
    858, 863, 1889, 1890, 1891, 1892, 1894, 1895, 1896, 1897,
    1898, 1899, 1902, 1905, 1907, 1910, 888, 385, 386, 2729,
    2486, 2734, 2735, 455, 1994, 462, 2017, 487, 489, 2026,
    2547, 2549,
]

# Slice those columns out of the full frame by position, then hand the
# estimators a plain NumPy matrix.
new_df_X = df_X.iloc[:, selected_features]
X = new_df_X.to_numpy()

# Flatten the label frame into a 1-D vector.
# Class encoding: 0 is HER2+, 1 is HR+, 2 is Triple Negative
y = np.ravel(df_y.values)

In [3]:
# Hold out 30% of the samples as a test set; the fixed seed keeps the
# partition identical from run to run.
# NOTE(review): the split is not stratified by class — consider stratify=y
# if class proportions should be preserved; confirm with the author.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=12345,
)

# Logistic regression

In [4]:
# Nested cross-validation of a one-vs-rest logistic regression, followed by a
# final fit on the training split and an evaluation on the held-out test split.

# define the model
# The liblinear solver shuffles the data internally, so the estimator is seeded
# (same seed as the split and the CV folds) to make every fit in this cell
# repeatable across runs.
# NOTE(review): newer scikit-learn releases deprecate the `multi_class`
# argument — confirm against the installed version before upgrading.
log_reg = LogisticRegression(solver='liblinear', multi_class='ovr', random_state=12345)

# define the grid of hyperparameters
param_grid = {
    'C': [0.001, 0.01, 0.1, 1, 10, 100],
    'penalty': ['l1', 'l2'],
    'max_iter': [100, 200, 300, 400, 500],
}

# configure the nested cross-validation procedure
inner_cv = KFold(n_splits=5, shuffle=True, random_state=12345)
outer_cv = KFold(n_splits=5, shuffle=True, random_state=12345)

# search
# The inner folds pick the hyperparameters; the outer folds score the whole
# tuning procedure, giving the "mean cross-validation score" printed below.
grid_search = GridSearchCV(log_reg, param_grid, cv=inner_cv)
cross_val_scores = cross_val_score(grid_search, X_train, y_train, cv=outer_cv)

# Run the search once more on the full training split to obtain the final
# hyperparameters and the refitted best estimator.
grid_search.fit(X_train, y_train)

print("Mean cross-validation score:", np.mean(cross_val_scores))
print("Best hyperparameters:", grid_search.best_params_)

# evaluate the best model on the held-out test split
best_model = grid_search.best_estimator_
y_pred = best_model.predict(X_test)

# classification report
print(classification_report(y_test, y_pred))


Mean cross-validation score: 0.9285714285714285
Best hyperparameters: {'C': 0.1, 'max_iter': 100, 'penalty': 'l2'}
              precision    recall  f1-score   support

           0       0.85      0.85      0.85        13
           1       0.50      0.62      0.56         8
           2       0.71      0.56      0.62         9

    accuracy                           0.70        30
   macro avg       0.69      0.68      0.68        30
weighted avg       0.71      0.70      0.70        30



# kNN

In [6]:
# Nested cross-validation of a k-nearest-neighbours classifier, followed by a
# final fit on the training split and an evaluation on the held-out test split.

# Fold generators: the inner one is used by the hyperparameter search, the
# outer one by the scoring of the search itself.
inner_cv = KFold(n_splits=5, shuffle=True, random_state=12345)
outer_cv = KFold(n_splits=5, shuffle=True, random_state=12345)

# Estimator and the candidate settings to try for it.
knn = KNeighborsClassifier()
param_grid = dict(
    n_neighbors=[1, 3, 5, 7, 9, 11],
    weights=['uniform', 'distance'],
    p=[1, 2],
)

# Score the tuning procedure on the outer folds ...
grid_search = GridSearchCV(knn, param_grid, cv=inner_cv)
cross_val_scores = cross_val_score(grid_search, X_train, y_train, cv=outer_cv)

# ... then run the search on the whole training split to get the final model.
grid_search.fit(X_train, y_train)

print("Mean cross-validation score:", cross_val_scores.mean())
print("Best hyperparameters:", grid_search.best_params_)

# Predict the held-out test split with the refitted best estimator and
# summarise per-class precision / recall / F1.
best_model = grid_search.best_estimator_
y_pred = best_model.predict(X_test)
print(classification_report(y_test, y_pred))

Mean cross-validation score: 0.8
Best hyperparameters: {'n_neighbors': 3, 'p': 1, 'weights': 'distance'}
              precision    recall  f1-score   support

           0       0.80      0.62      0.70        13
           1       0.36      0.62      0.45         8
           2       0.67      0.44      0.53         9

    accuracy                           0.57        30
   macro avg       0.61      0.56      0.56        30
weighted avg       0.64      0.57      0.58        30

