In [31]:
import sklearn
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import scipy
import seaborn as sns

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score, classification_report
from sklearn.model_selection import GridSearchCV, KFold, cross_val_score

In [32]:
# load data
df_X = pd.read_csv('../data/X.csv', index_col=0)
df_y = pd.read_csv('../data/y.csv', index_col=0)

# keep selected columns
selected_features = [30, 31, 175, 177, 192, 194, 223, 230, 316, 317, 342, 354, 396, 472, 485, 487, 509, 511, 623, 672, 679, 758, 764, 766, 791, 814, 815, 842, 998, 1035, 1055, 1059, 1170, 1243, 1310, 1481, 1642, 1655, 1659, 1663, 1678, 1679, 1683, 1788, 1897, 1899, 1902, 1906, 1907, 1962, 2017, 2023, 2024, 2026, 2184, 2188, 2206, 2207, 2210, 2211, 2213, 2382, 2427, 2457, 2501, 2655, 2656, 2742, 2747, 2750, 2752] 
new_df_X = df_X.iloc[:, selected_features]

X = new_df_X.to_numpy()
y = df_y.values.ravel()  # 0 is HER2+, 1 is HR+, 2 is Triple Negative

In [33]:
# split data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=12345)

# Logistic regression

In [34]:
# define the model
log_reg = LogisticRegression(solver='liblinear', multi_class='ovr')

# define the grid of hyperparameters
param_grid = {'C': [0.001, 0.01, 0.1, 1, 10, 100],
              'penalty': ['l1', 'l2'],
               'max_iter': [100, 200, 300, 400, 500]}

# configure the nested cross-validation procedure
inner_cv = KFold(n_splits=5, shuffle=True, random_state=12345)
outer_cv = KFold(n_splits=5, shuffle=True, random_state=12345)

# search
grid_search = GridSearchCV(log_reg, param_grid, cv=inner_cv)
cross_val_scores = cross_val_score(grid_search, X_train, y_train, cv=outer_cv)

grid_search.fit(X_train, y_train)

print("Mean cross-validation score:", np.mean(cross_val_scores))
print("Best hyperparameters:", grid_search.best_params_)

# evaluate the best model
best_model = grid_search.best_estimator_
y_pred = best_model.predict(X_test)

# classification report
print(classification_report(y_test, y_pred))


Mean cross-validation score: 0.8857142857142858
Best hyperparameters: {'C': 10, 'max_iter': 100, 'penalty': 'l1'}
              precision    recall  f1-score   support

           0       1.00      1.00      1.00        13
           1       0.70      0.88      0.78         8
           2       0.86      0.67      0.75         9

    accuracy                           0.87        30
   macro avg       0.85      0.85      0.84        30
weighted avg       0.88      0.87      0.87        30



# kNN

In [36]:
# define the model
knn = KNeighborsClassifier()

# define the grid of hyperparameters
param_grid = {'n_neighbors': [1, 3, 5, 7, 9, 11],
              'weights': ['uniform', 'distance'],
              'p': [1, 2]}

# configure the nested cross-validation procedure
inner_cv = KFold(n_splits=5, shuffle=True, random_state=12345)
outer_cv = KFold(n_splits=5, shuffle=True, random_state=12345)

# search
grid_search = GridSearchCV(knn, param_grid, cv=inner_cv)
cross_val_scores = cross_val_score(grid_search, X_train, y_train, cv=outer_cv)

grid_search.fit(X_train, y_train)

print("Mean cross-validation score:", np.mean(cross_val_scores))
print("Best hyperparameters:", grid_search.best_params_)

# evaluate the best model
best_model = grid_search.best_estimator_
y_pred = best_model.predict(X_test)

# classification report
print(classification_report(y_test, y_pred))

Mean cross-validation score: 0.8428571428571429
Best hyperparameters: {'n_neighbors': 1, 'p': 2, 'weights': 'uniform'}
              precision    recall  f1-score   support

           0       0.92      0.92      0.92        13
           1       0.73      1.00      0.84         8
           2       0.83      0.56      0.67         9

    accuracy                           0.83        30
   macro avg       0.83      0.83      0.81        30
weighted avg       0.84      0.83      0.82        30

