In [1]:
import sklearn
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import scipy
import seaborn as sns

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score, classification_report
from sklearn.model_selection import GridSearchCV, KFold, cross_val_score

In [2]:
# Read the feature table and the label table from disk.
# In both files the first CSV column is used as the row index.
df_X = pd.read_csv('../data/X.csv', index_col=0)
df_y = pd.read_csv('../data/y.csv', index_col=0)

# Flatten the label frame into a 1-D array.
# Class codes: 0 is HER2+, 1 is HR+, 2 is Triple Negative.
y = df_y.to_numpy().ravel()

# List 2

In [3]:
# "List 2": positional indices of the feature columns kept for this experiment.
list_2 = [
    1902, 1956, 1973, 2026, 2058, 2183, 2184, 2207, 2211, 2213, 2547, 2593,
    1672, 118, 192, 695, 772, 791, 854, 1061, 1091, 1559, 1643, 1656,
    1678, 1900, 2017, 2021, 2024, 2210, 2218, 2750, 2776, 2791, 2817, 2825,
]

# Select those columns by position (all rows) and expose them as a NumPy array.
new_df_X = df_X.iloc[:, list_2]
X = new_df_X.to_numpy()

In [4]:
# Hold out 30% of the samples as a test set; the fixed seed makes the partition repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=12345,
)

## Logistic regression

In [5]:
# Nested cross-validation of a one-vs-rest logistic regression (liblinear solver)
# on the "List 2" features, followed by a final evaluation on the held-out test split.
log_reg = LogisticRegression(multi_class='ovr', solver='liblinear')

# Candidate settings explored by the inner grid search.
param_grid = {
    'C': [0.001, 0.01, 0.1, 1, 10, 100],
    'penalty': ['l1', 'l2'],
    'max_iter': [100, 200, 300, 400, 500],
}

# Inner folds choose the hyperparameters; outer folds score the whole tuning procedure.
# Both splitters are shuffled 5-fold with the same fixed seed.
inner_cv = KFold(n_splits=5, shuffle=True, random_state=12345)
outer_cv = KFold(n_splits=5, shuffle=True, random_state=12345)

# Passing the grid search itself to cross_val_score gives the nested-CV scores.
grid_search = GridSearchCV(estimator=log_reg, param_grid=param_grid, cv=inner_cv)
cross_val_scores = cross_val_score(grid_search, X_train, y_train, cv=outer_cv)

# Fit the search on the full training split to obtain the tuned model used below.
grid_search.fit(X_train, y_train)

print("Mean cross-validation score:", cross_val_scores.mean())
print("Best hyperparameters:", grid_search.best_params_)

# Predict the held-out test split with the best estimator found by the search.
best_model = grid_search.best_estimator_
y_pred = best_model.predict(X_test)

# Per-class precision / recall / F1 on the test split.
print(classification_report(y_test, y_pred))


Mean cross-validation score: 0.9285714285714286
Best hyperparameters: {'C': 1, 'max_iter': 100, 'penalty': 'l2'}
              precision    recall  f1-score   support

           0       1.00      1.00      1.00        13
           1       1.00      0.88      0.93         8
           2       0.90      1.00      0.95         9

    accuracy                           0.97        30
   macro avg       0.97      0.96      0.96        30
weighted avg       0.97      0.97      0.97        30



## kNN

In [6]:
# Nested cross-validation of a k-nearest-neighbours classifier on the "List 2"
# features, followed by a final evaluation on the held-out test split.
knn = KNeighborsClassifier()

# Candidate settings: odd neighbour counts from 1 to 11, both weighting schemes,
# and Minkowski power p = 1 or 2.
param_grid = {
    'n_neighbors': list(range(1, 12, 2)),
    'weights': ['uniform', 'distance'],
    'p': [1, 2],
}

# Inner folds choose the hyperparameters; outer folds score the whole tuning procedure.
# Both splitters are shuffled 5-fold with the same fixed seed.
inner_cv = KFold(n_splits=5, shuffle=True, random_state=12345)
outer_cv = KFold(n_splits=5, shuffle=True, random_state=12345)

# Passing the grid search itself to cross_val_score gives the nested-CV scores.
grid_search = GridSearchCV(estimator=knn, param_grid=param_grid, cv=inner_cv)
cross_val_scores = cross_val_score(grid_search, X_train, y_train, cv=outer_cv)

# Fit the search on the full training split to obtain the tuned model used below.
grid_search.fit(X_train, y_train)

print("Mean cross-validation score:", cross_val_scores.mean())
print("Best hyperparameters:", grid_search.best_params_)

# Predict the held-out test split with the best estimator found by the search.
best_model = grid_search.best_estimator_
y_pred = best_model.predict(X_test)

# Per-class precision / recall / F1 on the test split.
print(classification_report(y_test, y_pred))

Mean cross-validation score: 0.7714285714285714
Best hyperparameters: {'n_neighbors': 11, 'p': 1, 'weights': 'distance'}
              precision    recall  f1-score   support

           0       1.00      0.62      0.76        13
           1       0.50      1.00      0.67         8
           2       0.83      0.56      0.67         9

    accuracy                           0.70        30
   macro avg       0.78      0.72      0.70        30
weighted avg       0.82      0.70      0.71        30



# List 3

In [7]:
# "List 3": positional indices of the feature columns kept for this experiment.
list_3 = [
    2026, 2184, 2213, 791,
    1656, 1678, 1900, 2750,
]

# Select those columns by position (all rows) and expose them as a NumPy array.
new_df_X = df_X.iloc[:, list_3]
X = new_df_X.to_numpy()

In [8]:
# Hold out 30% of the samples as a test set; the fixed seed makes the partition repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=12345,
)

## Logistic regression

In [9]:
# Nested cross-validation of a one-vs-rest logistic regression (liblinear solver)
# on the "List 3" features, followed by a final evaluation on the held-out test split.
log_reg = LogisticRegression(multi_class='ovr', solver='liblinear')

# Candidate settings explored by the inner grid search.
param_grid = {
    'C': [0.001, 0.01, 0.1, 1, 10, 100],
    'penalty': ['l1', 'l2'],
    'max_iter': [100, 200, 300, 400, 500],
}

# Inner folds choose the hyperparameters; outer folds score the whole tuning procedure.
# Both splitters are shuffled 5-fold with the same fixed seed.
inner_cv = KFold(n_splits=5, shuffle=True, random_state=12345)
outer_cv = KFold(n_splits=5, shuffle=True, random_state=12345)

# Passing the grid search itself to cross_val_score gives the nested-CV scores.
grid_search = GridSearchCV(estimator=log_reg, param_grid=param_grid, cv=inner_cv)
cross_val_scores = cross_val_score(grid_search, X_train, y_train, cv=outer_cv)

# Fit the search on the full training split to obtain the tuned model used below.
grid_search.fit(X_train, y_train)

print("Mean cross-validation score:", cross_val_scores.mean())
print("Best hyperparameters:", grid_search.best_params_)

# Predict the held-out test split with the best estimator found by the search.
best_model = grid_search.best_estimator_
y_pred = best_model.predict(X_test)

# Per-class precision / recall / F1 on the test split.
print(classification_report(y_test, y_pred))

Mean cross-validation score: 0.9
Best hyperparameters: {'C': 1, 'max_iter': 100, 'penalty': 'l2'}
              precision    recall  f1-score   support

           0       0.93      1.00      0.96        13
           1       0.67      0.75      0.71         8
           2       0.86      0.67      0.75         9

    accuracy                           0.83        30
   macro avg       0.82      0.81      0.81        30
weighted avg       0.84      0.83      0.83        30



## kNN

In [10]:
# Nested cross-validation of a k-nearest-neighbours classifier on the "List 3"
# features, followed by a final evaluation on the held-out test split.
knn = KNeighborsClassifier()

# Candidate settings: odd neighbour counts from 1 to 11, both weighting schemes,
# and Minkowski power p = 1 or 2.
param_grid = {
    'n_neighbors': list(range(1, 12, 2)),
    'weights': ['uniform', 'distance'],
    'p': [1, 2],
}

# Inner folds choose the hyperparameters; outer folds score the whole tuning procedure.
# Both splitters are shuffled 5-fold with the same fixed seed.
inner_cv = KFold(n_splits=5, shuffle=True, random_state=12345)
outer_cv = KFold(n_splits=5, shuffle=True, random_state=12345)

# Passing the grid search itself to cross_val_score gives the nested-CV scores.
grid_search = GridSearchCV(estimator=knn, param_grid=param_grid, cv=inner_cv)
cross_val_scores = cross_val_score(grid_search, X_train, y_train, cv=outer_cv)

# Fit the search on the full training split to obtain the tuned model used below.
grid_search.fit(X_train, y_train)

print("Mean cross-validation score:", cross_val_scores.mean())
print("Best hyperparameters:", grid_search.best_params_)

# Predict the held-out test split with the best estimator found by the search.
best_model = grid_search.best_estimator_
y_pred = best_model.predict(X_test)

# Per-class precision / recall / F1 on the test split.
print(classification_report(y_test, y_pred))

Mean cross-validation score: 0.9
Best hyperparameters: {'n_neighbors': 3, 'p': 1, 'weights': 'distance'}
              precision    recall  f1-score   support

           0       1.00      1.00      1.00        13
           1       0.58      0.88      0.70         8
           2       0.80      0.44      0.57         9

    accuracy                           0.80        30
   macro avg       0.79      0.77      0.76        30
weighted avg       0.83      0.80      0.79        30



# List 4

In [11]:
# "List 4": positional indices of the feature columns kept for this experiment.
list_4 = [
    2026,
    2184,
    2213,
]

# Select those columns by position (all rows) and expose them as a NumPy array.
new_df_X = df_X.iloc[:, list_4]
X = new_df_X.to_numpy()

In [12]:
# Hold out 30% of the samples as a test set; the fixed seed makes the partition repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=12345,
)

## Logistic regression

In [13]:
# Nested cross-validation of a one-vs-rest logistic regression (liblinear solver)
# on the "List 4" features, followed by a final evaluation on the held-out test split.
log_reg = LogisticRegression(multi_class='ovr', solver='liblinear')

# Candidate settings explored by the inner grid search.
param_grid = {
    'C': [0.001, 0.01, 0.1, 1, 10, 100],
    'penalty': ['l1', 'l2'],
    'max_iter': [100, 200, 300, 400, 500],
}

# Inner folds choose the hyperparameters; outer folds score the whole tuning procedure.
# Both splitters are shuffled 5-fold with the same fixed seed.
inner_cv = KFold(n_splits=5, shuffle=True, random_state=12345)
outer_cv = KFold(n_splits=5, shuffle=True, random_state=12345)

# Passing the grid search itself to cross_val_score gives the nested-CV scores.
grid_search = GridSearchCV(estimator=log_reg, param_grid=param_grid, cv=inner_cv)
cross_val_scores = cross_val_score(grid_search, X_train, y_train, cv=outer_cv)

# Fit the search on the full training split to obtain the tuned model used below.
grid_search.fit(X_train, y_train)

print("Mean cross-validation score:", cross_val_scores.mean())
print("Best hyperparameters:", grid_search.best_params_)

# Predict the held-out test split with the best estimator found by the search.
best_model = grid_search.best_estimator_
y_pred = best_model.predict(X_test)

# Per-class precision / recall / F1 on the test split.
print(classification_report(y_test, y_pred))

Mean cross-validation score: 0.7142857142857143
Best hyperparameters: {'C': 10, 'max_iter': 100, 'penalty': 'l1'}
              precision    recall  f1-score   support

           0       1.00      1.00      1.00        13
           1       0.62      1.00      0.76         8
           2       1.00      0.44      0.62         9

    accuracy                           0.83        30
   macro avg       0.87      0.81      0.79        30
weighted avg       0.90      0.83      0.82        30



## kNN

In [14]:
# Nested cross-validation of a k-nearest-neighbours classifier on the "List 4"
# features, followed by a final evaluation on the held-out test split.
knn = KNeighborsClassifier()

# Candidate settings: odd neighbour counts from 1 to 11, both weighting schemes,
# and Minkowski power p = 1 or 2.
param_grid = {
    'n_neighbors': list(range(1, 12, 2)),
    'weights': ['uniform', 'distance'],
    'p': [1, 2],
}

# Inner folds choose the hyperparameters; outer folds score the whole tuning procedure.
# Both splitters are shuffled 5-fold with the same fixed seed.
inner_cv = KFold(n_splits=5, shuffle=True, random_state=12345)
outer_cv = KFold(n_splits=5, shuffle=True, random_state=12345)

# Passing the grid search itself to cross_val_score gives the nested-CV scores.
grid_search = GridSearchCV(estimator=knn, param_grid=param_grid, cv=inner_cv)
cross_val_scores = cross_val_score(grid_search, X_train, y_train, cv=outer_cv)

# Fit the search on the full training split to obtain the tuned model used below.
grid_search.fit(X_train, y_train)

print("Mean cross-validation score:", cross_val_scores.mean())
print("Best hyperparameters:", grid_search.best_params_)

# Predict the held-out test split with the best estimator found by the search.
best_model = grid_search.best_estimator_
y_pred = best_model.predict(X_test)

# Per-class precision / recall / F1 on the test split.
print(classification_report(y_test, y_pred))

Mean cross-validation score: 0.6571428571428573
Best hyperparameters: {'n_neighbors': 3, 'p': 1, 'weights': 'distance'}
              precision    recall  f1-score   support

           0       1.00      0.92      0.96        13
           1       0.60      0.75      0.67         8
           2       0.75      0.67      0.71         9

    accuracy                           0.80        30
   macro avg       0.78      0.78      0.78        30
weighted avg       0.82      0.80      0.81        30



# List 5

In [15]:
# "List 5": a single feature column, identified by its positional index.
list_5 = [
    2184,
]

# Indexing with a list keeps the selection 2-D (one column), then expose it as a NumPy array.
new_df_X = df_X.iloc[:, list_5]
X = new_df_X.to_numpy()

In [16]:
# Hold out 30% of the samples as a test set; the fixed seed makes the partition repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=12345,
)

## Logistic regression

In [17]:
# Nested cross-validation of a one-vs-rest logistic regression (liblinear solver)
# on the single "List 5" feature, followed by a final evaluation on the held-out test split.
log_reg = LogisticRegression(solver='liblinear', multi_class='ovr')

# Candidate settings explored by the inner grid search.
param_grid = {
    'C': [0.001, 0.01, 0.1, 1, 10, 100],
    'penalty': ['l1', 'l2'],
    'max_iter': [100, 200, 300, 400, 500],
}

# Inner folds choose the hyperparameters; outer folds score the whole tuning procedure.
# Both splitters are shuffled 5-fold with the same fixed seed.
inner_cv = KFold(n_splits=5, shuffle=True, random_state=12345)
outer_cv = KFold(n_splits=5, shuffle=True, random_state=12345)

# Passing the grid search itself to cross_val_score gives the nested-CV scores.
grid_search = GridSearchCV(log_reg, param_grid, cv=inner_cv)
cross_val_scores = cross_val_score(grid_search, X_train, y_train, cv=outer_cv)

# Fit the search on the full training split to obtain the tuned model used below.
grid_search.fit(X_train, y_train)

print("Mean cross-validation score:", np.mean(cross_val_scores))
print("Best hyperparameters:", grid_search.best_params_)

# Predict the held-out test split with the best estimator found by the search.
best_model = grid_search.best_estimator_
y_pred = best_model.predict(X_test)

# Per-class precision / recall / F1 on the test split.
# In the recorded run this model never predicts class 2, so that class's precision is
# a 0/0 division and scikit-learn emits an UndefinedMetricWarning for each affected average.
# zero_division=0 states the intended value explicitly: the report still shows 0.00
# for that class, but without the warnings.
print(classification_report(y_test, y_pred, zero_division=0))

Mean cross-validation score: 0.5857142857142856
Best hyperparameters: {'C': 10, 'max_iter': 100, 'penalty': 'l1'}
              precision    recall  f1-score   support

           0       1.00      1.00      1.00        13
           1       0.47      1.00      0.64         8
           2       0.00      0.00      0.00         9

    accuracy                           0.70        30
   macro avg       0.49      0.67      0.55        30
weighted avg       0.56      0.70      0.60        30



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


## kNN

In [18]:
# Nested cross-validation of a k-nearest-neighbours classifier on the single
# "List 5" feature, followed by a final evaluation on the held-out test split.
knn = KNeighborsClassifier()

# Candidate settings: odd neighbour counts from 1 to 11, both weighting schemes,
# and Minkowski power p = 1 or 2.
param_grid = {
    'n_neighbors': list(range(1, 12, 2)),
    'weights': ['uniform', 'distance'],
    'p': [1, 2],
}

# Inner folds choose the hyperparameters; outer folds score the whole tuning procedure.
# Both splitters are shuffled 5-fold with the same fixed seed.
inner_cv = KFold(n_splits=5, shuffle=True, random_state=12345)
outer_cv = KFold(n_splits=5, shuffle=True, random_state=12345)

# Passing the grid search itself to cross_val_score gives the nested-CV scores.
grid_search = GridSearchCV(estimator=knn, param_grid=param_grid, cv=inner_cv)
cross_val_scores = cross_val_score(grid_search, X_train, y_train, cv=outer_cv)

# Fit the search on the full training split to obtain the tuned model used below.
grid_search.fit(X_train, y_train)

print("Mean cross-validation score:", cross_val_scores.mean())
print("Best hyperparameters:", grid_search.best_params_)

# Predict the held-out test split with the best estimator found by the search.
best_model = grid_search.best_estimator_
y_pred = best_model.predict(X_test)

# Per-class precision / recall / F1 on the test split.
print(classification_report(y_test, y_pred))

Mean cross-validation score: 0.6142857142857142
Best hyperparameters: {'n_neighbors': 7, 'p': 1, 'weights': 'uniform'}
              precision    recall  f1-score   support

           0       1.00      1.00      1.00        13
           1       0.50      0.25      0.33         8
           2       0.54      0.78      0.64         9

    accuracy                           0.73        30
   macro avg       0.68      0.68      0.66        30
weighted avg       0.73      0.73      0.71        30

