In [1]:
import sklearn
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import scipy
import seaborn as sns

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score, classification_report
from sklearn.model_selection import GridSearchCV, KFold, cross_val_score


In [2]:
# load data
df_X = pd.read_csv('../data/X.csv', index_col=0)
df_y = pd.read_csv('../data/y.csv', index_col=0)

X = df_X.to_numpy()
y = df_y.values.ravel()  # 0 is HER2+, 1 is HR+, 2 is Triple Negative

In [3]:
# split data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=12345)

# Logistic regression

## logistic regression using multiclass ovr strategy

In [4]:
# define the model
log_reg = LogisticRegression(solver='liblinear', multi_class='ovr')

# define the grid of hyperparameters
param_grid = {'C': [0.001, 0.01, 0.1, 1, 10, 100],
              'penalty': ['l1', 'l2'],
               'max_iter': [100, 200, 300, 400, 500]}

# search
grid_search = GridSearchCV(log_reg, param_grid, cv=5)
grid_search.fit(X_train, y_train)
print("Best hyperparameters:", grid_search.best_params_)

# evaluate the best model
best_model = grid_search.best_estimator_
y_pred = best_model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy: ", accuracy)

# classification report
print(classification_report(y_test, y_pred))

Best hyperparameters: {'C': 1, 'max_iter': 100, 'penalty': 'l1'}
Accuracy:  0.8
              precision    recall  f1-score   support

           0       1.00      1.00      1.00        13
           1       0.60      0.75      0.67         8
           2       0.71      0.56      0.62         9

    accuracy                           0.80        30
   macro avg       0.77      0.77      0.76        30
weighted avg       0.81      0.80      0.80        30



## logistic regression (0 agaisnt 1 & 2)

In [5]:
# convert y_train and y_test to be binary
y_train_1 = np.where(y_train == 2, 1, y_train)
y_test_1 = np.where(y_test == 2, 1, y_test)

# define the model
log_reg = LogisticRegression(solver='liblinear')

# define the grid of hyperparameters
param_grid = {'C': [0.001, 0.01, 0.1, 1, 10, 100],
              'penalty': ['l1', 'l2'],
              'max_iter': [100, 200, 300, 400, 500]}

# search
grid_search = GridSearchCV(log_reg, param_grid, cv=5)
grid_search.fit(X_train, y_train_1)
print("Best hyperparameters:", grid_search.best_params_)

# evaluate the best model
best_model = grid_search.best_estimator_
y_pred = best_model.predict(X_test)
accuracy = accuracy_score(y_test_1, y_pred)
print("Accuracy: ", accuracy)

# classification report
print(classification_report(y_test_1, y_pred))

Best hyperparameters: {'C': 1, 'max_iter': 100, 'penalty': 'l1'}
Accuracy:  1.0
              precision    recall  f1-score   support

           0       1.00      1.00      1.00        13
           1       1.00      1.00      1.00        17

    accuracy                           1.00        30
   macro avg       1.00      1.00      1.00        30
weighted avg       1.00      1.00      1.00        30



## logistic regression (1 against 0 & 2)

In [6]:
# convert y_train and y_test to binary
y_train_2 = np.where(y_train == 2, 0, y_train)
y_test_2 = np.where(y_test == 2, 0, y_test)

# define the model
log_reg = LogisticRegression(solver='liblinear')

# define the grid of hyperparameters
param_grid = {'C': [0.001, 0.01, 0.1, 1, 10, 100],
              'penalty': ['l1', 'l2'],
              'max_iter': [100, 200, 300, 400, 500]}

# search
grid_search = GridSearchCV(log_reg, param_grid, cv=5)
grid_search.fit(X_train, y_train_2)
print("Best hyperparameters:", grid_search.best_params_)

# evaluate the best model
best_model = grid_search.best_estimator_
y_pred = best_model.predict(X_test)
accuracy = accuracy_score(y_test_2, y_pred)
print("Accuracy: ", accuracy)

# classification report
print(classification_report(y_test_2, y_pred))

Best hyperparameters: {'C': 0.01, 'max_iter': 100, 'penalty': 'l2'}
Accuracy:  0.8333333333333334
              precision    recall  f1-score   support

           0       0.84      0.95      0.89        22
           1       0.80      0.50      0.62         8

    accuracy                           0.83        30
   macro avg       0.82      0.73      0.75        30
weighted avg       0.83      0.83      0.82        30



## logistic regression (2 against 0 & 1)

In [7]:
# convert y_train and y_test to binary
y_train_3 = np.where(y_train == 1, 0, y_train)
y_test_3 = np.where(y_test == 1, 0, y_test)

# define the model
log_reg = LogisticRegression(solver='liblinear')

# define the grid of hyperparameters
param_grid = {'C': [0.001, 0.01, 0.1, 1, 10, 100],
              'penalty': ['l1', 'l2'],
              'max_iter': [100, 200, 300, 400, 500]}

# search
grid_search = GridSearchCV(log_reg, param_grid, cv=5)
grid_search.fit(X_train, y_train_3)
print("Best hyperparameters:", grid_search.best_params_)

# model evaluation
best_model = grid_search.best_estimator_
y_pred = best_model.predict(X_test)
accuracy = accuracy_score(y_test_3, y_pred)
print("Accuracy: ", accuracy)

# classification report
print(classification_report(y_test_3, y_pred))

Best hyperparameters: {'C': 1, 'max_iter': 100, 'penalty': 'l1'}
Accuracy:  0.7333333333333333
              precision    recall  f1-score   support

           0       0.78      0.86      0.82        21
           2       0.57      0.44      0.50         9

    accuracy                           0.73        30
   macro avg       0.68      0.65      0.66        30
weighted avg       0.72      0.73      0.72        30



# k-nearest neighbors

In [9]:
# define the model
knn = KNeighborsClassifier()

# define the grid of hyperparameters
param_grid = {'n_neighbors': [1, 3, 5, 7, 9, 11],
              'weights': ['uniform', 'distance'],
              'p': [1, 2]}

# search
grid_search = GridSearchCV(knn, param_grid, cv=5)
grid_search.fit(X_train, y_train)
print("Best hyperparameters:", grid_search.best_params_)

# evaluate the best model
best_model = grid_search.best_estimator_
y_pred = best_model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy: ", accuracy)

# classification report
print(classification_report(y_test, y_pred))

Best hyperparameters: {'n_neighbors': 5, 'p': 1, 'weights': 'uniform'}
Accuracy:  0.3333333333333333
              precision    recall  f1-score   support

           0       0.80      0.31      0.44        13
           1       0.26      0.75      0.39         8
           2       0.00      0.00      0.00         9

    accuracy                           0.33        30
   macro avg       0.35      0.35      0.28        30
weighted avg       0.42      0.33      0.30        30

