### Load dataset

In [1]:
import pandas as pd

# Location of the input CSV. Point this at another variant of the file to
# experiment with a different feature set.
FILE_PATH = "../Data/patient_genes_literature.csv"

df = pd.read_csv(filepath_or_buffer=FILE_PATH)

# Summarise column names, non-null counts and dtypes of the loaded frame
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 977 entries, 0 to 976
Data columns (total 21 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   CACNA2D2  977 non-null    int64 
 1   ESR1      977 non-null    int64 
 2   AGR2      977 non-null    int64 
 3   GATA3     977 non-null    int64 
 4   SLC16A6   977 non-null    int64 
 5   TBC1D9    977 non-null    int64 
 6   INPP4B    977 non-null    int64 
 7   LDHB      977 non-null    int64 
 8   MLPH      977 non-null    int64 
 9   TSPAN1    977 non-null    int64 
 10  STBD1     977 non-null    int64 
 11  STARD3    977 non-null    int64 
 12  RARA      977 non-null    int64 
 13  MCCC2     977 non-null    int64 
 14  PSAT1     977 non-null    int64 
 15  MFGE8     977 non-null    int64 
 16  ANXA9     977 non-null    int64 
 17  PPP1R14C  977 non-null    int64 
 18  SLC44A4   977 non-null    int64 
 19  tnbc      977 non-null    bool  
 20  case_id   977 non-null    object
dtypes: bool(1), int6

# Basic model implementations

### Imports

In [2]:
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_score
from typing import get_args

### Dataset split: training and test data

In [3]:
# Constants
TNBC = "tnbc"  # Target column (boolean)
CASE_ID = "case_id"  # Identifier column; excluded from the features below
RANDOM_STATE = 42  # Single seed shared by the split and by every model
TEST_SIZE = 0.2  # Fraction of the samples held out for testing

# Type annotations
Model = LogisticRegression | SVC | RandomForestClassifier


# Features: all columns except TNBC and CASE_ID columns
X = df.drop(columns=[TNBC, CASE_ID])
# Target variable
y = df[TNBC]

# Use the shared RANDOM_STATE (instead of a second hard-coded 42) so that
# changing the seed in one place affects the split and the models alike.
# stratify=y splits the data in a stratified fashion using the target as class labels.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=TEST_SIZE,
    random_state=RANDOM_STATE,
    stratify=y,
)

# Training size = 0.8 * 977 ≈ 781
# Test size = 0.2 * 977 ≈ 196
print(f"{X_train.shape=}")
print(f"{X_test.shape=}")
print(f"{y_train.shape=}")
print(f"{y_test.shape=}")


X_train.shape=(781, 19)
X_test.shape=(196, 19)
y_train.shape=(781,)
y_test.shape=(196,)


### Helper functions

In [4]:
def get_accuracy_score(y_pred) -> float:
    """Return the accuracy of ``y_pred`` against the held-out test labels.

    Parameters
    ----------
    y_pred
        Predicted labels to compare with the module-level ``y_test``.

    Returns
    -------
    The value computed by ``accuracy_score(y_test, y_pred)``.
    """
    score = accuracy_score(y_true=y_test, y_pred=y_pred)
    return score

def print_evaluated_model_accuracy(y_pred) -> None:
    """Print the test-set accuracy of ``y_pred`` with two decimal places.

    Parameters
    ----------
    y_pred
        Predicted labels, forwarded unchanged to ``get_accuracy_score``.
    """
    accuracy = get_accuracy_score(y_pred)
    print(f"Accuracy: {accuracy:.2f}")

def print_validated_model_accuracy(model: Model, cv: int = 5) -> None:
    """Cross-validate ``model`` and print the per-fold and mean accuracy.

    Parameters
    ----------
    model
        Estimator to validate; must be an instance of one of the types
        that make up ``Model``.
    cv
        Forwarded to ``cross_val_score`` as its ``cv`` argument (default 5).

    Raises
    ------
    AssertionError
        If ``model`` is not an instance of a supported type.

    Notes
    -----
    Scoring runs on the module-level ``X`` and ``y`` (all samples), not on
    the train/test partitions.
    """
    supported_types = get_args(Model)
    supported_names = [type_.__name__ for type_ in supported_types]
    assert isinstance(model, supported_types), f"Parameter 'model' needs to be one of the following types {supported_names}"

    cv_scores = cross_val_score(estimator=model, X=X, y=y, scoring="accuracy", cv=cv)
    model_name = type(model).__name__

    print(f"Model validation for {model_name}:")
    print(cv_scores)
    print(f"\nMean accuracy: {cv_scores.mean():.4f}\n")

### Logistic Regression

In [5]:
# Build and train the model in a single expression (fit() returns the estimator itself).
# max_iter is raised above the default of 100 to resolve the following warning:
#       ConvergenceWarning: lbfgs failed to converge (status=1): STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT.
logistic_regression_model = LogisticRegression(
    solver="lbfgs",
    max_iter=2700,
    random_state=RANDOM_STATE,
).fit(X_train, y_train)

# Predict labels for the held-out test features
y_pred_logistic_regression = logistic_regression_model.predict(X_test)

# Report accuracy on the test split
print_evaluated_model_accuracy(y_pred_logistic_regression)

Accuracy: 0.94


### Support Vector Machine (SVM)

In [6]:
# Build and train the support vector classifier in a single expression
# (fit() returns the estimator itself); all other hyperparameters keep their defaults.
svm_model = SVC(random_state=RANDOM_STATE).fit(X_train, y_train)

# Predict labels for the held-out test features
y_pred_svm = svm_model.predict(X_test)

# Report accuracy on the test split
print_evaluated_model_accuracy(y_pred_svm)

Accuracy: 0.96


### Random Forest

In [7]:
# Build and train the random forest in a single expression
# (fit() returns the estimator itself); all other hyperparameters keep their defaults.
random_forest_model = RandomForestClassifier(random_state=RANDOM_STATE).fit(X_train, y_train)

# Predict labels for the held-out test features
y_pred_random_forest = random_forest_model.predict(X_test)

# Report accuracy on the test split
print_evaluated_model_accuracy(y_pred_random_forest)

Accuracy: 0.94


## Basic model validation

In [None]:
# Cross-validate each model in turn, in the same order they were trained above
for candidate_model in (logistic_regression_model, svm_model, random_forest_model):
    print_validated_model_accuracy(candidate_model)


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

Model validation for LogisticRegression:
[0.92857143 0.94387755 0.95897436 0.93846154 0.94358974]

Mean accuracy: 0.9427

Model validation for SVC:
[0.92857143 0.94897959 0.94871795 0.92820513 0.93333333]

Mean accuracy: 0.9376

