### Load dataset

In [3]:
import pandas as pd

# Seed reused wherever a random_state is needed further down.
RANDOM_STATE = 42

# Label inserted into the names of the CSV files this notebook exports.
variant = "svm"

# Input feature set; point this at another CSV to try a different feature set.
# NOTE(review): the original note here read "patient_genes_literature" —
# presumably the name of an alternative feature-set file; confirm.
FILE_PATH = "../Data/patient_genes_featureSet_2.csv"

df = pd.read_csv(FILE_PATH)

# Summary of columns, non-null counts and dtypes.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 977 entries, 0 to 976
Data columns (total 32 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   FYN      977 non-null    int64 
 1   BRCA1    977 non-null    int64 
 2   FOXC1    977 non-null    int64 
 3   TBC1D1   977 non-null    int64 
 4   LAG3     977 non-null    int64 
 5   CDK6     977 non-null    int64 
 6   GATA3    977 non-null    int64 
 7   CCND1    977 non-null    int64 
 8   PRR4     977 non-null    int64 
 9   EPCAM    977 non-null    int64 
 10  CD274    977 non-null    int64 
 11  PIK3CA   977 non-null    int64 
 12  TOP2A    977 non-null    int64 
 13  DCLK1    977 non-null    int64 
 14  MYC      977 non-null    int64 
 15  LRPPRC   977 non-null    int64 
 16  BRCA2    977 non-null    int64 
 17  TP53     977 non-null    int64 
 18  MKI67    977 non-null    int64 
 19  TTN      977 non-null    int64 
 20  CTLA4    977 non-null    int64 
 21  PTEN     977 non-null    int64 
 22  YE

### Imports

In [4]:
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC

%run "..\Model\DataHelpers.ipynb"

### Dataset split: training and test data

In [5]:
# Split df into features/labels and train/test parts via the split_data helper
# (not defined in the cells shown here; presumably from DataHelpers.ipynb).
# NOTE(review): "tnbc" looks like the label column and "case_id" the identifier
# column whose test-split values come back as test_case_id — confirm in the helper.
X, y, X_train, X_test, y_train, y_test, test_case_id = split_data(df, "tnbc", "case_id")

X_train.shape=(781, 30)
X_test.shape=(196, 30)
y_train.shape=(781,)
y_test.shape=(196,)


### Support Vector Machine (SVM)

In [6]:
# Build the classifier and fit it on the training split in one step.
# probability=True enables predict_proba, which is used just below.
model = SVC(probability=True, random_state=RANDOM_STATE).fit(X_train, y_train)

# Hard class labels, plus the second predict_proba column (the class at
# index 1 of model.classes_), kept for ROC curves etc.
y_pred = model.predict(X_test)
y_prob = model.predict_proba(X_test)[:, 1]

# Collect the per-case results and write them to a CSV named after the variant.
output_path = f"../Data/model_output_{variant}.csv"
predictions = pd.DataFrame(
    {"case_id": test_case_id, "y_test": y_test, "y_pred": y_pred, "y_prob": y_prob}
)
predictions.to_csv(output_path, index=False)

# Report accuracy on the held-out split.
print_evaluated_model_accuracy(y_test, y_pred)

Accuracy: 0.94


## Model cross validation

In [7]:
# Cross-validate the model on the full data set; 5 is the fold-count argument
# (five per-fold accuracies are reported in the output).
metrics = get_cross_validation_metrics(model, X, y, 5)

# Metrics of the initial hold-out evaluation, labelled as fold 0 so they sit
# ahead of the cross-validation folds.
test_metrics = get_metrics(y_test, y_pred, y_prob)
test_metrics["fold"] = 0  # Initial test metrics (before cross validation)
test = pd.DataFrame([test_metrics]).set_index("fold")

# Report on the cross-validation folds only, before the hold-out row is added.
print_validated_model_accuracy(model, metrics)

# Prepend the hold-out row, then export and display.
# The "fold" index is written out as a column so each exported row stays
# identifiable; the previous index=False silently discarded it.
metrics = pd.concat([test, metrics])
metrics.to_csv(f"../Data/model_metrics_{variant}.csv", index=True, index_label="fold")
metrics

Model validation for SVC:
[0.9285714285714286, 0.9285714285714286, 0.9384615384615385, 0.9230769230769231, 0.9128205128205128]

Mean accuracy: 0.9263



Unnamed: 0_level_0,accuracy,recall,precision,f1_score,roc_auc,true_positive,true_negative,false_positive,false_negative
fold,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
0,0.943878,0.652174,0.833333,0.731707,0.975873,15,170,3,8
1,0.928571,0.608696,0.736842,0.666667,0.971852,14,168,5,9
2,0.928571,0.608696,0.736842,0.666667,0.966826,14,168,5,9
3,0.938462,0.565217,0.866667,0.684211,0.962588,13,170,2,10
4,0.923077,0.565217,0.722222,0.634146,0.966127,13,167,5,10
5,0.912821,0.521739,0.666667,0.585366,0.929474,12,166,6,11


### Boruta Feature Selection Example

#### The features selected by Boruta can be fed to the SVM

In [39]:
from boruta import BorutaPy
from sklearn.ensemble import RandomForestClassifier
import numpy as np

# Compatibility shim: BorutaPy still refers to np.int / np.float / np.bool,
# which fails with "module 'numpy' has no attribute 'int'" on current NumPy.
# https://github.com/scikit-learn-contrib/boruta_py/issues/122#issuecomment-1914122968
for _alias, _dtype in (("int", np.int32), ("float", np.float64), ("bool", np.bool_)):
    setattr(np, _alias, _dtype)

# Hand Boruta plain NumPy arrays built from the training split only.
Boruta_y = y_train.astype(int).to_numpy()
Boruta_X = X_train.to_numpy()

# Random forest used by Boruta as the underlying estimator.
rf = RandomForestClassifier(max_depth=5, class_weight="balanced", n_jobs=-1)

# Boruta selector wrapping the forest; verbose=2 prints the per-iteration
# Confirmed / Tentative / Rejected counts.
feat_selector = BorutaPy(rf, random_state=1, verbose=2, n_estimators="auto")

# Search for all relevant features.
feat_selector.fit(Boruta_X, Boruta_y)

Iteration: 	1 / 100
Confirmed: 	0
Tentative: 	30
Rejected: 	0
Iteration: 	2 / 100
Confirmed: 	0
Tentative: 	30
Rejected: 	0
Iteration: 	3 / 100
Confirmed: 	0
Tentative: 	30
Rejected: 	0
Iteration: 	4 / 100
Confirmed: 	0
Tentative: 	30
Rejected: 	0
Iteration: 	5 / 100
Confirmed: 	0
Tentative: 	30
Rejected: 	0
Iteration: 	6 / 100
Confirmed: 	0
Tentative: 	30
Rejected: 	0
Iteration: 	7 / 100
Confirmed: 	0
Tentative: 	30
Rejected: 	0
Iteration: 	8 / 100
Confirmed: 	21
Tentative: 	9
Rejected: 	0
Iteration: 	9 / 100
Confirmed: 	21
Tentative: 	7
Rejected: 	2
Iteration: 	10 / 100
Confirmed: 	21
Tentative: 	7
Rejected: 	2
Iteration: 	11 / 100
Confirmed: 	21
Tentative: 	7
Rejected: 	2
Iteration: 	12 / 100
Confirmed: 	21
Tentative: 	7
Rejected: 	2
Iteration: 	13 / 100
Confirmed: 	21
Tentative: 	7
Rejected: 	2
Iteration: 	14 / 100
Confirmed: 	21
Tentative: 	7
Rejected: 	2
Iteration: 	15 / 100
Confirmed: 	21
Tentative: 	7
Rejected: 	2
Iteration: 	16 / 100
Confirmed: 	21
Tentative: 	7
Rejected: 	2
I

In [35]:
# First row of the NumPy copy of the training features.
Boruta_X[:1]

array([[ 2166,   919,  9038,  5423,   188,  2336,   894,  1182,     4,
        17225,   186,  2026,  5579,   525,  8071, 10446,   664,  1340,
         5556,   270,   149,  3148,  6877,  1778, 44204,   183,   162,
         1704, 21533,  8392]], dtype=int64)

In [36]:
# Same first row taken from the DataFrame, for comparison with Boruta_X[:1].
X_train[:1]

Unnamed: 0,FYN,BRCA1,FOXC1,TBC1D1,LAG3,CDK6,GATA3,CCND1,PRR4,EPCAM,...,CTLA4,PTEN,YES1,YOD1,TACSTD2,ROR1,PDCD1,SRC,DDX3X,MALAT1
758,2166,919,9038,5423,188,2336,894,1182,4,17225,...,149,3148,6877,1778,44204,183,162,1704,21533,8392


In [38]:
# Column names, in the same positional order as the columns of Boruta_X.
X_train.columns

Index(['FYN', 'BRCA1', 'FOXC1', 'TBC1D1', 'LAG3', 'CDK6', 'GATA3', 'CCND1',
       'PRR4', 'EPCAM', 'CD274', 'PIK3CA', 'TOP2A', 'DCLK1', 'MYC', 'LRPPRC',
       'BRCA2', 'TP53', 'MKI67', 'TTN', 'CTLA4', 'PTEN', 'YES1', 'YOD1',
       'TACSTD2', 'ROR1', 'PDCD1', 'SRC', 'DDX3X', 'MALAT1'],
      dtype='object')

In [29]:
# Boolean mask of the selected features, one entry per column of Boruta_X
# (True = selected by Boruta).
feat_selector.support_


array([ True, False,  True,  True,  True,  True,  True,  True, False,
        True, False,  True,  True,  True,  True, False,  True, False,
        True, False,  True,  True,  True,  True,  True,  True,  True,
        True, False,  True])

In [30]:
# Feature ranking, one entry per column of Boruta_X; the selected features
# carry rank 1, higher numbers rank lower.
feat_selector.ranking_


array([1, 2, 1, 1, 1, 1, 1, 1, 6, 1, 4, 1, 1, 1, 1, 3, 1, 2, 1, 5, 1, 1,
       1, 1, 1, 1, 1, 1, 2, 1])

In [31]:
# Call transform() on Boruta_X to filter it down to the selected features.
X_filtered = feat_selector.transform(Boruta_X)

In [32]:
# Filtered training matrix (NumPy abbreviates the display of large arrays).
X_filtered

array([[ 2166,  9038,  5423, ...,   162,  1704,  8392],
       [  533,   108,  1201, ...,     5,   180,  1186],
       [ 1653,    65,  1606, ...,   432,  2055,  1273],
       ...,
       [  496,   446,   888, ...,     2,  4946, 19823],
       [  523,   175,  1432, ...,    45,  1683,  4749],
       [  760,   135,  1249, ...,    12,  1114,  1540]], dtype=int64)