### Load dataset

In [19]:
import pandas as pd
# Seed handed to the SVC constructor further down (random_state=RANDOM_STATE).
RANDOM_STATE = 42

# Executes the helper notebook inside this kernel. Presumably this is where
# FeatureVariant, ModelVariant, split_data and the other helper names used in
# this notebook come from, since none of them are imported here -- TODO confirm.
%run "DataHelpers.ipynb"

# Can be replaced with desired variant for different feature sets
GENE_FILE_VARIANT = FeatureVariant.RESEARCHPAPERS # For values, see FeatureVariant.print_info()
variant = ModelVariant.SVM                      # For values, see ModelVariant.print_info()

# The input CSV is selected by the feature variant; the path is relative to the
# notebook's working directory.
FILE_PATH = f"../Data/patient_genes_{GENE_FILE_VARIANT}.csv"

df = pd.read_csv(FILE_PATH)

# Print column names, dtypes and non-null counts of the loaded frame.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 977 entries, 0 to 976
Data columns (total 33 columns):
 #   Column   Non-Null Count  Dtype  
---  ------   --------------  -----  
 0   BRCA1    977 non-null    float64
 1   BRCA2    977 non-null    float64
 2   CD274    977 non-null    float64
 3   MKI67    977 non-null    float64
 4   PDCD1    977 non-null    float64
 5   PIK3CA   977 non-null    float64
 6   TP53     977 non-null    float64
 7   LRPPRC   977 non-null    float64
 8   YOD1     977 non-null    float64
 9   DCLK1    977 non-null    float64
 10  TOP2A    977 non-null    float64
 11  TACSTD2  977 non-null    float64
 12  ROR1     977 non-null    float64
 13  TTN      977 non-null    float64
 14  CTLA4    977 non-null    float64
 15  EGFR     977 non-null    float64
 16  EPCAM    977 non-null    float64
 17  MYC      977 non-null    float64
 18  PTEN     977 non-null    float64
 19  CDK6     977 non-null    float64
 20  DDX3X    977 non-null    float64
 21  SRC      977 non

### Import model function

In [20]:
from sklearn.svm import SVC

### Dataset split: training and test data

In [21]:
# Plain split of the loaded frame. "tnbc" is presumably the target column name
# and the third argument a helper-specific flag -- both are defined by
# split_data in the helper notebook, TODO confirm.
X, y, X_train, X_test, y_train, y_test, test_case_ids = split_data(df, "tnbc", True)
print("\nApplied Smote")
# Same seven-value unpacking for the SMOTE variant of the split.
# NOTE(review): the recorded output shows a 345-row test split here versus 196
# rows without SMOTE, so the test set appears to contain resampled rows. Verify
# whether split_data_apply_smote oversamples before splitting; if it does, the
# SMOTE test metrics are likely optimistic.
X_smote, y_smote, X_train_smote, X_test_smote, y_train_smote, y_test_smote, test_case_ids_smote = split_data_apply_smote(df, "tnbc")

X_train.shape=(781, 31)
X_test.shape=(196, 31)
y_train.shape=(781,)
y_test.shape=(196,)

Applied Smote
X_train.shape=(1379, 31)
X_test.shape=(345, 31)
y_train.shape=(1379,)
y_test.shape=(345,)


### Support Vector Machine (SVM)

In [22]:
# Create model

# One notebook-level estimator is shared by everything below: run_model and
# run_model_validation refit it in place, and the validation cell predicts with
# whatever fit happened last.
# probability=True is required for the predict_proba calls made on this model.
model = SVC(random_state=RANDOM_STATE, probability=True)

def run_model(X_train: pd.DataFrame, X_test: pd.DataFrame, y_train: pd.Series, y_test: pd.Series, test_case_ids: pd.Series, is_smote: bool):
    """Fit the shared ``model``, score the test split and export the predictions.

    The notebook-level ``model`` is refit in place, so after this call it holds
    the fit for the split passed in here.

    Args:
        X_train: Training features.
        X_test: Test features to predict on.
        y_train: Training labels.
        y_test: True labels of the test split, written to the ``y_test`` column.
        test_case_ids: Identifiers written to the ``case_id`` column.
        is_smote: When true, ``_smote`` is appended to the output file name.

    Returns:
        Tuple ``(y_pred, y_prob)``: the predicted labels and column 1 of
        ``predict_proba`` for ``X_test``.
    """
    model.fit(X_train, y_train)

    predicted_labels = model.predict(X_test)
    # Keep only the second probability column (used for ROC curves etc.).
    class_one_proba = model.predict_proba(X_test)[:, 1]

    # Persist one row per test sample; the frame's index is not written.
    output_path = f"../Data/model_output_{variant}_{GENE_FILE_VARIANT}{'_smote' if is_smote else ''}.csv"
    columns = {
        "case_id": test_case_ids,
        "y_test": y_test,
        "y_pred": predicted_labels,
        "y_prob": class_one_proba,
    }
    pd.DataFrame(columns).to_csv(output_path, index=False)

    return predicted_labels, class_one_proba

In [23]:
# Baseline run on the non-SMOTE split.
# NOTE: the second return value (run_model's y_prob) is bound to `y_prod`; the
# cross-validation cell further down reads it under that name.
y_pred, y_prod = run_model(X_train, X_test, y_train, y_test, test_case_ids, False)
print_evaluated_model_accuracy(y_test, y_pred)

Accuracy: 0.94


## SMOTE applied

In [24]:
# Same run on the SMOTE split; this refits the shared `model`, replacing the
# baseline fit. `y_prod_smote` holds run_model's y_prob and is read again by the
# SMOTE cross-validation cell.
y_pred_smote, y_prod_smote = run_model(X_train_smote, X_test_smote, y_train_smote, y_test_smote, test_case_ids_smote, True)

print_evaluated_model_accuracy(y_test_smote, y_pred_smote)

Accuracy: 0.97


## Model cross validation

In [25]:
def run_cross_validation(X: pd.DataFrame, y: pd.Series, y_test: pd.Series, y_pred: pd.Series, y_prob: pd.Series, is_smote: bool) -> pd.DataFrame:
    """Cross-validate the shared ``model`` and export the metrics with the test-split metrics on top.

    Args:
        X: Full feature matrix passed to the cross-validation helper.
        y: Full label vector passed to the cross-validation helper.
        y_test: True labels of the earlier test split.
        y_pred: Predicted labels for that test split.
        y_prob: Predicted probabilities for that test split.
        is_smote: When true, ``_smote`` is appended to the output file name.

    Returns:
        The test-split metrics row (labelled fold 0) followed by the rows
        returned by ``get_cross_validation_metrics`` (called with ``cv=5``).
    """
    cv_metrics: pd.DataFrame = get_cross_validation_metrics(model, X, y, cv=5)

    # The initial test-split metrics (computed before cross validation) are
    # labelled as fold 0 so they sit above the cross-validation rows.
    holdout_metrics = get_metrics(y_test, y_pred, y_prob)
    holdout_metrics["fold"] = 0
    holdout_row = pd.DataFrame([holdout_metrics]).set_index("fold")

    print_validated_model_accuracy(model, cv_metrics)

    combined = pd.concat([holdout_row, cv_metrics])
    # index=False: the index (including the fold-0 label) is not written to the CSV.
    output_path = f"../Data/model_metrics_{variant}_{GENE_FILE_VARIANT}{'_smote' if is_smote else ''}.csv"
    combined.to_csv(output_path, index=False)
    return combined

In [26]:
# Cross-validate on the full non-SMOTE data and prepend the baseline test
# metrics; `y_prod` is the probability vector produced by the baseline run.
metrics = run_cross_validation(X, y, y_test, y_pred, y_prod, False)
# Bare last expression so the notebook renders the combined metrics table.
metrics

Model validation for SVC:
[0.9438775510204082, 0.9489795918367347, 0.9333333333333333, 0.9435897435897436, 0.9435897435897436]

Mean accuracy: 0.9427



Unnamed: 0_level_0,accuracy,recall,precision,f1_score,roc_auc,true_positive,true_negative,false_positive,false_negative
fold,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
0,0.938776,0.73913,0.73913,0.73913,0.9809,17,167,6,6
1,0.943878,0.782609,0.75,0.765957,0.969088,18,167,6,5
2,0.94898,0.782609,0.782609,0.782609,0.954763,18,168,5,5
3,0.933333,0.608696,0.777778,0.682927,0.962841,14,168,4,9
4,0.94359,0.73913,0.772727,0.755556,0.970677,17,167,5,6
5,0.94359,0.73913,0.772727,0.755556,0.925683,17,167,5,6


In [27]:
# Cross-validate on the SMOTE data and prepend the SMOTE test metrics;
# `y_prod_smote` is the probability vector produced by the SMOTE run.
metric_smote = run_cross_validation(X_smote, y_smote, y_test_smote, y_pred_smote, y_prod_smote, True)
# Bare last expression so the notebook renders the combined metrics table.
metric_smote

Model validation for SVC:
[0.9623188405797102, 0.9652173913043478, 0.9739130434782609, 0.9623188405797102, 0.9680232558139535]

Mean accuracy: 0.9664



Unnamed: 0_level_0,accuracy,recall,precision,f1_score,roc_auc,true_positive,true_negative,false_positive,false_negative
fold,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
0,0.973913,1.0,0.950276,0.974504,0.995127,172,164,9,0
1,0.962319,0.994186,0.934426,0.96338,0.99385,171,161,12,1
2,0.965217,0.988372,0.944444,0.965909,0.987095,170,163,10,2
3,0.973913,0.976879,0.971264,0.974063,0.991128,169,167,5,4
4,0.962319,0.982659,0.944444,0.963173,0.987599,170,162,10,3
5,0.968023,0.994186,0.944751,0.968839,0.991009,171,162,10,1


# Validation set

In [28]:
def run_model_validation(X_train: pd.DataFrame, X_test: pd.DataFrame, y_train: pd.Series, y_test: pd.Series, test_case_ids: pd.Series, is_smote: bool):
    """Refit the shared ``model`` and export its predictions to a ``_validation`` CSV.

    Performs the same fit / predict / export steps as ``run_model``; only the
    output file name differs (it ends in ``_validation.csv``).

    Args:
        X_train: Features used to fit the model.
        X_test: Features to predict on.
        y_train: Labels used to fit the model.
        y_test: True labels written to the ``y_test`` column.
        test_case_ids: Identifiers written to the ``case_id`` column.
        is_smote: When true, ``_smote`` is inserted before ``_validation``.

    Returns:
        Tuple ``(y_pred, y_prob)``: predicted labels and column 1 of
        ``predict_proba`` for ``X_test``.
    """
    # Refit in place: the notebook-level estimator is overwritten by this call.
    model.fit(X_train, y_train)

    labels = model.predict(X_test)
    proba = model.predict_proba(X_test)
    second_column_proba = proba[:, 1]  # single probability column, e.g. for ROC curves

    target_csv = f"../Data/model_output_{variant}_{GENE_FILE_VARIANT}{'_smote' if is_smote else ''}_validation.csv"
    result_frame = pd.DataFrame(
        {
            "case_id": test_case_ids,
            "y_test": y_test,
            "y_pred": labels,
            "y_prob": second_column_proba,
        }
    )
    # The frame's index is not written to the file.
    result_frame.to_csv(target_csv, index=False)

    return labels, second_column_proba

In [31]:
# NOTE(review): FILE_PATH is the same CSV that was loaded into `df` at the top
# of the notebook, so this "validation" frame is the data the model has already
# been trained and tested on. Confirm whether a separate file was intended.
validationSet = pd.read_csv(FILE_PATH)

# validationSet.info()

# Rebinds the notebook-level X, y, X_train, X_test, y_train, y_test and
# test_case_ids created by the earlier split cell.
X, y, X_train, X_test, y_train, y_test, test_case_ids = split_data(validationSet, "tnbc", True)

# Predict on the full feature matrix with the shared `model` in whatever fitted
# state earlier cells left it; run_model_validation is not called here.
y_pred_val = model.predict(X)
y_prob_val = model.predict_proba(X)[:, 1] # For ROC curves etc.

# NOTE(review): test_case_ids belongs to the test split only, while y and both
# prediction arrays cover all rows of X. If test_case_ids and y are
# index-labelled Series, pandas aligns them by label, which would leave case_id
# empty for rows outside the test split -- verify and pass ids for every row.
predictions = pd.DataFrame({
    "case_id": test_case_ids,
    "y_test": y,
    "y_pred": y_pred_val,
    "y_prob": y_prob_val
})



X_train.shape=(781, 31)
X_test.shape=(196, 31)
y_train.shape=(781,)
y_test.shape=(196,)


In [33]:
# Export the validation predictions without the index. This is the same path
# run_model_validation writes to when is_smote is False, so either one
# overwrites the other's file.
predictions.to_csv(f"../Data/model_output_{variant}_{GENE_FILE_VARIANT}_validation.csv", index=False)