### Load dataset

In [24]:
%run "..\\Model\\DataHelpers.ipynb"

In [25]:
# Input files: the LASSO-reduced training/test data and the external validation set.
# Plain string literals: the paths contain no placeholders, so no f-string is needed.
FILE_PATH = "../Data/patient_genes_lasso.csv"
FILE_PATH_VALIDATION = "../Data/validationset.csv"

GENE_FILE_VARIANT = 'lasso'

# Gene columns used as model features when evaluating the validation set.
featuresLASSO = [
    'CD1A', 'CSF2RB', 'EPCAM', 'ERBB2', 'ESR1',
    'EZH2', 'FGB', 'FOXA1', 'FOXC1', 'GATA3',
    'LAMA2', 'LMNA', 'MDGA2', 'OBSCN', 'OGN',
    'PGR', 'SELL', 'SRC', 'TACSTD2', 'TBC1D22B',
    'TFF1', 'TGFB3', 'UBE2C', 'VTCN1', 'WTAP',
    'YES1', 'YOD1',
]

df = pd.read_csv(FILE_PATH)
dfValidation = pd.read_csv(FILE_PATH_VALIDATION)

# Fail fast, with a readable message, if the validation file lacks any feature column;
# otherwise the later dfValidation[featuresLASSO] selection fails with a bare KeyError.
_missing_validation_features = [gene for gene in featuresLASSO if gene not in dfValidation.columns]
assert not _missing_validation_features, (
    f"{FILE_PATH_VALIDATION} is missing feature columns: {_missing_validation_features}"
)

In [26]:
# Name of the classifier under evaluation; it is also passed to the helper calls below.
modelName = 'SVM'
# getModel is not defined in this notebook — presumably it comes from DataHelpers.ipynb
# and maps the name to an estimator instance (the recorded output reports "SVC").
model = getModel(modelName)

# Train/Test with SMOTE

In [27]:
# Dataset split: obtain the full feature/label set (X, y), the train/test partitions and
# the ids of the test rows. "tnbc" is the label column (the same column is read as y_val
# for the validation set further down).
# NOTE(review): where SMOTE is applied (training part only vs. all of X, y) is decided
# inside split_data_apply_smote and is not visible here — confirm before reading the
# cross-validation scores, which are computed on X, y.
X, y, X_train, X_test, y_train, y_test, test_case_ids = split_data_apply_smote(df, "tnbc")

X_train.shape=(1379, 27)
X_test.shape=(345, 27)
y_train.shape=(1379,)
y_test.shape=(345,)


In [28]:
# Run the model on the train/test split and print the accuracy on the test partition.
# NOTE(review): the meaning of the positional `False` flag is defined by run_model and is
# not visible here — TODO confirm.
# NOTE(review): `y_prod` receives the second return value; the validation helper defined
# later in this notebook calls the equivalent value `y_prob` (predicted probabilities).
y_pred, y_prod = run_model(model, X_train, X_test, y_train, y_test, test_case_ids, False, modelName)
print_evaluated_model_accuracy(y_test, y_pred)

Accuracy: 0.97


In [29]:
# Cross-validate the model on the full X, y and collect the metrics together with the
# test-set predictions from the previous cell.
# NOTE(review): the positional `False` flag is interpreted by run_cross_validation, which
# is not defined in this notebook — TODO confirm its meaning.
metrics = run_cross_validation(model, X, y, y_test, y_pred, y_prod, False, modelName)

Model validation for SVC:
[0.9681159420289855, 0.9797101449275363, 0.9826086956521739, 0.9652173913043478, 0.9738372093023255]

Mean accuracy: 0.9739



# Validation

In [30]:
# Pull the validation inputs out of dfValidation: row identifiers, the "tnbc" label
# column and the LASSO-selected gene columns used as features.
# NOTE(review): the columns are taken in featuresLASSO order and straight from the CSV;
# verify that this matches the column order and any preprocessing applied to the
# training features inside split_data_apply_smote.
test_case_ids_val = dfValidation.loc[:, 'case_id']
y_val = dfValidation.loc[:, 'tnbc']
X_val = dfValidation.loc[:, featuresLASSO]

In [31]:
def run_model_validation(model: Model,
                         X_validation: pd.DataFrame,
                         y_validation: pd.Series,
                         test_case_ids_val: pd.Series,
                         output_path: str = "../Data/model_output_trainTestSMOTE_lasso_validation.csv"):
    """Predict on the validation set and export the per-case predictions as CSV.

    The model is used as-is: this function calls neither fit nor any preprocessing,
    so `model` must already be trained when it is passed in.

    Parameters
    ----------
    model
        Estimator exposing `predict` and `predict_proba`.
        NOTE(review): for a scikit-learn SVC, `predict_proba` is only available when
        the estimator was created with `probability=True` — confirm in getModel.
    X_validation
        Feature columns of the validation set.
    y_validation
        True labels of the validation set, written next to the predictions.
    test_case_ids_val
        Case identifier of every validation row.
    output_path
        Destination of the predictions CSV. Defaults to the path this notebook has
        always written to, so existing calls behave exactly as before.

    Returns
    -------
    tuple
        `(y_pred, y_prob)`: the predicted labels and column 1 of the
        `predict_proba` output.
    """
    # Model predictions
    y_pred = model.predict(X_validation)
    # Column 1 of predict_proba — presumably the positive ("tnbc") class. For ROC curves etc.
    y_prob = model.predict_proba(X_validation)[:, 1]

    # Collect ids, ground truth and predictions side by side and save them to CSV
    predictions = pd.DataFrame({
        "case_id": test_case_ids_val,
        "y_validation": y_validation,
        "y_pred": y_pred,
        "y_prob": y_prob
    })
    predictions.to_csv(output_path, index=False)

    return y_pred, y_prob

In [32]:
def run_cross_validation_validation(model: Model,
                                    X: pd.DataFrame,
                                    y: pd.Series,
                                    y_validation: pd.Series,
                                    y_pred: pd.Series,
                                    y_prob: pd.Series,
                                    cv: int = 5,
                                    output_path: str = "../Data/model_metrics_trainTestSMOTE_lasso_validation.csv"
                                    ) -> pd.DataFrame:
    """Combine validation-set metrics with cross-validation metrics and export them.

    The cross-validation is computed on `X`, `y` — not on the validation data. Only
    the prepended row labelled fold 0 is derived from the validation set
    (`y_validation`, `y_pred`, `y_prob`). The printed fold accuracies therefore do
    not change when a different validation set is supplied.

    Parameters
    ----------
    model
        Estimator handed to `get_cross_validation_metrics`.
    X, y
        Features and labels the cross-validation is run on.
    y_validation
        True labels of the validation set.
    y_pred, y_prob
        Predicted labels and probabilities for the validation set.
    cv
        Number of cross-validation folds (default 5, the previously hard-coded value).
    output_path
        Destination of the metrics CSV. Defaults to the path this notebook has always
        written to, so existing calls behave exactly as before.

    Returns
    -------
    pd.DataFrame
        The validation-set row followed by the cross-validation rows.
    """
    metrics: pd.DataFrame = get_cross_validation_metrics(model, X, y, cv=cv)

    # Metrics on the validation predictions, labelled fold 0 so they sort ahead of the
    # cross-validation folds.
    validation_metrics = get_metrics(y_validation, y_pred, y_prob)
    validation_metrics["fold"] = 0
    validation_row = pd.DataFrame([validation_metrics]).set_index("fold")

    print_validated_model_accuracy(model, metrics)

    # Prepend the validation row to the cross-validation metrics, export and return
    metrics = pd.concat([validation_row, metrics])
    # NOTE(review): index=False omits the "fold" index from the file, so rows in the CSV
    # are only identifiable by position. Kept as-is to preserve the existing file format.
    metrics.to_csv(output_path, index=False)
    return metrics

In [33]:
# Predict on the validation set with the already-prepared model and print its accuracy.
# This rebinds y_pred / y_prod, which until now held the test-split predictions.
# NOTE(review): the recorded output shows accuracy 0.48 here versus 0.97 on the test
# split. Check that the validation features receive the same preprocessing and column
# order as the training features before drawing conclusions from this number.
y_pred, y_prod = run_model_validation(model, X_val, y_val, test_case_ids_val)
print_evaluated_model_accuracy(y_val, y_pred)

Accuracy: 0.48


In [34]:
# Export the validation metrics together with the cross-validation metrics.
# The cross-validation inside run_cross_validation_validation runs on X, y, so the
# printed fold accuracies repeat those of the earlier cross-validation cell; only the
# prepended fold-0 row reflects the validation set.
metrics = run_cross_validation_validation(model, X, y, y_val, y_pred, y_prod)

Model validation for SVC:
[0.9681159420289855, 0.9797101449275363, 0.9826086956521739, 0.9652173913043478, 0.9738372093023255]

Mean accuracy: 0.9739

