### Load dataset

In [83]:
%run "DataHelpers.ipynb"

In [94]:
# Input CSV locations, relative to the notebook's working directory.
FILE_PATH = "../Data/patient_genes_lasso.csv"
FILE_PATH_VALIDATION = "../Data/validationset.csv"

GENE_FILE_VARIANT = "lasso"

# The 27 gene columns used as model features; the order here is the order
# in which the validation columns are selected.
featuresLASSO = """
    CD1A   CSF2RB  EPCAM  ERBB2    ESR1
    EZH2   FGB     FOXA1  FOXC1    GATA3
    LAMA2  LMNA    MDGA2  OBSCN    OGN
    PGR    SELL    SRC    TACSTD2  TBC1D22B
    TFF1   TGFB3   UBE2C  VTCN1    WTAP
    YES1   YOD1
""".split()

# Data for the train/test split, read from FILE_PATH.
df = pd.read_csv(FILE_PATH)
# Validation data, read from FILE_PATH_VALIDATION.
# NOTE(review): the `dfValidation.columns` output further down lists 'FOXA1.1'
# and 'FOXC1.1', which is how pandas renames a repeated header name — presumably
# the CSV contains FOXA1 and FOXC1 twice. Selecting 'FOXA1'/'FOXC1' then picks
# the first copy; confirm both copies carry the same values.
dfValidation = pd.read_csv(FILE_PATH_VALIDATION)

In [115]:
# Rebuild the validation frame from every row labelled tnbc == False followed
# by only the first 15 rows labelled tnbc == True.
dfValidation = pd.concat(
    [
        dfValidation.loc[dfValidation['tnbc'] == False],
        dfValidation.loc[dfValidation['tnbc'] == True].head(15),
    ]
)

In [116]:
# Key of the classifier to build; the same string is passed to run_model and
# run_cross_validation in later cells.
modelName = 'SVM'
# getModel is not defined in this notebook — presumably it comes from the
# DataHelpers.ipynb %run at the top.
model = getModel(modelName)

# Train/Test

In [117]:
### Dataset split: training and test data, with SMOTE and without SMOTE
# Unpacks seven values: X / y are later passed to cross-validation, the
# train/test parts and test_case_ids to run_model.
# NOTE(review): the meaning of the third argument (True) is not visible here —
# confirm against split_data in the helper notebook.
X, y, X_train, X_test, y_train, y_test, test_case_ids = split_data(df, "tnbc", True)

X_train.shape=(781, 27)
X_test.shape=(196, 27)
y_train.shape=(781,)
y_test.shape=(196,)


In [118]:
# Run the model on the train/test split. Later cells call model.predict on the
# validation data without any fit in this notebook, so run_model presumably
# fits `model` in place — TODO confirm.
# NOTE(review): `y_prod` looks like a typo for `y_prob`; the name is reused by
# later cells, so rename it everywhere at once. The meaning of the `False`
# argument is not visible here.
y_pred, y_prod = run_model(model, X_train, X_test, y_train, y_test, test_case_ids, False, modelName)
# Prints the accuracy of the test-set predictions.
print_evaluated_model_accuracy(y_test, y_pred)

Accuracy: 0.95


In [119]:
# Cross-validate on the full X / y; the printed output lists five per-fold
# accuracies and their mean. The test-set predictions are passed in as well.
# NOTE(review): the meaning of the `False` argument is not visible here.
metrics = run_cross_validation(model, X, y, y_test, y_pred, y_prod, False, modelName)

Model validation for SVC:
[0.9489795918367347, 0.9489795918367347, 0.958974358974359, 0.9487179487179487, 0.9487179487179487]

Mean accuracy: 0.9509



# Validation

In [120]:
# Feature matrix: the featuresLASSO columns of the validation frame, in the
# list's order (the same order as the gene columns shown by `df.columns`).
X_val = dfValidation[featuresLASSO]
# Labels: the 'tnbc' column.
y_val = dfValidation['tnbc']
# Sample identifiers, exported next to the predictions.
test_case_ids_val = dfValidation['case_id']

In [121]:
def run_model_validation(model: Model,
                         X_validation: pd.DataFrame,
                         y_validation: pd.Series,
                         test_case_ids_val: pd.Series,
                         output_path: str = "../Data/model_output_lasso_validation.csv"):
    """Score the validation rows with ``model`` and export the results to CSV.

    The model is used as-is: only ``predict`` and ``predict_proba`` are
    called, no fitting happens here.

    Args:
        model: Classifier exposing ``predict`` and ``predict_proba``.
        X_validation: Feature rows to score.
        y_validation: True labels, one per row of ``X_validation``.
        test_case_ids_val: Sample identifiers, one per row of ``X_validation``.
        output_path: Destination CSV. Defaults to the file this function has
            always written, so existing calls behave as before.

    Returns:
        Tuple ``(y_pred, y_prob)``: the predicted labels and the second
        column of ``predict_proba``.

    Raises:
        ValueError: Raised by pandas when the labels, case ids and
            predictions do not all have the same length.
    """
    # Model predictions
    y_pred = model.predict(X_validation)
    y_prob = model.predict_proba(X_validation)[:, 1]  # For ROC curves etc.

    # The predictions are plain positional arrays. Strip the index from the
    # label / id Series too, so every column is matched row-by-row; mixing
    # index-bearing Series with arrays lets pandas align on the index, which
    # fails or silently reorders rows when the two Series' indexes differ.
    predictions = pd.DataFrame({
        "case_id": pd.Series(test_case_ids_val).to_numpy(),
        "y_validation": pd.Series(y_validation).to_numpy(),
        "y_pred": y_pred,
        "y_prob": y_prob,
    })
    predictions.to_csv(output_path, index=False)

    return y_pred, y_prob

In [122]:
def run_cross_validation_validation(model: Model,
                                    X: pd.DataFrame,
                                    y: pd.Series,
                                    y_validation: pd.Series,
                                    y_pred: pd.Series,
                                    y_prob: pd.Series) -> pd.DataFrame:
    """Combine validation-set metrics with 5-fold cross-validation metrics.

    Cross-validation runs on ``X`` / ``y`` with ``cv=5``. The metrics of the
    supplied predictions against ``y_validation`` are labelled fold 0 and
    placed in front of the cross-validation rows. The combined frame is
    written to ``../Data/model_metrics_lasso_validation.csv`` without its
    index, so the fold label set here is not part of the file.

    Returns:
        The combined metrics frame (fold-0 row first).
    """
    cv_metrics: pd.DataFrame = get_cross_validation_metrics(model, X, y, cv=5)

    # Metrics of the already-computed predictions, tagged as fold 0.
    validation_row = get_metrics(y_validation, y_pred, y_prob)
    validation_row["fold"] = 0
    validation_frame = pd.DataFrame([validation_row]).set_index("fold")

    # Report on the cross-validation rows only, before the fold-0 row is added.
    print_validated_model_accuracy(model, cv_metrics)

    combined = pd.concat([validation_frame, cv_metrics])
    combined.to_csv("../Data/model_metrics_lasso_validation.csv", index=False)
    return combined

In [123]:
# Score the validation set with the model from the cells above and export the
# predictions. This rebinds y_pred / y_prod, which previously held the
# test-split results.
y_pred, y_prod = run_model_validation(model, X_val, y_val, test_case_ids_val)
# Prints the accuracy of the validation predictions.
print_evaluated_model_accuracy(y_val, y_pred)

Accuracy: 0.91


In [124]:
# X / y here are still the training data, so the cross-validation part repeats
# the earlier run (the printed fold accuracies are identical); only the fold-0
# row comes from the validation predictions.
metrics = run_cross_validation_validation(model, X, y, y_val, y_pred, y_prod)

Model validation for SVC:
[0.9489795918367347, 0.9489795918367347, 0.958974358974359, 0.9487179487179487, 0.9487179487179487]

Mean accuracy: 0.9509



In [35]:
# NOTE(review): `y_val_pred` is not assigned in any visible cell, so this fails
# with NameError on a fresh kernel unless the helper notebook defines it. The
# execution count (35) also predates the cells above; the validation cell above
# stores its predictions in `y_pred`.
print_evaluated_model_accuracy(y_val, y_val_pred)

Accuracy: 0.22


In [None]:
# Unexecuted scratch cell: the same cross-validation call that
# run_cross_validation_validation makes internally. Running it would rebind
# `metrics` to the cross-validation rows only.
metrics: pd.DataFrame = get_cross_validation_metrics(model, X, y, cv=5)

In [None]:
# Unexecuted cell: a bare reference to the helper — if run it would only
# display the object's repr, not call it.
run_cross_validation

In [26]:
# NOTE(review): import placed mid-notebook; a fresh run needs it before first use.
from sklearn.metrics import accuracy_score, classification_report

# NOTE(review): `y_val_pred` is not assigned in any visible cell. The report
# below counts 44 False + 220 True = 264 rows, so it apparently predates the
# cell that keeps only 15 TNBC rows — treat the output as stale.
print("Validation Accuracy:", accuracy_score(y_val, y_val_pred))
print(classification_report(y_val, y_val_pred))

Validation Accuracy: 0.2159090909090909
              precision    recall  f1-score   support

       False       0.18      1.00      0.30        44
        True       1.00      0.06      0.11       220

    accuracy                           0.22       264
   macro avg       0.59      0.53      0.20       264
weighted avg       0.86      0.22      0.14       264



In [12]:
# Inspect the validation frame's column names (note 'FOXA1.1' / 'FOXC1.1' in the output).
dfValidation.columns

Index(['MCF2L2', 'FOXC1', 'YBX1', 'IGF2BP2', 'FSCN1', 'GABRP', 'SOX10',
       'CENPA', 'PADI2', 'CDC20', 'BCL11A', 'HAPLN3', 'ANP32E', 'SFT2D2',
       'B3GNT5', 'ANKS6', 'FOXA1', 'PSAT1', 'TBX19', 'CDCA7', 'CENPW', 'UGT8',
       'RGMA', 'EN1', 'FAM171A1', 'CD1A', 'CSF2RB', 'EPCAM', 'ERBB2', 'ESR1',
       'EZH2', 'FGB', 'FOXA1.1', 'FOXC1.1', 'GATA3', 'LAMA2', 'LMNA', 'MDGA2',
       'OBSCN', 'OGN', 'PGR', 'SELL', 'SRC', 'TACSTD2', 'TBC1D22B', 'TFF1',
       'TGFB3', 'UBE2C', 'VTCN1', 'WTAP', 'YES1', 'YOD1', 'tnbc', 'case_id'],
      dtype='object')

In [13]:
# Inspect the training frame's column names: the 27 gene columns, then 'tnbc' and 'case_id'.
df.columns

Index(['CD1A', 'CSF2RB', 'EPCAM', 'ERBB2', 'ESR1', 'EZH2', 'FGB', 'FOXA1',
       'FOXC1', 'GATA3', 'LAMA2', 'LMNA', 'MDGA2', 'OBSCN', 'OGN', 'PGR',
       'SELL', 'SRC', 'TACSTD2', 'TBC1D22B', 'TFF1', 'TGFB3', 'UBE2C', 'VTCN1',
       'WTAP', 'YES1', 'YOD1', 'tnbc', 'case_id'],
      dtype='object')

# Check sets

In [79]:
# Two validation CSVs to compare: the default file and the
# validationSet_002_log2_ipv_logln variant.
FILE_PATH_VALIDATION = "../Data/validationset.csv"
FILE_PATH_VALIDATION_002 = "../Data/validationSet_002_log2_ipv_logln.csv"

In [80]:
# Load both validation files into separate frames for a side-by-side check.
dfValidationReg = pd.read_csv(FILE_PATH_VALIDATION)
dfValidation002 = pd.read_csv(FILE_PATH_VALIDATION_002)

In [81]:
# Display the frame loaded from FILE_PATH_VALIDATION.
dfValidationReg

Unnamed: 0,MCF2L2,FOXC1,YBX1,IGF2BP2,FSCN1,GABRP,SOX10,CENPA,PADI2,CDC20,...,TBC1D22B,TFF1,TGFB3,UBE2C,VTCN1,WTAP,YES1,YOD1,tnbc,case_id
0,2.906656,-0.045557,-3.404675,0.784476,-1.030411,-0.178807,0.548661,-2.150723,0.014748,-2.064972,...,0.218273,-0.171400,1.878036,-1.979163,-0.817563,0.321491,-3.407671,-0.297077,False,GSM1589130
1,3.251919,-0.578161,-2.489785,-0.405011,-1.139367,-0.237349,0.341292,-2.022550,-0.096569,-1.961587,...,0.262102,-0.341459,1.886232,-2.141611,-0.925359,0.142187,-2.517126,0.595815,False,GSM1589132
2,-0.994381,-0.289689,-0.592803,-0.044653,-0.694771,0.250623,0.948817,-2.135303,-0.236685,-1.886303,...,-0.318314,1.812879,-0.805355,-2.244029,0.669864,-0.144367,-0.606983,-0.405455,False,GSM1589135
3,0.542396,-0.357438,-1.657265,0.266690,-1.064841,0.153747,0.556115,-2.144512,0.034366,-2.239165,...,-0.304682,1.588648,1.478626,-2.172633,0.313476,0.010790,-1.144171,-0.318650,False,GSM1589136
4,2.880789,0.087706,-3.048958,0.275307,-1.107604,-0.182131,0.661719,-1.975151,-0.130464,-2.156158,...,-0.086079,1.918943,1.139478,-2.126164,0.243492,0.337446,-2.396946,-0.065357,False,GSM1589139
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
259,-0.307491,1.890169,-0.362390,0.862283,0.464287,1.248310,1.821276,-0.649643,0.231845,-0.424073,...,0.153752,-0.694506,-0.154748,-0.603112,1.337267,0.008891,1.046461,0.637177,True,GSM1977806
260,-0.787413,0.360567,-0.191468,1.306563,-0.513155,1.042681,0.568771,0.712811,-0.089895,0.083339,...,0.984941,-0.582045,0.541873,0.775003,0.943340,0.899344,0.004209,0.596294,True,GSM1977807
261,-0.627182,-0.976734,-0.269421,-0.674288,-0.048087,-1.879732,-1.281948,0.178204,-0.596554,-0.183282,...,-0.712770,-0.494037,0.415678,-0.236496,-0.255125,-0.964955,0.342264,-0.740909,True,GSM1977808
262,0.469178,-0.884371,0.265612,0.259593,0.334620,-0.806102,-0.446514,-0.631191,-1.124660,-0.655525,...,-0.588675,0.009444,0.603756,-0.832212,-1.175730,-0.509135,-1.919895,0.104807,True,GSM1977809


In [82]:
# Display the frame loaded from FILE_PATH_VALIDATION_002; the rows and columns
# visible in the output match those shown for dfValidationReg.
dfValidation002

Unnamed: 0,MCF2L2,FOXC1,YBX1,IGF2BP2,FSCN1,GABRP,SOX10,CENPA,PADI2,CDC20,...,TBC1D22B,TFF1,TGFB3,UBE2C,VTCN1,WTAP,YES1,YOD1,tnbc,case_id
0,2.906656,-0.045557,-3.404675,0.784476,-1.030411,-0.178807,0.548661,-2.150723,0.014748,-2.064972,...,0.218273,-0.171400,1.878036,-1.979163,-0.817563,0.321491,-3.407671,-0.297077,False,GSM1589130
1,3.251919,-0.578161,-2.489785,-0.405011,-1.139367,-0.237349,0.341292,-2.022550,-0.096569,-1.961587,...,0.262102,-0.341459,1.886232,-2.141611,-0.925359,0.142187,-2.517126,0.595815,False,GSM1589132
2,-0.994381,-0.289689,-0.592803,-0.044653,-0.694771,0.250623,0.948817,-2.135303,-0.236685,-1.886303,...,-0.318314,1.812879,-0.805355,-2.244029,0.669864,-0.144367,-0.606983,-0.405455,False,GSM1589135
3,0.542396,-0.357438,-1.657265,0.266690,-1.064841,0.153747,0.556115,-2.144512,0.034366,-2.239165,...,-0.304682,1.588648,1.478626,-2.172633,0.313476,0.010790,-1.144171,-0.318650,False,GSM1589136
4,2.880789,0.087706,-3.048958,0.275307,-1.107604,-0.182131,0.661719,-1.975151,-0.130464,-2.156158,...,-0.086079,1.918943,1.139478,-2.126164,0.243492,0.337446,-2.396946,-0.065357,False,GSM1589139
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
259,-0.307491,1.890169,-0.362390,0.862283,0.464287,1.248310,1.821276,-0.649643,0.231845,-0.424073,...,0.153752,-0.694506,-0.154748,-0.603112,1.337267,0.008891,1.046461,0.637177,True,GSM1977806
260,-0.787413,0.360567,-0.191468,1.306563,-0.513155,1.042681,0.568771,0.712811,-0.089895,0.083339,...,0.984941,-0.582045,0.541873,0.775003,0.943340,0.899344,0.004209,0.596294,True,GSM1977807
261,-0.627182,-0.976734,-0.269421,-0.674288,-0.048087,-1.879732,-1.281948,0.178204,-0.596554,-0.183282,...,-0.712770,-0.494037,0.415678,-0.236496,-0.255125,-0.964955,0.342264,-0.740909,True,GSM1977808
262,0.469178,-0.884371,0.265612,0.259593,0.334620,-0.806102,-0.446514,-0.631191,-1.124660,-0.655525,...,-0.588675,0.009444,0.603756,-0.832212,-1.175730,-0.509135,-1.919895,0.104807,True,GSM1977809


In [108]:
# Only a cell's last expression is displayed, so this filter's result is discarded.
df[df['tnbc']==False]
# NOTE(review): ad-hoc ratio with unexplained constants. The split output above
# reports 781 + 196 = 977 rows, so 997 may be a typo — confirm what 997 and
# 115 stand for.
997/115

8.669565217391304

In [113]:
#332 total
# NOTE(review): the classification report earlier counts 44 False + 220 True =
# 264 validation rows, not 332 — confirm which total is meant.
# The next two filters are evaluated but not shown: a cell displays only its
# last expression.
dfValidation[dfValidation['tnbc'] ==True]  #220
dfValidation[dfValidation['tnbc'] ==False] 

# Displayed result: the first 20 rows labelled tnbc == True.
dfValidation[dfValidation['tnbc'] ==True].head(20)

Unnamed: 0,MCF2L2,FOXC1,YBX1,IGF2BP2,FSCN1,GABRP,SOX10,CENPA,PADI2,CDC20,...,TBC1D22B,TFF1,TGFB3,UBE2C,VTCN1,WTAP,YES1,YOD1,tnbc,case_id
112,-0.163981,1.691638,1.528404,1.526723,-1.592138,0.999286,2.297211,1.028926,-0.21532,0.945575,...,-0.00203,-0.886626,-0.123615,1.040317,0.159182,-0.100783,2.796527,2.165617,True,GSM1589058
113,-0.488798,1.531626,1.527889,1.289593,-1.374284,0.955714,2.05451,1.016185,-0.203311,0.899762,...,0.038836,-0.913973,-0.893895,0.95781,0.148716,-0.664334,2.619827,1.299883,True,GSM1589059
114,-0.198854,0.216323,0.687347,-1.359696,2.364885,1.461778,1.386387,1.18227,-0.190582,0.624134,...,-0.132213,-0.899293,-0.532909,1.070009,-0.803913,-0.377545,0.09518,-0.127662,True,GSM1589060
115,0.503709,1.233748,0.688273,-0.342586,1.267982,0.798976,2.223217,0.751371,-0.159751,0.952122,...,0.012517,-0.910885,-0.033825,1.12742,0.600881,-0.396303,-0.198054,-0.090862,True,GSM1589061
116,-0.025715,1.194999,1.503451,1.56118,1.005184,1.49275,1.728801,0.872014,-0.469725,0.778729,...,-1.114025,-0.905298,-0.008528,0.306684,0.550856,-0.503862,1.797447,-0.352537,True,GSM1589062
117,-0.330107,0.67047,-0.397707,-1.828104,0.289915,1.276253,2.421869,0.299095,0.091798,0.695323,...,0.02643,-0.914654,-0.429029,0.107109,0.165502,-0.435364,-0.219763,-0.139151,True,GSM1589063
118,0.520854,1.536707,0.25825,1.92191,-1.435818,1.025624,-0.163363,0.684653,-0.154175,1.427224,...,-0.137024,-0.843319,0.403726,0.90684,0.837267,0.13584,1.822109,-0.155946,True,GSM1589054
119,0.560061,1.535018,0.417131,1.812176,-1.165432,1.084507,-0.133023,0.78569,-0.139477,1.396801,...,0.092403,-0.90538,0.750262,0.816615,0.894673,-0.361743,2.136433,-0.056679,True,GSM1589055
120,0.266075,0.670089,0.082085,0.83919,2.041665,1.341805,1.913625,0.953002,-0.175881,0.953465,...,-0.002017,-0.910456,0.025725,0.638467,-0.368061,-0.413918,0.182491,-0.095479,True,GSM1589056
121,-0.536471,1.199058,0.536826,0.560419,0.758834,0.847697,1.35916,0.815348,-0.499343,0.518356,...,-0.955723,-0.907308,0.1482,0.521181,0.96722,-0.545877,0.758471,-0.168323,True,GSM1589045
