In [None]:
## Denne notebooken er et skall for Grid Search, kryssvalidering og prediksjon på eksternt datasett
## Skriptet ble kjørt igjennom én gang for hvert undersett hentet ut fra filen "Skall for egenskapsutvelging ved RENT.ipynb"

In [None]:
# Importering av nødvendige pakker
from sklearn.metrics import f1_score, precision_score, recall_score, matthews_corrcoef, accuracy_score, roc_auc_score
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression as LR
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis
from sklearn.linear_model import PassiveAggressiveClassifier
from lightgbm import LGBMClassifier
import numpy as np
import pandas as pd
from sklearn.pipeline import make_pipeline
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RepeatedStratifiedKFold
from sklearn.model_selection import cross_val_score

In [None]:
# Load the OUS training data.
# Features: the selected feature subset, indexed by the first CSV column.
X = pd.DataFrame(pd.read_csv('Undersett02DFS.csv', index_col=0))

# Response: read the response table and keep only the 'event_DFS' column.
response_path = 'data/tabular_data/ous/response_ous.xlsx'
y = pd.DataFrame(pd.read_excel(response_path, index_col=0))['event_DFS']

In [None]:
# Check the pairwise correlation between the features in the subset.
# The training features live in `X` (there is no `Xtrain` in this notebook).
cmat = X.corr()

# If two features have a correlation above 0.985, remove one of them with:
# X = X.drop(labels=['feature_name'], axis=1)

# Convert X and y to NumPy arrays; the cross-validation loop further down
# indexes them positionally with fold indices.
X_train = X.values
y_train = y.values

# Keep the correlation matrix as the last expression so the notebook renders it.
cmat

In [None]:
# Load the external MAASTRO test data.
X_test = pd.read_csv('maastro.csv')

# Keep exactly the features used for training, in the same column order.
# Indexing with the Index itself selects those columns; a missing feature
# raises a KeyError instead of silently producing a mismatched matrix.
X_test = X_test[X.columns]

# Response for the external cohort: only the 'DFS_event' column is used.
# NOTE(review): neither file is read with index_col, so rows are assumed to be
# in the same patient order in both files - TODO confirm.
y_test = pd.read_excel('data/tabular_data/maastro/response_maastro.xlsx')
y_test = y_test['DFS_event']

In [None]:
# One pipeline per classification algorithm. Each pipeline standardises the
# features and then applies the classifier with its default settings.
# make_pipeline names every step after the lower-cased class name, which is
# the prefix used in the grid-search parameter keys below.

pipe_lr = make_pipeline(StandardScaler(), LR())                              # logistic regression
pipe_rf = make_pipeline(StandardScaler(), RandomForestClassifier())          # random forest
pipe_kn = make_pipeline(StandardScaler(), KNeighborsClassifier())            # k-nearest neighbours
pipe_svm = make_pipeline(StandardScaler(), SVC())                            # support vector machine
pipe_ab = make_pipeline(StandardScaler(), AdaBoostClassifier())              # AdaBoost
pipe_qda = make_pipeline(StandardScaler(), QuadraticDiscriminantAnalysis())  # QDA
pipe_pa = make_pipeline(StandardScaler(), PassiveAggressiveClassifier())     # passive aggressive
pipe_lgb = make_pipeline(StandardScaler(), LGBMClassifier())                 # LightGBM



## Grid search

I denne seksjonen søker de ulike klassifiseringsalgoritmene gjennom ulike parameterkombinasjoner, for å finne kombinasjonen som gir høyest ytelse.

For hver klassifiseringsalgoritme utføres følgende:
- Parameter-grid med ulike verdier av ulike hyperparametere defineres
- Grid search settes opp
- Grid search-algoritmen tilpasses med treningsdataen
- Hyperparameterkombinasjonen som ga høyest ytelse under kryssvalidering lagres

In [None]:
# Logistic regression

max_iter = [100, 500, 1000]
C = [0.1, 1, 10, 100]

# Not every solver supports every penalty, so the search space is given as a
# list of sub-grids containing only combinations that can actually be fitted.
# A single Cartesian grid would spend most of its fits on combinations that
# fail and are scored as NaN.
param_grid_lr = [
    # L1 penalty: supported by liblinear and saga only
    {'logisticregression__penalty': ['l1'],
     'logisticregression__solver': ['liblinear', 'saga'],
     'logisticregression__C': C,
     'logisticregression__max_iter': max_iter},
    # L2 penalty: supported by all five solvers
    {'logisticregression__penalty': ['l2'],
     'logisticregression__solver': ['newton-cg', 'lbfgs', 'liblinear', 'sag', 'saga'],
     'logisticregression__C': C,
     'logisticregression__max_iter': max_iter},
    # Elastic net: saga only, and it cannot be fitted without an l1_ratio
    {'logisticregression__penalty': ['elasticnet'],
     'logisticregression__solver': ['saga'],
     'logisticregression__l1_ratio': [0.25, 0.5, 0.75],
     'logisticregression__C': C,
     'logisticregression__max_iter': max_iter},
    # No penalty: not supported by liblinear; C has no effect, so it is not varied.
    # NOTE(review): newer scikit-learn releases spell this value None instead of
    # the string 'none' - adjust to the installed version.
    {'logisticregression__penalty': ['none'],
     'logisticregression__solver': ['newton-cg', 'lbfgs', 'sag', 'saga'],
     'logisticregression__max_iter': max_iter},
]

# Exhaustive search scored by MCC under 5-fold stratified CV repeated 10 times
gs_lr = GridSearchCV(estimator=pipe_lr,
                  param_grid=param_grid_lr,
                  scoring='matthews_corrcoef',
                  cv=RepeatedStratifiedKFold(n_repeats=10, n_splits=5, random_state=1),
                  n_jobs=-1)

gs_lr = gs_lr.fit(X_train, y_train)
print(gs_lr.best_score_)
print(gs_lr.best_params_)

# Pipeline refitted on all training data with the best hyperparameters
clf_lr = gs_lr.best_estimator_

In [None]:
# Random forest

criterion = ['gini', 'entropy']
max_depth = list(range(2, 11, 2))        # 2, 4, 6, 8, 10
n_estimators = list(range(20, 111, 15))  # 20, 35, ..., 110

# Hyperparameter grid; keys are prefixed with the pipeline step name
param_grid_tree = {
    'randomforestclassifier__max_depth': max_depth,
    'randomforestclassifier__n_estimators': n_estimators,
    'randomforestclassifier__criterion': criterion,
}

# Exhaustive search scored by MCC under 5-fold stratified CV repeated 10 times
gs_tree = GridSearchCV(
    pipe_rf,
    param_grid_tree,
    scoring='matthews_corrcoef',
    cv=RepeatedStratifiedKFold(n_splits=5, n_repeats=10, random_state=1),
    n_jobs=-1,
)
gs_tree.fit(X_train, y_train)

print(gs_tree.best_score_)
print(gs_tree.best_params_)

# Pipeline refitted on all training data with the best hyperparameters
clf_tree = gs_tree.best_estimator_

In [None]:
# K-nearest neighbours

n_neighbors = [2,3,4,5,6,7,8]
algorithm = ['auto', 'ball_tree', 'kd_tree', 'brute']
weights = ['uniform', 'distance']
leaf_size = [10,20,30,40,50]
# The metric name must be spelled 'euclidean'; a misspelled name makes every
# fit that uses it fail.
metric = ['minkowski', 'euclidean']
p = [1,2]


# Hyperparameter grid; keys are prefixed with the pipeline step name
param_grid_kn   = {'kneighborsclassifier__n_neighbors': n_neighbors, 'kneighborsclassifier__algorithm': algorithm,
                   'kneighborsclassifier__weights': weights, 'kneighborsclassifier__leaf_size': leaf_size,
                   'kneighborsclassifier__metric': metric,'kneighborsclassifier__p': p }

# Exhaustive search scored by MCC under 5-fold stratified CV repeated 10 times
gs_kn = GridSearchCV(estimator=pipe_kn,
                  param_grid=param_grid_kn,
                  scoring='matthews_corrcoef',
                  cv=RepeatedStratifiedKFold(n_repeats=10, n_splits =5, random_state=1),
                  n_jobs=-1)

gs_kn = gs_kn.fit(X_train, y_train)
print(gs_kn.best_score_)
print(gs_kn.best_params_)

# Pipeline refitted on all training data with the best hyperparameters
clf_kn = gs_kn.best_estimator_


In [None]:
# Support vector machines
C = [0.1,1,10,100]
# 'precomputed' is left out on purpose: it expects a square kernel matrix as
# input, while this pipeline passes the (scaled) feature matrix, so every fit
# with that kernel would fail.
kernel = ['linear', 'poly', 'rbf', 'sigmoid']
gamma = ['scale', 'auto', 0.1, 0.3, 0.5, 0.7]

# Hyperparameter grid; keys are prefixed with the pipeline step name
param_grid_svm   = {'svc__C': C, 'svc__kernel': kernel,
                   'svc__gamma': gamma }

# Exhaustive search scored by MCC under 5-fold stratified CV repeated 10 times
gs_svm = GridSearchCV(estimator=pipe_svm,
                  param_grid=param_grid_svm,
                  scoring='matthews_corrcoef',
                  cv=RepeatedStratifiedKFold(n_repeats=10, n_splits =5, random_state=1),
                  n_jobs=-1)

gs_svm = gs_svm.fit(X_train, y_train)
print(gs_svm.best_score_)
print(gs_svm.best_params_)

# Pipeline refitted on all training data with the best hyperparameters
clf_svm = gs_svm.best_estimator_


In [None]:
# AdaBoost

n_estimators = [20, 35, 50, 65, 80, 95, 110]
learning_rate = [0.5, 0.75, 1, 1.25, 1.5]

# Hyperparameter grid; keys are prefixed with the pipeline step name
param_grid_ab = {
    'adaboostclassifier__n_estimators': n_estimators,
    'adaboostclassifier__learning_rate': learning_rate,
}

# Exhaustive search scored by MCC under 5-fold stratified CV repeated 10 times
gs_ab = GridSearchCV(
    pipe_ab,
    param_grid_ab,
    scoring='matthews_corrcoef',
    cv=RepeatedStratifiedKFold(n_splits=5, n_repeats=10, random_state=1),
    n_jobs=-1,
)
gs_ab.fit(X_train, y_train)

print(gs_ab.best_score_)
print(gs_ab.best_params_)

# Pipeline refitted on all training data with the best hyperparameters
clf_ab = gs_ab.best_estimator_

In [None]:
# Quadratic discriminant analysis

# reg_param blends each class covariance estimate with the identity matrix as
# (1 - reg_param) * S + reg_param * I, so it is only meaningful in [0, 1].
# Values above 1 are therefore not searched.
reg_param = [0, 0.25, 0.5, 0.75, 1]

# Hyperparameter grid; the key is prefixed with the pipeline step name
param_grid_qda   = {'quadraticdiscriminantanalysis__reg_param': reg_param}

# Exhaustive search scored by MCC under 5-fold stratified CV repeated 10 times
gs_qda = GridSearchCV(estimator=pipe_qda,
                  param_grid=param_grid_qda,
                  scoring='matthews_corrcoef',
                  cv=RepeatedStratifiedKFold(n_repeats=10, n_splits =5, random_state=1),
                  n_jobs=-1)

gs_qda = gs_qda.fit(X_train, y_train)
print(gs_qda.best_score_)
print(gs_qda.best_params_)

# Pipeline refitted on all training data with the best hyperparameters
clf_qda = gs_qda.best_estimator_

In [None]:
# Passive aggressive

C = [0.1, 1, 10, 100]
max_iter = [100, 500, 1000, 1300]

# Hyperparameter grid; keys are prefixed with the pipeline step name
param_grid_pa = {
    'passiveaggressiveclassifier__C': C,
    'passiveaggressiveclassifier__max_iter': max_iter,
}

# Exhaustive search scored by MCC under 5-fold stratified CV repeated 10 times
gs_pa = GridSearchCV(
    pipe_pa,
    param_grid_pa,
    scoring='matthews_corrcoef',
    cv=RepeatedStratifiedKFold(n_splits=5, n_repeats=10, random_state=1),
    n_jobs=-1,
)
gs_pa.fit(X_train, y_train)

print(gs_pa.best_score_)
print(gs_pa.best_params_)

# Pipeline refitted on all training data with the best hyperparameters
clf_pa = gs_pa.best_estimator_

In [None]:
# Light gradient boosting machine

# 'rf' is left out on purpose: LightGBM's random-forest mode requires bagging
# (or feature sub-sampling) to be enabled, which the default LGBMClassifier
# settings do not do, so every fit with boosting_type='rf' would fail. To search
# it, add a separate sub-grid that also sets e.g. subsample and subsample_freq.
boosting_type=['gbdt', 'dart', 'goss']
num_leaves = [11, 21, 31, 41, 51]
n_estimators = [20,35,50,65,80,95,110]
learning_rate = [0.5,0.75,1,1.25,1.5]

# Hyperparameter grid; keys are prefixed with the pipeline step name
param_grid_lgb   = {'lgbmclassifier__boosting_type': boosting_type, 'lgbmclassifier__num_leaves': num_leaves,
                   'lgbmclassifier__n_estimators': n_estimators, 'lgbmclassifier__learning_rate': learning_rate}

# Exhaustive search scored by MCC under 5-fold stratified CV repeated 10 times
gs_lgb = GridSearchCV(estimator=pipe_lgb,
                  param_grid=param_grid_lgb,
                  scoring='matthews_corrcoef',
                  cv=RepeatedStratifiedKFold(n_repeats=10, n_splits =5, random_state=1),
                  n_jobs=-1)

gs_lgb = gs_lgb.fit(X_train, y_train)
print(gs_lgb.best_score_)
print(gs_lgb.best_params_)

# Pipeline refitted on all training data with the best hyperparameters
clf_lgb = gs_lgb.best_estimator_


## Kryssvalidering og prediksjon

I denne seksjonen utføres en 5-foldet stratifisert kryssvalidering repetert 100 ganger. Deretter testes de ulike tilpassede klassifiseringsalgoritmene på det eksterne datasettet. Dette repeteres for hver klassifiseringsalgoritme.

In [None]:
# Tuned pipelines from the grid searches, one per algorithm: logistic
# regression, random forest (stored as clf_tree), KNN, SVM, AdaBoost, QDA,
# passive aggressive and LightGBM.
classifiers = [clf_lr, clf_tree, clf_kn, clf_svm, clf_ab, clf_qda, clf_pa, clf_lgb]

# External data as NumPy arrays, matching the arrays the pipelines are fitted on
X_ext = np.asarray(X_test)
y_ext = np.asarray(y_test)

for clf in classifiers:

    # 5-fold stratified cross-validation repeated 100 times; seeded like the
    # grid-search CV so that a re-run reproduces the same splits.
    kfold = RepeatedStratifiedKFold(n_repeats=100, n_splits=5, random_state=1).split(X_train, y_train)

    f1 = []
    f1_flipp = []
    acc = []
    roc = []
    mcc = []

    for train, test in kfold:
        clf.fit(X_train[train], y_train[train])

        # Predict once per fold and reuse the result for every metric
        y_true = y_train[test]
        y_pred = clf.predict(X_train[test])

        f1.append(f1_score(y_true, y_pred))
        # F1 with the labels flipped, i.e. with class 0 treated as the positive class
        f1_flipp.append(f1_score(1 - y_true, 1 - y_pred))
        acc.append(accuracy_score(y_true, y_pred))
        # NOTE: ROC AUC is computed from hard class predictions, not from scores
        roc.append(roc_auc_score(y_true, y_pred))
        mcc.append(matthews_corrcoef(y_true, y_pred))

    print(f'Resultater - kryssvalidering for {clf} :')
    print(f'F1 1: {np.mean(f1):.3f} +/- {np.std(f1):.3f}')
    print(f'F1 0: {np.mean(f1_flipp):.3f} +/- {np.std(f1_flipp):.3f}')
    print(f'Accuracy: {np.mean(acc):.3f} +/- {np.std(acc):.3f}')
    print(f'ROC AUC: {np.mean(roc):.3f} +/- {np.std(roc):.3f}')
    print(f'MCC: {np.mean(mcc):.3f} +/- {np.std(mcc):.3f}')

    # Prediction on unseen data: refit on the full training set, then predict
    # the external cohort once and reuse the prediction for every metric.
    prediction_model = clf.fit(X_train, y_train)
    y_ext_pred = prediction_model.predict(X_ext)

    print(f'Prediksjon på eksternt datasett for {clf}:')
    print(f'F1 1: {f1_score(y_ext, y_ext_pred)}')
    print(f'F1 0: {f1_score(1 - y_ext, 1 - y_ext_pred)}')
    print(f'Accuracy: {accuracy_score(y_ext, y_ext_pred)}')
    print(f'ROC AUC: {roc_auc_score(y_ext, y_ext_pred)}')
    print(f'MCC: {matthews_corrcoef(y_ext, y_ext_pred)}')