<a href="https://colab.research.google.com/github/siksimi/DLhands_on/blob/master/20200606_hands_on_clean.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [0]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.feature_selection import SelectKBest, mutual_info_classif
from sklearn.metrics import roc_auc_score, f1_score

In [0]:
df_int = pd.read_csv('GvM_training_cohort.csv')
df_ext = pd.read_csv('GvM_ext_cohort.csv')

In [0]:
bool_idx = df_int.columns.str.startswith('CE_mask') | \
    df_int.columns.str.startswith('periT2_mask') & \
    (~ df_int.columns.str.contains('EdgeContrast'))
bool_idx = df_ext.columns.str.startswith('CE_mask') | \
    df_ext.columns.str.startswith('periT2_mask') & \
    (~ df_ext.columns.str.contains('EdgeContrast'))

X_int = df_int.loc[:, bool_idx]
y_int = df_int['Label']
X_ext = df_ext.loc[:, bool_idx]
y_ext = df_ext['Label']

In [7]:
X_train, X_val, y_train, y_val = \
  train_test_split (X_int, y_int, test_size=0.3, random_state=2020, stratify=y_int)

scaler = StandardScaler()
scaler.fit(X_train)
X_train = scaler.transform(X_train)
X_val = scaler.transform(X_val)
X_ext = scaler.transform(X_ext)

print('No of total in training/validation sets: {0} / {1}'.format(len(X_train), len(X_val)))
print('Proportion of Mets in training/validation sets: {0} / {1}'.format(y_train.sum()/len(y_train), y_val.sum()/len(y_val)))

No of total in training/validation sets: 116 / 51
Proportion of Mets in training/validation sets: 0.3448275862068966 / 0.35294117647058826


In [9]:
combs = [(k, c) for k in [20, 40, 60, 80, 100, 120, 140, 160, 180, 200] \
        for c in [0.03, 0.04, 0.05, 0.06, 0.07, 0.08, 0.09, 0.1, 0.2, 0.3, 0.4, 0.5]]

best_auc = 0

for k, c in combs:
  fs = SelectKBest(mutual_info_classif, k = k)
  fs.fit(X_train, y_train)
  X_train_t = fs.transform(X_train)
  X_val_t = fs.transform(X_val)
  X_ext_t = fs.transform(X_ext)

  model = LogisticRegression(C = c, penalty='l1', solver='liblinear', random_state=0)
  model.fit(X_train_t, y_train)
  y_pred = model.predict(X_val_t)
  auc = roc_auc_score(y_val, y_pred)
  f1 = f1_score(y_val, y_pred)
  print('Training with k = {0}, c = {1}\n AUROC = {2}, F1 score = {3}'.format(k, c, auc, f1))

  y_pred = model.predict(X_ext_t)
  auc_ext = roc_auc_score(y_ext, y_pred)
  f1_ext = f1_score(y_ext, y_pred)
  print('Inference with k = {0}, c = {1}\n AUROC = {2}, F1 score = {3}\n'.format(k, c, auc_ext, f1_ext))

  if auc > best_auc:
    best_auc = auc
    best_model = [fs, model, auc, auc_ext]
  
print(best_model)

Training with k = 20, c = 0.03
 AUROC = 0.5, F1 score = 0.0
Inference with k = 20, c = 0.03
 AUROC = 0.5, F1 score = 0.0

Training with k = 20, c = 0.04
 AUROC = 0.8131313131313131, F1 score = 0.7567567567567567
Inference with k = 20, c = 0.04
 AUROC = 0.699375, F1 score = 0.6478873239436619

Training with k = 20, c = 0.05
 AUROC = 0.8838383838383838, F1 score = 0.8421052631578948
Inference with k = 20, c = 0.05
 AUROC = 0.745, F1 score = 0.6956521739130435

Training with k = 20, c = 0.06
 AUROC = 0.898989898989899, F1 score = 0.8648648648648649
Inference with k = 20, c = 0.06
 AUROC = 0.765, F1 score = 0.7164179104477612

Training with k = 20, c = 0.07
 AUROC = 0.898989898989899, F1 score = 0.8648648648648649
Inference with k = 20, c = 0.07
 AUROC = 0.7949999999999999, F1 score = 0.75

Training with k = 20, c = 0.08
 AUROC = 0.898989898989899, F1 score = 0.8648648648648649
Inference with k = 20, c = 0.08
 AUROC = 0.8049999999999999, F1 score = 0.7619047619047619

Training with k = 20,