In [1]:
# -*- coding: utf-8 -*-
"""
Created on Sun Aug 16 17:10:53 2020

@author: wanxiang.shen@u.nus.edu
"""

import warnings,os
warnings.filterwarnings("ignore")


import pandas as pd
import numpy as np

from sklearn.model_selection import KFold, StratifiedKFold
from sklearn.metrics import confusion_matrix, precision_recall_curve
from sklearn.metrics import auc as calculate_auc

import matplotlib.pyplot as plt
import seaborn as sns
from joblib import load, dump

from aggmap import AggMap, AggModel, loadmap
from aggmap import show
np.random.seed(seed=666)  # fix NumPy's global seed so results are repeatable


from tqdm import tqdm

# Enable tqdm's pandas integration, drawing progress bars with ASCII characters.
tqdm.pandas(ascii=True)

# Six-colour "rainbow_r" palette (PiYG was noted as an alternative), previewed inline.
color = sns.color_palette(palette="rainbow_r", n_colors=6)
sns.palplot(pal=color)

In [2]:
# The feature table and the sample annotations are two sheets of one workbook.
dfx = pd.read_excel('./COVID19.xlsx', sheet_name='data')
dfy = pd.read_excel('./COVID19.xlsx', sheet_name='sample_info')
# Load the map stored on disk under ./saved_model.
mp = loadmap('./saved_model/aggmap.mp')

scale_method = 'standard'

# Discard the first column, then prefix every remaining column name with "p-".
dfx = dfx.iloc[:, 1:]
cols = ["p-%s" % name for name in dfx.columns]
dfx.columns = cols

# One-hot encode the `class` column into a float matrix (one column per class).
Y = pd.get_dummies(dfy['class']).to_numpy().astype(float)

In [3]:
data_save_folder = '/raid/shenwanxiang/COVID-19/COV-C'
# Load both noisy collections from the same folder.
# NOTE: joblib.load unpickles the file, so only point it at trusted data.
X_noisys_5, X_noisys_1 = (
    load(os.path.join(data_save_folder, fname))
    for fname in ('Agg5_noisys.data', 'Agg1_noisys.data')
)

In [4]:
# Element 0 of each noisy collection is the clean set that is used for training.
X_clean_5, X_clean_1 = X_noisys_5[0], X_noisys_1[0]


## single channel

In [5]:
n_splits = 5  # 5-fold cross-validation, repeated 5 times (once per seed below)
stddevs = np.arange(0, 0.5, 0.08)  # std labels, paired position-wise with X_noisys_1
run_all = []  # one record per (repeat seed, fold, std)

for repeat_seed in (8, 16, 32, 64, 128):  # each seed gives a different shuffled split
    outer = KFold(n_splits=n_splits, shuffle=True, random_state=repeat_seed)
    outer_idx = list(outer.split(range(len(Y))))

    for i, idx in enumerate(outer_idx):
        train_idx, valid_idx = idx
        fold_num = "fold_%s" % str(i + 1).zfill(2)

        # The model is fitted on the clean single-channel set only.
        trainX, trainY = X_clean_1[train_idx], Y[train_idx]
        validX, validY = X_clean_1[valid_idx], Y[valid_idx]

        print("\n input train and test X shape is %s, %s " % (trainX.shape, validX.shape))
        clf = AggModel.MultiClassEstimator(epochs=50, batch_size=4, verbose=0, gpuid=2)
        clf.fit(trainX, trainY, validX, validY)

        # Evaluate the fitted model on the held-out fold of every noisy set.
        for X, std in zip(X_noisys_1, stddevs):
            TestX = X[valid_idx]
            TestY = Y[valid_idx]
            loss, acc = clf._model.evaluate(TestX, TestY, verbose=0)
            res = dict(fold_num=fold_num, repeat_seed=repeat_seed, std=std, acc=acc, loss=loss)
            print(res)
            run_all.append(res)


 input train and test X shape is (289, 10, 9, 1), (73, 10, 9, 1) 
{'epochs': 50, 'lr': 0.0001, 'conv1_kernel_size': 13, 'dense_layers': [128], 'dense_avf': 'relu', 'batch_size': 4, 'dropout': 0.0, 'batch_norm': False, 'n_inception': 2, 'monitor': 'val_loss', 'patience': 10000, 'random_state': 32, 'verbose': 0, 'name': 'AggMap MultiClass Estimator', 'gpuid': '2'}
{'fold_num': 'fold_01', 'repeat_seed': 8, 'std': 0.0, 'acc': 0.9726027, 'loss': 0.056679780352605534}
{'fold_num': 'fold_01', 'repeat_seed': 8, 'std': 0.08, 'acc': 0.9726027, 'loss': 0.08222468421883779}
{'fold_num': 'fold_01', 'repeat_seed': 8, 'std': 0.16, 'acc': 0.89041096, 'loss': 0.2881857352714016}
{'fold_num': 'fold_01', 'repeat_seed': 8, 'std': 0.24, 'acc': 0.7671233, 'loss': 0.6910792507537423}
{'fold_num': 'fold_01', 'repeat_seed': 8, 'std': 0.32, 'acc': 0.6849315, 'loss': 1.052782176292106}
{'fold_num': 'fold_01', 'repeat_seed': 8, 'std': 0.4, 'acc': 0.60273975, 'loss': 1.4129714214638487}
{'fold_num': 'fold_01', 'r

In [6]:
# Turn the list of per-fold result dicts (`run_all`) into one table, one row per record.
df1 = pd.DataFrame(run_all)

In [7]:
# Average the numeric columns per std level. `numeric_only=True` is required because
# the frame also holds the string column `fold_num`: pandas >= 2.0 no longer drops
# non-numeric columns silently and would raise a TypeError here without it.
df1.groupby('std').mean(numeric_only=True)

Unnamed: 0_level_0,repeat_seed,acc,loss
std,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0.0,49.6,0.930959,0.174787
0.08,49.6,0.921507,0.216729
0.16,49.6,0.854673,0.379057
0.24,49.6,0.761918,0.641566
0.32,49.6,0.704947,0.934791
0.4,49.6,0.655152,1.235957
0.48,49.6,0.618706,1.519363


## multi-channel

In [8]:
n_splits = 5  # 5-fold cross-validation, repeated 5 times (once per seed below)
run_all_5 = []  # one record per (repeat seed, fold, std)

for repeat_seed in (8, 16, 32, 64, 128):  # each seed gives a different shuffled split
    outer = KFold(n_splits=n_splits, shuffle=True, random_state=repeat_seed)
    outer_idx = list(outer.split(range(len(Y))))

    for i, idx in enumerate(outer_idx):
        train_idx, valid_idx = idx
        fold_num = "fold_%s" % str(i + 1).zfill(2)

        # The model is fitted on the clean multi-channel set only.
        trainX, trainY = X_clean_5[train_idx], Y[train_idx]
        validX, validY = X_clean_5[valid_idx], Y[valid_idx]

        print("\n input train and test X shape is %s, %s " % (trainX.shape, validX.shape))
        clf = AggModel.MultiClassEstimator(epochs=50, batch_size=4, verbose=0, gpuid=2)
        clf.fit(trainX, trainY, validX, validY)

        # Evaluate the fitted model on the held-out fold of every noisy set;
        # `stddevs` is the grid defined earlier in the notebook.
        for X, std in zip(X_noisys_5, stddevs):
            TestX = X[valid_idx]
            TestY = Y[valid_idx]
            loss, acc = clf._model.evaluate(TestX, TestY, verbose=0)
            res = dict(fold_num=fold_num, repeat_seed=repeat_seed, std=std, acc=acc, loss=loss)
            print(res)
            run_all_5.append(res)


 input train and test X shape is (289, 10, 9, 5), (73, 10, 9, 5) 
{'epochs': 50, 'lr': 0.0001, 'conv1_kernel_size': 13, 'dense_layers': [128], 'dense_avf': 'relu', 'batch_size': 4, 'dropout': 0.0, 'batch_norm': False, 'n_inception': 2, 'monitor': 'val_loss', 'patience': 10000, 'random_state': 32, 'verbose': 0, 'name': 'AggMap MultiClass Estimator', 'gpuid': '2'}
{'fold_num': 'fold_01', 'repeat_seed': 8, 'std': 0.0, 'acc': 0.94520545, 'loss': 0.11625060826948244}
{'fold_num': 'fold_01', 'repeat_seed': 8, 'std': 0.08, 'acc': 0.9589041, 'loss': 0.13956762217495539}
{'fold_num': 'fold_01', 'repeat_seed': 8, 'std': 0.16, 'acc': 0.82191783, 'loss': 0.4495305990519589}
{'fold_num': 'fold_01', 'repeat_seed': 8, 'std': 0.24, 'acc': 0.739726, 'loss': 0.9942153577935205}
{'fold_num': 'fold_01', 'repeat_seed': 8, 'std': 0.32, 'acc': 0.65753424, 'loss': 1.582890321130622}
{'fold_num': 'fold_01', 'repeat_seed': 8, 'std': 0.4, 'acc': 0.6438356, 'loss': 2.1917784801901203}
{'fold_num': 'fold_01', 're

In [9]:
# Turn the list of per-fold result dicts (`run_all_5`) into one table, one row per record.
df5 = pd.DataFrame(run_all_5)

In [10]:
# Average the numeric columns per std level. `numeric_only=True` is required because
# the frame also holds the string column `fold_num`: pandas >= 2.0 no longer drops
# non-numeric columns silently and would raise a TypeError here without it.
df5.groupby(['std']).mean(numeric_only=True)

Unnamed: 0_level_0,repeat_seed,acc,loss
std,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0.0,49.6,0.947527,0.132088
0.08,49.6,0.932017,0.159737
0.16,49.6,0.846986,0.349882
0.24,49.6,0.740845,0.758493
0.32,49.6,0.662321,1.270043
0.4,49.6,0.611012,1.81466
0.48,49.6,0.565677,2.343317
