In [62]:
!nvidia-smi

Mon Mar 18 10:43:50 2024       
+---------------------------------------------------------------------------------------+
| NVIDIA-SMI 535.129.03             Driver Version: 535.129.03   CUDA Version: 12.2     |
|-----------------------------------------+----------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |         Memory-Usage | GPU-Util  Compute M. |
|                                         |                      |               MIG M. |
|   0  Tesla P100-PCIE-16GB           Off | 00000000:00:04.0 Off |                    0 |
| N/A   38C    P0              31W / 250W |    326MiB / 16384MiB |      0%      Default |
|                                         |                      |                  N/A |
+-----------------------------------------+----------------------+----------------------+
                                                                    

In [63]:
!pip install --upgrade scikit-learn



In [64]:
# Numerical Operations
import math
import numpy as np

# Reading/Writing Data
import pandas as pd
import os
import csv

# For Progress Bar
from tqdm import tqdm

# Pytorch
import torch 
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader, random_split

# For plotting learning curve
from torch.utils.tensorboard import SummaryWriter

import sklearn
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import f_regression

In [65]:
device = 'cuda' if torch.cuda.is_available() else 'cpu'

class cfg:
    seed = 5201314      # Your seed number, you can pick your lucky number. :)
    select_all = True   # Whether to use all features.
    valid_ratio = 0.1   # validation_size = train_size * valid_ratio
    n_epochs = 3000     # Number of epochs.            
    batch_size = 64
    lr_lambda = 0.996
    optimizer = 'Adam'
    early_stop = 600  # 600   # If model has not improved for this many consecutive epochs, stop training.     
    train_path = '/kaggle/input/rtmets-sarcopenia/RT_spine_NESMS_info/train.csv'
    valid_path = '/kaggle/input/rtmets-sarcopenia/RT_spine_NESMS_info/valid.csv'
    all_data_path = '/kaggle/input/rtmets-sarcopenia/RT_spine_NESMS_info/all.csv'
    train_val_test_list_path = '/kaggle/input/rtmets-sarcopenia/train_val_test_split.csv'

    radiomics_idx = 7
    pred_days = 42 # 42, 90, 365
    y_data_reverse = True
    data_using = 'radiomics' # 'clinical', 'radiomics', 'all'
    data_sampling = 'no-sampling' # 'no-sampling' | 'over-sampling' | 'under-sampling'
    
    # hyper-parameters for the optimizer (depends on which optimizer you are using)
    opt_params = {
        'lr': 1e-2,                        # learning rate of Adam
#         'momentum': 0.9,
        'weight_decay': 5e-3              # weight decay of Adam   
    }
    
    assert pred_days in (42, 90, 365)
    assert data_using in ('clinical', 'radiomics', 'all')

    post_fix = '' if select_all else 'selected'
    model_name = f'{pred_days}d_{post_fix}_{data_sampling}_{data_using}_model'
    save_path = f'./models/'  # Your model will be saved here.
    pred_path = f'./preds/{model_name}_pred.csv'
    
    

In [66]:
def same_seed(seed): 
    '''Fixes random number generator seeds for reproducibility.'''
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False
    np.random.seed(seed)
    torch.manual_seed(seed)
    if torch.cuda.is_available():
        torch.cuda.manual_seed_all(seed)

def predict(test_loader, model, device):
    model.eval() # Set your model to evaluation mode.
    preds = []
    for x in tqdm(test_loader):
        x = x.to(device)                        
        with torch.no_grad():                   
            pred = model(x)                     
            preds.append(pred.detach().cpu())   
    preds = torch.cat(preds, dim=0).numpy()  
    return preds

def plot_learning_curve(loss_record, title=''):
    ''' Plot learning curve of your DNN (train & dev loss) '''
    total_steps = len(loss_record['train'])
    x_1 = range(total_steps)
    x_2 = x_1[::len(loss_record['train']) // len(loss_record['dev'])]
    figure(figsize=(6, 4))
    plt.plot(x_1, loss_record['train'], c='tab:red', label='train')
    plt.plot(x_2, loss_record['dev'], c='tab:cyan', label='dev')
    plt.ylim(0.0, 5.)
    plt.xlabel('Training steps')
    plt.ylabel('MSE loss')
    plt.title('Learning curve of {}'.format(title))
    plt.legend()
    plt.show()

def evaluate_preds(y_test, preds, threshold=0.5):
    _preds = []
    for i, v in enumerate(preds):
        _preds.append(1 if (v > threshold) else 0)

    tn, fp, fn, tp = sklearn.metrics.confusion_matrix(y_test, _preds).ravel()
#     print(f'tn: {tn}, fp: {fp}, fn: {fn}, tp: {tp}')

    acc = (tp+tn)/(tn+fp+fn+tp)
    # acc = sklearn.metrics.accuracy_score(y_test, _preds)

    tpr = tp/(tp+fn) # Recall, tpr, sensitivity
    # sensitivity = sklearn.metrics.recall_score(y_test, _preds)

    tnr = tn/(tn+fp) # tnr, specificity
    
    ppv = tp/(tp+fp) # Precision & ppv
    # precision = sklearn.metrics.precision_score(y_test, _preds)

    npv = tn/(tn+fn)
    
#     try:
#         f1 = ((2*ppv*tpr)/(ppv+tpr))
#     except:
#         f1 = 0
    # f1 = sklearn.metrics.f1_score(y_test, _preds)

    y_test = 1 - np.array(y_test)
    auc = sklearn.metrics.roc_auc_score(y_test, preds)
    auc = 1 - auc if auc < 0.5 else auc
    
#     fpr, tpr, thresholds = sklearn.metrics.roc_curve(y_test, preds)
#     roc_auc = sklearn.metrics.auc(fpr, tpr)
#     display = sklearn.metrics.RocCurveDisplay(
#         fpr=fpr,
#         tpr=tpr,
#         roc_auc=roc_auc
#     )
#     display.plot()
#     plt.show()

    print(f'acc: {acc:.4}, tpr: {tpr:.4}, tnr: {tnr:.4}, ppv: {ppv:.4}, npv: {npv:.4}, auc: {auc:.4}')

    return acc, auc

def save_pred(preds, file, label=None):
    ''' Save predictions to specified file '''
    with open(file, 'w') as fp:
        writer = csv.writer(fp)
        header = ['id', 'pred'] if label is None else ['id', 'pred', 'label']
        writer.writerow(header)
        for i, p in enumerate(preds):
            if label is None:
                writer.writerow([i, p])
            else:
                writer.writerow([i, p, label[i]])

In [67]:
def data_preparing(data_path):
    res = pd.read_csv(data_path)
    data = res.values
    header = res.columns.to_numpy()
    
    x_data, y_42d_data, y_90d_data, y_365d_data = data[:, :-3], data[:, -3], data[:, -2], data[:, -1]
    
    return x_data, y_42d_data, y_90d_data, y_365d_data, header

def distribution_map_preparing(train_val_test_list_path):
    test = pd.read_csv(train_val_test_list_path, header=None)
    data = test.values.tolist()
    new_data = []
    for each in data:
        each = [i for i in each if str(i) != 'nan']
        each = each[1:]
        new_data.append(each)
    return new_data

In [68]:
class RTDataset(Dataset):
    '''
    x: Features.
    y: Targets, if none, do prediction.
    '''
    def __init__(self,
                 sampling,
                 days,
                 k,
                 train_val_test_list_path,
                 clinical_data_path,
                 radiomics_idx,
                 data_using,
                 mode,
                 reverse
                ):
        assert sampling in ('over-sampling', 'under-sampling', 'no-sampling')
        assert days in (42, 90, 365)
        
        print(f'Dataset Properties:\nSampling:{sampling}\nDays:{days}\nF:{k}\nData_using:{data_using}\nMode:{mode}')
        
        split_map = distribution_map_preparing(train_val_test_list_path)
        x_data, y_42d_data, y_90d_data, y_365d_data, self.header = data_preparing(clinical_data_path)
        
        if data_using == 'clinical':
            self.x_data = x_data[:, radiomics_idx:]
        elif data_using == 'radiomics':
            self.x_data = x_data[:, :radiomics_idx]
        
        if days == 42:
            self.y_data = y_42d_data
            base_idx = 0
        elif days == 90:
            self.y_data = y_90d_data
            base_idx = 15
        elif days == 365:
            self.y_data = y_365d_data
            base_idx = 30
        else:
            raise
        
        if sampling == 'under-sampling':
            base_idx += 45
        elif sampling == 'no-sampling':
            base_idx += 90
            
        base_idx += k * 3
        
        if mode == 'train':
            self.split_map = split_map[base_idx] + split_map[base_idx + 1]
        elif mode == 'val':
            self.split_map = split_map[base_idx + 2]
        else:
            raise
            
        if reverse:
            self.y_data = 1 - self.y_data

    def __getitem__(self, origin_idx):
        idx = int(self.split_map[origin_idx] - 1)
        clinical_values = torch.tensor(self.x_data[idx], dtype=torch.float32)

        # label
        label = self.y_data[idx]
        label = torch.tensor(label, dtype=torch.float)
        
        return clinical_values, label

    def __len__(self):
        return len(self.split_map)

In [69]:
class Model(nn.Module):
    def __init__(self, input_dim):
        super(Model, self).__init__()
        # TODO: modify model's structure, be aware of dimensions. 
        self.layers = nn.Sequential(
            nn.Linear(input_dim, 64),
#             nn.Dropout(p=0.5),
#             nn.ReLU(),
#             nn.Linear(128, 64),
#             nn.Dropout(p=0.4),
            nn.ReLU(),
            nn.Linear(64, 32),
#             nn.Dropout(p=0.3),
            nn.ReLU(),
            nn.Linear(32, 16),
            nn.ReLU(),
            nn.Linear(16, 1)
        )

    def forward(self, x):
        x = self.layers(x)
        x = x.squeeze(1) # (B, 1) -> (B)
        return x

In [70]:
def feature_selector(features, targets, header=None):
    kbest = SelectKBest(score_func=f_regression, k=30)
    fit = kbest.fit(features, targets)
    
    if header is not None:
#         print(
#             { header[i]: kbest.scores_[i] for i in range(len(header)) }
#         )
        dfscores = pd.DataFrame(fit.scores_)
        dfcolumns = pd.DataFrame(header)

        # concat two dataframes for better visualization 
        featureScores = pd.concat([dfcolumns, dfscores], axis=1)
        featureScores.columns = ['Specs', 'Score']  # naming the dataframe columns
        largest_30 = featureScores.nlargest(30, 'Score')
        print(largest_30)
        return largest_30.index.to_numpy()
    
    return np.array([])

In [71]:
def train_valid_split(data_set, valid_ratio, seed):
    '''Split provided training data into training set and validation set'''
    valid_set_size = int(valid_ratio * len(data_set)) 
    train_set_size = len(data_set) - valid_set_size
    train_set, valid_set = random_split(data_set, [train_set_size, valid_set_size], generator=torch.Generator().manual_seed(seed))
    return np.array(train_set), np.array(valid_set)

def data_reverse(y_data):
    return 1 - y_data

def select_feat(train_data, test_data, valid_ratio, seed, select_all=True, days=42, y_data_reverse=False, cross_valid=1):
    if days == 42:
        y_idx = -3
    elif days == 90:
        y_idx = -2
    elif days == 365:
        y_idx = -1
    else:
        raise

    if (cross_valid == 1):
        '''Selects useful features to perform regression'''
        train_data, valid_data = train_valid_split(train_data, valid_ratio, seed)
        
        # Print out the data size.
        print(f"""train_data size: {train_data.shape} 
        valid_data size: {valid_data.shape} 
        test_data size: {test_data.shape}""")
        
        y_train, y_valid, y_test = train_data[:,y_idx], valid_data[:,y_idx], test_data[:, y_idx]
        raw_x_train, raw_x_valid, raw_x_test = train_data[:,:-3], valid_data[:,:-3], test_data[:,:-3]

        if y_data_reverse:
            y_train, y_valid, y_test = data_reverse(y_train), data_reverse(y_valid), data_reverse(y_test)

        if select_all:
            feat_idx = list(range(raw_x_train.shape[1]))
        else:
    #         feat_idx = list(range(35, raw_x_train.shape[1])) # TODO: Select suitable feature columns.
    #         pri_feats = [35, 36, 47, 48, 52]
    #         feat_idx = [35, 36, 47, 48, 52] + [i + 18 for i in pri_feats] + [i + 36 for i in pri_feats[:-1]]
            feat_idx = [5, 6, 7, 9, 11, 14, 19, 20, 24, 25, 29, 41, 58, 67, 68]
            
        return [[raw_x_train[:,feat_idx], raw_x_valid[:,feat_idx], raw_x_test[:,feat_idx], y_train, y_valid, y_test]]
    else:
        data = np.concatenate((train_data, test_data), axis=0)
        fold_size = data.shape[0] // cross_valid
        folds = []
        for i in range(cross_valid):
            _data = data.copy()
            fold_1 = [fold_size * i, fold_size * (i + 1)]
            fold_2 = [fold_size * (i + 1), fold_size * (i + 2)] if i < (cross_valid - 1) else [0, fold_size * (i + 1)]
            valid = _data[: fold_1]
            test = _data[fold_1: fold_2]
            train = _data[:fold_1[0]] + (_data[:fold_2[1]] if i < (cross_valid - 1) else _data[fold_1[1]:])
            y_train, y_valid, y_test = train[:, y_idx], valid[:, y_idx], test[:, y_idx]
            raw_x_train, raw_x_valid, raw_x_test = train[:, :-3], valid[:, :-3], test[:, :-3]
            folds.append([raw_x_train[:,feat_idx], raw_x_valid[:,feat_idx], raw_x_test[:,feat_idx], y_train, y_valid, y_test])

        return folds

In [72]:
def trainer(train_loader, valid_loader, model, cfg, device, k, model_name):
    criterion = nn.MSELoss(reduction='mean') # Define your loss function, do not modify this.

    # Define your optimization algorithm. 
    # TODO: Please check https://pytorch.org/docs/stable/optim.html to get more available algorithms.
    # TODO: L2 regularization (optimizer(weight decay...) or implement by your self).
    # optimizer = torch.optim.RMSprop(model.parameters(), lr=cfg.learning_rate, momentum=0.9) 
    # optimizer = torch.optim.AdamW(model.parameters(), lr=cfg.learning_rate) 
    # Setup optimizer
    optimizer = getattr(torch.optim, cfg.optimizer)(model.parameters(), **cfg.opt_params)
    writer = SummaryWriter() # Writer of tensoboard.
    lambda1 = lambda epoch: cfg.lr_lambda ** epoch
    scheduler = torch.optim.lr_scheduler.LambdaLR(optimizer, lr_lambda=lambda1)

    if not os.path.isdir('./models'):
        os.mkdir('./models') # Create directory of saving models.

    n_epochs, best_loss, step, early_stop_count, best_auc = cfg.n_epochs, math.inf, 0, 0, 0

    for epoch in range(n_epochs):
        model.train() # Set your model to train mode.
        loss_record = []

        # tqdm is a package to visualize your training progress.
        # train_pbar = tqdm(train_loader, position=0, leave=True)
        
        for x, y in train_loader:
            optimizer.zero_grad()               # Set gradient to zero.
            x, y = x.to(device), y.to(device)   # Move your data to device. 
            pred = model(x)
            loss = criterion(pred, y)
            loss.backward()                     # Compute gradient(backpropagation).
            optimizer.step()                    # Update parameters.
            step += 1
            loss_record.append(loss.detach().item())
            
            # Display current epoch number and loss on tqdm progress bar.
            # train_pbar.set_description(f'Epoch [{epoch+1}/{n_epochs}]')
            # train_pbar.set_postfix({'loss': loss.detach().item()})

        scheduler.step()
        mean_train_loss = sum(loss_record)/len(loss_record)
        writer.add_scalar('Loss/train', mean_train_loss, step)

        model.eval() # Set your model to evaluation mode.
        loss_record = []
        
        labels = []
        preds = []
        for x, y in valid_loader:
            x, y = x.to(device), y.to(device)
            with torch.no_grad():
                pred = model(x)
                loss = criterion(pred, y)
                preds += pred.detach().cpu().numpy().tolist()
                labels += y.detach().cpu().numpy().tolist()

            loss_record.append(loss.item())
        
        acc, auc = evaluate_preds(labels, preds)
        
        mean_valid_loss = sum(loss_record)/len(loss_record)
        print(f'Fold{k+1} Epoch [{epoch+1}/{n_epochs}]: Train loss: {mean_train_loss:.4f}, Valid loss: {mean_valid_loss:.4f}')
        writer.add_scalar('Loss/valid', mean_valid_loss, step)
       
        # Save your best model
        if auc > best_auc:
            best_auc = auc
            torch.save(model.state_dict(), cfg.save_path + '{}_{}_auc{:.4}.ckpt'.format(model_name, epoch, auc))
            save_pred(preds, cfg.save_path + '{}_f{}_{}_auc{:.4}.csv'.format(model_name, k+1, epoch, auc), labels)
            print('Fold{} Saving model with auc {:.3f}...'.format(k+1, best_auc))

        if mean_valid_loss < best_loss:
            best_loss = mean_valid_loss
            torch.save(model.state_dict(), cfg.save_path + '{}_{}_acc{:.4}.ckpt'.format(model_name, epoch, auc))
            save_pred(preds, cfg.save_path + '{}_f{}_{}_acc{:.4}.csv'.format(model_name, k+1, epoch, auc), labels)
            print('Fold{} Saving model with loss {:.3f}...'.format(k+1, best_loss))
            early_stop_count = 0
        else: 
            early_stop_count += 1

        if early_stop_count >= cfg.early_stop:
            print('\nModel is not improving, so we halt the training session.')
            return

In [73]:
# Set seed for reproducibility
same_seed(cfg.seed)

# train_data size: 3009 x 89 (35 states + 18 features x 3 days) 
# test_data size: 997 x 88 (without last day's positive rate)
# train_data, test_data = pd.read_csv(cfg.train_path).values, pd.read_csv(cfg.valid_path).values
# header = pd.read_csv(cfg.train_path).columns.to_numpy()

# Select features
# x_train, x_valid, x_test, y_train, y_valid, y_test
# folds = select_feat(
#     train_data,
#     test_data,
#     cfg.valid_ratio,
#     cfg.seed,
#     cfg.select_all,
#     cfg.pred_days,
#     cfg.y_data_reverse,
#     cfg.cross_valid
# )

# [x_train, x_valid, x_test, y_train, y_valid, y_test] = folds[0]

# Evaluate features' KBest
# feature_selector(x_train, y_train, header)

# Print out the number of features.

# train_dataset, valid_dataset, test_dataset = RTDataset(x_train, y_train), \
#                                             RTDataset(x_valid, y_valid), \
#                                             RTDataset(x_test)
# print(f'number of features: {len(train_dataset.__getitem__(0)[0])}')

# Pytorch data loader loads pytorch dataset into batches.
# train_loader = DataLoader(train_dataset, batch_size=cfg.batch_size, shuffle=True, pin_memory=True)
# valid_loader = DataLoader(valid_dataset, batch_size=cfg.batch_size, shuffle=True, pin_memory=True)
# test_loader = DataLoader(test_dataset, batch_size=cfg.batch_size, shuffle=False, pin_memory=True)

In [74]:
for days in [42, 90, 365]:
    for data_using in ['clinical', 'radiomics']:
        for k in range(5):
            model_name = f'{days}d_{cfg.post_fix}_{cfg.data_sampling}_{data_using}_model'
            print(f'model name: {model_name}\n')
            
            # Data
            trainset = RTDataset(
                clinical_data_path=cfg.all_data_path,
                train_val_test_list_path=cfg.train_val_test_list_path,
                mode="train",
                k=k,
#                 days=cfg.pred_days,
                days=days,
                reverse=cfg.y_data_reverse,
                sampling=cfg.data_sampling,
                data_using=data_using,
                radiomics_idx=cfg.radiomics_idx
            )
            print('training set size: {}\n'.format(trainset.__len__()))
            trainloader = DataLoader(trainset, batch_size=cfg.batch_size, shuffle=True, drop_last=True)

            valset = RTDataset(
                clinical_data_path=cfg.all_data_path,
                train_val_test_list_path=cfg.train_val_test_list_path,
                mode="val",
                k=k,
#                 days=cfg.pred_days,
                days=days,
                reverse=cfg.y_data_reverse,
                sampling=cfg.data_sampling,
                data_using=data_using,
                radiomics_idx=cfg.radiomics_idx
            )
            print('val set size: {}\n'.format(valset.__len__()))
            valloader = DataLoader(valset, batch_size=cfg.batch_size, shuffle=False)

            model = Model(input_dim=len(trainset.__getitem__(0)[0])).to(device) # put your model and data on the same computation device.
            trainer(trainloader, valloader, model, cfg, device, k, model_name)

model name: 42d__no-sampling_clinical_model

Dataset Properties:
Sampling:no-sampling
Days:42
F:0
Data_using:clinical
Mode:train
training set size: 730

Dataset Properties:
Sampling:no-sampling
Days:42
F:0
Data_using:clinical
Mode:val
val set size: 181

acc: 0.7127, tpr: 0.4, tnr: 0.7628, ppv: 0.2128, npv: 0.8881, auc: 0.6377
Fold1 Epoch [1/3000]: Train loss: 3.8605, Valid loss: 0.4809
Fold1 Saving model with auc 0.638...
Fold1 Saving model with loss 0.481...
acc: 0.8619, tpr: 0.0, tnr: 1.0, ppv: nan, npv: 0.8619, auc: 0.5756
Fold1 Epoch [2/3000]: Train loss: 0.2628, Valid loss: 0.2637
Fold1 Saving model with loss 0.264...
acc: 0.8564, tpr: 0.08, tnr: 0.9808, ppv: 0.4, npv: 0.8693, auc: 0.621
Fold1 Epoch [3/3000]: Train loss: 0.1587, Valid loss: 0.1203
Fold1 Saving model with loss 0.120...


  ppv = tp/(tp+fp) # Precision & ppv
  ppv = tp/(tp+fp) # Precision & ppv
  ppv = tp/(tp+fp) # Precision & ppv


acc: 0.8619, tpr: 0.0, tnr: 1.0, ppv: nan, npv: 0.8619, auc: 0.5205
Fold1 Epoch [4/3000]: Train loss: 0.1347, Valid loss: 0.1283
acc: 0.8564, tpr: 0.0, tnr: 0.9936, ppv: 0.0, npv: 0.8611, auc: 0.5592
Fold1 Epoch [5/3000]: Train loss: 0.1331, Valid loss: 0.1203
Fold1 Saving model with loss 0.120...
acc: 0.8619, tpr: 0.0, tnr: 1.0, ppv: nan, npv: 0.8619, auc: 0.5031
Fold1 Epoch [6/3000]: Train loss: 0.1210, Valid loss: 0.1398
acc: 0.8619, tpr: 0.04, tnr: 0.9936, ppv: 0.5, npv: 0.8659, auc: 0.5623
Fold1 Epoch [7/3000]: Train loss: 0.1264, Valid loss: 0.1211
acc: 0.8619, tpr: 0.0, tnr: 1.0, ppv: nan, npv: 0.8619, auc: 0.5349
Fold1 Epoch [8/3000]: Train loss: 0.1165, Valid loss: 0.1212


  ppv = tp/(tp+fp) # Precision & ppv
  ppv = tp/(tp+fp) # Precision & ppv
  ppv = tp/(tp+fp) # Precision & ppv


acc: 0.8619, tpr: 0.0, tnr: 1.0, ppv: nan, npv: 0.8619, auc: 0.5638
Fold1 Epoch [9/3000]: Train loss: 0.1191, Valid loss: 0.1185
Fold1 Saving model with loss 0.118...
acc: 0.8564, tpr: 0.0, tnr: 0.9936, ppv: 0.0, npv: 0.8611, auc: 0.579
Fold1 Epoch [10/3000]: Train loss: 0.1149, Valid loss: 0.1188
acc: 0.8619, tpr: 0.0, tnr: 1.0, ppv: nan, npv: 0.8619, auc: 0.5764
Fold1 Epoch [11/3000]: Train loss: 0.1124, Valid loss: 0.1201
acc: 0.8564, tpr: 0.0, tnr: 0.9936, ppv: 0.0, npv: 0.8611, auc: 0.6136
Fold1 Epoch [12/3000]: Train loss: 0.1120, Valid loss: 0.1170
Fold1 Saving model with loss 0.117...
acc: 0.8564, tpr: 0.0, tnr: 0.9936, ppv: 0.0, npv: 0.8611, auc: 0.5823
Fold1 Epoch [13/3000]: Train loss: 0.1110, Valid loss: 0.1293
acc: 0.8564, tpr: 0.0, tnr: 0.9936, ppv: 0.0, npv: 0.8611, auc: 0.6446
Fold1 Epoch [14/3000]: Train loss: 0.1062, Valid loss: 0.1137
Fold1 Saving model with auc 0.645...
Fold1 Saving model with loss 0.114...
acc: 0.8619, tpr: 0.0, tnr: 1.0, ppv: nan, npv: 0.8619, auc

  ppv = tp/(tp+fp) # Precision & ppv
  ppv = tp/(tp+fp) # Precision & ppv
  ppv = tp/(tp+fp) # Precision & ppv


acc: 0.8564, tpr: 0.0, tnr: 0.9936, ppv: 0.0, npv: 0.8611, auc: 0.6515
Fold1 Epoch [19/3000]: Train loss: 0.1061, Valid loss: 0.1160
acc: 0.8508, tpr: 0.04, tnr: 0.9808, ppv: 0.25, npv: 0.8644, auc: 0.68
Fold1 Epoch [20/3000]: Train loss: 0.1058, Valid loss: 0.1167
acc: 0.8619, tpr: 0.0, tnr: 1.0, ppv: nan, npv: 0.8619, auc: 0.7049
Fold1 Epoch [21/3000]: Train loss: 0.1157, Valid loss: 0.1109
Fold1 Saving model with auc 0.705...
Fold1 Saving model with loss 0.111...
acc: 0.8564, tpr: 0.0, tnr: 0.9936, ppv: 0.0, npv: 0.8611, auc: 0.6708
Fold1 Epoch [22/3000]: Train loss: 0.1074, Valid loss: 0.1266
acc: 0.8619, tpr: 0.04, tnr: 0.9936, ppv: 0.5, npv: 0.8659, auc: 0.6867
Fold1 Epoch [23/3000]: Train loss: 0.1111, Valid loss: 0.1111


  ppv = tp/(tp+fp) # Precision & ppv


acc: 0.8564, tpr: 0.04, tnr: 0.9872, ppv: 0.3333, npv: 0.8652, auc: 0.7033
Fold1 Epoch [24/3000]: Train loss: 0.1098, Valid loss: 0.1164
acc: 0.8564, tpr: 0.0, tnr: 0.9936, ppv: 0.0, npv: 0.8611, auc: 0.7141
Fold1 Epoch [25/3000]: Train loss: 0.1092, Valid loss: 0.1090
Fold1 Saving model with auc 0.714...
Fold1 Saving model with loss 0.109...
acc: 0.8619, tpr: 0.04, tnr: 0.9936, ppv: 0.5, npv: 0.8659, auc: 0.6713
Fold1 Epoch [26/3000]: Train loss: 0.1006, Valid loss: 0.1130
acc: 0.8619, tpr: 0.0, tnr: 1.0, ppv: nan, npv: 0.8619, auc: 0.6985
Fold1 Epoch [27/3000]: Train loss: 0.1012, Valid loss: 0.1277
acc: 0.8564, tpr: 0.0, tnr: 0.9936, ppv: 0.0, npv: 0.8611, auc: 0.7387
Fold1 Epoch [28/3000]: Train loss: 0.1092, Valid loss: 0.1062
Fold1 Saving model with auc 0.739...
Fold1 Saving model with loss 0.106...


  ppv = tp/(tp+fp) # Precision & ppv


acc: 0.8619, tpr: 0.04, tnr: 0.9936, ppv: 0.5, npv: 0.8659, auc: 0.6679
Fold1 Epoch [29/3000]: Train loss: 0.1025, Valid loss: 0.1122
acc: 0.8619, tpr: 0.08, tnr: 0.9872, ppv: 0.5, npv: 0.8701, auc: 0.7374
Fold1 Epoch [30/3000]: Train loss: 0.1025, Valid loss: 0.1112
acc: 0.8619, tpr: 0.04, tnr: 0.9936, ppv: 0.5, npv: 0.8659, auc: 0.7179
Fold1 Epoch [31/3000]: Train loss: 0.0988, Valid loss: 0.1065
acc: 0.8619, tpr: 0.04, tnr: 0.9936, ppv: 0.5, npv: 0.8659, auc: 0.6974
Fold1 Epoch [32/3000]: Train loss: 0.0984, Valid loss: 0.1083
acc: 0.8674, tpr: 0.04, tnr: 1.0, ppv: 1.0, npv: 0.8667, auc: 0.6979
Fold1 Epoch [33/3000]: Train loss: 0.1021, Valid loss: 0.1145
acc: 0.8619, tpr: 0.04, tnr: 0.9936, ppv: 0.5, npv: 0.8659, auc: 0.691
Fold1 Epoch [34/3000]: Train loss: 0.0993, Valid loss: 0.1094
acc: 0.8619, tpr: 0.08, tnr: 0.9872, ppv: 0.5, npv: 0.8701, auc: 0.7026
Fold1 Epoch [35/3000]: Train loss: 0.0978, Valid loss: 0.1127
acc: 0.8674, tpr: 0.04, tnr: 1.0, ppv: 1.0, npv: 0.8667, auc: 0.68

KeyboardInterrupt: 

In [None]:
import requests

requests.get('https://hooks.zapier.com/hooks/catch/18160905/3cvu8pz/')

In [None]:
# model = Model(input_dim=len(train_dataset.__getitem__(0)[0])).to(device)
# model.load_state_dict(torch.load(cfg.save_path))
# preds = predict(test_loader, model, device) 
# save_pred(preds, cfg.pred_path)         

### Data Distribution

In [None]:
# test_data = pd.read_csv(cfg.valid_path).values
# y42 = test_data[:, -3]
# y42_unique, y42_counts = np.unique(y42, return_counts=True)
# # dict(zip(unique, counts))
# # np.asarray((unique, counts)).T
# y90 = test_data[:, -2]
# y90_unique, y90_counts = np.unique(y90, return_counts=True)
# y365 = test_data[:, -1]
# y365_unique, y365_counts = np.unique(y365, return_counts=True)
# print(
#     y42, dict(zip(y42_unique, y42_counts)),
#     y90, dict(zip(y90_unique, y90_counts)),
#     y365, dict(zip(y365_unique, y365_counts))
# )

In [None]:
# all_scorer = sklearn.metrics.get_scorer_names()
# print(all_scorer)

def evaluate_preds(preds, y_test, threshold=0.5):
    _preds = []
    for i, v in enumerate(preds):
        _preds.append(1 if (v > threshold) else 0)

    tn, fp, fn, tp = sklearn.metrics.confusion_matrix(y_test, _preds).ravel()
    print(tn, fp, fn, tp)

    acc = (tp+tn)/(tn+fp+fn+tp)
    # acc = sklearn.metrics.accuracy_score(y_test, _preds)

    sensitivity = tp/(tp+fn) # Recall
    # sensitivity = sklearn.metrics.recall_score(y_test, _preds)

    precision = tp/(tp+fp) # Precision & ppv
    # precision = sklearn.metrics.precision_score(y_test, _preds)

    npv = tn/(tn+fn)

    specificity = tn/(tn+fp)

    try:
        f1 = ((2*precision*sensitivity)/(precision+sensitivity))
    except:
        f1 = 0
    # f1 = sklearn.metrics.f1_score(y_test, _preds)

    auc = sklearn.metrics.roc_auc_score(y_test, preds)

    fpr, tpr, thresholds = sklearn.metrics.roc_curve(y_test, preds)
    print(fpr)
    print(tpr)
    print(thresholds)
    roc_auc = sklearn.metrics.auc(fpr, tpr)
    display = sklearn.metrics.RocCurveDisplay(
        fpr=fpr,
        tpr=tpr,
        roc_auc=roc_auc
    )
    display.plot()
    plt.show()

    print(f'model_name: {cfg.model_name}')
    print(f'acc: {acc}\nsensitivity: {sensitivity}\nspecificity: {specificity}\nppv(precision): {precision}\nnpv: {npv}\nF1-score: {f1}\nAUC: {auc}')

In [None]:
from sklearn.svm import SVC
from sklearn.preprocessing import normalize
import matplotlib.pyplot as plt

# evaluate_preds(preds, y_test, 0.6597)

In [None]:
# |sensitivity - specificity| < 0.1
# 挑選 model 的方式
# ppv, npv, F1-score 刪除