# Preparation

## Import Libraries

In [4]:
!apt-get update && apt-get install -y python3-opencv

Get:1 https://cloud.r-project.org/bin/linux/ubuntu bionic-cran40/ InRelease [3,626 B]
Ign:2 https://developer.download.nvidia.com/compute/cuda/repos/ubuntu1804/x86_64  InRelease
Hit:3 http://archive.ubuntu.com/ubuntu bionic InRelease
Get:4 http://security.ubuntu.com/ubuntu bionic-security InRelease [88.7 kB]
Ign:5 https://developer.download.nvidia.com/compute/machine-learning/repos/ubuntu1804/x86_64  InRelease
Get:6 https://developer.download.nvidia.com/compute/cuda/repos/ubuntu1804/x86_64  Release [696 B]
Hit:7 https://developer.download.nvidia.com/compute/machine-learning/repos/ubuntu1804/x86_64  Release
Get:8 https://developer.download.nvidia.com/compute/cuda/repos/ubuntu1804/x86_64  Release.gpg [836 B]
Get:9 http://ppa.launchpad.net/c2d4u.team/c2d4u4.0+/ubuntu bionic InRelease [15.9 kB]
Get:10 http://archive.ubuntu.com/ubuntu bionic-updates InRelease [88.7 kB]
Get:12 https://developer.download.nvidia.com/compute/cuda/repos/ubuntu1804/x86_64  Packages [917 kB]
Get:13 http://archive.

In [3]:
import os, torch, copy, cv2, sys, random
# from datetime import datetime, timezone, timedelta
from PIL import Image
import numpy as np
import pandas as pd
from tqdm import tqdm
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, Dataset
import torchvision.transforms as transforms
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score, roc_curve, auc, confusion_matrix, average_precision_score
import seaborn as sns

## Connect google drive & Change directory

In [5]:
from google.colab import drive


drive.mount('/content/drive')
os.chdir('/content/drive/MyDrive/Colab Notebooks/팀 프로젝트/COVID-19_detection_with_CT/')

Mounted at /content/drive


## Set Arguments & hyperparameters

In [6]:
# 시드(seed) 설정

RANDOM_SEED = 2022

torch.manual_seed(RANDOM_SEED)
torch.backends.cudnn.deterministic = True
torch.backends.cudnn.benchmark = False
np.random.seed(RANDOM_SEED)
random.seed(RANDOM_SEED)

In [7]:
# parameters

### 데이터 디렉토리 설정 ###
DATA_DIR= 'data'
NUM_CLS = 2

EPOCHS = 10
BATCH_SIZE = 17
LEARNING_RATE = 0.0005
EARLY_STOPPING_PATIENCE = 3
INPUT_SHAPE = 384
K_FOLDS = 5
origin_df = pd.read_csv(os.path.join(DATA_DIR, 'train.csv'))
NUM_ORIGIN_DATA = len(origin_df)

os.environ["CUDA_VISIBLE_DEVICES"]="0"
DEVICE = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

In [8]:
# check device
DEVICE

device(type='cuda')

In [9]:
# Set Augmentation variable
VFLIP = True
HFLIP = True
ROTATE = True
FLIP_ROTATE = True
ANGLE = None

## Data Augmetation

In [10]:
# ./data/augmentation 위치에 이미지를 생성하도록 만듦
# vflip : 상하로 뒤집을건지, hflip : 좌우로 뒤집을건지, rotate : 회전을 할건지 (한다면) angle : 지정한값으로 할건지(비우면 random)
def createimage(vflip=True, hflip=True, rotate=True, flip_rotate=True, angle=None):
    # 난수 일관성을 위한 시드 초기화
    np.random.seed(RANDOM_SEED)
    origin_df = pd.read_csv(os.path.join(DATA_DIR, 'train.csv'))
    
    if not os.path.isdir(os.path.join(DATA_DIR, 'augmentation2')):
        os.makedirs(os.path.join(DATA_DIR, 'augmentation2'))

    # save original image
    for i in tqdm(range(NUM_ORIGIN_DATA)):
        img = cv2.imread(f'./{DATA_DIR}/train/{i}.png')
        cv2.imwrite(f'./{DATA_DIR}/augmentation2/{i}.png', img)
        

    # vertical flip
    if vflip:
        for i in tqdm(range(NUM_ORIGIN_DATA)):
            img = cv2.imread(f'./{DATA_DIR}/train/{i}.png')
            vflip_img = cv2.flip(img, 0)
            cv2.imwrite(f'./{DATA_DIR}/augmentation2/{i}_vflip.png', vflip_img)
            origin_df.loc[len(origin_df)] = [f'{i}_vflip.png', origin_df.iloc[i]['COVID']]

        
    # horizontal filp
    if hflip:
        for i in tqdm(range(NUM_ORIGIN_DATA)):
            img = cv2.imread(f'./{DATA_DIR}/train/{i}.png')
            hflip_img = cv2.flip(img, 1)
            cv2.imwrite(f'./{DATA_DIR}/augmentation2/{i}_hflip.png', hflip_img)
            origin_df.loc[len(origin_df)] = [f'{i}_hflip.png', origin_df.iloc[i]['COVID']]

    # rotate
    if rotate:
        for i in tqdm(range(NUM_ORIGIN_DATA)):
            img = cv2.imread(f'./{DATA_DIR}/train/{i}.png')
            h, w = img.shape[:2]
            # rotate by Specific value
            if angle:
                rotation = cv2.getRotationMatrix2D((h/2, w/2), angle, 1)
                rotate_img = cv2.warpAffine(img, rotation, (h, w))
            # rotate by random value(5 ~ 35)
            else:
                rand_angle = np.random.randint(30)
                rotation = cv2.getRotationMatrix2D((h/2, w/2), rand_angle, 1)
                rotate_img = cv2.warpAffine(img, rotation, (h, w))
            cv2.imwrite(f'./{DATA_DIR}/augmentation2/{i}_rotate.png', rotate_img)
            origin_df.loc[len(origin_df)] = [f'{i}_rotate.png', origin_df.iloc[i]['COVID']]

    # flip-rotate
    if flip_rotate:
        for i in tqdm(range(NUM_ORIGIN_DATA)):
            img = cv2.imread(f'./{DATA_DIR}/train/{i}.png')
            hflip_img = cv2.flip(img, 1)
            h, w = hflip_img.shape[:2]
            # rotate by Specific value
            if angle:
                rotation = cv2.getRotationMatrix2D((h/2, w/2), angle, 1)
                rotate_img = cv2.warpAffine(hflip_img, rotation, (h, w))
            # rotate by random value
            else:
                rand_angle = np.random.randint(30)
                rotation = cv2.getRotationMatrix2D((h/2, w/2), 5+rand_angle, 1)
                rotate_img = cv2.warpAffine(hflip_img, rotation, (h, w))
            cv2.imwrite(f'./{DATA_DIR}/augmentation2/{i}_flip_rotate.png', rotate_img)
            origin_df.loc[len(origin_df)] = [f'{i}_flip_rotate.png', origin_df.iloc[i]['COVID']]

    print('\nsave csv file...')
    origin_df.to_csv(os.path.join(DATA_DIR, 'augmentation2.csv'))
    print("ALL JOBS FINISHED")

    return origin_df

코랩이라 그런진 몰라도 기존에 이미 변환시킨 파일이 있는 상태에서 createimage를 실행시키면 시간이 너무 오래걸림.<br>
어차피 이전에 이미 이미지를 생성했다면 필요한 파일은 다 있으므로 생략해도 됨

In [9]:
# new_df = createimage(VFLIP, HFLIP, ROTATE, FLIP_ROTATE, ANGLE)

# Define Dataloader

## Train & Validation Set loader

In [19]:
class CustomDataset(Dataset):
    ####################################################################
    ####          인자값에 mode 삭제 후 idx를 추가해줘야함          ####
    ####################################################################
    def __init__(self, data_dir, idx, input_shape):
        self.data_dir = data_dir
        ####################################################################
        ####          mode를 삭제했기 때문에 이부분도 수정해줌          ####
        ####################################################################
        self.idx = idx
        self.input_shape = input_shape
        
        # Loading dataset
        self.db = self.data_loader()
        
        # Dataset split
        ####################################################################
        ####          mode를 삭제했기 때문에 이부분도 수정해줌          ####
        ####################################################################
        self.db = self.db.iloc[self.idx]
        self.db.reset_index(inplace=True)
            
        # Transform function
        self.transform = transforms.Compose([transforms.Resize(self.input_shape),
                                             transforms.ToTensor(),
                                             transforms.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225])])

    def data_loader(self):
        ####################################################################
        ####          mode를 삭제했기 때문에 이부분도 수정해줌          ####
        ####################################################################        
        print('Loading dataset..')
        if not os.path.isdir(self.data_dir):
            print(f'!!! Cannot find {self.data_dir}... !!!')
            sys.exit()
        
        # (COVID : 1, No : 0)
        ####################################################################
        ####          읽어올 label 정보 csv를 train에서 바꿔줌          ####
        ####################################################################
        db = pd.read_csv(os.path.join(self.data_dir, 'augmentation2.csv'))
        
        return db

    def __len__(self):
        return len(self.db)

    def __getitem__(self, index):
        data = copy.deepcopy(self.db.loc[index])

        # Loading image
        ####################################################################
        ####           이미지를 읽어올 경로를 train에서 바꿔줌          ####
        ####################################################################
        cvimg = cv2.imread(os.path.join(self.data_dir,'augmentation2',data['file_name']), cv2.IMREAD_COLOR | cv2.IMREAD_IGNORE_ORIENTATION)
        if not isinstance(cvimg, np.ndarray):
            raise IOError("Fail to read %s" % data['file_name'])

        # Preprocessing images
        trans_image = self.transform(Image.fromarray(cvimg))

        ####################################################################
        ####           리턴값에 data['file_name'] 추가해줘야함          ####
        ####################################################################
        return trans_image, data['COVID'], data['file_name']

# Define Model(VGG-16)

In [20]:
from torchvision.models import vgg16


class VGG16(nn.Module):
    def __init__(self, NUM_CLS):
        super(VGG16, self).__init__()
        self.vgg = vgg16(pretrained=False)
        self.features_conv = self.vgg.features
        self.linear = nn.Sequential(
            nn.Linear(73728, 4096),
            nn.ReLU(),
            nn.Dropout(0.5),
            nn.Linear(4096, 4096),
            nn.ReLU(),
            nn.Dropout(0.5),
            nn.Linear(4096, NUM_CLS),
            nn.Softmax(dim=1)
        )
        
    def forward(self, x):
        x = self.features_conv(x)
        x = torch.flatten(x,1)
        x = self.linear(x)
        return x

# Define Utils

## EarlyStopper

In [21]:
class LossEarlyStopper():
    """Early stopper
    
    Attributes:
        patience (int): loss가 줄어들지 않아도 학습할 epoch 수
        patience_counter (int): loss 가 줄어들지 않을 때 마다 1씩 증가, 감소 시 0으로 리셋
        min_loss (float): 최소 loss
        stop (bool): True 일 때 학습 중단

    """

    def __init__(self, patience: int)-> None:
        self.patience = patience

        self.patience_counter = 0
        self.min_loss = np.Inf
        self.stop = False
        self.save_model = False

    def check_early_stopping(self, loss: float)-> None:
        """Early stopping 여부 판단"""  

        if self.min_loss == np.Inf:
            self.min_loss = loss
            return None

        elif loss > self.min_loss:
            self.patience_counter += 1
            msg = f"Early stopping counter {self.patience_counter}/{self.patience}"

            if self.patience_counter == self.patience:
                self.stop = True
                
        elif loss <= self.min_loss:
            self.patience_counter = 0
            self.save_model = True
            msg = f"Validation loss decreased {self.min_loss} -> {loss}"
            self.min_loss = loss
        
        print(msg)

## Trainer

In [22]:
class Trainer():
    """ epoch에 대한 학습 및 검증 절차 정의"""
    
    def __init__(self, loss_fn, model, device, metric_fn, optimizer=None, scheduler=None):
        """ 초기화
        """
        self.loss_fn = loss_fn
        self.model = model
        self.device = device
        self.optimizer = optimizer
        self.scheduler = scheduler
        self.metric_fn = metric_fn

    def train_epoch(self, dataloader, epoch_index):
        """ 한 epoch에서 수행되는 학습 절차"""
        
        self.model.train()
        train_total_loss = 0
        target_lst = []
        pred_lst = []
        prob_lst = []

        for batch_index, (img, label, _) in enumerate(dataloader):
            img = img.to(self.device)
            label = label.to(self.device).float()
            
            pred = self.model(img)
            
            loss = self.loss_fn(pred[:,1], label)
            self.optimizer.zero_grad()
            loss.backward()
            self.optimizer.step()
            self.scheduler.step()
            
            train_total_loss += loss.item()
            prob_lst.extend(pred[:, 1].cpu().tolist())
            target_lst.extend(label.cpu().tolist())
            pred_lst.extend(pred.argmax(dim=1).cpu().tolist())

            torch.cuda.empty_cache()

        self.train_mean_loss = train_total_loss / batch_index
        self.train_score, f1 = self.metric_fn(y_pred=pred_lst, y_answer=target_lst)
        msg = f'\nEpoch {epoch_index}, Train loss: {self.train_mean_loss}, Acc: {self.train_score}, F1-Macro: {f1}'
        print(msg)

    def validate_epoch(self, dataloader, epoch_index):
        """ 한 epoch에서 수행되는 검증 절차
        """
        self.model.eval()
        val_total_loss = 0
        target_lst = []
        pred_lst = []
        prob_lst = []

        for batch_index, (img, label, _) in enumerate(dataloader):
            img = img.to(self.device)
            label = label.to(self.device).float()
            pred = self.model(img)
            
            loss = self.loss_fn(pred[:,1], label)
            val_total_loss += loss.item()
            prob_lst.extend(pred[:, 1].cpu().tolist())
            target_lst.extend(label.cpu().tolist())
            pred_lst.extend(pred.argmax(dim=1).cpu().tolist())

            torch.cuda.empty_cache()
            
        self.val_mean_loss = val_total_loss / batch_index
        self.validation_score, f1 = self.metric_fn(y_pred=pred_lst, y_answer=target_lst)
        msg = f'\nEpoch {epoch_index}, Val loss: {self.val_mean_loss}, Acc: {self.validation_score}, F1-Macro: {f1}'
        print(msg)

## Metrics

In [23]:
from sklearn.metrics import accuracy_score, f1_score

def get_metric_fn(y_pred, y_answer):
    """ 성능을 반환하는 함수"""
    
    assert len(y_pred) == len(y_answer), 'The size of prediction and answer are not same.'
    accuracy = accuracy_score(y_answer, y_pred)
    f1 = f1_score(y_answer, y_pred, average='macro')
    return accuracy, f1

# Train

## Load utils

In [24]:
# Set optimizer, scheduler, loss function, metric function
loss_fn = nn.BCELoss()
metric_fn = get_metric_fn
early_stopper = LossEarlyStopper(patience=EARLY_STOPPING_PATIENCE)

## K-Fold 학습 진행

In [11]:
from sklearn.model_selection import KFold


kfold = KFold(n_splits=K_FOLDS, shuffle=True, random_state=RANDOM_SEED)

메모리 부족으로(Cuda memory out)로 실행이안됨. 다행히 1fold의 학습은 가능하므로 각 세션마다 하나의 fold조합을 학습시키면서 모델을 저장했음

In [17]:
train_loss_list = [[] for _ in range(EPOCHS)]
val_loss_list = [[] for _ in range(EPOCHS)]

for fold, (train_idx, val_idx) in enumerate(kfold.split(range(646))):
    print(f'============================{fold+1}th fold============================')

    # cuda memory out 문제 때문에 각 fold 별로 한 번씩 실행해줬음
    if fold == 0:
        continue
    
    if fold == 1:
        continue

    if fold == 2:
        continue

    if fold == 3:
        continue


    augmentations = 0
    length = len(train_idx)
    if VFLIP:
        augmentations += 1
        vflip_idx = train_idx[:length] + (NUM_ORIGIN_DATA * augmentations)
        train_idx = np.concatenate((train_idx, vflip_idx))
    if HFLIP:
        augmentations += 1
        hflip_idx = train_idx[:length] + (NUM_ORIGIN_DATA * augmentations)
        train_idx = np.concatenate((train_idx, hflip_idx))
    if ROTATE:
        augmentations += 1
        rotate_idx = train_idx[:length] + (NUM_ORIGIN_DATA * augmentations)
        train_idx = np.concatenate((train_idx, rotate_idx))
    if FLIP_ROTATE:
        augmentations += 1
        frotate_idx = train_idx[:length] + (NUM_ORIGIN_DATA * augmentations)
        train_idx = np.concatenate((train_idx, frotate_idx))

    train_dataset = CustomDataset(data_dir=DATA_DIR, idx=train_idx, input_shape=INPUT_SHAPE)
    validation_dataset = CustomDataset(data_dir=DATA_DIR, idx=val_idx, input_shape=INPUT_SHAPE)
    train_dataloader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True)
    validation_dataloader = DataLoader(validation_dataset, batch_size=BATCH_SIZE, shuffle=True)

    model = VGG16(NUM_CLS).to(DEVICE)

    optimizer = optim.Adam(model.parameters(), lr=LEARNING_RATE)
    scheduler =  optim.lr_scheduler.OneCycleLR(optimizer=optimizer, pct_start=0.1, div_factor=1e5, max_lr=0.0001, epochs=EPOCHS, steps_per_epoch=len(train_dataloader))

    # Set trainer
    trainer = Trainer(loss_fn, model, DEVICE, metric_fn, optimizer, scheduler)

    for epoch_index in tqdm(range(EPOCHS)):
        trainer.train_epoch(train_dataloader, epoch_index)
        trainer.validate_epoch(validation_dataloader, epoch_index)

        train_loss = trainer.train_mean_loss
        val_loss = trainer.val_mean_loss
        
        train_loss_list[epoch_index].append(train_loss)
        val_loss_list[epoch_index].append(val_loss)

        early_stopper.check_early_stopping(loss=trainer.val_mean_loss)

        if early_stopper.stop:
            print('Early stopped')
            break

        if early_stopper.save_model:
            torch.save(model.state_dict(), f"{fold}th_vgg16.pt")
            early_stopper.save_model = False

        del train_loss, val_loss
        torch.cuda.empty_cache()


Loading dataset..
Loading dataset..


  0%|          | 0/10 [00:00<?, ?it/s]


Epoch 0, Train loss: 0.6936127441494089, Acc: 0.5442940038684719, F1-Macro: 0.5402983389249992


 10%|█         | 1/10 [17:18<2:35:42, 1038.06s/it]


Epoch 0, Val loss: 0.7790010145732335, Acc: 0.6666666666666666, F1-Macro: 0.6525524585029753

Epoch 1, Train loss: 0.6372991958142895, Acc: 0.6460348162475822, F1-Macro: 0.6460212551120785


 20%|██        | 2/10 [20:14<1:10:48, 531.08s/it] 


Epoch 1, Val loss: 0.9865264509405408, Acc: 0.6666666666666666, F1-Macro: 0.6626938279112192
Early stopping counter 1/3

Epoch 2, Train loss: 0.557808152939144, Acc: 0.7276595744680852, F1-Macro: 0.7270212702127021

Epoch 2, Val loss: 0.6388797632285527, Acc: 0.7364341085271318, F1-Macro: 0.7261488511488512
Validation loss decreased 0.7790010145732335 -> 0.6388797632285527


 30%|███       | 3/10 [23:18<43:28, 372.61s/it]  


Epoch 3, Train loss: 0.4663315060499467, Acc: 0.7752417794970986, F1-Macro: 0.7745993992959219

Epoch 3, Val loss: 0.5461782940796444, Acc: 0.7751937984496124, F1-Macro: 0.7749774436090224
Validation loss decreased 0.6388797632285527 -> 0.5461782940796444


 40%|████      | 4/10 [26:21<29:47, 297.94s/it]


Epoch 4, Train loss: 0.39018706320539903, Acc: 0.818568665377176, F1-Macro: 0.8183676315902624


 50%|█████     | 5/10 [29:17<21:09, 253.94s/it]


Epoch 4, Val loss: 0.6024786744798932, Acc: 0.7984496124031008, F1-Macro: 0.7948874755381605
Early stopping counter 1/3

Epoch 5, Train loss: 0.30500842723995447, Acc: 0.8646034816247582, F1-Macro: 0.8642758820155004

Epoch 5, Val loss: 0.48888204991817474, Acc: 0.7751937984496124, F1-Macro: 0.7743258731978042
Validation loss decreased 0.5461782940796444 -> 0.48888204991817474


 60%|██████    | 6/10 [32:20<15:18, 229.66s/it]


Epoch 6, Train loss: 0.2169843999374854, Acc: 0.9090909090909091, F1-Macro: 0.9087671128776728


 70%|███████   | 7/10 [35:16<10:36, 212.31s/it]


Epoch 6, Val loss: 0.6240857727825642, Acc: 0.8062015503875969, F1-Macro: 0.8054533389636243
Early stopping counter 1/3

Epoch 7, Train loss: 0.13387517403737692, Acc: 0.9481624758220503, F1-Macro: 0.9479802415324285


 80%|████████  | 8/10 [38:12<06:41, 200.76s/it]


Epoch 7, Val loss: 0.6325884993587222, Acc: 0.813953488372093, F1-Macro: 0.8120446818844098
Early stopping counter 2/3

Epoch 8, Train loss: 0.08482989236884016, Acc: 0.9682785299806577, F1-Macro: 0.9681670134750682


 80%|████████  | 8/10 [41:08<10:17, 308.56s/it]


Epoch 8, Val loss: 0.7395175695419312, Acc: 0.8062015503875969, F1-Macro: 0.8057813911472447
Early stopping counter 3/3
Early stopped





## Check loss & acc

전체 FOLD를 순회하지 못했으므로 아래 acc & loss 그래프 셀은 생략함

In [None]:
import matplotlib.pyplot as plt


fig, loss_ax = plt.subplots()

mean_train_loss = [np.mean(train_loss_list[i]) for i in range(EPOCHS)]
mean_val_loss = [np.mean(val_loss_list[i]) for i in range(EPOCHS)]

loss_ax.plot(mean_train_loss, 'y', label='train loss')
loss_ax.plot(mean_val_loss, 'r', label='val loss')
loss_ax.set_xlabel('epoch')
loss_ax.set_ylabel('loss')
loss_ax.legend(loc='lower left')

plt.show()

In [None]:
np.argmin(mean_val_loss)

# Inference


### Load Model

In [25]:
model1_path = '0th_vgg16.pt'
model2_path = '1th_vgg16.pt'
model3_path = '2th_vgg16.pt'
model4_path = '3th_vgg16.pt'
model5_path = '4th_vgg16.pt'

In [26]:
# fold1 model
model1 = VGG16(NUM_CLS).to(DEVICE)
model1.load_state_dict(torch.load(model1_path, map_location='cpu'))

# fold2 model
model2 = VGG16(NUM_CLS).to(DEVICE)
model2.load_state_dict(torch.load(model2_path, map_location='cpu'))

# fold3 model
model3 = VGG16(NUM_CLS).to(DEVICE)
model3.load_state_dict(torch.load(model3_path, map_location='cpu'))

# fold4 model
model4 = VGG16(NUM_CLS).to(DEVICE)
model4.load_state_dict(torch.load(model4_path, map_location='cpu'))

# fold5 model
model5 = VGG16(NUM_CLS).to(DEVICE)
model5.load_state_dict(torch.load(model5_path, map_location='cpu'))

<All keys matched successfully>

### define socoring funtion

In [37]:
# 평가 함수 정의
def get_clf_eval(y_actual, y_pred):
    accuracy = accuracy_score(y_actual, y_pred)
    precision = precision_score(y_actual, y_pred)
    recall = recall_score(y_actual, y_pred)
    PR_AUC = average_precision_score(y_actual, y_pred)
    AUC = roc_auc_score(y_actual, y_pred)
    F1 = f1_score(y_actual, y_pred)
    print(f'\n전체 {len(y_actual)}개 validation data 중 양성(1) data {sum(y_actual)}개')
    print('정확도: {:.4f}'.format(accuracy))
    print('정밀도: {:.4f}'.format(precision))
    print('재현율: {:.4f}'.format(recall))
    print('AUC: {:.4f}'.format(AUC))
    print('F1: {:.4f}'.format(F1))

### Cheak CV score

In [38]:
for fold, (_, val_idx) in enumerate(kfold.split(range(646))):
    print(f'============================{fold+1}th fold============================')
    validation_dataset = CustomDataset(data_dir=DATA_DIR, idx=val_idx, input_shape=INPUT_SHAPE)
    validation_dataloader = DataLoader(validation_dataset, batch_size=BATCH_SIZE, shuffle=True)

    validation_actual = []
    validation_pred_lst = []

    if fold == 0:
        model1.eval()
        with torch.no_grad():
            for batch_index, (img, label, _) in tqdm(enumerate(validation_dataloader)):
                img = img.to(DEVICE)
                pred = model1(img)
                validation_actual += (list(label.numpy()))
                validation_pred_lst.extend(pred.argmax(dim=1).tolist())
            get_clf_eval(validation_actual, validation_pred_lst)
    
    elif fold == 1:
        model2.eval()
        with torch.no_grad():
            for batch_index, (img, label, _) in tqdm(enumerate(validation_dataloader)):
                img = img.to(DEVICE)
                pred = model2(img)
                validation_actual += (list(label.numpy()))
                validation_pred_lst.extend(pred.argmax(dim=1).tolist())
            get_clf_eval(validation_actual, validation_pred_lst)
    
    elif fold == 2:
        model3.eval()
        with torch.no_grad():
            for batch_index, (img, label, _) in tqdm(enumerate(validation_dataloader)):
                img = img.to(DEVICE)
                pred = model3(img)
                validation_actual += (list(label.numpy()))
                validation_pred_lst.extend(pred.argmax(dim=1).tolist())
            get_clf_eval(validation_actual, validation_pred_lst)

    elif fold == 3:
        model4.eval()
        with torch.no_grad():
            for batch_index, (img, label, _) in tqdm(enumerate(validation_dataloader)):
                img = img.to(DEVICE)
                pred = model4(img)
                validation_actual += (list(label.numpy()))
                validation_pred_lst.extend(pred.argmax(dim=1).tolist())
            get_clf_eval(validation_actual, validation_pred_lst)

    elif fold == 4:
        model5.eval()
        with torch.no_grad():
            for batch_index, (img, label, _) in tqdm(enumerate(validation_dataloader)):
                img = img.to(DEVICE)
                pred = model5(img)
                validation_actual += (list(label.numpy()))
                validation_pred_lst.extend(pred.argmax(dim=1).tolist())
            get_clf_eval(validation_actual, validation_pred_lst)


Loading dataset..


8it [00:04,  1.71it/s]



전체 130개 validation data 중 양성(1) data 62개
정확도: 0.8385
정밀도: 0.8361
재현율: 0.8226
AUC: 0.8378
F1: 0.8293
Loading dataset..


8it [00:04,  1.76it/s]



전체 129개 validation data 중 양성(1) data 52개
정확도: 0.7907
정밀도: 0.7119
재현율: 0.8077
AUC: 0.7935
F1: 0.7568
Loading dataset..


8it [00:04,  1.75it/s]



전체 129개 validation data 중 양성(1) data 63개
정확도: 0.8760
정밀도: 0.8406
재현율: 0.9206
AUC: 0.8770
F1: 0.8788
Loading dataset..


8it [00:04,  1.74it/s]



전체 129개 validation data 중 양성(1) data 66개
정확도: 0.7597
정밀도: 0.7778
재현율: 0.7424
AUC: 0.7601
F1: 0.7597
Loading dataset..


8it [00:04,  1.74it/s]


전체 129개 validation data 중 양성(1) data 61개
정확도: 0.7752
정밀도: 0.7667
재현율: 0.7541
AUC: 0.7741
F1: 0.7603





# Make submission

### Define predict function

In [27]:
# 예측함수(soft voting)
def predict(models, loader):
    model1, model2, model3, model4, model5 = models

    file_lst = []
    pred_lst = []
    prob_lst = []

    model1.eval()
    model2.eval()
    model3.eval()
    model4.eval()
    model5.eval()

    with torch.no_grad():
        for batch_index, (img, _, file_num) in tqdm(enumerate(test_dataloader)):
            img = img.to(DEVICE)
        
            prob1 = model1(img)
            prob2 = model2(img)
            prob3 = model3(img)
            prob4 = model4(img)
            prob5 = model5(img)

            prob = (prob1 + prob2 + prob3 + prob4 + prob5) / 5
            file_lst.extend(list(file_num))
            pred_lst.extend(prob.argmax(dim=1).tolist())
            prob_lst.extend(prob[:, 1].tolist())
    
    return pred_lst, prob_lst, file_lst

### Load dataset

In [40]:
class TestDataset(Dataset):
    def __init__(self, data_dir, input_shape):
        self.data_dir = data_dir
        self.input_shape = input_shape
        
        # Loading dataset
        self.db = self.data_loader()
        
        # Transform function
        self.transform = transforms.Compose([transforms.Resize(self.input_shape),
                                             transforms.ToTensor(),
                                             transforms.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225])])

    def data_loader(self):
        print('Loading test dataset..')
        if not os.path.isdir(self.data_dir):
            print(f'!!! Cannot find {self.data_dir}... !!!')
            sys.exit()
        
        db = pd.read_csv(os.path.join(self.data_dir, 'sample_submission.csv'))
        return db
    
    def __len__(self):
        return len(self.db)
    
    def __getitem__(self, index):
        data = copy.deepcopy(self.db.loc[index])
        
        # Loading image
        cvimg = cv2.imread(os.path.join(self.data_dir,'test',data['file_name']), cv2.IMREAD_COLOR | cv2.IMREAD_IGNORE_ORIENTATION)
        if not isinstance(cvimg, np.ndarray):
            raise IOError("Fail to read %s" % data['file_name'])

        # Preprocessing images
        trans_image = self.transform(Image.fromarray(cvimg))


        ####################################################################
        ####                 리턴값에 -1 추가해줘야함                   ####
        ####################################################################
        return trans_image, -1, data['file_name']

In [41]:
# Load dataset & dataloader
test_dataset = TestDataset(data_dir=DATA_DIR, input_shape=INPUT_SHAPE)
test_dataloader = DataLoader(dataset=test_dataset, batch_size=BATCH_SIZE, shuffle=False)

Loading test dataset..


## 추론 진행

In [39]:
models = [model1, model2, model3, model4, model5]
pred, prob, file_lst = predict(models, test_dataloader)

6it [00:08,  1.38s/it]


## 결과 저장

In [41]:
# prediction
df = pd.DataFrame({'file_name':file_lst, 'COVID':pred})
df.to_csv('prediction(VGG16_ensemble).csv', index=False)

# probability
df2 = pd.DataFrame({'file_name':file_lst, 'COVID':prob})
df2.to_csv('probability(VGG16_ensemble).csv', index=False)