In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# for dirname, _, filenames in os.walk('/kaggle/input'):
#     for filename in filenames:
#         print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [2]:
import torch
import random
import numpy as np
import os

seed = 50
os.environ['PYTHONHASHSEED']=str(seed)
random.seed(seed)
np.random.seed(seed)
torch.manual_seed(seed)
torch.cuda.manual_seed(seed)
torch.cuda.manual_seed_all(seed)
torch.backends.cudnn.deterministic = True
torch.backends.cudnn.benchmark = False
torch.backends.cudnn.enabled = False

In [3]:
device = torch.device('cuda' if torch.cuda.is_available else 'cpu')

In [4]:
data_path = '/home/baebro/nipa_ws/Pneumonia Binary Classification/chest_xray/'

train_path = data_path + 'train/'
valid_path = data_path +'val/'
test_path = data_path + 'test/'

In [5]:
from torchvision import transforms

transform_train = transforms.Compose([
    transforms.Resize((250, 250)),
    transforms.CenterCrop(180),
    transforms.RandomHorizontalFlip(0.5),
    transforms.RandomVerticalFlip(0.2),
    transforms.RandomRotation(20),
    transforms.ToTensor(),
    transforms.Normalize((0.485, 0.456, 0.406), (0.229, 0.224, 0.225))
])

transform_test = transforms.Compose([
    transforms.Resize((250, 250)),
    transforms.CenterCrop(180),
    transforms.ToTensor(),
    transforms.Normalize((0.485, 0.456, 0.406), (0.229, 0.224, 0.225))
])

In [6]:
from torchvision.datasets import ImageFolder

datasets_train = ImageFolder(root=train_path, transform=transform_train)
datasets_valid = ImageFolder(root=valid_path, transform=transform_test)

In [7]:
def seed_worker(worker_id):
    worker_seed = torch.initial_seed() % 2**32
    np.random.seed(worker_seed)
    random.seed(worker_seed)

g = torch.Generator()
g.manual_seed(0)

<torch._C.Generator at 0x7fcb91b1bbd0>

In [8]:
from torch.utils.data import DataLoader

batch_size = 8

loader_train = DataLoader(dataset=datasets_train, batch_size=batch_size,
                         shuffle=True, worker_init_fn=seed_worker,
                         generator=g, num_workers=2)
loader_valid = DataLoader(dataset=datasets_valid, batch_size=batch_size,
                         shuffle=False, worker_init_fn=seed_worker,
                         generator=g, num_workers=2)

In [11]:
models_list = []

from efficientnet_pytorch import EfficientNet

efficientnet_b1 = EfficientNet.from_pretrained('efficientnet-b1', num_classes=2)
efficientnet_b2 = EfficientNet.from_pretrained('efficientnet-b2', num_classes=2)
efficientnet_b3 = EfficientNet.from_pretrained('efficientnet-b3', num_classes=2)

efficientnet_b1 = efficientnet_b1.to(device)
efficientnet_b2 = efficientnet_b2.to(device)
efficientnet_b3 = efficientnet_b3.to(device)

models_list.append(efficientnet_b1)
models_list.append(efficientnet_b2)
models_list.append(efficientnet_b3)

Loaded pretrained weights for efficientnet-b1
Loaded pretrained weights for efficientnet-b2
Loaded pretrained weights for efficientnet-b3


In [12]:
for idx, model in enumerate(models_list):
    num_params = sum(param.numel() for param in model.parameters())
    print(f'model {idx+1} parameter : {num_params}')

model 1 parameter : 6515746
model 2 parameter : 7703812
model 3 parameter : 10699306


In [13]:
import torch.nn as nn

criterion = nn.CrossEntropyLoss()

optimizer1 = torch.optim.AdamW(models_list[0].parameters(), lr=0.0006, weight_decay=0.001)
optimizer2 = torch.optim.AdamW(models_list[1].parameters(), lr=0.0006, weight_decay=0.001)
optimizer3 = torch.optim.AdamW(models_list[2].parameters(), lr=0.0006, weight_decay=0.001)

In [14]:
from transformers import get_cosine_schedule_with_warmup

epochs = 20

scheduler1 = get_cosine_schedule_with_warmup(optimizer1, num_warmup_steps=len(loader_train)*3, num_training_steps=len(loader_train)*epochs)
scheduler2 = get_cosine_schedule_with_warmup(optimizer2, num_warmup_steps=len(loader_train)*3, num_training_steps=len(loader_train)*epochs)
scheduler3 = get_cosine_schedule_with_warmup(optimizer3, num_warmup_steps=len(loader_train)*3, num_training_steps=len(loader_train)*epochs)

In [15]:
from sklearn.metrics import accuracy_score
from sklearn.metrics import recall_score
from sklearn.metrics import f1_score
from tqdm.notebook import tqdm

def train(model, loader_train, loader_valid, criterion, optimizer, scheduler=None, epochs=10, save_file='model_state_dict.pth'):

    valid_loss_min = np.inf

    for epoch in range(epochs):

        print(f'epoch [{epoch+1}/{epochs}]')
        model.train()
        epoch_train_loss = 0

        # mini batch train loop
        for images, labels in loader_train:
            images = images.to(device)
            labels = labels.to(device)

            optimizer.zero_grad()
            outputs = model(images)
            loss = criterion(outputs, labels)
            
            epoch_train_loss + loss.item()
            loss.backward()
            optimizer.step()

            if scheduler != None:
                scheduler.step()

        print(f'\ttrain loss : {epoch_train_loss/len(loader_train):4f}')

        model.eval()
        epoch_valid_loss = 0
        preds_list = []
        true_list = []

        # mini batch validation loop
        with torch.no_grad():
            for images, labels in loader_valid:
                images = images.to(device)
                labels = labels.to(device)

                outputs = model(images)
                loss = criterion(outputs, labels)
                epoch_valid_loss += loss.item()

                preds = torch.max(outputs.cpu(), dim=1)[1].numpy()
                true = labels.cpu().numpy()

                preds_list.extend(preds)
                true_list.extend(true)
        print(f'\tvalid loss : {epoch_valid_loss/len(loader_valid):4f}')

        val_accuracy_score = accuracy_score(true_list, preds_list)
        val_recall = recall_score(true_list, preds_list)
        val_f1_score = f1_score(true_list, preds_list)

        print(f'accuracy : {val_accuracy_score:.4f} / recall : {val_recall:.4f} / f1 score : {val_f1_score:.4f}')

        if epoch_valid_loss <= valid_loss_min:
            print(f'\t### valid loss decrease ({valid_loss_min:.4f} --> {epoch_valid_loss:.4f}). model saved')
            torch.save(model.state_dict(), save_file)
            valid_loss_min = epoch_valid_loss

    return torch.load(save_file)

In [17]:
model_state_dict = train(model=models_list[0],
                        loader_train=loader_train,
                        loader_valid=loader_valid,
                        criterion=criterion,
                        optimizer=optimizer1,
                        scheduler=scheduler1,
                        epochs=epochs)

models_list[0].load_state_dict(model_state_dict)

epoch [1/20]
	train loss : 0.000000
	valid loss : 0.634758
accuracy : 0.6875 / recall : 1.0000 / f1 score : 0.7619
	### valid loss decrease (inf --> 1.2695). model saved
epoch [2/20]
	train loss : 0.000000
	valid loss : 0.685525
accuracy : 0.7500 / recall : 1.0000 / f1 score : 0.8000
epoch [3/20]
	train loss : 0.000000
	valid loss : 0.155701
accuracy : 0.9375 / recall : 1.0000 / f1 score : 0.9412
	### valid loss decrease (1.2695 --> 0.3114). model saved
epoch [4/20]
	train loss : 0.000000
	valid loss : 0.073515
accuracy : 1.0000 / recall : 1.0000 / f1 score : 1.0000
	### valid loss decrease (0.3114 --> 0.1470). model saved
epoch [5/20]
	train loss : 0.000000
	valid loss : 0.042132
accuracy : 1.0000 / recall : 1.0000 / f1 score : 1.0000
	### valid loss decrease (0.1470 --> 0.0843). model saved
epoch [6/20]
	train loss : 0.000000
	valid loss : 1.627919
accuracy : 0.5625 / recall : 1.0000 / f1 score : 0.6957
epoch [7/20]
	train loss : 0.000000
	valid loss : 0.835772
accuracy : 0.7500 / re

  return torch.load(save_file)


<All keys matched successfully>

In [18]:
model_state_dict = train(model=models_list[1],
                        loader_train=loader_train,
                        loader_valid=loader_valid,
                        criterion=criterion,
                        optimizer=optimizer2,
                        scheduler=scheduler2,
                        epochs=epochs)

models_list[1].load_state_dict(model_state_dict)

epoch [1/20]
	train loss : 0.000000
	valid loss : 1.238746
accuracy : 0.5625 / recall : 1.0000 / f1 score : 0.6957
	### valid loss decrease (inf --> 2.4775). model saved
epoch [2/20]
	train loss : 0.000000
	valid loss : 1.169718
accuracy : 0.6250 / recall : 1.0000 / f1 score : 0.7273
	### valid loss decrease (2.4775 --> 2.3394). model saved
epoch [3/20]
	train loss : 0.000000
	valid loss : 0.315567
accuracy : 0.8750 / recall : 0.8750 / f1 score : 0.8750
	### valid loss decrease (2.3394 --> 0.6311). model saved
epoch [4/20]
	train loss : 0.000000
	valid loss : 0.175190
accuracy : 0.9375 / recall : 1.0000 / f1 score : 0.9412
	### valid loss decrease (0.6311 --> 0.3504). model saved
epoch [5/20]
	train loss : 0.000000
	valid loss : 0.994805
accuracy : 0.6250 / recall : 1.0000 / f1 score : 0.7273
epoch [6/20]
	train loss : 0.000000
	valid loss : 0.647799
accuracy : 0.6875 / recall : 1.0000 / f1 score : 0.7619
epoch [7/20]
	train loss : 0.000000
	valid loss : 0.302811
accuracy : 0.8750 / re

  return torch.load(save_file)


<All keys matched successfully>

In [19]:
model_state_dict = train(model=models_list[2],
                        loader_train=loader_train,
                        loader_valid=loader_valid,
                        criterion=criterion,
                        optimizer=optimizer3,
                        scheduler=scheduler3,
                        epochs=epochs)

models_list[2].load_state_dict(model_state_dict)

epoch [1/20]
	train loss : 0.000000
	valid loss : 0.308354
accuracy : 0.8125 / recall : 0.8750 / f1 score : 0.8235
	### valid loss decrease (inf --> 0.6167). model saved
epoch [2/20]
	train loss : 0.000000
	valid loss : 1.031675
accuracy : 0.6250 / recall : 1.0000 / f1 score : 0.7273
epoch [3/20]
	train loss : 0.000000
	valid loss : 0.187313
accuracy : 1.0000 / recall : 1.0000 / f1 score : 1.0000
	### valid loss decrease (0.6167 --> 0.3746). model saved
epoch [4/20]
	train loss : 0.000000
	valid loss : 0.142113
accuracy : 1.0000 / recall : 1.0000 / f1 score : 1.0000
	### valid loss decrease (0.3746 --> 0.2842). model saved
epoch [5/20]
	train loss : 0.000000
	valid loss : 0.448994
accuracy : 0.6250 / recall : 1.0000 / f1 score : 0.7273
epoch [6/20]
	train loss : 0.000000
	valid loss : 0.200800
accuracy : 0.8750 / recall : 0.8750 / f1 score : 0.8750
epoch [7/20]
	train loss : 0.000000
	valid loss : 0.506417
accuracy : 0.7500 / recall : 1.0000 / f1 score : 0.8000
epoch [8/20]
	train loss

  return torch.load(save_file)


<All keys matched successfully>

In [20]:
datasets_test = ImageFolder(root=test_path, transform=transform_test)
loader_test = DataLoader(dataset=datasets_test, batch_size=batch_size, shuffle=False, worker_init_fn=seed_worker, generator=g, num_workers=2)

In [46]:
def predict(model, loader_test, return_true=False):
    model.eval()
    preds_list = []
    true_list = []

    with torch.no_grad():
        for images, labels in loader_test:
            images = images.to(device)
            labels = labels.to(device)

            outputs = model(images)

            preds = torch.max(outputs.cpu(), dim=1)[1].numpy()
            true = labels.cpu().numpy()

            preds_list.extend(preds)
            true_list.extend(true)

    if return_true:
        return true_list, preds_list
    else:
        return preds_list

In [47]:
true_list, preds_list = predict(model=models_list[0], loader_test=loader_test, return_true=True)

In [48]:
preds_list2 = predict(model=models_list[1], loader_test=loader_test)

In [50]:
preds_list3 = predict(model=models_list[2], loader_test=loader_test)

In [52]:
print('#'*5, 'efficientnet-b1 final pred score', '#'*5)
print(f'accuracy score : {accuracy_score(true_list,preds_list):.4f}')
print(f'recall score : {recall_score(true_list, preds_list):.4f}')
print(f'f1 score : {f1_score(true_list, preds_list):.4f}')

##### efficientnet-b1 final pred score #####
accuracy score : 0.9006
recall score : 0.9641
f1 score : 0.9238


In [53]:
print('#'*5, 'efficientnet-b2 final pred score', '#'*5)
print(f'accuracy score : {accuracy_score(true_list,preds_list2):.4f}')
print(f'recall score : {recall_score(true_list, preds_list2):.4f}')
print(f'f1 score : {f1_score(true_list, preds_list2):.4f}')

##### efficientnet-b2 final pred score #####
accuracy score : 0.9071
recall score : 0.9897
f1 score : 0.9301


In [54]:
print('#'*5, 'efficientnet-b2 final pred score', '#'*5)
print(f'accuracy score : {accuracy_score(true_list,preds_list3):.4f}')
print(f'recall score : {recall_score(true_list, preds_list3):.4f}')
print(f'f1 score : {f1_score(true_list, preds_list3):.4f}')

##### efficientnet-b2 final pred score #####
accuracy score : 0.8894
recall score : 0.9846
f1 score : 0.9176


ensemble prediction

In [56]:
ensemble_preds = []

for i in range(len(preds_list)):
    pred_element = np.round((preds_list[i]+preds_list2[i]+preds_list3[i])/3)
    ensemble_preds.append(pred_element)

In [57]:
print('#'*5, 'ensemble final pred score', '#'*5)
print(f'accuracy score : {accuracy_score(true_list,ensemble_preds):.4f}')
print(f'recall score : {recall_score(true_list, ensemble_preds):.4f}')
print(f'f1 score : {f1_score(true_list, ensemble_preds):.4f}')

##### ensemble final pred score #####
accuracy score : 0.9038
recall score : 0.9821
f1 score : 0.9274
