In [None]:
!pip install -r requirements.txt

In [2]:
import os
import sys
import random
import numpy as np
import pandas as pd
from tqdm import tqdm
from glob import glob
from PIL import Image

import torch
import torch.nn as nn
import torchvision
from torch.utils.data import Dataset, DataLoader
import timm
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score


In [3]:
def seed_everything(seed: int = 42):
    random.seed(seed)
    np.random.seed(seed)
    os.environ["PYTHONHASHSEED"] = str(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = True
    print(f"SUCCES {seed} SEED FIXING")

seed_everything(42)

SUCCES 42 SEED FIXING


In [4]:
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
root_dir = '../dataset/Xray'

In [5]:
df_info = pd.read_csv('../dataset/Xray/metadata.csv')

In [6]:
findings = ['Nodule', 'Pneumonia', 'Pneumothorax', 'Pleural_effusion', 'Cardiomegaly', 
            'Fibrosis', 'Pneumoperitoneum', 'Mediastinal_widening', 'Calcification', 'Atelectasis']
threshold = 35
df_norm = pd.DataFrame(columns = df_info.columns)
df_abnorm = pd.DataFrame(columns = df_info.columns)

In [7]:
for i, check in tqdm(df_info.iterrows()):
    abnormality = False
    for j, find in enumerate (findings):
        if check[find] > threshold:
            abnormality = True
            break
    if abnormality == True:
        df_abnorm.loc[len(df_abnorm)] = check
    else:
        df_norm.loc[len(df_norm)] = check     

In [9]:
print('number of normal data: '+ str(len(df_norm)) + ', number of abnormal data: ' + str(len(df_abnorm)))

number of normal data: 29621, number of abnormal data: 50379


In [10]:
img_lst = []
label_lst = []
def img_label_pair (df, img_lst, label_lst, label):
    for i, check in tqdm(df.iterrows()):
        img_lst.append(check['Path'])
        label_lst.append(label)
    return img_lst, label_lst

img_lst, label_lst = img_label_pair(df_norm, img_lst, label_lst, 0)
img_lst, label_lst = img_label_pair(df_abnorm, img_lst, label_lst, 1)

train_idx, valid_idx = train_test_split(np.arange(len(label_lst)), test_size=0.2, shuffle=True, stratify=label_lst)        

29621it [00:01, 16832.59it/s]
50379it [00:02, 16925.37it/s]


In [11]:
class XrayDataset(Dataset):
    def __init__(self, mode):
        self.img_lists = img_lst
        if mode == 'train':
            self.dataset_idx = sorted(train_idx)
        elif mode == 'valid':
            self.dataset_idx = sorted(valid_idx)
        #self.transforms = transforms
    
    def __len__(self):
        return len(self.dataset_idx)
    
    def __getitem__(self, idx):
        img_path = os.path.join(root_dir, self.img_lists[self.dataset_idx[idx]])
        #image = Image.open(img_path)
        img = torchvision.transforms.ToTensor()(Image.open(img_path))
        #np_img = self.transforms(image=np.array(image))['image']
        if label_lst[self.dataset_idx[idx]] == 0:
            return img, torch.tensor(0)
        else:
            return img, torch.tensor(1)

In [12]:
train_dataset = XrayDataset('train')
valid_dataset = XrayDataset('valid')

In [13]:
train_loader = DataLoader(train_dataset, batch_size=16, num_workers=2, drop_last=True, shuffle=True)
valid_loader = DataLoader(valid_dataset, batch_size=16, num_workers=2, drop_last=False, shuffle=False)

In [15]:
model = timm.create_model('efficientnet_b4', pretrained = True, num_classes = 2)
model.to(device)

EfficientNet(
  (conv_stem): Conv2d(3, 48, kernel_size=(3, 3), stride=(2, 2), padding=(1, 1), bias=False)
  (bn1): BatchNormAct2d(
    48, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True
    (drop): Identity()
    (act): SiLU(inplace=True)
  )
  (blocks): Sequential(
    (0): Sequential(
      (0): DepthwiseSeparableConv(
        (conv_dw): Conv2d(48, 48, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), groups=48, bias=False)
        (bn1): BatchNormAct2d(
          48, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True
          (drop): Identity()
          (act): SiLU(inplace=True)
        )
        (se): SqueezeExcite(
          (conv_reduce): Conv2d(48, 12, kernel_size=(1, 1), stride=(1, 1))
          (act1): SiLU(inplace=True)
          (conv_expand): Conv2d(12, 48, kernel_size=(1, 1), stride=(1, 1))
          (gate): Sigmoid()
        )
        (conv_pw): Conv2d(48, 24, kernel_size=(1, 1), stride=(1, 1), bias=False)
        (bn2): BatchNormAct2d(
    

In [16]:
criterion = torch.nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(params=model.parameters(), lr=1e-4, weight_decay=1e-4)
scheduler = torch.optim.lr_scheduler.StepLR(optimizer, 5, gamma=0.7)

In [17]:
# training
print ('Start Training')
print ('-'*30)

best_val_auc = 0
early_stop_cnt = 0
for epoch in range(10):

    for idx, data in enumerate(train_loader):
        train_data, train_labels = data
        train_data, train_labels  = train_data.to(device), train_labels.to(device)
        y_pred = model(train_data)
        y_pred = torch.sigmoid(y_pred)
        loss = criterion(y_pred, train_labels)
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        
        # validation  
        if idx % 1500 == 0:
            model.eval()
            with torch.no_grad():    
                test_pred = []
                test_true = [] 
                for jdx, data in enumerate(valid_loader):
                    test_data, test_labels = data
                    test_data = test_data.to(device)
                    y_pred = model(test_data)
                    y_pred = torch.argmax(torch.sigmoid(y_pred), dim=1)
                    test_pred.append(y_pred.detach().cpu().numpy())
                    test_true.append(test_labels.numpy())

                test_true = np.concatenate(test_true)
                test_pred = np.concatenate(test_pred)
                val_auc = roc_auc_score(test_true, test_pred)
                model.train()

                if best_val_auc < val_auc:
                    best_val_auc = val_auc
                    early_stop_cnt = 0
                    torch.save(model.state_dict(), '../xray_submission/xray_final.pth')
                    print('Best saved')
                else:
                    early_stop_cnt += 1
                    if early_stop_cnt == 6:
                        print('Epoch=%s, BatchID=%s, Val_auc=%.4f, Best_Val_auc=%.4f'%(epoch, idx, val_auc, best_val_auc))
                        sys.exit()

                print('Epoch=%s, BatchID=%s, Val_auc=%.4f, Best_Val_auc=%.4f'%(epoch, idx, val_auc, best_val_auc))

Start Training
------------------------------
Best saved
Epoch=0, BatchID=0, Val_auc=0.4997, Best_Val_auc=0.4997
Best saved
Epoch=0, BatchID=1500, Val_auc=0.7727, Best_Val_auc=0.7727
Best saved
Epoch=0, BatchID=3000, Val_auc=0.8243, Best_Val_auc=0.8243
Best saved
Epoch=1, BatchID=0, Val_auc=0.8353, Best_Val_auc=0.8353
Best saved
Epoch=1, BatchID=1500, Val_auc=0.8410, Best_Val_auc=0.8410
Epoch=1, BatchID=3000, Val_auc=0.8319, Best_Val_auc=0.8410
Epoch=2, BatchID=0, Val_auc=0.7628, Best_Val_auc=0.8410
Epoch=2, BatchID=1500, Val_auc=0.8320, Best_Val_auc=0.8410
Epoch=2, BatchID=3000, Val_auc=0.8318, Best_Val_auc=0.8410
Epoch=3, BatchID=0, Val_auc=0.8001, Best_Val_auc=0.8410
Best saved
Epoch=3, BatchID=1500, Val_auc=0.8446, Best_Val_auc=0.8446
Best saved
Epoch=3, BatchID=3000, Val_auc=0.8455, Best_Val_auc=0.8455
Epoch=4, BatchID=0, Val_auc=0.8216, Best_Val_auc=0.8455
Best saved
Epoch=4, BatchID=1500, Val_auc=0.8460, Best_Val_auc=0.8460
Best saved
Epoch=4, BatchID=3000, Val_auc=0.8535, Best_

SystemExit: 

  warn("To exit: use 'exit', 'quit', or Ctrl-D.", stacklevel=1)


In [23]:
test_dir = '/home/competition/dataset/Xray/test'

In [24]:
class XrayTestset(Dataset):
    def __init__(self, data_dir):
        self.data_dir = data_dir
        self.img_lists = list(glob(os.path.join(self.data_dir, '*.png')))

    def __len__(self):
        return len(self.img_lists)
    
    def __getitem__(self, idx):
        #label = 0
        img_path = self.img_lists[idx]
        #im = Image.open(img_path).resize((224,224))
        img = torchvision.transforms.ToTensor()(Image.open(img_path))
        # 1차원 image -> 3차원
        imgs = []
        for i in range(3):
            imgs.append(img)
        image = torch.cat(imgs)

        return image, img_path

In [25]:
test_dataset = XrayTestset(test_dir)
test_loader = DataLoader(test_dataset, batch_size = 16, num_workers = 2, drop_last = False, shuffle = False)

In [None]:
model = timm.create_model('efficientnet_b4', pretrained=True, num_classes=2)
model.to(device)
model.load_state_dict(torch.load('../xray_submission/xray_final.pth'))

# training
print('Start Testing')
print('-'*30)

model.eval()
with torch.no_grad():    
    test_pred = []
    data_path_lst = []
    for jdx, (test_data, data_path) in enumerate(test_loader):
        test_data = test_data.cuda()
        y_pred = model(test_data)
        y_pred = torch.argmax(torch.sigmoid(y_pred), dim=1)
        test_pred.append(y_pred.detach().cpu().numpy())
        data_path_lst.append(data_path)

    test_pred = np.concatenate(test_pred)
    data_path_lst = np.concatenate(data_path_lst)

pred_info = {}

for pred, img_path in zip(test_pred, data_path_lst):
    file_name = img_path.split('/')[-1]
    pred_info[file_name] = pred

sort_pred = dict(sorted(pred_info.items()))
print(sort_pred.items())


submission = pd.read_csv('../result_xray/1000_sample_submission.csv')
submission['result'] = sort_pred.values()

submission.to_csv('../xray_submission/xray_final.csv', index = False)
submission.head()