In [None]:
# Working directory: the zip archives are extracted here and the images are
# reorganized into sub-directories below it.
TEMP_DIR = '/kaggle/working/dogs-vs-cats'

In [None]:
# Dependencies
import PIL
import numpy as np
import pandas as pd
import random
import torch
import torchvision
import math

import matplotlib.pyplot as plt

import os
import zipfile
from copy import deepcopy

# progress bar
from tqdm import tqdm
from tqdm.notebook import tqdm as tqdm_nb

from sklearn.model_selection import KFold, StratifiedKFold

데이터셋의 이미지를 다루기 위한 간단한 함수들

In [None]:
from torchvision import transforms

# Per-channel mean / std used to normalize image tensors.
# NOTE(review): these look like the usual ImageNet statistics — confirm.
mean = torch.tensor([0.485, 0.456, 0.406], dtype=torch.float32)
std = torch.tensor([0.229, 0.224, 0.225], dtype=torch.float32)
normalize = transforms.Normalize(mean.tolist(), std.tolist())
# Inverse of `normalize`: Normalize computes (x - m) / s, so using
# m = -mean/std and s = 1/std yields x * std + mean (used for plotting).
unnormalize = transforms.Normalize((-mean / std).tolist(), (1.0 / std).tolist())

def get_image(path):
    """Load the image file at `path` and return it as an RGB PIL image.

    The file is opened in a context manager so the handle is closed as soon
    as the pixel data has been read, and the image is converted to RGB so
    that grayscale / palette / CMYK files still produce the three channels
    expected by the normalization transform.
    """
    # Import the submodule explicitly: `import PIL` alone does not guarantee
    # that `PIL.Image` has been loaded.
    from PIL import Image

    with open(path, 'rb') as f:
        image = Image.open(f)
        # convert() forces the lazy image data to be read before `f` closes.
        return image.convert('RGB')

def image_to_tensor(image):
    """Resize `image` to 256x256, convert it to a tensor and normalize it."""
    steps = [
        transforms.Resize((256, 256)),
        transforms.ToTensor(),
        normalize,
    ]
    pipeline = transforms.Compose(steps)
    return pipeline(image)

input에 있는 이미지를 가져오기 위해 zip 파일을 읽고 임시 디렉토리에 압축을 푼다.

In [None]:
import re

# Ids of images considered bad; files whose id appears in `bad_ids` are
# deleted after extraction (see the removal loop below).
bad_dog_ids = [5604, 6413, 8736, 8898, 9188, 
               9517, 10161, 10190, 10237, 10401, 10797, 11186]
bad_cat_ids = [2939, 3216, 4688, 4833, 5418, 
               6215, 7377, 8456, 8470, 11565, 12272]
# Combined list of both id lists.
bad_ids = bad_dog_ids + bad_cat_ids


def mkdir(path):
    """Create directory `path` together with any missing parent directories.

    Directories that already exist are left untouched, so the call can be
    repeated safely. An empty `path` is a no-op.
    """
    if not path:
        return
    # os.makedirs resolves relative paths against the current directory; the
    # previous hand-rolled loop re-rooted every path at '/'.
    os.makedirs(path, exist_ok=True)

def abs_list_dir(path):
    """Return the paths of all files found recursively under `path`.

    Each entry is the walked directory joined with the file name, so the
    entries are absolute only when `path` itself is absolute.
    """
    return [
        os.path.join(folder, name)
        for folder, _subdirs, names in os.walk(path)
        for name in names
    ]

# Extract the zip archives and remove the known-bad training images.
# Bad ids are label specific: `dog.<id>.jpg` is only removed when <id> is in
# bad_dog_ids, and `cat.<id>.jpg` only when <id> is in bad_cat_ids.
bad_ids_by_label = {'dog': set(bad_dog_ids), 'cat': set(bad_cat_ids)}

for zipname in ['test1', 'train']:
    zipfilename = f'/kaggle/input/dogs-vs-cats/{zipname}.zip'
    zipdirname = os.path.join(TEMP_DIR, zipname)
    print(f'extract zip file: {zipdirname}')
    # extract if not exists
    if not os.path.isdir(zipdirname):
        with zipfile.ZipFile(zipfilename, 'r') as archive:
            archive.extractall(TEMP_DIR)
    # remove bad files
    files = abs_list_dir(zipdirname)
    for url in tqdm(files, desc=f'Remove bad images ({zipdirname})'):
        if os.path.isdir(url):
            continue
        # Parse "<label>.<id>.<ext>" from the file name only. Searching the
        # whole path for digits would match the "1" of the "test1" directory.
        parts = os.path.basename(url).split('.')
        if len(parts) != 3 or not parts[1].isdigit():
            # Unlabeled files (e.g. "<id>.jpg" in test1) are never removed.
            continue
        label, image_id = parts[0], int(parts[1])
        if image_id in bad_ids_by_label.get(label, ()):
            os.remove(url)


`ImageFolder`에서 읽을 수 있도록 아래의 형태로 맞춰준다.

```
train/
|- dog/
|- |- dog.XXXX.jpg
|- cat/
|- |- cat.XXXX.jpg
test/
|- dog/
|- |- dog.XXXX.jpg
|- cat/
|- |- cat.XXXX.jpg

```

이 노트북에 사용된 Dataset은 Competition이라 test set에 label이 존재하지 않는다. 따라서 검증을 위해 train set을 쪼개어 활용해야 한다.

- `train` -> `train` / `test`
- `test` -> `submit`

`train`은 다시 K Fold를 사용하여, epoch 내에서 valid set을 활용할 것이다.

`submit`은 직접 읽고 실행하여 예측한 label을 저장할 것이므로, 따로 포맷을 맞추지 않을 것이다.

```
submit/
|- XXXX.jpg
```

In [None]:
# Create the directory layout used by the generic image-folder loader.
for subdir in ('/train/dog', '/train/cat', '/test/dog', '/test/cat', '/submit'):
    mkdir(TEMP_DIR + subdir)

def organize_files(files, dst, desc=''):
    """Move every file into the `dst/<label>/` directory named by its prefix.

    File names must have the form `<label>.<id>.<ext>`; any other number of
    dot-separated parts raises ValueError. Directories in `files` are skipped.
    `desc` is the progress-bar caption.
    """
    for src in tqdm(files, desc=desc):
        if os.path.isdir(src):
            continue
        filename = os.path.basename(src)
        # Unpacking enforces exactly three dot-separated parts.
        label, _imgid, _ext = filename.split('.')
        os.rename(src, os.path.join(dst, label, filename))

def organize_submit_files(files, dst, desc=''):
    """Move every file directly into `dst`, keeping its file name.

    File names must have the form `<id>.<ext>`; any other number of
    dot-separated parts raises ValueError. Directories in `files` are skipped.
    `desc` is the progress-bar caption.
    """
    for src in tqdm(files, desc=desc):
        if os.path.isdir(src):
            continue
        filename = os.path.basename(src)
        # Unpacking enforces exactly two dot-separated parts.
        _imgid, _ext = filename.split('.')
        os.rename(src, os.path.join(dst, filename))
            
# Collect the image files that still sit directly in the extracted folders.
# Files already moved into a class sub-directory by an earlier run are
# skipped, so re-running this cell does not move train images into the test
# split a second time.
train_root = TEMP_DIR + '/train'
train_files = sorted(
    f for f in abs_list_dir(train_root) if os.path.dirname(f) == train_root
) #[:8000] # uncomment for debug with small size
submit_files = abs_list_dir(TEMP_DIR + '/test1')

# Directory listing order is arbitrary, so sort and then shuffle with a fixed
# seed: the split becomes reproducible and both classes land in both parts.
random.Random(42).shuffle(train_files)

# Reorganize images
# train : test = 8 : 2
train_size = len(train_files)
train_split_size = train_size * 8 // 10
organize_files(train_files[:train_split_size], TEMP_DIR + '/train', desc='Organize train files')
organize_files(train_files[train_split_size:], TEMP_DIR + '/test', desc='Organize test files')
organize_submit_files(submit_files, TEMP_DIR + '/submit', desc='Organize submit files')

In [None]:
# Build the labeled datasets from the reorganized folders, then their loaders.
train_set, test_set = (
    torchvision.datasets.ImageFolder(TEMP_DIR + split, transform=image_to_tensor)
    for split in ('/train', '/test')
)
train_loader = torch.utils.data.DataLoader(train_set, shuffle=True, batch_size=64)
test_loader = torch.utils.data.DataLoader(test_set, shuffle=False, batch_size=16)
print(train_set, test_set)

이미지 로드가 잘 되었는지 배치 하나만 읽어서 확인해본다.

In [None]:
# Preview the first 16 images of one training batch with their labels.
label_to_title = {0: 'cat(0)', 1: 'dog(1)'}
images, labels = next(iter(train_loader))

plt.figure(figsize=(16, 5))
plt.suptitle("Train data/label", fontsize=24)
for index in range(16):  # number of images shown (2 rows x 8 columns)
    plt.subplot(2, 8, index + 1)
    plt.title(label_to_title[labels[index].item()])
    # Undo the normalization and move channels last for imshow.
    image = unnormalize(images[index])
    plt.imshow(image.permute(1, 2, 0).numpy())
    plt.axis("off")
plt.show()

In [None]:
import torch
import torch.nn as nn
import torch.nn.functional as F

class MLPModel(nn.Module):
    """Small CNN: four convolution stages followed by a dense classifier.

    The classifier ends in LogSoftmax over two outputs. Its first linear layer
    takes 160000 features, which equals 256 channels x 25 x 25 for a
    3x256x256 input passed through the convolution stack below.
    """

    def __init__(self):
        super().__init__()
        # Convolution / pooling stack (stored as `layer`).
        conv_stack = [
            nn.Conv2d(3, 32, 5),
            nn.ReLU(),
            nn.MaxPool2d(2, 2),
            nn.Conv2d(32, 64, 3),
            nn.ReLU(),
            nn.MaxPool2d(2, 2),
            nn.Conv2d(64, 128, 5),
            nn.ReLU(),
            nn.MaxPool2d(2, 2),
            nn.Conv2d(128, 256, 5),
            nn.ReLU(),
        ]
        # Dense head producing log-probabilities for the two classes.
        dense_head = [
            nn.Dropout(0.4),
            nn.Linear(160000, 512),
            nn.ReLU(),
            nn.Dropout(0.2),
            nn.Linear(512, 128),
            nn.Linear(128, 2),
            nn.LogSoftmax(dim=1),
        ]
        self.layer = nn.Sequential(*conv_stack)
        self.classifier = nn.Sequential(*dense_head)

    def forward(self, x):
        """Return the classifier output for the image batch `x`."""
        features = self.layer(x)
        # Flatten everything except the batch dimension.
        flat = features.view(features.size(0), -1)
        return self.classifier(flat)

In [None]:
# CPU/GPU: use CUDA when it is available, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')
print('device:', device)

# 모델 선택하기

## 1. 직접 구현한 CNN Model

In [None]:
# Hand-written CNN on the selected device, trained with NLL loss and Adam.
model = MLPModel()
model.to(device)

optimizer = torch.optim.Adam(model.parameters(), lr=1e-3)
criterion = nn.NLLLoss()

print(model)

## 2. VGG16 모델

In [None]:
import collections

# NOTE(review): newer torchvision releases deprecate `pretrained=` in favour
# of `weights=` — confirm the installed version before changing this call.
model = torchvision.models.vgg16_bn(pretrained=True)

# Freeze the pretrained parameters so they are not retrained on the new data.
for param in model.parameters():
    param.requires_grad = False

# Option 1 (not used): override the classifier layer
# model.classifier = nn.Sequential(
#     nn.Linear(25088, 500),
#     nn.ReLU(),
#     nn.Dropout(0.3),
#     nn.Linear(500, 2)
# )
# Option 2 (used): append a classifier on top of the 1000 VGG outputs
my_classifier = nn.Sequential(
    nn.Linear(1000, 500),
    nn.ReLU(),
    nn.Dropout(0.3),
    nn.Linear(500, 2),
)
named_stages = collections.OrderedDict()
named_stages['net'] = model
named_stages['classifier'] = my_classifier
model = nn.Sequential(named_stages)

# Gather the parameters to be optimized in this run: only those that still
# have requires_grad set, i.e. the freshly created classifier parameters.
trainable = [(name, param) for name, param in model.named_parameters() if param.requires_grad]
print("Params to learn:")
for name, _ in trainable:
    print("\t",name)
params_to_update = [param for _, param in trainable]

criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(params_to_update, lr=2*1e-3)

# Rebuild the training set with a resize + 224 center-crop pipeline.
vgg_steps = [
    transforms.Resize(256),
    transforms.CenterCrop(224),
    transforms.ToTensor(),
    normalize,
]
train_set = torchvision.datasets.ImageFolder(
    TEMP_DIR + '/train',
    transform=transforms.Compose(vgg_steps),
)

print(model)

# 모델 학습하기

모델의 순전파/역전파 시작

In [None]:
# Per-epoch results: epoch -> list with one entry per fold.
train_losses = {}
valid_losses = {}
valid_accs = {}
# Best checkpoint so far: (epoch, valid_idx, valid loss, model state_dict)
best = (0, -1, 1e9, {})
EPOCHS = 30

model.to(device)

fold_count = 3
# ImageFolder lists its samples class by class, so un-shuffled folds would
# each be dominated by a single class. Shuffle with a fixed seed so the folds
# are class-mixed and identical on every epoch.
kfold = KFold(n_splits=fold_count, shuffle=True, random_state=42)
batch_size = 64

# append result by fold -> by epoch
def add_fold_result(epoch, d, loss):
    """Append `loss` to the list stored under key `epoch` in `d`.

    The list is created when the key is missing. `d` is modified in place
    and also returned.
    """
    d.setdefault(epoch, []).append(loss)
    return d

def avg_group_by_fold(losses):
    """Return the mean of every value list in `losses`, in dict order."""
    return [np.array(values).mean() for values in losses.values()]


# for tracking by fold
train_losses_by_fold = [[] for _ in range(fold_count)]
valid_losses_by_fold = [[] for _ in range(fold_count)]
valid_acc_by_fold = [[] for _ in range(fold_count)]

# Run epochs
# NOTE(review): the same model keeps training across all folds, so every
# validation fold has been part of the training data of the other folds; the
# validation numbers are therefore optimistic.
for epoch in range(EPOCHS):
    early_stop = None
    # K-Fold cross validation
    for fold, (train_idx, valid_idx) in enumerate(kfold.split(train_set)):
        # Split dataset and loader
        train_subsampler = torch.utils.data.SubsetRandomSampler(train_idx)
        valid_subsampler = torch.utils.data.SubsetRandomSampler(valid_idx)
        # Use as train/valid set from train data set by k-fold
        train_loader = torch.utils.data.DataLoader(train_set, batch_size=batch_size, sampler=train_subsampler)
        valid_loader = torch.utils.data.DataLoader(train_set, batch_size=batch_size, sampler=valid_subsampler)

        running_loss = 0.0

        # Train
        model.train()
        for inputs, labels in tqdm(train_loader, desc=f'train model ({epoch+1}/{EPOCHS} epoch, fold={fold})'):
            inputs, labels = inputs.to(device), labels.to(device)

            # Call the module (not .forward) so registered hooks still run.
            outputs = model(inputs)  # compute predictions
            loss = criterion(outputs, labels)  # compute the loss

            optimizer.zero_grad()
            loss.backward()  # backpropagate the loss
            optimizer.step()  # update the weights

            running_loss += loss.item()

        # train loss (average over batches)
        train_loss = running_loss / len(train_loader)
        train_losses_by_fold[fold].append(train_loss)
        train_losses = add_fold_result(epoch, train_losses, train_loss)

        # valid loss (just for check)
        model.eval()
        with torch.no_grad():
            valid_loss_sum = 0
            correct = 0
            total = 0
            printed_sample = False
            for images, labels in valid_loader:
                images, labels = images.to(device), labels.to(device)
                outputs = model(images)
                _, pred = torch.max(outputs.data, 1)
                # Print the first validation batch as a sanity check.
                if not printed_sample:
                    print('outputs', outputs.data)
                    print('pred', pred)
                    print('labels', labels)
                    print(pred == labels)
                    printed_sample = True
                valid_loss_sum += criterion(outputs, labels).item()
                # Count per sample so a short last batch is not over-weighted.
                correct += (pred == labels).sum().item()
                total += labels.size(0)
            print('correct', correct, 'total', total)
            acc = correct / total * 100.0
            valid_loss = valid_loss_sum / len(valid_loader)
            valid_losses_by_fold[fold].append(valid_loss)
            valid_acc_by_fold[fold].append(acc)
            valid_losses = add_fold_result(epoch, valid_losses, valid_loss)

        print('[%d] fold=%d, train loss: %.6f, valid loss: %.6f, valid acc: %.3f%%' % (epoch + 1, fold, train_loss, valid_loss, acc))

        valid_accs = add_fold_result(epoch, valid_accs, acc)

        # get best: copy the weights only when this fold actually improves
        # (evaluation does not modify them, so the copy equals the
        # post-training state).
        if valid_loss < best[2]:
            best = (epoch, valid_idx, valid_loss, deepcopy(model.state_dict()))

        tl = np.array(train_losses_by_fold[fold]).mean()
        vl = np.array(valid_losses_by_fold[fold]).mean()

        # Early stop - when valid loss is 3 times higher than train loss
        # But, Do not stop early before 5 epoch because too short
        if epoch >= 5 and len(train_losses_by_fold[fold]) > 1 and tl * 3 < vl:
            early_stop = 'valid loss is 3 times higher than train loss'
            break
        # Early stop - trained well enough (average of the mean train and
        # valid loss of this fold is under 0.1)
        if np.array([tl, vl]).mean() < 0.1:
            early_stop = 'trained well enough'
            break
    if early_stop is not None:
        print('Early stop - caused by', early_stop)
        break

# Save the state_dict of the best checkpoint inside the working directory.
# The path is derived from TEMP_DIR instead of repeating it as a literal.
SAVE_BEST_PATH = os.path.join(TEMP_DIR, 'best_parameters.pth')
torch.save(best[-1], SAVE_BEST_PATH)
# Print everything except the (large) state_dict.
print(best[:-1])

# Draw chart: per-epoch loss (averaged over folds) and validation accuracy.
y_train = avg_group_by_fold(train_losses)
y_valid = avg_group_by_fold(valid_losses)
upper_limit = min(10, max(y_train + y_valid) + 0.1)

plt.figure(figsize=(20, 4))

# Left panel: train / valid loss; the red line marks the best epoch.
plt.subplot(1, 2, 1)
for curve, caption in ((y_train, "train loss"), (y_valid, "valid loss")):
    plt.plot(curve, label=caption)
plt.title("Training Loss")
plt.xlabel("epoch")
plt.ylim(top=upper_limit, bottom=-0.01)
plt.axvline(best[0], color='red')
plt.legend()

# Right panel: validation accuracy.
plt.subplot(1, 2, 2)
plt.plot(avg_group_by_fold(valid_accs), label="accuracy")
plt.title("Valid accuracy")
plt.xlabel("epoch")
plt.legend()

plt.show()

각 Fold별 train/valid loss 그래프를 그려본다. (특정 fold에서 학습의 문제가 있었나 확인해보기 위해)

In [None]:
# One loss chart per fold; folds without any recorded epoch are skipped.
plt.figure(figsize=(25, 5))
for fold in range(fold_count):
    y_train = train_losses_by_fold[fold]
    y_valid = valid_losses_by_fold[fold]
    if not y_train:
        continue
    plt.subplot(1, fold_count, fold + 1)
    for curve, caption in ((y_train, "train loss"), (y_valid, "valid loss")):
        plt.plot(curve, label=caption)
    plt.title(f'Loss (fold={fold})')
    plt.xlabel("epoch")
    upper_limit = min(10, max(y_train + y_valid) + 0.1)
    plt.ylim(top=upper_limit, bottom=-0.01)
    # The red line marks the epoch of the best checkpoint.
    plt.axvline(best[0], color='red')
    plt.legend()
plt.show()

In [None]:
def get_accuracy(model, loader):
    """Return the share of `loader.dataset` that `model` classifies correctly.

    The model is moved to the global `device` and put into eval mode. The
    outputs, predictions, labels and match mask of the first batch are
    printed as a sanity check. The correct count is divided by
    `len(loader.dataset)`.
    """
    model.to(device)
    model.eval()
    correct = 0
    first_batch = True
    with torch.no_grad():
        for images, labels in tqdm(loader):
            outputs = model(images.to(device))
            pred = torch.max(outputs.data, 1)[1]
            correct += torch.sum(pred.to(device) == labels.to(device)).to('cpu')
            if first_batch:
                print(outputs.data)
                print(pred)
                print(labels)
                print(pred.to(device) == labels.to(device))
                first_batch = False
    return correct / len(loader.dataset)


# model = MLPModel()
model = torchvision.models.vgg16_bn(pretrained=True)

# The checkpoint was not saved from a bare VGG network, so the same wrapped
# structure has to be rebuilt before loading it.
# NOTE: `my_classifier` is the module instance that was trained above;
# load_state_dict overwrites its weights with those of the best checkpoint.
model = nn.Sequential(collections.OrderedDict([
    ('net', model),
    ('classifier', my_classifier)
]))
# Load strictly so that a key mismatch raises instead of silently leaving
# parts of the model with the wrong weights.
model.load_state_dict(torch.load(SAVE_BEST_PATH))

# Evaluate the test split with the same preprocessing as the training set.
test_set = torchvision.datasets.ImageFolder(TEMP_DIR + '/test', transform=train_set.transform)

# These names were reused inside the epoch loop, so define them again.
train_loader = torch.utils.data.DataLoader(train_set, batch_size=64, shuffle=True)
test_loader = torch.utils.data.DataLoader(test_set, batch_size=16, shuffle=False)

# Get accuracy
train_acc = get_accuracy(model, train_loader)
print('train accuracy:', '{:.5f} %'.format(train_acc * 100))

test_acc = get_accuracy(model, test_loader)
print(' test accuracy:', '{:.5f} %'.format(test_acc * 100))

# `test1` 데이터를 제출을 위해 csv 포맷으로 저장

In [None]:
class SubmitImageFolder(torch.utils.data.Dataset):
    """Dataset over the `.jpg` files located directly inside `root`.

    Each item is `(id, image)`: the integer parsed from the file name before
    its first dot, and the result of `transform` applied to the loaded image.
    """

    def __init__(self, root, transform):
        self.root = root
        self.files = [
            os.path.join(root, name)
            for name in os.listdir(root)
            if name.endswith('.jpg')
        ]
        self.transform = transform

    def __len__(self):
        return len(self.files)

    def __getitem__(self, index):
        path = self.files[index]
        image = self.transform(get_image(path))
        stem = os.path.basename(path).split('.')[0]
        return (int(stem), image)


# Use the same preprocessing the model was trained with.
submit_set = SubmitImageFolder(TEMP_DIR + '/submit', transform=train_set.transform)
submit_loader = torch.utils.data.DataLoader(submit_set, batch_size=64)

# First batch of (images, predictions), kept for the visual check below.
sample_pred_images = None

model.to(device)
# Make inference mode explicit instead of relying on an earlier cell.
model.eval()

# Plain Python lists keep the ids as exact integers (no float round trip).
pred_ids = []
pred_labels = []
with torch.no_grad():
    for ids, images in tqdm(submit_loader):
        images = images.to(device)
        outputs = model(images)
        _, pred = torch.max(outputs.data, 1)
        # append list
        pred_ids.extend(ids.tolist())
        pred_labels.extend(pred.tolist())
        # save image to show
        if sample_pred_images is None:
            sample_pred_images = (images, pred)

pred_df = pd.DataFrame({'id': pred_ids, 'label': pred_labels})
pred_df['id'] = pred_df['id'].astype('int')
pred_df['label'] = pred_df['label'].astype('int')

pred_df.head()

일부 데이터셋의 결과를 눈으로 확인해보자

In [None]:
# Show the first 25 images of the stored batch with the predicted label.
sample_images, sample_preds = sample_pred_images
plt.figure(figsize=(10, 10))
for index in range(25):
    plt.subplot(5, 5, index + 1)
    # Undo the normalization and move channels last for imshow.
    restored = unnormalize(sample_images[index])
    image = restored.to('cpu').permute(1, 2, 0).numpy()
    guess = sample_preds[index].to('cpu').numpy().item()
    plt.title(label_to_title[guess])
    plt.imshow(image)
    plt.axis("off")
plt.show()

In [None]:
# Write the predictions (columns `id`, `label`) to result.csv without the index column.
pred_df.to_csv('result.csv', index=False)

현재 데이터셋에는 test에 라벨이 없어서, valid로 확인할 수밖에 없다.