# 1. Data Exploration
- Let's take a look at the data.
- 일단 데이터를 살펴봅시다.

## 1.1 CSV

In [None]:
import pandas as pd
# Directory that holds the competition files inside the Kaggle kernel.
input_path = "/kaggle/input/histopathologic-cancer-detection"

# Load the training label table from its CSV file.
train_labels_csv = f"{input_path}/train_labels.csv"
train_data = pd.read_csv(train_labels_csv)

# Kept as the final expression so the notebook renders the first rows.
train_data.head()



- It consists of an image name (id) and a classification value (label).
- 이미지 이름(id)와 분류값(label)로 구성되어있네요.

## 1.2 Img
- The image has a tif extension and cannot be loaded through OpenCV. You need to utilize the PIL library.
- 이미지가 tif 확장자로 되어있어 OpenCV를 통해서 불러올 수 없습니다. PIL 라이브러리를 활용해야합니다.

In [None]:
from PIL import Image
from matplotlib import pyplot as plt
import numpy as np
# Show the first training image, titled with its id and array shape.
# The variable is named `image_id` (not `id`) so the builtin `id()` is not
# shadowed for the rest of the notebook session.
image_id = train_data.loc[0, "id"]
im = np.array(Image.open(f'{input_path}/train/{image_id}.tif'))
plt.title(f"{image_id} \n {im.shape}")
plt.imshow(im)
plt.show()

# 2. Data Loader
- Let's configure the dataset and data loader.
- 데이터셋과 데이터로더를 구성해봅시다.
- Ref : https://tutorials.pytorch.kr/beginner/data_loading_tutorial.html

## 2.1 Dataset

In [None]:
import torch
class Dataset(torch.utils.data.Dataset):
    """Labelled training images read from a label CSV and a folder of .tif files.

    Each item is an ``(image, label)`` pair. The image is opened with PIL from
    ``<input_path>/train/<id>.tif`` and the label is taken from the ``label``
    column of the same row of ``<input_path>/train_labels.csv``.

    Args:
        transform: Optional callable applied to the PIL image before it is
            returned. When ``None`` the PIL image is returned unchanged.
        input_path: Directory containing ``train_labels.csv`` and the
            ``train/`` image folder. Defaults to the Kaggle mount point.
    """

    def __init__(self, transform=None,
                 input_path="/kaggle/input/histopathologic-cancer-detection"):
        self.input_path = input_path
        # Read via the instance attribute: the class must not depend on a
        # notebook-level `input_path` variable happening to exist.
        self.dataset = pd.read_csv(f"{self.input_path}/train_labels.csv")
        self.img_path = f"{self.input_path}/train/"
        self.transform = transform

    def __len__(self):
        """Return the number of rows in the label table."""
        return len(self.dataset)

    def __getitem__(self, idx):
        """Return ``(image, label)`` for the row at position ``idx``."""
        if torch.is_tensor(idx):
            idx = idx.tolist()

        # Fetch the row once and reuse it for both the file name and the label.
        row = self.dataset.iloc[idx]
        image = Image.open(f'{self.img_path}{row["id"]}.tif')
        label = row["label"]

        if self.transform:
            image = self.transform(image)
        return image, label

## 2.2 Dataloader & Transform
- 데이터 증강을 위한 트랜스폼을 구성합니다.
- Construct a transform for data augmentation.

In [None]:
import torchvision
# Augmentation pipeline applied to every image served by `dataset`:
# random rotation, resize, random crop, random flips, then tensor conversion
# and normalization with mean 0.5 / std 0.5 per channel.
data_transform = torchvision.transforms.Compose([
    torchvision.transforms.RandomRotation(30),
    torchvision.transforms.Resize((256, 256)),
    torchvision.transforms.RandomResizedCrop(224),
    torchvision.transforms.RandomHorizontalFlip(),
    torchvision.transforms.RandomVerticalFlip(),
    torchvision.transforms.ToTensor(),
    torchvision.transforms.Normalize([0.5, 0.5, 0.5], [0.5, 0.5, 0.5]),
])

batch_size = 128
dataset = Dataset(transform=data_transform)

# 70/30 train/validation split. The sizes are computed once and reused, and
# the seeded generator makes the split reproducible between runs.
train_size = int(len(dataset) * 0.7)
valid_size = len(dataset) - train_size
trainset, validset = torch.utils.data.random_split(
    dataset,
    [train_size, valid_size],
    generator=torch.Generator().manual_seed(42),
)

# NOTE(review): both subsets wrap the same `dataset` object, so validation
# images also pass through the random augmentations above. Consider a second
# dataset with a deterministic transform for validation.
dataloaders = {
    "train": torch.utils.data.DataLoader(
        trainset, batch_size=batch_size, shuffle=True, num_workers=4),
    # Validation metrics are sums over all samples, so order is irrelevant.
    "valid": torch.utils.data.DataLoader(
        validset, batch_size=batch_size, shuffle=False, num_workers=4),
}
dataset_sizes = {
    "train": len(trainset),
    "valid": len(validset),
}

# Preview the first three samples of one training batch.
image, label = next(iter(dataloaders["train"]))
for sample, target in zip(image[:3], label[:3]):
    plt.title(str(int(target)))
    # Undo Normalize(mean=0.5, std=0.5) so pixel values are back in [0, 1];
    # imshow would otherwise clip the negative half of the range.
    preview = (sample * 0.5 + 0.5).clamp(0, 1)
    plt.imshow(preview.permute(1, 2, 0))
    plt.show()

# 3. Modeling
- Let's do transfer learning using ResNet.
- ResNet을 이용해서 전이학습 시켜봅시다.
- Ref : https://tutorials.pytorch.kr/beginner/transfer_learning_tutorial.html

## 3.1 Import Model
- Load the ResNet to be used as the backbone model, and adjust the Output Layer to 2 (number of classes in our problem).
- 백본 모델로 사용할 ResNet을 불러오고, Output Layer를 2개(우리 문제의 클래스 수)로 조정합니다. 

In [None]:
# Use the first CUDA device when one is available, otherwise the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda:0")
else:
    device = torch.device("cpu")

# Pretrained ResNet-50 whose final fully connected layer is replaced by a
# fresh linear layer with two outputs (one per class).
backbone = torchvision.models.resnet50(pretrained=True)
num = backbone.fc.in_features
backbone.fc = torch.nn.Linear(in_features=num, out_features=2)

model = backbone.to(device)
print(device)

## 3.2 Set Functions
- SGD and Adam are the most commonly used optimizers, but here we will use AdamP (ICLR 2021).
- Optimizer로는 SGD, Adam을 주로 사용합니다. 하지만 여기서는 AdamP(ICLR 2021)를 사용해보겠습니다.
- Ref : https://github.com/clovaai/AdamP

In [None]:
!pip3 install adamp
import adamp

In [None]:
# Classification loss applied to the two-output model head.
criterion = torch.nn.CrossEntropyLoss()

# Alternative optimizers, kept for quick comparison:
#optimizer = torch.optim.SGD(model.parameters(), lr=0.001, momentum=0.9)
#optimizer = torch.optim.Adam(model.parameters(), lr=0.001)
optimizer = adamp.AdamP(
    model.parameters(),
    lr=0.001,
    betas=(0.9, 0.999),
    weight_decay=1e-2,
)

# Step-wise learning-rate schedule: gamma 0.1 with a step size of 7.
scheduler = torch.optim.lr_scheduler.StepLR(
    optimizer,
    step_size=7,
    gamma=0.1,
)

## 3.3 Training

In [None]:
import time
import copy
def train_model(model, criterion, optimizer, scheduler, num_epochs=25):
    """Train ``model`` and return it loaded with its best validation weights.

    Every epoch runs a ``'train'`` phase followed by a ``'valid'`` phase over
    the module-level ``dataloaders``. Batches are moved to the module-level
    ``device``, and per-epoch averages are divided by ``dataset_sizes[phase]``.

    Args:
        model: Network to optimize; it is updated in place.
        criterion: Loss called as ``criterion(outputs, labels)``.
        optimizer: Optimizer stepped after every training batch.
        scheduler: Learning-rate scheduler stepped once per training phase.
        num_epochs: Number of epochs to run.

    Returns:
        The same ``model`` object, with the state dict from the epoch that
        reached the highest validation accuracy loaded into it.
    """
    since = time.time()

    best_model_wts = copy.deepcopy(model.state_dict())
    best_acc = 0.0

    for epoch in range(num_epochs):
        print('Epoch {}/{}'.format(epoch, num_epochs - 1))

        for phase in ['train', 'valid']:
            is_train = phase == 'train'
            if is_train:
                model.train()
            else:
                model.eval()

            running_loss = 0.0
            # Plain Python int, so the accuracy below is an ordinary float
            # even when the loader yields no batches.
            running_corrects = 0

            for inputs, labels in dataloaders[phase]:
                inputs = inputs.to(device)
                labels = labels.to(device)

                optimizer.zero_grad()

                # Gradients are only tracked while training.
                with torch.set_grad_enabled(is_train):
                    outputs = model(inputs)
                    _, preds = torch.max(outputs, 1)
                    loss = criterion(outputs, labels)

                    if is_train:
                        loss.backward()
                        optimizer.step()

                # loss is a batch mean; weight it by the batch size so the
                # epoch value is a per-sample average.
                running_loss += loss.item() * inputs.size(0)
                running_corrects += (preds == labels).sum().item()
            if is_train:
                scheduler.step()

            epoch_loss = running_loss / dataset_sizes[phase]
            epoch_acc = running_corrects / dataset_sizes[phase]

            print('{} Loss: {:.4f} Acc: {:.4f}'.format(
                phase, epoch_loss, epoch_acc))

            # Snapshot the weights whenever validation accuracy improves.
            if phase == 'valid' and epoch_acc > best_acc:
                best_acc = epoch_acc
                best_model_wts = copy.deepcopy(model.state_dict())

        print()

    time_elapsed = time.time() - since
    print('Training complete in {:.0f}m {:.0f}s'.format(
        time_elapsed // 60, time_elapsed % 60))
    # '{:.4f}' (four decimals), matching the per-epoch accuracy line above;
    # the previous '{:4f}' was a width specifier, not a precision.
    print('Best val Acc: {:.4f}'.format(best_acc))

    model.load_state_dict(best_model_wts)
    return model

In [None]:
# Train for 20 epochs; `model` is rebound to the network returned by
# train_model, which has the best validation weights loaded.
model = train_model(model, criterion, optimizer, scheduler, num_epochs=20)

# 4. Test


In [None]:
# Directory that holds the competition files inside the Kaggle kernel.
input_path = "/kaggle/input/histopathologic-cancer-detection"

# The sample submission provides the table that predictions are written into.
sample_submission_csv = f"{input_path}/sample_submission.csv"
test_data = pd.read_csv(sample_submission_csv)

# Report the number of rows, then preview the first ones.
print(len(test_data))
test_data.head()

In [None]:
class Testset(torch.utils.data.Dataset):
    """Unlabelled test images listed in the sample submission CSV.

    Each item is a single image opened with PIL from
    ``<input_path>/test/<id>.tif``, where ``id`` comes from the matching row
    of ``<input_path>/sample_submission.csv``.

    Args:
        transform: Callable applied to the PIL image before it is returned.
            A falsy value (e.g. ``None``) returns the PIL image unchanged.
        input_path: Directory containing ``sample_submission.csv`` and the
            ``test/`` image folder. Defaults to the Kaggle mount point.
    """

    def __init__(self, transform,
                 input_path="/kaggle/input/histopathologic-cancer-detection"):
        self.input_path = input_path
        # Read via the instance attribute: the class must not depend on a
        # notebook-level `input_path` variable happening to exist.
        self.dataset = pd.read_csv(f"{self.input_path}/sample_submission.csv")
        self.img_path = f"{self.input_path}/test/"
        self.transform = transform

    def __len__(self):
        """Return the number of rows in the submission table."""
        return len(self.dataset)

    def __getitem__(self, idx):
        """Return the (optionally transformed) image at position ``idx``."""
        if torch.is_tensor(idx):
            idx = idx.tolist()

        # Only the id is needed: the test set has no labels to return.
        image_id = self.dataset.iloc[idx]["id"]
        image = Image.open(f'{self.img_path}{image_id}.tif')

        if self.transform:
            image = self.transform(image)

        return image

In [None]:
# Deterministic test-time preprocessing: resize, convert to a tensor, and
# normalize with mean 0.5 / std 0.5 per channel. No random augmentation.
data_transform = torchvision.transforms.Compose(
    [
        torchvision.transforms.Resize((224, 224)),
        torchvision.transforms.ToTensor(),
        torchvision.transforms.Normalize([0.5, 0.5, 0.5], [0.5, 0.5, 0.5]),
    ]
)

testset = Testset(transform=data_transform)

# One image per batch, served in file order (no shuffling).
test_loader = torch.utils.data.DataLoader(
    testset,
    batch_size=1,
    shuffle=False,
)

In [None]:
import numpy as np
# Predict a class index for every test image and store it in `test_data`.
# Put the model in evaluation mode explicitly rather than relying on whatever
# mode the last training step left it in.
model.eval()
predictions = []
# No gradients are needed for inference; skipping them saves memory and time.
with torch.no_grad():
    for i, images in enumerate(test_loader):
        if i % 10000 == 0:
            print(i, "/", len(test_loader))
        outputs = model(images.to(device))
        # argmax over the class dimension yields one prediction per image,
        # so this stays correct for any batch size.
        predictions.extend(torch.argmax(outputs, dim=1).tolist())

# test_loader is not shuffled, so the predictions are in the same order as
# the rows of test_data; assign the whole column at once.
test_data["label"] = predictions

In [None]:
# Write the predictions to submission.csv without the index column, then
# display the table as the cell output.
test_data.to_csv('submission.csv', index=False)
test_data