# Baseline

## 0. Info

### Feature
* 


### Reference
* https://www.kaggle.com/anandsm7/hpa-starter-pytorch-eda-classification-nfnets

## 1. Settings

In [3]:
import os
from glob import glob
from tqdm.auto import tqdm

import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import cv2
from PIL import Image

import torch
import torch.nn as nn
import torch.nn.functional as F
import torchvision
import torchvision.transforms as transforms

In [220]:
class CONFIG:
    lr = 1e-4
    weight_decay = 1e-4
    n_class = 19
    image_size = 256
    batch_size = 32
    epoch_size = 3
    data_dir = '../input/hpa-single-cell-image-classification'

## 2. Data

In [153]:
CELL_LABEL = {
    0:  "Nucleoplasm", 
    1:  "Nuclear membrane",   
    2:  "Nucleoli",   
    3:  "Nucleoli fibrillar center" ,  
    4:  "Nuclear speckles",
    5:  "Nuclear bodies",
    6:  "Endoplasmic reticulum",   
    7:  "Golgi apparatus",
    8:  "Intermediate filaments",
    9:  "Actin filaments", 
    10: "Microtubules",
    11:  "Mitotic spindle",
    12:  "Centrosome",   
    13:  "Plasma membrane",
    14:  "Mitochondria",   
    15:  "Aggresome",
    16:  "Cytosol",   
    17:  "Vesicles and punctate cytosolic patterns",   
    18:  "Negative"
}

In [178]:
class Dataset(torch.utils.data.Dataset):
    def __init__(self, data, mode, transform=None):
        self.data = data
        self.mode = mode
        self.transform = transform
        self.img_dir = os.path.join(CONFIG.data_dir, mode)
    
    def __len__(self):
        return len(self.data)
    
    def __getitem__(self, idx):
        item = self.data.iloc[idx]
        _id = item['ID']
        
#         r = Image.open(os.path.join(CONFIG.data_dir, self.mode, f'{_id}_red.png'))
#         g = Image.open(os.path.join(CONFIG.data_dir, self.mode, f'{_id}_green.png'))
#         b = Image.open(os.path.join(CONFIG.data_dir, self.mode, f'{_id}_blue.png'))
#         img = np.stack([r, g, b], axis=-1)
#         img = Image.fromarray(img)
        img = Image.open(os.path.join(CONFIG.data_dir, self.mode, f'{_id}_green.png')).convert('RGB')
        
        if self.transform:
            img = self.transform(img)
        
        if self.mode == 'test':
            return img
        else:
            target = item['Label']
            target = [int(i) for i in target.split('|')]
            multi_hot = np.zeros(CONFIG.n_class)
            multi_hot[target] = 1
            return img, multi_hot

In [230]:
train_data = pd.read_csv(os.path.join(CONFIG.data_dir, 'train.csv'))
test_data = pd.read_csv(os.path.join(CONFIG.data_dir, 'sample_submission.csv'))
test_data['PredictionString'] = test_data['PredictionString'].apply(lambda x : x.split(' ')[-1])

display(train_data)
display(test_data)

Unnamed: 0,ID,Label
0,5c27f04c-bb99-11e8-b2b9-ac1f6b6435d0,8|5|0
1,5fb643ee-bb99-11e8-b2b9-ac1f6b6435d0,14|0
2,60b57878-bb99-11e8-b2b9-ac1f6b6435d0,6|1
3,5c1a898e-bb99-11e8-b2b9-ac1f6b6435d0,16|10
4,5b931256-bb99-11e8-b2b9-ac1f6b6435d0,14|0
...,...,...
21801,dd0989c4-bbca-11e8-b2bc-ac1f6b6435d0,14
21802,dd1f7fb8-bbca-11e8-b2bc-ac1f6b6435d0,3|0
21803,dd5cb36a-bbca-11e8-b2bc-ac1f6b6435d0,14|0
21804,df573730-bbca-11e8-b2bc-ac1f6b6435d0,14


Unnamed: 0,ID,ImageWidth,ImageHeight,PredictionString
0,0040581b-f1f2-4fbe-b043-b6bfea5404bb,2048,2048,eNoLCAgIMAEABJkBdQ==
1,004a270d-34a2-4d60-bbe4-365fca868193,2048,2048,eNoLCAgIMAEABJkBdQ==
2,00537262-883c-4b37-a3a1-a4931b6faea5,2048,2048,eNoLCAgIMAEABJkBdQ==
3,00c9a1c9-2f06-476f-8b0d-6d01032874a2,2048,2048,eNoLCAgIMAEABJkBdQ==
4,0173029a-161d-40ef-af28-2342915b22fb,3072,3072,eNoLCAgIsAQABJ4Beg==
...,...,...,...,...
554,fea47298-266a-4cf4-93bd-55d1bcc2fc7d,1728,1728,eNoLCAjJNgIABNkBkg==
555,feb955db-6c07-4717-a98b-92236c8e01d8,2048,2048,eNoLCAgIMAEABJkBdQ==
556,fefb9bb7-934a-40d1-8d2f-210265857388,2048,2048,eNoLCAgIMAEABJkBdQ==
557,ff069fa2-d948-408e-91b3-034cfea428d1,3072,3072,eNoLCAgIsAQABJ4Beg==


In [221]:
train_transform = transforms.Compose([
    transforms.Resize((CONFIG.image_size, CONFIG.image_size)),
    transforms.RandomHorizontalFlip(),
    transforms.RandomRotation((-180, 180)),
    transforms.ToTensor(),
    transforms.Normalize((0.485, 0.456, 0.406), (0.229, 0.224, 0.225))
])

test_transform = transforms.Compose([
    transforms.Resize((CONFIG.image_size, CONFIG.image_size)),
    transforms.ToTensor(),
    transforms.Normalize((0.485, 0.456, 0.406), (0.229, 0.224, 0.225))
])


train_dataset = Dataset(train_data, 'train', train_transform)
test_dataset = Dataset(test_data, 'test', test_transform)
train_dataset, valid_dataset = torch.utils.data.random_split(train_dataset, [20000, 1806])

train_loader = torch.utils.data.DataLoader(train_dataset, batch_size=CONFIG.batch_size, shuffle=True)
valid_loader = torch.utils.data.DataLoader(valid_dataset, batch_size=CONFIG.batch_size, shuffle=False)
test_loader = torch.utils.data.DataLoader(test_dataset, batch_size=4, shuffle=False)

In [173]:
x, y = next(iter(train_loader))
x.size(), y.size()

(torch.Size([32, 3, 256, 256]), torch.Size([32, 19]))

In [158]:
class Model(nn.Module):
    def __init__(self):
        super(Model, self).__init__()
        model = torchvision.models.resnet50()
        weights = torch.load('../input/pytorch-models/resnet50.pth')
        model.load_state_dict(weights)
        self.featurizer = nn.Sequential(*list(model.children())[:-2])
        self.classifier = nn.Sequential(
            nn.AdaptiveMaxPool2d(1),
            nn.Flatten(),
            nn.Linear(2048, CONFIG.n_class)
        )
        
    def forward(self, x):
        x = self.featurizer(x)
        x = self.classifier(x)
        return x

## 4. Train

In [159]:
class AverageMeter(object):
    def __init__(self, name):
        self.name = name
        self.reset()

    def reset(self):
        self.sum = 0
        self.count = 0
        self.avg = 0

    def update(self, val, n=1):
        self.sum += val * n
        self.count += n
        self.avg = self.sum / self.count

    def __str__(self):
        fmtstr = f'{self.name:10s} {self.avg:.3f}'
        return fmtstr


class ProgressMeter(object):
    def __init__(self, meters):
        self.meters = [AverageMeter(m) for m in meters]
    
    def reset(self):
        for m in self.meters:
            m.reset()
    
    def update(self, values, n=1):
        for m, v in zip(self.meters, values):
            m.update(v, n)
            self.__setattr__(m.name, m.avg)

    def log(self):
        msg = [str(meter) for meter in self.meters]
        msg = ' | '.join(msg)
        return msg


def accuracy(logits, targets):
    preds = torch.round(torch.sigmoid(logits))
    acc = (preds == targets).float().mean()
    return acc

In [242]:
class Trainer(object):
    def __init__(self, model, criterion, optimizer, scheduler, device):
        self.model = model.to(device)
        self.criterion = criterion
        self.optimizer = optimizer
        self.scheduler = scheduler
        self.device = device
        self.best_epoch, self.best_score = 0, 0
        

    def train(self, train_loader, epoch):
        progress = ProgressMeter(["train_loss", "train_acc"])
        self.model.train()

        pbar = tqdm(train_loader)
        pbar.set_description(f'TRAIN {epoch:03d}')
        for idx, (inputs, targets) in enumerate(pbar):
            inputs, targets = inputs.to(self.device), targets.to(self.device)
            outputs = self.model(inputs)
            loss = self.criterion(outputs, targets)
            self.optimizer.zero_grad()
            loss.backward()
            self.optimizer.step()

            loss = loss.item()
            acc = accuracy(outputs, targets).item()
            progress.update([loss, acc], n=inputs.size(0))
            pbar.set_postfix(log=progress.log())

        if self.scheduler:
            self.scheduler.step()

    
    def validate(self, valid_loader, epoch):
        progress = ProgressMeter(["valid_loss", "valid_acc"])
        self.model.eval()

        pbar = tqdm(valid_loader)
        pbar.set_description(f'VALID {epoch:03d}')
        with torch.no_grad():
            for idx, (inputs, targets) in enumerate(pbar):
                inputs, targets = inputs.to(device), targets.to(device)
                outputs = self.model(inputs)
                loss = self.criterion(outputs, targets)
                loss = loss.item()
                acc = accuracy(outputs, targets).item()
                progress.update([loss, acc], n=inputs.size(0))
                pbar.set_postfix(log=progress.log())

            if progress.valid_acc > self.best_score:
                self.best_epoch = epoch
                self.best_score = progress.valid_acc
                ckpt = {
                    'best_epoch': self.best_epoch,
                    'best_score': self.best_score,
                    'model_state_dict': self.model.state_dict()
                }
                torch.save(ckpt, 'ckpt.pt')

    
    def test(self, test_loader):
        progress = ProgressMeter(["test_loss", "test_acc"])
        ckpt = torch.load('ckpt.pt')
        self.model.load_state_dict(ckpt['model_state_dict'])
        self.model.eval()

        pbar = tqdm(test_loader)
        pbar.set_description(f'TEST')
        preds = []
        with torch.no_grad():
            for idx, inputs in enumerate(pbar):
                inputs = inputs.to(self.device)
                outputs = model(inputs)
                probs = torch.softmax(outputs, dim=1)
                probs, classes = probs.topk(1, dim=1)
                preds += [str(classes[i][0].item()) + ' ' + str(probs[i][0].item()) + ' ' for i in range(inputs.size(0))]
        return preds

In [243]:
device = 'cuda' if torch.cuda.is_available() else 'cpu'
model = Model()
criterion = nn.BCEWithLogitsLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=CONFIG.lr, weight_decay=1e-4)
scheduler = None
trainer = Trainer(model, criterion, optimizer, scheduler, device)

In [166]:
for ep in range(CONFIG.epoch_size):
    print('=' * 70)
    trainer.train(train_loader, ep)
    trainer.validate(valid_loader, ep)



  0%|          | 0/625 [00:00<?, ?it/s]

  0%|          | 0/57 [00:00<?, ?it/s]



  0%|          | 0/625 [00:00<?, ?it/s]

  0%|          | 0/57 [00:00<?, ?it/s]



  0%|          | 0/625 [00:00<?, ?it/s]

  0%|          | 0/57 [00:00<?, ?it/s]

## 5.Submission

In [244]:
preds = trainer.test(test_loader)

  0%|          | 0/140 [00:00<?, ?it/s]

In [246]:
test_data['PredictionString'] = pd.Series(preds) + test_data['PredictionString']
test_data.to_csv('submission.csv', index=False)
test_data.head()

Unnamed: 0,ID,ImageWidth,ImageHeight,PredictionString
0,0040581b-f1f2-4fbe-b043-b6bfea5404bb,2048,2048,13 0.5434622168540955 eNoLCAgIMAEABJkBdQ==
1,004a270d-34a2-4d60-bbe4-365fca868193,2048,2048,13 0.7916916012763977 eNoLCAgIMAEABJkBdQ==
2,00537262-883c-4b37-a3a1-a4931b6faea5,2048,2048,16 0.5044105648994446 eNoLCAgIMAEABJkBdQ==
3,00c9a1c9-2f06-476f-8b0d-6d01032874a2,2048,2048,0 0.3216249346733093 eNoLCAgIMAEABJkBdQ==
4,0173029a-161d-40ef-af28-2342915b22fb,3072,3072,16 0.9853707551956177 eNoLCAgIsAQABJ4Beg==
