In [None]:
# Install & import libraries

!pip install efficientnet_pytorch
!pip install "../input/keras-application/Keras_Applications-1.0.8-py3-none-any.whl"
!pip install "../input/efficientnet111/efficientnet-1.1.1-py3-none-any.whl"
!pip install "../input/pycocotools/pycocotools-2.0-cp37-cp37m-linux_x86_64.whl"
!pip install "../input/hpapytorchzoozip/pytorch_zoo-master"
!pip install "../input/tfexplainforoffline/tf_explain-0.2.1-py3-none-any.whl"

import numpy as np 
import pandas as pd
import os, gc, cv2

import torch
import torch.nn as nn
import torchvision
import torchvision.transforms as transforms
from torch.utils.data import Dataset, DataLoader
import torch.nn.functional as nnf
import sklearn
from sklearn.metrics import confusion_matrix
import matplotlib.pyplot as plt
import gc

from tqdm.notebook import tqdm

from efficientnet_pytorch import EfficientNet

In [None]:
# Sanity check: load the four per-channel arrays of one sample and preview it.
sample_dir = '../input/segmented-train/'
red, blue, yellow, green = (
    np.load(f'{sample_dir}{channel}/000004_{channel}.npy')
    for channel in ('red', 'blue', 'yellow', 'green')
)

# Only three planes are handed to imshow (red, blue, yellow);
# the green array is loaded but not part of the preview.
preview = np.stack((red, blue, yellow), axis=2)
plt.imshow(preview)

# Label table of the same dataset; the cell displays its first rows.
df_labels = pd.read_csv(sample_dir + 'labels.csv')
df_labels.head()

In [None]:
# Hyper parameters
NUM_CL = 19        # passed as num_classes when the EfficientNet is created

BATCH = 16         # DataLoader batch size
EPOCHS = 3         # max 5

LR = 1e-4          # Adam learning rate
IM_SIZE = 256      # every image is resized to IM_SIZE x IM_SIZE

# Rotation flag
# True : load the weights stored in ../input/b0-3epochs and train on the
#        180-degree rotated dataset (GetData_single_cell_180)
# False: train on the un-rotated dataset (GetData_single_cell)
ROT = True

# Use the first GPU when one is visible, otherwise fall back to the CPU.
DEVICE = torch.device("cuda:0") if torch.cuda.is_available() else torch.device("cpu")

# Dataset locations
PATH = '/kaggle/input/'
TRAIN_DIR = f'{PATH}segmented-train/'
TRAIN_multi_DIR = f'{PATH}segmented-multi-60000/'

***Helper functions***

In [None]:
# Extend the size of an image by padding
def add_margin(image, size):
    '''
    Zero-pad an image so that its first two axes both have length ``size``.

    Height * Width * Channel -> size * size * Channel
    (a plain Height * Width array is accepted as well -> size * size)

    The padding of each axis is split between both sides; when the amount is
    odd, the extra row/column is added at the end (bottom / right).

    Parameters
    ----------
    image : np.ndarray
        Array with at least two dimensions; only axes 0 and 1 are padded.
    size : int
        Target length of axes 0 and 1. Must not be smaller than either of them.

    Returns
    -------
    np.ndarray
        Padded copy of ``image`` with shape ``(size, size, ...)``.

    Raises
    ------
    ValueError
        If ``image`` has fewer than two dimensions or is larger than ``size``
        along axis 0 or 1.
    '''
    if image.ndim < 2:
        raise ValueError(
            'add_margin expects an array with at least 2 dimensions, got %d' % image.ndim)

    H, W = image.shape[:2]
    if size < H or size < W:
        # np.pad would fail here as well, but with an obscure
        # "index can't contain negative values" message.
        raise ValueError(
            'target size %d is smaller than the image (%d x %d)' % (size, H, W))

    pad_H1 = (size - H) // 2
    pad_H2 = (size - H) - pad_H1
    pad_W1 = (size - W) // 2
    pad_W2 = (size - W) - pad_W1

    # Axes beyond the first two (e.g. channels) are left untouched.
    pad_width = [(pad_H1, pad_H2), (pad_W1, pad_W2)] + [(0, 0)] * (image.ndim - 2)
    return np.pad(image, pad_width, 'constant')

# Make the image square by padding
def resize_to_square(image):
    '''
    Pad an image to a square whose side equals its longer spatial dimension.
    Height * Width * Channel -> max(H, W) * max(H, W) * Channel

    The padding itself is delegated to add_margin.
    '''
    height, width, _ = image.shape
    side = height if height >= width else width
    return add_margin(image, side)


In [None]:
# Label tables; the first CSV column becomes the index, and the index values
# are the IDs used later to select rows and to build the .npy file names.
df_labels = pd.read_csv('../input/segmented-train/labels.csv', index_col=0)
df_labels_multi = pd.read_csv('../input/segmented-multi-60000/labels.csv', index_col=0)

# Plain Python lists of the IDs, in file order.
list_IDs, list_IDs_multi = (
    frame.index.tolist() for frame in (df_labels, df_labels_multi)
)


In [None]:
# Dataset class for cell-level classification
# For training dataset
class GetData_single_cell(Dataset):
    '''
    Dataset of images stored as one .npy file per colour channel.

    For an ID, the files ``<path><channel>/<ID zero-filled to 6>_<channel>.npy``
    are loaded for the channels red, blue, yellow and green (in that order),
    stacked along the last axis, padded to a square, resized to
    ``img_size x img_size``, divided by 255 and returned channel-first.

    Parameters
    ----------
    path : str
        Dataset root; must end with a path separator because file names are
        built by string concatenation.
    list_IDs : sequence
        IDs served by this dataset; ``len(dataset) == len(list_IDs)``.
    df_labels : pandas.DataFrame
        Label table; ``df_labels.loc[ID]`` is returned as the target.
    img_size : int
        Side length of the resized image.
    Transform : optional
        Stored on ``self.transform``. The historical default is the string
        ``'None'``; both it and ``None`` are stored as ``None``.
        NOTE(review): the transform is not applied in ``__getitem__``.
    '''

    # Channel order of the stacked image.
    CHANNELS = ('red', 'blue', 'yellow', 'green')

    def __init__(self, path, list_IDs, df_labels, img_size, Transform='None'):
        self.path = path
        self.list_IDs = list_IDs
        self.labels = df_labels
        self.img_size = img_size
        no_transform = Transform is None or (isinstance(Transform, str) and Transform == 'None')
        self.transform = None if no_transform else Transform

    def __len__(self):
        return len(self.list_IDs)

    def _load_image(self, ID):
        '''Load the four channel files of ``ID`` and stack them along the last axis.'''
        stem = str(ID).zfill(6)
        planes = [
            np.load(self.path + channel + "/" + stem + '_' + channel + '.npy')
            for channel in self.CHANNELS
        ]
        return np.dstack(planes)

    def _label_tensor(self, ID):
        '''Return the label row of ``ID`` as a float32 tensor.'''
        # Convert through NumPy first: giving torch.tensor a pandas Series makes
        # it index the Series by integer position, which pandas has deprecated
        # for label-based indexes.
        row = np.asarray(self.labels.loc[ID], dtype=np.float32)
        return torch.tensor(row, dtype=torch.float)

    def __getitem__(self, index):
        '''Return ``(X, y)``: the channel-first image array and the label tensor.'''
        ID = self.list_IDs[index]

        img = self._load_image(ID)
        img = resize_to_square(img)
        img = cv2.resize(img, (self.img_size, self.img_size))
        X = img/255.
        X = np.transpose(X, (2, 0, 1))  # H, W, C -> C, H, W

        return X, self._label_tensor(ID)


In [None]:
class GetData_single_cell_180(Dataset):
    '''
    Dataset of images stored as one .npy file per colour channel, with every
    channel rotated by 180 degrees after loading.

    For an ID, the files ``<path><channel>/<ID zero-filled to 6>_<channel>.npy``
    are loaded for the channels red, blue, yellow and green (in that order),
    each rotated with ``np.rot90(..., k=2)``, stacked along the last axis,
    padded to a square, resized to ``img_size x img_size``, divided by 255 and
    returned channel-first.

    Parameters
    ----------
    path : str
        Dataset root; must end with a path separator because file names are
        built by string concatenation.
    list_IDs : sequence
        IDs served by this dataset; ``len(dataset) == len(list_IDs)``.
    df_labels : pandas.DataFrame
        Label table; ``df_labels.loc[ID]`` is returned as the target.
    img_size : int
        Side length of the resized image.
    Transform : optional
        Stored on ``self.transform``. The historical default is the string
        ``'None'``; both it and ``None`` are stored as ``None``.
        NOTE(review): the transform is not applied in ``__getitem__``.
    '''

    # Channel order of the stacked image.
    CHANNELS = ('red', 'blue', 'yellow', 'green')
    # Number of quarter turns applied to each channel (2 -> 180 degrees).
    ROT_K = 2

    def __init__(self, path, list_IDs, df_labels, img_size, Transform='None'):
        self.path = path
        self.list_IDs = list_IDs
        self.labels = df_labels
        self.img_size = img_size
        no_transform = Transform is None or (isinstance(Transform, str) and Transform == 'None')
        self.transform = None if no_transform else Transform

    def __len__(self):
        return len(self.list_IDs)

    def _load_image(self, ID):
        '''Load, rotate and stack the four channel files of ``ID``.'''
        stem = str(ID).zfill(6)
        planes = [
            np.rot90(np.load(self.path + channel + "/" + stem + '_' + channel + '.npy'), k=self.ROT_K)
            for channel in self.CHANNELS
        ]
        return np.dstack(planes)

    def _label_tensor(self, ID):
        '''Return the label row of ``ID`` as a float32 tensor.'''
        # Convert through NumPy first: giving torch.tensor a pandas Series makes
        # it index the Series by integer position, which pandas has deprecated
        # for label-based indexes.
        row = np.asarray(self.labels.loc[ID], dtype=np.float32)
        return torch.tensor(row, dtype=torch.float)

    def __getitem__(self, index):
        '''Return ``(X, y)``: the channel-first image array and the label tensor.'''
        ID = self.list_IDs[index]

        img = self._load_image(ID)
        img = resize_to_square(img)
        img = cv2.resize(img, (self.img_size, self.img_size))
        X = img/255.
        X = np.transpose(X, (2, 0, 1))  # H, W, C -> C, H, W

        return X, self._label_tensor(ID)


In [None]:
# Split the dataset into training data and validation data
# The first 80% of list_IDs is used for training, the remainder for validation.
num_train = round(len(list_IDs) * 0.8)
num_valid = len(list_IDs) - num_train
print(num_train, num_valid)

# Size of the rotated training subset: 42583, the same number as the number of
# weakly labeled dataset. It is capped at num_train so that the slice
# list_IDs[:num_train_180] can never reach into the validation part
# list_IDs[num_train:].
num_train_180 = min(42583, num_train)

In [None]:
# Un-rotated training set (also used later for the train-set evaluation).
trainset = GetData_single_cell(
    path=TRAIN_DIR,
    list_IDs=list_IDs[:num_train],
    df_labels=df_labels,
    img_size=IM_SIZE,
    Transform=None,
)
# 180-degree rotated training set, limited to the first num_train_180 IDs.
trainset_180 = GetData_single_cell_180(
    path=TRAIN_DIR,
    list_IDs=list_IDs[:num_train_180],
    df_labels=df_labels,
    img_size=IM_SIZE,
    Transform=None,
)

# ROT selects which of the two datasets the model is trained on.
trainloader = DataLoader(trainset_180 if ROT else trainset, batch_size=BATCH, shuffle=True)

In [None]:
# EfficientNet-B0 with a 4-channel input stem and NUM_CL output logits.
model = EfficientNet.from_pretrained('efficientnet-b0',num_classes=NUM_CL, in_channels=4)

if ROT:
    # Continue from the previously trained weights. map_location makes the
    # checkpoint loadable on the CPU fallback as well, even when it was saved
    # from a GPU.
    model.load_state_dict(torch.load('../input/b0-3epochs/state_dict.pth', map_location=DEVICE))

model = model.to(DEVICE)

# One independent sigmoid/binary cross-entropy term per class.
criterion = nn.BCEWithLogitsLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=LR)


# TRAIN

In [None]:
%%time
# Reference: https://www.kaggle.com/ateplyuk/hpa-pytorch-starter-code

# Train for EPOCHS passes over trainloader and report the mean batch loss.
for epoch in range(EPOCHS):
    tr_loss = 0.0
    num_batches = 0

    model = model.train()

    for images, labels in trainloader:
        # Cast to float32 before the transfer so only half the bytes are
        # copied to the device; the values are identical to casting afterwards.
        images = images.float().to(DEVICE)
        labels = labels.to(DEVICE)
        logits = model(images)
        loss = criterion(logits, labels)
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        tr_loss += loss.detach().item()
        num_batches += 1

        del images, labels

    # A full collection per batch is expensive; once per epoch is enough to
    # reclaim reference cycles.
    gc.collect()

    model.eval()
    # Average over the number of batches actually processed (the last loop
    # index would be one too small and zero for a single batch).
    print('Epoch: %d | Loss: %.4f'%(epoch, tr_loss / max(num_batches, 1)))

In [None]:
# Evaluate the model for train dataset
# Produces three row-aligned arrays:
#   np_prob   - softmax over the logits of every sample
#   np_preds  - boolean mask marking the maximum probability of every row
#   np_labels - the target rows delivered by the dataset
trainloader_eval = DataLoader(trainset, batch_size=BATCH, shuffle=False)

prob_batches = []
label_batches = []

with torch.no_grad():
    model.eval()
    for images, labels in trainloader_eval:
        images = images.float().to(DEVICE)
        logits = model(images)
        # NOTE(review): the model is trained with BCEWithLogitsLoss, whose
        # matching activation is a per-class sigmoid; softmax is kept here so
        # the exported probabilities stay unchanged - confirm this is intended.
        prob = nnf.softmax(logits, dim=1)

        # One device-to-host copy per batch; the labels never need to leave
        # the CPU.
        prob_batches.append(prob.cpu().numpy())
        label_batches.append(labels.numpy())

        del images, labels

# Concatenate once instead of re-stacking the growing arrays in every
# iteration. Raises ValueError if the loader yielded no batch.
np_prob = np.concatenate(prob_batches, axis=0)
np_labels = np.concatenate(label_batches, axis=0)
np_preds = np_prob.max(axis=1, keepdims=True) == np_prob

del prob_batches, label_batches
gc.collect()
            

In [None]:
# multi-class evaluation
# https://vitalflux.com/micro-average-macro-average-scoring-metrics-multi-class-classification-python/
# Reduce every row to a single class index: position of the first maximum.
y_true = np_labels.argmax(axis=1)
y_pred = np_preds.argmax(axis=1)

# Summary scores (kept in variables; only the report below is printed).
acc = sklearn.metrics.accuracy_score(y_true, y_pred)
precision = sklearn.metrics.precision_score(y_true, y_pred, average='micro')
f1 = sklearn.metrics.f1_score(y_true, y_pred, average='micro')

# Per-class precision / recall / f1 table.
report = sklearn.metrics.classification_report(y_true, y_pred)
print(report)

In [None]:
# Output csv file
# Labels are binarised (non-zero -> 1). The builtin bool is used because the
# np.bool alias was deprecated in NumPy 1.20 and removed in 1.24.
pd_labels = pd.DataFrame(np_labels.astype(bool).astype(int))
pd_prob = pd.DataFrame(np_prob)

# Rows of both files are in the same order, so they can be joined on the index.
pd_labels.to_csv("labels_train.csv", index=True)
pd_prob.to_csv("prob_train.csv", index=True)

# Save the trained model
torch.save(model.state_dict(), 'state_dict.pth')

# VALIDATION

In [None]:
# Validation set: every ID that is not part of the first num_train training IDs.
validset = GetData_single_cell(path=TRAIN_DIR, list_IDs=list_IDs[num_train:], df_labels=df_labels,img_size=IM_SIZE, Transform=None)
# No shuffling for evaluation: the exported rows then follow list_IDs[num_train:]
# and are reproducible between runs (same as the train evaluation loader).
validloader = DataLoader(validset, batch_size=BATCH, shuffle=False)

In [None]:
# Evaluate the model for validation dataset
# Produces three row-aligned arrays:
#   np_prob   - softmax over the logits of every sample
#   np_preds  - boolean mask marking the maximum probability of every row
#   np_labels - the target rows delivered by the dataset

prob_batches = []
label_batches = []

with torch.no_grad():
    model.eval()
    for images, labels in validloader:
        images = images.float().to(DEVICE)
        logits = model(images)
        # NOTE(review): the model is trained with BCEWithLogitsLoss, whose
        # matching activation is a per-class sigmoid; softmax is kept here so
        # the exported probabilities stay unchanged - confirm this is intended.
        prob = nnf.softmax(logits, dim=1)

        # One device-to-host copy per batch; the labels never need to leave
        # the CPU.
        prob_batches.append(prob.cpu().numpy())
        label_batches.append(labels.numpy())

        del images, labels

# Concatenate once instead of re-stacking the growing arrays in every
# iteration. Raises ValueError if the loader yielded no batch.
np_prob = np.concatenate(prob_batches, axis=0)
np_labels = np.concatenate(label_batches, axis=0)
np_preds = np_prob.max(axis=1, keepdims=True) == np_prob

del prob_batches, label_batches
gc.collect()

In [None]:
# multi-class evaluation
# https://vitalflux.com/micro-average-macro-average-scoring-metrics-multi-class-classification-python/
# Reduce every row to a single class index: position of the first maximum.
y_true = np_labels.argmax(axis=1)
y_pred = np_preds.argmax(axis=1)

# Summary scores (kept in variables; only the report below is printed).
acc = sklearn.metrics.accuracy_score(y_true, y_pred)
precision = sklearn.metrics.precision_score(y_true, y_pred, average='micro')
f1 = sklearn.metrics.f1_score(y_true, y_pred, average='micro')

# Per-class precision / recall / f1 table.
report = sklearn.metrics.classification_report(y_true, y_pred)
print(report)

In [None]:
# Output csv file
# Labels are binarised (non-zero -> 1). The builtin bool is used because the
# np.bool alias was deprecated in NumPy 1.20 and removed in 1.24.
pd_labels = pd.DataFrame(np_labels.astype(bool).astype(int))
pd_prob = pd.DataFrame(np_prob)

# Rows of both files are in the same order, so they can be joined on the index.
pd_labels.to_csv("labels_valid.csv", index=True)
pd_prob.to_csv("prob_valid.csv", index=True)