# Introduction
* Predicting protein organelle localization labels for each cell in the image.
* Dataset:  
    - There are four filter images for each cell. This implementation uses only three of them, as suggested by the competition host.
    - The red, yellow and blue filter images are stacked into a single 3-channel image (in the code the channel order is yellow, blue, red). The image is then resized, center-cropped and normalized before being passed to the model.
    - The final input size for the model is 224×224.
* Model - [EfficientNet](https://github.com/lukemelas/EfficientNet-PyTorch) :
    - A pretrained `efficientnet-b0` is used to get quick baseline results; these can be improved further with larger EfficientNet variants and additional optimization.
* Loss: `BCEWithLogitsLoss` from pytorch
* Optimizer: `AdamW` — Adam with weight decay, which generalizes better than plain Adam.

In [None]:
!pip install efficientnet_pytorch

# Importing Libraries

In [None]:
from efficientnet_pytorch import EfficientNet

import os 
import sys 
from glob import glob
from tqdm import tqdm

import numpy as np 
import pandas as pd 
pd.options.display.max_columns = 100

%matplotlib inline
import matplotlib.pyplot as plt

import torch 
torch.manual_seed(12)
from torch import nn 
import torch.nn.functional  as F 
from torch.utils.data import Dataset,DataLoader
import torchvision 
from torchvision import transforms, utils 
from PIL import Image

from sklearn.metrics import f1_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MultiLabelBinarizer

# Read CSV Files

In [None]:
# Directory that holds the per-filter PNG files read by the dataset class.
TRAIN_IMAGES = "../input/hpa-single-cell-image-classification/train/"
# Presumably the test-set counterpart of TRAIN_IMAGES; it is not referenced in the cells shown here.
TEST_IMAGES = "../input/hpa-single-cell-image-classification/test/"
# Training table; the cells below read its "ID" and "Label" columns.
train = pd.read_csv("../input/hpa-single-cell-image-classification/train.csv")
# Sample submission table, only inspected (columns/shape) in the next cell.
sample = pd.read_csv("../input/hpa-single-cell-image-classification/sample_submission.csv")

# Preliminary Study

In [None]:
# Basic facts about the two tables.
print("train columns", train.columns)
print("test columns", sample.columns)
print("train shape", train.shape, "test shape", sample.shape)

# Each label is a "|"-separated string of class ids; turn it into a list of ints.
train["Label"] = train["Label"].apply(
    lambda raw: [int(token) for token in raw.split("|")]
)

# Binarize the label lists into 19 indicator columns named 0..18.
mlb = MultiLabelBinarizer()
train[list(range(19))] = mlb.fit_transform(train["Label"])
print("different classes created", mlb.classes_)
train.head()

# Creating Dataset Generator

In [None]:
class ProteinOrganelleDataset(Dataset):
    """Dataset that yields a 3-channel image and its label row per image id.

    For every id, the files ``<id>_yellow.png``, ``<id>_blue.png`` and
    ``<id>_red.png`` under ``images_path`` are opened and stacked, in that
    order, along a new last axis before being wrapped in a PIL image.

    Args:
        images_path: Directory containing the per-filter PNG files.
        image_ids: Indexable collection of image ids.
        labels: 2-D array indexed as ``labels[idx, :]``, one row per id.
        transform: Optional callable applied to the assembled PIL image.
    """

    def __init__(self, images_path, image_ids, labels, transform=None):
        super().__init__()
        self.images_path = images_path
        self.image_ids = image_ids
        self.labels = labels
        self.transform = transform
        # The order of this list fixes the channel order of every sample.
        self.filters = ["yellow", "blue", "red"]

    def __len__(self):
        """Return the number of image ids."""
        return len(self.image_ids)

    def __getitem__(self, idx):
        """Return ``{"images": ..., "labels": ...}`` for position ``idx``."""
        sample_id = str(self.image_ids[idx])
        file_paths = [
            os.path.join(self.images_path, sample_id + "_" + colour + ".png")
            for colour in self.filters
        ]
        planes = [np.asarray(Image.open(path)) for path in file_paths]
        # Stack the single-filter planes on a new trailing (channel) axis.
        picture = Image.fromarray(np.stack(planes, axis=-1))
        if self.transform:
            picture = self.transform(picture)
        label_row = self.labels[idx, :]
        return {
            "images": picture,
            "labels": torch.tensor(label_row, dtype=torch.float),
        }

# Image Transformations

In [None]:
# Training-time preprocessing: resize to 250, centre-crop to 224x224,
# convert to a tensor and normalize each channel with the given mean/std
# (these are the commonly used ImageNet statistics).
train_transform = transforms.Compose(
    [
        transforms.Resize(size=250),
        transforms.CenterCrop(size=224),
        transforms.ToTensor(),
        transforms.Normalize(
            mean=[0.485, 0.456, 0.406],
            std=[0.229, 0.224, 0.225],
        ),
    ]
)

# Validation-time preprocessing: currently the same steps as for training.
validation_transform = transforms.Compose(
    [
        transforms.Resize(size=250),
        transforms.CenterCrop(size=224),
        transforms.ToTensor(),
        transforms.Normalize(
            mean=[0.485, 0.456, 0.406],
            std=[0.229, 0.224, 0.225],
        ),
    ]
)

# Prepare DataLoader

In [None]:
def dataloader(batchsize, test_size=0.25, random_state=None, num_workers=4):
    """Build the training and validation ``DataLoader`` objects.

    Reads the training CSV, converts the ``"|"``-separated ``Label`` strings
    into 19 indicator columns (named 0..18), splits the rows into a training
    and a validation part, and wraps each part in a
    ``ProteinOrganelleDataset``.

    Args:
        batchsize: Batch size used by both loaders.
        test_size: Fraction of rows held out for validation.
        random_state: Seed forwarded to ``train_test_split``. ``None`` keeps
            the split random on every call; pass an int for a reproducible
            split.
        num_workers: Number of worker processes used by each loader.

    Returns:
        Tuple ``(train_dataloader, validation_dataloader)``. Only the
        training loader shuffles.
    """
    dfx = pd.read_csv("../input/hpa-single-cell-image-classification/train.csv")
    dfx["Label"] = dfx["Label"].apply(
        lambda raw: [int(token) for token in raw.split("|")]
    )

    # Address the indicator columns by name instead of by position so the
    # label matrix stays correct even if other columns are appended later.
    label_columns = list(range(19))
    dfx[label_columns] = MultiLabelBinarizer().fit_transform(dfx["Label"])

    # Local names deliberately avoid shadowing the module-level ``train``.
    train_df, val_df = train_test_split(
        dfx, test_size=test_size, random_state=random_state
    )

    train_dataset = ProteinOrganelleDataset(
        images_path=TRAIN_IMAGES,
        image_ids=train_df.ID.values,
        labels=train_df[label_columns].values,
        transform=train_transform,
    )
    # Validation gets its own transform so that any augmentation added to
    # the training pipeline never leaks into evaluation.
    validation_dataset = ProteinOrganelleDataset(
        images_path=TRAIN_IMAGES,
        image_ids=val_df.ID.values,
        labels=val_df[label_columns].values,
        transform=validation_transform,
    )

    train_dataloader = DataLoader(
        train_dataset,
        batch_size=batchsize,
        shuffle=True,
        num_workers=num_workers,
    )
    validation_dataloader = DataLoader(
        validation_dataset,
        batch_size=batchsize,
        shuffle=False,
        num_workers=num_workers,
    )
    return train_dataloader, validation_dataloader


# Data Visualization

In [None]:
# Build the loaders with a batch size of 5 (the validation loader is discarded)
# and pull a single batch from the training loader for a visual check.
data_loader,_ = dataloader(5)
data = next(iter(data_loader))

def show(img):
    """Plot an image tensor on a new 20x50 matplotlib figure.

    Args:
        img: Tensor whose first axis is moved to the last position before
            plotting (axes are reordered as ``(1, 2, 0)``).
    """
    pixels = img.numpy()
    plt.figure(figsize=(20, 50))
    # imshow expects the channel axis last.
    plt.imshow(pixels.transpose((1, 2, 0)))
    
# Tile the batch's images into one grid (10 px padding; normalize=True rescales
# the values for display) and plot it.
show(utils.make_grid(data["images"],padding=10,normalize=True))

# Creating the Model

In [None]:
class EfficientNetProtein(nn.Module):
    """Thin wrapper around a pretrained ``efficientnet-b0`` classifier.

    Args:
        n_classes: Number of output units requested from the backbone
            (passed as ``num_classes``).
    """

    def __init__(self, n_classes=19):
        super().__init__()
        # NOTE: the attribute name is part of the saved state-dict keys.
        self.efficientnet = EfficientNet.from_pretrained(
            'efficientnet-b0',
            num_classes=n_classes,
        )

    def forward(self, img):
        """Return the backbone's raw outputs for the image batch ``img``."""
        return self.efficientnet(img)

# Training and Validation Functions

In [None]:
def train_fn(dataloader, model, optimizer, device):
    """Run one training pass over ``dataloader``.

    Args:
        dataloader: Iterable of batches with ``"images"`` and ``"labels"``.
        model: Network to optimise; switched to train mode here.
        optimizer: Optimizer stepped once per batch.
        device: Device the batch tensors are moved to.

    Returns:
        Tuple ``(mean_loss, macro_f1)``: the sum of per-batch losses divided
        by the number of batches, and the macro F1 of the rounded sigmoid
        outputs against the targets.
    """
    model.train()
    batch_losses = []
    predictions = []
    ground_truth = []

    for batch in tqdm(dataloader, total=len(dataloader)):
        images = batch["images"].to(device)
        targets = batch["labels"].to(device)

        optimizer.zero_grad()
        logits = model(images)
        loss = loss_fn(logits, targets)
        loss.backward()
        optimizer.step()

        batch_losses.append(loss.detach().cpu().numpy())
        # Sigmoid probabilities rounded to 0/1 serve as hard predictions.
        probabilities = torch.sigmoid(logits.detach().cpu())
        predictions.extend(np.round(probabilities.numpy()))
        ground_truth.extend(targets.detach().cpu().numpy())

    macro_f1 = f1_score(ground_truth, predictions, average="macro")
    mean_loss = sum(batch_losses) / len(dataloader)
    return mean_loss, macro_f1

def validation_fn(dataloader, model, device):
    """Evaluate ``model`` on ``dataloader`` without computing gradients.

    Args:
        dataloader: Iterable of batches with ``"images"`` and ``"labels"``.
        model: Network to evaluate; switched to eval mode here.
        device: Device the batch tensors are moved to.

    Returns:
        Tuple ``(mean_loss, macro_f1)``: the sum of per-batch losses divided
        by the number of batches, and the macro F1 of the rounded sigmoid
        outputs against the targets.
    """
    model.eval()
    batch_losses = []
    predictions = []
    ground_truth = []

    with torch.no_grad():
        for batch in tqdm(dataloader, total=len(dataloader)):
            images = batch["images"].to(device)
            targets = batch["labels"].to(device)

            logits = model(images)
            loss = loss_fn(logits, targets)

            batch_losses.append(loss.detach().cpu().numpy())
            # Sigmoid probabilities rounded to 0/1 serve as hard predictions.
            probabilities = torch.sigmoid(logits.detach().cpu())
            predictions.extend(np.round(probabilities.numpy()))
            ground_truth.extend(targets.detach().cpu().numpy())

    macro_f1 = f1_score(ground_truth, predictions, average="macro")
    mean_loss = sum(batch_losses) / len(dataloader)
    return mean_loss, macro_f1

def loss_fn(output, target):
    """Return the mean binary cross-entropy between logits and targets.

    Args:
        output: Raw (pre-sigmoid) model outputs.
        target: Float tensor of the same shape holding the 0/1 labels.
    """
    # Functional form of ``nn.BCEWithLogitsLoss`` with its default settings.
    return F.binary_cross_entropy_with_logits(output, target)

# Training the Model

In [None]:
# Number of passes over the training data performed by run().
EPOCHS=5
# File the best-scoring model weights are written to.
TRAINED_MODEL = "protein_efficientnet.pth"
# Use the GPU when one is available, otherwise fall back to the CPU.
DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")
def run():
    """Train ``EfficientNetProtein`` for ``EPOCHS`` epochs.

    After every epoch the model is evaluated on the validation loader; when
    the validation loss improves on the best value seen so far, the weights
    are saved to ``TRAINED_MODEL``. Per-epoch train/validation loss and F1
    are printed and finally written to a comma-separated text file named
    after the model class and the epoch count.
    """
    train_dataloader, validation_dataloader = dataloader(batchsize=64)

    model = EfficientNetProtein()
    model.to(DEVICE)
    optimizer = torch.optim.AdamW(model.parameters())
    print(f"TRAINING on {str(DEVICE).upper()}...")

    # One row per epoch: train loss, train F1, validation loss, validation F1.
    history = np.zeros(shape=(EPOCHS, 4))
    lowest_val_loss = np.inf

    for epoch in range(EPOCHS):
        train_loss, train_f1 = train_fn(train_dataloader, model, optimizer, DEVICE)
        validation_loss, val_f1 = validation_fn(validation_dataloader, model, DEVICE)

        # Checkpoint only when the validation loss improves.
        if validation_loss < lowest_val_loss:
            lowest_val_loss = validation_loss
            print(f"model saved at {lowest_val_loss:.4f} loss")
            torch.save(model.state_dict(), TRAINED_MODEL)

        print(f"Epoch:{epoch+1} | train: loss {train_loss:.5f} f1 {train_f1:.5f} | val: loss {validation_loss:.5f} f1 {val_f1:.5f}")
        history[epoch, :] = train_loss, train_f1, validation_loss, val_f1

    np.savetxt(f"{model.__class__.__name__}_{EPOCHS}.txt", history, delimiter=",")

In [None]:
# Start training: prints per-epoch metrics, saves the best weights and the metrics table.
run() 