In [None]:
import numpy as np 
import pandas as pd
import os
import cv2

import torch
import torch.nn as nn
import torchvision
import torchvision.transforms as transforms
from torch.utils.data import Dataset, DataLoader
import torch.nn.functional as nnf

In [None]:
# --- Problem definition -------------------------------------------------
# Number of target classes; sizes both the label vectors and the model head.
NUM_CL = 19

# --- Optimisation hyper-parameters --------------------------------------
BATCH, EPOCHS = 16, 15
LR = 1e-4

# Side length (pixels) every image is resized to before entering the network.
IM_SIZE = 256

# --- Runtime / data locations --------------------------------------------
# Use the first GPU when one is available, otherwise fall back to the CPU.
DEVICE = torch.device("cuda:0") if torch.cuda.is_available() else torch.device("cpu")

PATH = '/kaggle/input/hpa-single-cell-image-classification/'
TRAIN_DIR = f'{PATH}train/'
TEST_DIR = f'{PATH}test/'

In [None]:
# Training metadata; the 'ID' and 'Label' columns are consumed further down.
train = pd.read_csv(f'{PATH}train.csv')

# To shorten training while experimenting, keep only a subset of the rows:
# train = train[:1000]

train.head(5)

In [None]:
# Preprocessing pipeline shared by the train and test datasets:
# convert the image array to a tensor, then standardise each of the three
# channels with the fixed mean / std values below.
_to_tensor = transforms.ToTensor()
_normalize = transforms.Normalize(
    mean=(0.485, 0.456, 0.406),
    std=(0.229, 0.224, 0.225),
)
Transform = transforms.Compose([_to_tensor, _normalize])

In [None]:
class GetData(Dataset):
    """Dataset over the "green" channel images of a directory.

    Each item is loaded from ``<path><ID>_green.png``, resized to a square of
    ``img_size`` pixels, divided by 255 and passed through ``Transform``.

    Args:
        path: Directory prefix (including the trailing separator) of the images.
        list_IDs: Sequence of image IDs (file names without ``_green.png``).
        labels: Sequence of '|'-separated class-id strings aligned with
            ``list_IDs``, or ``None`` for an unlabeled (inference) dataset.
        img_size: Side length the image is resized to.
        Transform: Callable applied to the scaled image array.
        num_classes: Length of the multi-hot target vector. Defaults to the
            notebook-level ``NUM_CL`` when omitted.

    Items are ``(image, target)`` when ``labels`` is given and
    ``(image, ID)`` when ``labels`` is ``None``.
    """

    def __init__(self, path, list_IDs, labels, img_size, Transform, num_classes=None):
        self.path = path
        self.list_IDs = list_IDs
        self.labels = labels
        self.img_size = img_size
        self.transform = Transform
        # Fall back to the notebook-wide class count when none is supplied.
        self.num_classes = NUM_CL if num_classes is None else num_classes

    def __len__(self):
        """Number of images in the dataset."""
        return len(self.list_IDs)

    def _encode_label(self, label):
        """Turn a '|'-separated class string (e.g. '0|5') into a multi-hot vector."""
        class_ids = [int(c) for c in label.split('|')]
        target = np.zeros(self.num_classes, dtype='float')
        # Assignment (rather than summing one-hot rows) keeps every entry in
        # {0, 1} even if a class id is repeated in the label string.
        target[class_ids] = 1.0
        return target

    def __getitem__(self, index):
        """Load, resize and transform one image.

        Raises:
            FileNotFoundError: if the image file is missing or unreadable.
        """
        ID = self.list_IDs[index]

        # Only the "green" image of each sample is used.
        img_path = self.path + ID + '_green.png'
        data_file = cv2.imread(img_path)
        if data_file is None:
            # cv2.imread signals failure by returning None instead of raising.
            raise FileNotFoundError('Could not read image: ' + img_path)

        img = cv2.resize(data_file, (self.img_size, self.img_size))
        X = self.transform(img / 255.)

        # Branch on whether labels were provided instead of on the directory
        # name, so a path containing neither "train" nor "test" still works.
        if self.labels is None:
            return X, ID
        return X, self._encode_label(self.labels[index])

In [None]:
# Image IDs and their label strings, taken column-wise from the training CSV.
X_Train = train['ID'].values
Y_Train = train['Label'].values

# Labeled dataset over the training directory, served in shuffled mini-batches.
trainset = GetData(
    path=TRAIN_DIR,
    list_IDs=X_Train,
    labels=Y_Train,
    img_size=IM_SIZE,
    Transform=Transform,
)
trainloader = DataLoader(dataset=trainset, batch_size=BATCH, shuffle=True)

In [None]:
# Test image IDs are the file names with the '_green.png' suffix removed.
# The suffix is sliced off explicitly: str.rstrip() takes a *set of characters*
# to strip, not a suffix, so it only works by accident on these names.
# endswith() also avoids matching the pattern in the middle of a file name.
_GREEN_SUFFIX = '_green.png'
X_Test = sorted(
    name[:-len(_GREEN_SUFFIX)]
    for name in os.listdir(TEST_DIR)
    if name.endswith(_GREEN_SUFFIX)
)  # sorted: os.listdir order is arbitrary, this keeps runs reproducible

# Unlabeled dataset; batch_size=1 because the inference cell reads one image per step.
testset = GetData(TEST_DIR, X_Test, None, IM_SIZE, Transform)
testloader = DataLoader(testset, batch_size=1, shuffle=False)

In [None]:
# ResNet-34 backbone, created with the constructor defaults.
model = torchvision.models.resnet34()
# Replace the classification head with one output per class. The input width
# is read from the existing head instead of being hard-coded, so swapping the
# backbone does not silently break the layer shapes.
model.fc = nn.Linear(model.fc.in_features, NUM_CL, bias=True)
model = model.to(DEVICE)

# Multi-label objective: an independent sigmoid / binary cross-entropy per class.
criterion = nn.BCEWithLogitsLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=LR)

# TRAIN

In [None]:
%%time

for epoch in range(EPOCHS):
    model.train()

    tr_loss = 0.0
    n_batches = 0

    for images, labels in trainloader:
        # Inputs and targets are moved to the device and cast to float32 so
        # that logits and targets share a dtype inside the loss.
        images = images.to(DEVICE).float()
        labels = labels.to(DEVICE).float()

        optimizer.zero_grad()
        logits = model(images)
        loss = criterion(logits, labels)
        loss.backward()
        optimizer.step()

        tr_loss += loss.item()
        n_batches += 1

    # Average over the number of batches actually processed. Dividing by the
    # last enumerate index would be off by one and fail for a single batch.
    print('Epoch: %d | Loss: %.4f'%(epoch, tr_loss / max(n_batches, 1)))

# Leave the model in inference mode once training is finished.
model.eval()

# TEST

In [None]:
%%time

# One record per test image: [ID, width, height, prediction string].
s_ls = []

# Encoded mask string appended to each prediction, selected by image height.
# NOTE(review): these look like pre-encoded whole-image masks, one per
# supported resolution — confirm against the competition's mask encoding.
_MASK_BY_HEIGHT = {
    2048: 'eNoLCAgIMAEABJkBdQ==',
    1728: 'eNoLCAjJNgIABNkBkg==',
}
_DEFAULT_MASK = 'eNoLCAgIsAQABJ4Beg=='

model.eval()
with torch.no_grad():
    # The loader yields one image per step, hence the [0] indexing below.
    for image, fname in testloader:
        image_id = fname[0]

        logits = model(image.to(DEVICE).float())
        # The model is trained with BCEWithLogitsLoss (independent per-class
        # sigmoids), so the class confidence is sigmoid(logit). Softmax would
        # couple the classes and report a confidence the model was never
        # trained to produce. The top-1 class is the same either way.
        prob = torch.sigmoid(logits)
        p, top_class = prob.topk(1, dim=1)
        sp = ' '.join(str(e) for e in [top_class[0][0].item(), p[0][0].item()])

        # Re-read the original file to recover its native size.
        img_path = TEST_DIR + image_id + '_green.png'
        img = cv2.imread(img_path)
        if img is None:
            # cv2.imread returns None on failure instead of raising.
            raise FileNotFoundError('Could not read image: ' + img_path)

        height, width = img.shape[0], img.shape[1]
        sp = sp + ' ' + _MASK_BY_HEIGHT.get(height, _DEFAULT_MASK)

        s_ls.append([image_id, width, height, sp])

In [None]:
# Assemble the per-image prediction records into the submission table.
_SUB_COLUMNS = ['ID', 'ImageWidth', 'ImageHeight', 'PredictionString']
sub = pd.DataFrame.from_records(s_ls, columns=_SUB_COLUMNS)

# Row count first, then a preview of the first rows.
print(sub.shape[0])
sub.head(5)

In [None]:
# Write the submission file into the working directory, without the index column.
sub.to_csv(path_or_buf="submission.csv", index=False)