In [None]:
# from https://www.kaggle.com/samusram/hpa-rgb-model-rgby-cell-level-classification
!pip install "../input/keras-application/Keras_Applications-1.0.8-py3-none-any.whl"
!pip install "../input/efficientnet111/efficientnet-1.1.1-py3-none-any.whl"
!pip install "../input/tfexplainforoffline/tf_explain-0.2.1-py3-none-any.whl"

In [None]:
# added by myself
!pip install "../input/efficientnet-pytorch/EfficientNet-PyTorch/EfficientNet-PyTorch-master"
!pip install natsort

In [None]:
# original in the 'Even Faster HPA Cell Segmentation'
!pip install "../input/pycocotools/pycocotools-2.0-cp37-cp37m-linux_x86_64.whl"
!pip install "../input/hpapytorchzoozip/pytorch_zoo-master"
!pip install "../input/hpacellsegmentatorraman/HPA-Cell-Segmentation/"

In [None]:
import warnings
with warnings.catch_warnings():
    warnings.simplefilter("ignore")
    import numpy as np
    import pandas as pd
    import os
    from tqdm import tqdm

    import os.path
    import urllib
    import zipfile

    from hpacellseg.cellsegmentator import *
    from hpacellseg import cellsegmentator, utils
    import cv2

    import scipy.ndimage as ndi
    from skimage import filters, measure, segmentation, transform, util
    from skimage.morphology import (binary_erosion, closing, disk,
                                    remove_small_holes, remove_small_objects)

    from PIL import Image
    import matplotlib.pyplot as plt
    
import time

In [None]:
from efficientnet_pytorch import EfficientNet

import torch
import torch.nn as nn
import torch.optim as optim 

import torchvision
from torch.utils.data import DataLoader, Dataset
import torch.utils.data
from torchvision import transforms

import random
from datetime import datetime

In [None]:
# Human-readable names of the 19 classes.
# NOTE(review): presumably position i is the name of integer label i — confirm against the dataset docs.
LBL_NAMES = [
    "Nucleoplasm",
    "Nuclear Membrane",
    "Nucleoli",
    "Nucleoli Fibrillar Center",
    "Nuclear Speckles",
    "Nuclear Bodies",
    "Endoplasmic Reticulum",
    "Golgi Apparatus",
    "Intermediate Filaments",
    "Actin Filaments",
    "Microtubules",
    "Mitotic Spindle",
    "Centrosome",
    "Plasma Membrane",
    "Mitochondria",
    "Aggresome",
    "Cytosol",
    "Vesicles",
    "Negative",
]

In [None]:
# Load the image-level training annotations.
train_csv = pd.read_csv('../input/hpa-single-cell-image-classification/train.csv')

# Keep only the images annotated with a single label (multi-label rows contain a '|'
# separator), because those are what the model training below uses.
# `regex=False` matches the literal '|' character; the previous pattern '\|' was written
# in a non-raw string and therefore relied on an invalid escape sequence.
# `.copy()` makes the filtered frame independent of `train_csv`, so assigning to its
# 'Label' column below does not trigger pandas' SettingWithCopyWarning.
single_label_train = train_csv[~train_csv['Label'].str.contains('|', regex=False)].copy()
single_label_train['Label'] = single_label_train['Label'].astype('int')
single_label_train = single_label_train.sort_values(by = 'Label')
single_label_train

In [None]:
# Lookup table: image ID -> integer label, built from the single-label rows only.
image_id_to_label = {
    img_id: lbl
    for img_id, lbl in zip(single_label_train['ID'], single_label_train['Label'])
}

# Three parallel lists for the first crop directory: file name, image ID and label.
# The image ID is the part of the file name before the first underscore.
file_name_lst_1 = os.listdir(
    '../input/hpatrainsegmentation2241sthalf/input/fast-cell-segmentation-outsize-224-first-half/train_cell_segmentation_224_1st_half')
image_ID_lst_1 = [name.partition('_')[0] for name in file_name_lst_1]
# Raises KeyError if a file belongs to an image that is not in `image_id_to_label`.
label_lst_1 = list(map(image_id_to_label.__getitem__, image_ID_lst_1))

In [None]:
# Three parallel lists for the second crop directory: file name, image ID and label.
file_name_lst_2_orig = os.listdir(
    "../input/hpa-224-2nd-half/input/zeyusss/train_cell_segmentation")

# Only keep files whose image ID (text before the first underscore) has a known
# single label; everything else in the directory is ignored.
file_name_lst_2 = list(
    filter(lambda name: name.partition('_')[0] in image_id_to_label, file_name_lst_2_orig))
image_ID_lst_2 = [name.partition('_')[0] for name in file_name_lst_2]
label_lst_2 = list(map(image_id_to_label.__getitem__, image_ID_lst_2))

In [None]:
# Image ID -> directory selector: 0 for IDs found in the first directory listing,
# 1 for IDs found in the second. An ID present in both ends up as 1, because the
# second mapping is merged last and overwrites the first.
image_ID_half_dict = {
    **dict.fromkeys(image_ID_lst_1, 0),
    **dict.fromkeys(image_ID_lst_2, 1),
}

In [None]:
# Concatenate the per-directory lists (first directory first) so that index i refers
# to the same crop in all three lists.
file_name_lst = [*file_name_lst_1, *file_name_lst_2]
image_ID_lst = [*image_ID_lst_1, *image_ID_lst_2]
label_lst = [*label_lst_1, *label_lst_2]

In [None]:
# Sanity check: total number of crop files collected from both directories.
len(file_name_lst)

In [None]:
# Loading all ~85000 images at once runs out of memory, so the data is streamed
# through a Dataset/DataLoader instead, following
# https://stanford.edu/~shervine/blog/pytorch-how-to-generate-data-parallel

# Train / validation / test split, roughly 0.8 / 0.1 / 0.1.
# Every source image is assigned one bucket in 0..9 (0-7 train, 8 validation, 9 test)
# and all crops of that image inherit it. Drawing the bucket per image rather than per
# crop keeps crops of the same image out of different splits, which would otherwise leak
# near-duplicate samples from training into validation/test.
# A dedicated, seeded generator makes the split reproducible across kernel restarts
# without touching NumPy's global random state.
SPLIT_SEED = 42
split_rng = np.random.RandomState(SPLIT_SEED)
unique_image_ids = sorted(set(image_ID_lst))
bucket_by_image_id = dict(
    zip(unique_image_ids, split_rng.randint(10, size = len(unique_image_ids)).tolist()))
rand_lst = [bucket_by_image_id[image_id] for image_id in image_ID_lst]

tmp_df_orig = pd.DataFrame(
    list(zip(file_name_lst, image_ID_lst, label_lst, rand_lst)), columns =['file_name', 'image_id', 'label', 'rand'])

# Shuffle all rows (frac=1 keeps every row). Use frac < 1, or n=<N_SAMPLE>, to work on a
# small subset for quick experiments.
tmp_df = tmp_df_orig.sample(frac=1, random_state=SPLIT_SEED)

train_df = tmp_df[tmp_df['rand'] <= 7].copy()
valid_df = tmp_df[tmp_df['rand'] == 8].copy()
test_df = tmp_df[tmp_df['rand'] == 9].copy()

In [None]:
# (rows, columns) of the full, unshuffled file table.
tmp_df_orig.shape

In [None]:
# References:
# https://discuss.pytorch.org/t/how-to-load-images-without-using-imagefolder/59999/2
# https://pytorch.org/tutorials/recipes/recipes/custom_dataset_transforms_loader.html

from torch.utils.data import Dataset
import natsort

class CustomDataSet(Dataset):
    """Dataset of image files that are spread over two directories.

    The image ID of a file is the part of its name before the first underscore.
    That ID selects both the directory the file is read from and its label.

    Args:
        main_dir_1: directory used for files whose image ID maps to 0 in
            ``img_id_to_dir``.
        main_dir_2: directory used for all other files.
        file_ids: list of file names (e.g. ``['xxxx_1.png', 'yyyy_3.png']``) that make up
            this dataset. Every name must exist in ``main_dir_1`` or ``main_dir_2``.
        img_id_to_label: mapping from image ID to label.
        img_id_to_dir: mapping from image ID to directory selector (0 -> ``main_dir_1``,
            anything else -> ``main_dir_2``).
        transform: optional callable applied to the loaded RGB PIL image. When ``None``
            the PIL image is returned unchanged.

    Raises:
        ValueError: if any entry of ``file_ids`` is found in neither directory.
    """

    def __init__(self, main_dir_1, main_dir_2, file_ids, img_id_to_label, img_id_to_dir, transform = None):
        self.main_dir = main_dir_1
        self.main_dir_2 = main_dir_2 # because we have two directories
        self.transform = transform

        # Validate up front so a bad file list fails here with a clear message instead of
        # leaving a half-initialised object that breaks later in __len__/__getitem__.
        available = set(os.listdir(main_dir_1)).union(os.listdir(main_dir_2))
        missing = [name for name in file_ids if name not in available]
        if missing:
            raise ValueError(
                'Input file IDs `file_ids` must be a subset of the files names in `main_dir`. '
                + str(len(missing)) + ' file(s) not found, e.g. ' + repr(missing[:5]))

        # Natural sort gives a stable, human-friendly ordering of the samples.
        self.total_imgs = natsort.natsorted(file_ids)
        self.img_id_to_label = img_id_to_label
        self.img_id_to_dir = img_id_to_dir

    def __len__(self):
        """Return the number of files in the dataset."""
        return len(self.total_imgs)

    def __getitem__(self, idx):
        """Load sample ``idx`` and return ``(image, label)``.

        The image is converted to RGB and passed through ``transform`` when one was given.
        """
        file_name = self.total_imgs[idx]
        img_id = file_name.split('_')[0]
        if self.img_id_to_dir[img_id] == 0:
            img_loc = os.path.join(self.main_dir, file_name)
        else:
            img_loc = os.path.join(self.main_dir_2, file_name)
        # convert() returns a fully loaded copy, so the file handle can be closed right away.
        with Image.open(img_loc) as raw_image:
            image = raw_image.convert("RGB")
        if self.transform is not None:
            image = self.transform(image)
        return (image, self.img_id_to_label[img_id])

# Path of the first-half crop directory (with a trailing slash).
# NOTE(review): not referenced anywhere else in the visible code — confirm whether it is still needed.
img_folder_path = '../input/hpatrainsegmentation2241sthalf/input/fast-cell-segmentation-outsize-224-first-half/train_cell_segmentation_224_1st_half/'

In [None]:
import torchvision.transforms as transforms
# Preprocessing applied to every loaded image: only the PIL -> tensor conversion.
# Resize(255), CenterCrop(224) and ImageNet mean/std normalisation are deliberately
# left out of the pipeline.
# NOTE(review): the model is built from pretrained weights; confirm that skipping the
# mean/std normalisation is intended.
transformations = transforms.Compose([transforms.ToTensor()])

In [None]:
# Training dataset: the files listed in `train_df`, looked up in the two crop
# directories. The DataLoader wrapping it is created in a later cell.
train_dataset = CustomDataSet(
    main_dir_1='../input/hpatrainsegmentation2241sthalf/input/fast-cell-segmentation-outsize-224-first-half/train_cell_segmentation_224_1st_half',
    main_dir_2="../input/hpa-224-2nd-half/input/zeyusss/train_cell_segmentation",
    file_ids=train_df['file_name'].tolist(),
    img_id_to_label=image_id_to_label,
    img_id_to_dir=image_ID_half_dict,
    transform=transformations,
)

In [None]:
# Number of samples in the training dataset.
len(train_dataset)

# Stop here!

In [None]:
# Mini-batch size shared by the data loaders.
batch_size = 64

# Training batches: reshuffled every epoch, loaded by 4 worker processes; the final
# incomplete batch is dropped so every batch holds exactly `batch_size` samples.
train_loader = DataLoader(
    dataset=train_dataset,
    batch_size=batch_size,
    shuffle=True,
    drop_last=True,
    num_workers=4,
)

In [None]:
# Validation dataset: the files listed in `valid_df`, looked up in the same two crop
# directories and preprocessed with the same transformations as the training data.
valid_dataset = CustomDataSet(
    '../input/hpatrainsegmentation2241sthalf/input/fast-cell-segmentation-outsize-224-first-half/train_cell_segmentation_224_1st_half', 
    "../input/hpa-224-2nd-half/input/zeyusss/train_cell_segmentation", 
    valid_df['file_name'].tolist(), image_id_to_label, image_ID_half_dict, transformations)

# shuffle=False: evaluation does not benefit from shuffling, and together with
# drop_last=True a shuffled loader would drop a different handful of samples every
# epoch, so the validation metrics of different epochs would not be computed on the
# same data. drop_last=True is kept so every batch holds exactly `batch_size` samples.
valid_loader = torch.utils.data.DataLoader(valid_dataset, batch_size=batch_size, shuffle=False, 
                               num_workers=4, drop_last=True)

In [None]:
# Number of output classes of the classifier.
n_classes = 19
# Build EfficientNet-B0 through `from_pretrained`, requesting an `n_classes`-way output.
model = EfficientNet.from_pretrained('efficientnet-b0', num_classes = n_classes)

In [None]:
# Freeze every parameter of the network so the backbone is not updated by training.
for param in model.parameters():
    param.requires_grad = False

# Train only the final fully-connected layer `_fc`, which was already created with
# `n_classes` outputs when the model was built. Every parameter of the layer is
# re-enabled: unfreezing only `weight` would leave the bias stuck at its initial value.
for param in model._fc.parameters():
    param.requires_grad = True

# Use the GPU when one is available and fall back to the CPU otherwise, matching the
# device selection of the training cell.
model = model.to(torch.device("cuda" if torch.cuda.is_available() else "cpu"))

In [None]:
# Display the final fully-connected layer to inspect its configuration.
model._fc

In [None]:
# Adam optimizer over the model parameters and a cross-entropy loss.
# NOTE(review): in the visible code the training cell creates its own `optimizer` and
# uses `criterion`, so these two objects appear to be unused — confirm before removing.
optimizer = optim.Adam(model.parameters())
loss_func = nn.CrossEntropyLoss()

In [None]:
# Persist the three splits (train / validation / test) as CSV files in the working
# directory so the exact partition can be reloaded later.
for _frame, _csv_path in (
    (train_df, 'train_df.csv'),
    (valid_df, 'valid_df.csv'),
    (test_df, 'test_df.csv'),
):
    _frame.to_csv(_csv_path)

In [None]:
import copy

# Training configuration.
n_epochs = 10
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# Make sure the model lives on the same device the batches are moved to
# (nn.Module.to moves the parameters in place).
model.to(device)
optimizer = optim.Adam(model.parameters())
criterion = nn.CrossEntropyLoss()

# Per-epoch history (one entry per finished epoch).
epoch_train_losses = []
epoch_train_accs = []
epoch_valid_losses = []
epoch_valid_accs = []
best_acc = -1

for epoch in range(n_epochs):
    print('Epoch ' + str(epoch))
    train_batch_loss = []
    train_batch_acc = []
    
    now = datetime.now()
    current_time = now.strftime("%H:%M:%S")
    print(current_time)
    t1 = time.time()
    
    # ---- training pass ----
    # Switch back to training mode; the validation pass below puts the model in eval mode.
    model.train()
    for batch_idx, (data, target) in enumerate(train_loader):
        data, target = data.to(device), target.to(device)
        
        optimizer.zero_grad()
        outputs = model(data)
        loss = criterion(outputs, target)
        train_batch_loss.append(loss.item())
        
        # Batch accuracy as the mean of the per-sample hits: a single tensor op that
        # divides by the real number of samples in the batch.
        predicted = torch.argmax(outputs, 1)
        acc = (predicted == target).float().mean()
        train_batch_acc.append(acc.item())
        
        loss.backward()
        optimizer.step()        
        
        t2 = time.time()
        if batch_idx % 50 == 0:
            print(str(batch_idx) + '-th batch runtime since epoch begins: ' + str(t2 - t1))
        
    epoch_train_losses.append(sum(train_batch_loss)/len(train_batch_loss))
    epoch_train_accs.append(sum(train_batch_acc)/len(train_batch_acc))
    
    # ---- validation pass ----
    # eval() disables dropout and makes normalisation layers use their running
    # statistics, so the validation metrics are deterministic for a given set of weights.
    model.eval()
    valid_batch_loss = []
    valid_batch_accs = []
    with torch.no_grad():
        for i, (data, target) in enumerate(valid_loader):
            data = data.to(device)
            target = target.to(device)
            valid_outputs = model(data)
            
            valid_loss = criterion(valid_outputs, target)
            valid_batch_loss.append(valid_loss.item())
            
            predicted = torch.argmax(valid_outputs, 1)
            acc = (predicted == target).float().mean()
            valid_batch_accs.append(acc.item())
            
    epoch_valid_losses.append((sum(valid_batch_loss)/len(valid_batch_loss)))
    epoch_valid_accs.append((sum(valid_batch_accs)/len(valid_batch_accs)))
    
    # Keep (and checkpoint) the weights with the best validation accuracy seen so far.
    # The file name does not contain the epoch, so each improvement overwrites the
    # previous checkpoint.
    if epoch_valid_accs[-1] > best_acc:
        best_acc = epoch_valid_accs[-1]
        best_model_seen = copy.deepcopy(model.state_dict())
        torch.save(best_model_seen, 'efficient_net_epoch_' + str(n_epochs) + '_batch_' + str(batch_size) + '.pt')
        
    # Print the full history accumulated so far.
    print('epoch: \t', epoch, '\t training loss: \t', epoch_train_losses)
    print('epoch: \t', epoch, '\t training acc: \t', epoch_train_accs)
    print('epoch: \t', epoch, '\t validation loss: \t', epoch_valid_losses)
    print('epoch: \t', epoch, '\t validation acc: \t', epoch_valid_accs)

In [None]:
# save model
# Writes the best state dict kept by the training loop. The file name is the one the
# loop itself saves to, so this rewrites that checkpoint with the same contents.
torch.save(best_model_seen, 'efficient_net_epoch_' + str(n_epochs) + '_batch_' + str(batch_size) + '.pt')

In [None]:
# Collect the per-epoch training/validation loss and accuracy in one table (one row per
# epoch) and write it to CSV. zip() stops at the shortest input, so the table has as
# many rows as the shortest of the history lists (at most n_epochs).
train_valid_performance = pd.DataFrame(
    data=[
        row
        for row in zip(
            range(n_epochs),
            epoch_train_losses,
            epoch_train_accs,
            epoch_valid_losses,
            epoch_valid_accs,
        )
    ],
    columns=['epoch', 'train_losses', 'train_accs', 'valid_losses', 'valid_accs'],
)
train_valid_performance.to_csv('train_valid_performance.csv')