# Install dependencies

In [None]:
!pip install "../input/keras-application/Keras_Applications-1.0.8-py3-none-any.whl"
!pip install "../input/efficientnet111/efficientnet-1.1.1-py3-none-any.whl"
!pip install "../input/pycocotools/pycocotools-2.0-cp37-cp37m-linux_x86_64.whl"
!pip install "../input/hpapytorchzoozip/pytorch_zoo-master"
!pip install "../input/hpacellsegmentatormaster/HPA-Cell-Segmentation-master"
!pip install "../input/tfexplainforoffline/tf_explain-0.2.1-py3-none-any.whl"

# Import packages

In [None]:
import os, glob
import tensorflow as tf
# Turn on memory growth for every visible GPU so TensorFlow allocates GPU memory
# on demand instead of claiming it all at start-up.
gpus = tf.config.experimental.list_physical_devices('GPU')
for gpu in gpus:
    try:
        tf.config.experimental.set_memory_growth(gpu, True)
    except RuntimeError as err:
        # Memory growth has to be configured before the GPUs are initialised.
        # Re-running this cell on a live kernel raises RuntimeError; report it
        # instead of aborting the rest of the import cell.
        print(err)
# tf.compat.v1.disable_eager_execution()
import random
from sklearn.model_selection import train_test_split
import cv2
import numpy as np
import pandas as pd
import multiprocessing
from copy import deepcopy
from sklearn.metrics import precision_recall_curve, auc
import keras
import keras.backend as K
from keras.optimizers import Adam
from keras.callbacks import Callback
# please note, that locally I've trained a keras.efficientnet model, but using tensorflow.keras.applications.EfficientNetB0 should lead to the same results
from efficientnet.keras import EfficientNetB0
from keras.layers import Dense, Flatten
from keras.models import Model, load_model
from keras.utils import Sequence
from albumentations import Compose, VerticalFlip, HorizontalFlip, Rotate, GridDistortion
import matplotlib.pyplot as plt
from IPython.display import Image, display
from numpy.random import seed
# Seed NumPy's global random generator so NumPy-based randomness is repeatable across runs.
seed(10)
from tensorflow.python.framework import ops
import gc
from numba import cuda 
import hpacellseg.cellsegmentator as cellsegmentator
from hpacellseg.utils import label_cell, label_nuclei
from tqdm.auto import tqdm
import base64
import numpy as np
from pycocotools import _mask as coco_mask
import typing as t
import zlib
import warnings
from tf_explain.core.integrated_gradients import IntegratedGradients
# Suppress all warnings raised from this point on.
warnings.filterwarnings('ignore')

# Seed TensorFlow's global random generator.
# NOTE(review): Python's `random` module is imported above but not seeded in this
# cell — confirm whether anything downstream relies on it being reproducible.
tf.random.set_seed(10)
# Render matplotlib figures inline in the notebook output.
%matplotlib inline

# Preview the datasets

In [None]:
# Load the training table from the input directory and preview its first rows.
train_df = pd.read_csv('../input/hpa-single-cell-image-classification/train.csv')
train_df.head()

## One-hot encoding class names

In [None]:
# Class listing, one "<index>. <name>" entry per line; the line order defines
# the class index.
specified_class_names = """0. Nucleoplasm
1. Nuclear membrane
2. Nucleoli
3. Nucleoli fibrillar center
4. Nuclear speckles
5. Nuclear bodies
6. Endoplasmic reticulum
7. Golgi apparatus
8. Intermediate filaments
9. Actin filaments
10. Microtubules
11. Mitotic spindle
12. Centrosome
13. Plasma membrane
14. Mitochondria
15. Aggresome
16. Cytosol
17. Vesicles and punctate cytosolic patterns
18. Negative"""

# Drop the "<index>. " prefix (split only on the first separator) and strip
# stray whitespace so no class name carries leading/trailing blanks.
class_names = [
    entry.split('. ', 1)[1].strip()
    for entry in specified_class_names.splitlines()
]
class_names

In [None]:
def _parse_label(raw):
    """Turn a '|'-separated label string into a set of ints.

    Values that are already sets are returned unchanged, so re-running this
    cell on an already-processed `train_df` does not fail.
    """
    if isinstance(raw, set):
        return raw
    return {int(token) for token in raw.split('|')}


# Replace the raw label strings with sets of integer class indices.
train_df['Label'] = train_df['Label'].map(_parse_label)

# Add one 0/1 indicator column per class, named after the class.
for class_idx, class_name in enumerate(class_names):
    # Bind the index as a default argument so the lambda does not depend on the
    # loop variable's later values.
    train_df[class_name] = train_df['Label'].map(
        lambda labels, idx=class_idx: 1 if idx in labels else 0
    )
train_df.head()

In [None]:
# Lookup table: image ID -> row of indicator values, for fast access.
# The slice takes columns from position 2 up to, but not including, the last one.
# NOTE(review): that leaves the final column out of every vector — confirm this
# is intended and that the first two columns are the non-indicator ones.
id_2_ohe_vector = dict(zip(train_df['ID'], train_df.iloc[:, 2:-1].values))

## Find images with unique label combinations; these images will be put into the training dataset

In [None]:
# String key per image built from its sorted labels, so equal label sets map to
# the same key.
label_combinations = train_df['Label'].map(lambda labels: str(sorted(labels)))

# Count every combination once and reuse the result below.
label_combinations_counts = label_combinations.value_counts()
_occurs_once = label_combinations_counts == 1
print("There are {} images with unique label combinations".format(int(_occurs_once.sum())))

# Keys of the combinations that appear on exactly one image.
unique_label_combination = label_combinations_counts.index[_occurs_once]

In [None]:
# Build the label-combination key once and test membership with vectorised
# `isin` instead of evaluating the same Python lambda twice per row.
_comb_key = train_df['Label'].map(lambda labels: str(sorted(labels)))
_has_unique_comb = _comb_key.isin(unique_label_combination)

# IDs of the images whose label combination is listed in `unique_label_combination`.
train_ids_unique_label_comb = train_df.loc[_has_unique_comb, 'ID']
# Boolean mask selecting all remaining images.
non_unique_label_comb_bool_idx = ~_has_unique_comb

## Create train and validation sets

In [None]:
# Split only the images selected by the non-unique mask: 20% go to validation,
# stratified on the label-combination key, with a fixed random state.
_split_candidate_ids = train_df['ID'][non_unique_label_comb_bool_idx].values
_split_strata = label_combinations[non_unique_label_comb_bool_idx]
train_ids, val_ids = train_test_split(
    _split_candidate_ids,
    test_size=0.2,
    stratify=_split_strata,
    random_state=42,
)

# Append the IDs set aside in `train_ids_unique_label_comb` to the training IDs.
train_ids = np.concatenate((train_ids, train_ids_unique_label_comb))