Human Protein Atlas - Single Cell Classification competition.
This is a weakly supervised multi-label classification problem. Given images of cells from our microscopes and labels of protein location assigned together for all cells in the image, we will develop models capable of segmenting and classifying each individual cell with precise labels. 

This is a starter notebook to give an idea of how to approach this competition.

The notebooks referred to while creating this kernel are listed in the comments of this notebook. If this helped, please give an upvote.

# **Importing Libraries**

In [None]:
import os, sys
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from tqdm import tqdm
import cv2
import seaborn as sns
from collections import defaultdict, Counter
from sklearn.model_selection import train_test_split

import tensorflow as tf
from tensorflow.keras import models, layers
from tensorflow.keras.callbacks import ReduceLROnPlateau,EarlyStopping, ModelCheckpoint
from keras.layers import Input, Flatten, Dense, Conv2D, MaxPooling2D, GlobalAveragePooling2D, Dropout, BatchNormalization
from tensorflow.keras.models import Model
from tensorflow.keras.applications import InceptionV3
from keras.optimizers import Adam

import warnings
warnings.filterwarnings("ignore")

# **BASIC EDA**

In [None]:
# Root folder of the competition data; every path below hangs off it.
_HPA_INPUT_DIR = "../input/hpa-single-cell-image-classification"

# Training metadata (image IDs and their '|'-separated labels).
train_df_path = f"{_HPA_INPUT_DIR}/train.csv"
# Folders with the per-channel PNG files of the train and test images.
train_images_path = f"{_HPA_INPUT_DIR}/train"
test_images_path = f"{_HPA_INPUT_DIR}/test"
# Sample submission file.
sample_df_path = f"{_HPA_INPUT_DIR}/sample_submission.csv"

In [None]:
# Load the training metadata; the cells below rely on its 'ID' and 'Label' columns.
train_df = pd.read_csv(train_df_path)
train_df.head()  # preview the first rows

In [None]:
# One record per image: the path prefix shared by its channel files and the
# integer class ids parsed from the '|'-separated label string. The records
# are stored in a NumPy array so they can be selected by index arrays later.
train_dataset = np.array([
    {
        'path': os.path.join(train_images_path, image_id),
        'labels': np.array([int(token) for token in label_tokens]),
    }
    for image_id, label_tokens in zip(train_df['ID'], train_df['Label'].str.split('|'))
])

In [None]:
# Number of labels per sample = number of '|' separators + 1.
train_df["nb_labels"] = train_df["Label"].map(lambda text: text.count("|") + 1)

label_counts = train_df["nb_labels"]
separator = 50 * "-"
print(f"Max number of labels attached to a single sample: {label_counts.max()}")
print(f"Min  number of labels attached to a single sample: {label_counts.min()}")
print(separator)
print("All counts:")
print(separator)
# Frequency of each label count across the training set.
print(label_counts.value_counts())

In [None]:
sns.set(rc={'figure.figsize': (15, 5)})
sns.set_style('whitegrid')

# Horizontal count plot: passing `y=` puts the label-count categories on the
# y-axis and the number of samples on the x-axis, so the axis titles have to
# follow that orientation.
va = sns.countplot(y="nb_labels", data=train_df, palette="flare")
plt.xlabel("Count", fontsize=20)
plt.ylabel("Number of labels", fontsize=20)
plt.tight_layout()

In [None]:
# Count the samples that carry exactly one label versus more than one.
nb_labels_column = train_df['nb_labels']
single_labels_count = nb_labels_column[nb_labels_column == 1].count()
multi_labels_count = nb_labels_column[nb_labels_column > 1].count()

# Draw the two counts side by side as a bar chart.
bar_names = ['Single label', 'Multi-label']
bar_heights = [single_labels_count, multi_labels_count]
plt.figure(figsize=(10, 5))
sns.barplot(x=bar_names, y=bar_heights, palette='flare')
plt.title("Single vs Multi label distribution", fontsize=16)
plt.xlabel("Label type", fontsize=16)
plt.ylabel("Count", fontsize=16)
plt.show()

In [None]:
# Maps each integer class id to its name; the id of a name is its position
# in the list below (0-based).
labels_dict = dict(enumerate([
    "Nucleoplasm",
    "Nuclear membrane",
    "Nucleoli",
    "Nucleoli fibrillar center",
    "Nuclear speckles",
    "Nuclear bodies",
    "Endoplasmic reticulum",
    "Golgi apparatus",
    "Intermediate filaments",
    "Actin filaments",
    "Microtubules",
    "Mitotic spindle",
    "Centrosome",
    "Plasma membrane",
    "Mitochondria",
    "Aggresome",
    "Cytosol",
    "Vesicles and punctate cytosolic patterns",
    "Negative",
]))

In [None]:
# Split each '|'-separated label string into a list of class-id strings.
labels = train_df["Label"].apply(lambda x: x.split("|"))

# Per-class occurrence counter; classes not seen yet start at zero.
labels_count = defaultdict(int)

# `str.split` always returns a list, even for single-label samples, so one
# nested loop covers every sample. No special single-label branch is needed
# (calling `int()` on the list itself would raise a TypeError).
for label in labels:
    for l in label:
        labels_count[labels_dict[int(l)]] += 1

# Plot the number of samples in which each class appears.
plt.figure(figsize=(15, 10))
sns.barplot(x=list(labels_count.values()), y=list(labels_count.keys()), palette='flare', orient='h')
plt.title("Distribution of cell types", fontsize=16)
plt.xlabel("Count", fontsize=16)
plt.ylabel("Type of cell", fontsize=16)
plt.show()

# **LOADING THE DATASET**

In [None]:
def load_image(path, shape):
    """Load the four channel files of one sample and merge them into 3 channels.

    The yellow channel is blended half-and-half into the red and green
    channels; the blue channel is used as is.

    Args:
        path: Path prefix of the sample. The suffixes ``_red.png``,
            ``_yellow.png``, ``_green.png`` and ``_blue.png`` are appended
            to locate the channel files.
        shape: Target shape. ``shape[0]`` is the output height (rows) and
            ``shape[1]`` the output width (columns); any further entries
            are ignored.

    Returns:
        The merged image resized to ``shape[0]`` rows by ``shape[1]``
        columns, with all values divided by 255.
        NOTE(review): assumes every channel file decodes to a single-channel
        2-D array - confirm against the data.

    Raises:
        OSError: If a channel file is missing or cannot be decoded.
    """
    channels = {}
    for colour in ('red', 'yellow', 'green', 'blue'):
        file_name = f"{path}_{colour}.png"
        channel = cv2.imread(file_name, cv2.IMREAD_UNCHANGED)
        # cv2.imread signals failure by returning None instead of raising.
        if channel is None:
            raise OSError(f"Could not read image file: {file_name}")
        channels[colour] = channel

    image = np.stack((
        channels['red'] / 2 + channels['yellow'] / 2,
        channels['green'] / 2 + channels['yellow'] / 2,
        channels['blue'],
    ), -1)

    # cv2.resize expects dsize as (width, height), i.e. (columns, rows), so the
    # two entries of `shape` are swapped here to get shape[0] rows back.
    image = cv2.resize(image, (shape[1], shape[0]))
    image = np.divide(image, 255)
    return image

In [None]:
# Preview one training sample, loaded at 331x331, without axis ticks.
sample_image = load_image(
    "../input/hpa-single-cell-image-classification/train/000a6c98-bb9b-11e8-b2b9-ac1f6b6435d0",
    (331, 331),
)
plt.imshow(sample_image)
plt.axis("off")

In [None]:
def create_train(dataset_info, batch_size, shape, num_classes=19):
    """Endlessly yield random ``(images, labels)`` batches.

    Args:
        dataset_info: Indexable collection of records, each with a ``'path'``
            entry (passed to ``load_image``) and a ``'labels'`` entry holding
            the integer class ids of the sample.
        batch_size: Number of samples per yielded batch.
        shape: ``(height, width, channels)`` of each image; ``channels`` must
            be 3.
        num_classes: Width of the multi-hot label vectors. Defaults to 19.

    Yields:
        A tuple ``(batch_images, batch_labels)`` where ``batch_images`` has
        shape ``(batch_size, shape[0], shape[1], shape[2])`` and
        ``batch_labels`` has shape ``(batch_size, num_classes)`` with a 1 at
        every class id listed for the sample and 0 elsewhere.

    Raises:
        AssertionError: On the first iteration, if ``shape[2]`` is not 3.
    """
    assert shape[2] == 3
    while True:
        # Samples are drawn independently for every batch (np.random.choice
        # samples with replacement by default), so a record may repeat.
        random_indexes = np.random.choice(len(dataset_info), batch_size)
        batch_images = np.empty((batch_size, shape[0], shape[1], shape[2]))
        batch_labels = np.zeros((batch_size, num_classes))
        for i, idx in enumerate(random_indexes):
            record = dataset_info[idx]
            batch_images[i] = load_image(record['path'], shape)
            # Index-array assignment sets every listed class id at once.
            batch_labels[i][record['labels']] = 1
        yield batch_images, batch_labels

In [None]:
# Hold out 20% of the samples; the fixed random_state keeps the split repeatable.
train_ids, test_ids, train_targets, test_target = train_test_split(
    train_df['ID'],
    train_df['Label'],
    test_size=0.2,
    random_state=42,
)

In [None]:
# Both generators use the same batch size and input shape and differ only in
# the records they sample from. The index labels of each split are used as
# positions into `train_dataset`.
# NOTE(review): assumes train_df has the default 0..n-1 index - confirm.
generator_batch_size = 4
generator_input_shape = (256, 256, 3)
train_generator = create_train(
    train_dataset[train_ids.index], generator_batch_size, generator_input_shape
)
validation_generator = create_train(
    train_dataset[test_ids.index], generator_batch_size, generator_input_shape
)

# **Define the model**

In [None]:
def make_model(input_shape):
    """Build and compile a 19-class multi-label classifier on a frozen InceptionV3.

    The network is: input -> BatchNormalization -> InceptionV3 (ImageNet
    weights, no top, all layers frozen) -> global average pooling ->
    Dropout(0.3) -> Dense(19, sigmoid).

    Args:
        input_shape: Shape of one input image, e.g. ``(256, 256, 3)``.

    Returns:
        The compiled model, using Adam with learning rate 1e-3, binary
        cross-entropy loss and a multi-label AUC metric reported as ``'auc'``.
    """
    # Every layer and the optimizer are taken from tf.keras so that they come
    # from the same Keras implementation as `Model` and `InceptionV3`.
    inputs = layers.Input(shape=input_shape)
    base_model = InceptionV3(include_top=False,
                             weights='imagenet',
                             input_shape=input_shape)
    # Freeze the pretrained backbone; only the layers around it are trained.
    for layer in base_model.layers:
        layer.trainable = False

    bn = layers.BatchNormalization()(inputs)
    x = base_model(bn)
    x = layers.GlobalAveragePooling2D()(x)
    x = layers.Dropout(0.3)(x)
    predictions = layers.Dense(19, activation='sigmoid', name='Final')(x)

    model = Model(inputs=inputs, outputs=predictions)

    # The metric is named explicitly so the history keys stay 'auc'/'val_auc'
    # even when the model is built more than once in the same session
    # (auto-generated names get a numeric suffix such as 'auc_1').
    model.compile(optimizer=tf.keras.optimizers.Adam(learning_rate=1e-3),
                  loss='binary_crossentropy',
                  metrics=[tf.keras.metrics.AUC(multi_label=True, name='auc')])
    return model

In [None]:
# Stop training when val_loss has not improved by at least 0.001 for 3 epochs
# and restore the best weights seen.
early_stopping = EarlyStopping(
    monitor='val_loss',
    min_delta=0.001,
    patience=3,
    mode='min',
    verbose=1,
    restore_best_weights=True,
)

# Write the model to 'model.h5' only when val_loss reaches a new minimum.
checkpoint = ModelCheckpoint(
    filepath='model.h5',
    save_best_only=True,
    monitor='val_loss',
    mode='min',
    verbose=1,
)

# Multiply the learning rate by 0.1 after 2 epochs without val_loss
# improvement, never going below 1e-5.
reduce_lr = ReduceLROnPlateau(
    monitor='val_loss',
    factor=0.1,
    patience=2,
    min_lr=0.00001,
    mode='min',
    verbose=1,
)

my_callbacks = [early_stopping, checkpoint, reduce_lr]

In [None]:
# Build the classifier for 256x256 three-channel inputs.
model = make_model((256, 256, 3))

In [None]:
# Print a summary of the model built by make_model.
model.summary()

# **TRAINING THE MODEL**

In [None]:
# Validate on fresh batches from the validation generator after every epoch.
# `next(validation_generator)` would evaluate one single fixed batch for the
# whole run, which makes val_loss (monitored by the callbacks) unreliable.
# The generator never ends, so `validation_steps` bounds each validation pass.
history = model.fit(train_generator,
                    steps_per_epoch=100,
                    validation_data=validation_generator,
                    validation_steps=25,
                    epochs=10,
                    callbacks=my_callbacks)

In [None]:
metrics_log = history.history
# Keras can suffix an auto-generated metric name (e.g. 'auc_1'), so the key is
# looked up rather than hard-coded.
auc_key = next(key for key in metrics_log if key.startswith('auc'))
train_acc = metrics_log[auc_key]
val_acc = metrics_log['val_' + auc_key]
loss = metrics_log['loss']
val_loss = metrics_log['val_loss']
epochs = range(1, len(train_acc) + 1)

# Create the figure at its final size instead of resizing it afterwards.
fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(20, 10))

# Left panel: AUC per epoch (the metric tracked is AUC, not accuracy).
ax1.plot(epochs, train_acc, 'go-', label='Training AUC')
ax1.plot(epochs, val_acc, 'ro-', label='Validation AUC')
ax1.set_title('Training & Validation AUC')
ax1.legend()
ax1.set_xlabel("Epochs")
ax1.set_ylabel("AUC")

# Right panel: loss per epoch for the training and validation data.
ax2.plot(epochs, loss, 'g-o', label='Training Loss')
ax2.plot(epochs, val_loss, 'r-o', label='Validation Loss')
ax2.set_title('Training & Validation Loss')
ax2.legend()
ax2.set_xlabel("Epochs")
ax2.set_ylabel("Loss")

fig.tight_layout()
plt.show()

The scores are not that high; we need to try out various techniques to improve them.

# **Next step is to try out image augmentation**

**Work in progress**