# #Loading packages and data

In [None]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline

import tensorflow as tf
sns.set()
import os
import warnings
warnings.filterwarnings("ignore", category=DeprecationWarning)
warnings.filterwarnings("ignore", category=UserWarning)
warnings.filterwarnings("ignore", category=FutureWarning)


# #Load dataset

In [None]:
# Read the training label table of the competition and preview its first rows.
train_labels = pd.read_csv("../input/hpa-single-cell-image-classification/train.csv")
train_labels.head()


# #How many samples do we have

In [None]:
# Number of rows in the training label table.
len(train_labels)

# #Extract test names for submission


In [None]:
# Directory prefix of the test images. It has to point into the same competition
# dataset that train.csv and sample_submission.csv are read from.
test_path = "../input/hpa-single-cell-image-classification/test/"


In [None]:
# Read the sample submission file and preview its first rows.
submission = pd.read_csv("../input/hpa-single-cell-image-classification/sample_submission.csv")
submission.head()


In [None]:
# IDs listed in the sample submission, as a NumPy array.
test_names = submission["ID"].values
print(len(test_names))     # how many IDs there are
print(len(test_names[0]))  # length of the first ID

# #There are 559 test images for which we are asked to make predictions.

# #Helper Code

In [None]:
# Maps the integer class index found in the "Label" column to a readable class name.
label_names = {
    0: "Nucleoplasm",
    1: "Nuclear_membrane",
    2: "Nucleoli",
    3: "Nucleoli_fibrillar_center",
    4: "Nuclear_speckles",
    5: "Nuclear_bodies",
    6: "Endoplasmic_reticulum",
    7: "Golgi_apparatus",
    8: "Intermediate_filaments",
    9: "Actin filaments",
    10: "Microtubules",
    11: "Mitotic_spindle",
    12: "Centrosome",
    13: "Plasma_membrane",
    14: "Mitochondria",
    15: "Aggresome",
    16: "Cytosol",
    17: "Vesicles_and_punctate_cytosolic_patterns",
    18: "Negative"
}

# Inverse lookup: class name -> integer class index.
reverse_train_labels = {name: index for index, name in label_names.items()}


def fill_targets(row):
    """Turn the pipe-separated ``Label`` string of one row into binary class entries.

    Parameters
    ----------
    row : pandas.Series
        One row of the label table. ``row.Label`` must be a string of class
        indices separated by ``"|"`` (for example ``"0|5"``).

    Returns
    -------
    pandas.Series
        The same row: ``Label`` now holds an integer array of the class indices
        and the entry named after each of those classes is set to 1.

    Raises
    ------
    KeyError
        If a class index is not a key of ``label_names``.
    """
    # The builtin int replaces np.int, an alias that no longer exists in NumPy >= 1.24.
    row.Label = np.array(row.Label.split("|")).astype(int)
    for num in row.Label:
        name = label_names[int(num)]
        row.loc[name] = 1
    return row

In [None]:
# Add one column per class name to the label table, initialised to 0.
for key, class_name in label_names.items():
    train_labels[class_name] = 0

In [None]:
# Expand the "Label" string of every row into the binary class columns.
# NOTE(review): train_labels is overwritten, so this cell cannot be run twice on the
# same kernel; after the first run Label is an array and has no .split().
train_labels = train_labels.apply(fill_targets, axis=1)


In [None]:
# Show the first row to check the filled class columns.
train_labels.head(1)

OK, great: now we can work directly with binary target values. Let's also create a dataframe for the test IDs, which we will use later to make our submission.

In [None]:
# Frame with one row per test ID and every non-ID column of train_labels, all set to 0.
test_labels = pd.DataFrame({"ID": test_names})
for col in train_labels.columns.values:
    if col == "ID":
        continue
    test_labels[col] = 0
test_labels.head(1)

OK, we haven't made any predictions yet, so apart from the ID all entries are filled with 0.


# Exploratory data analysis 


# Which proteins occur most often in the train images?

In [None]:
# Occurrences per class, most frequent first. Every non-class column is dropped before
# summing; errors="ignore" lets the cell run both before and after the
# "number_of_targets" column has been added to train_labels.
class_columns = train_labels.drop(
    ['ID', 'Label', 'number_of_targets'], axis=1, errors="ignore"
)
target_count = class_columns.sum(axis=0).sort_values(ascending=False)
plt.figure(figsize=(15,15))
sns.barplot(y=target_count.index.values, x=target_count.values, order=target_count.index)
plt.xlabel("Number of train images");

**Take-Away**
* We can see that most common protein structures belong to coarse grained cellular components like the plasma membrane, the cytosol and the nucleus.

* The classes are therefore heavily imbalanced. Consequently, accuracy is not the right score to measure performance here, and the validation strategy has to be chosen carefully.

# How many targets are most common?

In [None]:
# Number of classes per image. The count column itself is excluded from the sum
# (errors="ignore" covers the first run, when it does not exist yet), so re-running
# this cell does not add the previous count on top of the class columns.
train_labels['number_of_targets'] = train_labels.drop(
    ["ID", "Label", "number_of_targets"], axis=1, errors="ignore"
).sum(axis=1)

# Share of the train images (in %) for each number of targets.
count_prec = np.round(100*train_labels['number_of_targets'].value_counts()/train_labels.shape[0], 2)
plt.figure(figsize=(20,5))
sns.barplot(x=count_prec.index.values, y=count_prec.values, palette="Reds")
plt.xlabel("Number of targets per image")
plt.ylabel("% of train data")

**Take-away**
* Most train images have only one or two target labels.
* More than 3 targets are very rare!

# Which targets are correlated?

Let's see if we find some correlations between our targets. This way we may already see that some proteins often come together.

In [None]:

# Correlation between the class columns, using only images with more than one target.
multi_target_rows = train_labels[train_labels.number_of_targets>1]
class_correlation = multi_target_rows.drop(
    ["ID", "Label", "number_of_targets"], axis=1
).corr()
plt.figure(figsize=(15,15))
sns.heatmap(class_correlation, cmap="RdYlBu", vmin=-1, vmax=1)

**Take-away**
* We can see that many targets only have very slight correlations.


# Next version coming up with a more detailed EDA and baseline.
****
# Please Upvote

Thanks


# What do the images look like?


**Peek into the directory**

* Before we start loading images, let's have a look into the train directory to get an impression of what we can find there:

In [None]:
from os import listdir
files = listdir("../input/hpa-single-cell-image-classification/train")
# Print at most the first ten entries; slicing also works when the directory
# contains fewer than ten files, where indexing files[n] would raise IndexError.
for file_name in files[:10]:
    print(file_name)

**Ah, ok, great! It seems that for one image id, there are different color channels present. Looking into the data description of this competition we can find that:**

* Each image is actually split into 4 different image files.
* These 4 files correspond to 4 different filters:
 1. a green filter for the target protein structure of interest
 2. a blue landmark filter for the nucleus
 3. a red landmark filter for microtubules
 4. a yellow landmark filter for the endoplasmic reticulum
* Each image is of size 512 x 512

Let's check if the number of files divided by 4 yields the number of target samples

In [None]:
# True when the directory holds exactly four files per row of the label table.
len(files) == 4 * train_labels.shape[0]

# What do images of specific targets look like?

In [None]:
# Directory prefix of the training images; image file names are appended to it directly.
train_path = "../input/hpa-single-cell-image-classification/train/"

In [None]:
def load_image(basepath, image_id):
    """Load the four channel files of one image into a single array.

    Parameters
    ----------
    basepath : str
        Directory prefix (including the trailing separator) of the image files.
    image_id : str
        Image identifier; the files read are ``<image_id>_<color>.png``.

    Returns
    -------
    numpy.ndarray
        Array of shape (4, 512, 512) with the channels in the order
        green, red, blue, yellow.
    """
    # imread is not imported in the visible part of this notebook. scikit-image is
    # already a dependency (skimage.transform.resize), so its reader is imported here.
    from skimage.io import imread

    channel_colors = ("green", "red", "blue", "yellow")
    # NOTE(review): assumes every channel file is a 512x512 single-channel image — TODO confirm.
    images = np.zeros(shape=(len(channel_colors), 512, 512))
    for channel, color in enumerate(channel_colors):
        images[channel, :, :] = imread(basepath + image_id + "_" + color + ".png")
    return images

def make_image_row(image, subax, title):
    """Draw the four channels of one image onto four axes.

    ``image`` is indexed by channel (0..3) and ``subax`` by axis (0..3). The
    first axis receives ``title``; the other three get fixed stain titles.
    Returns ``subax``.
    """
    panels = (
        ("Greens", title),
        ("Reds", "stained microtubules"),
        ("Blues", "stained nucleus"),
        ("Oranges", "stained endoplasmatic reticulum"),
    )
    for channel, (colormap, panel_title) in enumerate(panels):
        axis = subax[channel]
        axis.imshow(image[channel], cmap=colormap)
        axis.set_title(panel_title)
    return subax

def make_title(file_id):
    """Build a plot title listing the class names of one training image.

    Parameters
    ----------
    file_id : str
        Value of the ``ID`` column identifying the image in ``train_labels``.

    Returns
    -------
    str
        The class names joined as ``" - name1 - name2 - "``.

    Raises
    ------
    IndexError
        If no row of ``train_labels`` has this ID.
    """
    # The label table of this notebook uses the columns "ID" and "Label".
    file_targets = train_labels.loc[train_labels.ID == file_id, "Label"].values[0]
    # Label is a pipe-separated string until fill_targets has been applied; accept both forms.
    if isinstance(file_targets, str):
        file_targets = file_targets.split("|")
    title = " - "
    for n in file_targets:
        title += label_names[int(n)] + " - "
    return title

Let's try to visualize specific target groups. In this example we will see images whose labels are Nucleoplasm and/or Nuclear_bodies. Set target values of your choice and the target group iterator will collect all images whose labels are a subset of your choice:

In [None]:
class TargetGroupIterator:
    """Batch-wise loader for the train images whose labels all lie in a chosen class set."""

    def __init__(self, target_names, batch_size, basepath):
        """Store the selection.

        Parameters
        ----------
        target_names : list of str
            Class names; each must be a key of ``reverse_train_labels``.
        batch_size : int
            Number of images per yielded batch.
        basepath : str
            Directory prefix handed to ``load_image``.
        """
        self.target_names = target_names
        self.target_list = [reverse_train_labels[key] for key in target_names]
        self.batch_shape = (batch_size, 4, 512, 512)
        self.basepath = basepath

    def find_matching_data_entries(self):
        """Set ``self.images_identifier`` to the IDs whose Label set is a subset of the selection."""
        # A boolean mask is used so that the shared train_labels frame is never
        # modified (no temporary column is added and dropped in place).
        is_match = train_labels.Label.apply(lambda l: bool(self.check_subset(l)))
        self.images_identifier = train_labels.loc[is_match.astype(bool), "ID"].values

    def check_subset(self, targets):
        """Return 1 if every entry of ``targets`` is in ``self.target_list``, else 0 (as a 0-d array)."""
        return np.where(set(targets).issubset(set(self.target_list)), 1, 0)

    def get_loader(self):
        """Yield ``(filenames, images)`` batches for the matching images.

        ``images`` always has the full batch shape; in a final, partially filled
        batch the unused trailing slots stay zero and ``filenames`` is shorter.
        Requires ``find_matching_data_entries`` to have been called first.
        """
        filenames = []
        idx = 0
        images = np.zeros(self.batch_shape)
        for image_id in self.images_identifier:
            images[idx,:,:,:] = load_image(self.basepath, image_id)
            filenames.append(image_id)
            idx += 1
            if idx == self.batch_shape[0]:
                yield filenames, images
                # Start a fresh batch so the arrays already yielded are not overwritten.
                filenames = []
                images = np.zeros(self.batch_shape)
                idx = 0
        if idx > 0:
            yield filenames, images
    

In [None]:
# Show the first row of the label table again before selecting a target group.
train_labels.head(1)

In [None]:

# Class names to browse; each must be one of the values of label_names.
your_choice = ["Nucleoplasm", "Nuclear_bodies"]
# Number of images loaded per batch of the iterator.
your_batch_size = 20

In [None]:
# Build the loader for the chosen classes, collect the matching image IDs and
# create the generator that yields (filenames, images) batches.
imageloader = TargetGroupIterator(your_choice, your_batch_size, train_path)
imageloader.find_matching_data_entries()
iterator = imageloader.get_loader()

To keep the kernel dense, the target group iterator has a batch size which stands for the number of examples you would like to look at at once. In this example (`your_batch_size = 20`) you can see a maximum of 20 images per iteration. To observe the next batch of examples of your target group, just run the cell below again. This way you can run the cell until you have seen all images of your group without polluting the kernel:

# Building a baseline model <a class="anchor" id="baseline"></a>

### K-Fold Cross-Validation

Let's see how many test and train samples we have in this competition:

In [None]:
train_files = "../input/hpa-single-cell-image-classification/train/"
test_files = "../input/hpa-single-cell-image-classification/test/"
# Compare the number of files in the two directories. len() of the path strings
# themselves would only compare the lengths of the two path names.
n_train_files = len(os.listdir(train_files))
n_test_files = len(os.listdir(test_files))
percentage = np.round(n_test_files/n_train_files*100)
print("The test size turn out to be {} % compared to the trainset.".format(percentage))

To understand the performance of our model we will use k-fold cross validation. The train data is split into k chunks and each chunk is used once for testing the prediction performance whereas the others are used for training. 

In [None]:
from sklearn.model_selection import RepeatedKFold
# 5 folds, a single repetition, and a fixed random_state so the folds are reproducible.
splitter = RepeatedKFold(n_splits=5,n_repeats=1, random_state=0)

In [None]:
# One dict per fold, holding the image IDs used for training and for validation.
partitions = []
all_ids = train_labels.ID.values

for train_idx, test_idx in splitter.split(train_labels.index):
    partition = {
        "train": all_ids[train_idx],
        "validation": all_ids[test_idx],
    }
    partitions.append(partition)
    print("TRAIN:", train_idx, "TEST:", test_idx)
    print("TRAIN:", len(train_idx), "TEST:", len(test_idx))

Next we need to set up a simple baseline model. This need not be very complex or very good. It's our first attempt to play with and to figure out how to improve. For this purpose let's use the deep learning library Keras.

### Shared Parameter class

In [None]:
class ModelParameter:
    """Settings shared by the data generator, the image preprocessor and the model.

    Parameters
    ----------
    basepath : str
        Directory prefix of the image files.
    num_classes : int
        Number of model outputs.
        NOTE(review): the default of 28 does not match the 19 entries of
        ``label_names`` in this notebook; pass ``num_classes`` explicitly.
    image_rows, image_cols : int
        Size of the raw images.
    batch_size : int
        Samples per batch.
    n_channels : int
        Number of image channels used.
    row_scale_factor, col_scale_factor : int
        Factors by which rows / columns are divided to obtain the scaled size.
    shuffle : bool
        Whether sample order is shuffled between epochs.
    n_epochs : int
        Number of training epochs.
    """

    def __init__(self, basepath,
                 num_classes=28,
                 image_rows=512,
                 image_cols=512,
                 batch_size=200,
                 n_channels=1,
                 row_scale_factor=4,
                 col_scale_factor=4,
                 shuffle=False,
                 n_epochs=1):
        self.basepath = basepath
        self.num_classes = num_classes
        self.image_rows = image_rows
        self.image_cols = image_cols
        self.batch_size = batch_size
        self.n_channels = n_channels
        self.shuffle = shuffle
        self.row_scale_factor = row_scale_factor
        self.col_scale_factor = col_scale_factor
        # Truncate towards zero with the builtin int; the np.int alias no longer
        # exists in NumPy >= 1.24.
        self.scaled_row_dim = int(self.image_rows / self.row_scale_factor)
        self.scaled_col_dim = int(self.image_cols / self.col_scale_factor)
        self.n_epochs = n_epochs

Ok, now we will create an instance of this class and pass it to the DataGenerator, the BaseLineModel and the ImagePreprocessor.

In [None]:
# The model needs one output per class column of the label table, i.e. one per entry
# of label_names; the ModelParameter default of 28 classes does not match that.
parameters = ModelParameter(train_path, num_classes=len(label_names))

### Image Preprocessor

Let's write a simple image preprocessor that handles for example the rescaling of the images. Perhaps we can expand its functionality during improvement of the baseline model. 

In [None]:
from skimage.transform import resize

class ImagePreprocessor:
    """Loads the channel files of an image and prepares them as model input.

    Target size, number of channels and directory prefix are read from the
    parameter object passed to the constructor.
    """

    def __init__(self, modelparameter):
        """Copy the settings needed for loading and preprocessing from ``modelparameter``."""
        self.parameter = modelparameter
        self.basepath = self.parameter.basepath
        self.scaled_row_dim = self.parameter.scaled_row_dim
        self.scaled_col_dim = self.parameter.scaled_col_dim
        self.n_channels = self.parameter.n_channels

    def preprocess(self, image):
        """Resize, reshape and normalize ``image`` (in that order) and return the result."""
        image = self.resize(image)
        image = self.reshape(image)
        image = self.normalize(image)
        return image

    def resize(self, image):
        """Rescale ``image`` to ``(scaled_row_dim, scaled_col_dim)``."""
        # Inside this body the name `resize` resolves to the module-level function
        # imported from skimage.transform, not to this method.
        image = resize(image, (self.scaled_row_dim, self.scaled_col_dim))
        return image

    def reshape(self, image):
        """Return ``image`` with shape ``(rows, cols, n_channels)``."""
        image = np.reshape(image, (image.shape[0], image.shape[1], self.n_channels))
        return image

    def normalize(self, image):
        """Return ``image`` divided by 255 as a new float array."""
        # Out-of-place division: the in-place form `image /= 255` raises for
        # integer arrays because the float result cannot be stored back.
        return image / 255

    def load_image(self, image_id):
        """Read the first ``n_channels`` channel files of ``image_id``.

        Channels are ordered green, blue, red, yellow. The files read are
        ``basepath + image_id + "_<color>.png"``. Returns an array of shape
        ``(512, 512, k)`` where ``k`` is the number of channels read.
        """
        # imread is not imported in the visible part of this notebook; it is taken
        # from scikit-image, which the notebook already depends on.
        from skimage.io import imread

        # Only the channels that are returned get read from disk.
        colors = ("green", "blue", "red", "yellow")[0:self.parameter.n_channels]
        # NOTE(review): assumes every channel file is a 512x512 single-channel image — TODO confirm.
        image = np.zeros(shape=(512, 512, len(colors)))
        for channel, color in enumerate(colors):
            image[:, :, channel] = imread(self.basepath + image_id + "_" + color + ".png")
        return image
        

Let's create an instance of this preprocessor and pass it to the data generator.

In [None]:
# Preprocessor configured from the shared parameters (its basepath starts as parameters.basepath).
preprocessor = ImagePreprocessor(parameters)

#### Looking at a preprocessed example image

In [None]:
import keras

class DataGenerator(keras.utils.Sequence):
    """Keras ``Sequence`` yielding ``(X, y)`` batches of preprocessed images and class targets."""

    def __init__(self, list_IDs, labels, modelparameter, imagepreprocessor):
        """Store the sample IDs, the label table and the shared settings.

        Parameters
        ----------
        list_IDs : sequence of str
            Image IDs served by this generator.
        labels : pandas.DataFrame
            Label table with the columns ``ID``, ``Label``, ``number_of_targets``
            and one column per class.
        modelparameter : ModelParameter
            Shared settings (batch size, scaled image size, class count, ...).
        imagepreprocessor : ImagePreprocessor
            Used to load and preprocess every image.
        """
        self.current_epoch = 0
        self.params = modelparameter
        self.labels = labels
        self.list_IDs = list_IDs
        self.dim = (self.params.scaled_row_dim, self.params.scaled_col_dim)
        self.batch_size = self.params.batch_size
        self.n_channels = self.params.n_channels
        self.num_classes = self.params.num_classes
        self.shuffle = self.params.shuffle
        self.preprocessor = imagepreprocessor
        self.on_epoch_end()

    def on_epoch_end(self):
        """Reset the sample order; when shuffling is enabled, permute it reproducibly."""
        self.indexes = np.arange(len(self.list_IDs))
        if self.shuffle:
            # np.random.shuffle takes no random_state argument. A RandomState seeded
            # with the epoch counter gives a different but repeatable order per epoch.
            np.random.RandomState(self.current_epoch).shuffle(self.indexes)
            self.current_epoch += 1

    def get_targets_per_image(self, identifier):
        """Return the class columns of the row(s) whose ``ID`` equals ``identifier``."""
        # The label table of this notebook uses the columns "ID" and "Label".
        return self.labels.loc[self.labels.ID==identifier].drop(
                ["ID", "Label", "number_of_targets"], axis=1).values

    def __data_generation(self, list_IDs_temp):
        """Build one batch: X is (batch_size, *dim, n_channels), y is (batch_size, num_classes)."""
        # Initialization
        X = np.empty((self.batch_size, *self.dim, self.n_channels))
        y = np.empty((self.batch_size, self.num_classes), dtype=int)
        # Generate data
        for i, identifier in enumerate(list_IDs_temp):
            # Store sample
            image = self.preprocessor.load_image(identifier)
            image = self.preprocessor.preprocess(image)
            X[i] = image
            # Store class
            y[i] = self.get_targets_per_image(identifier)
        return X, y

    def __len__(self):
        """Number of full batches per epoch; a trailing partial batch is dropped."""
        return int(np.floor(len(self.list_IDs) / self.batch_size))

    def __getitem__(self, index):
        """Generate the batch with number ``index``."""
        # Generate indexes of the batch
        indexes = self.indexes[index*self.batch_size:(index+1)*self.batch_size]
        # Find list of IDs
        list_IDs_temp = [self.list_IDs[k] for k in indexes]
        # Generate data
        X, y = self.__data_generation(list_IDs_temp)
        return X, y

In [None]:
class PredictGenerator:
    """Feeds a list of image IDs through a model, one image per ``predict`` call."""

    def __init__(self, predict_Ids, imagepreprocessor, predict_path):
        # The preprocessor that is passed in gets its basepath overwritten with predict_path.
        imagepreprocessor.basepath = predict_path
        self.preprocessor = imagepreprocessor
        self.identifiers = predict_Ids

    def predict(self, model):
        """Return an array of shape (number of IDs, num_classes) with the model outputs."""
        n_images = len(self.identifiers)
        n_classes = self.preprocessor.parameter.num_classes
        y = np.empty(shape=(n_images, n_classes))
        for n in range(n_images):
            identifier = self.identifiers[n]
            raw = self.preprocessor.load_image(identifier)
            prepared = self.preprocessor.preprocess(raw)
            # The model is given a batch containing this single image.
            single_batch = np.expand_dims(prepared, axis=0)
            y[n] = model.predict(single_batch)
        return y

In [None]:
from keras.models import Sequential
from keras.layers import Dense, Dropout, Flatten
from keras.layers import Conv2D, MaxPooling2D
from keras.losses import binary_crossentropy
from keras.optimizers import Adadelta
from keras.initializers import VarianceScaling


class BaseLineModel:
    """Small convolutional baseline with one sigmoid output per class."""

    def __init__(self, modelparameter):
        """Read class count and input shape from ``modelparameter``."""
        self.params = modelparameter
        self.num_classes = self.params.num_classes
        self.img_rows = self.params.scaled_row_dim
        self.img_cols = self.params.scaled_col_dim
        self.n_channels = self.params.n_channels
        self.input_shape = (self.img_rows, self.img_cols, self.n_channels)
        self.my_metrics = ['accuracy']

    def build_model(self):
        """Create ``self.model``: two conv layers, max pooling, dropout, dense layer, sigmoid outputs."""
        self.model = Sequential()
        self.model.add(Conv2D(16, kernel_size=(3, 3), activation='relu', input_shape=self.input_shape,
                             kernel_initializer=VarianceScaling(seed=0)))
        self.model.add(Conv2D(32, (3, 3), activation='relu',
                             kernel_initializer=VarianceScaling(seed=0)))
        self.model.add(MaxPooling2D(pool_size=(2, 2)))
        self.model.add(Dropout(0.25))
        self.model.add(Flatten())
        self.model.add(Dense(64, activation='relu',
                            kernel_initializer=VarianceScaling(seed=0),))
        self.model.add(Dropout(0.5))
        self.model.add(Dense(self.num_classes, activation='sigmoid'))

    def compile_model(self):
        """Compile with binary cross-entropy, Adadelta and the metrics in ``self.my_metrics``."""
        self.model.compile(loss=keras.losses.binary_crossentropy,
              optimizer=keras.optimizers.Adadelta(),
              metrics=self.my_metrics)

    def set_generators(self, train_generator, validation_generator):
        """Register the generators used by ``learn`` and ``score``."""
        self.training_generator = train_generator
        self.validation_generator = validation_generator

    def learn(self):
        """Fit on the training generator for ``params.n_epochs`` epochs and return the history."""
        # NOTE(review): fit_generator / evaluate_generator are deprecated in newer
        # Keras releases in favour of fit / evaluate — confirm against the installed version.
        return self.model.fit_generator(generator=self.training_generator,
                    validation_data=self.validation_generator,
                    epochs=self.params.n_epochs, 
                    use_multiprocessing=True,
                    workers=8)

    def score(self):
        """Evaluate the model on the validation generator."""
        return self.model.evaluate_generator(generator=self.validation_generator,
                                      use_multiprocessing=True, 
                                      workers=8)

    def predict(self, predict_generator):
        """Return ``predict_generator.predict(self.model)``."""
        y = predict_generator.predict(self.model)
        return y

    def save(self, modeloutputpath):
        """Save the model to ``modeloutputpath``."""
        self.model.save(modeloutputpath)

    def load(self, modelinputpath):
        """Load a saved model from ``modelinputpath`` into ``self.model``."""
        # load_model is not among the names imported at the top of this cell.
        from keras.models import load_model
        self.model = load_model(modelinputpath)

In [None]:
# Datasets: use the first fold and the full label table.
partition = partitions[0]
labels = train_labels

# Report how many image IDs fall into each side of the fold.
print("Number of samples in train: {}".format(len(partition["train"])))
print("Number of samples in validation: {}".format(len(partition["validation"])))

In [None]:
# Batch generators for the two sides of the fold; both share the same parameters and preprocessor.
training_generator = DataGenerator(partition['train'], labels, parameters, preprocessor)
validation_generator = DataGenerator(partition['validation'], labels, parameters, preprocessor)

In [None]:
# Predicts on the validation IDs. PredictGenerator overwrites preprocessor.basepath with
# train_path; this is the same preprocessor object the batch generators above use.
predict_generator = PredictGenerator(partition['validation'], preprocessor, train_path)

In [None]:
# A separate preprocessor is created for the test images because PredictGenerator
# overwrites the basepath of the preprocessor it receives (here with test_path).
test_preprocessor = ImagePreprocessor(parameters)
submission_predict_generator = PredictGenerator(test_names, test_preprocessor, test_path)

In [None]:
class KernelSettings:
    """Holds four flags, each stored as an attribute named after its constructor argument."""

    def __init__(self, fit_baseline=False,
                 fit_improved_baseline=True,
                 fit_improved_higher_batchsize=False,
                 fit_improved_without_dropout=False):
        flags = {
            "fit_baseline": fit_baseline,
            "fit_improved_baseline": fit_improved_baseline,
            "fit_improved_higher_batchsize": fit_improved_higher_batchsize,
            "fit_improved_without_dropout": fit_improved_without_dropout,
        }
        for flag_name, flag_value in flags.items():
            setattr(self, flag_name, flag_value)

In [None]:
# All flags off: with fit_baseline=False the baseline cell below loads stored CSV
# results instead of fitting the model.
kernelsettings = KernelSettings(fit_baseline=False,
                                fit_improved_baseline=False,
                                fit_improved_higher_batchsize=False,
                                fit_improved_without_dropout=False)

In [None]:
# Run computation and store results as csv
# Names of the class columns (everything except ID, Label and number_of_targets).
target_names = train_labels.drop(["Label", "number_of_targets", "ID"], axis=1).columns

if kernelsettings.fit_baseline:
    # `parameters` is the shared ModelParameter instance created earlier in the notebook.
    model = BaseLineModel(parameters)
    model.build_model()
    model.compile_model()
    model.set_generators(training_generator, validation_generator)
    history = model.learn()

    # Predicted probabilities on the validation fold, one column per class.
    proba_predictions = model.predict(predict_generator)
    baseline_proba_predictions = pd.DataFrame(index = partition['validation'],
                                              data=proba_predictions,
                                              columns=target_names)
    baseline_proba_predictions.to_csv("baseline_predictions.csv")
    # Train / validation loss per epoch.
    baseline_losses = pd.DataFrame(history.history["loss"], columns=["train_loss"])
    baseline_losses["val_loss"] = history.history["val_loss"]
    baseline_losses.to_csv("baseline_losses.csv")

    # Predicted probabilities for the test IDs, written into a copy of test_labels.
    submission_proba_predictions = model.predict(submission_predict_generator)
    baseline_labels = test_labels.copy()
    baseline_labels.loc[:, test_labels.drop(["ID", "Label"], axis=1).columns.values] = submission_proba_predictions
    baseline_labels.to_csv("baseline_submission_proba.csv")
# If you already have done a baseline fit once, 
# you can load predictions as csv and further fitting is not necessary:
else:
    baseline_proba_predictions = pd.read_csv("../input/protein-atlas-eab-predictions/baseline_predictions.csv", index_col=0)
    baseline_losses = pd.read_csv("../input/protein-atlas-eab-predictions/baseline_losses.csv", index_col=0)
    baseline_labels = pd.read_csv("../input/protein-atlas-eab-predictions/baseline_submission_proba.csv", index_col=0)

In [None]:
# Rows of the label table that belong to the validation side of the fold (copied), and their shape.
validation_labels = train_labels.loc[train_labels.ID.isin(partition["validation"])].copy()
validation_labels.shape

In [None]:
# Shape of the prediction table, for comparison with validation_labels.shape above.
baseline_proba_predictions.shape

In [None]:
# y_pred is only created in the next cell, so evaluating it here raises NameError on
# a fresh kernel; preview the predicted probabilities instead.
baseline_proba_predictions.head()

In [None]:
from sklearn.metrics import accuracy_score as accuracy

# Class columns of the validation rows as the ground truth.
y_true = validation_labels.drop(["ID", "Label", "number_of_targets"], axis=1).values
# Predictions: 1 where the predicted probability exceeds 0.5, otherwise 0.
y_pred = (baseline_proba_predictions.values > 0.5).astype(int)

# Accuracy over all individual (image, class) entries.
accuracy(y_true.flatten(), y_pred.flatten())

In [None]:
# Thresholded predictions of the first validation row.
y_pred[0]

In [None]:
# True class entries of the first validation row.
y_true[0]

In [None]:
# Left panel: distribution of all predicted probabilities (in %).
# Right panel: share of 0 and 1 entries among the true validation targets.
proba_predictions = baseline_proba_predictions.values
hot_values = validation_labels.drop(["ID", "Label", "number_of_targets"], axis=1).values.flatten()
n_entries = hot_values.shape[0]
n_positive = hot_values.sum()
one_hot = (n_positive) / n_entries * 100
zero_hot = (n_entries - n_positive) / n_entries * 100

fig, ax = plt.subplots(1,2, figsize=(20,5))
left_axis, right_axis = ax[0], ax[1]

sns.distplot(proba_predictions.flatten() * 100, color="DodgerBlue", ax=left_axis)
left_axis.set_xlabel("Probability in %")
left_axis.set_ylabel("Density")
left_axis.set_title("Predicted probabilities")

sns.barplot(x=["label = 0", "label = 1"], y=[zero_hot, one_hot], ax=right_axis)
right_axis.set_ylim([0,100])
right_axis.set_title("True target label count")
right_axis.set_ylabel("Percentage");