# About this kernel

This is a rather quick-and-dirty kernel I created with two ideas in mind: training a "2-headed" network that learns to predict the siRNA using images from both sites at the same time, and splitting the learning process into two stages, namely first training on all data, then training the CNN on data from a single cell line at a time. The second idea comes from [this thread by Phalanx](https://www.kaggle.com/c/recursion-cellular-image-classification/discussion/100414#latest-586901). The data comes from my previous kernel on preprocessing.

Here are the relevant sections:
* **Data Generator**: The `__generate_X` method differs from the usual implementation, since it loads two images (one per site) at the same time. Everything else is standard.
* **Model**: The CNN architecture used here is `EfficientNetB0`. With the right learning rates and enough time, you can probably try B1-B5; they have unfortunately not succeeded in my case. The inputs are two images, i.e. from site 1 and site 2. The two images are passed through the same CNN, then global-average-pooled, and added to form a single 1280-dimensional vector, which is ultimately used to perform predictions. This means that the networks will be updated simultaneously from the gradients of both sites.
* **Phase 1**: Train the model on all data for 10 epochs, and save the best checkpoint to `model.h5`.
* **Phase 2**: Load `model.h5` and train the model for `epochs_1` epochs (set in the configuration cell below) on data from a single cell line at a time, i.e. *HEPG2, HUVEC, RPE, U2OS*.





In [None]:
!pip install efficientnet
import efficientnet

In [None]:
# Training length per stage: `epochs_0` is used for the phase-1 run on all
# data, `epochs_1` for each per-cell-line fine-tuning run in phase 2.
# NOTE(review): the notebook introduction mentions 15 epochs for phase 2 --
# confirm which value is intended.
epochs_0, epochs_1 = 10, 28

In [None]:
import json
import math
import os

import cv2
import tensorflow as tf
from PIL import Image
import numpy as np
import keras
from keras import layers
from keras.applications import MobileNetV2
from keras.callbacks import Callback, ModelCheckpoint
from keras.preprocessing.image import ImageDataGenerator
from keras.models import Model, load_model
from keras.layers import GlobalAveragePooling2D, Dense, Dropout, BatchNormalization, concatenate, Input, add
from keras.optimizers import Adam
import matplotlib.pyplot as plt
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import cohen_kappa_score, accuracy_score
import scipy
from tqdm import tqdm

# Preprocessing

In [None]:
# Load the competition metadata tables.
train_df = pd.read_csv('../input/recursion-cellular-image-classification/train.csv')
test_df = pd.read_csv('../input/recursion-cellular-image-classification/test.csv')

# The part of the experiment name before the first '-' is stored as the
# `category` column; phase 2 groups the data by this value.
for frame in (train_df, test_df):
    frame['category'] = frame['experiment'].map(lambda name: name.split('-')[0])

# One-hot encode the siRNA label: one column per distinct `sirna` value.
train_target_df = pd.get_dummies(train_df['sirna'])

for frame in (train_df, test_df, train_target_df):
    print(frame.shape)

train_df.head()

In [None]:
# Hold out 15% of the training rows for validation (fixed seed so the split
# is reproducible between runs).
train_idx, val_idx = train_test_split(train_df.index, random_state=1337, test_size=0.15)

for split_index in (train_idx, val_idx):
    print(split_index.shape)

# Data Generator

In [None]:
class DataGenerator(keras.utils.Sequence):
    """Keras ``Sequence`` yielding batches of paired site-1 / site-2 images.

    Each sample is identified by a row position into ``df`` (and ``target_df``);
    rows are accessed with ``.iloc``, so the IDs in ``list_IDs`` only coincide
    with index labels when the dataframes carry a default ``RangeIndex``.

    Parameters
    ----------
    list_IDs : sequence of int
        Row positions of the samples served by this generator.
    df : pandas.DataFrame
        Metadata table; must contain an ``id_code`` column used to build the
        image file names ``{base_path}/{id_code}_s1.jpeg`` and ``..._s2.jpeg``.
    target_df : pandas.DataFrame, optional
        One-hot targets, row-aligned with ``df``. Required when ``mode='fit'``.
    mode : {'fit', 'predict'}
        ``'fit'`` yields ``([X_1, X_2], y)``; ``'predict'`` yields ``[X_1, X_2]``.
    base_path : str
        Directory containing the image files.
    batch_size : int
        Number of samples per batch.
    dim : tuple of int
        Spatial size (height, width) of the stored images.
    n_channels : int
        Number of channels of the stored images.
    n_classes : int
        Number of target classes (kept for interface compatibility).
    random_state : int
        Seed of the generator's private shuffling RNG.
    shuffle : bool
        Whether to reshuffle the sample order at the start of every epoch.
    """

    def __init__(self, list_IDs, df, target_df=None, mode='fit',
                 base_path = '../input/recursion-cellular-image-classification-224-jpg/train/train',
                 batch_size=32, dim=(224, 224), n_channels=3,
                 n_classes=5, random_state=1337, shuffle=True):
        super().__init__()
        self.dim = dim
        self.batch_size = batch_size
        self.df = df
        self.mode = mode
        self.base_path = base_path
        self.target_df = target_df
        self.list_IDs = list_IDs
        self.n_channels = n_channels
        self.n_classes = n_classes
        self.shuffle = shuffle
        self.random_state = random_state

        # Private RNG: seeded once so the run is reproducible, but each epoch
        # still gets a *different* permutation, and the global NumPy RNG state
        # is left untouched. (Re-seeding on every epoch would replay the very
        # same order each time.)
        self._rng = np.random.RandomState(random_state)

        self.on_epoch_end()

    def __len__(self):
        """Return the number of batches per epoch.

        In ``'predict'`` mode the count is rounded up so that every sample
        receives a prediction; otherwise the trailing partial batch is dropped.
        """
        n_samples = len(self.list_IDs)
        if self.mode == 'predict':
            return int(np.ceil(n_samples / self.batch_size))
        return int(np.floor(n_samples / self.batch_size))

    def __getitem__(self, index):
        """Generate batch number ``index``.

        Raises
        ------
        AttributeError
            If ``mode`` is neither ``'fit'`` nor ``'predict'``.
        """
        # Validate first so no images are loaded for an invalid configuration.
        if self.mode not in ('fit', 'predict'):
            raise AttributeError('The parameter mode should be set to "fit" or "predict".')

        start = index * self.batch_size
        batch_positions = self.indexes[start:start + self.batch_size]
        list_IDs_batch = [self.list_IDs[k] for k in batch_positions]

        X = self.__generate_X(list_IDs_batch)
        if self.mode == 'predict':
            return X
        return X, self.__generate_y(list_IDs_batch)

    def on_epoch_end(self):
        """Rebuild the sample order, reshuffling it when ``shuffle`` is set."""
        self.indexes = np.arange(len(self.list_IDs))
        if self.shuffle:
            self._rng.shuffle(self.indexes)

    def __generate_X(self, list_IDs_batch):
        """Load the site-1 and site-2 images of a batch.

        Returns a list ``[X_1, X_2]`` of float32 arrays shaped
        ``(len(list_IDs_batch), *dim, n_channels)``.
        """
        # Size the buffers by the actual batch length so a short final batch
        # never exposes uninitialised rows; float32 matches the loaded images.
        shape = (len(list_IDs_batch), *self.dim, self.n_channels)
        X_1 = np.empty(shape, dtype=np.float32)
        X_2 = np.empty(shape, dtype=np.float32)

        ext = 'jpeg'
        codes = self.df['id_code']

        for i, ID in enumerate(list_IDs_batch):
            code = codes.iloc[ID]
            X_1[i,] = self.__load_image(f"{self.base_path}/{code}_s1.{ext}")
            X_2[i,] = self.__load_image(f"{self.base_path}/{code}_s2.{ext}")

        return [X_1, X_2]

    def __generate_y(self, list_IDs_batch):
        """Return the integer target rows of ``target_df`` for a batch."""
        return self.target_df.iloc[list_IDs_batch].values.astype(int)

    def __load_image(self, img_path):
        """Read an image as an RGB float32 array scaled to [0, 1].

        Raises
        ------
        FileNotFoundError
            If OpenCV cannot read ``img_path`` (``cv2.imread`` returned ``None``).
        """
        img = cv2.imread(img_path)
        if img is None:
            raise FileNotFoundError(f'Could not read image: {img_path}')
        # OpenCV loads images in BGR channel order; convert to RGB.
        img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)
        return img.astype(np.float32) / 255.

In [None]:
BATCH_SIZE = 32

# Settings shared by the two generators that yield (images, targets) batches.
fit_generator_kwargs = dict(
    df=train_df,
    target_df=train_target_df,
    batch_size=BATCH_SIZE,
    n_classes=train_target_df.shape[1],
)

train_generator = DataGenerator(train_idx, **fit_generator_kwargs)
val_generator = DataGenerator(val_idx, **fit_generator_kwargs)

# Test images are served one at a time, unshuffled, without targets.
test_generator = DataGenerator(
    test_df.index,
    df=test_df,
    mode='predict',
    shuffle=False,
    batch_size=1,
    base_path='../input/recursion-cellular-image-classification-224-jpg/test/test/',
    n_classes=train_target_df.shape[1],
)

# Model

In [None]:
def build_model(n_classes, input_shape=(224, 224, 3), learning_rate=0.0001):
    """Build and compile the two-input siRNA classifier.

    Both input images are passed through the *same* backbone instance (shared
    weights), global-average-pooled, summed element-wise, regularised with
    dropout and classified by a softmax layer.

    Parameters
    ----------
    n_classes : int
        Number of output classes of the softmax layer.
    input_shape : tuple of int, optional
        Shape (height, width, channels) of each of the two image inputs.
    learning_rate : float, optional
        Learning rate passed to the Adam optimizer.

    Returns
    -------
    keras.models.Model
        Model taking ``[image_site_1, image_site_2]``, compiled with
        categorical cross-entropy loss and the accuracy metric.
    """
    # Shared feature extractor: EfficientNetB1 with ImageNet weights and no
    # classification head.
    # NOTE(review): the notebook introduction mentions EfficientNetB0 --
    # confirm which variant is intended.
    backbone = efficientnet.EfficientNetB1(
        weights='imagenet',
        include_top=False,
        input_shape=input_shape
    )

    im_inp_1 = Input(shape=input_shape)
    im_inp_2 = Input(shape=input_shape)

    # Calling the same backbone object twice shares its weights between sites.
    x1 = GlobalAveragePooling2D()(backbone(im_inp_1))
    x2 = GlobalAveragePooling2D()(backbone(im_inp_2))

    # Merge the two site embeddings by element-wise addition.
    out = add([x1, x2])
    out = Dropout(0.5)(out)
    out = Dense(n_classes, activation='softmax')(out)

    model = Model(inputs=[im_inp_1, im_inp_2], outputs=out)
    model.compile(Adam(learning_rate), loss='categorical_crossentropy', metrics=['accuracy'])

    return model

# Phase 1: Train on all data

In [None]:
# One output unit per one-hot target column.
model = build_model(n_classes=len(train_target_df.columns))
model.summary()

In [None]:
# Keep only the best full model (lowest validation loss) on disk.
checkpoint = ModelCheckpoint(
    filepath='model.h5',
    monitor='val_loss',
    mode='auto',
    save_best_only=True,
    save_weights_only=False,
    verbose=0
)

# Phase 1: train on the full training split, single-process data loading.
history = model.fit_generator(
    train_generator,
    epochs=epochs_0,
    validation_data=val_generator,
    callbacks=[checkpoint],
    workers=1,
    use_multiprocessing=False
)

In [None]:
# Persist the phase-1 training curves. `default=float` converts NumPy scalar
# metrics (which the json module cannot serialise natively) to Python floats.
with open('history.json', 'w') as f:
    json.dump(history.history, f, default=float)

history_df = pd.DataFrame(history.history)
history_df[['loss', 'val_loss']].plot()

# The accuracy metric is logged as 'acc' by older Keras releases and as
# 'accuracy' by newer ones; use whichever key is present.
acc_key = 'acc' if 'acc' in history_df.columns else 'accuracy'
history_df[[acc_key, f'val_{acc_key}']].plot()

# Phase 2: train on each cell line

In [None]:
# Phase 2: starting from the phase-1 checkpoint, fine-tune the model on one
# category at a time and predict that category's test rows.
categories = train_df['category'].unique()
output_df = []

for category in categories:
    # Retrieve desired category
    category_df = train_df[train_df['category'] == category]
    cat_test_df = test_df[test_df['category'] == category].copy()

    print('\n' + '=' * 40)
    print("CURRENT CATEGORY:", category)
    print('-' * 40)

    train_idx, val_idx = train_test_split(
        category_df.index,
        random_state=1337,
        test_size=0.15
    )

    # Create new generators. The indices come from the filtered frame but are
    # resolved against the full dataframes, so `df`/`target_df` stay unfiltered.
    train_generator = DataGenerator(
        train_idx,
        df=train_df,
        target_df=train_target_df,
        batch_size=BATCH_SIZE,
        n_classes=train_target_df.shape[1]
    )

    val_generator = DataGenerator(
        val_idx,
        df=train_df,
        target_df=train_target_df,
        batch_size=BATCH_SIZE,
        n_classes=train_target_df.shape[1]
    )

    test_generator = DataGenerator(
        cat_test_df.index,
        df=test_df,
        batch_size=1,
        shuffle=False,
        mode='predict',
        n_classes=train_target_df.shape[1],
        base_path='../input/recursion-cellular-image-classification-224-jpg/test/test/'
    )

    # Restore the phase-1 model; recompiling gives each category a fresh
    # optimizer.
    model.load_weights('model.h5')
    model.compile(
        Adam(0.0001),
        loss='categorical_crossentropy',
        metrics=['accuracy']
    )

    # Train model only on data for specific category, keeping the epoch with
    # the lowest validation loss on disk.
    best_model_path = f'model_{category}.h5'
    checkpoint = ModelCheckpoint(
        best_model_path,
        monitor='val_loss',
        verbose=0,
        save_best_only=True,
        save_weights_only=False,
        mode='auto'
    )

    history_category = model.fit_generator(
        train_generator,
        validation_data=val_generator,
        callbacks=[checkpoint],
        use_multiprocessing=False,
        workers=1,
        verbose=2,
        epochs=epochs_1
    )

    # Predict with the best checkpoint rather than the last-epoch weights,
    # which may already be overfitted. The existence check keeps the last-epoch
    # weights as a fallback if no checkpoint was written.
    if os.path.exists(best_model_path):
        model.load_weights(best_model_path)

    # Make prediction and add to output dataframe
    y_pred = model.predict_generator(
        test_generator,
        workers=2,
        use_multiprocessing=True,
        verbose=1
    )

    cat_test_df['sirna'] = y_pred.argmax(axis=1)
    output_df.append(cat_test_df[['id_code', 'sirna']])

    # Save history
    with open(f'history_{category}.json', 'w') as f:
        json.dump(history_category.history, f)

# Submission

In [None]:
# Stack the per-category prediction frames into one table and write it out
# without the index column.
output_df = pd.concat(objs=output_df)
output_df.to_csv(path_or_buf='submission.csv', index=False)