In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
from pathlib import Path
import fastai
from fastai.vision import *
import cv2
import torch
from torch.nn import Conv2d
from sklearn.model_selection import train_test_split
# Input data files are available in the "../input/" directory.
# For example, running this (by clicking run or pressing Shift+Enter) will list the files in the input directory

import os
# Sanity check: show the entries available under ../input.
print(os.listdir("../input"))

# Any results you write to the current directory are saved as output.

In [None]:
# Root directory of the competition data; the CSV files and the train/test image
# folders used below are all resolved relative to it.
path = Path('../input/recursion-cellular-image-classification')

In [None]:
# Load the training metadata and stack the control-well metadata underneath it.
# pd.concat is used instead of DataFrame.append, which was deprecated in
# pandas 1.4 and removed in pandas 2.0; the result is the same on older versions.
tr_df = pd.concat(
    [pd.read_csv(path/'train.csv'), pd.read_csv(path/'train_controls.csv')],
    ignore_index=True,
    sort=False,
)

In [None]:
# Duplicate every metadata row so that there is one row with site=1 and one with
# site=2. NOTE: the cell rebinds tr_df, so running it twice duplicates the rows again.
tr_df['site'] = 1
tr_df_copy = tr_df.assign(site=2)
# pd.concat instead of DataFrame.append (removed in pandas 2.0).
tr_df = pd.concat([tr_df, tr_df_copy], ignore_index=True)

In [None]:
# Inspect the column names of the combined training metadata.
tr_df.columns

In [None]:
def get_img_path(experiment, plate, well, site=1):
    """Build the image path stem '<experiment>/Plate<plate>/<well>_s<site>'.

    The returned string carries no channel suffix and no file extension.
    """
    plate_dir = 'Plate' + str(plate)
    file_stem = well + '_s{}'.format(site)
    return '/'.join((experiment, plate_dir, file_stem))

In [None]:
# Image path stem for every metadata row (later resolved against the train image folder).
tr_df['path'] = [
    get_img_path(experiment, plate, well, site)
    for experiment, plate, well, site in zip(
        tr_df['experiment'], tr_df['plate'], tr_df['well'], tr_df['site']
    )
]

In [None]:
# Cell line = experiment id with its last three characters removed
# (an id such as 'HEPG2-01' would give 'HEPG2').
# Vectorized string slice instead of a Python-level apply over every row.
tr_df['cellline'] = tr_df['experiment'].str[:-3]

In [None]:
# Distinct cell lines in order of first appearance; this order also fixes the
# order of every per-cell-line list built below.
cell_lines = tr_df['cellline'].unique().tolist()

In [None]:
# Display the list of cell lines.
cell_lines

In [None]:
# Create one sub-directory per cell line in the current working directory.
# The learners below use '/kaggle/working/<cell_line>' as model_dir; presumably
# the working directory of this notebook is /kaggle/working (confirm if run elsewhere).
# exist_ok=True keeps the cell re-runnable, unlike `!mkdir`, which errors when
# the directory is already there.
for cell_line in cell_lines:
    os.makedirs(cell_line, exist_ok=True)

In [None]:
# List the working directory to confirm the per-cell-line folders exist.
!ls

In [None]:
# One training frame per cell line, in the same order as cell_lines.
tr_dfs = [tr_df.loc[tr_df['cellline'] == cell_line] for cell_line in cell_lines]

In [None]:
# Number of training rows per cell line.
list(map(len, tr_dfs))

In [None]:
class RecursionImageList(ImageList):
    """ImageList whose items are 6-channel images stored as one grayscale PNG per channel."""

    def open(self, fn:PathOrStr) -> Image:
        """Load '<fn>_w1.png' ... '<fn>_w6.png' and merge them into one 6-channel Image.

        Pixel values are divided by 255.0 before conversion to a float32 tensor.

        Raises:
            FileNotFoundError: if any of the six channel files cannot be read.
        """
        fn = str(fn)
        imgs = []
        for channel in range(1, 7):
            img_path = fn + '_w{}'.format(channel) + '.png'
            img = cv2.imread(img_path, cv2.IMREAD_GRAYSCALE)
            # cv2.imread reports failure by returning None instead of raising;
            # fail here with the offending path rather than later inside cv2.merge.
            if img is None:
                raise FileNotFoundError('Could not read image file: ' + img_path)
            imgs.append(img)
        img = cv2.merge(imgs)
        img = img / 255.0
        return Image(px=pil2tensor(img, np.float32))

In [None]:
# Number of distinct sirna labels over all cell lines (controls included, since
# the control metadata was appended to tr_df).
num_classes = tr_df['sirna'].nunique(dropna=False)

In [None]:
# Display the number of classes.
num_classes

In [None]:
# One item list per cell line, built from the 'path' column and resolved
# against the train image folder.
srcs = list(map(
    lambda cell_line_df: RecursionImageList.from_df(cell_line_df, path=path/'train', cols=['path']),
    tr_dfs,
))

In [None]:
def split_ds(src, tr_df):
    """Split ``src`` into train/validation parts with a stratified hold-out of ``tr_df``.

    The validation part gets 5% of the rows of ``tr_df`` but never fewer than
    ``num_classes`` rows. Sampling is stratified on the ``sirna`` column and uses
    a fixed random_state of 42. Returns the result of ``src.split_by_list``.
    """
    n_valid = max(num_classes, int(len(tr_df) * 0.05))
    train_rows, valid_rows = train_test_split(
        tr_df, test_size=n_valid, random_state=42, stratify=tr_df.sirna
    )
    # Build the two item lists the same way the source lists were built.
    train_items, valid_items = (
        RecursionImageList.from_df(rows, path=path/'train', cols=['path'])
        for rows in (train_rows, valid_rows)
    )
    return src.split_by_list(train_items, valid_items)

In [None]:
# Split every cell line's item list into train/validation.
# NOTE: srcs is rebound here, so this cell must run exactly once after srcs is built.
srcs = [split_ds(src, cell_line_df) for src, cell_line_df in zip(srcs, tr_dfs)]

In [None]:
# Attach labels taken from the 'sirna' column to every split item list.
srcs = list(map(lambda split_src: split_src.label_from_df(cols=['sirna']), srcs))

In [None]:
# One DataBunch per cell line: size=512, batch size 32, pinned host memory.
data = [
    labelled_src.transform(size=512).databunch(bs=32, pin_memory=True)
    for labelled_src in srcs
]

In [None]:
# Display a summary of each DataBunch.
data

In [None]:
# Macro-averaged precision/recall metric objects.
# NOTE(review): neither is passed to the learners in the cells visible here
# (they are created with metrics=[accuracy, error_rate]) — confirm whether that is intended.
precision = Precision(average="macro")
recall = Recall(average="macro")
# Loss shared by all learners.
loss_func = CrossEntropyFlat()

In [None]:
# One pretrained resnet18 learner per cell line; each saves its weights under
# its own '/kaggle/working/<cell_line>' directory.
learners = [
    cnn_learner(cell_line_data, models.resnet18, metrics=[accuracy, error_rate],
                loss_func=loss_func,
                model_dir='/kaggle/working/' + cell_line, pretrained=True)
    for cell_line_data, cell_line in zip(data, cell_lines)
]

In [None]:
# Show the first layer of the first learner's model (the layer replaced in the next cell).
learners[0].model[0][0]

In [None]:
# Make each model accept the 6-channel images: replace the first conv layer with a
# 6-input-channel conv of the same geometry. Every input-channel slice of the new
# kernel is initialised to the mean of the pretrained kernel over its input channels.
# NOTE(review): the learner was created before this swap; confirm that fastai's layer
# groups / optimizer pick up the new layer if the body is ever unfrozen.
for learn in learners:
    trained_kernel = learn.model[0][0].weight
    new_conv = Conv2d(6, 64, kernel_size=(7, 7), stride=(2, 2), padding=(3, 3), bias=False)
    with torch.no_grad():
        mean_kernel = trained_kernel.mean(dim=1, keepdim=True)
        new_conv.weight.copy_(mean_kernel.expand_as(new_conv.weight))
    # Place the new layer on the device the pretrained weights already live on
    # instead of hard-coding CUDA, so the cell also works in a CPU-only session.
    learn.model[0][0] = new_conv.to(trained_kernel.device)
#     learn.load('../../input/recursion-fast-ai-channel-2-4-resnet50/stage-04');

In [None]:
# validation_results = []
# for learn in learners:
#     validation_results.append(learn.validate())

In [None]:
# validation_results

In [None]:
# Index (into learners / cell_lines) of the model trained in the next cells.
i = 0

In [None]:
# # Find a good learning rate
# learners[i].lr_find(start_lr=1.0e-5, end_lr=1e-1, num_it=256)
# learners[i].recorder.plot(suggestion=True)

In [None]:
# !nvidia-smi

In [None]:
# One-cycle training of the selected learner: 10 epochs, maximum learning rate 2e-3.
learners[i].fit_one_cycle(10, 2e-3)

In [None]:
# Training diagnostics for the learner just fitted, then a checkpoint named 'stage-01'.
# Each learner was given its own model_dir, so the shared name does not collide.
learners[i].recorder.plot_losses()
learners[i].recorder.plot_metrics()
learners[i].recorder.plot_lr(show_moms=True)
learners[i].save('stage-01')

In [None]:
# Index (into learners / cell_lines) of the model trained in the next cells.
i = 1

In [None]:
# # Find a good learning rate
# learners[i].lr_find(start_lr=1.0e-5, end_lr=1e-1, num_it=256)
# learners[i].recorder.plot(suggestion=True)

In [None]:
# One-cycle training of the selected learner: 10 epochs, maximum learning rate 5e-3.
learners[i].fit_one_cycle(10, 5e-3)

In [None]:
# Training diagnostics for the learner just fitted, then a checkpoint named 'stage-01'.
# Each learner was given its own model_dir, so the shared name does not collide.
learners[i].recorder.plot_losses()
learners[i].recorder.plot_metrics()
learners[i].recorder.plot_lr(show_moms=True)
learners[i].save('stage-01')

In [None]:
# Index (into learners / cell_lines) of the model trained in the next cells.
i = 2

In [None]:
# # Find a good learning rate
# learners[i].lr_find(start_lr=1.0e-4, end_lr=1e-1, num_it=256)
# learners[i].recorder.plot(suggestion=True)

In [None]:
# One-cycle training of the selected learner: 10 epochs, maximum learning rate 5e-3.
learners[i].fit_one_cycle(10, 5e-3)

In [None]:
# Training diagnostics for the learner just fitted, then a checkpoint named 'stage-01'.
# Each learner was given its own model_dir, so the shared name does not collide.
learners[i].recorder.plot_losses()
learners[i].recorder.plot_metrics()
learners[i].recorder.plot_lr(show_moms=True)
learners[i].save('stage-01')

In [None]:
# Index (into learners / cell_lines) of the model trained in the next cells.
i = 3

In [None]:
# # Find a good learning rate
# learners[i].lr_find(start_lr=1.0e-4, end_lr=1e-1, num_it=256)
# learners[i].recorder.plot(suggestion=True)

In [None]:
# One-cycle training of the selected learner: 10 epochs, maximum learning rate 4e-3.
learners[i].fit_one_cycle(10, 4e-3)

In [None]:
# Training diagnostics for the learner just fitted, then a checkpoint named 'stage-01'.
# Each learner was given its own model_dir, so the shared name does not collide.
learners[i].recorder.plot_losses()
learners[i].recorder.plot_metrics()
learners[i].recorder.plot_lr(show_moms=True)
learners[i].save('stage-01')

# Prediction

In [None]:
# Test metadata: one row per well, with the image path stem of each of the two sites.
te_df = pd.read_csv(path/'test.csv')
te_df['path1'] = te_df.apply(lambda row : get_img_path(row.experiment, row.plate, row.well, site=1), axis=1)
te_df['path2'] = te_df.apply(lambda row : get_img_path(row.experiment, row.plate, row.well, site=2), axis=1)
# Cell line = experiment id without its last three characters (vectorized slice).
te_df['cellline'] = te_df['experiment'].str[:-3]
# One frame per cell line, in cell_lines order. The prediction cell later adds a
# column to each of these frames, so take explicit copies: writing into a bare
# slice of te_df triggers pandas' SettingWithCopyWarning.
te_dfs = [te_df.loc[te_df['cellline'] == cell_line].copy() for cell_line in cell_lines]

In [None]:
# Cell lines present in the test metadata (compare against cell_lines from training).
te_df['cellline'].unique()

In [None]:
# Number of test rows per cell line.
list(map(len, te_dfs))

In [None]:
# Per-cell-line test item lists for the 'path1' column (site 1 images).
test_srcs1 = list(map(
    lambda cell_line_df: RecursionImageList.from_df(cell_line_df, path=path/'test', cols=['path1']),
    te_dfs,
))

In [None]:
# Per-cell-line test item lists for the 'path2' column (site 2 images).
test_srcs2 = list(map(
    lambda cell_line_df: RecursionImageList.from_df(cell_line_df, path=path/'test', cols=['path2']),
    te_dfs,
))

In [None]:
# Predict every cell line's test wells with that cell line's own learner and
# write the predicted label into the matching frame of te_dfs.
# The loop variable is deliberately not called te_df, so the full test frame
# is not shadowed by the last per-cell-line frame.
for cell_line, learn, test_src1, test_src2, te_cell_line_df in zip(cell_lines, learners, test_srcs1, test_srcs2, te_dfs):
    print(cell_line)
    learn.data.add_test(test_src1)
    preds1, _ = learn.get_preds(DatasetType.Test)
    learn.data.add_test(test_src2)
    preds2, _ = learn.get_preds(DatasetType.Test)
    # Average the predictions of the two sites of each well.
    preds = 0.5 * (preds1 + preds2)
    # argmax yields a position in the learner's class list, not the sirna label
    # itself; map it back through data.classes so the label stays correct even if
    # a cell line's training data does not contain every sirna value.
    class_idx = preds.argmax(1).cpu().numpy()
    te_cell_line_df['sirna'] = np.asarray(learn.data.classes)[class_idx]

In [None]:
# Reassemble the per-cell-line frames (now carrying predictions) into one frame.
# Rows end up grouped by cell line rather than in the original test.csv order.
te_df = pd.concat(te_dfs, ignore_index=True)

In [None]:
# Preview the first rows of the prediction frame.
te_df.head()

In [None]:
# Write the submission file with only the id and predicted label columns, no index.
te_df[['id_code', 'sirna']].to_csv('submission.csv', index=False)