In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
from pathlib import Path
import fastai
from fastai.vision import *
import cv2
import torch
from torch.nn import Conv2d
from sklearn.model_selection import train_test_split
# Input data files are available in the "../input/" directory.
# For example, running this (by clicking run or pressing Shift+Enter) will list the files in the input directory

import os
print(os.listdir("../input"))

# Any results you write to the current directory are saved as output.

In [None]:
# Root directory of the competition dataset, relative to the notebook's working directory.
path = Path('../input') / 'recursion-cellular-image-classification'

In [None]:
# Load the training metadata table that ships with the dataset.
train_csv = path / 'train.csv'
tr_df = pd.read_csv(train_csv)

In [None]:
# Emit one metadata row per (well, site): the original rows are tagged site 1
# and a copy of them is tagged site 2, then both are stacked with a fresh index.
# NOTE: this cell is not idempotent -- re-running it without reloading tr_df
# doubles the rows again.
tr_df['site'] = 1
tr_df_copy = tr_df.assign(site=2)
# pd.concat replaces DataFrame.append, which is deprecated and was removed in pandas 2.0.
tr_df = pd.concat([tr_df, tr_df_copy], ignore_index=True)

In [None]:
# Show the column names of the training metadata (now including 'site').
tr_df.columns

In [None]:
def get_img_path(experiment, plate, well, site=1):
    """Return the image stem ``<experiment>/Plate<plate>/<well>_s<site>``.

    The result carries no channel suffix and no file extension.
    `experiment` and `well` must be strings; `plate` and `site` are
    converted to text.
    """
    plate_dir = 'Plate' + str(plate)
    file_stem = well + '_s{}'.format(site)
    return '/'.join((experiment, plate_dir, file_stem))

In [None]:
# Image stem (relative to the train folder) for every (experiment, plate, well, site) row.
tr_df['path'] = [
    get_img_path(experiment, plate, well, site)
    for experiment, plate, well, site in zip(tr_df.experiment, tr_df.plate, tr_df.well, tr_df.site)
]

In [None]:
# The classification target is the experiment name with its last three characters dropped.
tr_df['cellline'] = tr_df['experiment'].map(lambda name: name[:-3])

In [None]:
# Distinct cell-line names, in order of first appearance in tr_df (not sorted).
cell_lines = tr_df['cellline'].unique().tolist()

In [None]:
# Display the distinct cell-line names found in the training metadata.
cell_lines

In [None]:
class RecursionImageList(ImageList):
    """ImageList whose items are six-channel images stored as one PNG per channel."""

    def open(self, fn:PathOrStr) -> Image:
        """Load the six channel files for the image stem `fn` into one fastai `Image`.

        `fn` is the path without channel suffix; the files read are
        `<fn>_w1.png` ... `<fn>_w6.png`, each decoded as a grayscale plane.
        The planes are merged along the channel axis and divided by 255.0.

        Raises:
            FileNotFoundError: if any channel file is missing or cannot be decoded.
        """
        stem = str(fn)
        planes = []
        for channel in (1, 2, 3, 4, 5, 6):
            img_path = stem + '_w{}'.format(channel) + '.png'
            plane = cv2.imread(img_path, cv2.IMREAD_GRAYSCALE)
            # cv2.imread signals failure by returning None instead of raising;
            # fail here with the offending path rather than later inside cv2.merge.
            if plane is None:
                raise FileNotFoundError('Could not read image channel file: {}'.format(img_path))
            planes.append(plane)
        img = cv2.merge(planes)
        img = img / 255.0
        return Image(px=pil2tensor(img, np.float32))

In [None]:
# tr_df_small = tr_df.sample(n=120, random_state=42)

In [None]:
# Item list over every training row; image stems come from the 'path' column
# and are resolved relative to the dataset's train folder.
train_root = path/'train'
src = RecursionImageList.from_df(tr_df, path=train_root, cols=['path'])

In [None]:
# Hold out 5% of the rows for validation, stratified by cell line, with a fixed seed.
# NOTE(review): tr_df is rebound to the training subset here, so re-running this
# cell without re-running the cells above changes the split. Rows are split
# individually, so the two sites of one well can land on different sides.
df = tr_df
tr_df, va_df = train_test_split(df, test_size=0.05, random_state=42, stratify=df.cellline)
train_items = RecursionImageList.from_df(tr_df, path=path/'train', cols=['path'])
valid_items = RecursionImageList.from_df(va_df, path=path/'train', cols=['path'])
src = src.split_by_list(train_items, valid_items)

In [None]:
# src = src.split_subsets(train_size=0.001, valid_size=0.001)

In [None]:
# Attach labels taken from the 'cellline' column of the underlying dataframes.
src = src.label_from_df(cols=['cellline'])

In [None]:
# No augmentation transforms are passed; only a target size of 512 is requested.
# Batches hold 16 items and use pinned host memory.
sized_src = src.transform(size=512)
data = sized_src.databunch(bs=16, pin_memory=True)

In [None]:
# Display a summary of the assembled data bunch.
data

In [None]:
# Macro-averaged precision/recall metrics and the loss used for training.
precision = Precision(average="macro")
recall = Recall(average="macro")
loss_func = CrossEntropyFlat()

In [None]:
# ResNet-18 learner starting from pretrained weights; checkpoints are written
# to /kaggle/working.
learner_metrics = [accuracy, error_rate, precision, recall]
learn = cnn_learner(
    data,
    models.resnet18,
    pretrained=True,
    loss_func=loss_func,
    metrics=learner_metrics,
    model_dir='/kaggle/working',
)

In [None]:
# Inspect the first layer of the model body (the layer that is adapted to 6 channels below).
learn.model[0][0]

In [None]:
# Make the model accept the 6-channel images produced by RecursionImageList
# (the pretrained stem expects 3 channels). Every new input channel starts from
# the mean of the pretrained kernels over their input-channel axis.
N_INPUT_CHANNELS = 6
trained_kernel = learn.model[0][0].weight
new_conv = learn.model[0][0]
with torch.no_grad():
    widened_kernel = torch.mean(trained_kernel, 1, keepdim=True).repeat(1, N_INPUT_CHANNELS, 1, 1)
# The weight is swapped on the EXISTING conv module instead of assigning a fresh
# Conv2d to learn.model[0][0]. fastai v1 builds learn.layer_groups when the
# learner is created, and freeze()/unfreeze() and the optimizer work from those
# groups; a brand-new module would sit outside them and never be managed or
# trained. Kernel size, stride, padding and bias settings are inherited from the
# pretrained layer, and the layer keeps its current frozen/trainable state.
new_conv.weight = torch.nn.Parameter(widened_kernel, requires_grad=trained_kernel.requires_grad)
new_conv.in_channels = N_INPUT_CHANNELS
learn.model.cuda();

In [None]:
# !ls ../input/recursion-fast-ai-channel-2-4-resnet50

In [None]:
# learn.load('../input/recursion-fast-ai-channel-2-4-resnet50/stage-05');

In [None]:
# learn.unfreeze()

In [None]:
# Print the layer-by-layer summary of the (modified) model.
learn.summary()

In [None]:
# Find a good learning rate: sweep from 1e-5 to 1e-1 over 256 iterations,
# then plot the recorded curve with fastai's suggested value marked.
learn.lr_find(start_lr=1.0e-5, end_lr=1e-1, num_it=256)
learn.recorder.plot(suggestion=True)

In [None]:
# Shell escape: report current GPU status.
!nvidia-smi

In [None]:
# Train for 8 epochs at learning rate 3e-4.
learn.fit(8, 3e-4)

In [None]:
# Training diagnostics: loss curves, tracked metrics, and the learning-rate /
# momentum schedule recorded during fit.
learn.recorder.plot_losses()
learn.recorder.plot_metrics()
learn.recorder.plot_lr(show_moms=True)

In [None]:
# Checkpoint the model under the name 'stage-01' (inside the learner's model_dir).
learn.save('stage-01')

In [None]:
# Evaluate loss and metrics on the TRAINING dataloader rather than the validation one.
# NOTE(review): presumably meant as a train-vs-valid comparison; the training
# loader may shuffle or drop the last batch -- confirm this is intended.
learn.validate(dl=learn.data.train_dl)

# Prediction

In [None]:
# Test metadata: one image stem per site, plus the cell line derived from the
# experiment name (used further down to score the predictions).
te_df = pd.read_csv(path/'test.csv')
well_keys = list(zip(te_df.experiment, te_df.plate, te_df.well))
te_df['path1'] = [get_img_path(experiment, plate, well, site=1) for experiment, plate, well in well_keys]
te_df['path2'] = [get_img_path(experiment, plate, well, site=2) for experiment, plate, well in well_keys]
te_df['cellline'] = te_df['experiment'].map(lambda name: name[:-3])
# One item list per imaging site, both resolved against the test folder.
test_root = path/'test'
test_src1 = RecursionImageList.from_df(te_df, path=test_root, cols=['path1'])
test_src2 = RecursionImageList.from_df(te_df, path=test_root, cols=['path2'])

In [None]:
# Predict on the site-1 images: install them as the test set, then run inference.
learn.data.add_test(test_src1)
preds1, y = learn.get_preds(ds_type=DatasetType.Test)

In [None]:
# Predict on the site-2 images: swap them in as the test set, then run inference.
learn.data.add_test(test_src2)
preds2, y = learn.get_preds(ds_type=DatasetType.Test)

In [None]:
# Average the predictions of the two sites of each well.
preds = (preds1 + preds2) / 2

In [None]:
# Predicted class index per test row. Convert to a NumPy integer array explicitly
# instead of relying on pandas' implicit handling of a torch tensor, so the
# column holds plain integers that work as dictionary keys.
te_df['pred_cellline'] = preds.argmax(dim=1).numpy()

In [None]:
# Map predicted class index -> cell-line name. The indices produced by the model
# follow the learner's own class list (learn.data.classes), so the mapping is
# built from that list rather than from the first-appearance order of the
# training dataframe, which is not guaranteed to match.
cell_line_dict = dict(enumerate(learn.data.classes))

In [None]:
# Display the class-index -> cell-line mapping.
cell_line_dict

In [None]:
# Error rate on the test set: one minus the share of rows whose predicted
# cell line equals the cell line derived from the experiment name.
predicted_names = te_df['pred_cellline'].apply(lambda idx: cell_line_dict[idx])
is_correct = predicted_names == te_df['cellline']
1 - is_correct.mean()