In [0]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load in 

#import numpy as np # linear algebra
#import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the "../input/" directory.
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

#import os
#for dirname, _, filenames in os.walk('/kaggle/input'):
#    for filename in filenames:
#        print(os.path.join(dirname, filename))

# Any results you write to the current directory are saved as output.

This notebook is essentially a simplified, modified version of tinkertytonk's project, so credit goes to him.

Attach the competition data and make sure the GPU is enabled before running this notebook.

In [0]:
!pip install kaggle --upgrade -q

In [0]:
import gc
import os
from functools import partial
from pathlib import Path

import cv2
import numpy as np
import pandas as pd
# import jovian

import torch
import torch.nn as nn
import torch.nn.functional as F
import torchvision

from fastai import *
from fastai.vision import *
from fastai.metrics import accuracy, error_rate
from fastai.callbacks import *

# NOTE: this rebinds the name `Image` to the PIL module after the fastai star-import.
from PIL import Image
from tqdm.notebook import tqdm

Set up the Kaggle API key

In [0]:
# Credentials for the `kaggle` CLI used in the submission cells.
# Never store a real key in the notebook: values already present in the
# environment are kept, and the key is prompted for only when it is missing
# (an empty key would make the later `kaggle competitions submit` fail).
import getpass

os.environ.setdefault('KAGGLE_USERNAME', "skr1125")
if not os.environ.get('KAGGLE_KEY'):
    os.environ['KAGGLE_KEY'] = getpass.getpass('Kaggle API key: ')

In [0]:
# Print the kernel's current working directory.
!pwd

In [0]:
# List what is available under ../input (the relative paths below start from there).
!ls ../input

In [0]:
# Locations used throughout the notebook; everything under the competition
# folder is derived from PATH so the dataset root is spelled only once.
PATH = '../input/human-protein-atlas-image-classification/'
TRAIN = PATH + 'train/'
TEST = PATH + 'test/'
LABELS = PATH + 'train.csv'
# Working directory, later assigned to learn.model_dir.
path_working = Path('/kaggle') / 'working'

In [0]:
# File-name suffixes of the per-channel images (e.g. '<id>_green.png').
# The 4-channel list previously carried a stray trailing underscore on
# '_green_', which did not match the 3-channel list or the file naming.
channels4 = ['_yellow', '_red', '_green', '_blue']
channels3 = ['_red', '_green', '_blue']

In [0]:
# Human-readable names of the 28 target classes; a name's position in this
# tuple is the integer label used in the 'Target' column of train.csv.
_CLASS_NAMES = (
    'Nucleoplasm',
    'Nuclear membrane',
    'Nucleoli',
    'Nucleoli fibrillar center',
    'Nuclear speckles',
    'Nuclear bodies',
    'Endoplasmic reticulum',
    'Golgi apparatus',
    'Peroxisomes',
    'Endosomes',
    'Lysosomes',
    'Intermediate filaments',
    'Actin filaments',
    'Focal adhesion sites',
    'Microtubules',
    'Microtubule ends',
    'Cytokinetic bridge',
    'Mitotic spindle',
    'Microtubule organizing center',
    'Centrosome',
    'Lipid droplets',
    'Plasma membrane',
    'Cell junctions',
    'Mitochondria',
    'Aggresome',
    'Cytosol',
    'Cytoplasmic bodies',
    'Rods & rings',
)

# Mapping {class index -> class name}, keys 0..27 in ascending order.
index_class_dict = dict(enumerate(_CLASS_NAMES))

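Of the following two cells, the first one IS needed: it loads `train_df`, which is used later to build the training data source. The second cell is really NOT needed for visualizing or conducting the training. It is just an illustration of how to process the `Target` field into a one-hot vector of the structures present in each training image.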

In [0]:
# Read the training labels so that each image id and the structures it contains
# (stored as a string in the 'Target' field) can be inspected.
# NOTE: train_df is also used further down to build the training ImageList.
train_df = pd.read_csv(LABELS) # see LABELS definition above
train_df.head()

In [0]:
# Expand the space-separated 'Target' string into one 0/1 indicator column
# per class, each column named after the class.
for class_idx in range(28):
    class_id = str(class_idx)
    column_name = f'{index_class_dict[class_idx]}'

    def has_class(target, wanted=class_id):
        # 1 when the class id appears among the whitespace-separated labels.
        return 1 if wanted in target.strip().split() else 0

    train_df[column_name] = train_df['Target'].map(has_class)
train_df.head()

The cell below defines image openers for both the 3-channel (RGB) and the 4-channel (RGBY) case. I tried a simple 3-channel model first; the rest of this notebook uses the 4-channel opener.

In [0]:
# Suppose we only use the rgb values provided initially and IGNORE the y values, so only 3 channels 
# We will try 4 channels after that - remember to initialize y value with avg of other 3 NOT zeros
# import Fastai vision to get their Image class
from fastai.vision.image import *

# taken from : https://github.com/wdhorton/protein-atlas-fastai/blob/master/utils.py
# discussion : https://www.kaggle.com/c/human-protein-atlas-image-classification/discussion/71039
# adapted from https://www.kaggle.com/iafoss/pretrained-resnet34-with-rgby-0-460-public-lb
def _open_channels(fname, colors):
    """Load one grayscale PNG per color and combine them into a fastai Image.

    Args:
        fname: path (str or Path) of the sample; a trailing '.png' is stripped
            before the '_<color>.png' suffix of each channel is appended.
        colors: channel names, in the order they should appear in the result.

    Returns:
        A fastai ``Image`` built from the float32 channel stack.

    Raises:
        FileNotFoundError: if any channel file is missing or unreadable
            (``cv2.imread`` signals this by returning ``None``).
    """
    base = str(fname)
    # strip extension before adding color
    if base.endswith('.png'):
        base = base[:-4]

    planes = []
    for color in colors:
        channel_path = base + '_' + color + '.png'
        plane = cv2.imread(channel_path, cv2.IMREAD_GRAYSCALE)
        if plane is None:
            # Fail with the offending path instead of an opaque AttributeError.
            raise FileNotFoundError(f'Could not read channel image: {channel_path}')
        # scale 8-bit intensities to [0, 1]
        planes.append(plane.astype(np.float32) / 255)

    # Stack the channels along the last axis (height, width, channels);
    # pil2tensor is then expected to move channels first - TODO confirm.
    stacked = np.stack(planes, axis=-1)

    # `Image` must be fastai's Image class here, not the PIL module.
    return Image(pil2tensor(stacked, np.float32).float())


def open_3_channel(fname):
    """Open a sample as a 3-channel image (red, green, blue); yellow is ignored."""
    return _open_channels(fname, ['red', 'green', 'blue'])


def open_4_channel(fname):
    """Open a sample as a 4-channel image (red, green, blue, yellow)."""
    return _open_channels(fname, ['red', 'green', 'blue', 'yellow'])


Create the DataBunch. Here another approach could be to size the images to be 224 x 224 above and train the model first 
THEN train the model on 512 x 512 images and use that model to predict. 

In [0]:
# Batch size passed to src.databunch(...) further down.
bs=32
# Intended image size. NOTE(review): `size` is not referenced by any cell
# visible in this notebook - confirm whether a resize step is missing.
size=512

In [0]:
# read submission file to get the names of test images
test_df = pd.read_csv(PATH + 'sample_submission.csv')
test_df.head()

In [0]:
# Seed NumPy's global RNG before the random train/validation split below.
np.random.seed(230)

In [0]:
# Display the dataset root as a sanity check.
PATH

In [0]:
# Test items: ids come from the sample submission; each is resolved under
# PATH/test with a '.png' suffix appended.
test = ImageList.from_df(test_df, PATH, folder='test', suffix='.png')

In [0]:
# Training source: items from train_df, a random 80/20 train/validation split,
# multi-label targets parsed from the space-delimited 'Target' column, plus the
# test items. The seed is passed explicitly (same value as the np.random.seed
# cell) so re-running this cell alone reproduces the same split.
src = (ImageList.from_df(train_df, PATH, folder='train', suffix='.png')
                .split_by_rand_pct(0.2, seed=230)
                .label_from_df(cols='Target', label_delim=' ')
                .add_test(test))

In [0]:
# Make the training and validation item lists load images through the
# 4-channel opener instead of their default loader.
for labelled_split in (src.train, src.valid):
    labelled_split.x.create_func = open_4_channel
    labelled_split.x.open = open_4_channel

In [0]:
# Same opener override for the test items, so train, valid and test images
# are all read the same way.
test_items = src.test.x
test_items.create_func = open_4_channel
test_items.open = open_4_channel

In [0]:
# Per-channel normalization statistics for the 4-channel images.
# Presumably ordered red, green, blue, yellow to match open_4_channel - verify.
_channel_means = [0.08069, 0.05258, 0.05487, 0.08282]
_channel_stds = [0.13704, 0.10145, 0.15313, 0.13814]
protein_stats = (_channel_means, _channel_stds)


In [0]:
# Create the DataBunch with batch size `bs`, then normalize with the
# (mean, std) pair held in protein_stats.
data = src.databunch(bs=bs).normalize(protein_stats)

In [0]:
# Visual sanity check: show a 3-row grid of samples from a batch.
data.show_batch(rows=3, figsize=(12,9))

In [0]:
# Display data.c (presumably the number of target classes - verify).
data.c

The model needs to be modified to accept 4 channels. This follows the Kaggle kernel
https://www.kaggle.com/iafoss/pretrained-resnet34-with-rgby-0-460-public-lb
and implements a 4-channel modification of ResNet34. The weights for the first 3 input channels
are loaded from the pretrained model, and the starting weights for the Y channel are initialized with values from the other channels (different variants of this could be tried if so inclined).

In [0]:
class Resnet34_4(nn.Module):
    """ResNet-34 feature extractor that accepts 4-channel input.

    The first convolution is replaced by a 4-input-channel one. When `pre` is
    true, its first three input-channel filters are copied from the pretrained
    ResNet-34 and the fourth is initialized as the mean of the first and third
    pretrained input-channel filters. No classification head is included; the
    module returns the feature map produced by `layer4`.

    Args:
        pre: whether to load pretrained ResNet-34 weights.
    """
    def __init__(self, pre=True):
        super().__init__()
        encoder = models.resnet34(pretrained=pre)

        self.conv1 = nn.Conv2d(4, 64, kernel_size=7, stride=2, padding=3, bias=False)
        if pre:
            # Detach so the new parameter is built from plain values rather
            # than from a graph hanging off the encoder's parameter.
            w = encoder.conv1.weight.detach()
            fourth = 0.5 * (w[:, :1, :, :] + w[:, 2:, :, :])
            self.conv1.weight = nn.Parameter(torch.cat((w, fourth), dim=1))

        self.bn1 = encoder.bn1
        self.relu = nn.ReLU(inplace=True)
        self.maxpool = nn.MaxPool2d(kernel_size=3, stride=2, padding=1)
        # Stem order must match the pretrained network (conv -> bn -> relu ->
        # maxpool): bn1's pretrained statistics were computed on the raw conv
        # output, so applying ReLU before bn1 would feed it a different
        # distribution than it was trained on.
        self.layer0 = nn.Sequential(self.conv1, self.bn1, self.relu, self.maxpool)
        self.layer1 = encoder.layer1
        self.layer2 = encoder.layer2
        self.layer3 = encoder.layer3
        self.layer4 = encoder.layer4
        # the head will be added automatically by fast.ai

    def forward(self, x):
        """Run the stem and the four residual stages; return layer4's output."""
        x = self.layer0(x)
        x = self.layer1(x)
        x = self.layer2(x)
        x = self.layer3(x)
        x = self.layer4(x)

        return x

In [0]:
# Architecture handed to cnn_learner: the 4-channel ResNet-34 defined above
# (the stock 3-channel models.resnet34 is kept below for reference).
# arch = models.resnet34
arch = Resnet34_4

TODO: these metrics should be replaced by the metric defined in the competition specification.

In [0]:
# Multi-label accuracy with the decision threshold fixed at 0.2.
acc_02 = partial(accuracy_thresh, thresh=0.2)

In [0]:
# F-score with the decision threshold fixed at 0.2. fastai's fbeta defaults to
# beta=2 (F2); beta=1 is set explicitly because the competition is scored
# with F1. NOTE(review): fbeta's averaging may still differ from the
# competition's macro-averaged F1 - confirm before trusting absolute values.
f_score = partial(fbeta, thresh=0.2, beta=1)

In [0]:
# Build the learner with an explicit body cut. For an architecture fastai does
# not know, cnn_learner's default cut slices model.children() at the last child
# that contains a pooling layer and ignores forward(); for Resnet34_4 that
# would keep only the stem and silently drop layer1..layer4. The callable
# below returns the full feature extractor as an nn.Sequential instead.
learn = cnn_learner(
    data,
    arch,
    cut=lambda m: nn.Sequential(m.layer0, m.layer1, m.layer2, m.layer3, m.layer4),
    metrics=[acc_02, f_score],
)

The `model_dir` attribute of the Learner needs to be set to a full path that is writable, so that the `learn.save(...)` calls below can write the model checkpoints.

In [0]:
# Point the learner's model directory at the working directory.
learn.model_dir = path_working 

In [0]:
# Display the working directory path.
path_working

In [0]:
# NOTE(review): repeats the assignment made two cells above.
learn.model_dir = path_working

In [0]:
# Confirm the value now stored on the learner.
learn.model_dir

In [0]:
# Stage 1: run the learning-rate finder and plot its recording.
learn.lr_find()

In [0]:
learn.recorder.plot()

In [0]:
# Learning rate used for stage 1 and, divided by 5, as the upper bound in stage 2.
lr = 2e-2

In [0]:
# 10 epochs with the one-cycle policy.
learn.fit_one_cycle(10, slice(lr))

In [0]:
# Checkpoint after stage 1 (written under learn.model_dir).
learn.save('stage1-rn34-4ch')

In [0]:
# Stage 2: unfreeze the model before fine-tuning.
learn.unfreeze()

In [0]:
learn.lr_find()

In [0]:
learn.recorder.plot()

In [0]:
# training longer to better train y weights
learn.fit_one_cycle(10, slice(1e-5, lr/5))

In [0]:
# Checkpoint after stage 2.
learn.save('stage2-rn34-4ch')

In [0]:
# Predict on the test set; the second returned value is discarded.
preds, _ = learn.get_preds(DatasetType.Test)

In [0]:
# Inspect the type and shape of the predictions.
type(preds)

In [0]:
preds.shape

In [0]:
# Inspect the class list that prediction columns are mapped to below.
type(learn.data.classes)

In [0]:
len(learn.data.classes)

In [0]:
# Convert each prediction row into a space-separated string of the class
# labels whose score is strictly above the threshold.
thresh = 0.2
labelled_preds = []
for pred in preds:
    chosen = []
    for class_pos, score in enumerate(pred):
        if score > thresh:
            chosen.append(learn.data.classes[class_pos])
    labelled_preds.append(' '.join(chosen))

In [0]:
# Sanity checks: number of label strings and the first few of them.
len(labelled_preds)

In [0]:
labelled_preds[:5]

In [0]:
# Look at how a test item is stored and at its stem (file name without extension).
learn.data.test_ds.items[0]

In [0]:
Path(learn.data.test_ds.items[0]).stem

In [0]:
# converting image path strings to the file name only with no extension
fnames = [Path(f).stem for f in learn.data.test_ds.items]

In [0]:
# Build the submission in the row order of sample_submission.csv.
sample_list = list(test_df.Id)
# sample_list[:5]
# Map each test image id to its predicted label string.
pred_dict = dict(zip(fnames, labelled_preds))
pred_list_cor = [pred_dict[image_id] for image_id in sample_list]
# The id column must be spelled exactly as in sample_submission.csv ('Id'),
# not 'ID', so the submission header matches the expected one.
df = pd.DataFrame({'Id': sample_list, 'Predicted': pred_list_cor})
df.to_csv('protein_classification.csv', header=True, index=False)

In [0]:
# Preview the first rows of the submission frame.
df.head()

In [0]:
# Submit
# Uploads protein_classification.csv with the Kaggle CLI; uses the
# KAGGLE_USERNAME / KAGGLE_KEY environment variables set near the top.
# NOTE(review): "Message" is a placeholder submission description.
!kaggle competitions submit -c human-protein-atlas-image-classification -f protein_classification.csv -m "Message"

# View results
# The submission listing is redirected into results.txt.
!kaggle competitions submissions -c human-protein-atlas-image-classification > results.txt

In [0]:
# Check that results.txt was written.
!ls results.txt

In [0]:
# Show the contents of results.txt.
!more results.txt