### Investigate & Exploit Plate Leak

In [1]:
import os

import numpy as np
import pandas as pd

from fastai.vision import *
from fastai.metrics import *

In [2]:
from pathlib import Path

# Root directory holding train.csv / test.csv. The original machine-specific
# location stays the default; set CELL_DATA_DIR to run the notebook elsewhere.
DATA = Path(os.environ.get("CELL_DATA_DIR", "/mnt/disk4/cell/"))

In [15]:
# Load the train and test metadata tables, then preview the first training rows.
train_df, test_df = (
    pd.read_csv(DATA / csv_name) for csv_name in ("train.csv", "test.csv")
)
train_df.head(10)

Unnamed: 0,id_code,experiment,plate,well,sirna
0,HEPG2-01_1_B03,HEPG2-01,1,B03,513
1,HEPG2-01_1_B04,HEPG2-01,1,B04,840
2,HEPG2-01_1_B05,HEPG2-01,1,B05,1020
3,HEPG2-01_1_B06,HEPG2-01,1,B06,254
4,HEPG2-01_1_B07,HEPG2-01,1,B07,144
5,HEPG2-01_1_B08,HEPG2-01,1,B08,503
6,HEPG2-01_1_B09,HEPG2-01,1,B09,188
7,HEPG2-01_1_B10,HEPG2-01,1,B10,700
8,HEPG2-01_1_B11,HEPG2-01,1,B11,1100
9,HEPG2-01_1_B12,HEPG2-01,1,B12,611


In [5]:
# Largest and smallest siRNA label present in the training table.
(train_df["sirna"].max(), train_df["sirna"].min())

(1107, 0)

In [4]:
# Number of training rows recorded for each plate number.
train_df["plate"].value_counts()

3    9130
2    9129
1    9129
4    9127
Name: plate, dtype: int64

In [12]:
# Plate numbers on which siRNA 4 occurs, ordered by how often each one is seen.
train_df.loc[train_df["sirna"] == 4, "plate"].value_counts().index.values

array([3, 1, 2])

In [26]:
# Unique plate key "<experiment>-<plate>". Built with vectorized string
# concatenation rather than a row-wise apply, which is much slower and does not
# yield a plain column on an empty frame.
train_df["pname"] = train_df["experiment"] + "-" + train_df["plate"].astype(str)

### Mapping each siRNA to its plate group

In [31]:
def sameRna(sirna):
    """Return the siRNAs that share a plate with `sirna` in `train_df`.

    Collects every `pname` on which `sirna` occurs, then returns the distinct
    `sirna` values found on any of those plates (including `sirna` itself).
    The result is a de-duplicated list in no particular order; it is empty
    when `sirna` does not occur in `train_df`.
    """
    rows_with_sirna = train_df.sirna == sirna
    shared_plates = list(set(train_df[rows_with_sirna].pname))
    rows_on_shared_plates = train_df.pname.isin(shared_plates)
    return list(set(train_df[rows_on_shared_plates].sirna))

In [56]:
# One representative siRNA per group (0, 1, 2 and 4); each call collects the
# siRNAs that share plates with that representative.
list1, list2, list3, list4 = (sameRna(seed_sirna) for seed_sirna in (0, 1, 2, 4))

In [57]:
# Size of each of the four siRNA groups.
tuple(len(members) for members in (list1, list2, list3, list4))

(277, 277, 277, 277)

In [61]:
# groups[sirna] holds the id (0-3) of the group that siRNA belongs to.
# Iterate over the member lists directly rather than resolving "list1".."list4"
# through eval(), which is fragile and hides the dependency on those variables.
groups = np.zeros(1108)
for group_id, members in enumerate((list1, list2, list3, list4)):
    for sirna in members:
        # If a siRNA appears in several lists, the last list wins (as before).
        groups[sirna] = group_id

In [62]:
# Display the siRNA -> group id lookup array built above.
groups

array([0., 1., 2., 1., ..., 1., 3., 1., 0.])

In [63]:
# Persist the siRNA -> group id lookup for reuse outside this notebook.
np.save(file="groups.npy", arr=groups)

### Test on our best submission

In [64]:
# Load the submission file whose predictions are used to infer each plate's group.
submission_csv = "submission_ens_4b3_b4_2b5.csv"
subdf = pd.read_csv(submission_csv)

In [69]:
# Attach the test metadata columns to every submitted prediction, matching on id_code.
submission_by_id = subdf.set_index("id_code")
test_meta_by_id = test_df.set_index("id_code")
subdf_ = submission_by_id.join(test_meta_by_id).reset_index()

In [71]:
# Spot-check the joined frame; the fixed seed keeps the preview identical across re-runs.
subdf_.sample(10, random_state=0)

Unnamed: 0,id_code,sirna,experiment,plate,well
5611,HUVEC-18_1_E17,939,HUVEC-18,1,E17
9330,HUVEC-21_2_K22,147,HUVEC-21,2,K22
7611,HUVEC-19_4_I03,798,HUVEC-19,4,I03
6183,HUVEC-18_3_F18,887,HUVEC-18,3,F18
9779,HUVEC-21_4_F12,229,HUVEC-21,4,F12
10637,HUVEC-22_3_G20,679,HUVEC-22,3,G20
17536,RPE-11_4_H02,443,RPE-11,4,H02
15618,RPE-10_1_H14,541,RPE-10,1,H14
4761,HUVEC-17_2_D20,887,HUVEC-17,2,D20
3231,HEPG2-10_4_K14,137,HEPG2-10,4,K14


In [72]:
# Unique plate key "<experiment>-<plate>", built with vectorized string
# concatenation instead of a slow row-wise apply.
subdf_["pname"] = subdf_["experiment"] + "-" + subdf_["plate"].astype(str)

# Group id of each predicted siRNA: a single fancy-indexed lookup into `groups`
# replaces the per-element Python lambda and yields the same values.
subdf_["group_"] = groups[subdf_["sirna"].to_numpy()]

In [76]:
# Spot-check the new pname / group_ columns; the fixed seed keeps the preview identical across re-runs.
subdf_.sample(10, random_state=0)

Unnamed: 0,id_code,sirna,experiment,plate,well,pname,group_
11587,HUVEC-23_2_N07,19,HUVEC-23,2,N07,HUVEC-23-2,2.0
8431,HUVEC-20_3_H10,187,HUVEC-20,3,H10,HUVEC-20-3,0.0
3482,HEPG2-11_1_J05,316,HEPG2-11,1,J05,HEPG2-11-1,0.0
7984,HUVEC-20_1_M23,29,HUVEC-20,1,M23,HUVEC-20-1,3.0
2071,HEPG2-09_4_H21,277,HEPG2-09,4,H21,HEPG2-09-4,1.0
5521,HUVEC-17_4_O08,1036,HUVEC-17,4,O08,HUVEC-17-4,0.0
9050,HUVEC-21_1_K19,925,HUVEC-21,1,K19,HUVEC-21-1,0.0
14457,RPE-09_1_E17,622,RPE-09,1,E17,RPE-09-1,1.0
9748,HUVEC-21_4_D22,90,HUVEC-21,4,D22,HUVEC-21-4,3.0
17314,RPE-11_3_J20,674,RPE-11,3,J20,RPE-11-3,3.0


In [101]:
# Most frequently predicted group for every test plate (majority vote).
# The previous form read the "index" column of value_counts().reset_index();
# pandas 2.x no longer produces a column with that name, so it raised KeyError.
# Taking the first index label of value_counts() gives the same winner on any
# pandas version. The result is renamed to "index" so the frame keeps its
# ["pname", "index"] columns.
group_test = (
    subdf_.groupby("pname")["group_"]
    .agg(lambda plate_groups: plate_groups.value_counts().index[0])
    .rename("index")
    .reset_index()
)

In [103]:
# Lookup table: plate key -> majority-voted group id for that plate.
pred_group = {
    plate_name: group_id
    for plate_name, group_id in zip(group_test["pname"], group_test["index"])
}

### Mask for each group

In [104]:
# mask[g, s] is 1.0 when siRNA s belongs to group g and 0.0 otherwise.
mask = np.zeros((4, 1108), np.float32)

for group_id, mask_row in enumerate(mask):
    # mask_row is a view into `mask`, so this writes the row in place.
    mask_row[:] = groups == group_id

In [105]:
# Display the (4, 1108) group-membership mask built above.
mask

array([[1., 0., 0., 0., ..., 0., 0., 0., 1.],
       [0., 1., 0., 1., ..., 1., 0., 1., 0.],
       [0., 0., 1., 0., ..., 0., 0., 0., 0.],
       [0., 0., 0., 0., ..., 0., 1., 0., 0.]], dtype=float32)

In [143]:
# Persist the per-group siRNA mask for reuse outside this notebook.
np.save(file="mask.npy", arr=mask)

In [120]:
# test_df never received a `pname` column (only train_df and subdf_ did), so
# build the "<experiment>-<plate>" key here; without it this cell fails on a
# fresh kernel with AttributeError.
test_df["pname"] = test_df["experiment"] + "-" + test_df["plate"].astype(str)

# Predicted group id for every test row; an unknown plate key still raises KeyError.
test_df["group_"] = test_df["pname"].map(lambda plate_name: int(pred_group[plate_name]))

### Mask for each test picture

In [134]:
# Give every test row the mask row of its predicted group.
test_df["mask"] = test_df["group_"].apply(lambda group_id: mask[group_id, :])

In [139]:
# Stack the per-row masks into one 2-D array with a row per test image.
per_row_masks = test_df["mask"]
pred_mask = np.stack(per_row_masks)

In [141]:
# Persist the per-test-image mask so it can be applied to model predictions later.
np.save(file="pred_mask.npy", arr=pred_mask)