## Human Protein Atlas - Single Cell Classification
### Finding individual human cell differences in microscope images
![image](https://storage.googleapis.com/kaggle-competitions/kaggle/23823/logos/header.png?t=2020-11-24-14-18-10)


In [None]:
import os
import numpy as np
import pandas as pd

In [None]:
# Root directory of the competition data; every file path below is built by
# appending to this string, so it must end with a slash.
ROOT = "../input/hpa-single-cell-image-classification/"
os.listdir(ROOT)

In [None]:
# Training metadata table; its `ID` and `Label` columns are used throughout the notebook.
train = pd.read_csv(f"{ROOT}train.csv")
train.head()

In [None]:
# Sample submission file shipped with the competition data.
sub = pd.read_csv(f"{ROOT}sample_submission.csv")
sub.head()

In [None]:
train.shape

In [None]:
len(os.listdir(ROOT+"train/"))

In [None]:
# Sanity checks on the training folder:
#   1. it holds four files for every row of train.csv (file count == 4 * row count);
#   2. the ID prefixes of the files (text before the first "_") are exactly the IDs in train.csv.
train_files = os.listdir(ROOT + "train/")
assert len(train_files) == 4 * train.shape[0]
ids_on_disk = {file_name.split("_")[0] for file_name in train_files}
assert ids_on_disk == set(train.ID.unique())

In [None]:
sub.shape

In [None]:
len(os.listdir(ROOT+"test/"))

In [None]:
os.listdir(ROOT+"train/")[:5]

In [None]:
import cv2
import matplotlib.pyplot as plt

In [None]:
# Read the four channel images of one training example as stored on disk
# (IMREAD_UNCHANGED: no conversion applied by OpenCV).
sample_id = "5e3a2e6a-bb9c-11e8-b2b9-ac1f6b6435d0"
red, yellow, blue, green = (
    cv2.imread(ROOT + "train/" + sample_id + "_" + channel + ".png", cv2.IMREAD_UNCHANGED)
    for channel in ("red", "yellow", "blue", "green")
)
red.shape, yellow.shape, blue.shape, green.shape

**We have 4 images per training example, one per channel (red, yellow, blue and green). The most important one is the green one.
The others can be used if needed.**

In [None]:
# Show the green channel on its own; the title identifies which channel this figure is.
fig, ax = plt.subplots()
ax.imshow(green, cmap='gray')
ax.set_title("Green channel")
plt.show()

In [None]:
# Show the yellow channel on its own; the title identifies which channel this figure is.
fig, ax = plt.subplots()
ax.imshow(yellow, cmap='gray')
ax.set_title("Yellow channel")
plt.show()

In [None]:
# Show the blue channel on its own; the title identifies which channel this figure is.
fig, ax = plt.subplots()
ax.imshow(blue, cmap='gray')
ax.set_title("Blue channel")
plt.show()

In [None]:
# Show the red channel on its own; the title identifies which channel this figure is.
fig, ax = plt.subplots()
ax.imshow(red, cmap='gray')
ax.set_title("Red channel")
plt.show()

In [None]:
# Composite view: stack the red, green and blue channel images into one 3-channel image.
# The channels are passed in R, G, B order, which is the order plt.imshow interprets them in.
# The yellow channel is not part of this composite.
img = cv2.merge((red, green, blue))
fig, ax = plt.subplots()
ax.imshow(img)
ax.set_title("Red / green / blue composite")
plt.show()

### Labels

In [None]:
train.head()

In [None]:
# Turn the "|"-separated label string into a list of label ids (the ids stay strings).
# The isinstance guard keeps the cell idempotent: on a re-run the values are already
# lists, and calling .split on them would raise AttributeError.
train["Label"] = train["Label"].apply(
    lambda value: value.split("|") if isinstance(value, str) else value
)
train.head()

In [None]:
# There are 19 labels in total: ids 0-17 plus id 18 ("Negative").
# Maps the integer label id to its human-readable name.

labels = {
0: "Nucleoplasm",
1: "Nuclear membrane",
2: "Nucleoli",
3: "Nucleoli fibrillar center",
4: "Nuclear speckles",
5: "Nuclear bodies",
6: "Endoplasmic reticulum",
7: "Golgi apparatus",
8: "Intermediate filaments",
9: "Actin filaments",
10: "Microtubules",
11: "Mitotic spindle",
12: "Centrosome",
13: "Plasma membrane",
14: "Mitochondria",
15: "Aggresome",
16: "Cytosol",
17: "Vesicles and punctate cytosolic patterns",
18: "Negative",
}

In [None]:
import itertools
import seaborn as sns

In [None]:
# Bar chart of how often each label occurs across all training images.
# train.Label holds lists of label-id strings, so flatten them and map each id to its name.
fig, ax = plt.subplots(1, 1, figsize=(14, 7))
label_names = [labels[int(i)] for i in itertools.chain.from_iterable(train.Label)]
# Pass the data by keyword (`x=`) and the target axes as `ax=`: seaborn's keyword for the
# axes is `ax`, and a positional first argument is interpreted as `data` in newer versions.
sns.countplot(x=label_names, ax=ax)
ax.set_title("Distribution of labels in training data")
ax.tick_params(axis="x", labelrotation=90)
plt.show()

### Co-occurrence Matrix

In [None]:
# One-hot encode the label lists. get_dummies produces one column per (list position, label id)
# pair, all named after the label id, so the same column name can appear several times.
u = pd.get_dummies(pd.DataFrame(train.Label.tolist()), prefix='', prefix_sep='')
# Collapse the duplicate column names into one column per label id. Grouping on the
# transposed frame avoids the deprecated `groupby(axis=1)`.
u = u.T.groupby(level=0).sum().T

# Co-occurrence counts: entry (a, b) is the number of images carrying both label a and label b.
# Computed on a fresh ndarray so the diagonal can be zeroed safely; writing through
# `DataFrame.values` is not guaranteed to modify the frame.
one_hot = u.to_numpy()
co_counts = one_hot.T @ one_hot
np.fill_diagonal(co_counts, 0)  # a label trivially co-occurs with itself; drop those counts
v = pd.DataFrame(co_counts, index=u.columns, columns=u.columns)

# Put rows and columns in numeric label order. All 19 ids (0-18) are kept; the previous
# range(1, 19) silently dropped class 0.
label_order = [str(i) for i in range(19)]
v = v.reindex(index=label_order, columns=label_order)
v

In [None]:
co_mat = v
fig, ax = plt.subplots(1, 1, figsize=(10, 8))
# Divide every column by its total and transpose, so each heatmap row sums to 1:
# row = a label, cell = share of that label's co-occurrences that involve the column label.
sns.heatmap((v / np.sum(v, axis=0)).T, cbar=True, annot=False, ax=ax)
ax.set_title("Label co-occurrence (each row normalised to sum to 1)")
ax.set_ylabel("Label id")
ax.set_xlabel("Co-occurring label id")
plt.show()

In [None]:
# Normalised histogram of the sample green image: fraction of pixels at each pixel value.
flat_green = green.ravel()
counts = np.bincount(flat_green)
counts = counts / counts.sum()

In [None]:
train.head()

**Given that we don't have pixel-level labels for each class, let's see how   
we can guess the presence or absence of a class using just the raw image pixel values.**

In [None]:
import tqdm.auto as tqdm

In [None]:
def plotdist(k, color, random_state=None):
    """Plot the average pixel-value histogram per class for one image channel.

    Draws `k` random rows from the global `train` frame, reads each row's `color`
    channel image as 8-bit grayscale and turns it into a normalised 256-bin histogram.
    For every class id 0-18, the histograms of the sampled images whose label list
    contains that id are averaged and drawn as one line. Classes with no sampled image
    are skipped. Pixel value 0 is left out of the plot.

    Expects `train.Label` to hold lists of label-id strings; on the raw "|"-joined
    strings the `in` test below would do substring matching instead.

    Parameters
    ----------
    k : int
        Number of training images to sample.
    color : str
        Channel suffix used in the file name (the notebook calls this with
        'green', 'red', 'blue' and 'yellow').
    random_state : int, optional
        Seed forwarded to `DataFrame.sample` so the same images can be drawn again.
        The default (None) keeps the original unseeded behaviour.

    Raises
    ------
    FileNotFoundError
        If an image file cannot be read.
    """
    sample = train.sample(k, random_state=random_state)

    count_list = []
    for image_id in tqdm.tqdm(sample.ID, leave=False):
        path = ROOT+"train/"+image_id+"_"+color+".png"
        img = cv2.imread(path, cv2.IMREAD_GRAYSCALE)
        # cv2.imread returns None instead of raising when the file cannot be read.
        if img is None:
            raise FileNotFoundError("Could not read image: " + path)
        counts = np.bincount(img.reshape(-1), minlength=256)
        count_list.append(counts / np.sum(counts))
    # One row per sampled image; the reshape keeps the 2-D shape even when k == 0.
    histograms = np.array(count_list, dtype=float).reshape(-1, 256)

    fig, ax = plt.subplots(1, 1, figsize=(12, 7))
    pixel_values = np.arange(1, 256)
    for class_id in range(19):
        has_class = sample.Label.apply(lambda x: str(class_id) in x).to_numpy(dtype=bool)
        if not has_class.any():
            # No sampled image carries this class, so there is nothing to average.
            continue
        stats = histograms[has_class].mean(axis=0)
        # Not plotting 0 as it is an outlier.
        # tab20 gives each of the 19 classes its own colour (the default cycle repeats after 10).
        ax.plot(pixel_values, stats[1:], label="class "+str(class_id),
                color=plt.cm.tab20(class_id))
    ax.legend()
    ax.set_xlabel("Pixel value")
    ax.set_title(color+" Pixel Value Distribution for images containing different classes")
    plt.show()

In [None]:
plotdist(500, 'green')

In [None]:
plotdist(500, 'red')

In [None]:
plotdist(500, 'blue')

In [None]:
plotdist(500, 'yellow')

In [None]:
def plotimg(axes, ID):
    """Draw the red/green/blue composite of one training image on the given axes.

    Reads the `_red`, `_green` and `_blue` PNG files of `ID` from the training folder
    (unchanged, as stored on disk), merges them in that order into one 3-channel image
    and shows it with `axes.imshow`. The yellow channel is not used.

    Parameters
    ----------
    axes : matplotlib axes
        Axes object to draw on.
    ID : str
        Image ID, i.e. the file-name prefix before the channel suffix.

    Raises
    ------
    FileNotFoundError
        If one of the three channel files cannot be read.
    """
    channel_images = []
    for channel in ("red", "green", "blue"):
        path = ROOT+"train/"+ID+"_"+channel+".png"
        image = cv2.imread(path, cv2.IMREAD_UNCHANGED)
        # cv2.imread returns None instead of raising when the file cannot be read.
        if image is None:
            raise FileNotFoundError("Could not read image: " + path)
        channel_images.append(image)
    img = cv2.merge(tuple(channel_images))
    axes.imshow(img)

In [None]:
# For every class, show up to four randomly chosen training images that carry that label.
for k in range(0, 19):
    class_rows = train[train.Label.apply(lambda x: str(k) in x)]
    if class_rows.empty:
        # No image has this label, so there is nothing to show.
        continue
    # Cap the sample size: DataFrame.sample raises if asked for more rows than exist.
    IDS = class_rows.sample(min(4, len(class_rows))).ID.tolist()
    fig, axes = plt.subplots(1, 4, figsize=(16, 4))
    for ID, ax in zip(IDS, axes):
        plotimg(ax, ID)
    # Hide panels left empty when the class has fewer than four images.
    for ax in axes[len(IDS):]:
        ax.axis("off")
    fig.suptitle(labels[k] + " samples")
    plt.show()
    # Release the figure; 19 open figures would otherwise accumulate in memory.
    plt.close(fig)

The notebook is still a work in progress, but
### Do upvote if it helped :)