In [None]:
#matplotlib template from:  https://stackoverflow.com/questions/46615554/how-to-display-multiple-images-in-one-figure-correctly/46616645
#mask segmentation prediction string: https://www.kaggle.com/its7171/mmdetection-for-segmentation-inference

## Human Protein Atlas: Visualize Dataset

# Contents

1. [Load and view dataset](#1)
1. [Read and show image data](#2)
1. [Visualization information in HPA-dataset](#3)

<a id="1"></a> <br>
# <div class="alert alert-block alert-success">Load and view dataset</div>

## Import Libraries

In [None]:
import collections
import json
import os
import uuid
from tqdm import tqdm

import cv2
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from PIL import Image, ImageDraw, ImageFilter
import tifffile as tiff 
import seaborn as sns

## Load dataset

### Image files

In [None]:
# Peek at the first 12 entries of the training image directory.
# Uses the IPython `%ls` magic and pipes its output through the shell's `head`.
%ls ../input/hpa-single-cell-image-classification/train | head -12

In [None]:
# Folder paths (each ending in "/") for the train and test images,
# followed by the file names found inside each folder.
traindir, testdir = (
    "../input/hpa-single-cell-image-classification/" + split + "/"
    for split in ("train", "test")
)
train, test = (os.listdir(folder) for folder in (traindir, testdir))

In [None]:
#check file amounts and if all images have same file extension
# The extension is tested with endswith() rather than by searching for the
# substring 'png' anywhere in the name, so a file such as 'x.png.bak' is not
# miscounted as a PNG. The same report is printed for both splits.
for split_name, filenames in (("Train", train), ("Test", test)):
    # n_png keeps its original name and type (list of matching file names).
    n_png = [filename for filename in filenames if filename.endswith('.png')]
    print(f"{split_name}:")
    print(f"   PNG files :  {len(n_png)}")
    print(f"   total files :  {len(filenames)}")

In [None]:
#check if every imageID has one image per color or if some are missing
def count_channel_files(filenames, channels=("blue", "green", "red", "yellow")):
    """Count how many file names belong to each color channel.

    A file is attributed to a channel when its name ends with
    ``_<channel>.png``. Matching the suffix (instead of looking for the color
    word anywhere in the name) prevents an unrelated substring from being
    counted.

    Parameters
    ----------
    filenames : iterable of str
        File names to inspect (not full paths are required; only the ending
        of each string is examined).
    channels : sequence of str
        Channel names to count, in the order they should be reported.

    Returns
    -------
    dict
        Maps each channel name to the number of matching files, in the order
        given by ``channels``.
    """
    filenames = list(filenames)  # allow one-shot iterables to be scanned per channel
    return {
        channel: sum(name.endswith(f"_{channel}.png") for name in filenames)
        for channel in channels
    }


# Equal counts across the four channels indicate that no channel file is missing.
for split_name, filenames in (("Train", train), ("Test", test)):
    channel_counts = count_channel_files(filenames)
    print(f"{split_name}:")
    for channel, count in channel_counts.items():
        print(f"   {channel} :  {count}")

# Keep the names the original cell left behind (counts for the test split),
# in case a later cell reads them.
n_blue, n_green, n_red, n_yellow = channel_counts.values()

### CSV file

In [None]:
# Load the training labels and turn each "|"-separated label string
# into a list of integer class ids.
def _parse_label_string(label_string):
    """Split a 'a|b|c' label string and return its parts as a list of ints."""
    return [int(class_id) for class_id in label_string.split("|")]


image_dataframe = pd.read_csv("../input/hpa-single-cell-image-classification/train.csv")
image_dataframe["Label"] = image_dataframe["Label"].map(_parse_label_string)

In [None]:
# Preview the first rows of the label table; as the last expression of the
# cell it is rendered by the notebook.
image_dataframe.head()

In [None]:
# Lookup from integer class id (list position, starting at 0) to the
# human-readable name of that class.
class_names = dict(enumerate([
    "Nucleoplasm",
    "Nuclear membrane",
    "Nucleoli",
    "Nucleoli fibrillar center",
    "Nuclear speckles",
    "Nuclear bodies",
    "Endoplasmic reticulum",
    "Golgi apparatus",
    "Intermediate filaments",
    "Actin filaments",
    "Microtubules",
    "Mitotic spindle",
    "Centrosome",
    "Plasma membrane",
    "Mitochondria",
    "Aggresome",
    "Cytosol",
    "Vesicles and punctate cytosolic patterns",
    "Negative",
]))

In [None]:
# One row per label occurrence: every class id of every image is translated
# to its class name and collected in a single "name" column.
flat_list = pd.DataFrame({
    "name": [
        class_names[class_id]
        for labels in image_dataframe["Label"]
        for class_id in labels
    ]
})

In [None]:
# Horizontal bar plot of how often each class name occurs.
# The bars follow the order returned by value_counts().
name_order = flat_list['name'].value_counts().index
sns.countplot(data=flat_list, y='name', order=name_order)
plt.show()  # render the figure

<a id="2"></a> <br>
# <div class="alert alert-block alert-warning">Read and show image data</div>

We are given microscopy images of human cells, tissues, and organs. All images are in PNG format, and each image ID comes as four separate single-channel files (blue, green, red, and yellow).

In [None]:
#visualize the four color maps of one of the images
# Sorting places the four channel files of one image ID next to each other in
# alphabetical channel order (blue, green, red, yellow), which is the order the
# `titles` list below relies on.
# NOTE(review): assumes file names follow `<id>_<color>.png` - confirm.
train.sort()
for i in range(4):
    print(train[i])

columns = 2
rows = 2
fig = plt.figure(figsize=(15, 15))

titles=["Nuclei ['..._blue.png']",
        "Proteins of interest ['..._green.png']",
        "Microtubuli ['..._red.png']",
        "Endoplasmatic Reticulum ['..._yellow.png']"]

# ax enables access to manipulate each of subplots
ax = []
for i in range(columns*rows):
    img = plt.imread(traindir+train[i])
    # create subplot and append to ax
    ax.append( fig.add_subplot(rows, columns, i+1) )
    ax[i].set_title(titles[i])  # set title
    # draw on the subplot itself instead of relying on pyplot's current axes
    ax[i].imshow(img)

plt.show()  # render the plot

In [None]:
#check and visualize the image dimensions of the images

# PIL's Image.size is (width, height), so the values line up with the column
# names below. (An array's shape is (rows, columns) = (height, width); labelling
# shape[0] as 'width' swaps the two axes of the plot.)
# Image.open only parses the file header, so the pixel data of every image
# does not have to be decoded just to learn its dimensions.
shapes=[]
for file in tqdm(train):
    with Image.open(traindir+file) as im:
        width, height = im.size
    shapes.append([width, height])

shapesdf = pd.DataFrame(shapes, columns =['width', 'height'])
sns.jointplot(x=shapesdf['width'], y=shapesdf['height'])