In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load in 

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the "../input/" directory.
# For example, running this (by clicking run or pressing Shift+Enter) will list the files in the input directory

import os
#print(os.listdir("../input"))

# Any results you write to the current directory are saved as output.

### Aim of the Competition


Analyse protein patterns in biomedical cell images to accelerate the understanding of human cell behaviour and of diseases (such as breast cancer, prostate cancer, colon cancer, diabetes, autoimmune diseases, ovarian cancer and renal failure).

### Company Information 

#### Human Protein Atlas

![](https://www.ebi.ac.uk/gxa/resources/images/experiment-list-latest/human_protein_atlas.png)

The Human Protein Atlas (HPA) is a Swedish-based program started in 2003 with the aim of mapping all the human proteins in cells, tissues and organs using integration of various omics technologies, including antibody-based imaging, mass spectrometry-based proteomics, transcriptomics and systems biology. All the data in the knowledge resource is open access to allow scientists both in academia and industry to freely access the data for exploration of the human proteome. [More Information](https://en.wikipedia.org/wiki/Human_Protein_Atlas)

The program is organised around three major projects:
* [Tissue Atlas](https://www.proteinatlas.org/tissue)
* [Cell Atlas](https://www.proteinatlas.org/cell)
* [Pathology Atlas](https://www.proteinatlas.org/pathology)



### Definition of Protein Structure

Protein structure is the three-dimensional arrangement of atoms in an amino acid-chain molecule. Proteins are polymers – specifically polypeptides – formed from sequences of amino acids, the monomers of the polymer. A single amino acid monomer may also be called a residue, indicating a repeating unit of a polymer. [Reference](https://en.wikipedia.org/wiki/Protein_structure)

![](http://paulbrinson.weebly.com/uploads/5/9/8/1/59812627/1628628_orig.gif)

### Cell Structure 

What do all cells have in common?

Same chemical makeup

* Proteins (made up of amino acids; many are enzymes)
* Nucleic acids (DNA, RNA)
* Lipids (fatty or oily molecules)
* Carbohydrates (sugars and starches)

![](https://s3.studylib.net/store/data/008655064_1-388dce9b3c81ae4c6ed884d95c10f722-260x520.png)

![](https://biologydictionary.net/wp-content/uploads/2017/03/Cell-membrane-diagram.jpg)
### Important Functions of Protein in Your Body

* Growth and Maintenance
* Causes Biochemical Reactions
	* Digestion
	* Energy production
	* Blood clotting
	* Muscle contraction
* Acts as a Messenger
* Provides Structure
* Maintains Proper pH
	* The balance between acids and bases is measured using the pH scale. It ranges from 0 to 14, with 0 being the most acidic, 7 neutral and 14 the most alkaline.
* Balances Fluids
* Bolsters Immune Health
* Transports and Stores Nutrients
* Provides Energy

### Protein Interactions with Disease


Proteins do not function in isolation; it is their interactions with one another and also with other molecules (e.g. DNA, RNA) that mediate metabolic and signaling pathways, cellular processes, and organismal systems. Due to their central role in biological function, protein interactions also control the mechanisms leading to healthy and diseased states in organisms. Diseases are often caused by mutations affecting the binding interface or leading to biochemically dysfunctional allosteric changes in proteins. Therefore, protein interaction networks can elucidate the molecular basis of disease, which in turn can inform methods for prevention, diagnosis, and treatment. The book chapter linked below describes the computational approaches used to predict and map networks of protein interactions and briefly reviews the experimental methods used to detect protein interactions. It also describes the application of protein interaction networks as a translational approach to the study of human disease and evaluates the challenges faced by these approaches. [More Information](https://www.ncbi.nlm.nih.gov/pmc/articles/PMC3531279/)
![](https://slideplayer.com/slide/5698688/18/images/32/The+role+of+protein+interaction+in+disease.jpg)

In [None]:
import os
from collections import Counter

import cv2
import matplotlib
import matplotlib.pyplot as plt
import seaborn as sns
from matplotlib.colors import LinearSegmentedColormap
from PIL import Image
# Show what the competition's input directory contains before loading anything.
input_entries = os.listdir("../input")
print(input_entries)

In [None]:
# Load the training table from the competition input directory and
# preview its first rows as plain text.
train_csv_path = "../input/train.csv"
train = pd.read_csv(train_csv_path)
print(train.head(n=5))

# Lookup table from integer target id to subcellular location name.
# The position of each name in the list below is its target id (0-27).
subcell_locs = dict(enumerate([
    "Nucleoplasm",
    "Nuclear membrane",
    "Nucleoli",
    "Nucleoli fibrillar center",
    "Nuclear speckles",
    "Nuclear bodies",
    "Endoplasmic reticulum",
    "Golgi apparatus",
    "Peroxisomes",
    "Endosomes",
    "Lysosomes",
    "Intermediate filaments",
    "Actin filaments",
    "Focal adhesion sites",
    "Microtubules",
    "Microtubule ends",
    "Cytokinetic bridge",
    "Mitotic spindle",
    "Microtubule organizing center",
    "Centrosome",
    "Lipid droplets",
    "Plasma membrane",
    "Cell junctions",
    "Mitochondria",
    "Aggresome",
    "Cytosol",
    "Cytoplasmic bodies",
    "Rods & rings",
]))

### Important Information

You are predicting protein organelle localization labels for each sample. There are in total 28 different labels present in the dataset. The dataset is acquired in a highly standardized way using one imaging modality (confocal microscopy). However, the dataset comprises 27 different cell types of highly different morphology, which affect the protein patterns of the different organelles. All image samples are represented by four filters (stored as individual files):

* the protein of interest (green)
* nucleus (blue)
* microtubules (red)
* endoplasmic reticulum (yellow)

The green filter should hence be used to predict the label, and the other filters are used as references.

In [None]:
# Print the labels of one example image, then display its four channels,
# each drawn with a black-to-colour map matching the filter it came from.
example = train.loc[1]  # the row whose index label is 1
print("The image with ID == 1 has the following labels:", example["Target"])
print("These labels correspond to:")
for location in example["Target"].split():
    print("-", subcell_locs[int(location)])

#reset seaborn style
sns.reset_orig()

#get image id
im_id = example["Id"]


def _channel_cdict(red=False, green=False, blue=False):
    """Return segment data for a colormap that fades from black to a colour.

    Each enabled RGB component ramps linearly from 0 to full intensity at
    0.75 of the input range and stays saturated above that; every disabled
    component stays at 0 over the whole range.
    """
    ramp = ((0.0,  0.0, 0.0),
            (0.75, 1.0, 1.0),
            (1.0,  1.0, 1.0))
    off = ((0.0,  0.0, 0.0),
           (1.0,  0.0, 0.0))
    return {'red':   ramp if red else off,
            'green': ramp if green else off,
            'blue':  ramp if blue else off}


def _register_cmap(cmap):
    """Register `cmap` under its own name on both old and new matplotlib."""
    registry = getattr(matplotlib, "colormaps", None)
    if registry is not None:
        # force=True keeps the cell re-runnable: registering the same
        # user-defined name a second time would otherwise raise.
        registry.register(cmap, force=True)
    else:
        # Older matplotlib has no registry object; use the legacy function.
        plt.register_cmap(cmap=cmap)


def _read_channel(image_id, colour):
    """Load one channel file of a training image as a greyscale array.

    Raises:
        FileNotFoundError: if OpenCV could not read the file (cv2.imread
            reports failure by returning None instead of raising).
    """
    path = '../input/train/{}_{}.png'.format(image_id, colour)
    channel = cv2.imread(path, cv2.IMREAD_GRAYSCALE)
    if channel is None:
        raise FileNotFoundError("could not read image file: {}".format(path))
    return channel


#create custom color maps
cdict1 = _channel_cdict(green=True)             # black -> green
cdict2 = _channel_cdict(red=True)               # black -> red
cdict3 = _channel_cdict(blue=True)              # black -> blue
cdict4 = _channel_cdict(red=True, green=True)   # black -> yellow

# Build colormap objects directly instead of going through the
# plt.register_cmap(name=..., data=...) call, which newer matplotlib removed.
greens_cmap = LinearSegmentedColormap('greens', cdict1)
reds_cmap = LinearSegmentedColormap('reds', cdict2)
blues_cmap = LinearSegmentedColormap('blues', cdict3)
yellows_cmap = LinearSegmentedColormap('yellows', cdict4)

# Still register the maps by name so any code that refers to them as
# cmap="greens" / "reds" / "blues" / "yellows" keeps working.
for channel_cmap in (greens_cmap, reds_cmap, blues_cmap, yellows_cmap):
    _register_cmap(channel_cmap)

#get each image channel as a greyscale image
green = _read_channel(im_id, 'green')
red = _read_channel(im_id, 'red')
blue = _read_channel(im_id, 'blue')
yellow = _read_channel(im_id, 'yellow')

#display each channel separately
fig, ax = plt.subplots(nrows=2, ncols=2, figsize=(15, 15))
panels = [
    (ax[0, 0], green, greens_cmap, "Protein of interest"),
    (ax[0, 1], red, reds_cmap, "Microtubules"),
    (ax[1, 0], blue, blues_cmap, "Nucleus"),
    (ax[1, 1], yellow, yellows_cmap, "Endoplasmic reticulum"),
]
for panel_ax, channel, channel_cmap, title in panels:
    panel_ax.imshow(channel, cmap=channel_cmap)
    panel_ax.set_title(title, fontsize=18)
    # Pixel coordinates carry no information here, so hide ticks and labels.
    panel_ax.tick_params(left=False, bottom=False,
                         labelleft=False, labelbottom=False)
plt.show()

In [None]:
# Count how often each subcellular location occurs across all training
# images and show the counts as a bar chart (most frequent first).

# Each Target value is a space-separated string of label ids.
labels_num = [value.split() for value in train['Target']]
# Flatten to one integer id per (image, label) occurrence.
labels_num_flat = [int(item) for sublist in labels_num for item in sublist]
# Translate the ids into human-readable location names.
labels = [subcell_locs[label_id] for label_id in labels_num_flat]

fig, ax = plt.subplots(figsize=(15, 5))
# `kind` has to be a keyword argument: recent pandas versions reject
# positional arguments to Series.plot. Passing `ax` draws on the figure
# created above rather than on whichever axes happens to be current.
pd.Series(labels).value_counts().plot(kind='bar', ax=ax, fontsize=14)
ax.set_title("Label frequency in the training set", fontsize=16)
ax.set_xlabel("Subcellular location", fontsize=14)
ax.set_ylabel("Number of occurrences", fontsize=14)
plt.show()