In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load in 

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the "../input/" directory.
# For example, running this (by clicking run or pressing Shift+Enter) will list the files in the input directory

import os
#print(os.listdir("../input"))

# Any results you write to the current directory are saved as output.

### Aim of the Competition


Analyse the proteins in cells from biomedical images and find patterns that accelerate the understanding of human cell behaviour and of diseases [such as breast cancer, prostate cancer, colon cancer, diabetes, autoimmune diseases, ovarian cancer and renal failure].

### Company Information 

#### Human Protein Atlas

![](https://www.ebi.ac.uk/gxa/resources/images/experiment-list-latest/human_protein_atlas.png)

The Human Protein Atlas (HPA) is a Swedish-based program started in 2003 with the aim to map all the human proteins in cells, tissues and organs using integration of various omics technologies, including antibody-based imaging, mass spectrometry-based proteomics, transcriptomics and systems biology. All the data in the knowledge resource is open access to allow scientists both in academia and industry to freely access the data for exploration of the human proteome. [ More Information ](https://en.wikipedia.org/wiki/Human_Protein_Atlas)

The company's major work is in three projects:
* [Tissue Atlas ](https://www.proteinatlas.org/tissue)
* [Cell Atlas](https://www.proteinatlas.org/cell)
* [Pathology Atlas](https://www.proteinatlas.org/pathology)



### Definition of Protein Structure

Protein structure is the three-dimensional arrangement of atoms in an amino acid-chain molecule. Proteins are polymers – specifically polypeptides – formed from sequences of amino acids, the monomers of the polymer. A single amino acid monomer may also be called a residue indicating a repeating unit of a polymer [Reference](https://en.wikipedia.org/wiki/Protein_structure)

![](http://paulbrinson.weebly.com/uploads/5/9/8/1/59812627/1628628_orig.gif)

### Cell Structure 

What do all cells have in common?

Same chemical makeup

* Proteins (made up of amino acids; many are enzymes)
* Nucleic acids (DNA, RNA)
* Lipids (fatty or oily molecules)
* Carbohydrates (sugars and starches)

![](https://s3.studylib.net/store/data/008655064_1-388dce9b3c81ae4c6ed884d95c10f722-260x520.png)

![](https://biologydictionary.net/wp-content/uploads/2017/03/Cell-membrane-diagram.jpg)
### Important Functions of Protein in Your Body

* Growth and Maintenance
* Causes Biochemical Reactions
	* Digestion
	* Energy production
	* Blood clotting
	* Muscle contraction
* Acts as a Messenger
* Provides Structure
* Maintains Proper pH
	* The balance between acids and bases is measured using the pH scale. It ranges from 0 to 14, with 0 being the most acidic, 7 neutral and 14 the most alkaline.
* Balances Fluids
* Bolsters Immune Health
* Transports and Stores Nutrients
* Provides Energy

### Protein Interactions with Disease


Proteins do not function in isolation; it is their interactions with one another and also with other molecules (e.g. DNA, RNA) that mediate metabolic and signaling pathways, cellular processes, and organismal systems. Due to their central role in biological function, protein interactions also control the mechanisms leading to healthy and diseased states in organisms. Diseases are often caused by mutations affecting the binding interface or leading to biochemically dysfunctional allosteric changes in proteins. Therefore, protein interaction networks can elucidate the molecular basis of disease, which in turn can inform methods for prevention, diagnosis, and treatment. In this chapter, we will describe the computational approaches to predict and map networks of protein interactions and briefly review the experimental methods to detect protein interactions. We will describe the application of protein interaction networks as a translational approach to the study of human disease and evaluate the challenges faced by these approaches. [More Information](https://www.ncbi.nlm.nih.gov/pmc/articles/PMC3531279/)
![](https://slideplayer.com/slide/5698688/18/images/32/The+role+of+protein+interaction+in+disease.jpg)

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns
import cv2
from PIL import Image
from collections import Counter

import os
# Show the entries of the competition input directory.
print(os.listdir("../input"))

In [None]:
# Load the training metadata and preview its first rows.
train = pd.read_csv("../input/train.csv")
print(train.head())

# Target id -> sub-cellular location name.
# The position of a name in the list below is its integer target id.
subcell_locs = dict(enumerate([
    "Nucleoplasm",                    # 0
    "Nuclear membrane",               # 1
    "Nucleoli",                       # 2
    "Nucleoli fibrillar center",      # 3
    "Nuclear speckles",               # 4
    "Nuclear bodies",                 # 5
    "Endoplasmic reticulum",          # 6
    "Golgi apparatus",                # 7
    "Peroxisomes",                    # 8
    "Endosomes",                      # 9
    "Lysosomes",                      # 10
    "Intermediate filaments",         # 11
    "Actin filaments",                # 12
    "Focal adhesion sites",           # 13
    "Microtubules",                   # 14
    "Microtubule ends",               # 15
    "Cytokinetic bridge",             # 16
    "Mitotic spindle",                # 17
    "Microtubule organizing center",  # 18
    "Centrosome",                     # 19
    "Lipid droplets",                 # 20
    "Plasma membrane",                # 21
    "Cell junctions",                 # 22
    "Mitochondria",                   # 23
    "Aggresome",                      # 24
    "Cytosol",                        # 25
    "Cytoplasmic bodies",             # 26
    "Rods & rings",                   # 27
]))

### Important Information 

You are predicting protein organelle localization labels for each sample. There are in total 28 different labels present in the dataset. The dataset is acquired in a highly standardized way using one imaging modality (confocal microscopy). However, the dataset comprises 27 different cell types of highly different morphology, which affect the protein patterns of the different organelles. All image samples are represented by four filters (stored as individual files).
* the protein of interest (green) 
* nucleus (blue), 
* microtubules (red), 
* endoplasmic reticulum (yellow). 

The green filter should hence be used to predict the label, and the other filters are used as references.

In [None]:
# Decode the targets of the training row with index label 1.
sample_target = train.loc[1, "Target"]
print("The image with ID == 1 has the following labels:", sample_target)
print("These labels correspond to:")
for code in sample_target.split():
    print("-", subcell_locs[int(code)])

# Drop any seaborn styling so the images are drawn with matplotlib defaults.
sns.reset_orig()

# Id of the sample whose channel files are displayed below.
im_id = train.loc[1, "Id"]

# Segment data shared by the custom colour maps:
#   _flat - the colour component stays at 0 over the whole range
#   _ramp - the component rises from 0 to 1 at 0.75 and then stays at 1
_flat = ((0.0,  0.0, 0.0),
         (1.0,  0.0, 0.0))
_ramp = ((0.0,  0.0, 0.0),
         (0.75, 1.0, 1.0),
         (1.0,  1.0, 1.0))

cdict1 = {'red': _flat, 'green': _ramp, 'blue': _flat}   # black -> green
cdict2 = {'red': _ramp, 'green': _flat, 'blue': _flat}   # black -> red
cdict3 = {'red': _flat, 'green': _flat, 'blue': _ramp}   # black -> blue
cdict4 = {'red': _ramp, 'green': _ramp, 'blue': _flat}   # black -> yellow

# Register the maps by name so later cells can refer to them as strings.
for cmap_name, segment_data in (('greens', cdict1),
                                ('reds', cdict2),
                                ('blues', cdict3),
                                ('yellows', cdict4)):
    plt.register_cmap(name=cmap_name, data=segment_data)

# Read each channel file as a greyscale image (imread flag 0).
channel_file = '../input/train/{}_{}.png'
green = cv2.imread(channel_file.format(im_id, 'green'), 0)
red = cv2.imread(channel_file.format(im_id, 'red'), 0)
blue = cv2.imread(channel_file.format(im_id, 'blue'), 0)
yellow = cv2.imread(channel_file.format(im_id, 'yellow'), 0)

# One panel per channel, laid out row by row on a 2x2 grid.
fig, ax = plt.subplots(nrows=2, ncols=2, figsize=(15, 15))
panels = ((green, "greens", "Protein of interest"),
          (red, "reds", "Microtubules"),
          (blue, "blues", "Nucleus"),
          (yellow, "yellows", "Endoplasmic reticulum"))
for axis, (channel, cmap_name, title) in zip(ax.flat, panels):
    axis.imshow(channel, cmap=cmap_name)
    axis.set_title(title, fontsize=18)
    # Hide tick labels and tick marks; pixel coordinates are not informative here.
    axis.set_xticklabels([])
    axis.set_yticklabels([])
    axis.tick_params(left=False, bottom=False)
plt.show()

In [None]:
# Split every space-separated Target string into its individual label ids.
labels_num = [value.split() for value in train['Target']]
# Flatten to one integer id per (image, label) occurrence.
labels_num_flat = [int(item) for sublist in labels_num for item in sublist]
# Translate the ids into readable location names.
labels = [subcell_locs[label_id] for label_id in labels_num_flat]

# Bar chart of how often each location occurs in the training targets.
# `kind` is passed by keyword (positional arguments to Series.plot are rejected
# by newer pandas) and the plot is drawn on the axes created here.
fig, ax = plt.subplots(figsize=(15, 5))
pd.Series(labels).value_counts().plot(kind='bar', fontsize=14, ax=ax)

In [None]:
# Directories holding the train and test channel images.
train_path = "../input/train/"
test_path = "../input/test/"

# File names found in each directory (order as returned by the OS).
train_img = os.listdir(train_path)
test_img = os.listdir(test_path)


### Total Number of  Train and Test Images

In [None]:
# One row per image file; every sample contributes several channel files.
train_df = pd.DataFrame(train_img, columns=['image_id'])
test_df = pd.DataFrame(test_img, columns=['image_id'])
print("Number of Total Train Image : ", len(train_df))
# Message fixed: this line reports the test files, not "Test Train".
print("Number of Total Test Image : ", len(test_df))

### Get the Color Name From Train and Test Images

In [None]:
def _channel_color(file_name):
    """Return the channel colour named in ``file_name``.

    The colours are tried in the order red, blue, yellow, green and the first
    one contained in the name wins. ``None`` is returned when the name
    contains none of them, so every file still gets exactly one entry.
    """
    for candidate in ('red', 'blue', 'yellow', 'green'):
        if candidate in file_name:
            return candidate
    return None


# Derive the colour from each row's own file name. Unlike collecting matches
# in a separate list, an unmatched file cannot shift the remaining colours
# onto the wrong rows, and re-running the cell gives the same result.
train_df['c_name'] = train_df['image_id'].map(_channel_color)
test_df['c_name'] = test_df['image_id'].map(_channel_color)

In [None]:
# Number of training files per colour channel.
plt.figure(figsize=(15,12))
train_df["c_name"].value_counts().plot(kind="bar")
# Vertical bars: the categories (colours) are on x, their counts on y.
plt.xlabel("Colors")
plt.ylabel("Counts")
# legend() expects a list of labels; a bare string is iterated character by
# character, which labelled the bars just "C".
plt.legend(["Image count"])
plt.title("Color Image Counts")

In [None]:
# Preview the first five rows of the train file table.
train_df.head(5)

In [None]:
# Preview the first five rows of the test file table.
test_df.head(5)

In [None]:
# The sample id is the part of the file name before the first underscore.
for frame in (train_df, test_df):
    frame['id'] = frame['image_id'].str.split('_').str[0]

### Total Number of Unique Image on Train and Test Data


When the four colour-channel files are combined under a single unique id, there are only 31072 images in the train data and 11702 in the test data.


In [None]:
# Count the distinct sample ids (each id groups its channel files).
n_train_ids = train_df['id'].nunique()
n_test_ids = test_df['id'].nunique()
print("Total Number of Unique Image on Train Data ", n_train_ids)
print("Total Number of Unique Image on Test Data ", n_test_ids)

### Now Sort the value id and color name wise

The DataFrame below makes this easier to understand.

**For each unique id you get the four colour channels.**

In [None]:
# Order the files by sample id, then colour, and renumber the rows from 0 so
# the channel files of one sample sit on consecutive rows.
sort_keys = ['id', 'c_name']
train_df = train_df.sort_values(by=sort_keys).reset_index(drop=True)
test_df = test_df.sort_values(by=sort_keys).reset_index(drop=True)

In [None]:
# Preview the first twelve rows of the sorted train file table.
train_df.head(12)

In [None]:
# Preview the first twelve rows of the sorted test file table.
test_df.head(12)

### We read only one image and take some insights from it
The ID is **00070df0-bbc3-11e8-b2bc-ac1f6b6435d0** from the train data.

In [None]:
import cv2
import gc
gc.collect()
import matplotlib.pyplot as plt

### Display the four colour-channel images for a single id

In [None]:
# Read the four channel files of one training sample as greyscale (flag 0).
img_1 = cv2.imread('../input/train/00070df0-bbc3-11e8-b2bc-ac1f6b6435d0_blue.png',0)
img_2 = cv2.imread('../input/train/00070df0-bbc3-11e8-b2bc-ac1f6b6435d0_green.png',0)
img_3 = cv2.imread('../input/train/00070df0-bbc3-11e8-b2bc-ac1f6b6435d0_red.png',0)
img_4 = cv2.imread('../input/train/00070df0-bbc3-11e8-b2bc-ac1f6b6435d0_yellow.png',0)

# Show each channel with matplotlib's default colour map.
# Titles follow the channel actually loaded: blue = nucleus, green = protein
# of interest, red = microtubules, yellow = endoplasmic reticulum (the
# previous titles were attached to the wrong channels).
fig, ax = plt.subplots(nrows = 2, ncols=2, figsize=(20, 20))
ax[0, 0].imshow(img_1)
ax[0, 0].set_title("Nucleus", fontsize=18)
ax[0, 1].imshow(img_2)
ax[0, 1].set_title("Protein of interest", fontsize=18)
ax[1, 0].imshow(img_3)
ax[1, 0].set_title("Microtubules", fontsize=18)
ax[1, 1].imshow(img_4)
ax[1, 1].set_title("Endoplasmic reticulum", fontsize=18)
# Hide tick labels and tick marks on every panel.
for i in range(2):
    for j in range(2):
        ax[i, j].set_xticklabels([])
        ax[i, j].set_yticklabels([])
        ax[i, j].tick_params(left=False, bottom=False)
plt.show()

In [None]:
# Read the four channel files of one training sample as greyscale (flag 0).
img_1 = cv2.imread('../input/train/00070df0-bbc3-11e8-b2bc-ac1f6b6435d0_blue.png',0)
img_2 = cv2.imread('../input/train/00070df0-bbc3-11e8-b2bc-ac1f6b6435d0_green.png',0)
img_3 = cv2.imread('../input/train/00070df0-bbc3-11e8-b2bc-ac1f6b6435d0_red.png',0)
img_4 = cv2.imread('../input/train/00070df0-bbc3-11e8-b2bc-ac1f6b6435d0_yellow.png',0)

# Show each channel with the custom colour map of the same name.
# Titles follow the channel actually loaded: blue = nucleus, green = protein
# of interest, red = microtubules, yellow = endoplasmic reticulum (the
# previous titles were attached to the wrong channels).
fig, ax = plt.subplots(nrows = 2, ncols=2, figsize=(20, 20))
ax[0, 0].imshow(img_1, cmap="blues")
ax[0, 0].set_title("Nucleus", fontsize=18)
ax[0, 1].imshow(img_2, cmap="greens")
ax[0, 1].set_title("Protein of interest", fontsize=18)
ax[1, 0].imshow(img_3, cmap="reds")
ax[1, 0].set_title("Microtubules", fontsize=18)
ax[1, 1].imshow(img_4, cmap="yellows")
ax[1, 1].set_title("Endoplasmic reticulum", fontsize=18)
# Hide tick labels and tick marks on every panel.
for i in range(2):
    for j in range(2):
        ax[i, j].set_xticklabels([])
        ax[i, j].set_yticklabels([])
        ax[i, j].tick_params(left=False, bottom=False)
plt.show()


## [opencv python merge different channel images into one](https://stackoverflow.com/questions/44112358/opencv-python-merge-different-channel-images-into-one)

As OpenCV 3.x stored image as numpy array, we can simply average each image and add them together, provided that the height and width of the images are exactly the same.


In [None]:
# Read the four channel files of one training sample as greyscale (flag 0).
sample_prefix = '../input/train/00070df0-bbc3-11e8-b2bc-ac1f6b6435d0'
img_1 = cv2.imread(sample_prefix + '_blue.png', 0)
img_2 = cv2.imread(sample_prefix + '_green.png', 0)
img_3 = cv2.imread(sample_prefix + '_red.png', 0)
img_4 = cv2.imread(sample_prefix + '_yellow.png', 0)

# Average the channels. Each one is divided before the addition, so the sum
# is formed from the already-scaled values rather than the raw pixel values.
no_img = 4
img = sum(channel / no_img for channel in (img_1, img_2, img_3, img_4))
img.shape

In [None]:
# Display the averaged image with matplotlib's default colour map.
plt.figure(figsize=(10, 10))
plt.imshow(img)
plt.show()

### Combine all Four Images and Here is the view of few Images

In [None]:
# First twelve rows of the sorted file table.
train_df1 = train_df[0:12]

no_img = 4
for name, group in train_df1.groupby(['id'])['image_id']:
    # Read the first four files of this sample as greyscale (flag 0).
    channels = [cv2.imread('../input/train/' + group.values[k], 0) for k in range(4)]

    # Average the channels; each is scaled before being added.
    img = sum(channel / no_img for channel in channels)
    print(img.shape)

    plt.figure(figsize=(10, 10))
    plt.imshow(img)
    plt.show()

In [None]:
# Build one averaged image per sample from the first 6000 rows of the sorted
# file table (presumably 4 channel files x 1500 samples - verify that every
# id in this slice really has four files).
# The slice is kept in its own variable so the metadata frame `train` loaded
# from train.csv is not overwritten; earlier cells that read
# train["Target"] / train["Id"] therefore still work when re-run.
train_image = []
train_subset = train_df[0:6000]
no_img = 4
for name, group in train_subset.groupby(['id'])['image_id']:
    # Rows are sorted by colour name, so the files arrive in the order
    # blue, green, red, yellow; each is read as greyscale (flag 0).
    img_1 = cv2.imread('../input/train/'+group.values[0],0)
    img_2 = cv2.imread('../input/train/'+group.values[1],0)
    img_3 = cv2.imread('../input/train/'+group.values[2],0)
    img_4 = cv2.imread('../input/train/'+group.values[3],0)
    # Scale each channel before adding them up.
    img = img_1/no_img + img_2/no_img + img_3/no_img + img_4/no_img
    train_image.append(img)

### Now set the target variable id-wise
First you have to sort the target variable.

In [None]:
# Reload the training metadata and order it by sample id, so its rows line up
# with the id-sorted image files.
train = (
    pd.read_csv("../input/train.csv")
    .sort_values(by='Id')
    .reset_index(drop=True)
)
train.head()

In [None]:
# Split the first 1500 target strings into lists of label-id strings.
labels = [target.split(" ") for target in train['Target'][0:1500]]
print("length of Traget Variable :",len(labels))

In [None]:
# Stack the per-sample images into a single array.
image = np.array(train_image)
# The label lists have different lengths (one entry per label of a sample).
# dtype=object keeps them as an array of lists; without it NumPy >= 1.24
# raises ValueError for such ragged input.
labels = np.array(labels, dtype=object)

In [None]:
# Run a garbage-collection pass to release unreferenced objects.
gc.collect()

In [None]:
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.model_selection import train_test_split

In [None]:
# Turn the per-sample label lists into a 0/1 indicator matrix with
# scikit-learn's multi-label binarizer.
print("[INFO] class labels:")
mlb = MultiLabelBinarizer()
labels = mlb.fit_transform(labels)

# Print every class the binarizer found, numbered from 1.
for position, label in enumerate(mlb.classes_, start=1):
    print("{}. {}".format(position, label))

In [None]:
from keras.preprocessing.image import ImageDataGenerator

#================================
# import the necessary packages
from keras.models import Sequential
from keras.layers.normalization import BatchNormalization
from keras.layers.convolutional import Conv2D
from keras.layers.convolutional import MaxPooling2D
from keras.layers.core import Activation
from keras.layers.core import Flatten
from keras.layers.core import Dropout
from keras.layers.core import Dense
from keras import backend as K

#================================

import matplotlib
#matplotlib.use("Agg")
 
# import the necessary packages
from keras.preprocessing.image import ImageDataGenerator
from keras.optimizers import Adam
from keras.preprocessing.image import img_to_array

In [None]:
# Side lengths used for the image arrays.
img_width = img_height = 512

# Hold out 30% of the samples for validation (fixed seed for repeatability).
trainX, testX, trainY, testY = train_test_split(
    image, labels, test_size=0.3, random_state=42
)

# Append a trailing channel axis of size 1 to both image arrays.
trainX, testX = (
    arr.reshape(arr.shape[0], img_width, img_height, 1)
    for arr in (trainX, testX)
)

In [None]:
# Image generator created with its default arguments.
aug = ImageDataGenerator()
# Number of training epochs.
EPOCHS = 20
# Initial learning rate handed to the optimizer.
INIT_LR = 1e-3
# Batch size; also used to derive steps_per_epoch when fitting.
BS = 32

### Make keras deep learning model for image classification - VGG

In [None]:
# Small VGG-style CNN for multi-label classification of the merged images.
depth = 1       # single greyscale channel
chanDim = -1    # channels-last: batch normalisation acts on the last axis
# One output unit per column of the binarized targets. This replaces the
# unused tuple `classes=28,` and the hard-coded Dense(27), so the output layer
# always matches the labels the model is trained on.
classes = trainY.shape[1]
finalAct = "sigmoid"

inputShape = (img_width, img_height, depth)

model = Sequential()
# CONV => RELU => POOL
model.add(Conv2D(32, (3, 3), padding="same",
                 input_shape=inputShape))
model.add(Activation("relu"))
model.add(BatchNormalization(axis=chanDim))
model.add(MaxPooling2D(pool_size=(3, 3)))
model.add(Dropout(0.25))
# (CONV => RELU) * 2 => POOL
model.add(Conv2D(64, (3, 3), padding="same"))
model.add(Activation("relu"))
model.add(BatchNormalization(axis=chanDim))
model.add(Conv2D(64, (3, 3), padding="same"))
model.add(Activation("relu"))
model.add(BatchNormalization(axis=chanDim))
model.add(MaxPooling2D(pool_size=(2, 2)))
model.add(Dropout(0.25))
# (CONV => RELU) * 2 => POOL
model.add(Conv2D(128, (3, 3), padding="same"))
model.add(Activation("relu"))
model.add(BatchNormalization(axis=chanDim))
model.add(Conv2D(128, (3, 3), padding="same"))
model.add(Activation("relu"))
model.add(BatchNormalization(axis=chanDim))
model.add(MaxPooling2D(pool_size=(2, 2)))
model.add(Dropout(0.25))

# first (and only) set of FC => RELU layers
model.add(Flatten())
model.add(Dense(1024))
model.add(Activation("relu"))
model.add(BatchNormalization())
model.add(Dropout(0.5))

# Sigmoid (not softmax) output: each label is predicted independently,
# which is what multi-label classification needs.
model.add(Dense(classes))
model.add(Activation(finalAct))

# Adam with a learning-rate decay spread over the planned number of epochs.
opt = Adam(lr=INIT_LR, decay=INIT_LR / EPOCHS)

# Binary cross-entropy treats every output unit as its own yes/no problem.
model.compile(loss="binary_crossentropy", optimizer=opt,metrics=["accuracy"])

In [None]:
# Train the network; the returned History object is kept for plotting.
# The generator batch size must match the BS used for steps_per_epoch:
# with batch_size=1 and len(trainX) // BS steps, each epoch only saw about
# 1/BS of the training samples.
# NOTE(review): if batches of BS images do not fit in memory, lower BS itself
# rather than the generator batch size alone.
model_vgg = model.fit_generator(
    aug.flow(trainX, trainY, batch_size=BS),
    validation_data=(testX, testY),
    steps_per_epoch=len(trainX) // BS,
    epochs=EPOCHS,
    verbose=1,
)


In [None]:
# Plot the training curves: loss on the left, accuracy on the right.
history = model_vgg.history
# Older Keras stores the accuracy metric as "acc"; Keras >= 2.3 stores it as
# "accuracy". Use whichever key this History object contains.
acc_key = "acc" if "acc" in history else "accuracy"

fig, ax = plt.subplots(1, 2, figsize=(15,5))
ax[0].set_title('loss')
ax[0].plot(model_vgg.epoch, history["loss"], label="Train loss")
ax[0].plot(model_vgg.epoch, history["val_loss"], label="Validation loss")
ax[1].set_title('acc')
ax[1].plot(model_vgg.epoch, history[acc_key], label="Train acc")
ax[1].plot(model_vgg.epoch, history["val_" + acc_key], label="Validation acc")
ax[0].legend()
ax[1].legend()


In [None]:
#sub = pd.read_csv("../input/sample_submission.csv")
# test_image = []
# for name, group in test_df.groupby(['id'])['image_id']:
#     img_1 = cv2.imread('../input/test/'+group.values[0],0)
#     img_2 = cv2.imread('../input/test/'+group.values[1],0)
#     img_3 = cv2.imread('../input/test/'+group.values[2],0)
#     img_4 = cv2.imread('../input/test/'+group.values[3],0)
#     img = []
#     img = img_1/no_img + img_2/no_img + img_3/no_img + img_4/no_img
#     i = i + 1
#     print(i)
#     test_image.append(img)

# TO-DO

* optimise the ram/release the ram
* read all test dataset and prediction value on our prepared model

# If you know any technique that reads all the dataset images without consuming the whole Kaggle RAM, please share the details; it would be a great help for me and others too.

### If you like this kernel, please give it an *upvote*.
