Credit: [@anokas](https://www.kaggle.com/anokas)

In [None]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import os
import gc
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

pal = sns.color_palette()

import plotly.offline as py
py.init_notebook_mode(connected=True)
import plotly.graph_objs as go
import plotly.tools as tls

# Report the size, in decimal GB, of every entry under ../input.  For a
# directory, the sizes of its immediate children are summed and the number
# of children is shown next to the total.
print('# File sizes')
for entry in os.listdir('../input'):
    entry_path = '../input/' + entry
    if os.path.isdir(entry_path):
        sizes = [
            os.path.getsize(entry_path + '/' + child) / (1000000*1000)
            for child in os.listdir(entry_path)
        ]
        summary = str(round(sum(sizes), 7)) + 'GB' + ' ({} files)'.format(len(sizes))
    else:
        summary = str(round(os.path.getsize(entry_path) / (1000000*1000), 7)) + 'GB'
    # Names are left-justified to 30 characters so the sizes line up.
    print(entry.ljust(30) + summary)

# Training Data

In [None]:
# Load the training table.  Later cells read its 'Target' column and unpack
# each of the first rows into two values (an image id and a target string).
df_train = pd.read_csv('../input/train.csv')
# Last expression of the cell: preview the first rows of the table.
df_train.head()

## A Useful Dictionary: Label ID → Name

In [None]:
# Lookup table from integer label id to a human-readable name.  The id of
# each name is its position in the list below (0 through 27).
dicts = dict(enumerate([
    "Nucleoplasm",
    "Nuclear membrane",
    "Nucleoli",
    "Nucleoli fibrillar center",
    "Nuclear speckles",
    "Nuclear bodies",
    "Endoplasmic reticulum",
    "Golgi apparatus",
    "Peroxisomes",
    "Endosomes",
    "Lysosomes",
    "Intermediate filaments",
    "Actin filaments",
    "Focal adhesion sites",
    "Microtubules",
    "Microtubule ends",
    "Cytokinetic bridge",
    "Mitotic spindle",
    "Microtubule organizing center",
    "Centrosome",
    "Lipid droplets",
    "Plasma membrane",
    "Cell junctions",
    "Mitochondria",
    "Aggresome",
    "Cytosol",
    "Cytoplasmic bodies",
    "Rods & rings",
]))

In [None]:
from collections import Counter, defaultdict

# Each 'Target' cell is a space-separated string of label ids; split it into
# a list of id strings, one list per training row.
labels = df_train['Target'].apply(lambda x: x.split(' '))

# Tally how often each label id occurs across all rows.  Counter is a dict
# subclass that keeps keys in first-seen order, so the keys and values used
# below stay aligned.
counts = Counter(label for row in labels for label in row)

# Human-readable name for each label id, in the same order as `counts`.
strs = [dicts[int(label)] for label in counts]

data = [go.Bar(x=strs, y=list(counts.values()))]
layout = dict(height=800, width=800, title='Distribution of training labels')
fig = dict(data=data, layout=layout)
# Plot the whole figure rather than only the traces; passing `data` alone
# silently dropped the size and title defined in `layout`.
py.iplot(fig, filename='train-label-dist')

In [None]:
# Co-occurrence matrix: com[i, j] is the fraction of training rows containing
# label i that also contain label j, so every diagonal entry is 1.
label_keys = list(counts.keys())
position = {key: idx for idx, key in enumerate(label_keys)}

# pair_counts[i, j] = number of rows in which labels i and j both appear.
# One pass over the rows replaces rescanning every row for each label pair.
pair_counts = np.zeros([len(label_keys)] * 2)
for row in labels.values:
    # set() so a label repeated within one row is still counted once per row.
    present = [position[key] for key in set(row)]
    pair_counts[np.ix_(present, present)] += 1

# The diagonal holds the number of rows containing each label; dividing each
# matrix row by its diagonal entry yields the conditional frequencies.  Every
# key of `counts` occurs in at least one row, so no divisor is zero.
com = pair_counts / pair_counts.diagonal()[:, np.newaxis]

data=[go.Heatmap(z=com, x=list(strs), y=list(strs))]
layout=go.Layout(height=800, width=800, title='Co-occurence matrix of training labels')
fig=dict(data=data, layout=layout)
# Plot the whole figure rather than only the traces; passing `data` alone
# silently dropped the size and title defined in `layout`.
py.iplot(fig, filename='train-com')

# Images

In [None]:
import cv2
from PIL import Image


def _load_composite(image_id):
    """Return the green, red and blue filter images of *image_id* summed.

    The three images are added in a wider integer type and the result is
    capped at 255 before converting back to 8-bit.  Adding 8-bit arrays
    directly wraps around on overflow, which turns the brightest pixels dark.

    Raises:
        FileNotFoundError: if one of the three PNG files cannot be read
            (``cv2.imread`` returns ``None`` in that case instead of raising).
    """
    total = None
    for colour in ('green', 'red', 'blue'):
        path = '../input/train/{}_{}.png'.format(image_id, colour)
        channel = cv2.imread(path)
        if channel is None:
            raise FileNotFoundError(path)
        channel = channel.astype(np.uint16)
        total = channel if total is None else total + channel
    return np.minimum(total, 255).astype(np.uint8)


# Show the first four training images in a 2x2 grid, each titled with its id
# and the names of its labels.
new_style = {'grid': False}
plt.rc('axes', **new_style)
_, ax = plt.subplots(2, 2, sharex='col', sharey='row', figsize=(20, 20))
# ax.flat walks the grid row by row, matching the order of the table rows.
for panel, (image_id, target) in zip(ax.flat, df_train[:4].values):
    panel.imshow(Image.fromarray(_load_composite(image_id)))
    # join() avoids both a trailing separator and a variable named `str`,
    # which would shadow the builtin for the rest of the notebook session.
    names = ", ".join(dicts[int(label_id)] for label_id in target.split(" "))
    panel.set_title('{} - {}'.format(image_id, names))

plt.show()