**EXPLORATION OF DATA** 

Inspect the dataset: which files it contains, the image size, the number of training and test images, and how many samples there are of each label.

In [None]:
# Each image is IMAGE_SIZE x IMAGE_SIZE pixels (96x96).
IMAGE_SIZE = 96
# Each image has three colour channels.
IMAGE_CHANNELS = 3

In [None]:
import os
# os.listdir returns entries in arbitrary order; sorting makes the displayed
# listing identical on every run.
sorted(os.listdir('../input/histopathologic-cancer-detection'))

In [None]:
# Print how many directory entries the train and test folders contain.
for split_dir in (
    '../input/histopathologic-cancer-detection/train',  # train data
    '../input/histopathologic-cancer-detection/test',   # test data
):
    print(len(os.listdir(split_dir)))

In [None]:
import pandas as pd

# Load the training labels table.
labels_csv = '../input/histopathologic-cancer-detection/train_labels.csv'
df_data = pd.read_csv(labels_csv)

# shape is (number of rows, number of columns) of the labels table
print(df_data.shape)

In [None]:
# Count how many rows carry each distinct value of the 'label' column.
label_counts = df_data['label'].value_counts()
label_counts

**VISUALISATION**

In [None]:
#necessary libraries
import cv2
import matplotlib.pyplot as plt
%matplotlib inline

In [None]:
# Note: this function is not original work; it was taken from another source.
def draw_category_images(col_name,figure_cols, df, IMAGE_PATH):
    """
    Show a random sample of images for every category of a dataframe column.

    Each distinct value of ``df[col_name]`` gets one row of the figure, holding
    up to ``figure_cols`` randomly chosen images of that category. Because the
    rows are drawn with ``DataFrame.sample``, each call displays different
    images.

    Parameters
    ----------
    col_name : str
        Column of ``df`` whose distinct values define the categories.
    figure_cols : int
        Number of columns in the figure, i.e. the sample size per category.
    df : pandas.DataFrame
        Must contain ``col_name`` and an ``id`` column holding the image file
        names without the ``.tif`` extension.
    IMAGE_PATH : str
        Directory containing the ``<id>.tif`` files, with or without a
        trailing path separator.

    Raises
    ------
    FileNotFoundError
        If an image file cannot be read.
    """
    categories = df.groupby(col_name).size().index
    # squeeze=False keeps `ax` two-dimensional even when there is a single
    # category or a single column, so ax[i, j] indexing always works.
    f, ax = plt.subplots(nrows=len(categories), ncols=figure_cols,
                         figsize=(4*figure_cols, 4*len(categories)),  # adjust size here
                         squeeze=False)
    # draw a row of images for each category
    for i, cat in enumerate(categories):
        cat_rows = df[df[col_name] == cat]
        # A category may have fewer rows than figure_cols; sample what exists.
        sample = cat_rows.sample(min(figure_cols, len(cat_rows)))
        for j in range(figure_cols):
            if j >= len(sample):
                # No image left for this cell: hide the empty axes.
                ax[i, j].axis('off')
                continue
            file = os.path.join(IMAGE_PATH, sample.iloc[j]['id'] + '.tif')
            im = cv2.imread(file)
            if im is None:
                # cv2.imread reports failure by returning None instead of raising.
                raise FileNotFoundError('Could not read image: {!r}'.format(file))
            # OpenCV loads images as BGR while imshow expects RGB; convert so
            # the colours are displayed correctly.
            ax[i, j].imshow(cv2.cvtColor(im, cv2.COLOR_BGR2RGB), resample=True)
            ax[i, j].set_title(cat, fontsize=16)
    plt.tight_layout()
    plt.show()

In [None]:
# Show four randomly sampled training images for each value of 'label'.
IMAGE_PATH = '../input/histopathologic-cancer-detection/train/'

draw_category_images(
    col_name='label',
    figure_cols=4,
    df=df_data,
    IMAGE_PATH=IMAGE_PATH,
)

**Samples with the central 32x32 region marked**

In [None]:
def readImage(path):
    """
    Load an image file and return it with channels in RGB order.

    Parameters
    ----------
    path : str
        Path of the image file, passed to ``cv2.imread``.

    Returns
    -------
    The image array with its channels reordered from BGR to RGB.

    Raises
    ------
    FileNotFoundError
        If ``cv2.imread`` cannot read ``path``.
    """
    # OpenCV reads the image in BGR format by default
    bgr_img = cv2.imread(path)
    if bgr_img is None:
        # cv2.imread reports failure by returning None instead of raising,
        # which would otherwise surface as an obscure error further down.
        raise FileNotFoundError('Could not read image: {!r}'.format(path))
    # Flip to RGB for visualization purposes
    return cv2.cvtColor(bgr_img, cv2.COLOR_BGR2RGB)

In [None]:
# Folder that holds the training image files (no trailing slash).
train_path = '../input/histopathologic-cancer-detection/train'

In [None]:
from sklearn.utils import shuffle #for random sampling
import matplotlib.patches as patches #for rectangular part
# Random sampling: shuffle the rows so the first five ids of each label are a
# random draw.
shuffled_data = shuffle(df_data)

fig, ax = plt.subplots(2, 5, figsize=(20, 8))
fig.suptitle('Histopathologic scans of lymph node sections', fontsize=20)

# One figure row per label: (label value, outline colour, row caption).
row_specs = [
    (0, 'b', 'Negative samples'),
    (1, 'r', 'Tumor tissue samples'),
]
for row, (label_value, colour, caption) in enumerate(row_specs):
    ids = shuffled_data[shuffled_data['label'] == label_value]['id'][:5]
    for col, idx in enumerate(ids):
        path = os.path.join(train_path, idx)
        ax[row, col].imshow(readImage(path + '.tif'))
        # Outline the central 32x32 region. imshow centres pixel k on
        # coordinate k, so pixels 32..63 span 31.5..63.5 in data coordinates.
        box = patches.Rectangle((31.5, 31.5), 32, 32, linewidth=4,
                                edgecolor=colour, facecolor='none',
                                linestyle=':', capstyle='round')
        ax[row, col].add_patch(box)
    ax[row, 0].set_ylabel(caption, size='large')
# Render the figure explicitly so the cell does not echo a Text repr.
plt.show()