In [1]:
"""
Robert E Ruzzo III
Classify.ipynb

The purpose of this notebook is to batch classify unlabeled images

"""
import os
import pandas as pd
import numpy as np
import keras
import tensorflow as tf
tf.set_random_seed(42)
from keras.models import Model
from keras.layers import Input
from keras.preprocessing.image import ImageDataGenerator

Using TensorFlow backend.


In [12]:
"""
Configuration
Used to hold variable values making them easier to change if needed.

    Args: 
        None

Variables:
    batch_size (int): The batch processing size
    data_dir (string): The directory which contains the subdirectories of the photos to be analyzed.
        For this notebook to work correctly the pictures have to be divided into subdirectories based on their class.
    sample_sub (string): The path of the submission example csv
    file_dir (string): The base directory of the dataset

"""
class Configuration:
    """Holds the tunable values of this notebook in one place.

    Every argument is optional; calling ``Configuration()`` with no arguments
    gives the default setup of the notebook.

    Args:
        batch_size (int): Number of images per prediction batch.
        data_dir (string): Directory handed to the image generator. Defaults
            to ``file_dir + 'test'``.
        sample_sub (string): Path of the sample submission csv whose ``id``
            column defines the row order of the output. Defaults to
            ``file_dir + 'sample_submission.csv'``.
        file_dir (string): Base directory of the dataset (must end with a
            path separator because the defaults above are built by
            concatenation).
    """

    # Base directory of the dataset; the other default paths are derived from it.
    DEFAULT_FILE_DIR = 'D:\\Datasets\\histopathologic-cancer-detection\\'

    def __init__(self, batch_size=128, data_dir=None, sample_sub=None, file_dir=None):
        self.batch_size = batch_size
        self.file_dir = self.DEFAULT_FILE_DIR if file_dir is None else file_dir
        self.data_dir = self.file_dir + 'test' if data_dir is None else data_dir
        # The submission must follow the order of the sample submission file.
        # This previously pointed at train_labels.csv, whose ids are not the
        # ids of the images being classified here.
        # NOTE(review): assumes the file is named sample_submission.csv - confirm.
        self.sample_sub = (
            self.file_dir + 'sample_submission.csv' if sample_sub is None else sample_sub
        )

In [13]:
# Single shared settings object read by the cells below
config = Configuration()

In [4]:
# Restore the previously trained model from its saved file
histo_model = keras.models.load_model(filepath='ResNet50_10_BCross')

Instructions for updating:
Colocations handled automatically by placer.




In [5]:
#Print a layer-by-layer summary (layer type, output shape, parameter count) of the loaded model
histo_model.summary()
#Uncomment the next line to view the weights
#histo_model.get_weights() 

__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_4 (InputLayer)            (None, 96, 96, 3)    0                                            
__________________________________________________________________________________________________
conv1_pad (ZeroPadding2D)       (None, 102, 102, 3)  0           input_4[0][0]                    
__________________________________________________________________________________________________
conv1 (Conv2D)                  (None, 48, 48, 64)   9472        conv1_pad[0][0]                  
__________________________________________________________________________________________________
bn_conv1 (BatchNormalization)   (None, 48, 48, 64)   256         conv1[0][0]                      
__________________________________________________________________________________________________
activation

In [6]:
"""
setup_data : Function
Used to setup the imagedatagenerator iterables for batch classification

    Args: 
        test_data_dir (string): The directory that contains a subdirectory with the images. IT IS NECESSARY that
            the images sit one level below this directory rather than directly inside it. Otherwise the generator
            will not work correctly

    Returns:
        test_generator (iterable image generator) : An iterable generator object for batch image classification
    

"""
def setup_data(test_data_dir, batch_size=None, target_size=(96, 96)):
    """Build the image generator used for batch classification.

    Args:
        test_data_dir (string): Directory that contains a subdirectory with
            the images. The images must sit one level below this directory.
        batch_size (int): Number of images per batch. When omitted, the value
            of ``config.batch_size`` is used.
        target_size (tuple): (height, width) every image is resized to.

    Returns:
        test_generator: Iterable generator yielding batches of rescaled
            images only (no labels), in a fixed order.
    """
    if batch_size is None:
        # Fall back to the notebook-wide setting so existing calls keep working
        batch_size = config.batch_size

    # Only scaling is performed: pixel values are multiplied by 1/255
    test_datagen = ImageDataGenerator(rescale=1.0/255)

    test_generator = test_datagen.flow_from_directory(
        test_data_dir,
        batch_size=batch_size,
        target_size=target_size,
        class_mode=None,   # no labels are produced for unlabeled data
        shuffle=False)     # keep a fixed order so predictions can be matched to files

    return test_generator

In [7]:
#Setup our test generator iterable
test_gen = setup_data(config.data_dir)
#Calculate the steps with a ceiling division so the final, partial batch is included.
#The previous form (n // batch_size + 1) asked for one batch too many whenever the
#number of images was an exact multiple of the batch size.
steps = (test_gen.n + config.batch_size - 1) // config.batch_size
# Note: the number of predictions must equal the number of images. A manual check:
# if images / batch size has a remainder, one extra (partial) batch is needed,
# which is exactly what the ceiling division above provides.

Found 57458 images belonging to 1 classes.


'\n    Note: If you get less than the number of images for results you need to add 1 to the batch number to include the \n    remainder. A manual check can verify this, if the result of images / batch size doesnt have a remainder of 0 then\n    it is necessary. \n'

In [8]:
# Run the batched prediction pinned to the first GPU
device_name = "/gpu:0"
with tf.device(device_name):
    probabilities = histo_model.predict_generator(generator=test_gen, steps=steps)
    

In [14]:
#Load all of the predictions
predicts = list(probabilities)
#Convert the predictions to 0,1
class_preds = np.argmax(predicts, axis=1)
#Gather the image names from the generator itself: its filenames list is in the
#order the images were fed to the model, so names and probabilities line up.
#(Listing config.file_dir with os.listdir looked at the dataset base directory
#instead of the image folder and gave no guarantee about ordering.)
image_names = [os.path.splitext(os.path.basename(f))[0] for f in test_gen.filenames]
#Every image must have exactly one prediction, otherwise the pairing below is wrong
assert len(image_names) == len(predicts), (
    "got %d predictions for %d images" % (len(predicts), len(image_names)))

In [16]:
# Keep only column 1 of the model output (column 0 is dropped).
# NOTE(review): assumes column 1 is the positive-class score - confirm against the training notebook.
predictions = np.asarray(predicts)[:, 1]

In [17]:
# Map each image name to its prediction; both sequences are paired position by position
predicts_dictionary = dict(zip(image_names, predictions))

In [18]:
# The output file must list ids in the same order as the sample submission
samples = pd.read_csv(filepath_or_buffer=config.sample_sub)
# Pull the id column out as a plain list, keeping the file's row order
sample_list = list(samples['id'])

In [19]:
# Look up the prediction of every id, in sample-submission order
# (ids without a prediction come back as None)
pred_list_cor = [predicts_dictionary.get(sample_id) for sample_id in sample_list]


In [20]:
# Assemble the two output columns: image id and its predicted label
submission_dataframe = pd.DataFrame(data={'id': sample_list, 'label': pred_list_cor})

# Write the result next to the notebook, with a header row and without the index column
submission_dataframe.to_csv(path_or_buf='submission.csv', index=False, header=True)