In [None]:
import numpy as np
import os
from pathlib import Path
import pickle
import tensorflow as tf
import tensorflow.keras as keras
from tensorflow.keras.preprocessing import image
from tensorflow.keras.applications.vgg16 import preprocess_input

import sys
sys.path.append('..')
from helper.classification_tools import CustomLabelEncoder

# Loading files
First, we need to get the file paths of the pre-processed images we saved in 01_preprocess.ipynb. 

In [None]:
# Collect the pre-processed bitmaps. The paths are sorted before shuffling so
# the seeded shuffle below always starts from the same order.
img_root = Path('..') / 'data' / 'images_preprocessed' / 'images_histeq_resize'
assert img_root.is_dir()
files = list(img_root.glob("*.bmp"))
files.sort()

# Shuffle the filenames in place with a fixed seed so they appear in a random,
# but reproducible, order in the dataset.
rs = np.random.RandomState(749976)
rs.shuffle(files)

assert len(files) == 1800
print('first 10 filenames: {}'.format([path.name for path in files[:10]]))

## Extracting the labels from filenames
The labels are determined from the characters in the filename before the first "_". We could also just take the first two characters of the filename, but this does not generalize to cases where the labels have different numbers of characters.

In [None]:
def extract_labels(f):
    """
    Returns the label encoded at the start of each file name.

    The label is the part of the file stem (the name without its extension)
    that comes before the first "_". A stem that contains no "_" is returned whole.

    Parameters
    ----------
    f: list(Path)
        paths to the files whose labels should be extracted.

    Returns
    ----------
    labels: list(str)
        one label per path, in the same order as *f*.
    """
    return [path.stem.partition('_')[0] for path in f]
# Derive the label of every (shuffled) file; labels[i] corresponds to files[i].
labels = extract_labels(files)
# Show a small sample so the parsing can be checked by eye.
print('first 10 labels: {}'.format(labels[:10]))

# Label encoding
One step that will make our lives easier throughout the analysis is standardizing the
encoding of labels. The labels are stored as strings in the filenames, but it is more
convenient to convert them to numeric values when calculating statistics like accuracy, precision, recall, etc.
We can create one LabelEncoder model and save it for reuse throughout the study.

sklearn has a LabelEncoder object, but it doesn't let you choose how the labels are sorted (here, alphabetically while ignoring case). Therefore I wrote a simple label encoder which can do this.

In [None]:
# Fit the encoder on every label. The sorter upper-cases each label before it is
# compared (presumably CustomLabelEncoder uses it as the sort key -- see
# helper/classification_tools).
le = CustomLabelEncoder()
le.fit(labels, sorter=lambda x: x.upper())

# Round-trip the first 10 labels (string -> int -> string) as a sanity check.
labels_int = le.transform(labels[:10])
labels_str = le.inverse_transform(labels_int)

# save the label encoder so it can be used throughout the rest of this study.
# open() does not create missing parent directories, so make sure the models
# directory exists first (it may be absent on a fresh checkout).
label_encoder_path = Path('..','models','label_encoder.pickle')
os.makedirs(label_encoder_path.parent, exist_ok=True)
with open(label_encoder_path, 'wb') as f:
    pickle.dump(le, f)

print('label encodings: {}'.format(le.mapper))
print('first 10 integer labels: {}'.format(labels_int))
print('first 10 string labels: {}'.format(labels_str))

# Loading Images
For feature extraction to work correctly, the images have to be in the correct format for the network weights.
Keras gives us functions for loading and formatting these images. Note the function is called 'preprocessing,'
but it does not actually change the properties of the image like the preprocessing we did before. Instead, it 
ensures that the images are represented the correct way.

In [None]:
def load_images(paths):
    """
    Reads image files into a single array formatted for the Keras VGG16 model.

    Each file is opened as a PIL image and converted to a numpy array; the stacked
    arrays are then passed through the VGG16 preprocess_input() function so that
    the pixel values match the format expected by the pre-trained weights.

    Parameters
    ----------
    paths: list(Path)
        list of Paths to each file where an image is stored. All images should
        have the same height and width in pixels so they can be stacked in one array.

    Returns
    ----------
    images: ndarray
        n_images x r x c x 3 array of pixel values that is compatible with the Keras model.

    """
    arrays = []
    for path in paths:
        pil_img = image.load_img(path)  # PIL image object
        arrays.append(image.img_to_array(pil_img))  # per-image numpy array

    # stack into one batch, then format it the way the imagenet weights expect
    batch = np.asarray(arrays)
    return preprocess_input(batch)

    

In [None]:
# Load the whole shuffled dataset into one array and confirm that no image was dropped.
images = load_images(files)
assert images.shape[0] == 1800
print(images.shape)

# Feature extraction
We will use the VGG16 network as a signal processor, generating a feature descriptor for each image that we can use later for classification.

Get the weights of the VGG16 model

In [None]:
# Keep a local copy of the pre-trained network so it is only downloaded once.
vgg16_path = Path('..','models','VGG16.h5')
if not vgg16_path.is_file():
    vgg16 = keras.applications.VGG16(include_top=True,  # include fully connected layers
                                     weights='imagenet') # use pre-trained model
    # The models directory may not exist yet (e.g. on a fresh checkout); create it
    # so that saving does not fail after the download has already completed.
    os.makedirs(vgg16_path.parent, exist_ok=True)
    vgg16.save(vgg16_path) # save model so we don't have to download it everytime

else:
    vgg16 = keras.models.load_model(vgg16_path) # use saved model



The warning indicates that the model hasn't been compiled with an optimizer/loss function for training. Since we are 
not training the model, and are just using it as a feature extractor, this is not a problem.

We can see the structure of the VGG16 model here.

In [None]:
# Print Keras's text summary of the loaded model.
vgg16.summary()

The pre-trained model will run data through the entire network and return the output of the classification layer. 
However, we only want the output of an intermediate layer so that we can use it as a feature descriptor. 

In [None]:
def layer_extractor(model=vgg16, layer='fc1'):
    """
    returns a model that will extract the outputs of *layer* from *model*.
    
    Parameters
    -------------
    model: keras model
        full model from which intermediate layer will be extracted.
        Defaults to the vgg16 model that is loaded when this function is defined.
    layer: string
        name of layer from which to extract outputs
    
    Returns
    -------------
    new_model: keras model
        feature extractor model which takes the same inputs as *model* and returns the outputs
        of the intermediate layer specified by *layer* by calling new_model.predict(inputs)
    
    Raises
    -------------
    AssertionError
        if *model* has no layer named *layer*.
    """
    layer_names = [x.name for x in model.layers]
    # make sure the layer exists
    assert layer in layer_names, \
        'layer {!r} not found in model; available layers: {}'.format(layer, layer_names)

    # Build the extractor from *model* itself rather than the global vgg16, so that
    # a model passed in by the caller is actually the one being used.
    new_model = keras.Model(inputs=model.input, outputs=[model.get_layer(layer).output])
    
    return new_model




# FC1 features

In [None]:
# Build the extractor for the default layer (fc1) and run every image through it.
fc1_extractor = layer_extractor()
fc1 = fc1_extractor.predict(images)

# Bundle the features with the file names and labels they belong to.
results = dict(filename=files, features=fc1, labels=labels, layer_name='fc1')

# Make sure the output directory exists, then save the results.
feature_dir = Path('..') / 'data' / 'features'
os.makedirs(feature_dir, exist_ok=True)
fc1_path = feature_dir / 'VGG16_fc1_features_std.pickle'
with open(fc1_path, 'wb') as f:
    pickle.dump(results, f)

print(fc1.shape)

# Features from other layers
Simply repeat the process, substituting the name of the layer you wish to extract.

In [None]:
# Check whether the model has a layer with this name (displays True or False).
any(x.name == 'block3_conv2' for x in vgg16.layers)

In [None]:
# Extract features from a few other layers and write one pickle file per layer.
for layer in ('fc2', 'block5_pool', 'block5_conv3'):
    # model that outputs the activations of this layer
    extractor = layer_extractor(layer=layer)
    # features extracted by that model
    features = extractor.predict(images)
    # same result format as used for the fc1 features
    results = dict(filename=files, features=features, labels=labels, layer_name=layer)
    out_path = feature_dir / 'VGG16_{}_features.pickle'.format(layer)
    with open(out_path, 'wb') as f:
        pickle.dump(results, f)

# FC1 features without histogram equalization
Use the fc1 extractor defined above on the images that were resized but not histogram-equalized.

In [None]:
# Collect the resized images that did not go through histogram equalization.
# NOTE(review): glob('*') matches every entry in the directory, not only *.bmp;
# the length assertion below is what guards against stray files.
img_root_nohisteq = Path('..') / 'data' / 'images_preprocessed' / 'images_resize'
assert img_root_nohisteq.is_dir()
files_noh = list(img_root_nohisteq.glob('*'))
files_noh.sort()

# Seeded in-place shuffle, so the order is random but reproducible.
rs = np.random.RandomState(3626210179)
rs.shuffle(files_noh)

labels_noh = extract_labels(files_noh)
assert len(files_noh) == 1800

print('first 5 filenames and labels')
print([path.name for path in files_noh[:5]])
print(labels_noh[:5])

In [None]:
# Same pipeline as for the histogram-equalized images: load, convert to array and
# format for vgg16, then reuse the fc1 extractor that was built earlier.
images_noh = load_images(files_noh)
fc1_noh = fc1_extractor.predict(images_noh)

# Store the features together with the file names and labels they belong to.
results = dict(filename=files_noh, features=fc1_noh, labels=labels_noh, layer_name='fc1 no_histeq')
noh_path = feature_dir / 'VGG16_fc1_features_NoHistEQ.pickle'
with open(noh_path, 'wb') as f:
    pickle.dump(results, f)

