# Experiment 5.0 - Inception Resnet v2 - extract Features

Reproduce Results of [Transfer learning with deep convolutional neural network for liver steatosis assessment in ultrasound images](https://pubmed.ncbi.nlm.nih.gov/30094778/). We used a pre-trained CNN to extract features based on B-mode images. 

The CNNfeatures are extracted using the pretrained Inception-Resnet-v2 implemented in Keras.
See reference: https://jkjung-avt.github.io/keras-inceptionresnetv2/

![Screen Shot 2020-10-12 at 2 57 22 PM](https://user-images.githubusercontent.com/23482039/95781182-47437700-0c9b-11eb-8826-594811ba3322.png)


In [68]:
import sys
import random
sys.path.append('../src')

import warnings
warnings.filterwarnings("ignore") 

import pickle
import pandas as pd

import numpy as np
import mlflow
import matplotlib.pyplot as plt
import tensorflow as tf
import keras
from tqdm import tqdm

from utils.compute_metrics import get_metrics, get_majority_vote,log_test_metrics
from utils.dataframe_creation import create_dataframe_preproccessing
from sklearn.preprocessing import StandardScaler
from tensorflow.python.keras.applications.inception_resnet_v2 import InceptionResNetV2, preprocess_input
from sklearn.svm import LinearSVC
from sklearn.svm import SVC
from sklearn.model_selection import GroupKFold
from tqdm import tqdm
from pprint import pprint
from itertools import product
from keras.preprocessing.image import ImageDataGenerator


## 1. Feature Extraction

In [69]:
M, N= 434, 636 # ultrasound image dimension
#In the paper, the authors extract the features directly from the pretrained inception resnet_v2
# pooling: Optional pooling mode for feature extraction when include_top is False.
#'max' means that global max pooling will be applied.
net = InceptionResNetV2(include_top=False,
                        weights='imagenet',
                        pooling= 'avg')
dataset = create_dataframe_preproccessing()
dataset.head()

Unnamed: 0,id,labels,fat,fname
0,1,0,3,../data/02_interim/raw_images/1/P1_image1.jpg
1,1,0,3,../data/02_interim/raw_images/1/P1_image2.jpg
2,1,0,3,../data/02_interim/raw_images/1/P1_image3.jpg
3,1,0,3,../data/02_interim/raw_images/1/P1_image4.jpg
4,1,0,3,../data/02_interim/raw_images/1/P1_image5.jpg


In [74]:
dataset = dataset.astype({"labels": str})
datagen = ImageDataGenerator()
#datagen = ImageDataGenerator(rescale=1./255)
#datagen = ImageDataGenerator(featurewise_center=True,
                                   #featurewise_std_normalization=True)
#datagen.mean = np.array([0.485, 0.456, 0.406], dtype=np.float32)
#datagen.std = np.array([0.229, 0.224, 0.225], dtype=np.float32)

#From paper: Images were resized using the bi-cubic interpolation algorithm to the resolution originally designed for the network.
generator = datagen.flow_from_dataframe(dataset, x_col='fname',y_col='labels',shuffle=False, target_size=(299,299), interpolation ='bicubic', batch_size =25)

Found 550 validated image filenames belonging to 2 classes.


In [75]:
for i, (inputs_batch, labels_batch) in enumerate(tqdm(generator)):
    v = net(inputs_batch)
    break
    

  0%|          | 0/22 [00:04<?, ?it/s]


In [65]:
v.shape

TensorShape([25, 1536])

In [57]:
max_pool_dim = 1536
features = np.zeros(shape=(len(dataset), max_pool_dim))
batch_size = 25
assert len(dataset)%batch_size ==0, '550 should be divisable by batch_SIZE'

for i, (inputs_batch, labels_batch) in enumerate(tqdm(generator)):
    features[i*batch_size:(i+1)*batch_size] = net(inputs_batch)
    if (i+1)*batch_size == 550:
        break

 95%|█████████▌| 21/22 [01:16<00:03,  3.63s/it]


## 2. Save features

In [58]:
df_features = pd.DataFrame(features)
df_features= pd.concat([dataset['id'], dataset['labels'], df_features], axis=1)
df_features[["labels"]] = df_features[["labels"]].apply(pd.to_numeric)
df_features.head()

Unnamed: 0,id,labels,0,1,2,3,4,5,6,7,...,1526,1527,1528,1529,1530,1531,1532,1533,1534,1535
0,1,0,0.652569,0.456327,0.347633,0.06385,0.060641,0.223506,0.598393,0.708075,...,0.188298,0.75634,0.256016,0.600391,0.515476,0.245517,0.114754,0.142941,0.592586,0.405014
1,1,0,0.662694,0.397173,0.379525,0.042804,0.077925,0.218255,0.676569,0.675742,...,0.143959,0.599035,0.272683,0.512366,0.536357,0.293908,0.120353,0.124621,0.505979,0.538553
2,1,0,0.678799,0.44609,0.28122,0.059611,0.047996,0.203064,0.655565,0.815762,...,0.142166,0.560333,0.238087,0.467891,0.550278,0.30547,0.100554,0.120454,0.49094,0.465642
3,1,0,0.664021,0.461157,0.343546,0.070976,0.063509,0.222404,0.672063,0.745233,...,0.210451,0.724491,0.293846,0.627026,0.606262,0.303108,0.112011,0.186801,0.556711,0.576585
4,1,0,0.606826,0.469013,0.323584,0.076501,0.083841,0.248243,0.636795,0.775547,...,0.215539,0.629683,0.255984,0.533129,0.611823,0.318369,0.141402,0.116915,0.508992,0.509136


In [59]:
# Create a dict for the scattering features, and parameters
inception_dict_tensor = {
                'features':df_features,
                'label' : dataset['labels'],
                'pid' : dataset['id'],
                'Interpolation': 'bicubic' }

with open('../data/03_features/inception_dict_tensor_avg_interpolation_pooling.pickle', 'wb') as handle:
    pickle.dump(inception_dict_tensor, handle, protocol=pickle.HIGHEST_PROTOCOL)