# Nearest Neighbors with Keras

In [1]:
import glob
from itertools import groupby
from pathlib import Path
import numpy as np
import pandas as pd
from keras.applications.resnet50 import ResNet50, preprocess_input
from keras.preprocessing import image
from keras.layers import Dense, Reshape
from keras.models import Model
from coremltools.converters.keras import convert
from sklearn.model_selection import train_test_split
from keras.applications.vgg16 import VGG16
from tqdm import tqdm_notebook as tqdm

Using TensorFlow backend.


In [2]:
# Target resolution (in pixels) used when images are loaded for the encoder.
IMG_HEIGHT = 240
IMG_WIDTH = 240

# Folder containing the images to cluster; expanduser() resolves a leading "~" if present.
img_folder = Path("/home/soham/AeroMIT/dataset_exploiting/kmeans_version/").expanduser()

## Base Network for Feature Extraction
We load a VGG16 model that has been pre-trained on ImageNet. We specify `include_top=False`, which ensures that we don't load the final layers specific to the classes the model was originally trained to predict. For more information, see the [Keras documentation](https://keras.io/applications/#vgg16).

In [3]:
# VGG16 without its classification head (include_top=False), initialised with ImageNet weights.
encoder_model = VGG16(
    weights='imagenet',
    include_top=False,
    input_shape=(IMG_HEIGHT, IMG_WIDTH, 3),
)

Instructions for updating:
Colocations handled automatically by placer.


Instructions for updating:
Colocations handled automatically by placer.


In [4]:
# Size of the flattened encoder output per image: the product of every output dimension except the batch axis.
np.prod(encoder_model.output.shape.as_list()[1:])

25088

In [5]:
# Number of weight arrays returned by the encoder's get_weights().
len(encoder_model.get_weights())

26

## Split Data

In [6]:
import os
import fnmatch

In [7]:
# Collect the name of every entry in img_folder (os.listdir returns them in arbitrary order).
# NOTE(review): entries are not filtered, so any non-image file or sub-directory in the
# folder would be included as well — confirm the folder only contains images.
# NOTE(review): the loop variable `filename` stays defined at notebook scope after this cell runs.
image_filenames = []
for filename in os.listdir(img_folder):
    image_filenames.append(filename)

In [8]:
# Hold out 10% of the images for the prediction step. The split is seeded so that,
# for the same file listing, re-running the notebook assigns the same files to each side.
train_filenames, test_filenames = train_test_split(
    image_filenames, test_size=0.1, random_state=0
)
print("test images: ", len(test_filenames))
print("train images", len(train_filenames))

test images:  7908
train images 71165


## Extract features for all images in the database

In [None]:
# Work from inside the image folder: image_filenames holds bare names from os.listdir,
# and the loader opens them relative to the current working directory.
os.chdir(img_folder)
# Filled by load_images with one [index, filename] entry per image it loads.
filename_order = []
def load_encode_images(encoder, filenames, batch_size=16):
    """Encode every image in ``filenames`` and return the flattened features.

    The files are processed in consecutive batches: each batch is loaded with
    ``load_images``, passed through ``encoder.predict`` and flattened to one
    row per image.

    Args:
        encoder: Model exposing ``output.shape`` and ``predict``.
        filenames: Sliceable sequence of image paths.
        batch_size: Number of images sent to ``encoder.predict`` at a time.

    Returns:
        ``np.ndarray`` of shape ``(len(filenames), encoded_dim)`` where row
        ``i`` is the flattened encoder output for ``filenames[i]``.

    Side effects:
        ``load_images`` is handed the notebook-level ``filename_order`` list
        and appends to it for every image loaded.
    """
    # as_list() yields plain ints for the non-batch axes, so the product does not
    # depend on the TF1-only ``Dimension.value`` attribute.
    encoded_dim = int(np.prod(encoder.output.shape.as_list()[1:]))
    file_count = len(filenames)
    encoded = np.zeros((file_count, encoded_dim))
    for start_index in tqdm(range(0, file_count, batch_size)):
        end_index = min(start_index + batch_size, file_count)
        batch_filenames = filenames[start_index:end_index]

        batch_images = load_images(filename_order, batch_filenames)
        batch_encoded = encoder.predict(batch_images)
        # Collapse every non-batch axis so each image becomes a single row.
        encoded[start_index:end_index, :] = batch_encoded.reshape(len(batch_images), -1)

    return encoded

def load_images(filename_order, filenames):
    """Load, resize and preprocess one batch of image files.

    Every file is read at ``(IMG_HEIGHT, IMG_WIDTH)``, converted to an array
    and run through ``preprocess_input``.

    Args:
        filename_order: List that receives one ``[i, filename]`` pair per
            image, where ``i`` is the image's position within ``filenames``
            (i.e. within this call, not within the whole dataset).
        filenames: Indexable sequence of image paths to load.

    Returns:
        ``np.ndarray`` of shape ``(len(filenames), IMG_HEIGHT, IMG_WIDTH, 3)``.
    """
    target = (IMG_HEIGHT, IMG_WIDTH)
    images = np.zeros((len(filenames),) + target + (3,))
    for position in range(len(filenames)):
        path = filenames[position]
        pixels = image.img_to_array(image.load_img(path, target_size=target))
        # preprocess_input is given a one-image batch, hence the extra leading axis.
        single_batch = preprocess_input(pixels[np.newaxis, ...])
        images[position, :, :, :] = single_batch
        filename_order.append([position, path])
    return images

# Encode the training images. The result is stored transposed here, i.e. as
# (features, images); the next cell transposes it back before clustering.
encoded_imgs = load_encode_images(encoder_model, train_filenames).T
# Print a summary only: dumping the whole list would emit one entry per image.
print(len(filename_order), filename_order[:5])

HBox(children=(IntProgress(value=0, max=4448), HTML(value='')))

In [12]:
# Undo the transpose applied when encoded_imgs was created, so each row is one image's feature vector.
# NOTE(review): this reassigns encoded_imgs from itself, so running the cell a second time flips the orientation again.
encoded_imgs = encoded_imgs.T
print(encoded_imgs.shape)


(71164, 25088)


## Building the K-Means model using mini-batches

In [25]:
# Clustering settings.
# NOTE(review): among these, only `k` is referenced by the MiniBatchKMeans call in this
# notebook; that call passes its own batch_size=200 rather than the value defined here.
num_steps = 50 # Total steps to train
batch_size = 1024 # The number of samples per batch
k = 20 # The number of clusters
num_features = 25088  # Flattened encoder output size (same value the np.prod cell above displays)
#X = tf.placeholder(tf.float32, shape=[num_features,None])

In [13]:
from sklearn.cluster import MiniBatchKMeans

In [26]:
# Fit k clusters on the encoded training images with a fixed seed.
kmeans = MiniBatchKMeans(
    n_clusters=k,
    random_state=0,
    batch_size=200,
    max_iter=100000,
).fit(encoded_imgs)

In [44]:
# Shape of the learned centroid matrix.
kmeans.cluster_centers_.shape

(10, 25088)

In [27]:
# Cluster labels the fitted model stored for the training samples.
labels = kmeans.labels_

In [28]:
# Persist the training cluster labels as text.
# NOTE(review): np.savetxt's default format is '%.18e', so the labels are written in
# scientific notation; pass fmt='%d' if plain integers are wanted.
np.savetxt("/home/soham/AeroMIT/rnn_yolo_model/keras-knn/labels_20_v2clusters.csv",labels,delimiter = ',')

In [11]:

# Write the recorded load order to disk, one str([index, filename]) entry per line.
with open("image_filename_order.txt", 'w') as order_file:
    order_file.writelines(str(entry) + '\n' for entry in filename_order)

In [24]:
# Pair every training label with the image it was computed for and save the table.
labels_df = pd.DataFrame(labels)
# Each filename_order entry is [index, filename]; take the filename from the entry
# being iterated (not from an unrelated notebook-level variable).
labels_df['filenames'] = [entry[1] for entry in filename_order]
labels_df.to_csv('/home/soham/AeroMIT/rnn_yolo_model/keras-knn/labels_20_filenames.csv')

Unnamed: 0,0
count,71164.0
mean,5.839708
std,2.376889
min,0.0
25%,7.0
50%,7.0
75%,7.0
max,9.0


In [None]:
# Drop the reference to the training feature matrix so its memory can be reclaimed.
del encoded_imgs

## Predictions

In [None]:
# Held-out filenames are bare names, so they must be opened from inside the image folder.
os.chdir(img_folder)
# Filled by load_predict_images with one [index, filename] entry per held-out image.
prediction_filename_order = []


def load_predict_encode_images(encoder, filenames, batch_size=16):
    """Encode every held-out image in ``filenames`` and return flattened features.

    Args:
        encoder: Model exposing ``output.shape`` and ``predict``.
        filenames: Sliceable sequence of image paths.
        batch_size: Number of images sent to ``encoder.predict`` at a time.

    Returns:
        ``np.ndarray`` of shape ``(len(filenames), encoded_dim)`` where row
        ``i`` is the flattened encoder output for ``filenames[i]``.

    Side effects:
        Appends one entry per image to the notebook-level
        ``prediction_filename_order`` list (via ``load_predict_images``).
    """
    # as_list() yields plain ints for the non-batch axes, so the product does not
    # depend on the TF1-only ``Dimension.value`` attribute.
    encoded_dim = int(np.prod(encoder.output.shape.as_list()[1:]))
    file_count = len(filenames)
    encoded = np.zeros((file_count, encoded_dim))
    for start_index in tqdm(range(0, file_count, batch_size)):
        end_index = min(start_index + batch_size, file_count)
        batch_filenames = filenames[start_index:end_index]

        # Record into the prediction list, not the training filename_order.
        batch_images = load_predict_images(prediction_filename_order, batch_filenames)
        batch_encoded = encoder.predict(batch_images)
        # Collapse every non-batch axis so each image becomes a single row.
        encoded[start_index:end_index, :] = batch_encoded.reshape(len(batch_images), -1)

    return encoded


def load_predict_images(filename_order, filenames):
    """Load, resize and preprocess one batch of image files.

    Args:
        filename_order: List that receives one ``[i, filename]`` pair per
            image, where ``i`` is the image's position within ``filenames``.
        filenames: Sequence of image paths to load.

    Returns:
        ``np.ndarray`` of shape ``(len(filenames), IMG_HEIGHT, IMG_WIDTH, 3)``.
    """
    images = np.zeros((len(filenames), IMG_HEIGHT, IMG_WIDTH, 3))
    for i, filename in enumerate(filenames):
        img = image.load_img(filename, target_size=(IMG_HEIGHT, IMG_WIDTH))
        img_array = image.img_to_array(img)
        # preprocess_input is given a one-image batch, hence the extra leading axis.
        img_array = np.expand_dims(img_array, axis=0)
        img_array = preprocess_input(img_array)
        images[i, :, :, :] = img_array
        # Append to the list that was passed in rather than to a global.
        filename_order.append([i, filename])
    return images


encoded_predict_imgs = load_predict_encode_images(encoder_model, test_filenames)
# Print a summary only: dumping the whole list would emit one entry per image.
print(len(prediction_filename_order), prediction_filename_order[:5])

In [33]:
# Assign each encoded held-out image to a cluster of the fitted model.
prediction_labels = kmeans.predict(encoded_predict_imgs)

Unnamed: 0,0,1,"(a, b)",a
0,1,2,"[5, 6]",5
1,3,4,"[7, 8]",7


In [None]:
# Pair every predicted label with the held-out image it was computed for and save the table.
# The frame is built from prediction_labels (the held-out predictions), not the training labels.
predict_labels_df = pd.DataFrame(prediction_labels)
# Each prediction_filename_order entry is [index, filename]; keep the filename.
predict_labels_df['filenames'] = [entry[1] for entry in prediction_filename_order]
predict_labels_df.to_csv('/home/soham/AeroMIT/rnn_yolo_model/keras-knn/predict_labels_20_filenames.csv')