# Extracting Image features

In [1]:
# Imports
import os
# Suppress warnings
import warnings
warnings.filterwarnings('ignore')

import pandas as pd
import numpy as np
import json

# matplotlib and seaborn for plotting
import matplotlib
import matplotlib.pyplot as plt
from matplotlib import gridspec
import seaborn as sns
%matplotlib inline  

plt.style.use('fivethirtyeight')

In [2]:
def walk_up_folder(path, depth=1):
    """
    Return an ancestor directory of ``path``.

    ``depth`` counts from 1: ``depth=1`` returns ``path`` unchanged,
    ``depth=2`` returns its parent, ``depth=3`` its grandparent, and so on.
    Values of ``depth`` below 1 behave like ``depth=1``.
    """
    # Number of os.path.dirname() hops still to perform.
    levels_left = depth - 1
    while levels_left > 0:
        path = os.path.dirname(path)
        levels_left -= 1
    return path

In [3]:
# With depth=1 walk_up_folder returns the current working directory itself,
# so the catalogue is expected at <cwd>/Data/product_data.json.
data_root = walk_up_folder(os.getcwd(), depth=1)
data_path = os.path.join(data_root, 'Data/product_data.json')

# Parse the whole JSON document straight from the open file handle.
with open(data_path, encoding='utf-8') as data_file:
    data = json.load(data_file)

In [4]:
def blank_image():
    """
    Create an all-black 200x200 RGB placeholder image.

    Returned in place of a product image that is missing or unreadable.
    """
    # Height x width x channels, every pixel value zero (black).
    black_pixels = np.zeros((200, 200, 3), dtype="uint8")
    return Image.fromarray(black_pixels, 'RGB')

In [5]:
import PIL
from PIL import Image
import requests
from io import BytesIO
import urllib.request
from keras.preprocessing.image import ImageDataGenerator, array_to_img, img_to_array, load_img
from keras.models import Sequential
from keras.layers import Convolution2D, MaxPooling2D, ZeroPadding2D
from keras.layers import Activation, Dropout, Flatten, Dense

Using TensorFlow backend.


In [6]:
# images=[]
# for i in range(len(data)):
#     url=data[i]['images_url']
#     if url.startswith("//"):
#         url ="https://"+url[2:]
#     try:
#         response = requests.get(url)
#         img = Image.open(BytesIO(response.content))
#         # Making sure all images are of the same dimensions
#         if img.size!=(200,200):
#             img=img.resize((200, 200), PIL.Image.ANTIALIAS)
#         images.append(img)
#     except:
#         images.append(blank_image())
#         continue

In [7]:
#images[287]

In [8]:
#img_to_array(images[287]).shape

In [9]:
#img_to_array(images[0]).reshape(-1).reshape(1,3,200,200)

In [10]:
#print(images[300].size)

In [11]:
#img_to_array(images[330]).shape

In [12]:
#import cv2
#cv2.cvtColor(img_to_array(images[330]), cv2.COLOR_BGRA2BGR).shape

In [13]:
#images[19].resize((200, 200), PIL.Image.ANTIALIAS).size

# Loading the Dataset

In [14]:
# Pre-allocate the image tensor: 1000 slots of 200x200 RGB pixels.
# np.zeros (instead of the raw np.ndarray constructor) guarantees that a slot
# which is never written holds defined values rather than uninitialised memory.
# NOTE(review): the 1000 is hard-coded; the loop below raises IndexError if
# `data` ever holds more than 1000 records.
dataset_final = np.zeros(shape=(1000, 200, 200, 3), dtype=np.float32)

# Loading the image data
for i, record in enumerate(data):
    url = record['images_url']
    # Protocol-relative URLs ("//host/path") need an explicit scheme for requests.
    if url.startswith("//"):
        url = "https://" + url[2:]
    try:
        # A timeout stops one unresponsive host from hanging the whole notebook.
        response = requests.get(url, timeout=30)
        response.raise_for_status()
        img = Image.open(BytesIO(response.content))
        # Normalise every mode (RGBA, greyscale, palette, ...) to 3-channel RGB.
        # This replaces the former cv2.cvtColor call: cv2 is never imported in
        # this notebook, so every RGBA image used to raise NameError and was
        # silently swapped for a blank image by the bare `except`.
        img = img.convert('RGB')
        # Making sure all images are of the same dimensions
        if img.size != (200, 200):
            # Image.LANCZOS is the same filter as the old ANTIALIAS alias,
            # which was removed in Pillow 10.
            img = img.resize((200, 200), Image.LANCZOS)
        x = img_to_array(img)
    except Exception as err:
        # Best effort: a download or decode failure yields a blank placeholder.
        # Catching Exception (not a bare except) lets KeyboardInterrupt through,
        # and the failure is reported instead of being hidden.
        print(i, "falling back to blank image:", repr(err))
        x = img_to_array(blank_image())
    # Scale pixel values from [0, 255] to roughly [-1, 1).
    x = (x - 128.0) / 128.0
    print(i, x.shape)
    dataset_final[i] = x
    #dataset[i] = np.rollaxis(x, axis=2, start=0)   # this is a Numpy array with shape (3, 200, 200)

0 (200, 200, 3)
1 (200, 200, 3)
2 (200, 200, 3)
3 (200, 200, 3)
4 (200, 200, 3)
5 (200, 200, 3)
6 (200, 200, 3)
7 (200, 200, 3)
8 (200, 200, 3)
9 (200, 200, 3)
10 (200, 200, 3)
11 (200, 200, 3)
12 (200, 200, 3)
13 (200, 200, 3)
14 (200, 200, 3)
15 (200, 200, 3)
16 (200, 200, 3)
17 (200, 200, 3)
18 (200, 200, 3)
19 (200, 200, 3)
20 (200, 200, 3)
21 (200, 200, 3)
22 (200, 200, 3)
23 (200, 200, 3)
24 (200, 200, 3)
25 (200, 200, 3)
26 (200, 200, 3)
27 (200, 200, 3)
28 (200, 200, 3)
29 (200, 200, 3)
30 (200, 200, 3)
31 (200, 200, 3)
32 (200, 200, 3)
33 (200, 200, 3)
34 (200, 200, 3)
35 (200, 200, 3)
36 (200, 200, 3)
37 (200, 200, 3)
38 (200, 200, 3)
39 (200, 200, 3)
40 (200, 200, 3)
41 (200, 200, 3)
42 (200, 200, 3)
43 (200, 200, 3)
44 (200, 200, 3)
45 (200, 200, 3)
46 (200, 200, 3)
47 (200, 200, 3)
48 (200, 200, 3)
49 (200, 200, 3)
50 (200, 200, 3)
51 (200, 200, 3)
52 (200, 200, 3)
53 (200, 200, 3)
54 (200, 200, 3)
55 (200, 200, 3)
56 (200, 200, 3)
57 (200, 200, 3)
58 (200, 200, 3)
59 (200

462 (200, 200, 3)
463 (200, 200, 3)
464 (200, 200, 3)
465 (200, 200, 3)
466 (200, 200, 3)
467 (200, 200, 3)
468 (200, 200, 3)
469 (200, 200, 3)
470 (200, 200, 3)
471 (200, 200, 3)
472 (200, 200, 3)
473 (200, 200, 3)
474 (200, 200, 3)
475 (200, 200, 3)
476 (200, 200, 3)
477 (200, 200, 3)
478 (200, 200, 3)
479 (200, 200, 3)
480 (200, 200, 3)
481 (200, 200, 3)
482 (200, 200, 3)
483 (200, 200, 3)
484 (200, 200, 3)
485 (200, 200, 3)
486 (200, 200, 3)
487 (200, 200, 3)
488 (200, 200, 3)
489 (200, 200, 3)
490 (200, 200, 3)
491 (200, 200, 3)
492 (200, 200, 3)
493 (200, 200, 3)
494 (200, 200, 3)
495 (200, 200, 3)
496 (200, 200, 3)
497 (200, 200, 3)
498 (200, 200, 3)
499 (200, 200, 3)
500 (200, 200, 3)
501 (200, 200, 3)
502 (200, 200, 3)
503 (200, 200, 3)
504 (200, 200, 3)
505 (200, 200, 3)
506 (200, 200, 3)
507 (200, 200, 3)
508 (200, 200, 3)
509 (200, 200, 3)
510 (200, 200, 3)
511 (200, 200, 3)
512 (200, 200, 3)
513 (200, 200, 3)
514 (200, 200, 3)
515 (200, 200, 3)
516 (200, 200, 3)
517 (200, 

918 (200, 200, 3)
919 (200, 200, 3)
920 (200, 200, 3)
921 (200, 200, 3)
922 (200, 200, 3)
923 (200, 200, 3)
924 (200, 200, 3)
925 (200, 200, 3)
926 (200, 200, 3)
927 (200, 200, 3)
928 (200, 200, 3)
929 (200, 200, 3)
930 (200, 200, 3)
931 (200, 200, 3)
932 (200, 200, 3)
933 (200, 200, 3)
934 (200, 200, 3)
935 (200, 200, 3)
936 (200, 200, 3)
937 (200, 200, 3)
938 (200, 200, 3)
939 (200, 200, 3)
940 (200, 200, 3)
941 (200, 200, 3)
942 (200, 200, 3)
943 (200, 200, 3)
944 (200, 200, 3)
945 (200, 200, 3)
946 (200, 200, 3)
947 (200, 200, 3)
948 (200, 200, 3)
949 (200, 200, 3)
950 (200, 200, 3)
951 (200, 200, 3)
952 (200, 200, 3)
953 (200, 200, 3)
954 (200, 200, 3)
955 (200, 200, 3)
956 (200, 200, 3)
957 (200, 200, 3)
958 (200, 200, 3)
959 (200, 200, 3)
960 (200, 200, 3)
961 (200, 200, 3)
962 (200, 200, 3)
963 (200, 200, 3)
964 (200, 200, 3)
965 (200, 200, 3)
966 (200, 200, 3)
967 (200, 200, 3)
968 (200, 200, 3)
969 (200, 200, 3)
970 (200, 200, 3)
971 (200, 200, 3)
972 (200, 200, 3)
973 (200, 

In [15]:
# Sanity check: the tensor is laid out as (image slot, height, width, channel).
dataset_final.shape

(1000, 200, 200, 3)

In [16]:
# Inspect the first image after the (x - 128) / 128 scaling applied during loading.
dataset_final[0]

array([[[ 0.9296875,  0.9296875,  0.9296875],
        [ 0.9296875,  0.9296875,  0.9296875],
        [ 0.9296875,  0.9296875,  0.9296875],
        ..., 
        [ 0.9453125,  0.9453125,  0.9453125],
        [ 0.9453125,  0.9453125,  0.9453125],
        [ 0.9453125,  0.9453125,  0.9453125]],

       [[ 0.9296875,  0.9296875,  0.9296875],
        [ 0.9296875,  0.9296875,  0.9296875],
        [ 0.9296875,  0.9296875,  0.9296875],
        ..., 
        [ 0.9453125,  0.9453125,  0.9453125],
        [ 0.9453125,  0.9453125,  0.9453125],
        [ 0.9453125,  0.9453125,  0.9453125]],

       [[ 0.9296875,  0.9296875,  0.9296875],
        [ 0.9296875,  0.9296875,  0.9296875],
        [ 0.9296875,  0.9296875,  0.9296875],
        ..., 
        [ 0.9453125,  0.9453125,  0.9453125],
        [ 0.9453125,  0.9453125,  0.9453125],
        [ 0.9453125,  0.9453125,  0.9453125]],

       ..., 
       [[ 0.9375   ,  0.9375   ,  0.9375   ],
        [ 0.9375   ,  0.9375   ,  0.9375   ],
        [ 0.9375   

# Network Architecture

In [17]:
# Path to a local weights file under <cwd>/Data.
# NOTE(review): the file name refers to VGG16 weights and `weight_file` is not
# used anywhere else in the visible notebook; the VGG19 model is built with
# weights='imagenet' instead. Confirm whether this cell is still needed.
weights_root = walk_up_folder(os.getcwd(), depth=1)
weight_file = os.path.join(weights_root, 'Data/vgg16_weights_tf_dim_ordering_tf_kernels_notop.h5')

In [18]:
from keras.applications import VGG19
from keras.applications.vgg19 import preprocess_input

In [42]:
# Create the base model of VGG19
# include_top=False drops the fully connected classifier, so the network
# outputs convolutional feature maps for the 200x200 RGB inputs.
# The former `classes=11` argument was removed: Keras only uses `classes`
# when include_top=True, so it had no effect here and was misleading.
vgg19 = VGG19(weights='imagenet', include_top=False, input_shape=(200, 200, 3))

In [43]:
# VGG19's preprocess_input expects RGB pixel values in the 0-255 range (it
# reorders channels and subtracts the ImageNet channel means). dataset_final
# was already scaled with (x - 128) / 128 when it was loaded, so that scaling
# is undone first; otherwise the mean subtraction swamps the image content.
# The arithmetic also yields a fresh array, so dataset_final itself is left
# untouched whether or not preprocess_input works in place.
X_train = preprocess_input(dataset_final * 128.0 + 128.0)

In [44]:
# Preprocessing must not change the tensor layout.
X_train.shape

(1000, 200, 200, 3)

In [45]:
# Confirm the preprocessed input is still a plain NumPy array.
type(X_train)

numpy.ndarray

In [61]:
# For finetuning, if we have labels
import keras
from keras.models import Model
from keras import models
from keras import layers
from keras import optimizers

# Add Dense and Dropout layers on top of VGG19 pre-trained
# The head consumes the flattened VGG19 features. For 200x200 inputs the
# feature maps are (6, 6, 512), i.e. 6 * 6 * 512 = 18432 values per image,
# so input_dim must be 6 * 6 * 512 (it was 4 * 4 * 512, which cannot accept
# the extracted features).
# NOTE(review): the output layer has 10 units; confirm this matches the real
# number of label classes before training.
model = models.Sequential()
model.add(layers.Dense(512, activation='relu', input_dim=6 * 6 * 512))
model.add(layers.Dropout(0.5))
model.add(layers.Dense(10, activation="softmax"))

# Compile the model
model.compile(loss=keras.losses.categorical_crossentropy,
              optimizer=keras.optimizers.Adam(),
              metrics=['accuracy'])

In [48]:
# Uninitialised buffer for the extracted features: one (6, 6, 512) feature map
# per image. Every row is meant to be overwritten by the prediction loop.
train_features = np.empty((1000, 6, 6, 512), dtype=np.float32)

In [49]:
# Extract VGG19 features in chunks of 100 images to bound peak memory use.
# The bound comes from the destination buffer instead of a hard-coded 1000,
# and progress is reported after a chunk has actually been processed
# (previously "Finished 100" was printed before the work started).
n_samples = train_features.shape[0]
chunk_size = 100
for start in range(0, n_samples, chunk_size):
    stop = min(start + chunk_size, n_samples)
    train_features[start:stop] = vgg19.predict(X_train[start:stop], batch_size=256, verbose=1)
    print("Finished", stop, "of", n_samples)

Finished 100
Finished 100
Finished 100
Finished 100
Finished 100
Finished 100
Finished 100
Finished 100
Finished 100
Finished 100


In [50]:
# Feature maps before flattening: (image, row, column, channel).
train_features.shape

(1000, 6, 6, 512)

In [51]:
# Flatten each (6, 6, 512) feature map into one 18432-long row per image.
train_features = train_features.reshape((1000, 6 * 6 * 512))

In [52]:
# After flattening: one row of 6 * 6 * 512 = 18432 features per image.
train_features.shape

(1000, 18432)

In [53]:
# sklearn.externals.joblib was deprecated in scikit-learn 0.21 and removed in
# 0.23. Prefer the standalone joblib package and fall back to the vendored
# copy only on old installations that lack it.
try:
    import joblib
except ImportError:
    from sklearn.externals import joblib

In [54]:
# Persist the flattened feature matrix to image_feats.pkl in the working directory.
joblib.dump(train_features,'image_feats.pkl')

['image_feats.pkl']

In [55]:
# Read the saved features back to verify the round trip.
new_feats = joblib.load('image_feats.pkl')

In [56]:
# The reloaded matrix should have the same shape as train_features.
new_feats.shape

(1000, 18432)

In [57]:
# Confirm the reloaded object is a NumPy array.
type(new_feats)

numpy.ndarray