In [43]:
import pandas as pd
from skimage.transform import resize
# from scipy.misc import imresize
from keras.applications import vgg16, vgg19, inception_v3, resnet50, xception
from keras.models import Model
import matplotlib.pyplot as plt
import numpy as np
import os
%matplotlib inline

In [8]:
def image_batch_generator(image_names, batch_size):
    num_batches = len(image_names) // batch_size
    for i in range(num_batches):
        batch = image_names[i * batch_size : (i + 1) * batch_size]
        yield batch
    batch = image_names[(i+1) * batch_size:]
    yield batch
    
def vectorize_images(image_dir, image_size, preprocessor, 
                     model, vector_file, batch_size=32):
    image_names = os.listdir(image_dir)
    num_vecs = 0
    fvec = open(vector_file, "wb")
    for image_batch in image_batch_generator(image_names, batch_size):
        batched_images = []
        for image_name in image_batch:
            image = plt.imread(os.path.join(image_dir, image_name))
            image = imresize(image, (image_size, image_size))
            batched_images.append(image)
        X = preprocessor(np.array(batched_images, dtype="float32"))
        vectors = model.predict(X)
        for i in range(vectors.shape[0]):
            if num_vecs % 100 == 0:
                print("{:d} vectors generated".format(num_vecs))
            image_vector = ",".join(["{:.5e}".format(v) for v in vectors[i].tolist()])
            fvec.write("{:s}\t{:s}\n".format(image_batch[i], image_vector))
            num_vecs += 1
    print("{:d} vectors generated".format(num_vecs))
    fvec.close()


In [130]:
train_df = pd.read_pickle('train_df.pkl')
train_df.head()

Unnamed: 0,Image,Id,size,height,width,aspect,color,square
0,00022e1a.jpg,w_e15442c,"(500, 699)",500,699,0.715308,0,False
1,000466c4.jpg,w_1287fbc,"(700, 1050, 3)",700,1050,0.666667,1,False
2,00087b01.jpg,w_da2efe0,"(368, 1050, 3)",368,1050,0.350476,1,False
3,001296d5.jpg,w_19e5482,"(170, 397, 3)",170,397,0.428212,1,False
4,0014cfdf.jpg,w_f22f3e3,"(398, 700)",398,700,0.568571,0,False


### Generate image vectors using VGG16

In [34]:
DATA_DIR = "./data/"
IMAGE_DIR = os.path.join(DATA_DIR, "train_processed/")
print(IMAGE_DIR)

./data/train_processed/


In [10]:
# vgg16_model = vgg16.VGG16(weights="imagenet", include_top=True)
# # vgg16_model.summary()

Downloading data from https://github.com/fchollet/deep-learning-models/releases/download/v0.1/vgg16_weights_tf_dim_ordering_tf_kernels.h5


In [29]:
# model = Model(input=vgg16_model.input,
#              output=vgg16_model.get_layer("fc2").output)
# preprocessor = vgg16.preprocess_input

# vectorize_images(IMAGE_DIR, IMAGE_SIZE, preprocessor, model, VECTOR_FILE)

In [35]:
image_dir = IMAGE_DIR

image_names = os.listdir(image_dir)
image_names[:10]

['00022e1a.jpg',
 '000466c4.jpg',
 '00087b01.jpg',
 '001296d5.jpg',
 '0014cfdf.jpg',
 '0025e8c2.jpg',
 '0026a8ab.jpg',
 '0031c258.jpg',
 '0035632e.jpg',
 '0037e7d3.jpg']

In [116]:
image_size = 224
# vector_file = os.path.join('./output/', "vgg16_vec_training.tsv")
preprocessor = vgg16.preprocess_input
model = Model(input=vgg16_model.input,
             output=vgg16_model.get_layer("fc2").output)

train_names = []
train_vecs = []
batched_images = []
batched_names = []

# for i in range(6400,len(image_names)):
for i in range(3200,len(image_names)):

    image_name = image_names[i]
    img = plt.imread(os.path.join(image_dir, image_name))
    img = np.stack((img,img,img),axis=2)
    img = resize(img, (image_size, image_size))
    batched_images.append(img)
    batched_names.append(image_name)

    if i%100 == 99:
        
        print('Generating batch ',i//100+1)
        train_names += batched_names
        X = preprocessor(np.array(batched_images, dtype="float32"))
        vecs = model.predict(X)
        train_vecs += vecs.tolist()
        
        batched_images = []
        batched_names = []
    
    if i == len(image_names)-1:
        
        train_names += batched_names
        X = preprocessor(np.array(batched_images, dtype="float32"))
        vecs = model.predict(X)
        train_vecs += vecs.tolist() 

  """
  warn("The default mode, 'constant', will be changed to 'reflect' in "


Generating batch  33
Generating batch  34
Generating batch  35
Generating batch  36
Generating batch  37
Generating batch  38
Generating batch  39
Generating batch  40
Generating batch  41
Generating batch  42
Generating batch  43
Generating batch  44
Generating batch  45
Generating batch  46
Generating batch  47
Generating batch  48
Generating batch  49
Generating batch  50
Generating batch  51
Generating batch  52
Generating batch  53
Generating batch  54
Generating batch  55
Generating batch  56
Generating batch  57
Generating batch  58
Generating batch  59
Generating batch  60
Generating batch  61
Generating batch  62
Generating batch  63
Generating batch  64
Generating batch  65
Generating batch  66
Generating batch  67
Generating batch  68
Generating batch  69
Generating batch  70
Generating batch  71
Generating batch  72
Generating batch  73
Generating batch  74
Generating batch  75
Generating batch  76
Generating batch  77
Generating batch  78
Generating batch  79
Generating ba

ValueError: operands could not be broadcast together with shapes (6600,4096) (50,4096) 

In [111]:
np.stack((img,img,img),axis=2).shape

(224, 224, 3, 3)

In [109]:
image.shape

(224, 224, 3)

In [117]:
vec_df = pd.DataFrame({'name':train_names[:len(train_vecs)],'vec':train_vecs})
vec_df.head()

Unnamed: 0,name,vec
0,52f9b180.jpg,"[0.0, 0.3950077295303345, 0.22122499346733093,..."
1,5306471d.jpg,"[0.0, 0.3900309205055237, 0.21196603775024414,..."
2,530c43d6.jpg,"[0.0, 0.3982654809951782, 0.2229750156402588, ..."
3,53377226.jpg,"[0.0, 0.40115663409233093, 0.22265923023223877..."
4,53399bfb.jpg,"[0.0, 0.40114134550094604, 0.20162978768348694..."


In [118]:
vec_df.to_pickle('train_vecs_B.pkl')

In [119]:
df_A = pd.read_pickle('train_vecs_A.pkl')
df_A.head()

Unnamed: 0,name,vec
0,00022e1a.jpg,"[0.0, 0.40396255254745483, 0.23073887825012207..."
1,000466c4.jpg,"[0.0, 0.40127861499786377, 0.2082851231098175,..."
2,00087b01.jpg,"[0.0, 0.39454972743988037, 0.22640979290008545..."
3,001296d5.jpg,"[0.0, 0.3986162543296814, 0.22212862968444824,..."
4,0014cfdf.jpg,"[0.0, 0.4057381749153137, 0.21147847175598145,..."


In [120]:
df_B = vec_df
df_B.head()

Unnamed: 0,name,vec
0,52f9b180.jpg,"[0.0, 0.3950077295303345, 0.22122499346733093,..."
1,5306471d.jpg,"[0.0, 0.3900309205055237, 0.21196603775024414,..."
2,530c43d6.jpg,"[0.0, 0.3982654809951782, 0.2229750156402588, ..."
3,53377226.jpg,"[0.0, 0.40115663409233093, 0.22265923023223877..."
4,53399bfb.jpg,"[0.0, 0.40114134550094604, 0.20162978768348694..."


In [132]:
df = pd.concat([df_A, df_B, vec_df])
len(df)

9850

In [128]:
df.to_pickle('train_vecs.pkl')

In [134]:
# df['label'] = df['name'].apply(lambda x: train_df[train_df.Image==x].iloc[0]['Id'])
df.head()

Unnamed: 0,name,vec,label
0,00022e1a.jpg,"[0.0, 0.40396255254745483, 0.23073887825012207...",w_e15442c
1,000466c4.jpg,"[0.0, 0.40127861499786377, 0.2082851231098175,...",w_1287fbc
2,00087b01.jpg,"[0.0, 0.39454972743988037, 0.22640979290008545...",w_da2efe0
3,001296d5.jpg,"[0.0, 0.3986162543296814, 0.22212862968444824,...",w_19e5482
4,0014cfdf.jpg,"[0.0, 0.4057381749153137, 0.21147847175598145,...",w_f22f3e3


In [135]:
df.to_pickle('train_vecs.pkl')

### Generate VGG16 vectors for test set

In [136]:
DATA_DIR = "./data/"
IMAGE_DIR = os.path.join(DATA_DIR, "test_processed/")
print(IMAGE_DIR)

./data/test_processed/


In [137]:
image_dir = IMAGE_DIR

image_names = os.listdir(image_dir)
image_names[:10]

['00029b3a.jpg',
 '0003c693.jpg',
 '000bc353.jpg',
 '0010a672.jpg',
 '00119c3f.jpg',
 '001259cc.jpg',
 '0015f9b4.jpg',
 '0018c4ba.jpg',
 '001bf484.jpg',
 '002d8d81.jpg']

In [138]:
len(image_names)

15610

In [145]:
# image_size = 224
# # vector_file = os.path.join('./output/', "vgg16_vec_training.tsv")
# preprocessor = vgg16.preprocess_input
# model = Model(input=vgg16_model.input,
#              output=vgg16_model.get_layer("fc2").output)

train_names = []
train_vecs = []
batched_images = []
batched_names = []

# for i in range(6400,len(image_names)):
for i in range(10000,len(image_names)):

    image_name = image_names[i]
    img = plt.imread(os.path.join(image_dir, image_name))
    img = np.stack((img,img,img),axis=2)
    img = resize(img, (image_size, image_size))
    batched_images.append(img)
    batched_names.append(image_name)

    if i%100 == 99:
        
        print('Generating batch ',i//100+1)
        train_names += batched_names
        X = preprocessor(np.array(batched_images, dtype="float32"))
        vecs = model.predict(X)
        train_vecs += vecs.tolist()
        
        batched_images = []
        batched_names = []
    
    if i == len(image_names)-1:
        
        train_names += batched_names
        X = preprocessor(np.array(batched_images, dtype="float32"))
        vecs = model.predict(X)
        train_vecs += vecs.tolist()

  warn("The default mode, 'constant', will be changed to 'reflect' in "


Generating batch  101
Generating batch  102
Generating batch  103
Generating batch  104
Generating batch  105
Generating batch  106
Generating batch  107
Generating batch  108
Generating batch  109
Generating batch  110
Generating batch  111
Generating batch  112
Generating batch  113
Generating batch  114
Generating batch  115
Generating batch  116
Generating batch  117
Generating batch  118
Generating batch  119
Generating batch  120
Generating batch  121
Generating batch  122
Generating batch  123
Generating batch  124
Generating batch  125
Generating batch  126
Generating batch  127
Generating batch  128
Generating batch  129
Generating batch  130
Generating batch  131
Generating batch  132
Generating batch  133
Generating batch  134
Generating batch  135
Generating batch  136
Generating batch  137
Generating batch  138
Generating batch  139
Generating batch  140
Generating batch  141
Generating batch  142
Generating batch  143
Generating batch  144
Generating batch  145
Generating

In [146]:
vec_df = pd.DataFrame({'name':train_names[:len(train_vecs)],'vec':train_vecs})
vec_df.head()

Unnamed: 0,name,vec
0,a383d0e2.jpg,"[0.0, 0.4009876251220703, 0.22087979316711426,..."
1,a386d5a5.jpg,"[0.0, 0.40454337000846863, 0.2142924666404724,..."
2,a388218e.jpg,"[0.0, 0.3923652470111847, 0.2241707742214203, ..."
3,a38d0ee5.jpg,"[0.0, 0.4009319543838501, 0.22321638464927673,..."
4,a38d2ec5.jpg,"[0.0, 0.39040112495422363, 0.20830994844436646..."


In [147]:
vec_df.to_pickle('test_vecs_C.pkl')

In [148]:
df_A = pd.read_pickle('test_vecs_A.pkl')
df_B = pd.read_pickle('test_vecs_B.pkl')
df = pd.concat([df_A, df_B, vec_df])
len(df)

15610

In [149]:
df.to_pickle('test_vecs.pkl')