In [None]:
import sqlite3
import random
import itertools
import subprocess
import os
import shlex
import time
import pickle

In [None]:
from PIL import Image
# from pillow import Image

import numpy as np
import matplotlib.pyplot as plt

In [None]:
import keras
from keras.preprocessing import image
from keras.applications.imagenet_utils import decode_predictions, preprocess_input
from keras.models import Model

In [None]:
# Instantiate VGG16 with its classification head (include_top=True) and the
# 'imagenet' weight set, then print the layer overview.
model = keras.applications.VGG16(
    include_top=True,
    weights='imagenet',
)
print("model loaded")
model.summary()

In [None]:
def load_image(path):
    """Load one image file and prepare it as input for ``model``.

    The file is read with ``image.load_img`` at the spatial size given by
    ``model.input_shape[1:3]``, converted to an array, given a leading axis
    of length one and passed through ``preprocess_input``.

    Args:
        path: location of the image file to read.

    Returns:
        A tuple ``(img, x)``: the object returned by ``image.load_img`` and
        the preprocessed array built from it.
    """
    spatial_size = model.input_shape[1:3]
    img = image.load_img(path, target_size=spatial_size)
    batch = np.expand_dims(image.img_to_array(img), axis=0)
    return img, preprocess_input(batch)

In [None]:
# Feature extractor: shares the input of `model` but ends at the "fc2" layer,
# so its output is that layer's output rather than the final predictions.
fc2_output = model.get_layer("fc2").output
feat_extractor = Model(inputs=model.input, outputs=fc2_output)

print("feature extractor setup")
feat_extractor.summary()

In [None]:
# Open the SQLite database file and create the cursor `c` that the query
# cells below execute their statements on.
db_path = "/home/rte/data/db/arxiv_db_images.sqlite3"
db = sqlite3.connect(database=db_path)
c = db.cursor()

In [None]:
# Sanity check: ask SQLite for the column description of the "metadata"
# table and print one tuple per column.
table_name = "metadata"
c.execute('PRAGMA TABLE_INFO({})'.format(table_name))
info = c.fetchall()

print("\nColumn Info:\nID, Name, Type, NotNull, DefaultVal, PrimaryKey")
for column in info:
    print(column)

### Get random samples

In [None]:
# Pick 1000 image records at random. Each row is
# (id, path, filename, identifier, cat); because of the LEFT JOIN, `cat` is
# NULL/None for images whose identifier has no match in `metadata`.
sql = '''
    SELECT images.id, images.path, images.filename, images.identifier, metadata.cat
    FROM images
    LEFT JOIN metadata ON images.identifier = metadata.identifier
    ORDER BY RANDOM()
    LIMIT 1000
    '''

# execute() hands back the cursor itself, so the fetch can be chained
rows = c.execute(sql).fetchall()
print(len(rows))

### Run ImageMagick `convert` on the samples

In [None]:

# Convert every sampled image into a JPEG named "<image id>.jpg" inside
# `convert_path` by calling the external `convert` program once per image.
# Expects `rows` from the sampling query: (id, path, filename, identifier, cat).

start = time.time()

targetSize = 512

convert_path = "/home/rte/data/images/random/1k/"

# make sure the output directory exists before anything is written into it
os.makedirs(convert_path, exist_ok=True)

# Build the source path of every sampled image. The './' prefix of the stored
# path is swapped for the absolute source root.
# NOTE(review): str.replace substitutes *every* './' in the path, not only a
# leading one -- confirm that stored paths contain it only as a prefix.
filepaths = []
for row in rows:
    path = row[1] + '/' + row[2]
    filepaths.append(path.replace('./', '/home/rte/arXiv/src_all/'))

print("total number of filepaths: " + str(len(filepaths)))

# write list of image paths and IDs to file (for debugging purposes, mostly)
fname = convert_path + "filepaths.txt"
with open(fname, "w+") as listing:
    for path, row in zip(filepaths, rows):
        listing.write(path + "," + str(row[0]) + "\n")

# arguments for convert
# NOTE MODIFIED TO REMOVE "^>"
arguments = shlex.split("-colorspace sRGB -background white -alpha off -resize " + str(targetSize))

# call convert for each image path
failed = []
for row, src in zip(rows, filepaths):
    outputname = [convert_path + str(row[0]) + ".jpg"]

    # "[0]" is appended so that only the first frame/page of the input is
    # converted (ImageMagick frame selector).
    convert_cmd = ["convert", "-density", "300", src + "[0]"] + arguments + outputname

    result = subprocess.run(convert_cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE)

    # only report images that produced diagnostics instead of printing an
    # empty b'' for every successful conversion
    if result.returncode != 0 or result.stderr:
        print(src + ": " + result.stderr.decode(errors="replace").strip())
    if result.returncode != 0:
        failed.append(src)

print("finished converting!")
if failed:
    print("convert failed for %d of %d images" % (len(failed), len(filepaths)))
end = time.time()
print("time taken:", end - start)

In [None]:
# Classify every converted image with the full VGG16 `model` and store the
# image paths together with their decoded predictions in a pickle file.

current_path = "/home/rte/data/images/random/1k/"
# current_path = convert_path

image_extensions = ['.jpg', '.png', '.jpeg']   # case-insensitive (upper/lower doesn't matter)
# NOTE(review): max_num_images is defined but never applied in this cell.
max_num_images = 100000

# collect every file below current_path whose extension is in image_extensions
images = [
    os.path.join(dirpath, filename)
    for dirpath, _dirnames, filenames in os.walk(current_path)
    for filename in filenames
    if os.path.splitext(filename)[1].lower() in image_extensions
]
num_x = len(images)
print("keeping %d images to analyze" % num_x)

# time.clock() was removed in Python 3.8; perf_counter() is its replacement
# for measuring elapsed time.
tic = time.perf_counter()

# all_predictions[i] holds the decoded (label, probability) pairs of images[i]
all_predictions = []

# feature extraction is currently disabled (see the commented lines in the
# loop), so this list stays empty
features = []

for i, image_path in enumerate(images):
    if i % 500 == 0:
        toc = time.perf_counter()
        elap = toc - tic
        print("analyzing image %d / %d. Time: %4.4f seconds." % (i, len(images), elap))
        tic = time.perf_counter()
    img, x = load_image(image_path)

#     feat = feat_extractor.predict(x)[0]
#     features.append(feat)

    predictions = model.predict(x)

    # decode_predictions returns one list per input sample; x contains a
    # single image, hence the [0]. Each entry unpacks to (id, label, score).
    image_predictions = []
    for _, pred, prob in decode_predictions(predictions)[0]:
        print("predicted %s with probability %0.3f" % (pred, prob))
        # float() turns the score into a plain Python float before pickling
        image_predictions.append((pred, float(prob)))
    all_predictions.append(image_predictions)

print('finished predicting class for %d images' % len(images))

# write images, predictions to a pickle file

pickle_fname = "classification_vgg_subset" + ".pickle"

print(pickle_fname)

# WRITE (the with-block closes the file, no explicit close() needed)
with open(pickle_fname, "wb") as write_file:
    pickle.dump([images, all_predictions], write_file)
