In [None]:
import os
import random
import time
import datetime
import pickle
import sqlite3
import re

from PIL import Image

import numpy as np
import matplotlib.pyplot as plt
from matplotlib.pyplot import imshow

In [None]:
from sklearn.decomposition import PCA
from sklearn.manifold import TSNE

In [None]:
# path on rte
db_path = os.path.expanduser("~/data/db/arxiv_db_images.sqlite3")

# sqlite3.connect() silently creates a new, empty database when the file is
# missing, which only shows up later as a confusing "no such table" error.
# Fail early with a clear message instead.
if not os.path.isfile(db_path):
    raise FileNotFoundError("SQLite database not found: " + db_path)

# Here we import the sqlite3 database and create a cursor
db = sqlite3.connect(db_path)
c = db.cursor()

In [None]:
import keras
from keras.preprocessing import image
from keras.applications.imagenet_utils import decode_predictions, preprocess_input
from keras.models import Model

In [None]:
# this seems to help with some GPU memory issues
# Sets the `allow_growth` GPU option on a TF1-compat ConfigProto and opens an
# InteractiveSession using that config.
# NOTE(review): presumably allow_growth makes TensorFlow allocate GPU memory
# on demand instead of reserving it all up front — confirm against the TF docs.

from tensorflow.compat.v1 import ConfigProto
from tensorflow.compat.v1 import InteractiveSession

config = ConfigProto()
config.gpu_options.allow_growth = True
session = InteractiveSession(config=config)

In [None]:
# VGG16 with the 'imagenet' weights, built with its top layers included
model = keras.applications.VGG16(include_top=True, weights='imagenet')
print("model loaded")
model.summary()

# set up the feature extractor: same input as the full model, but the output
# is taken from the layer named "fc2"
fc2_layer = model.get_layer("fc2")
feat_extractor = Model(inputs=model.input, outputs=fc2_layer.output)
print("feature extractor setup")
feat_extractor.summary()

In [None]:
def load_image(path):
    """Load an image file and prepare it as input for the model.

    The image is loaded at the height/width taken from ``model.input_shape``,
    converted to an array, given a leading batch axis and passed through
    ``preprocess_input``.

    Returns:
        tuple ``(img, x)``: the loaded image and the preprocessed batch array.
    """
    target_size = model.input_shape[1:3]
    img = image.load_img(path, target_size=target_size)
    batch = np.expand_dims(image.img_to_array(img), axis=0)
    return img, preprocess_input(batch)

### Select images from SQLite according to category/date range

Run one of the blocks below, then jump down to the next section.

In [None]:
# Get all images from cs.AI from 2012 (whole year)

# Selection parameters: change these to query a different category or year.
category = "cs.AI"
year = 2012

# name is used in the output filenames further down, e.g. "csAI_all2012"
name = category.replace(".", "") + "_all" + str(year)

# NOTE(review): if metadata.created carries a time component, rows from the
# last day of the year may fall outside the BETWEEN upper bound — confirm
# against the stored format.
sql = ("SELECT metadata.cat, images.path, images.filename, images.identifier, metadata.created, images.id, metadata.id "
    "FROM images "
    "LEFT JOIN metadata ON images.identifier = metadata.identifier "
    "WHERE images.x != '' "
    "AND metadata.created BETWEEN date(?) "
    "AND date(?) "
    # compare only the first space-separated token of metadata.cat
    "AND substr(trim(metadata.cat),1,instr(trim(metadata.cat)||' ',' ')-1) = ?")

c.execute(sql, ("%d-01-01" % year, "%d-12-31" % year, category))
rows = c.fetchall()
print(len(rows))

In [None]:
# Get images from stat.ML from 2012 (whole year)

# Selection parameters: change these to query a different category or year.
category = "stat.ML"
year = 2012

# name is used in the output filenames further down, e.g. "statML_all2012"
name = category.replace(".", "") + "_all" + str(year)

# NOTE(review): if metadata.created carries a time component, rows from the
# last day of the year may fall outside the BETWEEN upper bound — confirm
# against the stored format.
sql = ("SELECT metadata.cat, images.path, images.filename, images.identifier, metadata.created, images.id, metadata.id "
    "FROM images "
    "LEFT JOIN metadata ON images.identifier = metadata.identifier "
    "WHERE images.x != '' "
    "AND metadata.created BETWEEN date(?) "
    "AND date(?) "
    # compare only the first space-separated token of metadata.cat
    "AND substr(trim(metadata.cat),1,instr(trim(metadata.cat)||' ',' ')-1) = ? ")

c.execute(sql, ("%d-01-01" % year, "%d-12-31" % year, category))
rows = c.fetchall()
print(len(rows))

### Generate t-SNE

Find all image files, run classifier over images, optionally save features, then run t-SNE over features and save image map.

In [None]:
# all_images_path = "/mnt/hd2/images/all/"
all_images_path = os.path.expanduser("~/all/") # symlink to folder with all images

# one "<images.id>.jpg" path per selected row (images.id is column 5 of the SELECT)
paths = [all_images_path + str(row[5]) + ".jpg" for row in rows]
print(paths[:3])
print(len(paths))

In [None]:
# run the classifier over all images

num_x = len(paths)
features = []

start = time.time()
for i, image_path in enumerate(paths):
    # progress report every 500 images, timing the stretch since the last report
    if i % 500 == 0:
        elap = time.time() - start
        print("analyzing image %d / %d. Time: %4.4f seconds." % (i, num_x, elap))
        start = time.time()

    img, x = load_image(image_path)
    # predict() returns a batch; keep the single result for this image
    features.append(feat_extractor.predict(x)[0])

print('finished extracting features for %d images' % num_x)

# write images, features to a pickle file
pickle_file = "features_" + name + "_vgg_x" + str(num_x) + ".pickle"
print(pickle_file)

# WRITE (the with-block closes the file on exit)
with open(pickle_file, "wb") as write_file:
    pickle.dump([paths, features], write_file)

In [None]:
# set parameters for t-SNE (these will be written to the filename)
perp = 50  # passed to TSNE as `perplexity`
bPCA = True  # NOTE(review): not referenced by any cell visible in this notebook; PCA runs regardless — confirm
num_iterations = 2000  # passed to TSNE as `n_iter`

### Run t-SNE

Note that t-SNE involves random processes, so the output may differ from one run to the next.

In [None]:
# read features and then run t-SNE

# NOTE: pickle.load can execute arbitrary code; only load feature files that
# were written by this notebook.
with open(pickle_file, "rb") as read_file:
    images, features = pickle.load(read_file)

# check that we still have the features and list of images
print("----- checking images and features -----")
print("length of images: " + str(len(images)))
print("length of features: " + str(len(features)))
for img, f in list(zip(images, features))[0:5]:
    print("image: %s, features: %0.2f,%0.2f,%0.2f,%0.2f... "%(img, f[0], f[1], f[2], f[3]))

# PCA below keeps 300 components, which needs at least 300 samples
if len(images) >= 300:
    features = np.array(features)
    print("----- running pca across features -----")
    pca = PCA(n_components=300)
    pca.fit(features)

    pca_features = pca.transform(features)

    X = np.array(pca_features)
    # NOTE(review): newer scikit-learn releases rename TSNE's `n_iter` to
    # `max_iter`; switch the keyword if the installed version rejects it.
    tsne = TSNE(n_components=2, learning_rate=150, perplexity=perp,
                angle=0.2, verbose=2,
                n_iter=num_iterations, random_state=5).fit_transform(X)

    # normalise points to the range [0, 1] on each axis
    tx, ty = tsne[:,0], tsne[:,1]
    tx = (tx-np.min(tx)) / (np.max(tx) - np.min(tx))
    ty = (ty-np.min(ty)) / (np.max(ty) - np.min(ty))

    width = 4000
    height = 3000
    max_dim = 100  # longest side of a pasted tile, in pixels

    full_image = Image.new('RGBA', (width, height))
    for img, x, y in zip(images, tx, ty):
        # open each file in a with-block so its handle is released right away
        # instead of accumulating one open file per image
        with Image.open(img) as src:
            rs = max(1, src.width/max_dim, src.height/max_dim)
            # keep both sides >= 1 px so very elongated images still resize
            tile_size = (max(1, int(src.width/rs)), max(1, int(src.height/rs)))
            # Image.ANTIALIAS was removed in Pillow 10; LANCZOS is the same
            # filter under its current name
            tile = src.resize(tile_size, Image.LANCZOS)
        full_image.paste(tile, (int((width-max_dim)*x), int((height-max_dim)*y)), mask=tile.convert('RGBA'))

    plt.figure(figsize = (16,12))
    imshow(full_image)

    # output name records the selection, image count, t-SNE settings and a timestamp
    ts = time.time()
    st = datetime.datetime.fromtimestamp(ts).strftime('%Y-%m-%d_%H-%M-%S')
    filename = "tSNE_" + name + "_x" + str(len(images)) + "_n" + str(num_iterations) + "_p" + str(perp) + "_" + st
    print(filename)
    full_image.save(filename + ".png")
else:
    # make the skipped case visible; note that `filename` is only set above
    print("only %d images (need at least 300): skipping PCA/t-SNE, no image map written" % len(images))


### Get accreditation for all images used

In [None]:
# go through all of the retrieved SQL rows and format as an accreditation
# if html is True, format with <a href=####> for web usage
# input = [[author, title, date, identifier], [author, title, date, identifier],...[author, title, date, identifier]]

def format_credits(rows, html=False, verbose=True):
    """Format metadata rows as accreditation strings.

    Args:
        rows: iterable of ``(authors, title, created, identifier)`` sequences.
            Empty (falsy) rows are skipped and counted.
        html: if True, render the arXiv URL as an ``<a href="...">`` link.
        verbose: if True (the default), print each parsed field and the
            number of empty rows; pass False to silence the output.

    Returns:
        list of str, one per non-empty row, formatted as
        ``"author: title, year, url"``.
    """
    accreditations = []
    empty_counter = 0

    def log(*args):
        # debug output, silenced with verbose=False
        if verbose:
            print(*args)

    # grab the details and check each
    for i, row in enumerate(rows):
        log(i)
        if not row:
            empty_counter += 1
            log("empty!")
            continue

        # the authors field may be stored with list-style brackets and quotes,
        # e.g. "['A. Author']"; strip the leading "['" and trailing "']"
        s = row[0]
        start = "['"
        end = "']"
        first = s.find(start)
        last = s.rfind(end)
        if first != -1 and last != -1:
            # rfind() already points at the start of "']", so slice up to it
            # (subtracting len(end) again would cut off two real characters)
            author = s[first + len(start):last]
        else:
            author = s
        log(author)

        # replace line breaks and double spaces
        title = row[1].replace("\n", "").replace("  ", " ")
        log(title)

        # keep only the text before the first "-" of the created field
        date = row[2].split("-")[0]
        log(date)

        identifier = row[3]
        log(identifier)
        # default="" keeps max() from raising when there are no digits at all
        longest_digits = max(re.findall(r'\d+', identifier), key=len, default="")
        log(len(longest_digits))

        # if the identifier contains seven consecutive numbers, add a slash
        # before the last seven characters (unless a slash is already there)
        if len(longest_digits) == 7 and "/" not in identifier:
            log("----- regex match -----")
            identifier = identifier[:-7] + "/" + identifier[-7:]
        else:
            # otherwise we can leave the identifier how it is
            log("----- no match -----")
        log(identifier)
        url = "https://arxiv.org/abs/" + identifier
        log(url)
        log("*" * 20)

        # format string and append
        if html:
            fmt_str = '{}: {}, {}, <a href="{}">{}</a>'
            accreditations.append(fmt_str.format(author, title, str(date), url, url))
        else:
            fmt_str = '{}: {}, {}, {}'
            accreditations.append(fmt_str.format(author, title, str(date), url))

    log("number of empty slots:", empty_counter)
    return accreditations

To get the accreditations, find the paper identifiers of all the images we have used. Remove duplicates.

In [None]:
# metadata.id is the seventh column (index 6) of each selected image row
meta_ids = [row[6] for row in rows]

print("meta_ids length:",len(meta_ids))

In [None]:
def unique(ilist):
    """Return the items of ``ilist`` without duplicates, in first-seen order.

    Args:
        ilist: any iterable.

    Returns:
        A new list holding the first occurrence of each distinct item.
    """
    items = list(ilist)
    try:
        # dict keys are unique and keep insertion order, so this is a single
        # pass instead of a list scan per item
        return list(dict.fromkeys(items))
    except TypeError:
        # unhashable items (e.g. lists) cannot be dict keys: fall back to
        # pairwise comparison
        ulist = []
        for x in items:
            if x not in ulist:
                ulist.append(x)
        return ulist

# drop repeated ids, keeping the order in which they first appear
meta_ids = unique(meta_ids)
print(len(meta_ids))

In [None]:
# NOTE: this rebinds `rows` from the image query results to metadata rows, so
# the earlier cells that read `rows` need an image query re-run first.
rows = []

sql = ("SELECT metadata.authors, metadata.title, metadata.created, metadata.identifier "
    "FROM metadata "
    "WHERE metadata.id is ? ")

# iterate over the meta_ids list and grab sql data
for file_id in meta_ids:
    print("image parent article id:",file_id)
    c.execute(sql, (file_id, ))
    row = c.fetchone()
    if row is None:
        # no metadata for this id: store an empty row, which format_credits
        # skips and counts, instead of failing with IndexError
        row = ()
    rows.append(row)
    print(row)

accreditations = format_credits(rows, False)

os.chdir(os.path.expanduser("~/documentation/data-samples/"))
print(os.getcwd())

# `filename` is set by the t-SNE cell; the with-block closes the file even if
# a write fails, and utf-8 keeps non-ASCII names/titles from raising on
# platforms with a narrower default encoding
with open(filename + ".txt", "w", encoding="utf-8") as f:
    for row in accreditations:
        f.write(str(row) + "\n")