In [None]:
import subprocess
import os
import sqlite3
import random

import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
from scipy.interpolate import BSpline
import scipy.interpolate as interpolate

import seaborn as sns
sns.set(style="ticks")

import joypy
from matplotlib import cm

import shlex

In [None]:
# Switch for the viewer cell further down: when True, the collected image
# files are opened with the external `feh` program.
open_images = False

In [None]:
# SQLite database file that the query cells read from.
db_path = "/home/rte/data/db/arxiv_db_images.sqlite3"

# One connection and one cursor; the cursor `c` is used by the query cells.
db = sqlite3.connect(db_path)
c = db.cursor()

In [None]:
# Phrase to look for in the figure captions. Alternatives tried before:
#   "convolutional neural network", "convolution", "cnn"
search_term = "monte carlo"

# Wrap the phrase in SQL LIKE wildcards so it matches anywhere in a caption.
sql_search_term = "%{}%".format(search_term)
print(sql_search_term)

In [None]:
# extended search for multiple terms
# NOTE: the single-term query cell that follows assigns `rows` again, so on a
# top-to-bottom run this result is replaced.
# The second pattern is already covered by the first; it is kept as listed.
multi_terms = ("%convolutional%", "%convolutional neural network%", "%cnn%")

# One "caption LIKE ?" clause per term. The patterns are bound as parameters
# instead of being written as double-quoted literals, which SQLite only treats
# as strings through a legacy compatibility quirk.
where_clause = " OR ".join(["caption LIKE ?"] * len(multi_terms))

sql = '''
    SELECT captions.image_ids, captions.caption, captions.fignum, metadata.cat, metadata.created
    FROM captions
    LEFT JOIN metadata ON captions.identifier = metadata.identifier
    WHERE ''' + where_clause

c.execute(sql, multi_terms)
rows = c.fetchall()

print("number of rows:", len(rows))

In [None]:
# Single-term search: the LIKE pattern is supplied through the `?` placeholder.
sql = '''
    SELECT captions.image_ids, captions.caption, captions.fignum, metadata.cat, metadata.created
    FROM captions
    LEFT JOIN metadata ON captions.identifier = metadata.identifier
    WHERE caption LIKE ?
    '''

# sqlite3's Cursor.execute returns the cursor itself, so the fetch is chained.
rows = c.execute(sql, (sql_search_term,)).fetchall()

print("number of rows:", len(rows))

In [None]:
# Print the first three result tuples
# (columns: image_ids, caption, fignum, cat, created).
for row in rows[:3]:
    print(row)

### Use full primary category

In [None]:
# full primary category
# Build one [category, year] pair per matching caption.
data_long = []
for row in rows:
    cat_field, created_field = row[3], row[4]
    # The query uses a LEFT JOIN, so a caption without a metadata record comes
    # back with NULL (None) in these columns; skip it instead of failing on
    # None.split().
    if cat_field is None or created_field is None:
        continue
    # Primary category = first entry of the space-separated `cat` value.
    cat = cat_field.split(" ")[0]
    # Year = text before the first "-" (presumably `created` is a
    # YYYY-MM-DD style string -- TODO confirm against the database).
    year = created_field.split("-")[0]
    data_long.append([cat, year])

In [None]:
# Two-column frame built from the [category, year] pairs; the category becomes
# a pandas categorical and the year string is converted to a 32-bit integer.
df = pd.DataFrame(data_long, columns=["cat", "year"])
df = df.astype({'cat': 'category', 'year': 'int32'})

In [None]:
# Display the category/year frame.
df

In [None]:
# Distinct values of the "cat" column.
df.cat.unique()

In [None]:
# Column dtypes (the frame was cast to category / int32 when it was built).
df.dtypes

In [None]:
# Raise the row display limit so the per-category table is not truncated.
# The full option name is spelled out: pandas matches option names by pattern,
# and an abbreviated name can stop resolving once similar options are added.
pd.set_option('display.max_rows', 1000)
# pd.set_option('display.max_columns', 50)

# Number of rows (captions) per category.
df.groupby('cat').count()

In [None]:
# Ridge plot of `year`, one row per category, drawn as curves (hist=False).
# joyplot creates its own figure (sized through `figsize`), so no separate
# plt.figure() call is made: that would only leave an empty extra figure open.
fig, axes = joypy.joyplot(df, by="cat", column="year", grid="y", linewidth=1,
                          legend=False, fade=True,
                          hist=False, bins=23, overlap=1,
                          title="Caption occurrences of " + search_term,
                          figsize=(20,20),
                          )
# Other options that were tried:
# kind="counts", bins=30,
# range_style='own'
# colormap=cm.Blues_r

In [None]:
# Name the output after the active search term so the file matches the plot
# title; a fixed "convnet_multiterms_ridge.svg" name mislabels the figure when
# the data comes from the `search_term` query.
filename = search_term.replace("%", "").replace(" ", "-") + "_ridge.svg"
fig.savefig(filename, dpi=300)

### Use only the condensed categories

In [None]:
# condensed categories
# Build one [top-level category, year] pair per matching caption.
data_short = []
for row in rows:
    cat_field, created_field = row[3], row[4]
    # The query uses a LEFT JOIN, so a caption without a metadata record comes
    # back with NULL (None) in these columns; skip it instead of failing on
    # None.split().
    if cat_field is None or created_field is None:
        continue
    # First space-separated entry, cut at the first "." (keeps only the part
    # before the dot).
    cat = cat_field.split(" ")[0].split(".")[0]
    # Year = text before the first "-" (presumably `created` is a
    # YYYY-MM-DD style string -- TODO confirm against the database).
    year = created_field.split("-")[0]
    data_short.append([cat, year])

In [None]:
# Rebuild `df` from the condensed pairs (this replaces the full-category frame);
# the category becomes a categorical and the year a 32-bit integer.
df = pd.DataFrame(data_short, columns=["cat", "year"])
df = df.astype({'cat': 'category', 'year': 'int32'})

In [None]:
# Number of rows per condensed category.
# groupby(...).size() is used instead of DataFrame.count(level=...): the `level`
# argument was deprecated in pandas 1.3 and removed in 2.0, and with both
# columns moved into the index there is no column left to count.
df.groupby("cat", observed=True).size()

In [None]:
# Per-category count of non-null values in the remaining column(s).
df.groupby('cat').count()

In [None]:
# Display the condensed category/year frame.
df

In [None]:
# Distinct values of the "cat" column.
df.cat.unique()

In [None]:
# Distinct years present in the results.
df.year.unique()

In [None]:
# Histogram version (hist=True) of the per-category plot of `year`, without
# overlap between rows.
# joyplot creates its own figure (sized through `figsize`), so no separate
# plt.figure() call is made: that would only leave an empty extra figure open.
fig, axes = joypy.joyplot(df, by="cat", column="year", grid=True, linewidth=1,
                          hist=True, legend=False, fade=True,
                          title="Caption occurrences of " + search_term,
                          figsize=(20,20),
                          bins=20,
                          ylim='max', overlap=0
                          )
# Other options that were tried:
# kind="counts", bins=30,
# range_style='own'
# kind='kde'
# kind='normalized_counts'

In [None]:
# Save into the documentation folder.
# NOTE: os.chdir changes the working directory for every cell run afterwards.
os.chdir("/home/rte/documentation/captions/")

# Derive the file name from the search term so it cannot go stale when the term
# is changed ("monte carlo" -> "monte-carlo_maincat_hist.svg").
filename = search_term.replace("%", "").replace(" ", "-") + "_maincat_hist.svg"
fig.savefig(filename, dpi=300)

In [None]:
# random.shuffle(rows)

### Use the image_ids to open/save files

In [None]:
# Work on a copy: random.shuffle reorders its argument in place, and a plain
# `sql_results = rows` would make that reorder `rows` as well.
sql_results = list(rows)

In [None]:
# Seed the global RNG so the shuffle is repeatable for a given starting order.
random.seed(5)
# Reorders the list in place (returns None).
random.shuffle(sql_results)

In [None]:
# First image id of each of the first 200 shuffled results (rows whose
# image_ids field is NULL are skipped).
# r"\|" is the two-character separator backslash + pipe, written as a raw
# string so it does not depend on the invalid "\|" escape sequence that newer
# Python versions warn about. The split behaviour is unchanged.
# NOTE(review): str.split is not regex based -- if the stored separator is a
# bare "|", this never splits; verify against the database.
image_ids = [
    row[0].split(r"\|")[0]
    for row in sql_results[:200]
    if row[0] is not None
]
print(len(image_ids))
print(image_ids)

In [None]:
# File names for the montage: at most the first 144 ids, each with ".jpg"
# appended (presumably 12 x 12 tiles, given the montage's "-tile 12x" -- confirm).
filelist = []
for image_id in image_ids[:144]:
    filelist.append(image_id + ".jpg")
print(filelist)

In [None]:
# Build a contact sheet of the selected images with ImageMagick's `montage`.
savepath = "/home/rte/documentation/captions/"
# Run from the image folder so the bare file names in `filelist` resolve.
os.chdir("/mnt/hd2/images/all")
outputname = [savepath + "montage_" + search_term.replace(" ","-") + ".jpg"]
arguments = shlex.split("-colorspace sRGB -units PixelsPerInch -density 300 -background white -alpha off -geometry 240x240+2+2 -tile 12x")
montage_cmd = ["montage"] + filelist + arguments + outputname

# subprocess.run waits for the command to finish; output is captured as text
# (not bytes) so it prints readably.
result = subprocess.run(montage_cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE,
                        universal_newlines=True)
out, err = result.stdout, result.stderr
print(out)
print(err)
# Report failure explicitly instead of always claiming success.
if result.returncode != 0:
    print("montage failed with exit code", result.returncode)
else:
    print("subprocess finished")
print("-" * 40)

In [None]:
# Collect every image id from all result rows.
files = []
# The loop variable is deliberately NOT called `image_ids`: that name holds the
# list built for the montage, and rebinding it here would break a re-run of the
# cells that use it.
for ids_field, caption, fignum, cat, created in rows:
    print(ids_field, cat, created)
    if ids_field is not None:
        # split() returns the whole string as a single element when the
        # separator is absent, so no separate "contains" check is needed.
        # r"\|" is backslash + pipe, written raw to avoid the invalid "\|" escape.
        files.extend(ids_field.split(r"\|"))

# files = [str(x[0]) + ".jpg" for x in rows[:]]
print("total number of results:", len(files))

In [None]:
if open_images:
    # Start from the image folder so the relative file names resolve.
    os.chdir("/mnt/hd2/images/all")

    # Viewer command: "feh" followed by every collected id with ".jpg" appended.
    cmd = ["feh"] + [name + ".jpg" for name in files]

    subprocess.run(cmd)

Make image

In [None]:
# Paste every image onto one large canvas at its (x, y) layout position.
# NOTE(review): Image, images, tx, ty, width, height and max_dim are not defined
# or imported anywhere in the visible notebook; presumably Image is PIL.Image
# and the rest come from an earlier layout step -- confirm before running.
full_image = Image.new('RGBA', (width, height))
for img, x, y in zip(images, tx, ty):
    # The context manager closes the image file once the tile has been handled.
    with Image.open(img) as source:
        tw = source.width
        th = source.height
        # Images with a side of 10000 px or more are skipped.
        if tw < 10000 and th < 10000:
            # Shrink factor so neither side exceeds max_dim; never below 1,
            # i.e. small images are not enlarged.
            rs = max(1, tw/max_dim, th/max_dim)
            # Image.LANCZOS is the filter formerly exposed as Image.ANTIALIAS,
            # a name that Pillow 10 removed.
            tile = source.resize((int(tw/rs), int(th/rs)), Image.LANCZOS)
            full_image.paste(tile, (int((width-max_dim)*x), int((height-max_dim)*y)), mask=tile.convert('RGBA'))

plt.figure(figsize = (16,12))
# Bare `imshow` is not defined in this notebook; use the pyplot function.
plt.imshow(full_image)

Grid

In [None]:
# Grid size in tiles: nx across, ny down (used for the grid assignment and the
# final image dimensions).
nx, ny = 40, 25

In [None]:
# assign to grid
# NOTE(review): `rasterfairy` and `tsne` are not imported or defined anywhere in
# the visible notebook -- presumably `tsne` is a 2-D point array from an earlier
# embedding step; confirm before running. The result's first element is later
# iterated as per-image (column, row) grid positions.
grid_assignment = rasterfairy.transformPointCloud2D(tsne, target=(nx, ny))

In [None]:
# Render the grid assignment as one image of fixed-size tiles.
# NOTE(review): Image and images are not defined or imported anywhere in the
# visible notebook; presumably Image is PIL.Image -- confirm before running.
tile_width = 72
tile_height = 56

full_width = tile_width * nx
full_height = tile_height * ny
aspect_ratio = float(tile_width) / tile_height

grid_image = Image.new('RGB', (full_width, full_height))

for img, grid_pos in zip(images, grid_assignment[0]):
    idx_x, idx_y = grid_pos
    # Top-left pixel of this tile's grid cell.
    x, y = tile_width * idx_x, tile_height * idx_y
    # The context manager closes the image file once the tile has been produced.
    with Image.open(img) as source:
        # center-crop the tile to match aspect_ratio
        tile_ar = float(source.width) / source.height
        if tile_ar > aspect_ratio:
            # Too wide: trim equal margins left and right.
            margin = 0.5 * (source.width - aspect_ratio * source.height)
            tile = source.crop((margin, 0, margin + aspect_ratio * source.height, source.height))
        else:
            # Too tall (or exact): trim equal margins top and bottom.
            margin = 0.5 * (source.height - float(source.width) / aspect_ratio)
            tile = source.crop((0, margin, source.width, margin + float(source.width) / aspect_ratio))
        # Image.LANCZOS is the filter formerly exposed as Image.ANTIALIAS,
        # a name that Pillow 10 removed.
        tile = tile.resize((tile_width, tile_height), Image.LANCZOS)
    grid_image.paste(tile, (int(x), int(y)))

# Only `matplotlib.pyplot as plt` is imported, so the name `matplotlib` is not
# bound and bare `imshow` is not defined; both calls go through `plt`.
plt.figure(figsize = (16,12))
plt.imshow(grid_image)