# Sampling images from database

This notebook provides Python code to access different combinations of images from the database and use them to create montages (tiled grids of images).

In [1]:
import sqlite3
import random
import itertools
import subprocess
import os
import shlex
import re

In [2]:
# path on rte
db_path = "/home/rte/data/db/arxiv_db_images.sqlite3"

In [3]:
# Here we import the sqlite3 database and create a cursor

db = sqlite3.connect(db_path)
c = db.cursor()

In [None]:
sql = '''
    SELECT path, filename
    FROM images
    WHERE identifier = ?
    '''
c.execute(sql, ("1608.06451",))
rows = c.fetchall()

print("number of rows:",len(rows))

In [None]:
print(rows)

In [None]:
files = []

for r in rows:
    files.append("/".join(r))

print(files[1])
# subprocess.run(["feh", files[0]])

os.chdir("/home/rte/arXiv/src_all/")
!xdg-open {files[1]} &

In [None]:
# test that we can fetch the pragma for each table

c.execute('PRAGMA TABLE_INFO({})'.format("metadata"))
info = c.fetchall()

print("\nColumn Info:\nID, Name, Type, NotNull, DefaultVal, PrimaryKey")
for col in info:
    print(col)

In [None]:
c.execute('PRAGMA TABLE_INFO({})'.format("images"))
info = c.fetchall()

print("\nColumn Info:\nID, Name, Type, NotNull, DefaultVal, PrimaryKey")
for col in info:
    print(col)

In [None]:
# Get total number of images per primary category only

targetDate = "2018-10-01"

c.execute('''
    SELECT substr(trim(metadata.cat),1,instr(trim(metadata.cat)||' ',' ')-1), count(images.identifier)
    FROM images
    LEFT JOIN metadata ON images.identifier = metadata.identifier
    WHERE metadata.created BETWEEN date(?) 
    AND date(?, "start of month","+1 month","-1 day")
    AND images.x != ''
    GROUP BY substr(trim(metadata.cat),1,instr(trim(metadata.cat)||' ',' ')-1)
    ORDER BY count(images.identifier) DESC    
    ''', (targetDate, targetDate,))
categories = c.fetchall()
for row in categories:
    print(row)

In [None]:
print(len(categories))

In [None]:
print(categories[0][0])

In [None]:
# remove entries with less than 144 total images

catlist = []

for cat in categories:
    if cat[1] >= 144:
        catlist.append(cat)
        print(cat)

print('-' * 20)
print("total number of categories with required entries: ")
print(len(catlist))

In [None]:
print(catlist[0][0])

In [None]:
# Get each image entry with a particular category in a given month

sql = ("SELECT metadata.cat, images.path, images.filename, images.identifier, metadata.created, images.id "
    "FROM images "
    "LEFT JOIN metadata ON images.identifier = metadata.identifier "
    "WHERE metadata.created BETWEEN date('2018-10-01') "
    "AND date('2018-10-31') "
    "AND images.x != '' "
    "AND substr(trim(metadata.cat),1,instr(trim(metadata.cat)||' ',' ')-1) = ? ")

c.execute(sql, (catlist[0][0],))
rows = c.fetchall()

In [None]:
# Get each image entry with a particular category in a given month
# Specifically, get all images from cs.CV from 2012

sql = ("SELECT metadata.authors, metadata.title, metadata.created, metadata.identifier  "
    "FROM images "
    "LEFT JOIN metadata ON images.identifier = metadata.identifier "
    "WHERE metadata.created BETWEEN date('2012-01-01') "
    "AND date('2012-12-31') "
    "AND images.x != '' "
    "AND substr(trim(metadata.cat),1,instr(trim(metadata.cat)||' ',' ')-1) = ? ")

c.execute(sql, ('cs.CV',))
rows = c.fetchall()

In [63]:
# Get each image entry with a particular category in a given month
# Specifically, get all images from stat.ML from 2012

sql = ("SELECT metadata.authors, metadata.title, metadata.created, metadata.identifier  "
    "FROM images "
    "LEFT JOIN metadata ON images.identifier = metadata.identifier "
    "WHERE metadata.created BETWEEN date('2012-01-01') "
    "AND date('2012-12-31') "
    "AND images.x != '' "
    "AND substr(trim(metadata.cat),1,instr(trim(metadata.cat)||' ',' ')-1) = ? ")

c.execute(sql, ('stat.ML',))
rows = c.fetchall()

In [64]:
print(len(rows))

2889


In [None]:
for row in rows[:100]:
    print(row)

In [None]:
without_duplicates = []
for row in rows[:]:
    if row not in without_duplicates:
        without_duplicates.append(row)

In [None]:
print(len(without_duplicates))
for row in without_duplicates[:]:
    print(row)

In [None]:
rows = without_duplicates

In [None]:
# TESTING ONLY

# Get each image entry with a particular category in a given month
# Specifically, get all images from cs.CV from 2012

sql = ("SELECT metadata.authors, metadata.title, metadata.created, metadata.identifier  "
    "FROM metadata "
    "WHERE metadata.created BETWEEN date('2012-01-01') "
    "AND date('2012-12-31') "
    "AND substr(trim(metadata.cat),1,instr(trim(metadata.cat)||' ',' ')-1) = ? ")

c.execute(sql, ('cs.CV',))
rows = c.fetchall()

In [None]:
print(len(rows))

In [None]:
for row in rows[:100]:
    print(row)

In [None]:
# convert to tuples
rows = [(x,) for x in rows[:]]

### Get accreditations formatted

In [13]:
# go through all of the retrieved SQL rows and format as an accreditation
# if html is True, format with <a href=####> for web usage
# input = [[author, title, date, identifier], [author, title, date, identifier],...[author, title, date, identifier]]

def format_credits(rows, html=False):
    accreditations = []

    empty_counter = 0
    
    # html = True
#     html = False

    # grab the details and check each
    for i, row in enumerate(rows):
        print(i)
        if row:
            s = row[0]
            start = "['"
            end = "']"
            author = s[s.find(start)+len(start):s.rfind(end)]
            author = author[:-2]
            print(author)

            title = row[1]
            # replace line breaks and double spaces
            title = title.replace("\n", "").replace("  "," ")
            print(title)

            date = row[2].split("-")[0]
            print(date)

            identifier = row[3]
            print(identifier)
        #     reg_exp = re.compile("/[^\d]\d{2}[^\d]/")
            longest_digits = max(re.findall(r'\d+', identifier), key = len)
            print(len(longest_digits))

            # if the identifier contains seven consecutive numbers, add a slash
            if len(longest_digits) == 7:
                print("----- regex match -----")
                reverse = identifier[::-1]
                print(reverse)
                identifier_reverse = reverse[:7] + "/" + reverse[7:]
                identifier = identifier_reverse[::-1]
            else:
                # otherwise we can leave the identifier how it is
                print("----- no match -----")
            print(identifier)
            url = "https://arxiv.org/abs/" + identifier
            print(url)
            print("*" * 20)
        #     print(row[0])

            # format string and append
            if html:
                fmt_str = '{}: {}, {}, <a href="{}">{}</a>'
                accreditations.append(fmt_str.format(author, title, str(date), url, url))  
            else:
                fmt_str = '{}: {}, {}, {}'
                accreditations.append(fmt_str.format(author, title, str(date), url, url))
        else:
            empty_counter += 1
            print("empty!")

    print("number of empty slots:", empty_counter)
    return accreditations

In [None]:
# print out with line breaks
for row in accreditations:
    print(row)

### Get random images

In [None]:
# Get 144 random images

sql = ("SELECT metadata.cat, images.path, images.filename, images.identifier, metadata.created, images.id "
    "FROM images "
    "LEFT JOIN metadata ON images.identifier = metadata.identifier "
    "WHERE images.x != '' "
    "AND images.id IN (SELECT images.id FROM images ORDER BY RANDOM() LIMIT 144) ")

c.execute(sql, )
rows = c.fetchall()

In [None]:
# Get 144 images from stat.ML

sql = ("SELECT metadata.cat, images.path, images.filename, images.identifier, metadata.created, images.id "
    "FROM images "
    "LEFT JOIN metadata ON images.identifier = metadata.identifier "
    "WHERE images.x != '' "
    "AND substr(trim(metadata.cat),1,instr(trim(metadata.cat)||' ',' ')-1) = ? "
    "AND images.id IN (SELECT images.id FROM images ORDER BY RANDOM() LIMIT 144) ")

c.execute(sql, ("stat.ML", ))
rows = c.fetchall()

In [None]:
# Get 144 images from cs.CV from October 2012

sql = ("SELECT metadata.cat, images.path, images.filename, images.identifier, metadata.created, images.id "
    "FROM images "
    "LEFT JOIN metadata ON images.identifier = metadata.identifier "
    "WHERE images.x != '' "
    "AND metadata.created BETWEEN date('2012-10-01') "
    "AND date('2012-10-31') "
    "AND substr(trim(metadata.cat),1,instr(trim(metadata.cat)||' ',' ')-1) = ? "
    "AND images.id IN (SELECT images.id FROM images ORDER BY RANDOM() LIMIT 144) ")

c.execute(sql, ("cs.CV", ))
rows = c.fetchall()

In [19]:
# Get all images from cs.AI from 2012 (whole year)

sql = ("SELECT metadata.cat, images.path, images.filename, images.identifier, metadata.created, images.id "
    "FROM images "
    "LEFT JOIN metadata ON images.identifier = metadata.identifier "
    "WHERE images.x != '' "
    "AND metadata.created BETWEEN date('2012-01-01') "
    "AND date('2012-12-31') "
    "AND substr(trim(metadata.cat),1,instr(trim(metadata.cat)||' ',' ')-1) = ?")

c.execute(sql, ("cs.AI", ))
rows = c.fetchall()

In [None]:
# Get 16 random images

sql = ("SELECT metadata.cat, images.path, images.filename, images.identifier, metadata.created, images.id "
    "FROM images "
    "LEFT JOIN metadata ON images.identifier = metadata.identifier "
    "WHERE images.x != '' "
    "AND substr(trim(metadata.cat),1,instr(trim(metadata.cat)||' ',' ')-1) = ? "
    "ORDER BY RANDOM() LIMIT 144 ")

c.execute(sql, ("stat.ML", ))
rows = c.fetchall()

In [10]:
print(len(rows))

319


In [6]:
for row in rows:
    print(row)

('cs.AI cs.DB', './0606/cs0606024/nutHC', 'presenceSynth2Normms8hc500.eps', 'cs0606024', '2006-06-06', 2003)
('cs.AI cs.DB', './0606/cs0606024/nutHC', 'presenceSynth2CS1000e500hc0ms40.eps', 'cs0606024', '2006-06-06', 2004)
('cs.AI cs.DB', './0606/cs0606024/nutHC/backup', 'presenceSynth2Normms8hc500.eps', 'cs0606024', '2006-06-06', 2005)
('cs.AI cs.DB', './0606/cs0606024/nutHC/backup', 'presenceBioCS5000e900hc310ms8.eps', 'cs0606024', '2006-06-06', 2006)
('cs.AI cs.DB', './0606/cs0606024/nutHC/backup', 'presenceBioCS1000e500ms689HC900.eps', 'cs0606024', '2006-06-06', 2007)
('cs.AI cs.DB', './0606/cs0606024/nutHC/backup', 'presenceSynth2CS1000e500hc0ms40.eps', 'cs0606024', '2006-06-06', 2008)
('cs.AI cs.DB', './0606/cs0606024/nutHC/backup', 'presenceSynth2CS1000e500hc310ms8.eps', 'cs0606024', '2006-06-06', 2009)
('cs.AI cs.DB', './0606/cs0606024/nutHC/backup', 'presenceBioCS1000e500ms689HC200.eps', 'cs0606024', '2006-06-06', 2010)
('cs.AI cs.DB', './0606/cs0606024/nutHC/backup', 'presenc

('cs.AI cs.CL', './1811/1811.05106/images', '24_13_fake.png', '1811.05106', '2018-11-12', 617115)
('cs.AI cs.CL', './1811/1811.05106/images', '3_4_fake.png', '1811.05106', '2018-11-12', 617116)
('cs.AI cs.CL', './1811/1811.05106/images', 'yumi_examp_rere.PNG', '1811.05106', '2018-11-12', 617117)
('cs.AI cs.CL', './1811/1811.05106/images', 'main_model.PNG', '1811.05106', '2018-11-12', 617118)
('cs.AI cs.CL', './1811/1811.05106/images', 'order_hint.png', '1811.05106', '2018-11-12', 617119)
('cs.AI', './1811/1811.05245/img', 'approved.eps', '1811.05245', '2018-11-13', 618195)
('cs.AI', './1811/1811.05245/img', 'rejected.eps', '1811.05245', '2018-11-13', 618196)
('cs.AI', './1811/1811.05245/img', 'explainable.eps', '1811.05245', '2018-11-13', 618197)
('cs.AI cs.CL', './1811/1811.05303', 'linearizations.pdf', '1811.05303', '2018-11-13', 618468)
('cs.AI cs.CL', './1811/1811.05303', 'lins.pdf', '1811.05303', '2018-11-13', 618469)
('cs.AI cs.CL', './1811/1811.05303', 'bfol.pdf', '1811.05303', 

('cs.AI', './1711/1711.06892/figures', 'Plot2tree_bmps.pdf', '1711.06892', '2017-11-18', 1198879)
('cs.AI', './1711/1711.06892/figures', 'Plot1tree_bmps.pdf', '1711.06892', '2017-11-18', 1198880)
('cs.AI', './1711/1711.06892/figures', 'tornado.pdf', '1711.06892', '2017-11-18', 1198881)
('cs.AI', './1711/1711.06892/figures', 'voi-vpi.pdf', '1711.06892', '2017-11-18', 1198882)
('cs.AI', './1711/1711.06892/figures', 'Plotaonelb_bmps.pdf', '1711.06892', '2017-11-18', 1198883)
('cs.AI', './1711/1711.06892/figures', 'unrealistic-nsim.pdf', '1711.06892', '2017-11-18', 1198884)
('cs.AI cs.LG stat.ML', './1711/1711.06922/figures', '20_threads_ddpg_vs_baseline_reward.pdf', '1711.06922', '2017-11-18', 1199088)
('cs.AI cs.LG stat.ML', './1711/1711.06922/figures', '20_threads_ddpg_vs_baseline_reward.svg', '1711.06922', '2017-11-18', 1199089)
('cs.AI cs.LG stat.ML', './1711/1711.06922/figures', '8_threads_ddpg_reward.pdf', '1711.06922', '2017-11-18', 1199090)
('cs.AI cs.LG stat.ML', './1711/1711.069

('cs.AI cs.GT', './1707/1707.00627/pix2', 'mu7b-eps-converted-to.pdf', '1707.00627', '2017-04-26', 1837053)
('cs.AI cs.GT', './1707/1707.00627/pix2', 'mu5-eps-converted-to.pdf', '1707.00627', '2017-04-26', 1837054)
('cs.AI cs.GT', './1707/1707.00627/pix2', 'caprev3.eps', '1707.00627', '2017-04-26', 1837055)
('cs.AI cs.GT', './1707/1707.00627/pix2', 'mu8.eps', '1707.00627', '2017-04-26', 1837056)
('cs.AI cs.GT', './1707/1707.00627/pix2', 'caprev-eps-converted-to.pdf', '1707.00627', '2017-04-26', 1837057)
('cs.AI cs.GT', './1707/1707.00627/pix2', 'mu5.eps', '1707.00627', '2017-04-26', 1837058)
('cs.AI cs.GT', './1707/1707.00627/pix2', 'caprev3-eps-converted-to.pdf', '1707.00627', '2017-04-26', 1837059)
('cs.AI cs.LG cs.NE stat.ML', './1707/1707.03141', 'snail.pdf', '1707.03141', '2017-07-11', 1838064)
('cs.AI cs.CL cs.LG stat.ML', './1707/1707.08616', 'all_frogger_maps.png', '1707.08616', '2017-07-26', 1839795)
('cs.AI cs.CL cs.LG stat.ML', './1707/1707.08616', 'multiGraph_stoch-det.png'

('cs.AI cs.CC stat.ML', './1401/1401.6686', 'perturbedbp_1sRSB.pdf', '1401.6686', '2014-01-26', 2607063)
('cs.AI cs.CC stat.ML', './1401/1401.6686', 'RS.pdf', '1401.6686', '2014-01-26', 2607064)
('cs.AI cs.CC stat.ML', './1401/1401.6686', 'all_3col_8.pdf', '1401.6686', '2014-01-26', 2607065)
('cs.AI cs.CC stat.ML', './1401/1401.6686', 'simple3sat.pdf', '1401.6686', '2014-01-26', 2607066)
('cs.AI cs.CC stat.ML', './1401/1401.6686', 'bpdeclarge_time.pdf', '1401.6686', '2014-01-26', 2607067)
('cs.AI cs.CC stat.ML', './1401/1401.6686', 'bpdec100_iters.pdf', '1401.6686', '2014-01-26', 2607068)
('cs.AI cs.CC stat.ML', './1401/1401.6686', 'all_3sat_8.pdf', '1401.6686', '2014-01-26', 2607069)
('cs.AI cs.CC stat.ML', './1401/1401.6686', '3satfg.pdf', '1401.6686', '2014-01-26', 2607070)
('cs.AI cs.CC stat.ML', './1401/1401.6686', 'all_3col_9.pdf', '1401.6686', '2014-01-26', 2607071)
('cs.AI cs.CC stat.ML', './1401/1401.6686', 'bpdec100_time.pdf', '1401.6686', '2014-01-26', 2607072)
('cs.AI cs.CC

('cs.AI', './1109/1109.4603/plots', 'cov1_sigma_primal.pdf', '1109.4603', '2011-09-21', 3542861)
('cs.AI', './1109/1109.4603/plots', 'timit3_sigma_primal.pdf', '1109.4603', '2011-09-21', 3542862)
('cs.AI', './1109/1109.4603/plots', 'timit3_kernel_cost.pdf', '1109.4603', '2011-09-21', 3542863)
('cs.AI', './1109/1109.4603/plots', 'cov1_kernel_cost.pdf', '1109.4603', '2011-09-21', 3542864)
('cs.AI', './1109/1109.4603/plots', 'adult_classification.pdf', '1109.4603', '2011-09-21', 3542865)
('cs.AI', './1109/1109.4603/plots', 'mnist_classification.pdf', '1109.4603', '2011-09-21', 3542866)
('cs.AI', './1109/1109.3737/figures', 'template-example-face-2.pdf', '1109.3737', '2011-09-16', 3546526)
('cs.AI', './1109/1109.3737/figures', 'full-model.pdf', '1109.3737', '2011-09-16', 3546527)
('cs.AI', './1109/1109.3737/figures', 'faceBestObservation.pdf', '1109.3737', '2011-09-16', 3546528)
('cs.AI', './1109/1109.3737/figures', 'template-example-face-1.pdf', '1109.3737', '2011-09-16', 3546529)
('cs.AI

IOPub data rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_data_rate_limit`.

Current values:
NotebookApp.iopub_data_rate_limit=1000000.0 (bytes/sec)
NotebookApp.rate_limit_window=3.0 (secs)



In [12]:
# check to see if there are multiple categories
for row in rows:
    if len(row[0]) > 5:
        print(row)

('cs.AI cs.LG cs.PF stat.ML', './1211/1211.0906', 'figures__def-redone-2-fold-cv__data_SAT_INDU-HAND-RAND-minisat_csv_NN-pred.jpg', '1211.0906', '2012-11-05', 8083834)
('cs.AI cs.NE', './1210/1210.4021', 'GAvsY211rl.pdf', '1210.4021', '2012-10-15', 9086041)
('cs.AI cs.CG cs.LG', './1211/1211.6727', 'newints_phi.eps', '1211.6727', '2012-11-28', 8070798)
('cs.AI cs.LG cs.PF stat.ML', './1211/1211.0906', 'figures__default_matrix__2d__redone__CPLEX12-cat-CORLAT-RR-10000testC_trainI.jpg', '1211.0906', '2012-11-05', 8083888)
('cs.AI cs.CG cs.LG', './1211/1211.6727', 'newbnd_phi.eps', '1211.6727', '2012-11-28', 8070799)
('cs.AI cs.LG cs.PF stat.ML', './1211/1211.0906', 'figures__default_matrix__2d__redone__CPLEX12-cat-CORLAT-truematrix-testC_testI.jpg', '1211.0906', '2012-11-05', 8083851)
('cs.AI cs.LG cs.PF stat.ML', './1211/1211.0906', 'figures__default_matrix__2d__redone__CPLEX12-cat-CORLAT-truematrix-trainC_testI.jpg', '1211.0906', '2012-11-05', 8083842)
('cs.AI cs.LG cs.PF stat.ML', './1

### Shuffle list and then grab the filepath and id

In [20]:
# shuffle first, then get ids and paths
random.seed(4)
random.shuffle(rows)

In [21]:
ids = []
filepaths = []

for row in rows[:144]:
    print(row[5])
    ids.append(row[5])
    path = row[1] + '/' + row[2]
    print(path)
    filepaths.append(path)

4435162
./1204/1204.4200/dDGP-XCS-Maze4-Perf.eps
4248638
./1212/1212.1143/figs/gridworld-graph-2.pdf
6657455
./1206/1206.3111/results-opt-team.eps
7925457
./1202/1202.2112/robot_on_map.png
4435163
./1204/1204.4200/dDGP-XCS-Woods1-SizeMut.eps
6655502
./1206/1206.5940/img/heuristic_quality_sailing.png
4011393
./1209/1209.3734/query_conference_2.pdf
2928473
./1207/1207.0206/ppdata/pprldmany_40D_separ.eps
2911204
./1207/1207.5926/sixsud04.eps
3988146
./1209/1209.4275/hall.pdf
9066813
./1210/1210.6415/results/res_pdf/graphite_asteroidsparallel_image.pdf
2915664
./1207/1207.1230/ECoGperformanceDS20.pdf
2889011
./1207/1207.0833/duration.png
2911107
./1207/1207.5926/sixsud34.eps
2884940
./1207/1207.1811/holes_block_structure.pdf
6654340
./1206/1206.5928/img/human_exp.pdf
2911164
./1207/1207.5926/sixnonsud23.eps
2674244
./1208/1208.5340/FURNIC.eps
4017811
./1209/1209.5663/Figures/ApresValidation.png
4273921
./1212/1212.5276/zeno3__cost__IBEA__epsilon_+_.eps
1120807
./1201/1201.6615/pendulum_RBF

In [15]:
print(filepaths)

['./1211/1211.2972/output_pdf/plot_segregated.pdf', './1210/1210.2715/Automat_1.eps', './1211/1211.2972/figs/XC25760_spec.png', './1210/1210.6415/results/legend_fnodes_comp.pdf', './1211/1211.0906/figures__def-redone-2-fold-cv__data_SAT_INDU-HAND-RAND-minisat_csv_NN-pred.jpg', './1210/1210.4021/GAvsY211rl.pdf', './1210/1210.1568/Sl_002.eps', './1211/1211.6727/newints_phi.eps', './1211/1211.0906/figures__default_matrix__2d__redone__CPLEX12-cat-CORLAT-RR-10000testC_trainI.jpg', './1211/1211.6727/newbnd_phi.eps', './1211/1211.0906/figures__default_matrix__2d__redone__CPLEX12-cat-CORLAT-truematrix-testC_testI.jpg', './1211/1211.0906/figures__default_matrix__2d__redone__CPLEX12-cat-CORLAT-truematrix-trainC_testI.jpg', './1211/1211.2972/output_pdf/plot_chchstats_Ftrans_locked.pdf', './1211/1211.6097/figures/HLS-SVRI.pdf', './1211/1211.5643/figures/ApplicablePasts.pdf', './1211/1211.2972/output_pdf/plot_multitest_Ftrans_coh.pdf', './1211/1211.0906/figures__def-redone-2-fold-cv__data_MIP_RCW-c

In [16]:
print(ids)

[8110292, 9061966, 8110265, 9066805, 8083834, 9086041, 9064031, 8070798, 8083888, 8070799, 8083851, 8083842, 8110290, 8068294, 8114649, 8110282, 8083899, 9063185, 9086039, 9098169, 8070809, 8083837, 8110274, 8083825, 8083878, 8104304, 8114650, 8083877, 8070801, 8110294, 8083893, 9067823, 8110285, 8091531, 8091544, 8070797, 8110277, 8083886, 8110287, 8083829, 8083846, 8091541, 8104305, 9067822, 9094492, 9086055, 8083850, 9063184, 8083838, 9066820, 8091538, 8083884, 8083879, 8091530, 8091548, 8083826, 9057499, 9098154, 8090326, 9064589, 9086047, 8070813, 8091532, 9066815, 8090325, 8083892, 8115591, 9066819, 9064578, 9066807, 8083824, 9064577, 8083844, 8083858, 8091550, 8091536, 8083869, 8070800, 8110295, 9064584, 9086050, 9066803, 8091545, 9094486, 8091540, 8070807, 8083833, 8083828, 9066822, 8083885, 8091534, 8110293, 9061964, 9067817, 8083835, 8110281, 9066814, 9071472, 8088966, 8083872, 8090321, 8110280, 9098554, 9066824, 9057501, 8110270, 9061968, 9066813, 8091533, 9086053, 9067820, 

### Used for generating figures for paper

In [27]:
name = "cs.AI_montage_144_all2012.jpg"
outputname = ["/home/rte/documentation/data-samples/" + name]

In [28]:
# get a montage of some images

# print the current working directory
os.chdir('/home/rte/re-imaging/sqlite-scripts/')
print(os.getcwd())

os.chdir('/home/rte/arXiv/')
print(os.getcwd())

# arguments = shlex.split("-colorspace sRGB -background white -alpha background -trim +repage -flatten -geometry 240x240+2+2 -tile 12x /home/rte/documentation/data-samples/test_py_montage.jpg")

# for bigger montage of 12x12
arguments = shlex.split("-colorspace sRGB -background white -alpha background -geometry 240x240+2+2 -tile 12x")
# for smaller montage of 4x4
# arguments = shlex.split("-colorspace sRGB -background white -alpha background -geometry 480x480+2+2 -tile 4x")

# outputname = ["/home/rte/documentation/data-samples/random_montage_12x12_stat.ML.jpg"]
# outputname = ["/home/rte/documentation/data-samples/random_montage_nlin.CG.jpg"]
# print(arguments)

filelist = []
# this takes a slice of the larger shuffled list
for filepath in itertools.islice(filepaths, 0, 144):
#     print(filepath)
    # put the filepath into the list but add the directory, remove the dot
    # and also add [0] to only use the first page of multi-page image documents
    filelist.append("src_all" + filepath.replace('./','/') + '[0]')
           
# print(filelist)

# call the montage command and parse list of files and arguments
montage_cmd = ["montage"] + prearg + filelist + arguments + outputname

result = subprocess.Popen(montage_cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
out, err = result.communicate()
print(out)
print(err)
print("subprocess finished")
print("-" * 40)


/home/rte/re-imaging/sqlite-scripts
/mnt/hd-4tb/arXiv
b''
b''
subprocess finished
----------------------------------------


In [None]:
# use this to copy to other files, paste as variable
print(ids)

In [26]:
# testing writing filenames

# os.chdir("/home/rte/re-imaging/sqlite-scripts/")
os.chdir("/home/rte/documentation/data-samples/")
print(os.getcwd())

# fname = "2018-10_" + catlist[0][0] + "_ids.txt"
# fname = "random_montage_4x4_v2_ids.txt"
fname = "cs.AI_montage_144_all2012.txt"
f = open(fname, "w+")
for row in ids:
    f.write(str(row) + "\n")
f.close()

/home/rte/documentation/data-samples


In [30]:
# search ARTICLES IDENTIFIER

rows = []

sql = ("SELECT metadata.authors, metadata.title, metadata.created, metadata.identifier "
    "FROM metadata "
    "WHERE metadata.identifier is ? ")

# iterate over the file_ids list and grab sql data
for i, file_id in enumerate(ids[:]):
    print("article id:",file_id)
    c.execute(sql, (file_id, ))
    row = c.fetchall()
    rows.append(row)
    print(i, row)
#     print(len(row))
#     print(len(row[0]))
#     print(list(rows))


article id: 4435162
0 []
article id: 4248638
1 []
article id: 6657455
2 []
article id: 7925457
3 []
article id: 4435163
4 []
article id: 6655502
5 []
article id: 4011393
6 []
article id: 2928473
7 []
article id: 2911204
8 []
article id: 3988146
9 []
article id: 9066813
10 []
article id: 2915664
11 []
article id: 2889011
12 []
article id: 2911107
13 []
article id: 2884940
14 []
article id: 6654340
15 []
article id: 2911164
16 []
article id: 2674244
17 []
article id: 4017811
18 []
article id: 4273921
19 []
article id: 1120807
20 []
article id: 1132786
21 []
article id: 7946850
22 []
article id: 1097379
23 []
article id: 8083834
24 []
article id: 8091532
25 []
article id: 2889770
26 []
article id: 6654347
27 []
article id: 2911240
28 []
article id: 4244972
29 []
article id: 8083848
30 []
article id: 1118634
31 []
article id: 9066814
32 []
article id: 6634002
33 []
article id: 1124693
34 []
article id: 2911124
35 []
article id: 2661944
36 []
article id: 4278205
37 []
article id: 4244979
38

KeyboardInterrupt: 

In [56]:
### search for a specific set of entries and retrieve metadata
# this is for the image identifier
# author, title, date, identifier

rows = []

sql = ("SELECT metadata.authors, metadata.title, metadata.created, metadata.identifier, images.id "
    "FROM images "
    "LEFT JOIN metadata ON images.identifier = metadata.identifier "
    "WHERE images.id is ? ")

# iterate over the file_ids list and grab sql data
for file_id in ids[:]:
    print("image parent article id:",file_id)
    c.execute(sql, (file_id, ))
    row = c.fetchall()
    rows.append(row[0])
    print(row[0])
#     print(len(row))
#     print(len(row[0]))
#     print(list(rows))


image parent article id: 4435162
("['Preen, Richard J.; Bull, Larry; ']", 'Discrete Dynamical Genetic Programming in XCS', '2012-04-18', '1204.4200', 4435162)
image parent article id: 4248638
("['Bouvrie, Jake; Maggioni, Mauro; ']", 'Multiscale Markov Decision Problems: Compression, Solution, and Transfer\n  Learning', '2012-12-05', '1212.1143', 4248638)
image parent article id: 6657455
("['Calimeri, Francesco; Ianni, Giovambattista; Ricca, Francesco; ']", 'The third open Answer Set Programming competition', '2012-06-14', '1206.3111', 6657455)
image parent article id: 7925457
("['Dey, Debadeepta; Liu, Tian Yu; Hebert, Martial; Bagnell, J. Andrew; ']", 'Predicting Contextual Sequences via Submodular Function Maximization', '2012-02-09', '1202.2112', 7925457)
image parent article id: 4435163
("['Preen, Richard J.; Bull, Larry; ']", 'Discrete Dynamical Genetic Programming in XCS', '2012-04-18', '1204.4200', 4435163)
image parent article id: 6655502
("['Nguyen, Truong-Huy Dinh; Lee, Wee-Su

("['Srivastava, Rupesh Kumar; Steunebrink, Bas R.; Schmidhuber, Jürgen; ']", 'First Experiments with PowerPlay', '2012-10-31', '1210.8385', 9064585)
image parent article id: 2889781
("['Henriques, Rui; Lynce, Inês; Manquinho, Vasco; ']", 'On When and How to use SAT to Mine Frequent Itemsets', '2012-07-26', '1207.6253', 2889781)
image parent article id: 4425256
("['Skarlatidis, Anastasios; Artikis, Alexander; Filippou, Jason; Paliouras, Georgios; ']", 'A Probabilistic Logic Programming Event Calculus', '2012-04-09', '1204.1851', 4425256)
image parent article id: 8070812
("['Belkin, Mikhail; Que, Qichao; Wang, Yusu; Zhou, Xueyuan; ']", 'Graph Laplacians on Singular Manifolds: Toward understanding complex\n  spaces: graph Laplacians on manifolds with singularities and boundaries', '2012-11-28', '1211.6727', 8070812)
image parent article id: 9066809
("['Edelkamp, Stefan; Kissmann, Peter; Torralba, Álvaro; ']", 'Lex-Partitioning: A New Option for BDD Search', '2012-10-23', '1210.6415', 9066

("['Demoen, Bart; de la Banda, Maria Garcia; ']", 'Redundant Sudoku Rules', '2012-07-25', '1207.5926', 2911229)
image parent article id: 9066812
("['Edelkamp, Stefan; Kissmann, Peter; Torralba, Álvaro; ']", 'Lex-Partitioning: A New Option for BDD Search', '2012-10-23', '1210.6415', 9066812)
image parent article id: 8070797
("['Belkin, Mikhail; Que, Qichao; Wang, Yusu; Zhou, Xueyuan; ']", 'Graph Laplacians on Singular Manifolds: Toward understanding complex\n  spaces: graph Laplacians on manifolds with singularities and boundaries', '2012-11-28', '1211.6727', 8070797)
image parent article id: 4397422
("['Bailleux, Olivier; ']", 'Unit contradiction versus unit propagation', '2012-04-03', '1204.0731', 4397422)
image parent article id: 2895177
("['Bui, Hung Hai; Huynh, Tuyen N.; Riedel, Sebastian; ']", 'Automorphism Groups of Graphical Models and Lifted Variational Inference', '2012-07-19', '1207.4814', 2895177)
image parent article id: 8110282
("['Stowell, Dan; Plumbley, Mark D.; ']", 'Se

("['Demoen, Bart; de la Banda, Maria Garcia; ']", 'Redundant Sudoku Rules', '2012-07-25', '1207.5926', 2911196)
image parent article id: 8083874
("['Hutter, Frank; Xu, Lin; Hoos, Holger H.; Leyton-Brown, Kevin; ']", 'Algorithm Runtime Prediction: Methods & Evaluation', '2012-11-05', '1211.0906', 8083874)
image parent article id: 4244982
("['Martins, Andre F. T.; Figueiredo, Mario A. T.; Aguiar, Pedro M. Q.; Smith, Noah A.; Xing, Eric P.; ']", 'Alternating Directions Dual Decomposition', '2012-12-28', '1212.6550', 4244982)
image parent article id: 4273931
("['Khouadjia, Mostepha Redouane; Schoenauer, Marc; Vidal, Vincent; Dréo, Johann; Savéant, Pierre; ']", 'Multi-Objective AI Planning: Evaluating DAE-YAHSP on a Tunable Benchmark', '2012-12-20', '1212.5276', 4273931)
image parent article id: 3999153
("['Borboudakis, Giorgos; Tsamardinos, Ioannis; ']", 'Scoring and Searching over Bayesian Networks with Causal and Associative\n  Priors', '2012-09-28', '1209.6561', 3999153)
image parent ar

In [58]:
print(len(rows))
print(rows[:1][0])

144
("['Preen, Richard J.; Bull, Larry; ']", 'Discrete Dynamical Genetic Programming in XCS', '2012-04-18', '1204.4200', 4435162)


In [49]:
data = [row[0] for row in rows]

In [59]:
accreditations = format_credits(rows, False)

0
Preen, Richard J.; Bull, Larry
Discrete Dynamical Genetic Programming in XCS
2012
1204.4200
4
----- no match -----
1204.4200
https://arxiv.org/abs/1204.4200
********************
1
Bouvrie, Jake; Maggioni, Mauro
Multiscale Markov Decision Problems: Compression, Solution, and Transfer Learning
2012
1212.1143
4
----- no match -----
1212.1143
https://arxiv.org/abs/1212.1143
********************
2
Calimeri, Francesco; Ianni, Giovambattista; Ricca, Francesco
The third open Answer Set Programming competition
2012
1206.3111
4
----- no match -----
1206.3111
https://arxiv.org/abs/1206.3111
********************
3
Dey, Debadeepta; Liu, Tian Yu; Hebert, Martial; Bagnell, J. Andrew
Predicting Contextual Sequences via Submodular Function Maximization
2012
1202.2112
4
----- no match -----
1202.2112
https://arxiv.org/abs/1202.2112
********************
4
Preen, Richard J.; Bull, Larry
Discrete Dynamical Genetic Programming in XCS
2012
1204.4200
4
----- no match -----
1204.4200
https://arxiv.org/abs/12

In [60]:
print(accreditations)

['Preen, Richard J.; Bull, Larry: Discrete Dynamical Genetic Programming in XCS, 2012, https://arxiv.org/abs/1204.4200', 'Bouvrie, Jake; Maggioni, Mauro: Multiscale Markov Decision Problems: Compression, Solution, and Transfer Learning, 2012, https://arxiv.org/abs/1212.1143', 'Calimeri, Francesco; Ianni, Giovambattista; Ricca, Francesco: The third open Answer Set Programming competition, 2012, https://arxiv.org/abs/1206.3111', 'Dey, Debadeepta; Liu, Tian Yu; Hebert, Martial; Bagnell, J. Andrew: Predicting Contextual Sequences via Submodular Function Maximization, 2012, https://arxiv.org/abs/1202.2112', 'Preen, Richard J.; Bull, Larry: Discrete Dynamical Genetic Programming in XCS, 2012, https://arxiv.org/abs/1204.4200', 'Nguyen, Truong-Huy Dinh; Lee, Wee-Sun; Leong, Tze-Yun: Bootstrapping Monte Carlo Tree Search with an Imperfect Heuristic, 2012, https://arxiv.org/abs/1206.5940', 'Rodler, Patrick; Shchekotykhin, Kostyantyn; Fleiss, Philipp; Friedrich, Gerhard: RIO: Minimizing User Inte

In [62]:
# testing writing filenames

# os.chdir("/home/rte/re-imaging/sqlite-scripts/")
os.chdir("/home/rte/documentation/data-samples/")
print(os.getcwd())

# fname = "2018-10_" + catlist[0][0] + "_ids.txt"
# fname = "random_montage_4x4_v2_ids.txt"
fname = "cs.AI_montage_144_all2012_credits.txt"
f = open(fname, "w+")
for row in accreditations:
    f.write(str(row) + "\n")
f.close()

/home/rte/documentation/data-samples


In [None]:
for cat in catlist:
    print(cat[0])

In [None]:
print(len(filelist))

for row in filelist:
    print(row)

In [None]:
# testing writing filenames

fname = "2018-10_" + catlist[0][0] + ".txt"
f = open(fname, "w+")
for row in filelist:
    f.write(row + "\n")
f.close()

In [None]:
print(targetDate[:7])

In [None]:
targetYM = targetDate[:7]
savepath = "/home/rte/Documents/documentation/data-samples/montages/category/" + targetYM + "/"

if os.path.isdir(savepath):
    print("directory exists, saving to: " + savepath)
else:
    try:
        os.makedirs(savepath)
    except OSError:
        print("failed to create directory: " + savepath)
    else:
        print("successfully created the directory: " + savepath)

In [None]:
print(len(catlist))

for row in catlist:
    print(row[0])

The following block of code takes the list of categories, queries the SQL database for a list of images that match each category and a given year, then shuffles this list, selects a subset, and uses that to run the montage command ^_^ 

In [None]:
# added date formatting
sql = ('''
    SELECT metadata.cat, images.path, images.filename, images.identifier, metadata.created
    FROM images
    LEFT JOIN metadata ON images.identifier = metadata.identifier
    WHERE metadata.created BETWEEN date(?) AND date(?, 'start of month','+1 month','-1 day')
    AND images.x != ''
    AND substr(trim(metadata.cat),1,instr(trim(metadata.cat)||' ',' ')-1) = ?
    ''')

targetYM = targetDate[:7]
savepath = "/home/rte/Documents/documentation/data-samples/montages/category/" + targetYM + "/"

if os.path.isdir(savepath):
    print("saving to: " + savepath)
else:
    try:
        os.makedirs(path)
    except OSError:
        print("Failed to create directory: " + savepath)
    else:
        print("Successfully created the directory: " + savepath)
        

for cat in catlist:
    print("querying for category: " + str(cat[0]))
    c.execute(sql, (targetDate, targetDate, cat[0], ))
    rows = c.fetchall()
    
    print("total number of images found: " + str(len(rows)))
    
    filepaths = []

    for row in rows:
        path = row[1] + '/' + row[2]
    #     print(path)
        filepaths.append(path)

    # shuffle the whole list
    random.shuffle(filepaths)
    
    os.chdir('/home/rte/re-imaging/sqlite-scripts/')

    # os.chdir('src_all')
#     print(os.getcwd())

    # format the arguments for montage
    arguments = shlex.split("-colorspace sRGB -units PixelsPerInch -density 300 -background white -alpha off -geometry 240x240+2+2 -tile 12x")

    filelist = []
    # this takes a slice of the larger shuffled list
    for filepath in itertools.islice(filepaths, 0, 144):
        # put the filepath into the list but add the directory, remove the dot
        # and also add [0] to only use the first page of multi-page image documents
        filelist.append("src_all" + filepath.replace('./','/') + '[0]')

    # write list of images to file (for debugging purposes, mostly)
    fname = savepath + targetYM + "_" + cat[0] + "_" + str(cat[1]) + ".txt"
    f = open(fname, "w+")
    for row in filelist:
        f.write(row + "\n")
    f.close()
    
#     outputname = "test_py_montage.jpg"
    outputname = [savepath + "montage_" + targetYM + "_" + cat[0] + "_" + str(cat[1]) + ".jpg"]

    print("calling montage")
    # call the montage command and parse list of files and arguments
    montage_cmd = ["montage"] + filelist + arguments + outputname
#     print(montage_cmd)
    result = subprocess.Popen(montage_cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
    out, err = result.communicate()
    print(out)
    print(err)
    print("subprocess finished")
    print("-" * 40)

### Getting image montages for paper

In [None]:
# Get 144 images from nlin.CG from October 2012

targetDate = "2012-01-01"
endDate = "2012-12-31"

# sql = ("SELECT metadata.cat, images.path, images.filename, images.identifier, metadata.created, images.id "
#     "FROM images "
#     "LEFT JOIN metadata ON images.identifier = metadata.identifier "
#     "WHERE images.x != '' "
#     "AND metadata.created BETWEEN date('2012-10-01') "
#     "AND date('2012-10-31') "
#     "AND substr(trim(metadata.cat),1,instr(trim(metadata.cat)||' ',' ')-1) = ? "
#     "AND images.id IN (SELECT images.id FROM images ORDER BY RANDOM() LIMIT 144) ")

# c.execute(sql, ("nlin.CG", ))
# rows = c.fetchall()

sql = '''
    SELECT '/mnt/hd2/images/all/' || images.id || '.jpg'
    FROM images
    LEFT JOIN metadata ON images.identifier = metadata.identifier
    WHERE metadata.created BETWEEN date(?) 
    AND date(?)
    AND images.x != ''
    AND substr(trim(metadata.cat),1,instr(trim(metadata.cat)||' ',' ')-1) = ?
    '''
c.execute(sql, (targetDate, endDate, "nlin.CG"))
rows = c.fetchall()
print(len(rows))
for row in rows:
    print(row)

In [None]:
filepaths = []
for row in rows:
    filepaths.append(row[0])
# print(filepaths)

random.seed(4)
random.shuffle(filepaths)

# savepath = "/home/rte/documentation/dataset-methods-paper/montages/" + "nlin.CG_" + targetDate + ".jpg"
savepath = "/home/rte/documentation/dataset-methods-paper/montages/" + "nlin.CG_all2012"+ ".jpg"

print(savepath)
arguments = shlex.split("-colorspace sRGB -units PixelsPerInch -density 300 -background white -alpha off -geometry 240x240+2+2 -tile 12x")

filelist = []
# this takes a slice of the larger shuffled list
for filepath in itertools.islice(filepaths, 0, 144):
    # put the filepath into the list but add the directory, remove the dot
    # and also add [0] to only use the first page of multi-page image documents
    filelist.append(filepath.replace('./','/') + '[0]')
print(filelist)        
        
# # write list of images to file (for debugging purposes, mostly)
# fname = "nlin.CG_2012" + ".txt"
# with open(fname, "w+") as f:
#     for row in filelist:
#         f.write(row + "\n")
        
montage_cmd = ["montage"] + filelist + arguments + [savepath]

result = subprocess.Popen(montage_cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
out, err = result.communicate()
print(out)
print(err)
print("subprocess finished")
print("-" * 40)


### AUTOMATED

In [None]:
category = "nlin.CG"
# category = "astro-ph"

In [None]:
categories = ["nlin.CG", "physics.med-ph", "cs.CV", "stat.ML", "physics.pop-ph"]

In [None]:
categories = ["astro-ph.IM", "astro-ph.HE"]

In [None]:
# all categories (20200206-1240)
categories = ["nlin.CG", "physics.med-ph", "cs.CV", "cs.DB", "stat.ML", "hep-ph", "physics.pop-ph", "astro-ph.IM", "astro-ph.HE", "astro-ph", "math.AG", "q-bio.MN", "q-bio.GN"]

In [None]:
years = [2009, 2012, 2015, 2018]

In [None]:
# every year
years = [x for x in range(1990, 2019, 1)]

In [None]:
print(years)

In [None]:
categories = ["astro-ph"]

In [None]:
years = [1991, 1994, 1996, 2000]

In [None]:
targetDate = "2012-01-01"
endDate = "2012-12-31"

In [None]:
for category in categories:
    for year in years:
        targetDate = str(year) + "-01-01"
        endDate = str(year) + "-12-31"

        sql = '''
            SELECT '/mnt/hd2/images/all/' || images.id || '.jpg'
            FROM images
            LEFT JOIN metadata ON images.identifier = metadata.identifier
            WHERE metadata.created BETWEEN date(?) 
            AND date(?)
            AND images.x != ''
            AND substr(trim(metadata.cat),1,instr(trim(metadata.cat)||' ',' ')-1) = ?
            '''
        c.execute(sql, (targetDate, endDate, category))
        rows = c.fetchall()
        print("length:",len(rows))
        for row in rows:
            print(row)
        print("*****")

        filepaths = []
        for row in rows:
            filepaths.append(row[0])

        random.seed(4) # keep the same seed for reproducibility
        random.shuffle(filepaths)

        savepathroot = "/home/rte/documentation/dataset-methods-paper/montages/"
        savepath = savepathroot + category + "_" + str(year) + "_montage"
        print(savepath)

        arguments = shlex.split("-colorspace sRGB -units PixelsPerInch -density 300 -background white -alpha off -geometry 240x240+2+2 -tile 12x")
        print(arguments)

        filelist = []
        # this takes a slice of the larger shuffled list
        for filepath in itertools.islice(filepaths, 0, 144):
            # put the filepath into the list but add the directory, remove the dot
            # and also add [0] to only use the first page of multi-page image documents
            filelist.append(filepath.replace('./','/') + '[0]')

        print("*****")
        print("selected file paths")
        for file in filelist:
            print(file)

        # write list of images to file to be able to retrieve the accreditations later
        # fname = savepathroot + category + str(year) + ".txt"
        with open(savepath + ".txt", "w+") as f:
            for row in filelist:
                f.write(row + "\n")

        montage_cmd = ["montage"] + filelist + arguments + [savepath + ".jpg"]

        result = subprocess.Popen(montage_cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
        out, err = result.communicate()
        print(out)
        print(err)
        print("subprocess finished")
        print("-" * 40)