# Sampling images from database

This notebook provides Python code to access different combinations of images from the database and use them to create montages (tiled grids of images).

In [None]:
import sqlite3
import random
import itertools
import subprocess
import os
import shlex
import re

In [None]:
# path on rte
db_path = "/home/rte/data/db/arxiv_db_images.sqlite3"

In [None]:
# Here we import the sqlite3 database and create a cursor

db = sqlite3.connect(db_path)
c = db.cursor()

In [None]:
# test that we can fetch the pragma for each table
# here: read the column definitions of the "metadata" table and print one tuple per column

c.execute('PRAGMA TABLE_INFO({})'.format("metadata"))
info = c.fetchall()

# header naming the fields of the tuples printed below
print("\nColumn Info:\nID, Name, Type, NotNull, DefaultVal, PrimaryKey")
for col in info:
    print(col)

In [None]:
# read the column definitions of the "images" table and print one tuple per column
c.execute('PRAGMA TABLE_INFO({})'.format("images"))
info = c.fetchall()

# header naming the fields of the tuples printed below
print("\nColumn Info:\nID, Name, Type, NotNull, DefaultVal, PrimaryKey")
for col in info:
    print(col)

In [None]:
# Get total number of images per primary category only
#
# The primary category is the first space-separated token of metadata.cat.
# The date window runs from targetDate to the last day of targetDate's month, so
# targetDate is expected to be the first day of the month of interest.
# The result is a list of (primary category, image count) tuples, largest count first.

targetDate = "2018-10-01"

# SQL string literals are written with single quotes: double-quoted literals are only
# accepted by SQLite as a legacy fallback that can be switched off
c.execute('''
    SELECT substr(trim(metadata.cat),1,instr(trim(metadata.cat)||' ',' ')-1), count(images.identifier)
    FROM images
    LEFT JOIN metadata ON images.identifier = metadata.identifier
    WHERE metadata.created BETWEEN date(?)
    AND date(?, 'start of month','+1 month','-1 day')
    AND images.x != ''
    GROUP BY substr(trim(metadata.cat),1,instr(trim(metadata.cat)||' ',' ')-1)
    ORDER BY count(images.identifier) DESC
    ''', (targetDate, targetDate,))
categories = c.fetchall()
for row in categories:
    print(row)

In [None]:
print(len(categories))

In [None]:
print(categories[0][0])

In [None]:
# keep only the (category, count) entries that have at least 144 images

catlist = [cat for cat in categories if cat[1] >= 144]

# show every entry that passed the threshold
for cat in catlist:
    print(cat)

print("-" * 20)
print("total number of categories with required entries: ")
print(len(catlist))

In [None]:
print(catlist[0][0])

In [None]:
# Get each image entry whose primary category is the first entry of catlist
# The date window is hard-coded to October 2018 inside the query (it does not read targetDate)
# Each row is (cat, path, filename, identifier, created, id)

sql = ("SELECT metadata.cat, images.path, images.filename, images.identifier, metadata.created, images.id "
    "FROM images "
    "LEFT JOIN metadata ON images.identifier = metadata.identifier "
    "WHERE metadata.created BETWEEN date('2018-10-01') "
    "AND date('2018-10-31') "
    "AND images.x != '' "
    "AND substr(trim(metadata.cat),1,instr(trim(metadata.cat)||' ',' ')-1) = ? ")

c.execute(sql, (catlist[0][0],))
rows = c.fetchall()

In [None]:
# Get the paper details (authors, title, created, identifier) for every image
# whose primary category is cs.CV and whose paper was created in 2012
# One row is returned per matching image and only metadata columns are selected,
# so a paper with several matching images yields several identical rows

sql = ("SELECT metadata.authors, metadata.title, metadata.created, metadata.identifier  "
    "FROM images "
    "LEFT JOIN metadata ON images.identifier = metadata.identifier "
    "WHERE metadata.created BETWEEN date('2012-01-01') "
    "AND date('2012-12-31') "
    "AND images.x != '' "
    "AND substr(trim(metadata.cat),1,instr(trim(metadata.cat)||' ',' ')-1) = ? ")

c.execute(sql, ('cs.CV',))
rows = c.fetchall()

In [None]:
# Get the paper details (authors, title, created, identifier) for every image
# whose primary category is stat.ML and whose paper was created in 2012
# One row is returned per matching image and only metadata columns are selected,
# so a paper with several matching images yields several identical rows

sql = ("SELECT metadata.authors, metadata.title, metadata.created, metadata.identifier  "
    "FROM images "
    "LEFT JOIN metadata ON images.identifier = metadata.identifier "
    "WHERE metadata.created BETWEEN date('2012-01-01') "
    "AND date('2012-12-31') "
    "AND images.x != '' "
    "AND substr(trim(metadata.cat),1,instr(trim(metadata.cat)||' ',' ')-1) = ? ")

c.execute(sql, ('stat.ML',))
rows = c.fetchall()

In [None]:
print(len(rows))

In [None]:
for row in rows[:100]:
    print(row)

In [None]:
# Drop repeated rows, keeping the first occurrence of each in its original position.
# dict keys are unique and keep insertion order, so this is a single pass instead of
# scanning the result list again for every row; the rows returned by fetchall() are
# tuples and can therefore be used as keys.
without_duplicates = list(dict.fromkeys(rows))

In [None]:
print(len(without_duplicates))
for row in without_duplicates[:]:
    print(row)

In [None]:
rows = without_duplicates

In [None]:
# TESTING ONLY

# Get authors, title, creation date and identifier straight from the metadata table
# (no join with images) for entries whose primary category is cs.CV, created in 2012

sql = ("SELECT metadata.authors, metadata.title, metadata.created, metadata.identifier  "
    "FROM metadata "
    "WHERE metadata.created BETWEEN date('2012-01-01') "
    "AND date('2012-12-31') "
    "AND substr(trim(metadata.cat),1,instr(trim(metadata.cat)||' ',' ')-1) = ? ")

c.execute(sql, ('cs.CV',))
rows = c.fetchall()

In [None]:
print(len(rows))

In [None]:
for row in rows[:100]:
    print(row)

In [None]:
# wrap every row in a one-element tuple, i.e. row -> (row,)
# NOTE(review): the rows returned by fetchall() are already tuples; after this step
# row[0] is the whole original row rather than the authors string that
# format_credits reads from row[0] - confirm this cell is still needed
rows = [(x,) for x in rows[:]]

### Get accreditations formatted

In [None]:
# go through all of the retrieved SQL rows and format as an accreditation
# if html is True, format with <a href=####> for web usage
# input = [[author, title, date, identifier], [author, title, date, identifier],...[author, title, date, identifier]]

def format_credits(rows, html=False, verbose=True):
    """Format database rows as accreditation strings, one per non-empty row.

    Args:
        rows: iterable of rows shaped like (authors, title, created, identifier).
            Falsy entries (None, empty tuple, ...) are counted and skipped.
        html: when True, render the arXiv URL as an ``<a href="...">`` link for
            web usage; otherwise append the URL as plain text.
        verbose: when True (the default) print the per-row progress output;
            pass False to only get the returned list.

    Returns:
        A list of strings of the form "author: title, year, url".
    """
    # every progress message goes through log so it can be silenced in one place
    log = print if verbose else (lambda *args, **kwargs: None)

    accreditations = []
    empty_counter = 0

    # grab the details and check each
    for i, row in enumerate(rows):
        log(i)
        if not row:
            empty_counter += 1
            log("empty!")
            continue

        # keep the text between "['" and "']", then drop its last two characters
        # (presumably a trailing ", " after the last author name - verify against the data)
        s = row[0]
        start = "['"
        end = "']"
        author = s[s.find(start) + len(start):s.rfind(end)]
        author = author[:-2]
        log(author)

        # replace line breaks and double spaces
        title = row[1].replace("\n", "").replace("  ", " ")
        log(title)

        # only the first "-"-separated field of the creation date is used
        date = row[2].split("-")[0]
        log(date)

        identifier = row[3]
        log(identifier)
        # default="" keeps an identifier without any digits from raising ValueError
        longest_digits = max(re.findall(r'\d+', identifier), key=len, default="")
        log(len(longest_digits))

        # if the identifier contains seven consecutive numbers, add a slash
        # in front of its last seven characters
        if len(longest_digits) == 7:
            log("----- regex match -----")
            log(identifier[::-1])
            identifier = identifier[:-7] + "/" + identifier[-7:]
        else:
            # otherwise we can leave the identifier how it is
            log("----- no match -----")
        log(identifier)
        url = "https://arxiv.org/abs/" + identifier
        log(url)
        log("*" * 20)

        # format string and append
        if html:
            fmt_str = '{}: {}, {}, <a href="{}">{}</a>'
            accreditations.append(fmt_str.format(author, title, date, url, url))
        else:
            fmt_str = '{}: {}, {}, {}'
            accreditations.append(fmt_str.format(author, title, date, url))

    log("number of empty slots:", empty_counter)
    return accreditations

In [None]:
# build the accreditation strings from the current rows
# (every row must be an (authors, title, created, identifier) tuple);
# without this call the name accreditations does not exist at notebook level
accreditations = format_credits(rows)

# print out with line breaks
for row in accreditations:
    print(row)

### Get random images

In [None]:
# Get 144 random images

# The random pick is done on the already filtered rows (ORDER BY RANDOM() LIMIT 144).
# Sampling 144 ids from the whole images table first and filtering afterwards returns
# fewer than 144 rows whenever a sampled image has an empty x value.
sql = ("SELECT metadata.cat, images.path, images.filename, images.identifier, metadata.created, images.id "
    "FROM images "
    "LEFT JOIN metadata ON images.identifier = metadata.identifier "
    "WHERE images.x != '' "
    "ORDER BY RANDOM() LIMIT 144 ")

c.execute(sql)
rows = c.fetchall()

In [None]:
# Get 144 images from stat.ML

# maybe move this???

# The random pick is done on the rows that already match the category
# (ORDER BY RANDOM() LIMIT 144). Sampling 144 ids from the whole images table first
# and filtering by category afterwards keeps only the few sampled images that
# happen to belong to stat.ML.
sql = ("SELECT metadata.cat, images.path, images.filename, images.identifier, metadata.created, images.id "
    "FROM images "
    "LEFT JOIN metadata ON images.identifier = metadata.identifier "
    "WHERE images.x != '' "
    "AND substr(trim(metadata.cat),1,instr(trim(metadata.cat)||' ',' ')-1) = ? "
    "ORDER BY RANDOM() LIMIT 144 ")

c.execute(sql, ("stat.ML", ))
rows = c.fetchall()

In [None]:
# Get 144 images from stat.ML from October 2012

# maybe move this???

# The random pick is done on the rows that already match the category and the date
# window (ORDER BY RANDOM() LIMIT 144). Sampling 144 ids from the whole images table
# first and filtering afterwards keeps only the few sampled images that happen to
# match both conditions.
sql = ("SELECT metadata.cat, images.path, images.filename, images.identifier, metadata.created, images.id "
    "FROM images "
    "LEFT JOIN metadata ON images.identifier = metadata.identifier "
    "WHERE images.x != '' "
    "AND metadata.created BETWEEN date('2012-10-01') "
    "AND date('2012-10-31') "
    "AND substr(trim(metadata.cat),1,instr(trim(metadata.cat)||' ',' ')-1) = ? "
    "ORDER BY RANDOM() LIMIT 144 ")

c.execute(sql, ("stat.ML", ))
rows = c.fetchall()

In [None]:
# Get up to 144 random images whose primary category is stat.ML
# (the matching rows are put in random order by the query and cut off at 144)

sql = ("SELECT metadata.cat, images.path, images.filename, images.identifier, metadata.created, images.id "
    "FROM images "
    "LEFT JOIN metadata ON images.identifier = metadata.identifier "
    "WHERE images.x != '' "
    "AND substr(trim(metadata.cat),1,instr(trim(metadata.cat)||' ',' ')-1) = ? "
    "ORDER BY RANDOM() LIMIT 144 ")

c.execute(sql, ("stat.ML", ))
rows = c.fetchall()

In [None]:
print(len(rows))

In [None]:
for row in rows:
    print(row)

In [None]:
# check to see if there are multiple categories
for row in rows:
    if len(row[0]) > 5:
        print(row)

### Shuffle list and then grab the filepath and id

In [None]:
# shuffle first, then get ids and paths

random.shuffle(rows)

In [None]:
# from at most the first 144 rows, collect the image id (column 5) and the
# file path built as <path>/<filename> (columns 1 and 2), printing both as we go
ids, filepaths = [], []

for record in rows[:144]:
    image_id = record[5]
    print(image_id)
    ids.append(image_id)

    path = record[1] + '/' + record[2]
    print(path)
    filepaths.append(path)

In [None]:
print(filepaths)

In [None]:
print(ids)

### Used for generating figures for paper

In [None]:
# get a montage of some images

# change directory twice, printing the working directory each time; the relative
# "src_all..." image paths built below are resolved against the final directory
os.chdir('/home/rte/re-imaging/sqlite-scripts/')
print(os.getcwd())

os.chdir('/home/rte/arXiv/')
print(os.getcwd())

# options that go in front of the image list
prearg = shlex.split("-colorspace CMYK")
# arguments = shlex.split("-colorspace sRGB -background white -alpha background -trim +repage -flatten -geometry 240x240+2+2 -tile 12x /home/rte/documentation/data-samples/test_py_montage.jpg")

# options that go after the image list
# for bigger montage of 12x12
arguments = shlex.split("-colorspace sRGB -background white -alpha background -geometry 240x240+2+2 -tile 12x")
# for smaller montage of 4x4
# arguments = shlex.split("-colorspace sRGB -background white -alpha background -geometry 480x480+2+2 -tile 4x")

# outputname = ["/home/rte/documentation/data-samples/random_montage_12x12_stat.ML.jpg"]
outputname = ["/home/rte/documentation/data-samples/random_montage_nlin.CG.jpg"]

# use at most the first 144 entries of the shuffled list: prefix the image directory,
# replace './' with '/', and append [0] to only use the first page of
# multi-page image documents
filelist = [
    "src_all" + filepath.replace('./','/') + '[0]'
    for filepath in filepaths[:144]
]

# full command line: montage <pre-options> <images> <options> <output file>
montage_cmd = ["montage", *prearg, *filelist, *arguments, *outputname]

result = subprocess.Popen(montage_cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
out, err = result.communicate()
# show what the command wrote to stdout and stderr
for captured in (out, err):
    print(captured)
print("subprocess finished")
print("-" * 40)


In [None]:
# use this to copy to other files, paste as variable
print(ids)

In [None]:
# testing writing filenames
# writes the collected image ids, one per line, into a text file in the
# data-samples directory

# os.chdir("/home/rte/re-imaging/sqlite-scripts/")
os.chdir("/home/rte/documentation/data-samples/")
print(os.getcwd())

# fname = "2018-10_" + catlist[0][0] + "_ids.txt"
fname = "random_montage_4x4_v2_ids.txt"
# the with-block closes the file even if one of the writes fails
with open(fname, "w+") as f:
    for row in ids:
        f.write(str(row) + "\n")

In [None]:
for cat in catlist:
    print(cat[0])

In [None]:
print(len(filelist))

for row in filelist:
    print(row)

In [None]:
# testing writing filenames
# writes the montage input paths, one per line, into a text file named after
# the first category of catlist (relative to the current working directory)

fname = "2018-10_" + catlist[0][0] + ".txt"
# the with-block closes the file even if one of the writes fails
with open(fname, "w+") as f:
    for row in filelist:
        f.write(row + "\n")

In [None]:
print(targetDate[:7])

In [None]:
# output directory for the category montages, one sub-directory per "YYYY-MM"
# (the first seven characters of targetDate)
targetYM = targetDate[:7]
savepath = "/home/rte/Documents/documentation/data-samples/montages/category/" + targetYM + "/"

# create the directory when it is missing and report what happened
if not os.path.isdir(savepath):
    try:
        os.makedirs(savepath)
    except OSError:
        print("failed to create directory: " + savepath)
    else:
        print("successfully created the directory: " + savepath)
else:
    print("directory exists, saving to: " + savepath)

In [None]:
print(len(catlist))

for row in catlist:
    print(row[0])

The following block of code takes the list of categories and, for each category, queries the SQL database for the images that match that category within the target month (the month starting at `targetDate`). It then shuffles the resulting list, selects a subset of up to 144 images, and passes that subset to the `montage` command.

In [None]:
# For every category in catlist: fetch the images created between targetDate and the
# end of targetDate's month, shuffle them, keep at most 144 and tile them into a
# montage. The list of files used for each montage is written next to it as a .txt file.

# added date formatting
sql = ('''
    SELECT metadata.cat, images.path, images.filename, images.identifier, metadata.created
    FROM images
    LEFT JOIN metadata ON images.identifier = metadata.identifier
    WHERE metadata.created BETWEEN date(?) AND date(?, 'start of month','+1 month','-1 day')
    AND images.x != ''
    AND substr(trim(metadata.cat),1,instr(trim(metadata.cat)||' ',' ')-1) = ?
    ''')

targetYM = targetDate[:7]
savepath = "/home/rte/Documents/documentation/data-samples/montages/category/" + targetYM + "/"

if os.path.isdir(savepath):
    print("saving to: " + savepath)
else:
    try:
        # must be savepath: it is the directory the .txt and .jpg files below are written to
        os.makedirs(savepath)
    except OSError:
        print("Failed to create directory: " + savepath)
    else:
        print("Successfully created the directory: " + savepath)

# the working directory and the montage options do not depend on the category,
# so they are set once before the loop; the relative "src_all..." paths built
# below are resolved against this working directory
os.chdir('/home/rte/re-imaging/sqlite-scripts/')

# format the arguments for montage
arguments = shlex.split("-colorspace sRGB -units PixelsPerInch -density 300 -background white -alpha off -geometry 240x240+2+2 -tile 12x")

for cat in catlist:
    # cat is a (primary category, image count) tuple
    print("querying for category: " + str(cat[0]))
    c.execute(sql, (targetDate, targetDate, cat[0], ))
    rows = c.fetchall()

    print("total number of images found: " + str(len(rows)))

    # file path of every image: <path>/<filename>
    filepaths = [row[1] + '/' + row[2] for row in rows]

    # shuffle the whole list
    random.shuffle(filepaths)

    # take at most the first 144 shuffled paths: prefix the image directory,
    # replace './' with '/', and add [0] to only use the first page of
    # multi-page image documents
    filelist = [
        "src_all" + filepath.replace('./','/') + '[0]'
        for filepath in filepaths[:144]
    ]

    # write list of images to file (for debugging purposes, mostly);
    # the with-block closes the file even if one of the writes fails
    fname = savepath + targetYM + "_" + cat[0] + "_" + str(cat[1]) + ".txt"
    with open(fname, "w+") as f:
        for row in filelist:
            f.write(row + "\n")

    outputname = [savepath + "montage_" + targetYM + "_" + cat[0] + "_" + str(cat[1]) + ".jpg"]

    print("calling montage")
    # call the montage command and parse list of files and arguments
    montage_cmd = ["montage"] + filelist + arguments + outputname
    result = subprocess.Popen(montage_cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
    out, err = result.communicate()
    print(out)
    print(err)
    print("subprocess finished")
    print("-" * 40)

### Getting image montages for paper

In [None]:
# Get the file path of every nlin.CG image whose paper was created in 2012
# (the query has no LIMIT; the cut to 144 images happens later when the montage list is built)

targetDate = "2012-01-01"
endDate = "2012-12-31"

# earlier version of the query, kept for reference:
# sql = ("SELECT metadata.cat, images.path, images.filename, images.identifier, metadata.created, images.id "
#     "FROM images "
#     "LEFT JOIN metadata ON images.identifier = metadata.identifier "
#     "WHERE images.x != '' "
#     "AND metadata.created BETWEEN date('2012-10-01') "
#     "AND date('2012-10-31') "
#     "AND substr(trim(metadata.cat),1,instr(trim(metadata.cat)||' ',' ')-1) = ? "
#     "AND images.id IN (SELECT images.id FROM images ORDER BY RANDOM() LIMIT 144) ")

# c.execute(sql, ("nlin.CG", ))
# rows = c.fetchall()

# each row holds a single column: the path '/mnt/hd2/images/all/<images.id>.jpg'
sql = '''
    SELECT '/mnt/hd2/images/all/' || images.id || '.jpg'
    FROM images
    LEFT JOIN metadata ON images.identifier = metadata.identifier
    WHERE metadata.created BETWEEN date(?) 
    AND date(?)
    AND images.x != ''
    AND substr(trim(metadata.cat),1,instr(trim(metadata.cat)||' ',' ')-1) = ?
    '''
c.execute(sql, (targetDate, endDate, "nlin.CG"))
rows = c.fetchall()
print(len(rows))
for row in rows:
    print(row)

In [None]:
# every row holds a single column, the image path: flatten the rows into a list
filepaths = [row[0] for row in rows]

random.shuffle(filepaths)

# savepath = "/home/rte/documentation/dataset-methods-paper/montages/" + "nlin.CG_" + targetDate + ".jpg"
savepath = "/home/rte/documentation/dataset-methods-paper/montages/" + "nlin.CG_all2012"+ ".jpg"
print(savepath)

arguments = shlex.split("-colorspace sRGB -units PixelsPerInch -density 300 -background white -alpha off -geometry 240x240+2+2 -tile 12x")

# use at most the first 144 entries of the shuffled list: replace './' with '/'
# and add [0] to only use the first page of multi-page image documents
filelist = [
    filepath.replace('./','/') + '[0]'
    for filepath in filepaths[:144]
]
print(filelist)

# # write list of images to file (for debugging purposes, mostly)
# fname = "nlin.CG_2012" + ".txt"
# with open(fname, "w+") as f:
#     for row in filelist:
#         f.write(row + "\n")

# full command line: montage <images> <options> <output file>
montage_cmd = ["montage", *filelist, *arguments, savepath]

result = subprocess.Popen(montage_cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
out, err = result.communicate()
# show what the command wrote to stdout and stderr
for captured in (out, err):
    print(captured)
print("subprocess finished")
print("-" * 40)


### AUTOMATED

In [None]:
category = "nlin.CG"
# category = "astro-ph"

In [None]:
categories = ["nlin.CG", "physics.med-ph", "cs.CV", "stat.ML", "physics.pop-ph"]

In [None]:
categories = ["astro-ph.IM", "astro-ph.HE"]

In [None]:
# all categories (20200206-1240)
categories = ["nlin.CG", "physics.med-ph", "cs.CV", "cs.DB", "stat.ML", "hep-ph", "physics.pop-ph", "astro-ph.IM", "astro-ph.HE", "astro-ph", "math.AG", "q-bio.MN", "q-bio.GN"]

In [None]:
years = [2009, 2012, 2015, 2018]

In [None]:
# every year
years = [x for x in range(1990, 2019, 1)]

In [None]:
print(years)

In [None]:
categories = ["astro-ph"]

In [None]:
years = [1991, 1994, 1996, 2000]

In [None]:
targetDate = "2012-01-01"
endDate = "2012-12-31"

In [None]:
# For every (category, year) pair: fetch the paths of all images of that primary
# category created in that year, shuffle them with a fixed seed, keep at most 144,
# write the selection to <category>_<year>_montage.txt and tile it into
# <category>_<year>_montage.jpg.

# the query, the output root and the montage options are the same for every pair,
# so they are built once before the loops
sql = '''
    SELECT '/mnt/hd2/images/all/' || images.id || '.jpg'
    FROM images
    LEFT JOIN metadata ON images.identifier = metadata.identifier
    WHERE metadata.created BETWEEN date(?)
    AND date(?)
    AND images.x != ''
    AND substr(trim(metadata.cat),1,instr(trim(metadata.cat)||' ',' ')-1) = ?
    '''
savepathroot = "/home/rte/documentation/dataset-methods-paper/montages/"
arguments = shlex.split("-colorspace sRGB -units PixelsPerInch -density 300 -background white -alpha off -geometry 240x240+2+2 -tile 12x")

for category in categories:
    for year in years:
        targetDate = str(year) + "-01-01"
        endDate = str(year) + "-12-31"

        c.execute(sql, (targetDate, endDate, category))
        rows = c.fetchall()
        print("length:",len(rows))
        for row in rows:
            print(row)
        print("*****")

        if not rows:
            # montage needs at least one input image; without this guard it would be
            # called with options only and an empty .txt file would be left behind
            print("no images found for " + category + " in " + str(year) + ", skipping")
            print("-" * 40)
            continue

        # every row holds a single column, the image path
        filepaths = [row[0] for row in rows]

        # NOTE(review): the seed fixes the shuffle for a given input order; the query
        # has no ORDER BY, so confirm the row order is stable between runs
        random.seed(4) # keep the same seed for reproducibility
        random.shuffle(filepaths)

        savepath = savepathroot + category + "_" + str(year) + "_montage"
        print(savepath)
        print(arguments)

        # use at most the first 144 entries of the shuffled list: replace './' with '/'
        # and add [0] to only use the first page of multi-page image documents
        filelist = [
            filepath.replace('./','/') + '[0]'
            for filepath in filepaths[:144]
        ]

        print("*****")
        print("selected file paths")
        for image_path in filelist:
            print(image_path)

        # write list of images to file to be able to retrieve the accreditations later
        with open(savepath + ".txt", "w+") as f:
            for row in filelist:
                f.write(row + "\n")

        montage_cmd = ["montage"] + filelist + arguments + [savepath + ".jpg"]

        result = subprocess.Popen(montage_cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
        out, err = result.communicate()
        print(out)
        print(err)
        print("subprocess finished")
        print("-" * 40)