In [None]:
import sqlite3
import random
import itertools
import subprocess
import os
import shlex

In [None]:
# Absolute path to the SQLite database file on "rte".
# NOTE(review): "rte" is presumably the machine/user this notebook was written on;
# the path is machine-specific and must be adjusted when running elsewhere.
db_path = "/home/rte/data/db/arxiv_db_images.sqlite3"

In [None]:
# Open the SQLite database located at db_path and create a cursor (`c`);
# the query cells below run all of their statements through this cursor.

db = sqlite3.connect(db_path)
c = db.cursor()

In [None]:
# Sanity check: ask SQLite for the column layout of the "metadata" table
# and list one column description per line.

table_info_query = 'PRAGMA TABLE_INFO({})'.format("metadata")
info = c.execute(table_info_query).fetchall()

# header naming the fields of each tuple printed below
print("\nColumn Info:\nID, Name, Type, NotNull, DefaultVal, PrimaryKey")
for col in info:
    print(col)

In [None]:
# Same sanity check for the "images" table: fetch its column layout
# and list one column description per line.
table_info_query = 'PRAGMA TABLE_INFO({})'.format("images")
info = c.execute(table_info_query).fetchall()

# header naming the fields of each tuple printed below
print("\nColumn Info:\nID, Name, Type, NotNull, DefaultVal, PrimaryKey")
for col in info:
    print(col)

In [None]:
# Get total number of images per primary category only
#
# The primary category is the text of metadata.cat before its first space.
# Only image rows with a non-empty `x` column are counted, and only papers
# created in the month that contains targetDate: the upper bound
# date(?, 'start of month', '+1 month', '-1 day') is the last day of that month.
# The result is a list of (primary_category, image_count) tuples, largest first.
#
# The date modifiers are written as single-quoted SQL string literals:
# SQLite treats double-quoted text as an identifier and only falls back to a
# string for legacy reasons, which fails on builds with that fallback disabled.
#
# NOTE(review): BETWEEN compares against plain dates; if metadata.created also
# stores a time of day, rows from the last day of the month may be missed - confirm.

targetDate = "2006-10-01"

c.execute('''
    SELECT substr(trim(metadata.cat),1,instr(trim(metadata.cat)||' ',' ')-1), count(images.identifier)
    FROM images
    LEFT JOIN metadata ON images.identifier = metadata.identifier
    WHERE metadata.created BETWEEN date(?) AND date(?, 'start of month','+1 month','-1 day')
    AND images.x != ''
    GROUP BY substr(trim(metadata.cat),1,instr(trim(metadata.cat)||' ',' ')-1)
    ORDER BY count(images.identifier) DESC
    ''', (targetDate, targetDate))
categories = c.fetchall()
for row in categories:
    print(row)

In [None]:
# Number of distinct primary categories returned by the query above.
print(len(categories))

In [None]:
# Name of the first category row; the query orders rows by image count,
# descending, so this is the category with the most images.
print(categories[0][0])

In [None]:
# Keep only the categories that have at least 144 images in total,
# echoing each (category, count) row as it is accepted.

catlist = []

for cat in categories:
    # cat[1] is the image count; anything below 144 is dropped
    if cat[1] < 144:
        continue
    catlist.append(cat)
    print(cat)

print('-' * 20)
print("total number of categories with required entries: ")
print(len(catlist))

In [None]:
# for testing only! trim the list in place to its first three categories
# so the cells below run quickly

catlist[3:] = []

for row in catlist:
    print(row)

In [None]:
# Inspect the container type of catlist.
type(catlist)

# Abandoned idea: drop the image-count column from every row.
# NOTE(review): the rows come from fetchall() and are presumably tuples, which
# do not support item deletion - confirm before reviving this.
# for row in catlist:
#     del row[1]
    
# print(catlist)

In [None]:
# Name of the first category that remains in catlist after filtering.
print(catlist[0][0])

In [None]:
# Fetch every image row for one primary category within a fixed month
# (October 2018). Each row is (cat, path, filename, identifier, created).

sql = (
    "SELECT metadata.cat, images.path, images.filename, images.identifier, metadata.created "
    "FROM images "
    "LEFT JOIN metadata ON images.identifier = metadata.identifier "
    "WHERE metadata.created BETWEEN date('2018-10-01') AND date('2018-10-31') "
    "AND images.x != '' "
    "AND substr(trim(metadata.cat),1,instr(trim(metadata.cat)||' ',' ')-1) = ? "
)

# query for the first category kept in catlist
first_category = catlist[0][0]
rows = c.execute(sql, (first_category,)).fetchall()

In [None]:
# Number of image rows returned for the selected category.
print(len(rows))

In [None]:
# Dump every fetched row: (cat, path, filename, identifier, created).
for row in rows:
    print(row)

In [None]:
# check to see if there are multiple categories
# The queries treat the text of metadata.cat before the first space as the
# primary category, i.e. categories are separated by whitespace. A row therefore
# has more than one category exactly when its cat string splits into more than
# one token. (Testing the string length instead would also flag single
# categories that merely have a long name.)
for row in rows:
    if len(row[0].split()) > 1:
        print(row)

In [None]:
# Collect "<path>/<filename>" for every fetched image row
filepaths = []

for row in rows:
    # row[1] is images.path and row[2] is images.filename
    path = "/".join(row[1:3])
    filepaths.append(path)

# randomise the order of the whole list in place
random.shuffle(filepaths)

In [None]:
# Show the current list of image paths.
print(filepaths)

In [None]:
# testing only
# Replaces the list of image paths with a single hard-coded file;
# cells that read filepaths afterwards see only this one entry until the list is rebuilt.
filepaths = ["./folder/flow-cmp.pdf"]

In [None]:
# Build a test montage (test_py_montage.jpg) from up to 144 of the image paths.

# print the current working directory, then switch to the scripts directory so
# that the relative "src_all/..." inputs and the output file name resolve there
print(os.getcwd())
os.chdir('/home/rte/re-imaging/sqlite-scripts/')

# os.chdir('src_all')
print(os.getcwd())

# montage options; the last token is the output file name
arguments = shlex.split("-colorspace sRGB -units PixelsPerInch -density 300 -background white -alpha off -geometry 240x240+2+2 -tile 12x test_py_montage.jpg")

# take the first 144 entries of the list; for each one prefix the source
# directory, drop the dot of "./", and append [0] so that only the first page
# of multi-page image documents is used
filelist = [
    "src_all" + filepath.replace('./','/') + '[0]'
    for filepath in itertools.islice(filepaths, 0, 144)
]

print(filelist)

# call the montage command with the list of files and the arguments, capturing
# its output so that it is visible in the notebook instead of being discarded
result = subprocess.run(
    ["montage"] + filelist + arguments,
    stdout=subprocess.PIPE,
    stderr=subprocess.PIPE,
)
print(result.stdout.decode(errors="replace"))
print(result.stderr.decode(errors="replace"))

# a non-zero exit status means montage reported a failure
if result.returncode != 0:
    print("montage exited with status " + str(result.returncode))


In [None]:
# Print the primary category name of every entry kept in catlist.
for cat in catlist:
    print(cat[0])

In [None]:
# Number of montage input files, followed by the entries themselves.
print(len(filelist))

for row in filelist:
    print(row)

In [None]:
# testing writing filenames
# Writes the montage input list, one entry per line, to
# "2018-10_<first category>.txt" in the current working directory.

fname = "2018-10_" + catlist[0][0] + ".txt"
# the with-block closes the file even if one of the writes raises
with open(fname, "w+") as f:
    for row in filelist:
        f.write(row + "\n")

In [None]:
# First seven characters of targetDate, i.e. its "YYYY-MM" prefix.
print(targetDate[:7])

In [None]:
# Output directory for this month's montages: one sub-folder per "YYYY-MM".
targetYM = targetDate[:7]
savepath = "/home/rte/Documents/documentation/data-samples/montages/category/" + targetYM + "/"

# Create the directory when it is missing and report what happened;
# a creation failure is only reported, not raised.
if not os.path.isdir(savepath):
    try:
        os.makedirs(savepath)
    except OSError:
        print("failed to create directory: " + savepath)
    else:
        print("successfully created the directory: " + savepath)
else:
    print("directory exists, saving to: " + savepath)

In [None]:
# Number of categories that will be processed, followed by their names.
print(len(catlist))

for row in catlist:
    print(row[0])

The following block of code takes the list of categories and, for each category, queries the SQLite database for the images that match that category within the target month. It then shuffles that list, selects a subset of up to 144 images, and uses the subset to run the `montage` command ^_^

In [None]:
# Build one montage per category for the month that contains targetDate.
#
# For every (category, image_count) entry in catlist this cell:
#   1. selects the images whose paper was created in the target month and whose
#      primary category (text of metadata.cat before the first space) matches,
#   2. shuffles the image paths and keeps at most 144 of them,
#   3. writes the chosen paths to a .txt file in savepath (for debugging),
#   4. runs `montage` on them and prints its stdout / stderr.

# added date formatting
sql = ('''
    SELECT metadata.cat, images.path, images.filename, images.identifier, metadata.created
    FROM images
    LEFT JOIN metadata ON images.identifier = metadata.identifier
    WHERE metadata.created BETWEEN date(?) AND date(?, 'start of month','+1 month','-1 day')
    AND images.x != ''
    AND substr(trim(metadata.cat),1,instr(trim(metadata.cat)||' ',' ')-1) = ?
    ''')

targetYM = targetDate[:7]
savepath = "/home/rte/Documents/documentation/data-samples/montages/category/" + targetYM + "/"

if os.path.isdir(savepath):
    print("saving to: " + savepath)
else:
    try:
        # create the output directory itself (savepath), not an image path
        os.makedirs(savepath)
    except OSError:
        print("Failed to create directory: " + savepath)
    else:
        print("Successfully created the directory: " + savepath)

# The montage inputs are given relative to this directory ("src_all/...").
# It is the same for every category, so switch once before the loop.
os.chdir('/home/rte/re-imaging/sqlite-scripts/')

# format the arguments for montage (identical for every category)
arguments = shlex.split("-colorspace sRGB -units PixelsPerInch -density 300 -background white -alpha off -geometry 240x240+2+2 -tile 12x")

for cat in catlist:
    print("querying for category: " + str(cat[0]))
    c.execute(sql, (targetDate, targetDate, cat[0], ))
    rows = c.fetchall()

    print("total number of images found: " + str(len(rows)))

    # "<path>/<filename>" for every image row, in random order
    filepaths = [row[1] + '/' + row[2] for row in rows]
    random.shuffle(filepaths)

    # keep the first 144 shuffled entries; for each one prefix the source
    # directory, drop the dot of "./", and append [0] so that only the first
    # page of multi-page image documents is used
    filelist = [
        "src_all" + filepath.replace('./','/') + '[0]'
        for filepath in itertools.islice(filepaths, 0, 144)
    ]

    # write list of images to file (for debugging purposes, mostly);
    # the with-block closes the file even if a write raises
    fname = savepath + targetYM + "_" + cat[0] + "_" + str(cat[1]) + ".txt"
    with open(fname, "w+") as f:
        for row in filelist:
            f.write(row + "\n")

    # output image name carries the month, the category and its total image count
    outputname = [savepath + "montage_" + targetYM + "_" + cat[0] + "_" + str(cat[1]) + ".jpg"]

    print("calling montage")
    # call the montage command with the list of files, the arguments and the output name
    montage_cmd = ["montage"] + filelist + arguments + outputname
    result = subprocess.Popen(montage_cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
    out, err = result.communicate()
    print(out)
    print(err)
    # a non-zero exit status means montage reported a failure for this category
    if result.returncode != 0:
        print("montage exited with status " + str(result.returncode))
    print("subprocess finished")
    print("-" * 40)

In [None]:
# make list of categories
# make list of dates (year/month?)
# iterate through with a new sqlite select command
# run montage