# Sampling images from database

This notebook provides Python code to access different combinations of images from the database and use them to create montages (tiled grids of images).

In [1]:
import sqlite3
import random
import itertools
import subprocess
import os
import shlex
import re

In [2]:
# path on rte
db_path = "/home/rte/data/db/arxiv_db_images.sqlite3"

In [3]:
# Here we import the sqlite3 database and create a cursor

db = sqlite3.connect(db_path)
c = db.cursor()

In [4]:
# test that we can fetch the pragma for each table

c.execute('PRAGMA TABLE_INFO({})'.format("metadata"))
info = c.fetchall()

print("\nColumn Info:\nID, Name, Type, NotNull, DefaultVal, PrimaryKey")
for col in info:
    print(col)


Column Info:
ID, Name, Type, NotNull, DefaultVal, PrimaryKey
(0, 'id', 'INTEGER', 0, None, 1)
(1, 'identifier', 'TEXT', 0, None, 0)
(2, 'created', 'TEXT', 0, None, 0)
(3, 'cat', 'TEXT', 0, None, 0)
(4, 'authors', 'TEXT', 0, None, 0)
(5, 'title', 'TEXT', 0, None, 0)
(6, 'abstract', 'TEXT', 0, None, 0)
(7, 'licence', 'TEXT', 0, None, 0)


In [5]:
c.execute('PRAGMA TABLE_INFO({})'.format("images"))
info = c.fetchall()

print("\nColumn Info:\nID, Name, Type, NotNull, DefaultVal, PrimaryKey")
for col in info:
    print(col)


Column Info:
ID, Name, Type, NotNull, DefaultVal, PrimaryKey
(0, 'id', 'INTEGER', 0, None, 1)
(1, 'identifier', 'TEXT', 0, None, 0)
(2, 'filename', 'TEXT', 0, None, 0)
(3, 'filesize', 'INT', 0, None, 0)
(4, 'path', 'TEXT', 0, None, 0)
(5, 'x', 'INT', 0, None, 0)
(6, 'y', 'INT', 0, None, 0)
(7, 'imageformat', 'TEXT', 0, None, 0)
(8, 'creator', 'TEXT', 0, None, 0)


In [6]:
# Get total number of images per primary category only.
# The primary category is taken as the first space-separated token of metadata.cat.

targetDate = "2018-01-01"

# The window runs from targetDate up to the last day of the 12-month period that
# begins at the start of targetDate's month.
# The date modifiers are single-quoted: in SQL, double quotes denote identifiers,
# and SQLite only treats them as strings through a legacy fallback that can be
# disabled at compile time or per connection.
c.execute('''
    SELECT substr(trim(metadata.cat),1,instr(trim(metadata.cat)||' ',' ')-1), count(images.identifier)
    FROM images
    LEFT JOIN metadata ON images.identifier = metadata.identifier
    WHERE metadata.created BETWEEN date(?)
    AND date(?, 'start of month','+12 month','-1 day')
    AND images.x != ''
    GROUP BY substr(trim(metadata.cat),1,instr(trim(metadata.cat)||' ',' ')-1)
    ORDER BY count(images.identifier) DESC
    ''', (targetDate, targetDate,))
# each row is (primary_category, image_count), largest count first
categories = c.fetchall()
for row in categories:
    print(row)

('cs.CV', 188042)
('cs.LG', 78680)
('astro-ph.GA', 74039)
('hep-ph', 55548)
('astro-ph.SR', 45081)
('math.NA', 37881)
('astro-ph.HE', 34364)
('quant-ph', 34180)
('stat.ML', 33367)
('hep-th', 25609)
('astro-ph.CO', 24586)
('astro-ph.EP', 23525)
('cond-mat.mes-hall', 23393)
('cs.RO', 23304)
('physics.flu-dyn', 21927)
('cond-mat.str-el', 21553)
('math.OC', 21222)
('gr-qc', 21053)
('cs.CL', 19844)
('cs.IT', 19725)
('eess.SP', 18039)
('cond-mat.mtrl-sci', 17494)
('cs.CR', 17362)
('astro-ph.IM', 17216)
('cs.NI', 16822)
('stat.ME', 16047)
('hep-ex', 15880)
('cs.DC', 13721)
('cs.AI', 13540)
('cond-mat.stat-mech', 13532)
('nucl-th', 13388)
('cond-mat.soft', 13306)
('physics.ins-det', 13084)
('physics.comp-ph', 12532)
('cs.SI', 12247)
('cs.SY', 10670)
('math.GT', 8787)
('math.CO', 8637)
('stat.AP', 8635)
('eess.IV', 8486)
('cs.GR', 8025)
('physics.optics', 8002)
('physics.soc-ph', 8001)
('cond-mat.quant-gas', 7706)
('math.DS', 7417)
('cond-mat.supr-con', 7410)
('cs.SE', 7388)
('math.AP', 7314)
(

In [None]:
print(len(categories))

In [None]:
print(categories[0][0])

In [None]:
# keep only the (category, count) entries that have at least 144 images in total

catlist = [entry for entry in categories if entry[1] >= 144]

for cat in catlist:
    print(cat)

print('-' * 20)
print("total number of categories with required entries: ")
print(len(catlist))

In [None]:
print(catlist[0][0])

In [None]:
# Get each image entry with a particular category in a given month

sql = ("SELECT metadata.cat, images.path, images.filename, images.identifier, metadata.created, images.id "
    "FROM images "
    "LEFT JOIN metadata ON images.identifier = metadata.identifier "
    "WHERE metadata.created BETWEEN date('2018-10-01') "
    "AND date('2018-10-31') "
    "AND images.x != '' "
    "AND substr(trim(metadata.cat),1,instr(trim(metadata.cat)||' ',' ')-1) = ? ")

c.execute(sql, (catlist[0][0],))
rows = c.fetchall()

In [None]:
# Get the paper metadata (authors, title, created, identifier) for every image
# entry whose primary category equals the bound parameter, within a date range.
# Specifically: all images from cs.CV created between 2012-01-01 and 2012-12-31.
# The query selects FROM images, so a paper is returned once per image it has.

sql = ("SELECT metadata.authors, metadata.title, metadata.created, metadata.identifier  "
    "FROM images "
    "LEFT JOIN metadata ON images.identifier = metadata.identifier "
    "WHERE metadata.created BETWEEN date('2012-01-01') "
    "AND date('2012-12-31') "
    "AND images.x != '' "
    "AND substr(trim(metadata.cat),1,instr(trim(metadata.cat)||' ',' ')-1) = ? ")

c.execute(sql, ('cs.CV',))
rows = c.fetchall()

In [None]:
# Get the paper metadata (authors, title, created, identifier) for every image
# entry whose primary category equals the bound parameter, within a date range.
# Specifically: all images from stat.ML created between 2012-01-01 and 2012-12-31.
# The query selects FROM images, so a paper is returned once per image it has.

sql = ("SELECT metadata.authors, metadata.title, metadata.created, metadata.identifier  "
    "FROM images "
    "LEFT JOIN metadata ON images.identifier = metadata.identifier "
    "WHERE metadata.created BETWEEN date('2012-01-01') "
    "AND date('2012-12-31') "
    "AND images.x != '' "
    "AND substr(trim(metadata.cat),1,instr(trim(metadata.cat)||' ',' ')-1) = ? ")

c.execute(sql, ('stat.ML',))
rows = c.fetchall()

In [None]:
print(len(rows))

In [None]:
for row in rows[:100]:
    print(row)

In [None]:
# Drop repeated rows while keeping the order in which each row was first seen.
# Rows returned by fetchall() are tuples (hashable), so dict keys give the same
# result as a list-membership loop without rescanning the list for every row.
without_duplicates = list(dict.fromkeys(rows))

In [None]:
print(len(without_duplicates))
for row in without_duplicates[:]:
    print(row)

In [None]:
rows = without_duplicates

In [None]:
# TESTING ONLY

# Get each image entry with a particular category in a given month
# Specifically, get all images from cs.CV from 2012

sql = ("SELECT metadata.authors, metadata.title, metadata.created, metadata.identifier  "
    "FROM metadata "
    "WHERE metadata.created BETWEEN date('2012-01-01') "
    "AND date('2012-12-31') "
    "AND substr(trim(metadata.cat),1,instr(trim(metadata.cat)||' ',' ')-1) = ? ")

c.execute(sql, ('cs.CV',))
rows = c.fetchall()

In [None]:
print(len(rows))

In [None]:
for row in rows[:100]:
    print(row)

In [None]:
# Wrap every element of rows in a one-element tuple: x -> (x,).
# NOTE(review): rows returned by c.fetchall() are already tuples, so after this
# cell row[0] is the whole original row rather than its first column, and code
# that indexes row[0]..row[3] as strings (e.g. format_credits) will not work on
# the result. Running the cell twice nests the rows again. Confirm this cell is
# still needed.
rows = [(x,) for x in rows[:]]

### Get accreditations formatted

In [7]:
# go through all of the retrieved SQL rows and format as an accreditation
# if html is True, format with <a href=####> for web usage
# input = [[author, title, date, identifier], [author, title, date, identifier],...[author, title, date, identifier]]

def format_credits(rows, html=False, verbose=True):
    """Format metadata rows as accreditation strings, one per non-empty row.

    Args:
        rows: iterable of (authors, title, created, identifier) sequences.
            Falsy entries (e.g. an empty tuple or None) are counted and skipped.
        html: when True the arXiv URL is wrapped in an ``<a href="...">`` tag
            for web usage; otherwise the plain URL is appended.
        verbose: when True (the default) every intermediate value is printed,
            as well as the final count of skipped rows. Pass False to silence
            the per-row debugging output.

    Returns:
        list of str of the form ``"<author>: <title>, <year>, <url>"``.
    """
    log = print if verbose else (lambda *args, **kwargs: None)

    accreditations = []
    empty_counter = 0

    # grab the details and check each
    for i, row in enumerate(rows):
        log(i)
        if not row:
            empty_counter += 1
            log("empty!")
            continue

        # take the text between the first "['" and the last "']" of the authors field
        s = row[0]
        start = "['"
        end = "']"
        author = s[s.find(start) + len(start):s.rfind(end)]
        # drop the last two characters of the extracted text
        # (presumably a trailing separator; TODO confirm against the stored format)
        author = author[:-2]
        log(author)

        # replace line breaks and double spaces
        title = row[1].replace("\n", "").replace("  ", " ")
        log(title)

        # keep only the year part of a YYYY-MM-DD style date
        date = row[2].split("-")[0]
        log(date)

        identifier = row[3]
        log(identifier)
        # default="" keeps identifiers without any digit from raising ValueError
        longest_digits = max(re.findall(r'\d+', identifier), key=len, default="")
        log(len(longest_digits))

        # if the identifier contains seven consecutive numbers, add a slash
        # in front of its last seven characters
        if len(longest_digits) == 7:
            log("----- regex match -----")
            reverse = identifier[::-1]
            log(reverse)
            identifier_reverse = reverse[:7] + "/" + reverse[7:]
            identifier = identifier_reverse[::-1]
        else:
            # otherwise we can leave the identifier how it is
            log("----- no match -----")
        log(identifier)
        url = "https://arxiv.org/abs/" + identifier
        log(url)
        log("*" * 20)

        # format string and append
        if html:
            fmt_str = '{}: {}, {}, <a href="{}">{}</a>'
            accreditations.append(fmt_str.format(author, title, str(date), url, url))
        else:
            fmt_str = '{}: {}, {}, {}'
            accreditations.append(fmt_str.format(author, title, str(date), url))

    log("number of empty slots:", empty_counter)
    return accreditations

In [None]:
# Format the current rows and print out with line breaks.
# format_credits only returns its list, so the result has to be bound here.
# NOTE(review): assumes rows currently holds (authors, title, created, identifier)
# tuples from one of the metadata queries; confirm before running.
accreditations = format_credits(rows)
for row in accreditations:
    print(row)

### Get random images

In [None]:
# Get 144 random images
# The images.x != '' filter is applied inside the subquery, before the random
# draw, so the 144 sampled ids all satisfy it. Filtering after the draw could
# return fewer than 144 images.

sql = ("SELECT metadata.cat, images.path, images.filename, images.identifier, metadata.created, images.id "
    "FROM images "
    "LEFT JOIN metadata ON images.identifier = metadata.identifier "
    "WHERE images.id IN (SELECT images.id FROM images WHERE images.x != '' ORDER BY RANDOM() LIMIT 144) ")

c.execute(sql)
rows = c.fetchall()

In [None]:
# Get 144 images from stat.ML

# maybe move this???

# The random draw is done after the category filter (ORDER BY RANDOM() LIMIT 144
# on the filtered rows). Intersecting the filter with 144 ids drawn from the
# whole images table would return only the few sampled ids that happen to be
# stat.ML.
sql = ("SELECT metadata.cat, images.path, images.filename, images.identifier, metadata.created, images.id "
    "FROM images "
    "LEFT JOIN metadata ON images.identifier = metadata.identifier "
    "WHERE images.x != '' "
    "AND substr(trim(metadata.cat),1,instr(trim(metadata.cat)||' ',' ')-1) = ? "
    "ORDER BY RANDOM() LIMIT 144 ")

c.execute(sql, ("stat.ML", ))
rows = c.fetchall()

In [None]:
# Get 144 images from stat.ML from October 2012

# maybe move this???

# The random draw is done after the date and category filters (ORDER BY RANDOM()
# LIMIT 144 on the filtered rows). Intersecting the filters with 144 ids drawn
# from the whole images table would return only the few sampled ids that happen
# to match.
sql = ("SELECT metadata.cat, images.path, images.filename, images.identifier, metadata.created, images.id "
    "FROM images "
    "LEFT JOIN metadata ON images.identifier = metadata.identifier "
    "WHERE images.x != '' "
    "AND metadata.created BETWEEN date('2012-10-01') "
    "AND date('2012-10-31') "
    "AND substr(trim(metadata.cat),1,instr(trim(metadata.cat)||' ',' ')-1) = ? "
    "ORDER BY RANDOM() LIMIT 144 ")

c.execute(sql, ("stat.ML", ))
rows = c.fetchall()

In [None]:
# Get up to 144 random images whose primary category is stat.ML.
# Rows are filtered first and then randomly ordered and limited, so the result
# holds 144 rows whenever at least that many images match.

sql = ("SELECT metadata.cat, images.path, images.filename, images.identifier, metadata.created, images.id "
    "FROM images "
    "LEFT JOIN metadata ON images.identifier = metadata.identifier "
    "WHERE images.x != '' "
    "AND substr(trim(metadata.cat),1,instr(trim(metadata.cat)||' ',' ')-1) = ? "
    "ORDER BY RANDOM() LIMIT 144 ")

c.execute(sql, ("stat.ML", ))
rows = c.fetchall()

In [None]:
print(len(rows))

In [None]:
for row in rows:
    print(row)

In [None]:
# check to see if there are multiple categories
# metadata.cat is a space-separated list whose first token is the primary
# category, so a row is cross-listed when it has more than one token. A length
# test would also flag single categories with long names such as 'astro-ph.GA'.
for row in rows:
    # row[0] can be None when the LEFT JOIN found no metadata for the image
    if row[0] and len(row[0].split()) > 1:
        print(row)

### Shuffle list and then grab the filepath and id

In [10]:
# shuffle first, then get ids and paths

random.shuffle(rows)

In [11]:
# Collect the image id and the relative file path of (at most) the first 144 rows.
# Column positions assume rows come from one of the six-column queries
# (cat, path, filename, identifier, created, id) - TODO confirm before running.
ids = []
filepaths = []

for row in rows[:144]:
    # sixth column: images.id in those queries
    print(row[5])
    ids.append(row[5])
    # second and third columns: images.path and images.filename, joined with a slash
    path = row[1] + '/' + row[2]
    print(path)
    filepaths.append(path)

9059685
./1210/1210.8363/7lane_formation_start-eps-converted-to.pdf
9059696
./1210/1210.8363/6transient_block3-eps-converted-to.pdf
9059692
./1210/1210.8363/3merge-eps-converted-to.pdf
9059687
./1210/1210.8363/w11-eps-converted-to.pdf
9059690
./1210/1210.8363/jam-eps-converted-to.pdf
9059698
./1210/1210.8363/w13-eps-converted-to.pdf
9059689
./1210/1210.8363/BM_nstudyN=1-eps-converted-to.pdf
9059699
./1210/1210.8363/q-eps-converted-to.pdf
9059702
./1210/1210.8363/2encounter-eps-converted-to.pdf
9082443
./1210/1210.7562/fig2.eps
9059691
./1210/1210.8363/blockphase-eps-converted-to.pdf
9059686
./1210/1210.8363/simu07-eps-converted-to.pdf
9059697
./1210/1210.8363/simu10-eps-converted-to.pdf
9059700
./1210/1210.8363/clogging-eps-converted-to.pdf
9059693
./1210/1210.8363/8lane-eps-converted-to.pdf
9059684
./1210/1210.8363/4transient_block-eps-converted-to.pdf
9082441
./1210/1210.7562/fig1b.eps
9059694
./1210/1210.8363/1initial-eps-converted-to.pdf
9059688
./1210/1210.8363/w12-eps-converted-t

In [12]:
print(filepaths)

['./1210/1210.8363/7lane_formation_start-eps-converted-to.pdf', './1210/1210.8363/6transient_block3-eps-converted-to.pdf', './1210/1210.8363/3merge-eps-converted-to.pdf', './1210/1210.8363/w11-eps-converted-to.pdf', './1210/1210.8363/jam-eps-converted-to.pdf', './1210/1210.8363/w13-eps-converted-to.pdf', './1210/1210.8363/BM_nstudyN=1-eps-converted-to.pdf', './1210/1210.8363/q-eps-converted-to.pdf', './1210/1210.8363/2encounter-eps-converted-to.pdf', './1210/1210.7562/fig2.eps', './1210/1210.8363/blockphase-eps-converted-to.pdf', './1210/1210.8363/simu07-eps-converted-to.pdf', './1210/1210.8363/simu10-eps-converted-to.pdf', './1210/1210.8363/clogging-eps-converted-to.pdf', './1210/1210.8363/8lane-eps-converted-to.pdf', './1210/1210.8363/4transient_block-eps-converted-to.pdf', './1210/1210.7562/fig1b.eps', './1210/1210.8363/1initial-eps-converted-to.pdf', './1210/1210.8363/w12-eps-converted-to.pdf', './1210/1210.8363/schematic-eps-converted-to.pdf', './1210/1210.8363/9normal-eps-convert

In [13]:
print(ids)

[9059685, 9059696, 9059692, 9059687, 9059690, 9059698, 9059689, 9059699, 9059702, 9082443, 9059691, 9059686, 9059697, 9059700, 9059693, 9059684, 9082441, 9059694, 9059688, 9059695, 9059703, 9059704, 9082442, 9059701]


### Used for generating figures for paper

In [15]:
# get a montage of some images
# Builds one montage command line from the first 144 entries of filepaths and
# runs it, printing the captured stdout and stderr of the process.

# print the current working directory
# NOTE: this first chdir is immediately superseded by the second one below; the
# relative "src_all/..." inputs are therefore resolved against /home/rte/arXiv/.
os.chdir('/home/rte/re-imaging/sqlite-scripts/')
print(os.getcwd())

os.chdir('/home/rte/arXiv/')
print(os.getcwd())

# options placed before the input files on the command line
prearg = shlex.split("-colorspace CMYK")
# arguments = shlex.split("-colorspace sRGB -background white -alpha background -trim +repage -flatten -geometry 240x240+2+2 -tile 12x /home/rte/documentation/data-samples/test_py_montage.jpg")

# for bigger montage of 12x12
arguments = shlex.split("-colorspace sRGB -background white -alpha background -geometry 240x240+2+2 -tile 12x")
# for smaller montage of 4x4
# arguments = shlex.split("-colorspace sRGB -background white -alpha background -geometry 480x480+2+2 -tile 4x")

# outputname = ["/home/rte/documentation/data-samples/random_montage_12x12_stat.ML.jpg"]
# output file, kept as a one-element list so it can be concatenated onto the command
outputname = ["/home/rte/documentation/data-samples/random_montage_nlin.CG.jpg"]
# print(arguments)

filelist = []
# this takes a slice of the larger shuffled list
for filepath in itertools.islice(filepaths, 0, 144):
#     print(filepath)
    # put the filepath into the list but add the directory, remove the dot
    # and also add [0] to only use the first page of multi-page image documents
    filelist.append("src_all" + filepath.replace('./','/') + '[0]')

# print(filelist)

# call the montage command and parse list of files and arguments
montage_cmd = ["montage"] + prearg + filelist + arguments + outputname

# run without a shell; communicate() waits for the process to finish
result = subprocess.Popen(montage_cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
out, err = result.communicate()
# out and err are bytes; the exit status is not inspected here
print(out)
print(err)
print("subprocess finished")
print("-" * 40)


/home/rte/re-imaging/sqlite-scripts
/mnt/hd-4tb/arXiv
b''
b''
subprocess finished
----------------------------------------


In [None]:
# use this to copy to other files, paste as variable
print(ids)

In [None]:
# testing writing filenames
# Writes one image id per line into a text file in the data-samples directory.

# os.chdir("/home/rte/re-imaging/sqlite-scripts/")
os.chdir("/home/rte/documentation/data-samples/")
print(os.getcwd())

# fname = "2018-10_" + catlist[0][0] + "_ids.txt"
fname = "random_montage_4x4_v2_ids.txt"
# the context manager closes the file even if a write fails part-way
with open(fname, "w") as f:
    for row in ids:
        f.write(str(row) + "\n")

In [None]:
for cat in catlist:
    print(cat[0])

In [None]:
print(len(filelist))

for row in filelist:
    print(row)

In [None]:
# testing writing filenames
# Writes the montage input list, one entry per line, into a text file named
# after the first category in catlist (relative to the current directory).

fname = "2018-10_" + catlist[0][0] + ".txt"
# the context manager closes the file even if a write fails part-way
with open(fname, "w") as f:
    for row in filelist:
        f.write(row + "\n")

In [None]:
print(targetDate[:7])

In [None]:
targetYM = targetDate[:7]
savepath = "/home/rte/Documents/documentation/data-samples/montages/category/" + targetYM + "/"

if os.path.isdir(savepath):
    print("directory exists, saving to: " + savepath)
else:
    try:
        os.makedirs(savepath)
    except OSError:
        print("failed to create directory: " + savepath)
    else:
        print("successfully created the directory: " + savepath)

In [None]:
print(len(catlist))

for row in catlist:
    print(row[0])

The following block of code takes the list of categories, queries the SQL database for the images that match each category within the month starting at `targetDate`, shuffles that list, selects a subset of up to 144 images, and uses it to run the `montage` command ^_^ 

In [None]:
# added date formatting
# For every category in catlist: fetch the images created in the month that
# starts at targetDate, shuffle them, take up to 144, write that list to a text
# file and build a 12-wide montage from it.
sql = ('''
    SELECT metadata.cat, images.path, images.filename, images.identifier, metadata.created
    FROM images
    LEFT JOIN metadata ON images.identifier = metadata.identifier
    WHERE metadata.created BETWEEN date(?) AND date(?, 'start of month','+1 month','-1 day')
    AND images.x != ''
    AND substr(trim(metadata.cat),1,instr(trim(metadata.cat)||' ',' ')-1) = ?
    ''')

targetYM = targetDate[:7]
savepath = "/home/rte/Documents/documentation/data-samples/montages/category/" + targetYM + "/"

if os.path.isdir(savepath):
    print("saving to: " + savepath)
else:
    try:
        # create the output directory that every file below is written into
        os.makedirs(savepath)
    except OSError:
        print("Failed to create directory: " + savepath)
    else:
        print("Successfully created the directory: " + savepath)

# The montage inputs below are relative ("src_all/..."), so they are resolved
# against this working directory.
# NOTE(review): another cell in this notebook runs montage on the same src_all
# prefix from /home/rte/arXiv/ - confirm which directory actually holds src_all.
os.chdir('/home/rte/re-imaging/sqlite-scripts/')

# os.chdir('src_all')
#     print(os.getcwd())

# format the arguments for montage (identical for every category)
arguments = shlex.split("-colorspace sRGB -units PixelsPerInch -density 300 -background white -alpha off -geometry 240x240+2+2 -tile 12x")

for cat in catlist:
    print("querying for category: " + str(cat[0]))
    c.execute(sql, (targetDate, targetDate, cat[0], ))
    rows = c.fetchall()

    print("total number of images found: " + str(len(rows)))

    # path and filename are stored in separate columns
    filepaths = [row[1] + '/' + row[2] for row in rows]

    # shuffle the whole list
    random.shuffle(filepaths)

    # this takes a slice of the larger shuffled list;
    # put the filepath into the list but add the directory, remove the dot
    # and also add [0] to only use the first page of multi-page image documents
    filelist = ["src_all" + filepath.replace('./','/') + '[0]'
                for filepath in itertools.islice(filepaths, 0, 144)]

    # write list of images to file (for debugging purposes, mostly)
    fname = savepath + targetYM + "_" + cat[0] + "_" + str(cat[1]) + ".txt"
    with open(fname, "w") as f:
        for row in filelist:
            f.write(row + "\n")

#     outputname = "test_py_montage.jpg"
    outputname = [savepath + "montage_" + targetYM + "_" + cat[0] + "_" + str(cat[1]) + ".jpg"]

    print("calling montage")
    # call the montage command and parse list of files and arguments
    montage_cmd = ["montage"] + filelist + arguments + outputname
#     print(montage_cmd)
    result = subprocess.Popen(montage_cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
    out, err = result.communicate()
    print(out)
    print(err)
    print("subprocess finished")
    print("-" * 40)

### Getting image montages for paper

In [31]:
# Get every nlin.CG image created between targetDate and endDate (all of 2012).
# Each returned row is a one-element tuple holding a path string built as
# '/mnt/hd2/images/all/' + images.id + '.jpg'. No LIMIT is applied here; the
# montage cell that follows shuffles the paths and uses the first 144.
# Note that this cell rebinds the notebook-wide targetDate.

targetDate = "2012-01-01"
endDate = "2012-12-31"

# earlier version (October 2012 only, random 144-id draw), kept for reference:
# sql = ("SELECT metadata.cat, images.path, images.filename, images.identifier, metadata.created, images.id "
#     "FROM images "
#     "LEFT JOIN metadata ON images.identifier = metadata.identifier "
#     "WHERE images.x != '' "
#     "AND metadata.created BETWEEN date('2012-10-01') "
#     "AND date('2012-10-31') "
#     "AND substr(trim(metadata.cat),1,instr(trim(metadata.cat)||' ',' ')-1) = ? "
#     "AND images.id IN (SELECT images.id FROM images ORDER BY RANDOM() LIMIT 144) ")

# c.execute(sql, ("nlin.CG", ))
# rows = c.fetchall()

sql = '''
    SELECT '/mnt/hd2/images/all/' || images.id || '.jpg'
    FROM images
    LEFT JOIN metadata ON images.identifier = metadata.identifier
    WHERE metadata.created BETWEEN date(?) 
    AND date(?)
    AND images.x != ''
    AND substr(trim(metadata.cat),1,instr(trim(metadata.cat)||' ',' ')-1) = ?
    '''
# parameters bind, in order: range start, range end, primary category
c.execute(sql, (targetDate, endDate, "nlin.CG"))
rows = c.fetchall()
print(len(rows))
for row in rows:
    print(row)

('/mnt/hd2/images/all/9082441.jpg',)
('/mnt/hd2/images/all/9082443.jpg',)
('/mnt/hd2/images/all/9082442.jpg',)
('/mnt/hd2/images/all/9059699.jpg',)
('/mnt/hd2/images/all/9059695.jpg',)
('/mnt/hd2/images/all/9059687.jpg',)
('/mnt/hd2/images/all/9059698.jpg',)
('/mnt/hd2/images/all/9059690.jpg',)
('/mnt/hd2/images/all/9059688.jpg',)
('/mnt/hd2/images/all/9059696.jpg',)
('/mnt/hd2/images/all/9059693.jpg',)
('/mnt/hd2/images/all/9059703.jpg',)
('/mnt/hd2/images/all/9059685.jpg',)
('/mnt/hd2/images/all/9059684.jpg',)
('/mnt/hd2/images/all/9059692.jpg',)
('/mnt/hd2/images/all/9059701.jpg',)
('/mnt/hd2/images/all/9059702.jpg',)
('/mnt/hd2/images/all/9059694.jpg',)
('/mnt/hd2/images/all/9059686.jpg',)
('/mnt/hd2/images/all/9059689.jpg',)
('/mnt/hd2/images/all/9059691.jpg',)
('/mnt/hd2/images/all/9059697.jpg',)
('/mnt/hd2/images/all/9059700.jpg',)
('/mnt/hd2/images/all/9059704.jpg',)
('/mnt/hd2/images/all/1050795.jpg',)
('/mnt/hd2/images/all/1050787.jpg',)
('/mnt/hd2/images/all/1050796.jpg',)
(

In [33]:
# Build a 12-wide montage from a random selection of up to 144 of the fetched
# image paths and print the captured stdout and stderr of the montage process.
filepaths = []
for row in rows:
    # each row is a one-element tuple; keep just the path string
    filepaths.append(row[0])
# print(filepaths)

random.shuffle(filepaths)

# NOTE: unlike the per-category cell, savepath here is the output FILE, not a directory
# savepath = "/home/rte/documentation/dataset-methods-paper/montages/" + "nlin.CG_" + targetDate + ".jpg"
savepath = "/home/rte/documentation/dataset-methods-paper/montages/" + "nlin.CG_all2012"+ ".jpg"

print(savepath)
arguments = shlex.split("-colorspace sRGB -units PixelsPerInch -density 300 -background white -alpha off -geometry 240x240+2+2 -tile 12x")

filelist = []
# this takes a slice of the larger shuffled list
for filepath in itertools.islice(filepaths, 0, 144):
    # add [0] to only use the first page of multi-page image documents;
    # the './' replacement only has an effect on paths that contain './'
    # (the absolute '/mnt/hd2/images/all/<id>.jpg' paths built by the query do not)
    filelist.append(filepath.replace('./','/') + '[0]')
print(filelist)        
        
# # write list of images to file (for debugging purposes, mostly)
# fname = "nlin.CG_2012" + ".txt"
# with open(fname, "w+") as f:
#     for row in filelist:
#         f.write(row + "\n")
        
# command line: montage <inputs...> <options...> <output file>
montage_cmd = ["montage"] + filelist + arguments + [savepath]

# run without a shell; communicate() waits for the process to finish
result = subprocess.Popen(montage_cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
out, err = result.communicate()
# out and err are bytes; the exit status is not inspected here
print(out)
print(err)
print("subprocess finished")
print("-" * 40)


/home/rte/documentation/dataset-methods-paper/montages/nlin.CG_all2012.jpg
['/mnt/hd2/images/all/1057910.jpg[0]', '/mnt/hd2/images/all/4271058.jpg[0]', '/mnt/hd2/images/all/4421589.jpg[0]', '/mnt/hd2/images/all/4015104.jpg[0]', '/mnt/hd2/images/all/6647740.jpg[0]', '/mnt/hd2/images/all/6647748.jpg[0]', '/mnt/hd2/images/all/6647747.jpg[0]', '/mnt/hd2/images/all/4015137.jpg[0]', '/mnt/hd2/images/all/2646354.jpg[0]', '/mnt/hd2/images/all/9059700.jpg[0]', '/mnt/hd2/images/all/4271079.jpg[0]', '/mnt/hd2/images/all/2666562.jpg[0]', '/mnt/hd2/images/all/1050787.jpg[0]', '/mnt/hd2/images/all/1057902.jpg[0]', '/mnt/hd2/images/all/6633287.jpg[0]', '/mnt/hd2/images/all/2646352.jpg[0]', '/mnt/hd2/images/all/9059688.jpg[0]', '/mnt/hd2/images/all/1057905.jpg[0]', '/mnt/hd2/images/all/4263892.jpg[0]', '/mnt/hd2/images/all/1057889.jpg[0]', '/mnt/hd2/images/all/4015116.jpg[0]', '/mnt/hd2/images/all/4400318.jpg[0]', '/mnt/hd2/images/all/4271083.jpg[0]', '/mnt/hd2/images/all/4271064.jpg[0]', '/mnt/hd2/im