# Query database

Notebook for querying the database using Python's sqlite3 module.

Specific code blocks run separately to produce statistics.

## Setup:

In [None]:
import matplotlib.pyplot as plt
import numpy as np

In [None]:
import sqlite3
import re
import json

In [None]:
# path on rte
db_path = "/home/rte/data/db/arxiv_db_images.sqlite3"

In [None]:
# Here we import the sqlite3 database and create a cursor

db = sqlite3.connect(db_path)
c = db.cursor()

Get the pragma table info for each table

In [None]:
c.execute('PRAGMA TABLE_INFO({})'.format("metadata"))
info = c.fetchall()

print("\nColumn Info:\nID, Name, Type, NotNull, DefaultVal, PrimaryKey")
for col in info:
    print(col)

In [None]:
c.execute('PRAGMA TABLE_INFO({})'.format("images"))
info = c.fetchall()

print("\nColumn Info:\nID, Name, Type, NotNull, DefaultVal, PrimaryKey")
for col in info:
    print(col)

Quick test to retrieve one row

In [None]:
c.execute('''
    SELECT * 
    FROM images 
    ORDER BY ROWID ASC 
    LIMIT 1
''')
rows = c.fetchall()
for row in rows:
    print(row)

In [None]:
c.execute('''
    SELECT * 
    FROM metadata 
    ORDER BY ROWID ASC 
    LIMIT 1
''')
rows = c.fetchall()
for row in rows:
    print(row)

## Writing to org mode

Find a few random entries from both metadata and images, write to org mode

In [None]:
# Sample three random metadata rows and write them out as an org-mode table.
c.execute('''
    SELECT * 
    FROM metadata 
    ORDER BY RANDOM() 
    LIMIT 3
''')
rows = c.fetchall()
# Explicit UTF-8 so non-ASCII text in the rows cannot fail under a
# non-UTF-8 default locale encoding. The with-statement closes the file.
with open("metadata_sample.org", "w", encoding="utf-8") as write_file:
    for row in rows:
        # One table line per record: |cell|cell|...| with embedded newlines
        # flattened so each record stays on a single line. Writing whole
        # lines avoids leaving a dangling "|" (an empty table row) at the end.
        cells = (str(item).replace("\n", " ") for item in row)
        print("|" + "|".join(cells) + "|", file=write_file)

In [None]:
# Sample three random image rows and write them out as an org-mode table.
c.execute('''
    SELECT * 
    FROM images 
    ORDER BY RANDOM() 
    LIMIT 3
''')
rows = c.fetchall()
# Explicit UTF-8 so non-ASCII text in the rows cannot fail under a
# non-UTF-8 default locale encoding. The with-statement closes the file.
with open("images_sample.org", "w", encoding="utf-8") as write_file:
    for row in rows:
        # One table line per record: |cell|cell|...| with embedded newlines
        # flattened so each record stays on a single line. Writing whole
        # lines avoids leaving a dangling "|" (an empty table row) at the end.
        cells = (str(item).replace("\n", " ") for item in row)
        print("|" + "|".join(cells) + "|", file=write_file)

In [None]:
# Write whatever is currently held in `rows` to an org-mode table.
filename = "metadata_export.org"

# Explicit UTF-8 so non-ASCII text in the rows cannot fail under a
# non-UTF-8 default locale encoding. The with-statement closes the file.
with open(filename, "w", encoding="utf-8") as write_file:
    for row in rows:
        # One table line per record: |cell|cell|...| with embedded newlines
        # flattened so each record stays on a single line. Writing whole
        # lines avoids leaving a dangling "|" (an empty table row) at the end.
        cells = (str(item).replace("\n", " ") for item in row)
        print("|" + "|".join(cells) + "|", file=write_file)

## Database Queries

### Find any duplicate identifiers

In [None]:
# look for duplicate rows in the metadata table
c.execute('''
    SELECT identifier, COUNT(identifier)
    FROM metadata
    GROUP BY identifier
    HAVING COUNT(identifier) > 1
    ''')
rows = c.fetchall()
for row in rows:
    print(row)

In [None]:
duplicates = rows

In [None]:
# For every duplicated identifier, fetch all of its metadata rows and report
# those whose contents (everything except the row id) are not all identical.
sql = '''
    SELECT id, identifier, created, cat, authors, title
    FROM metadata
    WHERE identifier IS ?
'''

for dup in duplicates:
    identifier = dup[0]
    c.execute(sql, (identifier,))
    rows = c.fetchall()

    # Compare every later row against the first one, skipping column 0 (id).
    reference = rows[0][1:]
    mismatch = any(other[1:] != reference for other in rows[1:])
    if mismatch:
        print("-----", identifier)
        for row in rows:
            print(row)
        print("----- !!! mismatch !!! -----")

In [None]:
print(len(rows))

In [None]:
# look for duplicate rows in the images table (brings up totals for images by article ID)
c.execute('''
    SELECT identifier, COUNT(identifier)
    FROM images
    GROUP BY identifier
    HAVING COUNT(identifier) > 0
    ''')
rows = c.fetchall()
for row in rows:
    print(row)

In [None]:
# look for duplicate images (search by filename, filesize, and identifier)
# there is quite a lot of these, but that is how the source data was uploaded
c.execute('''
    SELECT identifier, filename, filesize, path, COUNT(*) c
    FROM images
    GROUP BY identifier, filename, filesize
    HAVING c > 1
    ''')
rows = c.fetchall()
for row in rows:
    print(row)

In [None]:
print(len(rows))

In [None]:
# Count how often each filename, filesize and path value occurs across the
# images table, most frequent first.
#
# SQLite does not accept parenthesised SELECTs as the operands of a compound
# (UNION ALL) query, so the three SELECTs are chained without parentheses
# inside a single subquery.
c.execute('''
    SELECT info, COUNT(*)
    FROM (
        SELECT filename AS info FROM images
        UNION ALL
        SELECT filesize AS info FROM images
        UNION ALL
        SELECT path AS info FROM images
    ) t
    GROUP BY info
    ORDER BY COUNT(*) DESC
    ''')
rows = c.fetchall()
for row in rows:
    print(row)

In [None]:
print(len(rows))

### Earliest dates

In [None]:
# Find the earliest date of an article

c.execute('''
    SELECT created, identifier
    FROM metadata
    WHERE created IS NOT NULL
    ORDER BY created ASC
    LIMIT 20
    ''')
rows = c.fetchall()
for row in rows:
    print(row)

In [None]:
# Find the earliest dated articles and get the associated images

c.execute('''
    SELECT images.identifier, metadata.created, metadata.identifier 
    FROM images
    LEFT JOIN metadata ON images.identifier = metadata.identifier
    WHERE metadata.created IS NOT NULL
    ORDER BY created ASC
    LIMIT 40
    ''')
    
rows = c.fetchall()
for row in rows:
    print(row)

### Total numbers of articles and images

In [None]:
# Find total number of rows in metadata

c.execute('''
    SELECT count(*)
    FROM metadata
    ''')
rows = c.fetchall()
for row in rows:
    print(row)

In [None]:
# Find total number of rows in images

c.execute('''
    SELECT count(*)
    FROM images
    ''')
rows = c.fetchall()
for row in rows:
    print(row)

In [None]:
# Find total number of rows in images (filtering out images without size or imageformat)

c.execute('''
    SELECT count(*)
    FROM images
    WHERE x IS NOT null AND x != ''
    AND y IS NOT null AND y != ''
    AND imageformat is not null AND imageformat != ''
    ''')
rows = c.fetchall()
for row in rows:
    print(row)
print(len(rows))    

### Average image sizes

In [None]:
# Get average of the x size column

c.execute('''
    SELECT avg(x)
    FROM images
    ''')
rows = c.fetchall()
for row in rows:
    print(row)

In [None]:
# Get average of the y size column

c.execute('''
    SELECT avg(y)
    FROM images
    ''')
rows = c.fetchall()
for row in rows:
    print(row)

In [None]:
# Get median of the x size column
#
# Rows with a missing x (NULL or '') are excluded from both the ordering and
# the row count used for the offset; otherwise NULLs, which SQLite sorts
# first, would shift the midpoint towards smaller values.
# For an even number of rows this picks the upper of the two middle values.

c.execute('''
    SELECT x
    FROM images
    WHERE x IS NOT NULL AND x != ''
    ORDER BY x
    LIMIT 1
    OFFSET (SELECT COUNT(*) FROM images WHERE x IS NOT NULL AND x != '') / 2
    ''')
rows = c.fetchall()
for row in rows:
    print(row)

In [None]:
# Get median of the y size column
#
# Rows with a missing y (NULL or '') are excluded from both the ordering and
# the row count used for the offset; otherwise NULLs, which SQLite sorts
# first, would shift the midpoint towards smaller values.
# For an even number of rows this picks the upper of the two middle values.

c.execute('''
    SELECT y
    FROM images
    WHERE y IS NOT NULL AND y != ''
    ORDER BY y
    LIMIT 1
    OFFSET (SELECT COUNT(*) FROM images WHERE y IS NOT NULL AND y != '') / 2
    ''')
rows = c.fetchall()
for row in rows:
    print(row)

### Missing fields

In [None]:
# Find where there are NULL values for x

c.execute('''
    SELECT id, identifier, path, filename, filesize, x, y, imageformat
    FROM images
    WHERE x is null or x = ''
    OR y is null or y = ''
    OR imageformat is null or imageformat = ''
    ''')
rows = c.fetchall()
for row in rows:
    print(row)

In [None]:
# generate a text file listing paths of files that couldn't be identified
with open("identify_errors_db.txt", "a+") as f:
    for row in rows[:]:
        filepath = row[2][1:] + "/" + row[3]
        print(str(row[0]) + "," + filepath)
        f.write(str(row[0]) + "," + filepath + "\n")


In [None]:
print(len(rows))

In [None]:
# Find where there are NULL values for y

c.execute('''
    SELECT id, identifier, path, filename, filesize, x, y, imageformat
    FROM images
    WHERE y is null or y = ''
    ''')
rows = c.fetchall()
for row in rows:
    print(row)

In [None]:
print(len(rows))

In [None]:
# Find where there are NULL values for identifier

c.execute('''
    SELECT id, identifier, path, filesize, x, y, imageformat
    FROM images
    WHERE identifier is null or identifier = ''
    ''')
rows = c.fetchall()
for row in rows:
    print(row)
    
# no result is a good result!

In [None]:
# Find where there are NULL values for filesize

c.execute('''
    SELECT id, identifier, path, filesize, x, y, imageformat
    FROM images
    WHERE filesize is null or filesize = ''
    ''')
rows = c.fetchall()
for row in rows:
    print(row)

In [None]:
print(len(rows))

In [None]:
# Get number of articles with no category
# this seems to give no results

c.execute('''
    SELECT identifier, cat
    FROM metadata
    WHERE cat IS NULL OR cat = '' OR cat = 'None'
    ''')
rows = c.fetchall()
for row in rows:
    print(row)

In [None]:
# Find where there is no matching metadata for a given image

c.execute('''
    SELECT count(images.identifier), images.identifier
    FROM images
    LEFT JOIN metadata ON metadata.identifier = images.identifier
    WHERE metadata.identifier IS NULL
    GROUP BY images.identifier
    ''')
rows = c.fetchall()
for row in rows:
    print(row)

In [None]:
# Find where there is no identifier for an image

c.execute('''
    SELECT identifier, id, filename, filesize, path, x, y
    FROM images
    WHERE identifier IS NULL OR identifier = ''
    ''')
rows = c.fetchall()
for row in rows:
    print(row)

In [None]:
# More looking for specific entries with no identifier

c.execute('''
    SELECT identifier, id
    FROM images
    WHERE identifier IS NULL OR identifier = ''
    ''')
rows = c.fetchall()
for row in rows:
    print(row)

In [None]:
print(sum(row[0] for row in rows))

### Averages of images

In [None]:
# Get total number of images per article

c.execute('''
    SELECT images.identifier, metadata.cat, count(images.identifier)    
    FROM images 
    LEFT JOIN metadata ON images.identifier = metadata.identifier 
    GROUP BY images.identifier
    ORDER BY count(images.identifier)
    ''')
rows = c.fetchall()
for row in rows:
    print(row)

In [None]:
# Get total number of images per category

c.execute('''
    SELECT metadata.cat, count(images.identifier)
    FROM images
    LEFT JOIN metadata ON images.identifier = metadata.identifier 
    GROUP BY metadata.cat
    ORDER BY count(images.identifier) DESC    
    LIMIT 200
    ''')
rows = c.fetchall()
for row in rows:
    print(row)

In [None]:
# Get total number of images per primary category only

c.execute('''
    SELECT substr(trim(metadata.cat),1,instr(trim(metadata.cat)||' ',' ')-1), count(images.identifier)
    FROM images
    LEFT JOIN metadata ON images.identifier = metadata.identifier 
    GROUP BY substr(trim(metadata.cat),1,instr(trim(metadata.cat)||' ',' ')-1)
    ORDER BY count(images.identifier) DESC    
    ''')
rows = c.fetchall()
for row in rows:
    print(row)

In [None]:
# Write whatever is currently held in `rows` as an org-mode style table.
fname = "stats_images_by_category.txt"
# Explicit UTF-8 so non-ASCII text in the rows cannot fail under a
# non-UTF-8 default locale encoding. The with-statement closes the file.
with open(fname, "w", encoding="utf-8") as write_file:
    for row in rows:
        # One table line per record: |cell|cell|...| with embedded newlines
        # flattened so each record stays on a single line. Writing whole
        # lines avoids leaving a dangling "|" (an empty table row) at the end.
        cells = (str(item).replace("\n", " ") for item in row)
        print("|" + "|".join(cells) + "|", file=write_file)

In [None]:
# write text file: one "category, count" line per row

fname = "stats_images_by_category.txt"
# The with-statement closes the file even if a write fails.
with open(fname, "w", encoding="utf-8") as f:
    for row in rows:
        # Format both fields rather than concatenating: row[0] is None for
        # images without a category, and None + str raises TypeError.
        f.write(f"{row[0]}, {row[1]}\n")

In [None]:
print(len(rows))

In [None]:
# Get total number of images per primary category only - with a specific filter

c.execute('''
    SELECT substr(trim(metadata.cat),1,instr(trim(metadata.cat)||' ',' ')-1) c, count(images.identifier)
    FROM images
    LEFT JOIN metadata ON images.identifier = metadata.identifier
    WHERE c = "cond-mat.mes-hall"
    GROUP BY c
    ORDER BY count(images.identifier) DESC    
    ''')
rows = c.fetchall()
for row in rows:
    print(row)

In [None]:
# Find total number of images in a given category and year

c.execute('''
    SELECT substr(trim(metadata.cat),1,instr(trim(metadata.cat)||' ',' ')-1) c, 
            count(images.identifier), 
            strftime("%Y", metadata.created) as 'Y'
    FROM images
    LEFT JOIN metadata ON images.identifier = metadata.identifier
    WHERE c = "cs.CV"
    AND Y = '2019'
    ''')
rows = c.fetchall()
for row in rows:
    print(row)
#         GROUP BY c
#     ORDER BY count(images.identifier) DESC    


In [None]:
# Get total number of images per primary category with multiple filters

c.execute('''
    SELECT substr(trim(metadata.cat),1,instr(trim(metadata.cat)||' ',' ')-1) c, count(images.identifier)
    FROM images
    LEFT JOIN metadata ON images.identifier = metadata.identifier
    WHERE c = "cs.LG"
    OR c = "cs.CV"
    OR c = "stat.ML"
    OR c = "cs.AI"
    GROUP BY c
    ORDER BY count(images.identifier) DESC    
    ''')
rows = c.fetchall()
for row in rows:
    print(row)

In [None]:
# Get total number of images per primary category

c.execute('''
    SELECT substr(trim(metadata.cat),1,instr(trim(metadata.cat)||' ',' ')-1), count(images.identifier)
    FROM images
    LEFT JOIN metadata ON images.identifier = metadata.identifier
    GROUP BY substr(trim(metadata.cat),1,instr(trim(metadata.cat)||' ',' ')-1)
    ORDER BY count(images.identifier) DESC    
    ''')
rows = c.fetchall()
for row in rows:
    print(row)

In [None]:
for row in rows:
    print(row[0])
for row in rows:
    print(row[1])

In [None]:
print(len(rows))

### Totals by month/year

In [None]:
# Find number of articles by year

c.execute('''
    SELECT count(metadata.identifier), strftime("%Y", metadata.created) as 'Y'
    FROM metadata
    GROUP BY strftime("%Y", metadata.created)
    ORDER BY strftime("%Y", metadata.created) ASC
    ''')
rows = c.fetchall()
for row in rows:
    print(row)

In [None]:
# Find number of articles in each month/year

# Sort on year-then-month; ordering on the year alone leaves the months
# within each year in an unspecified order.
c.execute('''
    SELECT count(metadata.identifier), strftime("%m-%Y", metadata.created) as 'mY'
    FROM metadata
    GROUP BY strftime("%m-%Y", metadata.created)
    ORDER BY strftime("%Y-%m", metadata.created) ASC
    ''')
rows = c.fetchall()
for row in rows:
    print(row)

In [None]:
# Find number of images by year

c.execute('''
    SELECT count(images.identifier), strftime("%Y", metadata.created) as 'Y'
    FROM images
    LEFT JOIN metadata ON images.identifier = metadata.identifier 
    GROUP BY strftime("%Y", metadata.created)
    ORDER BY strftime("%Y", metadata.created) ASC
    ''')
rows = c.fetchall()
for row in rows:
    print(row)

In [None]:
# Find number of images in each month/year

# Sort on year-then-month; ordering on the year alone leaves the months
# within each year in an unspecified order.
c.execute('''
    SELECT count(images.identifier), strftime("%m-%Y", metadata.created) as 'mY'
    FROM images
    LEFT JOIN metadata ON images.identifier = metadata.identifier
    GROUP BY strftime("%m-%Y", metadata.created)
    ORDER BY strftime("%Y-%m", metadata.created) ASC
    ''')
rows = c.fetchall()
for row in rows:
    print(row)

In [None]:
total = 0
for row in rows:
    total += row[0]
print(total)

#### ...and by category

In [None]:
# Find number of articles by year by category

c.execute('''
    SELECT count(metadata.identifier), strftime("%Y", metadata.created) as 'Y'
    FROM metadata
    WHERE substr(trim(cat),1,instr(trim(cat)||' ',' ')-1) = 'cs.CV'
    GROUP BY strftime("%Y", metadata.created)
    ORDER BY strftime("%Y", metadata.created) ASC
    ''')
rows = c.fetchall()
for row in rows:
    print(row)

In [None]:
# Find number of articles by year by category, for a specific query

c.execute('''
    SELECT count(metadata.identifier), strftime("%Y", metadata.created) as 'Y'
    FROM metadata
    WHERE substr(trim(cat),1,instr(trim(cat)||' ',' ')-1) = 'cs.CV'
    AND strftime("%Y", metadata.created) = '2018'
    GROUP BY strftime("%Y", metadata.created)
    ORDER BY strftime("%Y", metadata.created) ASC
    ''')
rows = c.fetchall()
for row in rows:
    print(row)

In [None]:
# Find number of articles by year by category, for a specific query

c.execute('''
    SELECT count(metadata.identifier), strftime("%Y", metadata.created) as 'Y'
    FROM metadata
    WHERE substr(trim(cat),1,instr(trim(cat)||' ',' ')-1) = 'cs.CV'
    AND strftime("%Y", metadata.created) = '2012'
    GROUP BY strftime("%Y", metadata.created)
    ORDER BY strftime("%Y", metadata.created) ASC
    ''')
rows = c.fetchall()
for row in rows:
    print(row)

In [None]:
print(len(rows))

In [None]:
# values = np.array()
values = [item[0] for item in rows]
plt.plot(values)

### Other metadata

In [None]:
# Find number of images per each different imageformat

c.execute('''
    SELECT imageformat, count(imageformat)
    FROM images
    GROUP BY imageformat
    ORDER BY count(imageformat) DESC
    ''')
rows = c.fetchall()
for row in rows:
    print(row)

In [None]:
# Get only the first listed category

c.execute('''
    SELECT identifier, cat, substr(trim(cat),1,instr(trim(cat)||' ',' ')-1)
    FROM metadata
    LIMIT 20
    ''')
rows = c.fetchall()
for row in rows:
    print(row)

In [None]:
# Get the licence information for number of articles

# COUNT(*) is used so that articles without a licence (NULL) are counted too;
# COUNT(licence) skips NULLs and would report that group as 0.
c.execute('''
    SELECT licence, COUNT(*)
    FROM metadata
    GROUP BY licence
    HAVING COUNT(identifier) > 0
    ''')
rows = c.fetchall()
for row in rows:
    print(row)

In [None]:
# Get the number of images per licence

# Count the images themselves rather than the licence column: COUNT skips
# NULLs, so counting metadata.licence reports 0 for every image whose article
# has no licence (or no metadata row at all).
c.execute('''
    SELECT metadata.licence, COUNT(images.identifier)
    FROM images
    LEFT JOIN metadata ON images.identifier = metadata.identifier
    GROUP BY metadata.licence
    HAVING COUNT(images.identifier) > 0
    ''')
rows = c.fetchall()
for row in rows:
    print(row)

In [None]:
# Get number of articles for each primary category

c.execute('''
    SELECT count(substr(trim(cat),1,instr(trim(cat)||' ',' ')-1)) AS cnt, substr(trim(cat),1,instr(trim(cat)||' ',' ')-1)
    FROM metadata
    GROUP BY substr(trim(cat),1,instr(trim(cat)||' ',' ')-1)
    ORDER BY cnt DESC
    ''')
rows = c.fetchall()
for row in rows:
    print(row[0])
for row in rows:
    print(row[1])

### Querying by image extension

In [None]:
# define reverse function
# SQLite hands NULL values to Python as None, which cannot be sliced; pass
# NULL through unchanged instead of raising inside the running query.
db.create_function("reverse", 1, lambda s: None if s is None else s[::-1])

In [None]:
# Get the number of images per extension
# DON'T filter based on X or Y dimension

c.execute('''
    SELECT COUNT(reverse(substr(reverse(filename),1,instr(reverse(filename),'.')-1))), reverse(substr(reverse(filename),1,instr(reverse(filename),'.')-1)) AS extension
    FROM images
    GROUP BY extension 
    ''')

rows = c.fetchall()
for row in rows:
    print(row)

In [None]:
# Get the number of images per extension
# filter based on X or Y dimension

c.execute('''
    SELECT COUNT(reverse(substr(reverse(filename),1,instr(reverse(filename),'.')-1))), reverse(substr(reverse(filename),1,instr(reverse(filename),'.')-1)) AS extension
    FROM images
    WHERE x is not null and x != ''
    AND y is not null and y != ''
    AND imageformat is not null and imageformat != ''
    GROUP BY extension
    ''')

rows = c.fetchall()
for row in rows:
    print(row)

In [None]:
# Get the number of images per extension by year
# filter based on X or Y dimension
# 2018 only

c.execute('''
    SELECT reverse(substr(reverse(filename),1,instr(reverse(filename),'.')-1)) AS extension, COUNT(reverse(substr(reverse(filename),1,instr(reverse(filename),'.')-1)))
    FROM images
    LEFT JOIN metadata ON images.identifier = metadata.identifier
    WHERE x is not null and x != ''
    AND y is not null and y != ''
    AND imageformat is not null and imageformat != ''
    AND strftime("%Y", metadata.created) = '2018'
    GROUP BY extension
    ''')

rows = c.fetchall()
for row in rows:
    print(row)

In [None]:
# Get the number of images per extension
# filter based on X or Y dimension

c.execute('''
    SELECT reverse(substr(reverse(filename),1,instr(reverse(filename),'.')-1)) AS extension, COUNT(reverse(substr(reverse(filename),1,instr(reverse(filename),'.')-1)))
    FROM images
    LEFT JOIN metadata ON images.identifier = metadata.identifier
    WHERE x is not null and x != ''
    AND y is not null and y != ''
    AND imageformat is not null and imageformat != ''
    GROUP BY extension
    ''')

rows = c.fetchall()
for row in rows:
    print(row)
    
total = 0
for row in rows:
    total += row[1]
print("total:", total)

print("*" * 20)

alldata = []
for i, row in enumerate(rows):
    alldata.append([row[0], row[1], row[1]/total])
for d in alldata:
    print("{} | {} | {:2.2%}".format(d[0], d[1], d[2]))

### Get extensions by years and get statistics for changing percentages

In [None]:
# Get list of years

c.execute('''
    SELECT strftime("%Y", metadata.created), COUNT(strftime("%Y", metadata.created))
    FROM metadata
    GROUP BY strftime("%Y", metadata.created)
    ''')

years = []

rows = c.fetchall()
for row in rows:
    print(row)
    years.append(row[0])
print(years)

In [None]:
# Get the number of images per extension by year
# Only images with a non-empty x, y and imageformat are counted.
# For each entry of `years` this prints the year, the total number of
# matching images, and one "extension | count | share" line per extension.

data = []

# manually selecting years
# years = ["2000", "2005", "2010", "2015"]

# Register the Python-side reverse() helper that the SQL below calls.
db.create_function("reverse", 1, lambda s: s[::-1])
# The extension is taken as the text after the last '.' in the filename:
# reverse the name, cut up to the first '.', then reverse back.
sql = ('''
    SELECT reverse(substr(reverse(filename),1,instr(reverse(filename),'.')-1)) AS extension, COUNT(reverse(substr(reverse(filename),1,instr(reverse(filename),'.')-1)))
    FROM images
    LEFT JOIN metadata ON images.identifier = metadata.identifier
    WHERE x is not null and x != ''
    AND y is not null and y != ''
    AND imageformat is not null and imageformat != ''
    AND strftime("%Y", metadata.created) = ?
    GROUP BY extension
    ''')

# data = []

for y, year in enumerate(years):
    print("*" * 20)
    print(year)
    print("*" * 20)
    c.execute(sql, (year, ))

    rows = c.fetchall()
#     for row in rows:
#         print(row)
    # Total number of images for this year, summed over all extensions.
    total = 0
    for row in rows:
        total += row[1]
    print("total:", total)

    # NOTE(review): `data` is reset on every iteration, so after the loop it
    # (like `rows` and `total`) only describes the last year processed.
    # Confirm whether results were meant to accumulate across years.
    data = []
    for i, row in enumerate(rows):
        data.append([row[0], row[1], row[1]/total])
    for d in data:
        print("{} | {} | {:2.2%}".format(d[0], d[1], d[2]))
#     print(data)
        

In [None]:
for d in data:
    print(d)

In [None]:
# get proportion as percentage

total = 0
for row in rows:
    total += row[1]
print(total)

data = []
for row in rows:
    data.append([row[0], row[1], row[1]/total])
print(data)

In [None]:
def takeSecond(elem):
    """Sort key: return the second field of an entry."""
    return elem[1]


# Order the entries in place by their second field, smallest first, then
# print each one as "name | count | share".
data.sort(key=takeSecond)

for d in data:
    print(f"{d[0]} | {d[1]} | {d[2]:2.2%}")

In [None]:
values = []
names = []
for row in rows:
    values.append(row[0])
    names.append(row[1])

In [None]:
# Keep independent snapshots: plain assignment would only alias the lists, so
# any later in-place edit of `values` or `names` would silently change the
# "saved" versions as well.
values_save = list(values)
names_save = list(names)

In [None]:
for i, name in enumerate(names.copy()):
    names[i] = name.lower()
print(names)

In [None]:
print(values)
print(names)
print(len(values))
print(len(names))

In [None]:
total = 0
for row in rows:
    total += row[0]
print(total)

In [None]:
# writing file to org mode for github

# write the data to a file
with open("stats_articles_by_cat.org", "w") as write_file:
    print("* number of articles by category", file=write_file)
    print("|-|-|", file=write_file)
    for cat in rows:
#         joined = list(zip(cat[1], cat[2]))
        #     print(joined)
        print('|' + str(cat[0]) + "|" + str(cat[1]) + "|", file=write_file)
    print("|-|-|", file=write_file)
write_file.close()

#### Image metadata

In [None]:
# Get the number of images per creator

c.execute('''
    SELECT images.creator, COUNT(images.creator)
    FROM images
    GROUP BY images.creator
    ORDER BY COUNT(images.creator) DESC
    ''')
rows = c.fetchall()
for row in rows:
    print(row)

In [None]:
# Get the number of images per creator, split by the year of the article's
# `created` date. Each row is (creator, count, year), ordered by creator.
# COUNT(images.creator) skips NULLs, so a NULL-creator group reports 0.

c.execute('''
    SELECT images.creator, COUNT(images.creator), strftime("%Y", metadata.created)
    FROM images
    LEFT JOIN metadata ON images.identifier = metadata.identifier 
    GROUP BY images.creator, strftime("%Y", metadata.created)
    ORDER BY images.creator ASC
    ''')
rows = c.fetchall()
for row in rows:
    print(row)

In [None]:
print(len(rows))
print(rows[:20])

In [None]:
# remove the ® symbol
# Expects `rows` to hold (creator, count) pairs; builds `creators` as a list
# of [creator, count] with the symbol stripped from the creator name.
creators = []

for creator, count in rows:
    # A NULL creator arrives as None and has no text to clean; testing
    # `"®" in None` would raise TypeError. Presumably `rows` comes from a
    # COUNT(images.creator) query, where the NULL group always counts 0, so
    # skipping it loses no images — verify if rows come from elsewhere.
    if creator is None:
        continue
    if "®" in creator:
        print("found symbol")
        print(creator)
        new_creator = creator.replace("®", "")
        print(new_creator)
        creators.append([new_creator, count])
    else:
        creators.append([creator, count])
# for creator, count in rows:
#     print(creator)


In [None]:
# version with years
# remove the ® symbol
# Expects `rows` to hold (creator, count, year) triples; builds `creators` as
# a list of [creator, count, year] with the symbol stripped from the name.
creators = []

for creator, count, year in rows:
    # A NULL creator arrives as None and has no text to clean; testing
    # `"®" in None` would raise TypeError. Presumably `rows` comes from a
    # COUNT(images.creator) query, where NULL groups always count 0, so
    # skipping them loses no images — verify if rows come from elsewhere.
    if creator is None:
        continue
    if "®" in creator:
        print("found symbol")
        print(creator)
        new_creator = creator.replace("®", "")
        print(new_creator)
        creators.append([new_creator, count, year])
    else:
        creators.append([creator, count, year])
# for creator, count in rows:
#     print(creator)


In [None]:
for creator, total, year in creators[:50]:
    print(creator, total, year)

In [None]:
print(creators[:10])

In [None]:
cleaned = []

regex = r'\n?\s?(?:http://|www.|edited with|created with the |esp|afpl|windows|adobe|gpl\s|apple|wolfram|gnu\s|microsoft office |microsoft|the\s|version|v\.|v\s)?\s?((?:[\w\/]+(?:[\-]?(?!\d)))+)(?:\sversion|v\.|v\ |\,|\n|\s|\-|\.|\(|\))?\s?'

# regex = r'([\S\s]+?)\s?(?:version|wolfram|adobe|gpl|gnu|microsoft|the|v\.|v\s||www|http|\s\d)(?:\b\s[\S\s]*)?'
# regex = r'(\\d+(\\.\\d*)*)|(version)|wolfram|adobe|gpl|gnu |microsoft|the |v\\.|[:punct:]|www|http'
# (\d+(\.\d*)*)|version|wolfram|adobe|gpl|gnu|microsoft|the|v\.|[\s]+|www|http

for creator, total in creators[:]:
    print(creator + " | " + str(total))
    match = re.search(regex, creator, re.IGNORECASE)
    if match:
        cleaned.append([match.group(1), total])
        print(">>>",match.group(1))
    else:
        cleaned.append([creator, total])
# print(cleaned)

In [None]:
# version with years
cleaned = []

regex = r'\n?\s?(?:http://|www.|edited with|created with the |esp|afpl|windows|adobe|gpl\s|apple|wolfram|gnu\s|microsoft office |microsoft|the\s|version|v\.|v\s)?\s?((?:[\w\/]+(?:[\-]?(?!\d)))+)(?:\sversion|v\.|v\ |\,|\n|\s|\-|\.|\(|\))?\s?'

# regex = r'([\S\s]+?)\s?(?:version|wolfram|adobe|gpl|gnu|microsoft|the|v\.|v\s||www|http|\s\d)(?:\b\s[\S\s]*)?'
# regex = r'(\\d+(\\.\\d*)*)|(version)|wolfram|adobe|gpl|gnu |microsoft|the |v\\.|[:punct:]|www|http'
# (\d+(\.\d*)*)|version|wolfram|adobe|gpl|gnu|microsoft|the|v\.|[\s]+|www|http

for creator, total, year in creators[:]:
    print(f'{creator} | {total} | {year}')
    match = re.search(regex, creator, re.IGNORECASE)
    if match:
        cleaned.append([match.group(1), total, year])
        print(">>>",match.group(1))
    else:
        cleaned.append([creator, total, year])
# print(cleaned)

In [None]:
for row in cleaned[50:100]:
    print(row)

In [None]:
with open('creator_cleaned.json', 'w') as outfile:
    json.dump(cleaned, outfile)

In [None]:
with open('creator_cleaned_years.json', 'w') as outfile:
    json.dump(cleaned, outfile)

In [None]:
print(cleaned[:10])

In [None]:
print(len(cleaned))

In [None]:
def sum_duplicates(a, ind):
    """Merge every duplicate of the row at index ``ind`` into that row, in place.

    Each row is a list whose first element is the creator name and whose
    second element is a numeric total (further elements are ignored). Every
    other row whose creator equals ``a[ind][0]`` (exact, case-sensitive
    comparison) has its total added to the row at ``ind`` and is then
    removed from ``a``. If a duplicate sits before ``ind``, the merged row
    ends up at a lower index after the removal.

    Args:
        a: list of ``[creator, total, ...]`` rows; modified in place.
        ind: index of the row that absorbs its duplicates.

    Returns:
        1 when ``ind`` is at or past the end of ``a`` (nothing to do),
        otherwise 0.
    """
    length = len(a)
    print("length:",length)

    # `>=` rather than `==` so an index past the end stops cleanly instead of
    # raising IndexError.
    if ind >= length:
        print("ind == length")
        print("ending!")
        return 1

    # Hold on to the row object itself: its index may shift while duplicates
    # are deleted, but the reference stays valid.
    keeper = a[ind]
    target = keeper[0]
    print("target:",target)

    # Collect ALL duplicates first (the list must not be reset per iteration).
    removables = [
        i for i, row in enumerate(a)
        if i != ind and row[0] == target
    ]
    print("removables:",removables)

    # Delete from the back so the remaining indices stay valid.
    for rem in reversed(removables):
        keeper[1] += a[rem][1]
        del a[rem]
    return 0

In [None]:
# Read the saved rows back from creator_cleaned.json into `full_list`.
with open('creator_cleaned.json') as infile:
    full_list = json.loads(infile.read())

In [None]:
# Display the loaded rows as the cell output.
full_list

In [None]:
# sum up totals
# Merge rows whose creator names differ only by letter case and add up their
# totals. The first spelling encountered is kept and rows stay in
# first-appearance order, so `summed` matches what the earlier
# scan-and-delete loop produced, but in a single pass and without its
# per-step debug output.

with open('creator_cleaned.json') as infile:
    cleaned = json.load(infile)

merged = {}  # lower-cased creator -> [creator, total]
for creator, total in cleaned:
    key = creator.lower()
    if key in merged:
        merged[key][1] += total
    else:
        # build a fresh row so the rows in `cleaned` are left untouched
        merged[key] = [creator, total]

# dicts preserve insertion order, so this is first-appearance order
summed = list(merged.values())
print("length:", len(summed))

In [None]:
# print results: row count first, then one "creator | total" line per row
print(len(summed))
for creator, total in summed:
    print(f"{creator} | {total}")

In [None]:
# version with years

# sum up totals
# Merge rows that share the same year and the same creator name (ignoring
# letter case) and add up their totals. The first spelling encountered is
# kept and rows stay in first-appearance order, so `summed` matches what the
# earlier scan-and-delete loop produced, but in a single pass and without its
# per-step debug output.

with open('creator_cleaned_years.json') as infile:
    cleaned = json.load(infile)

merged = {}  # (lower-cased creator, year) -> [creator, total, year]
for creator, total, year in cleaned:
    key = (creator.lower(), year)
    if key in merged:
        merged[key][1] += total
    else:
        # build a fresh row so the rows in `cleaned` are left untouched
        merged[key] = [creator, total, year]

# dicts preserve insertion order, so this is first-appearance order
summed = list(merged.values())
print("length:", len(summed))

In [None]:
# print results with year: row count, then "creator | total | year" lines
print(len(summed))
for creator, total, year in summed:
    print(creator, total, year, sep=" | ")

In [None]:
# with open('creator_summed_years.json', 'w') as outfile:
#     json.dump(summed, outfile)

In [None]:
# Reload the per-year summed rows from disk into `summed`.
with open('creator_summed_years.json', 'r') as infile:
    summed = json.loads(infile.read())

In [None]:
# Order a copy of the summed rows by total (second field), largest first;
# `summed` itself is left in its original order.
sorted_list = list(summed)
sorted_list.sort(key=lambda entry: entry[1], reverse=True)

In [None]:
# print results with year: row count, then "creator | total | year" lines
print(len(sorted_list))
for creator, total, year in sorted_list:
    print(creator, total, year, sep=" | ")

In [None]:
# with open('creator_summed.json', 'w') as outfile:
#     json.dump(sorted_list, outfile)

In [None]:
# Reload the rows stored in creator_summed.json into `sorted_list`.
with open('creator_summed.json', 'r') as infile:
    sorted_list = json.loads(infile.read())

In [None]:
# Show the current contents of `sorted_list`.
print(sorted_list)

In [None]:
# print results: total row count, then the first ten rows as
# "creator | year | total"
print(len(summed))
for creator, total, year in summed[:10]:
    print(creator, year, total, sep=" | ")

In [None]:
# Creator names only (first field of every row), in `sorted_list` order.
list_of_creators = [row[0] for row in sorted_list]

In [None]:
# Show the list of creator names.
print(list_of_creators)

In [None]:
# For each of the first 100 rows of `sorted_list`, build one entry per year
# from 2018 down to 1990: the matching [creator, total, year] row(s) from
# `summed` when present, otherwise a zero-total placeholder [creator, 0, year].

# Index the per-year rows once instead of rescanning `summed` for every
# creator/year pair. Keys are (lower-cased creator, year as stored).
rows_by_creator_year = {}
for row in summed:
    rows_by_creator_year.setdefault((row[0].lower(), row[2]), []).append(row)

ordered_creator_list = []
# NOTE: the loop variable must not be called `c`; that name is the database
# cursor used by the query cells, and rebinding it here breaks them.
for creator_name, _total in sorted_list[:100]:
    creator_sublist = []
    for yy in reversed(range(1990, 2019)):
        # stored years are compared against the year as a string
        matches = rows_by_creator_year.get((creator_name.lower(), str(yy)))
        if matches:
            creator_sublist.extend(matches)
        else:
            # placeholder keeps the year as an int
            creator_sublist.append([creator_name, 0, yy])
    ordered_creator_list.append(creator_sublist)

In [None]:
# Make the year field (index 2) of every per-year row a string.
# Each element of `ordered_creator_list` is a list of [creator, total, year]
# rows, so the conversion has to be applied to the rows inside each sublist;
# indexing the sublist with [2] would replace a whole row by its string form.
for creator_rows in ordered_creator_list:
    for year_row in creator_rows:
        year_row[2] = str(year_row[2])

In [None]:
# Preview the first three entries of `ordered_creator_list`.
for r in ordered_creator_list[:3]:
    print(r)

In [None]:
# Write the per-year totals as an org-mode table: a header row with the
# years 2018 down to 1990, then one row per creator holding the total of
# every entry whose year matches the column.
years_desc = list(reversed(range(1990, 2019)))

with open("creator_totals_all_years_test.org", "w+") as write_file:
    # first row
    header_cells = ["| creator |"]
    header_cells.extend(str(yy) + "|" for yy in years_desc)
    write_file.write("".join(header_cells) + "\n")

    for row in ordered_creator_list:
        print("row:", row)
        # the creator name is taken from the first entry of the row
        line_cells = ["| ", str(row[0][0]).replace("\n", " "), "|"]
        for yy in years_desc:
            for item in row:
                # the year may be stored as a string or as an int
                if item[2] == str(yy) or item[2] == yy or str(item[2]) == str(yy):
                    print(">>> match:", item)
                    line_cells.append(str(item[1]) + "|")
        write_file.write("".join(line_cells) + "\n")
# the `with` block has already closed the file

In [None]:

#     print("external index:",index)
#     if sum_duplicates(summed[:10], index) == 0:
#         index += 1
#     else:
#         running = False   

### Number of images per licence

In [None]:
# Count images per creator string, restricted to creators matching
# 'matlab%' (NOCASE collation), ordered by count, largest first.
# Change the LIKE pattern to look at another creator.

creator_count_query = '''
    SELECT images.creator, COUNT(images.creator)
    FROM images
    WHERE images.creator LIKE 'matlab%' COLLATE NOCASE
    GROUP BY images.creator
    ORDER BY COUNT(images.creator) DESC
    '''
# execute() returns the cursor, so the fetch can be chained
rows = c.execute(creator_count_query).fetchall()
for row in rows:
    print(row)

In [None]:
# Add up the count column (second field) of the fetched rows.
total = sum(num for _name, num in rows)
print(total)

In [None]:
# Print rows 2-16 of the result as an org-mode table:
# percentage | count | creator.
# Hard-coded denominator; presumably the total number of images in the
# database (TODO confirm).
IMAGE_TOTAL = 10053059

table_rule = "|-|-|-|"
print(table_rule)
for row in rows[1:16]:
    per = str(round((row[1]/IMAGE_TOTAL*100), 2))
    print("".join(["|", per, "|", str(row[1]), "|", row[0], "|"]))
print(table_rule)

In [None]:
# Dump `sorted_list` as org-mode table rows: every field is followed by "|",
# newlines inside a field become spaces, and each line starts with "|"
# (which leaves a lone "|" after the last row).
with open("creator_totals_summed.org", "w+") as write_file:
    write_file.write("|")
    for row in sorted_list:
        cells = "".join(str(item).replace("\n", " ") + "|" for item in row)
        write_file.write(cells + "\n|")
# the `with` block has already closed the file

### Get image paths for 2019/2020

SQLite query that joins the `images` and `metadata` tables to find the images created in 2019 or 2020. The results are then written to a text file with one `filepath,id` line per image, for use by other scripts.

In [None]:
# Select id, path, filename and creation year of every image whose metadata
# row was created in 2019 or 2020.
paths_query = '''
    SELECT images.id, images.path, images.filename, strftime("%Y", metadata.created) as "Y"
    FROM images
    LEFT JOIN metadata ON images.identifier = metadata.identifier 
    WHERE strftime("%Y", metadata.created) = '2019'
    OR strftime("%Y", metadata.created) = '2020'
    '''
# execute() returns the cursor, so the fetch can be chained
rows = c.execute(paths_query).fetchall()
print(len(rows))
for row in rows:
    print(row)

In [None]:
# write to file

writepath = "/home/rte/data/paths/paths_src_update_sqlite.txt"

# One "path/filename,id" line per fetched row; the year column is not written.
with open(writepath, "w+") as write_file:
    for id_num, path, filename, _year in rows:
        write_file.write(f"{path}/{filename},{id_num}\n")

### Get numbers of images for given years and categories

In [None]:
# Number of images per creation year for one category: only rows whose first
# whitespace-separated token of `cat` equals 'cs.LG' are counted.

per_year_category_query = '''
    SELECT count(images.identifier), strftime("%Y", metadata.created) as 'Y'
    FROM images
    LEFT JOIN metadata ON images.identifier = metadata.identifier
    WHERE substr(trim(cat),1,instr(trim(cat)||' ',' ')-1) = 'cs.LG'
    GROUP BY strftime("%Y", metadata.created)
    ORDER BY strftime("%Y", metadata.created) ASC
    '''
# execute() returns the cursor, so the fetch can be chained
rows = c.execute(per_year_category_query).fetchall()
for row in rows:
    print(row)
# grand total over all years (first column is the per-year count)
total = sum(year_row[0] for year_row in rows)
print(total)

In [None]:
# Number of images per creation year, with no category filter.

per_year_query = '''
    SELECT count(images.identifier), strftime("%Y", metadata.created) as 'Y'
    FROM images
    LEFT JOIN metadata ON images.identifier = metadata.identifier
    GROUP BY strftime("%Y", metadata.created)
    ORDER BY strftime("%Y", metadata.created) ASC
    '''
# execute() returns the cursor, so the fetch can be chained
rows = c.execute(per_year_query).fetchall()
for row in rows:
    print(row)
# grand total over all years (first column is the per-year count)
total = sum(year_row[0] for year_row in rows)
print(total)

#### sanity check for number of papers in a given category and timeframe

In [None]:
# Count the metadata rows created between 2012-01-01 and 2012-12-31 whose
# first whitespace-separated token of `cat` equals the bound category.
sql = ("SELECT metadata.cat, metadata.id "
    "FROM metadata "
    "WHERE metadata.created BETWEEN date('2012-01-01') "
    "AND date('2012-12-31') "
    "AND substr(trim(metadata.cat),1,instr(trim(metadata.cat)||' ',' ')-1) = ?")

# the category is passed as a bound parameter, not formatted into the SQL
category = "cs.AI"
rows = c.execute(sql, (category, )).fetchall()

print(len(rows))