### Plots for paper "Images of the arXiv: Reconfiguring large scientific image datasets"

This notebook contains a subset of the code from `db_plots.ipynb`

# Plotting data from SQLite database

This notebook queries the SQLite database to generate a matrix of plots showing the number of articles/images per year, with the query run separately for each category.

Note that the code here does not directly reproduce the figures when run from top to bottom; instead, the relevant blocks need to be selected and run individually. For an updated version, see https://github.com/re-imaging/re-imaging/blob/master/sqlite-scripts/db_plots.ipynb

## Structure

- setup
- load list of categories
- pull specific data (and save as pickle)
- format data
- generate plot
- save image

The notebook is intended to be navigated, with its blocks run selectively, rather than executed as a whole.

## Setup

Import required libraries, connect to SQLite database, create cursor, fetch table info

In [None]:
import matplotlib.pyplot as plt
import matplotlib.cm as cm
import seaborn as sns
import numpy as np
import sqlite3
import pickle
import copy
import json
import math
import pandas as pd
import os

In [None]:
# Open the SQLite database file and create a cursor for running queries.
# The path is expanded from "~", i.e. it is relative to the current user's
# home directory.
db_path = os.path.expanduser("~/data/db/arxiv_db_images.sqlite3")
db = sqlite3.connect(db_path)
# cursor shared by the query cells in this notebook
c = db.cursor()

In [None]:
# Ask SQLite for the column descriptions of the `metadata` table and print
# them one per line under a header naming the fields of each description.
info = c.execute("PRAGMA TABLE_INFO(%s)" % "metadata").fetchall()

print("\nColumn Info:\nID, Name, Type, NotNull, DefaultVal, PrimaryKey")
for column_info in info:
    print(column_info)

### Generating figures for "Images of the arXiv" paper

In [None]:
# List primary categories in alphabetical order together with their article
# counts. The primary category is the part of `metadata.cat` before the first
# space (after trimming); articles created in 2019 or 2020 are excluded.
#
# The strftime format is written as a single-quoted SQL string literal:
# double quotes denote identifiers in SQL and are only accepted as strings
# through a legacy SQLite quirk that can be disabled in some builds.

c.execute('''
    SELECT substr(trim(metadata.cat),1,instr(trim(metadata.cat)||' ',' ')-1), count(metadata.identifier)
    FROM metadata
    WHERE strftime('%Y', metadata.created) != '2019'
    AND strftime('%Y', metadata.created) != '2020'
    GROUP BY substr(trim(metadata.cat),1,instr(trim(metadata.cat)||' ',' ')-1)
    ORDER BY substr(trim(metadata.cat),1,instr(trim(metadata.cat)||' ',' ')-1) ASC
    ''')
rows = c.fetchall()
for row in rows:
    print(row)
# number of distinct primary categories returned
print(len(rows))

In [None]:
# Keep only the (category, count) pairs whose count is above 5000
catlist = [(cat, n) for cat, n in rows if n > 5000]
print(len(catlist))

In [None]:
def take_second(elem):
    """Return the item at index 1 of `elem`; used as a sort key."""
    return elem[1]


# Reorder `rows` in place by their second field, largest value first
rows.sort(reverse=True, key=take_second)

In [None]:
# Take the first 16 entries of `rows` as a new list and order them with the
# default tuple comparison (first field compared first); `rows` is untouched.
catlist = sorted(rows[:16])

In [None]:
# Horizontal bar chart: one bar per entry of `catlist`, labelled with the
# entry's first field and sized by its second field.
categories = [entry[0] for entry in catlist]
values = [entry[1] for entry in catlist]
y_pos = np.arange(len(categories))

fig, ax = plt.subplots(figsize=(10, 12))

ax.barh(y_pos, values, align='center')
ax.set_yticks(y_pos)
ax.set_yticklabels(categories)
ax.set_xlabel('No. articles')
# flip the y axis so the first category is drawn at the top
ax.invert_yaxis()

plt.tight_layout()
plt.show()

### Testing

In [None]:
# Testing that the date check works: count the articles whose creation year
# is neither 2019 nor 2020.
# The strftime format uses a single-quoted SQL string literal; double-quoted
# strings are only accepted through a legacy SQLite quirk that some builds
# disable.
c.execute('''
    SELECT count(metadata.identifier)
    FROM metadata
    WHERE strftime('%Y', metadata.created) != '2019'
    AND strftime('%Y', metadata.created) != '2020'
    ''')
rows = c.fetchall()
for row in rows:
    print(row)

In [None]:
# Baseline for the date check: count all articles, with no year filter
rows = c.execute('''
    SELECT count(metadata.identifier)
    FROM metadata
    ''').fetchall()
for count_row in rows:
    print(count_row)

### Images by extension

In [None]:
# Total number of images per file extension.
#
# The extension is the text after the LAST '.' in `filename`. The SQL finds it
# by reversing the name, cutting before the first '.', and reversing the piece
# back, so a `reverse` SQL function is registered here.
# NULL values reach the Python callback as None; they are passed through
# rather than sliced, because an exception inside the callback aborts the
# whole query.
db.create_function("reverse", 1, lambda s: s[::-1] if s is not None else None)
sql = ('''
    SELECT COUNT(reverse(substr(reverse(filename),1,instr(reverse(filename),'.')-1))), reverse(substr(reverse(filename),1,instr(reverse(filename),'.')-1)) AS extension
    FROM images
    WHERE x is not null and x != ''
    AND y is not null and y != ''
    AND imageformat is not null and imageformat != ''
    GROUP BY extension
    ''')

# The statement has no `?` placeholders, so it must be executed without
# bindings (supplying a parameter tuple makes sqlite3 raise ProgrammingError).
# It also does not depend on the category, so it is run once instead of once
# per category.
print("querying image totals by extension")
c.execute(sql)
rows = c.fetchall()
print(rows)

# Each row is (count, extension). `years` holds the extension labels; the
# variable names are kept in case other cells rely on them.
years = [row[1] for row in rows]
totals = [row[0] for row in rows]

# One [category, labels, totals] entry per category, each with its own lists.
# NOTE(review): the query is not filtered by category, so every entry carries
# the same overall totals. If per-category counts are intended, the SQL needs
# a category condition with a `?` placeholder -- confirm against the schema.
data = []
for cat in catlist:
    newdata = [cat[0], list(years), list(totals)]
    data.append(newdata)

print("*" * 20)
print("done")