# Plotting data from SQLite database

This notebook queries the SQLite database to generate a matrix of plots showing the number of articles/images per year, with one plot for each category.

#### Setup

Import required libraries, connect to SQLite database, create cursor, fetch table info

In [None]:
import math
import sqlite3

import matplotlib.pyplot as plt
import numpy as np

In [None]:
# path on rte
# NOTE(review): absolute, machine-specific location of the SQLite file; adjust when running elsewhere.
db_path = "/home/rte/data/db/arxiv_db_images.sqlite3"

In [None]:
# import the sqlite3 database and create a cursor
# `db` (connection) and `c` (cursor) are reused by every query cell in this notebook.

db = sqlite3.connect(db_path)
c = db.cursor()

In [None]:
# Show the column layout of the `metadata` table.
table_name = "metadata"
c.execute('PRAGMA TABLE_INFO({})'.format(table_name))
info = c.fetchall()

print("\nColumn Info:\nID, Name, Type, NotNull, DefaultVal, PrimaryKey")
for column_row in info:
    print(column_row)

In [None]:
# Show the column layout of the `images` table.
table_name = "images"
c.execute('PRAGMA TABLE_INFO({})'.format(table_name))
info = c.fetchall()

print("\nColumn Info:\nID, Name, Type, NotNull, DefaultVal, PrimaryKey")
for column_row in info:
    print(column_row)

#### Build category list
First get a full list of all the primary categories by querying the SQLite database.

In [None]:
# Primary category = text of metadata.cat up to its first space.
# Count the articles in each primary category, largest first.

primary_cat_articles_sql = '''
    SELECT substr(trim(metadata.cat),1,instr(trim(metadata.cat)||' ',' ')-1), count(metadata.identifier)
    FROM metadata
    GROUP BY substr(trim(metadata.cat),1,instr(trim(metadata.cat)||' ',' ')-1)
    ORDER BY count(metadata.identifier) DESC    
    '''
c.execute(primary_cat_articles_sql)
rows = c.fetchall()
for category_row in rows:
    print(category_row)

In [None]:
# how many rows the most recently executed query returned
print(len(rows))

In [None]:
# Primary category = text of metadata.cat up to its first space.
# Count the image rows joined to each primary category, largest first.

primary_cat_images_sql = '''
    SELECT substr(trim(metadata.cat),1,instr(trim(metadata.cat)||' ',' ')-1), count(images.identifier)
    FROM images
    LEFT JOIN metadata ON images.identifier = metadata.identifier 
    GROUP BY substr(trim(metadata.cat),1,instr(trim(metadata.cat)||' ',' ')-1)
    ORDER BY count(images.identifier) DESC    
    '''
c.execute(primary_cat_images_sql)
rows = c.fetchall()
for category_row in rows:
    print(category_row)

In [None]:
# how many rows the most recently executed query returned
print(len(rows))

In [None]:
# Keep the most recent query result as the category list.
# Each entry is a (category, count) row; only entry[0] is used as the category name.

catlist = rows
for entry in catlist:
    print(str(entry[0]))

In [None]:
# Test that the article-based and the image-based category queries agree.
#
# Comparing `catlist` with `rows` directly cannot fail on a top-to-bottom run,
# because `catlist` was assigned from that same `rows` object. The rows also
# carry counts, which differ between the two queries. So the article categories
# are fetched again here and only the category names are compared, ignoring order.

catlist_images = rows

c.execute('''
    SELECT substr(trim(metadata.cat),1,instr(trim(metadata.cat)||' ',' ')-1)
    FROM metadata
    GROUP BY substr(trim(metadata.cat),1,instr(trim(metadata.cat)||' ',' ')-1)
    ''')
article_cat_names = {row[0] for row in c.fetchall()}
image_cat_names = {row[0] for row in catlist_images}

print("are both lists the same?")
print(article_cat_names == image_cat_names)
print("do both lists have the same number of elements?")
print(len(article_cat_names) == len(image_cat_names))
print(len(catlist))

#### Query DB for data
Then use that list of primary categories to query the db for how many articles per year. Store it in the `data` variable. The first command is the number of articles only so only requires the `metadata` table. The second block below searches by number of images so also requires the `images` table in the SQLite database.

In [None]:
# Count articles per creation year for every primary category in catlist.
# Result layout: data = [[category, [year, ...], [article_count, ...]], ...]

sql = ('''
    SELECT count(metadata.identifier), strftime("%Y", metadata.created) as 'Y'
    FROM metadata
    WHERE substr(trim(cat),1,instr(trim(cat)||' ',' ')-1) = ?
    GROUP BY strftime("%Y", metadata.created)
    ORDER BY strftime("%Y", metadata.created) ASC
    ''')

data = []

for cat in catlist:
    cat_name = cat[0]
    print("querying for category: " + str(cat_name))
    rows = c.execute(sql, (cat_name, )).fetchall()
    print(rows)

    # each row is (count, year); split the rows into two parallel lists
    years = [year_row[1] for year_row in rows]
    totals = [year_row[0] for year_row in rows]
    data.append([cat_name, years, totals])

print("*" * 20)
print("done")

In [None]:
# Count images per article creation year for every primary category in catlist.
# Result layout: data = [[category, [year, ...], [image_count, ...]], ...]

sql = ('''
    SELECT count(images.identifier), strftime("%Y", metadata.created) as 'Y'
    FROM images
    LEFT JOIN metadata on images.identifier = metadata.identifier
    WHERE substr(trim(cat),1,instr(trim(cat)||' ',' ')-1) = ?
    GROUP BY strftime("%Y", metadata.created)
    ORDER BY strftime("%Y", metadata.created) ASC
    ''')

data = []

for cat in catlist:
    cat_name = cat[0]
    print("querying for category: " + str(cat_name))
    rows = c.execute(sql, (cat_name, )).fetchall()
    print(rows)

    # each row is (count, year); split the rows into two parallel lists
    years = [year_row[1] for year_row in rows]
    totals = [year_row[0] for year_row in rows]
    data.append([cat_name, years, totals])

print("*" * 20)
print("done")

In [None]:
# total number of images per filename extension
#
# The extension is the text after the last '.' in images.filename, obtained by
# reversing the name with a Python function registered on the connection.
# The statement has no `?` placeholder and does not depend on the category, so
# it is executed once without parameters. Passing a category binding made
# sqlite3 reject the call with "Incorrect number of bindings supplied".
#
# NOTE: this replaces `data` with extension data; the cleaning cells that
# convert cat[1] with int() expect year strings, not extensions.

# NULL filenames are passed through instead of raising inside the SQL function
db.create_function("reverse", 1, lambda s: s[::-1] if s is not None else None)
sql = ('''
    SELECT COUNT(reverse(substr(reverse(filename),1,instr(reverse(filename),'.')-1))), reverse(substr(reverse(filename),1,instr(reverse(filename),'.')-1)) AS extension
    FROM images
    WHERE x is not null and x != ''
    AND y is not null and y != ''
    AND imageformat is not null and imageformat != ''
    GROUP BY extension
    ''')

c.execute(sql)
rows = c.fetchall()
print(rows)

# each row is (count, extension)
extensions = [row[1] for row in rows]
totals = [row[0] for row in rows]

# same [label, keys, totals] layout as the per-category cells, with a single entry
data = [["all", extensions, totals]]

print("*" * 20)
print("done")

#### Clean data
- remove any entries of "2019" in the years and articles columns of data (don't have full data for this year).
- rewrite all entries as integers rather than strings (otherwise there will be problems when adjusting the axes)
- find the minimum and maximum for any entries, so that we can set our axes later as needed.

Data is saved in the format of nested lists in the format
```
[
    [cat1, [year1, year2...yearX], [total1, total2...totalX]],
    [cat2, [year1, year2...yearX], [total1, total2...totalX]],
    ...
    [catZ, [year1, year2...yearX], [total1, total2...totalX]]
]
```

In [None]:
# Drop every "2019" entry from each category, removing the matching total
# at the same position so the year and total lists stay aligned.

for cat in data:
    year_list, total_list = cat[1], cat[2]
    while "2019" in year_list:
        index = year_list.index("2019")
        print(cat[0])
        print(index)
        del total_list[index]
        del year_list[index]
        print("*" * 20)

In [None]:
# test to make sure there is still a total for each year

for cat in data:
    if len(cat[1]) != len(cat[2]):
        # cat is a list, so report its name (cat[0]); concatenating the list
        # itself to a string raised TypeError instead of printing the warning
        print("problem with category: " + str(cat[0]))

In [None]:
# Years are stored as strings; print each one and overwrite it in place with its int value.

for cat in data:
    year_list = cat[1]
    for position, year in enumerate(year_list):
        print(year)
        year_list[position] = int(year)

In [None]:
# inspect the full nested list [category, years, totals] after cleaning
print(data)

#### Save data 
Interim progress, to prevent having to run SQL queries again : )
Save as either json file or pickle for reloading

#### json
saves as human-readable JSON format

In [None]:
import json

In [None]:
# JSON file name for article data (run either this cell or the next one; the last one run wins)
filename = "articles_cat_year.json" 

In [None]:
# JSON file name for image data (overrides the article file name if both cells are run)
filename = "images_cat_year.json"

In [None]:
# READ: load previously saved data from the JSON file into load_json
# (this cell was labelled WRITE although it opens the file for reading)

load_json = []

with open(filename, "r") as read_file:
    # the with-block closes the file on exit, so no explicit close() is needed
    load_json = json.load(read_file)

In [None]:
# WRITE: save the current `data` to the JSON file
# (this cell was labelled READ although it opens the file for writing)

with open(filename, "w") as write_file:
    # the with-block closes the file on exit, so no explicit close() is needed
    json.dump(data, write_file)

In [None]:
# inspect what was read back from the JSON file
print(load_json)

In [None]:
# round-trip check: the JSON cells store the loaded data in `load_json`
# (`loadedjson` was never defined and raised NameError)
data == load_json

#### pickle
Save data as a serialized file using pickle

In [None]:
import pickle

In [None]:
# pickle file name for article data; bArticles=True selects the article titles,
# line style and output names in the plot cells
filename = "articles_cat_year.pickle"
bArticles = True

In [None]:
# pickle file name for image data; bArticles=False selects the image titles,
# red line style and output names in the plot cells
filename = "images_cat_year.pickle"
bArticles = False

In [None]:
# READ: load previously pickled data into load_data
# SECURITY: pickle.load can execute arbitrary code from the file; only load
# pickle files that this notebook wrote itself.

load_data = []

with open(filename, "rb") as read_file:
    # the with-block closes the file on exit, so no explicit close() is needed
    load_data = pickle.load(read_file)

In [None]:
# WRITE: serialise the current `data` into the pickle file named by `filename`
with open(filename, "wb") as pickle_out:
    pickle.dump(data, pickle_out)

Load in the imported data in the `data` variable

In [None]:
# make the data read from the pickle file the working `data` (both names refer to the same list)
data = load_data

Set bArticles 

In [None]:
# show which file is currently selected before choosing bArticles below
print(filename)

In [None]:
# run this cell when `data` holds article counts
bArticles = True

In [None]:
# run this cell when `data` holds image counts
bArticles = False

Testing loaded data

In [None]:
# NOTE(review): after `data = load_data` both names refer to the same object, so this prints True
print(load_data == data)

In [None]:
# inspect the data read from the pickle file
print(load_data)

In [None]:
# inspect the working data
print(data)

#### Recalculate with log10

In [None]:
# DON'T DO THIS! just use set_yscale('log') in plot instead

# Overwrites every total in `data` with its base-10 logarithm, in place.
# Running the cell twice applies the logarithm twice.

import math

for cat in data:
    totals = cat[2]
    totals[:] = [math.log10(value) for value in totals]
    print(totals)

#### Find max and min

Go through each value in the data to find the maximum and minimums for plotting

In [None]:
# get the maximums and minimums of year and no. articles for figuring out axes
#
# minY/maxY: smallest/largest year over all categories
# minA/maxA: smallest/largest total over all categories
# Categories with no entries are skipped, because min()/max() raise ValueError
# on an empty list (for example when the only year of a category was removed).

import math

minY = math.inf
maxY = -(math.inf)
minA = math.inf
maxA = -(math.inf)

for cat in data:
    print(cat[0])
    cat_years, cat_totals = cat[1], cat[2]
    if not cat_years or not cat_totals:
        print("no data for this category, skipped")
        print("*" * 20)
        continue

    lo_year, hi_year = min(cat_years), max(cat_years)
    lo_total, hi_total = min(cat_totals), max(cat_totals)

    minY = min(minY, lo_year)
    maxY = max(maxY, hi_year)
    minA = min(minA, lo_total)
    maxA = max(maxA, hi_total)

    print("min year: " + str(lo_year))
    print("max year: " + str(hi_year))
    print("min articles/images: " + str(lo_total))
    print("max articles/images: " + str(hi_total))
    print("*" * 20)

print("minY: " + str(minY))
print("maxY: " + str(maxY))
print("minA: " + str(minA))
print("maxA: " + str(maxA))

print("done")

In [None]:
# inspect the working data
print(data)

#### Save data in org format
Use org-friendly table format. This can be printed to console or written to a file. For posting to Github and rendered in Github markdown.

In [None]:
# Print one org heading per category, followed by a two-column table of (year, total).
for cat in data:
    print("* " + cat[0])
    print("|-|-|")
    for year, total in zip(cat[1], cat[2]):
        print('|' + str(year) + "|" + str(total) + "|")
    print("|-|-|")

In [None]:
# Write the same org tables to a file: one heading per category, then a (year, total) table.
# The with-block closes the file when it exits.
with open("stats_images_cat_year.org", "w") as org_file:
    for cat in data:
        table_rows = ['|' + str(year) + "|" + str(total) + "|" for year, total in zip(cat[1], cat[2])]
        for line in ["* " + cat[0], "|-|-|", *table_rows, "|-|-|"]:
            print(line, file=org_file)

#### Plotting matrix of scatterplots

Plot data in two formats
- with shared x and y axes, for comparison across data
- with individual x and y axes taken from min/max of each plot automatically, for individual trends
- finally, save as high resolution (300 dpi) image

In [None]:
# run this cell when `data` holds article counts
bArticles = True

In [None]:
# run this cell when `data` holds image counts
bArticles = False

In [None]:
# Grid of plots with shared x (per column) and y (per row) axes.
# Every panel uses the global year/total limits found in the min/max step.

xdim = 15
ydim = 12

fig, ax = plt.subplots(ydim, xdim, sharex='col', sharey='row')
fig.set_size_inches(40, 30)

data_size = len(data)

# ax.flat walks the grid row by row, so panel k shows data[k];
# zip stops at whichever of the grid or the data runs out first
for panel, cat in zip(ax.flat, data):
    panel.plot(cat[1], cat[2], '--r.')
    panel.title.set_text(cat[0])
    # add one to the maximum year for alignment
    panel.axis([minY, maxY+1, minA, maxA])

In [None]:
# save the current `fig` as a 300 dpi PNG; the article variant is kept commented out
# fig.savefig("plot_articles_cat_year_04.png", dpi=300)
fig.savefig("plot_images_cat_year_03.png", dpi=300)

In [None]:
# Grid of plots where each panel keeps its own automatically chosen x and y limits.
xdim = 15
ydim = 12

fig, ax = plt.subplots(ydim, xdim)
fig.subplots_adjust(hspace=0.4, wspace=0.4)
fig.set_size_inches(40, 30)

data_size = len(data)

# ax.flat walks the grid row by row, so panel k shows data[k];
# zip stops at whichever of the grid or the data runs out first
for panel, cat in zip(ax.flat, data):
    panel.plot(cat[1], cat[2], '--r.')
    panel.title.set_text(cat[0])

In [None]:
# save the current `fig` as a 300 dpi PNG; the article variant is kept commented out
# fig.savefig("plot_articles_cat_year_indax_01.png", dpi=300)
fig.savefig("plot_images_cat_year_indax_03.png", dpi=300)

### Additional plots

- Plot data with shared X axis from 1991-2018 but individual Y axes
- Log10 of Y axis
- Plot by individual categories

##### fixed time range, relative totals

In [None]:
# Grid of plots with a fixed 1991-2018 x axis (shared per column) and an
# individual y axis per panel, running from 0 to that category's own maximum.

# bLog10 is only assigned in later cells. When this cell runs first on a fresh
# kernel, fall back to a linear y axis instead of failing with NameError.
try:
    bLog10
except NameError:
    bLog10 = False

xdim = 15
ydim = 12

fig, ax = plt.subplots(ydim, xdim, sharex='col')
fig.subplots_adjust(hspace=0.4, wspace=0.4)
fig.set_size_inches(40, 30)

if bArticles: fig.suptitle("arXiv relative number of articles per year between 1991 and 2018", x=0.5, y=0.92, size=28)
else: fig.suptitle("arXiv relative number of images per year between 1991 and 2018", x=0.5, y=0.92, size=28)

data_size = len(data)

for i in range(ydim):
    for j in range(xdim):
        idx = (i * xdim) + j
        if idx < data_size:
            if bArticles: ax[i, j].plot(data[idx][1], data[idx][2], '--.')
            else: ax[i, j].plot(data[idx][1], data[idx][2], '--r.')
            ax[i, j].title.set_text(data[idx][0])
            # max() raises ValueError on an empty list, so only set limits when there are totals
            if data[idx][2]:
                ax[i, j].axis([1991, 2018, 0, max(data[idx][2])])
            if bLog10: ax[i, j].set_yscale('log')

In [None]:
# save the current figure under the article or image file name
fixedtime_name = "plot_articles_cat_year_fixedtime.svg" if bArticles else "plot_images_cat_year_fixedtime.svg"
fig.savefig(fixedtime_name, dpi=300)

##### absolute totals

In [None]:
# plot cells that check bLog10 switch their y axis to a log scale via set_yscale('log')
bLog10 = True

In [None]:
# Shared-axes grid of yearly totals: one panel per category, x fixed to 1991-2018,
# y running from 0 to the global maximum maxA, optionally on a log scale.

xdim = 15
ydim = 12

fig, ax = plt.subplots(ydim, xdim, sharex='col', sharey='row')
fig.subplots_adjust(hspace=0.4, wspace=0.4)
fig.set_size_inches(40, 30)

if bArticles:
    heading = "arXiv total articles per year between 1991 and 2018\nShared Axes"
else:
    heading = "arXiv total images per year between 1991 and 2018\nShared Axes"
fig.suptitle(heading, x=0.5, y=0.92, size=28)

data_size = len(data)
line_style = '--.' if bArticles else '--r.'

# ax.flat walks the grid row by row, so panel k shows data[k]
for panel, cat in zip(ax.flat, data):
    panel.plot(cat[1], cat[2], line_style)
    panel.title.set_text(cat[0])
    panel.axis([1991, 2018, 0, maxA])
    if bLog10:
        panel.set_yscale('log')

In [None]:
# save the current figure under the shared-Y file name for the active data type
share_y_name = "plot_articles_cat_year_fixedtime_shareY.svg" if bArticles else "plot_images_cat_year_indax_shareY.svg"
fig.savefig(share_y_name, dpi=300)

In [None]:
# save the current figure under the log10 file name for the active data type
log10_name = "plot_articles_cat_year_fixedtime_log10.svg" if bArticles else "plot_images_cat_year_fixedtime_log10.svg"
fig.savefig(log10_name, dpi=300)

##### log10

In [None]:
# run this cell when `data` holds article counts
bArticles = True

In [None]:
# run this cell when `data` holds image counts
bArticles = False

In [None]:
# Shared-axes grid titled "log10": one panel per category, x fixed to 1991-2018,
# y running from 0 to maxA.
# NOTE(review): this cell does not change the y scale itself; it plots whatever
# values `data` currently holds - confirm the totals were converted beforehand.

xdim = 15
ydim = 12

fig, ax = plt.subplots(ydim, xdim, sharex='col', sharey='row')
fig.subplots_adjust(hspace=0.4, wspace=0.4)
fig.set_size_inches(40, 30)

if bArticles:
    heading = "arXiv log10 of articles per year between 1991 and 2018\nShared Axes"
else:
    heading = "arXiv log10 of images per year between 1991 and 2018\nShared Axes"
fig.suptitle(heading, x=0.5, y=0.92, size=28)

data_size = len(data)
line_style = '--.' if bArticles else '--r.'

# ax.flat walks the grid row by row, so panel k shows data[k]
for panel, cat in zip(ax.flat, data):
    panel.plot(cat[1], cat[2], line_style)
    panel.title.set_text(cat[0])
    panel.axis([1991, 2018, 0, maxA])

In [None]:
# save the current figure under the log10 file name for the active data type
log10_name = "plot_articles_cat_year_fixedtime_log10.svg" if bArticles else "plot_images_cat_year_indax_log10.svg"
fig.savefig(log10_name, dpi=300)

##### categories

- physics (including astro-ph, cond-mat)
- cs
- math
- q-bio
- q-fin
- stat

In [None]:
# List every category whose name contains "stat." and print how many there are.
stat_names = [cat[0] for cat in data if "stat." in cat[0]]
for name in stat_names:
    print(name)
article_count = len(stat_names)
print(article_count)

In [None]:
# inspect the working data before selecting a subset of categories
print(data)

#### Grab data from only some categories

In [None]:
# get only computer science
# "physics.*" names also contain the substring "cs.", hence the extra exclusion
select_data = [cat for cat in data if "cs." in cat[0] and "physics" not in cat[0]]
print(select_data)
print(len(select_data))

# NOTE: this overwrites `data`; reload the full data before selecting another category
data = select_data

In [None]:
# get only maths
select_data = [cat for cat in data if "math." in cat[0]]
print(select_data)
print(len(select_data))

# NOTE: this overwrites `data`; reload the full data before selecting another category
data = select_data

In [None]:
# get all physics related categories
# a category is kept when its name contains any of these substrings
physics_markers = ("ph", "physics.", "cond-mat", "nlin")
select_data = [cat for cat in data if any(marker in cat[0] for marker in physics_markers)]
print(select_data)
print(len(select_data))

# NOTE: this overwrites `data`; reload the full data before selecting another category
data = select_data

In [None]:
# get only quantitative biology
select_data = [cat for cat in data if "q-bio." in cat[0]]
print(select_data)
print(len(select_data))

# NOTE: this overwrites `data`; reload the full data before selecting another category
data = select_data

In [None]:
# get only quantitative finance
select_data = [cat for cat in data if "q-fin." in cat[0]]
print(select_data)
print(len(select_data))

# NOTE: this overwrites `data`; reload the full data before selecting another category
data = select_data

In [None]:
# get only statistics
select_data = [cat for cat in data if "stat." in cat[0]]
print(select_data)
print(len(select_data))

# NOTE: this overwrites `data`; reload the full data before selecting another category
data = select_data

#### set log10, category and find factors

In [None]:
# linear y axis: plot cells that check bLog10 leave the y scale unchanged
bLog10 = False

In [None]:
# log y axis: plot cells that check bLog10 call set_yscale('log')
bLog10 = True

In [None]:
# get the two factors closest to the square root
#
# Chooses a grid whose panel count equals len(data): xdim is the larger factor
# (columns), ydim the smaller one (rows). The count is kept in `panel_count`
# so the builtin input() is no longer shadowed.

panel_count = len(data)

if panel_count == 0:
    # nothing selected: use a 1 x 1 grid rather than dividing by zero below
    xdim = ydim = 1
else:
    # largest divisor that does not exceed the square root; stops at 1 at the latest
    divisor = int(math.sqrt(panel_count))
    while panel_count % divisor != 0:
        divisor -= 1

    # integer division keeps both dimensions exact ints
    xdim = max(divisor, panel_count // divisor)
    ydim = min(divisor, panel_count // divisor)

print(xdim)
print(ydim)

In [None]:
# label inserted into the plot title and the output file names of the plot cells;
# uncomment exactly one line to match the category subset currently held in `data`
# category = "computer science"
# category = "math"
# category = "physics"
# category = "q-bio"
# category = "q-fin"
category = "stats"

In [None]:
# number of categories that will be plotted
print(len(data))

#### Plot data

In [None]:
# Plot the currently selected categories on a grid with shared x and y axes.
#
# The grid shape is taken from xdim/ydim as computed by the factor cell. The
# previous hard-coded 3 x 2 grid overrode that result and silently dropped
# every category after the sixth.

data_size = len(data)
if xdim * ydim < data_size:
    print("warning: grid of " + str(xdim) + " x " + str(ydim) + " is too small for "
          + str(data_size) + " categories; re-run the factor cell")

# squeeze=False keeps `ax` two-dimensional even for a single row or column
fig, ax = plt.subplots(ydim, xdim, sharex='col', sharey='row', squeeze=False)
fig.subplots_adjust(hspace=0.4, wspace=0.4)
fig.set_size_inches(40, 30)

if bArticles: fig.suptitle("arXiv " + category + " articles per year between 1991 and 2018", x=0.5, y=0.92, size=28)
else: fig.suptitle("arXiv " + category + " images per year between 1991 and 2018", x=0.5, y=0.92, size=28)

line_style = '--.' if bArticles else '--r.'

# ax.flat walks the grid row by row, so panel k shows data[k];
# zip stops at whichever of the grid or the data runs out first
for panel, cat in zip(ax.flat, data):
    panel.plot(cat[1], cat[2], line_style)
    panel.title.set_text(cat[0])
    panel.axis([1991, 2018, 0, maxA])
    if bLog10:
        panel.set_yscale('log')

In [None]:
# save the current figure under the hard-coded "cs" file name for the active data type
cs_plot_name = "plot_cs_articles_year_fixedtime.svg" if bArticles else "plot_cs_images_year_fixedtime.svg"
fig.savefig(cs_plot_name, dpi=300)

In [None]:
# save the current figure under the hard-coded "cs" log10 file name for the active data type
cs_log10_name = "plot_cs_articles_year_fixedtime_log10.svg" if bArticles else "plot_cs_images_year_log10.svg"
fig.savefig(cs_log10_name, dpi=300)

In [None]:
# save the current figure under a file name built from `category` and the active data type
name_suffix = "_articles_year_fixedtime.svg" if bArticles else "_images_year.svg"
fig.savefig("plot_" + category + name_suffix, dpi=300)

In [None]:
# save the current figure under a log10 file name built from `category` and the active data type
name_suffix = "_articles_year_fixedtime_log10.svg" if bArticles else "_images_year_log10.svg"
fig.savefig("plot_" + category + name_suffix, dpi=300)