In [None]:
import os
import sqlite3

import matplotlib.pyplot as plt
import numpy as np

In [None]:
# Location of the arXiv images database. The default is the path on the
# "rte" machine; set the ARXIV_DB_PATH environment variable to point the
# notebook at a copy stored elsewhere without editing this cell.
db_path = os.environ.get("ARXIV_DB_PATH", "/home/rte/data/db/arxiv_db_images.sqlite3")

In [None]:
# Here we import the sqlite3 database and create a cursor
# `db` is the connection handle; `c` is the cursor that the query cells
# below call execute()/fetchall() on.
# NOTE(review): sqlite3.connect() creates a new, empty database file when
# db_path does not exist, so a wrong path is not reported by this cell --
# confirm the file is present before running.

db = sqlite3.connect(db_path)
c = db.cursor()

In [None]:
# Show the schema of the "metadata" table, one column description per line.
info = c.execute("PRAGMA TABLE_INFO(metadata)").fetchall()

print("\nColumn Info:\nID, Name, Type, NotNull, DefaultVal, PrimaryKey")
for col in info:
    print(col)

In [None]:
# Show the schema of the "images" table, one column description per line.
info = c.execute("PRAGMA TABLE_INFO(images)").fetchall()

print("\nColumn Info:\nID, Name, Type, NotNull, DefaultVal, PrimaryKey")
for col in info:
    print(col)

In [None]:
# Get list of all primary categories
# The primary category is the text of metadata.cat up to its first space
# (the appended ' ' guarantees instr() finds one). Images are counted per
# primary category and listed with the largest count first.

primary_cat_query = '''
    SELECT substr(trim(metadata.cat),1,instr(trim(metadata.cat)||' ',' ')-1), count(images.identifier)
    FROM images
    LEFT JOIN metadata ON images.identifier = metadata.identifier 
    GROUP BY substr(trim(metadata.cat),1,instr(trim(metadata.cat)||' ',' ')-1)
    ORDER BY count(images.identifier) DESC    
    '''
rows = c.execute(primary_cat_query).fetchall()
for row in rows:
    print(row)

In [None]:
# How many rows the most recent fetchall() stored in `rows` (one per
# primary category when run right after the category query).
print(len(rows))

In [None]:
# Keep the category rows under their own name and list just the labels
# (first column of each row).
catlist = rows
for entry in catlist:
    print(str(entry[0]))

In [None]:
# sql command to go through each category and check the total number of articles for each year
# The WHERE clause extracts the primary category (the text of `cat` before
# its first space) and compares it with the bound parameter.

sql = ('''
    SELECT count(metadata.identifier), strftime("%Y", metadata.created) as 'Y'
    FROM metadata
    WHERE substr(trim(cat),1,instr(trim(cat)||' ',' ')-1) = ?
    GROUP BY strftime("%Y", metadata.created)
    ORDER BY strftime("%Y", metadata.created) ASC
    ''')

# One entry per category: [category, [years], [article counts]], with the
# years in ascending order as returned by the query.
data = []

for cat in catlist:
    print("querying for category: " + str(cat[0]))
    # Use a dedicated name for the result: rebinding `rows` here would make
    # re-running the `catlist = rows` cell pick up these per-year rows
    # instead of the category list.
    year_rows = c.execute(sql, (cat[0], )).fetchall()
    print(year_rows)

    # Each result row is (article count, year).
    years = [row[1] for row in year_rows]
    totals = [row[0] for row in year_rows]
    data.append([cat[0], years, totals])

print("*" * 20)
print("done")

In [None]:
# Drop the incomplete year 2019 from every category: whenever "2019"
# appears in the year list, remove it together with the article count
# stored at the same position.

for cat in data:
    year_list = cat[1]
    while "2019" in year_list:
        pos = year_list.index("2019")
        print(cat[0])
        print(pos)
        cat[2].pop(pos)
        year_list.pop(pos)
        print("*" * 20)

In [None]:
import json

In [None]:
# Save the per-category yearly counts as JSON.
# The `with` block closes the file on exit, so no explicit close() is needed.
with open("articles_cat_year.json", "w") as write_file:
    json.dump(data, write_file)

In [None]:
loadedjson = []

# Read the JSON file back so the round trip can be checked.
# The `with` block closes the file on exit, so no explicit close() is needed.
with open("articles_cat_year.json", "r") as read_file:
    loadedjson = json.load(read_file)

In [None]:
# Inspect what was read back from the JSON file.
print(loadedjson)

In [None]:
# Round-trip check: evaluates to True when the JSON reload equals the
# in-memory data.
data == loadedjson

In [None]:
import pickle

In [None]:
# Save the same data as a pickle (pickle needs the file in binary mode).
# The `with` block closes the file on exit, so no explicit close() is needed.
with open("articles_cat_year.pickle", "wb") as write_file:
    pickle.dump(data, write_file)

In [None]:
load_data = []

# Read the pickle back. Only load pickle files this notebook wrote itself:
# unpickling data from an untrusted source can execute arbitrary code.
# The `with` block closes the file on exit, so no explicit close() is needed.
with open("articles_cat_year.pickle", "rb") as read_file:
    load_data = pickle.load(read_file)

In [None]:
# Save the current state of `data` under the "_clean" file name.
# The `with` block closes the file on exit, so no explicit close() is needed.
with open("articles_cat_year_clean.pickle", "wb") as write_file:
    pickle.dump(data, write_file)

In [None]:
load_data = []

# Read the "_clean" pickle back. Only load pickle files this notebook wrote
# itself: unpickling data from an untrusted source can execute arbitrary code.
# The `with` block closes the file on exit, so no explicit close() is needed.
with open("articles_cat_year_clean.pickle", "rb") as read_file:
    load_data = pickle.load(read_file)

In [None]:
# Inspect what was read back from the pickle file.
print(load_data)

In [None]:
# Convert every year in place to an int so the values can be compared and
# plotted numerically; each value is printed before it is converted.
for cat in data:
    year_list = cat[1]
    for pos, year in enumerate(year_list):
        print(year)
        year_list[pos] = int(year)

In [None]:
# Inspect the full data structure.
print(data)

In [None]:
# Compare the pickle reload with the in-memory data.
# NOTE(review): in cell order the pickle is written before the years are
# converted to int, so a mismatch here is expected after that conversion.
print(load_data == data)

In [None]:
# Show the container type of `data`.
print(type(data))

In [None]:
# First entry of `data`, then its second element (the list of years).
print(data[0])
print(data[0][1])

In [None]:
# Number of entries in `data`.
print(len(data))

In [None]:
# Length of the entry at index 170.
# NOTE(review): hard-coded index -- raises IndexError when `data` has fewer
# than 171 entries.
print(len(data[170]))

In [None]:
# Third element of the first entry (the article counts), then the lengths
# of its second and third elements for a side-by-side comparison.
print(data[0][2])
print(len(data[0][1]))
print(len(data[0][2]))

In [None]:
# Small hand-made sample with the same [label, years, values] layout,
# used to try out the plotting calls.

years = list(range(1990, 2021, 10))
points = [100, 125, 180, 260]
testdata = [["hep-ph", years, points]]

for sample in (testdata, testdata[0], testdata[0][1]):
    print(sample)

In [None]:
# test plot: red dots for the sample points on fixed axis limits

plt.plot(years, points, 'ro')
plt.axis([1980, 2030, 0, 300])
# show must be called; a bare `plt.show` only references the function.
plt.show()

In [None]:
import math

# Sentinels for the running extremes: minima start at +inf and maxima at
# -inf so the first real value always replaces them.
# Y = year, A = number of articles.
minY = minA = math.inf
maxY = maxA = -math.inf

In [None]:
# get the maximums and minimums of year and no. articles for figuring out axes
# minY/maxY/minA/maxA are updated in place from their current values.

for cat in data:
    print(cat[0])
    # A category can have no yearly rows at all (for example when its only
    # year was removed); min()/max() raise ValueError on an empty list, so
    # such entries are reported and skipped.
    if not cat[1] or not cat[2]:
        print("no yearly data, skipped")
        print("*" * 20)
        continue

    # Compute each extreme once and reuse it for the update and the report.
    year_lo, year_hi = min(cat[1]), max(cat[1])
    articles_lo, articles_hi = min(cat[2]), max(cat[2])

    if year_lo < minY: minY = year_lo
    if year_hi > maxY: maxY = year_hi
    if articles_lo < minA: minA = articles_lo
    if articles_hi > maxA: maxA = articles_hi

    print("min year: " + str(year_lo))
    print("max year: " + str(year_hi))
    print("min articles: " + str(articles_lo))
    print("max articles: " + str(articles_hi))
    print("*" * 20)

print("minY: " + str(minY))
print("maxY: " + str(maxY))
print("minA: " + str(minA))
print("maxA: " + str(maxA))

print("done")

In [None]:
# Inspect the full data structure.
print(data)

In [None]:
# write the data in an org-friendly format for posting on github
# Each category becomes a "* heading" followed by a two-column table of
# (year, article count) rows.
for cat in data:
    # str() keeps the heading printable when the category is None, matching
    # how the category is printed in the query cells.
    print("* " + str(cat[0]))
    # `joined` is kept as a list because later cells inspect it.
    joined = list(zip(cat[1], cat[2]))
    print("|-|-|")
    for j in joined:
        print('|' + str(j[0]) + "|" + str(j[1]) + "|")
    print("|-|-|")

In [None]:
# write the data to a file
# Same org layout as the printed version: a "* heading" per category
# followed by a two-column table of (year, article count) rows.
# The `with` block closes the file on exit, so no explicit close() is needed.
with open("stats_article_cat_year.org", "w") as write_file:
    for cat in data:
        # str() keeps the heading writable when the category is None.
        print("* " + str(cat[0]), file=write_file)
        # `joined` is kept as a list because later cells inspect it.
        joined = list(zip(cat[1], cat[2]))
        print("|-|-|", file=write_file)
        for j in joined:
            print('|' + str(j[0]) + "|" + str(j[1]) + "|", file=write_file)
        print("|-|-|", file=write_file)

In [None]:
# `joined` is whatever the last iteration of the preceding export loop left
# behind, i.e. the (year, count) pairs of the final category processed.
print(joined)

In [None]:
# First (year, count) pair of the leftover `joined` list.
# NOTE(review): raises IndexError if that list is empty.
print(joined[0])

In [None]:
# Sanity check: every category should have as many years as article counts.
for entry in data:
    print("do list lengths match?")
    same_length = len(entry[1]) == len(entry[2])
    print(same_length)

In [None]:
# Plot the first category on its own, using the global axis limits.
fig = plt.figure(1, figsize=(9, 9))

ax1 = fig.add_subplot(2, 2, 1)
ax1.plot(data[0][1], data[0][2], '--o')
# Label through the Axes object itself rather than pyplot's "current axes"
# state, so the calls cannot land on a different panel.
ax1.set_title(data[0][0])
ax1.set_ylabel("articles")
ax1.set_xlabel("year")
# add one to the maximum year for alignment
ax1.axis([minY, maxY+1, minA, maxA])

# show must be called; a bare `plt.show` only references the function.
plt.show()

In [None]:
# Grid of small plots, one category per panel, all on the same axis limits.
# Panels share the x axis within a column and the y axis within a row.
xdim = 15
ydim = 12

fig, ax = plt.subplots(ydim, xdim, sharex='col', sharey='row')
fig.set_size_inches(40, 30)

data_size = len(data)

# ax.flat walks the grid row by row, so panel number idx sits at
# row idx // xdim, column idx % xdim. Panels beyond the data stay empty.
for idx, panel in enumerate(ax.flat):
    if idx >= data_size:
        break
    entry = data[idx]
    panel.plot(entry[1], entry[2], '--o')
    panel.title.set_text(entry[0])
    panel.axis([minY, maxY+1, minA, maxA])

In [None]:
# Write the figure currently bound to `fig` to a PNG at 300 dpi.
fig.savefig("plot_articles_cat_year_04.png", dpi=300)

In [None]:
# Same grid of small plots, but without shared axes or fixed limits, so
# every panel is scaled to its own data.
xdim = 15
ydim = 12

fig, ax = plt.subplots(ydim, xdim)
fig.subplots_adjust(hspace=0.4, wspace=0.4)
fig.set_size_inches(40, 30)

data_size = len(data)

# Fill the panels row by row; stop at whichever runs out first, the data
# or the grid.
for idx in range(min(data_size, xdim * ydim)):
    i, j = divmod(idx, xdim)
    ax[i, j].plot(data[idx][1], data[idx][2], '--o')
    ax[i, j].title.set_text(data[idx][0])

In [None]:
# Write the figure currently bound to `fig` to a PNG at 300 dpi.
fig.savefig("plot_articles_cat_year_indax_01.png", dpi=300)

In [None]:
# Years of the first entry.
print(data[0][1])

In [None]:
# Earliest listed year of the first entry (first element of its year list).
print(data[0][1][0])

In [None]:
# Years and article counts of the first entry, one list per line.
print(data[0][1])
print(data[0][2])

In [None]:
# Plot the second entry of `data` on automatically scaled axes.
plt.plot(data[1][1], data[1][2], '--o')
# plt.axis([1980, 2030, 0, 5000])
# show must be called; a bare `plt.show` only references the function.
plt.show()

In [None]:
from matplotlib.pyplot import figure


In [None]:
# Lay out one panel of a 3x3 grid per entry of data[0:9]. Nothing is
# plotted into the panels; the comment in the loop marks where the year
# and count lists would go.
for count, d in enumerate(data[0:9], start=1):
    plt.subplot(3, 3, count)
    # d[1], d[2]
# show must be called; a bare `plt.show` only references the function.
plt.show()

In [None]:
# Create a 3x3 grid of panels sharing the x axis per column and the y axis
# per row; this rebinds `fig` and `ax`.
fig, ax = plt.subplots(3, 3, sharex='col', sharey='row')

In [None]:
# Display the figure currently bound to `fig` as the cell output.
fig

In [None]:
# Resize the figure to 18.5 x 10.5 inches and display it again.
fig.set_size_inches(18.5, 10.5)
fig