# Plotting data from SQLite database

This notebook queries the databases to generate a matrix of plots for the number of articles/images per year, run separately for each category.

Note that code here does not directly reproduce figures, but different blocks need to be selected to be run. For an updated version, see https://github.com/re-imaging/re-imaging/blob/master/sqlite-scripts/db_plots.ipynb

## Structure

- setup
- load list of categories
- pull specific data (and save as pickle)
- format data
- generate plot
- save image

Notebook is intended to be navigated and blocks to be run selectively, rather than the whole notebook being executed.

## Setup

Import required libraries, connect to SQLite database, create cursor, fetch table info

In [1]:
import matplotlib.pyplot as plt
import matplotlib.cm as cm
import seaborn as sns
import numpy as np
import sqlite3
import pickle
import copy
import json
import math
import pandas as pd
import os

In [2]:
# import the sqlite3 database and create a cursor
db_path = os.path.expanduser("~/data/db/arxiv_db_images.sqlite3")
db = sqlite3.connect(db_path)
c = db.cursor()

In [3]:
c.execute('PRAGMA TABLE_INFO({})'.format("metadata"))
info = c.fetchall()

print("\nColumn Info:\nID, Name, Type, NotNull, DefaultVal, PrimaryKey")
for col in info:
    print(col)


Column Info:
ID, Name, Type, NotNull, DefaultVal, PrimaryKey
(0, 'id', 'INTEGER', 0, None, 1)
(1, 'identifier', 'TEXT', 0, None, 0)
(2, 'created', 'TEXT', 0, None, 0)
(3, 'cat', 'TEXT', 0, None, 0)
(4, 'authors', 'TEXT', 0, None, 0)
(5, 'title', 'TEXT', 0, None, 0)
(6, 'abstract', 'TEXT', 0, None, 0)
(7, 'licence', 'TEXT', 0, None, 0)


## Build category lists

First get a full list of all the primary categories by querying the SQLite database. Used to do later queries. Select a sort_mode first.

In [4]:
# `catlist` refers to categories by article

print("pulling all categories by total number of articles")
c.execute('''
    SELECT substr(trim(metadata.cat),1,instr(trim(metadata.cat)||' ',' ')-1), count(metadata.identifier)
    FROM metadata
    GROUP BY substr(trim(metadata.cat),1,instr(trim(metadata.cat)||' ',' ')-1)
    ORDER BY count(metadata.identifier) DESC    
    ''')
rows = c.fetchall()
articles_catlist = rows
# uncomment if you want to use article count to order categories
# catlist = rows

print("pulling all categories by total number of images")
c.execute('''
    SELECT substr(trim(metadata.cat),1,instr(trim(metadata.cat)||' ',' ')-1), count(images.identifier)
    FROM images
    LEFT JOIN metadata ON images.identifier = metadata.identifier 
    GROUP BY substr(trim(metadata.cat),1,instr(trim(metadata.cat)||' ',' ')-1)
    ORDER BY count(images.identifier) DESC    
''')   
rows = c.fetchall()
images_catlist = rows

print("pulling all categories in alphabetical order")
c.execute('''
    SELECT substr(trim(metadata.cat),1,instr(trim(metadata.cat)||' ',' ')-1), count(metadata.identifier)
    FROM metadata
    WHERE strftime("%Y", metadata.created) != '2019'
    AND strftime("%Y", metadata.created) != '2020'
    GROUP BY substr(trim(metadata.cat),1,instr(trim(metadata.cat)||' ',' ')-1)
    ORDER BY substr(trim(metadata.cat),1,instr(trim(metadata.cat)||' ',' ')-1) ASC
''')
rows = c.fetchall()
alpha_catlist = rows
catlist = rows
print("done")

pulling all categories by total number of articles
pulling all categories by total number of images
pulling all categories in alphabetical order
done


In [5]:
# optionally only use category if it contains a certain number of articles or images

# articles_list = []
# for cat, n in rows:
#     if n > 10000:
#         articles_list.append((cat, n))
# print(len(articles_list))

In [6]:
print(f'The list of categories totals {len(catlist)}. Here are the first entries:')
print(catlist[:8])

The list of categories totals 172. Here are the first entries:
[('acc-phys', 47), ('adap-org', 306), ('alg-geom', 1209), ('ao-sci', 13), ('astro-ph', 94246), ('astro-ph.CO', 28434), ('astro-ph.EP', 11646), ('astro-ph.GA', 24745)]


## Finding the change in rank between number of articles and number of images

In [7]:
# structure
# [category, article-rank, images-rank, rank-difference]

ordering = [[cat[0], count, 0, 0] for count, cat in enumerate(articles_catlist)]
print(ordering)

[['hep-ph', 0, 0, 0], ['astro-ph', 1, 0, 0], ['hep-th', 2, 0, 0], ['quant-ph', 3, 0, 0], ['cond-mat.mes-hall', 4, 0, 0], ['gr-qc', 5, 0, 0], ['cond-mat.mtrl-sci', 6, 0, 0], ['cond-mat.str-el', 7, 0, 0], ['cs.CV', 8, 0, 0], ['cond-mat.stat-mech', 9, 0, 0], ['astro-ph.SR', 10, 0, 0], ['math.AP', 11, 0, 0], ['astro-ph.CO', 12, 0, 0], ['math.CO', 13, 0, 0], ['astro-ph.GA', 14, 0, 0], ['math.PR', 15, 0, 0], ['nucl-th', 16, 0, 0], ['cs.LG', 17, 0, 0], ['math.AG', 18, 0, 0], ['math-ph', 19, 0, 0], ['astro-ph.HE', 20, 0, 0], ['cond-mat.supr-con', 21, 0, 0], ['cs.IT', 22, 0, 0], ['math.NT', 23, 0, 0], ['math.DG', 24, 0, 0], ['cond-mat.soft', 25, 0, 0], ['hep-ex', 26, 0, 0], ['physics.optics', 27, 0, 0], ['math.OC', 28, 0, 0], ['math.NA', 29, 0, 0], ['math.DS', 30, 0, 0], ['hep-lat', 31, 0, 0], ['cs.CL', 32, 0, 0], ['astro-ph.EP', 33, 0, 0], ['math.FA', 34, 0, 0], ['astro-ph.IM', 35, 0, 0], ['math.GT', 36, 0, 0], ['math.CA', 37, 0, 0], ['math.RT', 38, 0, 0], ['cond-mat', 39, 0, 0], ['physics.flu

In [8]:
# find the difference in category list when ordering by article numbers vs image numbers
acount = 0
for ac in articles_catlist:
#     print("articles:",acount)
    icount = 0
    for ic in images_catlist:
        if ac[0] == ic[0]:
#             print("match:",ac[0],ic[0])
            ordering[acount][2] = icount
        icount += 1 
    acount += 1

In [9]:
# go through and modify the rank-difference
for count, row in enumerate(ordering):
    ordering[count][3] = row[1] - row[2]

In [10]:
# [category, article-rank, images-rank, change]
for cat in ordering:
    print(cat)

['hep-ph', 0, 1, -1]
['astro-ph', 1, 2, -1]
['hep-th', 2, 8, -6]
['quant-ph', 3, 7, -4]
['cond-mat.mes-hall', 4, 10, -6]
['gr-qc', 5, 14, -9]
['cond-mat.mtrl-sci', 6, 17, -11]
['cond-mat.str-el', 7, 11, -4]
['cs.CV', 8, 0, 8]
['cond-mat.stat-mech', 9, 15, -6]
['astro-ph.SR', 10, 5, 5]
['math.AP', 11, 49, -38]
['astro-ph.CO', 12, 6, 6]
['math.CO', 13, 34, -21]
['astro-ph.GA', 14, 3, 11]
['math.PR', 15, 51, -36]
['nucl-th', 16, 16, 0]
['cs.LG', 17, 4, 13]
['math.AG', 18, 68, -50]
['math-ph', 19, 44, -25]
['astro-ph.HE', 20, 9, 11]
['cond-mat.supr-con', 21, 25, -4]
['cs.IT', 22, 20, 2]
['math.NT', 23, 106, -83]
['math.DG', 24, 73, -49]
['cond-mat.soft', 25, 21, 4]
['hep-ex', 26, 13, 13]
['physics.optics', 27, 35, -8]
['math.OC', 28, 26, 2]
['math.NA', 29, 12, 17]
['math.DS', 30, 42, -12]
['hep-lat', 31, 24, 7]
['cs.CL', 32, 32, 0]
['astro-ph.EP', 33, 18, 15]
['math.FA', 34, 122, -88]
['astro-ph.IM', 35, 22, 13]
['math.GT', 36, 27, 9]
['math.CA', 37, 102, -65]
['math.RT', 38, 108, -70]
['c

## Generating Plots

### Article data

Then use that list of primary categories to query the database for how many articles per year. Store it in the `articles_data` variable.

In [11]:
# by category
# number of articles for each year

sql = ('''
    SELECT count(metadata.identifier), strftime("%Y", metadata.created) as 'Y'
    FROM metadata
    WHERE substr(trim(cat),1,instr(trim(cat)||' ',' ')-1) = ?
    AND strftime("%Y", metadata.created) != '2019'
    AND strftime("%Y", metadata.created) != '2020'
    GROUP BY strftime("%Y", metadata.created)
    ORDER BY strftime("%Y", metadata.created) ASC
    ''')

data = []

for cat in catlist:
    print("querying for category: " + str(cat[0]))
    c.execute(sql, (cat[0], ))
    rows = c.fetchall()
    print(rows)

    years = []
    totals = []
    
    for row in rows:
        years.append(row[1])
        totals.append(row[0])
        
    newdata = [cat[0], years, totals]
    data.append(newdata)

print("*" * 20)
print("done")

article_data = data

# WRITE PKL

filename = "articles_cat_year_data.pkl"
with open(filename, "wb") as write_file:
    pickle.dump(article_data, write_file)

querying for category: acc-phys
[(4, '1994'), (21, '1995'), (22, '1996')]
querying for category: adap-org
[(19, '1993'), (28, '1994'), (58, '1995'), (33, '1996'), (55, '1997'), (48, '1998'), (65, '1999')]
querying for category: alg-geom
[(70, '1992'), (105, '1993'), (180, '1994'), (239, '1995'), (294, '1996'), (321, '1997')]
querying for category: ao-sci
[(8, '1995'), (5, '1996')]
querying for category: astro-ph
[(59, '1992'), (490, '1993'), (1028, '1994'), (1659, '1995'), (2418, '1996'), (3576, '1997'), (4746, '1998'), (5637, '1999'), (6348, '2000'), (6799, '2001'), (7033, '2002'), (7893, '2003'), (8243, '2004'), (8754, '2005'), (9277, '2006'), (10164, '2007'), (10122, '2008')]
querying for category: astro-ph.CO
[(3922, '2009'), (4061, '2010'), (4033, '2011'), (3918, '2012'), (3737, '2013'), (1993, '2014'), (1782, '2015'), (1650, '2016'), (1664, '2017'), (1674, '2018')]
querying for category: astro-ph.EP
[(753, '2009'), (857, '2010'), (911, '2011'), (954, '2012'), (1100, '2013'), (122

KeyboardInterrupt: 

In [None]:
print(data)

In [12]:
# READ PKL

filename = "articles_cat_year_data.pkl"
with open(filename, "rb") as read_file:
    article_data = pickle.load(read_file)

In [13]:
print(f'length of article_data: {len(article_data)}')
print(article_data[:])

length of article_data: 172
[['acc-phys', ['1994', '1995', '1996'], [4, 21, 22]], ['adap-org', ['1993', '1994', '1995', '1996', '1997', '1998', '1999'], [19, 28, 58, 33, 55, 48, 65]], ['alg-geom', ['1992', '1993', '1994', '1995', '1996', '1997'], [70, 105, 180, 239, 294, 321]], ['ao-sci', ['1995', '1996'], [8, 5]], ['astro-ph', ['1992', '1993', '1994', '1995', '1996', '1997', '1998', '1999', '2000', '2001', '2002', '2003', '2004', '2005', '2006', '2007', '2008'], [59, 490, 1028, 1659, 2418, 3576, 4746, 5637, 6348, 6799, 7033, 7893, 8243, 8754, 9277, 10164, 10122]], ['astro-ph.CO', ['2009', '2010', '2011', '2012', '2013', '2014', '2015', '2016', '2017', '2018', '2019', '2020'], [3922, 4061, 4033, 3918, 3737, 1993, 1782, 1650, 1664, 1674, 1834, 1115]], ['astro-ph.EP', ['2009', '2010', '2011', '2012', '2013', '2014', '2015', '2016', '2017', '2018', '2019', '2020'], [753, 857, 911, 954, 1100, 1229, 1279, 1409, 1493, 1661, 1851, 1073]], ['astro-ph.GA', ['2008', '2009', '2010', '2011', '2012

### Image data

Pull number of images in each year. Store in `images_data`.

In [None]:
# by category
# total number of images for each year

image_pkl_filename = "images_cat_year.pkl"

sql = ('''
    SELECT count(images.identifier), strftime("%Y", metadata.created) as 'Y'
    FROM images
    LEFT JOIN metadata on images.identifier = metadata.identifier
    WHERE substr(trim(cat),1,instr(trim(cat)||' ',' ')-1) = ?
    AND strftime("%Y", metadata.created) != '2019'
    AND strftime("%Y", metadata.created) != '2020'
    GROUP BY strftime("%Y", metadata.created)
    ORDER BY strftime("%Y", metadata.created) ASC
    ''')

data = []

for cat in catlist:
    print("querying for category: " + str(cat[0]))
    c.execute(sql, (cat[0], ))
    rows = c.fetchall()
    print(rows)
    
    years = []
    totals = []
    
    for row in rows:
        years.append(row[1])
        totals.append(row[0])
        
    newdata = [cat[0], years, totals]
    data.append(newdata)

print("*" * 20)
print("done")

# WRITE
image_pkl_filename = "images_cat_year_data.pkl"
with open(image_pkl_filename, "wb") as write_file:
    pickle.dump(image_data, write_file)

In [None]:
print(len(image_data))

In [14]:
# READ PKL
image_pkl_filename = "images_cat_year_data.pkl"
with open(image_pkl_filename, "rb") as read_file:
    image_data = pickle.load(read_file)

In [15]:
print(f'length of image_data: {len(image_data)}')
print(image_data[:10])

length of image_data: 171
[['acc-phys', ['1994', '1995', '1996'], [6, 16, 97]], ['adap-org', ['1993', '1994', '1995', '1996', '1997', '1998', '1999'], [42, 32, 91, 84, 267, 354, 437]], ['alg-geom', ['1993', '1994', '1995', '1996', '1997'], [1, 10, 31, 94, 283]], ['ao-sci', ['1995', '1996'], [53, 15]], ['astro-ph', ['1993', '1994', '1995', '1996', '1997', '1998', '1999', '2000', '2001', '2002', '2003', '2004', '2005', '2006', '2007', '2008'], [671, 1851, 3602, 12800, 21683, 29362, 36265, 42560, 48630, 52680, 60883, 67104, 76491, 84635, 96412, 107300]], ['astro-ph.CO', ['2009', '2010', '2011', '2012', '2013', '2014', '2015', '2016', '2017', '2018'], [48589, 53462, 55708, 55415, 57661, 25631, 26535, 23871, 23439, 24589]], ['astro-ph.EP', ['2009', '2010', '2011', '2012', '2013', '2014', '2015', '2016', '2017', '2018'], [6367, 8697, 9815, 12149, 12765, 14176, 16185, 19045, 20954, 23530]], ['astro-ph.GA', ['2008', '2009', '2010', '2011', '2012', '2013', '2014', '2015', '2016', '2017', '2018'

### Get percentages

In [16]:
# this code copies the article data then modifies it by 
# dividing the total number of images by the number of articles

average_data = copy.deepcopy(article_data)

for count, article in enumerate(average_data):
    
    # set all values to zero
    for index, val in enumerate(average_data[count][2]):
        average_data[count][2][index] = 0

#     average_data[count][2]
    for i, year in enumerate(image_data[count][1]):
        try:
            print("*" * 20)
            print(f'count {count} | year {year} | i {i}')
            print(article_data[count][1])
            listindex = article_data[count][1].index(str(year)) 
            print("listindex:",listindex)
            print("count:",count)
            print("i:",i)
            print("no images:",image_data[count][2][i])
            print("no articles:",article_data[count][2][listindex])
            print("average:",image_data[count][2][i] / article_data[count][2][listindex])
            average_data[count][2][listindex] = image_data[count][2][i] / article_data[count][2][listindex]
        except ValueError:
            print("!" * 20)
            print("didn't find index")

********************
count 0 | year 1994 | i 0
['1994', '1995', '1996']
listindex: 0
count: 0
i: 0
no images: 6
no articles: 4
average: 1.5
********************
count 0 | year 1995 | i 1
['1994', '1995', '1996']
listindex: 1
count: 0
i: 1
no images: 16
no articles: 21
average: 0.7619047619047619
********************
count 0 | year 1996 | i 2
['1994', '1995', '1996']
listindex: 2
count: 0
i: 2
no images: 97
no articles: 22
average: 4.409090909090909
********************
count 1 | year 1993 | i 0
['1993', '1994', '1995', '1996', '1997', '1998', '1999']
listindex: 0
count: 1
i: 0
no images: 42
no articles: 19
average: 2.210526315789474
********************
count 1 | year 1994 | i 1
['1993', '1994', '1995', '1996', '1997', '1998', '1999']
listindex: 1
count: 1
i: 1
no images: 32
no articles: 28
average: 1.1428571428571428
********************
count 1 | year 1995 | i 2
['1993', '1994', '1995', '1996', '1997', '1998', '1999']
listindex: 2
count: 1
i: 2
no images: 91
no articles: 58
average: 

i: 24
no images: 7412
no articles: 1142
average: 6.490367775831874
********************
count 27 | year 1998 | i 0
['1993', '1994', '1995', '1996', '1997', '1998', '1999', '2000', '2001', '2002', '2003', '2004', '2005', '2006', '2007', '2008', '2009', '2010', '2011', '2012', '2013', '2014', '2015', '2016', '2017', '2018', '2019', '2020']
listindex: 5
count: 27
i: 0
no images: 12
no articles: 16
average: 0.75
********************
count 27 | year 1999 | i 1
['1993', '1994', '1995', '1996', '1997', '1998', '1999', '2000', '2001', '2002', '2003', '2004', '2005', '2006', '2007', '2008', '2009', '2010', '2011', '2012', '2013', '2014', '2015', '2016', '2017', '2018', '2019', '2020']
listindex: 6
count: 27
i: 1
no images: 9
no articles: 9
average: 1.0
********************
count 27 | year 2000 | i 2
['1993', '1994', '1995', '1996', '1997', '1998', '1999', '2000', '2001', '2002', '2003', '2004', '2005', '2006', '2007', '2008', '2009', '2010', '2011', '2012', '2013', '2014', '2015', '2016', '2017

no images: 2182
no articles: 488
average: 4.471311475409836
********************
count 40 | year 2012 | i 13
['1991', '1992', '1996', '1998', '1999', '2000', '2001', '2002', '2003', '2004', '2005', '2006', '2007', '2008', '2009', '2010', '2011', '2012', '2013', '2014', '2015', '2016', '2017', '2018', '2019', '2020']
listindex: 17
count: 40
i: 13
no images: 2303
no articles: 565
average: 4.076106194690266
********************
count 40 | year 2013 | i 14
['1991', '1992', '1996', '1998', '1999', '2000', '2001', '2002', '2003', '2004', '2005', '2006', '2007', '2008', '2009', '2010', '2011', '2012', '2013', '2014', '2015', '2016', '2017', '2018', '2019', '2020']
listindex: 18
count: 40
i: 14
no images: 3166
no articles: 704
average: 4.497159090909091
********************
count 40 | year 2014 | i 15
['1991', '1992', '1996', '1998', '1999', '2000', '2001', '2002', '2003', '2004', '2005', '2006', '2007', '2008', '2009', '2010', '2011', '2012', '2013', '2014', '2015', '2016', '2017', '2018', '2

count: 59
i: 7
no images: 172
no articles: 19
average: 9.052631578947368
********************
count 59 | year 2009 | i 8
['2000', '2001', '2002', '2003', '2004', '2005', '2006', '2007', '2008', '2009', '2010', '2011', '2012', '2013', '2014', '2015', '2016', '2017', '2018', '2019', '2020']
listindex: 9
count: 59
i: 8
no images: 109
no articles: 20
average: 5.45
********************
count 59 | year 2010 | i 9
['2000', '2001', '2002', '2003', '2004', '2005', '2006', '2007', '2008', '2009', '2010', '2011', '2012', '2013', '2014', '2015', '2016', '2017', '2018', '2019', '2020']
listindex: 10
count: 59
i: 9
no images: 439
no articles: 37
average: 11.864864864864865
********************
count 59 | year 2011 | i 10
['2000', '2001', '2002', '2003', '2004', '2005', '2006', '2007', '2008', '2009', '2010', '2011', '2012', '2013', '2014', '2015', '2016', '2017', '2018', '2019', '2020']
listindex: 11
count: 59
i: 10
no images: 275
no articles: 32
average: 8.59375
********************
count 59 | year

['1992', '1993', '1994', '1996', '1997', '1998', '1999', '2000', '2001', '2002', '2003', '2004', '2005', '2006', '2007', '2008', '2009', '2010', '2011', '2012', '2013', '2014', '2015', '2016', '2017', '2018', '2019', '2020']
listindex: 10
count: 83
i: 6
no images: 143
no articles: 685
average: 0.20875912408759123
********************
count 83 | year 2004 | i 7
['1992', '1993', '1994', '1996', '1997', '1998', '1999', '2000', '2001', '2002', '2003', '2004', '2005', '2006', '2007', '2008', '2009', '2010', '2011', '2012', '2013', '2014', '2015', '2016', '2017', '2018', '2019', '2020']
listindex: 11
count: 83
i: 7
no images: 172
no articles: 698
average: 0.24641833810888253
********************
count 83 | year 2005 | i 8
['1992', '1993', '1994', '1996', '1997', '1998', '1999', '2000', '2001', '2002', '2003', '2004', '2005', '2006', '2007', '2008', '2009', '2010', '2011', '2012', '2013', '2014', '2015', '2016', '2017', '2018', '2019', '2020']
listindex: 12
count: 83
i: 8
no images: 251
no ar

********************
count 98 | year 2007 | i 11
['1992', '1994', '1998', '1999', '2000', '2001', '2002', '2003', '2004', '2005', '2006', '2007', '2008', '2009', '2010', '2011', '2012', '2013', '2014', '2015', '2016', '2017', '2018', '2019', '2020']
listindex: 11
count: 98
i: 11
no images: 94
no articles: 122
average: 0.7704918032786885
********************
count 98 | year 2008 | i 12
['1992', '1994', '1998', '1999', '2000', '2001', '2002', '2003', '2004', '2005', '2006', '2007', '2008', '2009', '2010', '2011', '2012', '2013', '2014', '2015', '2016', '2017', '2018', '2019', '2020']
listindex: 12
count: 98
i: 12
no images: 55
no articles: 89
average: 0.6179775280898876
********************
count 98 | year 2009 | i 13
['1992', '1994', '1998', '1999', '2000', '2001', '2002', '2003', '2004', '2005', '2006', '2007', '2008', '2009', '2010', '2011', '2012', '2013', '2014', '2015', '2016', '2017', '2018', '2019', '2020']
listindex: 13
count: 98
i: 13
no images: 49
no articles: 94
average: 0.52

no articles: 30
average: 90.56666666666666
********************
count 115 | year 2014 | i 14
['2000', '2001', '2002', '2003', '2004', '2005', '2006', '2007', '2008', '2009', '2010', '2011', '2012', '2013', '2014', '2015', '2016', '2017', '2018', '2019', '2020']
listindex: 14
count: 115
i: 14
no images: 2484
no articles: 15
average: 165.6
********************
count 115 | year 2015 | i 15
['2000', '2001', '2002', '2003', '2004', '2005', '2006', '2007', '2008', '2009', '2010', '2011', '2012', '2013', '2014', '2015', '2016', '2017', '2018', '2019', '2020']
listindex: 15
count: 115
i: 15
no images: 2908
no articles: 34
average: 85.52941176470588
********************
count 115 | year 2016 | i 16
['2000', '2001', '2002', '2003', '2004', '2005', '2006', '2007', '2008', '2009', '2010', '2011', '2012', '2013', '2014', '2015', '2016', '2017', '2018', '2019', '2020']
listindex: 16
count: 115
i: 16
no images: 3032
no articles: 20
average: 151.6
********************
count 115 | year 2017 | i 17
['20

average: 1.2666666666666666
********************
count 134 | year 2001 | i 3
['1996', '1997', '1998', '1999', '2000', '2001', '2002', '2003', '2004', '2005', '2006', '2007', '2008', '2009', '2010', '2011', '2012', '2013', '2014', '2015', '2016', '2017', '2018', '2019', '2020']
listindex: 5
count: 134
i: 3
no images: 18
no articles: 13
average: 1.3846153846153846
********************
count 134 | year 2002 | i 4
['1996', '1997', '1998', '1999', '2000', '2001', '2002', '2003', '2004', '2005', '2006', '2007', '2008', '2009', '2010', '2011', '2012', '2013', '2014', '2015', '2016', '2017', '2018', '2019', '2020']
listindex: 6
count: 134
i: 4
no images: 24
no articles: 20
average: 1.2
********************
count 134 | year 2003 | i 5
['1996', '1997', '1998', '1999', '2000', '2001', '2002', '2003', '2004', '2005', '2006', '2007', '2008', '2009', '2010', '2011', '2012', '2013', '2014', '2015', '2016', '2017', '2018', '2019', '2020']
listindex: 7
count: 134
i: 5
no images: 35
no articles: 34
aver

listindex: 15
count: 153
i: 15
no images: 574
no articles: 53
average: 10.830188679245284
********************
count 154 | year 2007 | i 0
['2003', '2004', '2005', '2006', '2007', '2008', '2009', '2010', '2011', '2012', '2013', '2014', '2015', '2016', '2017', '2018', '2019', '2020']
listindex: 4
count: 154
i: 0
no images: 15
no articles: 21
average: 0.7142857142857143
********************
count 154 | year 2008 | i 1
['2003', '2004', '2005', '2006', '2007', '2008', '2009', '2010', '2011', '2012', '2013', '2014', '2015', '2016', '2017', '2018', '2019', '2020']
listindex: 5
count: 154
i: 1
no images: 133
no articles: 27
average: 4.925925925925926
********************
count 154 | year 2009 | i 2
['2003', '2004', '2005', '2006', '2007', '2008', '2009', '2010', '2011', '2012', '2013', '2014', '2015', '2016', '2017', '2018', '2019', '2020']
listindex: 6
count: 154
i: 2
no images: 178
no articles: 11
average: 16.181818181818183
********************
count 154 | year 2010 | i 3
['2003', '2004', 

IndexError: list index out of range

In [17]:
print(average_data[:10])

[['acc-phys', ['1994', '1995', '1996'], [1.5, 0.7619047619047619, 4.409090909090909]], ['adap-org', ['1993', '1994', '1995', '1996', '1997', '1998', '1999'], [2.210526315789474, 1.1428571428571428, 1.5689655172413792, 2.5454545454545454, 4.8545454545454545, 7.375, 6.723076923076923]], ['alg-geom', ['1992', '1993', '1994', '1995', '1996', '1997'], [0, 0.009523809523809525, 0.05555555555555555, 0.1297071129707113, 0.3197278911564626, 0.881619937694704]], ['ao-sci', ['1995', '1996'], [6.625, 3.0]], ['astro-ph', ['1992', '1993', '1994', '1995', '1996', '1997', '1998', '1999', '2000', '2001', '2002', '2003', '2004', '2005', '2006', '2007', '2008'], [0, 1.3693877551020408, 1.8005836575875487, 2.171187462326703, 5.293631100082713, 6.06347874720358, 6.1866835229667085, 6.433386553131098, 6.704473850031506, 7.152522429769084, 7.4904023887388025, 7.713543646268846, 8.140725464030087, 8.737834132967786, 9.123100140131507, 9.485635576544668, 10.600671803991306]], ['astro-ph.CO', ['2009', '2010', '

In [18]:
# copy data over for plotting
data = copy.deepcopy(average_data)

## Clean data
- remove any entries of "2019" in the years and articles columns of data (don't have full data for this year).
- rewrite all entries as integers rather than strings (otherwise there will be problems when adjusting the axes)
- find the minimum and maximum for any entries, so that we can set our axes later as needed.

Data is saved as nested lists in the format
```
[
    [cat1, [year1, year2...yearX], [totat1, total2...totalY]
    [cat2, [year1, year2...yearX], [totat1, total2...totalY]
    ...
    [catZ, [year1, year2...yearX], [totat1, total2...totalY]
]
```

### Clean year entries - make sure to run this!

In [19]:
# fix year entries appearing as string by going through each and re-writing as integer

for cat in data:
    for k in enumerate(cat[1]):
#         print(cat[1][k[0]])
        cat[1][k[0]] = int(cat[1][k[0]])

#### Remove years that aren't being used in plots

In [20]:
# get the index and remove from both the year and no. article lists

temp_data = data

# list of years to remove
years_to_remove = [2020, 2019]

for i, data_row in enumerate(data):
    num_years = len(data_row[1])
    for ii, data_year in enumerate(data_row[1][::-1]):
#         print(f'data_year: {data_year}, format: {type(data_year)}')
        for y in years_to_remove:
#             print(f'y: {y}, format: {type(y)}')
            if data_year == y:
                index = num_years - ii - 1
                print(f'found entry, i: {i} | ii: {ii} | data_year: {data_year} | index {index}')
                del(temp_data[i][2][index])
                del(temp_data[i][1][index])
                print("*" * 20)
                
data[:] = temp_data

#     while "2019" in cat[1]:
#         index = cat[1].index("2019")
#         print(cat[0])
#         print(index)
#         del cat[2][index]
#         del cat[1][index]
#         print("*" * 20)

#         while "2020" in cat[1]:
#             index = cat[1].index("2020")
#             print(cat[0])
#             print(index)
#             del cat[2][index]
#             del cat[1][index]
#             print("*" * 20)

found entry, i: 5 | ii: 0 | data_year: 2020 | index 11
********************
found entry, i: 5 | ii: 1 | data_year: 2019 | index 10
********************
found entry, i: 6 | ii: 0 | data_year: 2020 | index 11
********************
found entry, i: 6 | ii: 1 | data_year: 2019 | index 10
********************
found entry, i: 7 | ii: 0 | data_year: 2020 | index 12
********************
found entry, i: 7 | ii: 1 | data_year: 2019 | index 11
********************
found entry, i: 8 | ii: 0 | data_year: 2020 | index 12
********************
found entry, i: 8 | ii: 1 | data_year: 2019 | index 11
********************
found entry, i: 9 | ii: 0 | data_year: 2020 | index 11
********************
found entry, i: 9 | ii: 1 | data_year: 2019 | index 10
********************
found entry, i: 10 | ii: 0 | data_year: 2020 | index 12
********************
found entry, i: 10 | ii: 1 | data_year: 2019 | index 11
********************
found entry, i: 18 | ii: 0 | data_year: 2020 | index 27
********************
found ent

In [21]:
# test to make sure there is still a total for each year

for cat in data:
    if len(cat[1]) != len(cat[2]):
        print("problem with category: " + cat)

In [26]:
count = 0
for d in data:
    for y in d[1]:
        if y == 2020 or y == 2019:
            count += 1
            print(f'found year {y}')
print("-" * 20)
print(f'found a total of {count} entries with year 2019 or 2020')

--------------------
found a total of 0 entries with year 2019 or 2020


### Save data 
Interim progress, to prevent having to run SQL queries again : )
Save as either json file or pickle for reloading.
First set filename

In [37]:
# save_mode = "json"
save_mode = "pkl"

# filename = "articles_cat_year." + save_mode 
# filename = "images_cat_year." + save_mode 
filename = "average_images_article_cat_year." + save_mode

print(filename)

average_images_article_cat_year.pkl


#### JSON
saves as human-readable JSON format

In [28]:
# WRITE

# with open(filename, "w") as write_file:
#     json.dump(data, write_file)

In [30]:
# READ

load_json = []

with open(filename, "r") as read_file:
    load_json = json.load(read_file)

#### pickle
Save data as a serialized file using pickle

In [38]:
#READ

load_data = []

with open(filename, "rb") as read_file:
    load_data = pickle.load(read_file)
    read_file.close()

In [None]:
# WRITE
# with open(filename, "wb") as write_file:
#     pickle.dump(data, write_file)
#     write_file.close()

#### Check data

In [39]:
data = load_data
print(load_json[:10])

[['acc-phys', [1994, 1995, 1996], [1.5, 0.7619047619047619, 4.409090909090909]], ['adap-org', [1993, 1994, 1995, 1996, 1997, 1998, 1999], [2.210526315789474, 1.1428571428571428, 1.5689655172413792, 2.5454545454545454, 4.8545454545454545, 7.375, 6.723076923076923]], ['alg-geom', [1992, 1993, 1994, 1995, 1996, 1997], [0, 0.009523809523809525, 0.05555555555555555, 0.1297071129707113, 0.3197278911564626, 0.881619937694704]], ['ao-sci', [1995, 1996], [6.625, 3.0]], ['astro-ph', [1992, 1993, 1994, 1995, 1996, 1997, 1998, 1999, 2000, 2001, 2002, 2003, 2004, 2005, 2006, 2007, 2008], [0, 1.3693877551020408, 1.8005836575875487, 2.171187462326703, 5.293631100082713, 6.06347874720358, 6.1866835229667085, 6.432245477119546, 6.704473850031506, 7.152522429769084, 7.4904023887388025, 7.713543646268846, 8.140725464030087, 8.737834132967786, 9.123100140131507, 9.485635576544668, 10.600671803991306]], ['astro-ph.CO', [2009, 2010, 2011, 2012, 2013, 2014, 2015, 2016, 2017, 2018], [12.38883222845487, 13.164

## Generate plots

Load in the imported data in the `data` variable

In [40]:
for row in data[:10]:
    print(row)

['acc-phys', [1994, 1995, 1996], [1.5, 0.7619047619047619, 4.409090909090909]]
['adap-org', [1993, 1994, 1995, 1996, 1997, 1998, 1999], [2.210526315789474, 1.1428571428571428, 1.5689655172413792, 2.5454545454545454, 4.8545454545454545, 7.375, 6.723076923076923]]
['alg-geom', [1992, 1993, 1994, 1995, 1996, 1997], [0, 0.009523809523809525, 0.05555555555555555, 0.1297071129707113, 0.3197278911564626, 0.881619937694704]]
['ao-sci', [1995, 1996], [6.625, 3.0]]
['astro-ph', [1992, 1993, 1994, 1995, 1996, 1997, 1998, 1999, 2000, 2001, 2002, 2003, 2004, 2005, 2006, 2007, 2008], [0, 1.3693877551020408, 1.8005836575875487, 2.171187462326703, 5.293631100082713, 6.06347874720358, 6.1866835229667085, 6.432245477119546, 6.704473850031506, 7.152522429769084, 7.4904023887388025, 7.713543646268846, 8.140725464030087, 8.737834132967786, 9.123100140131507, 9.485635576544668, 10.600671803991306]]
['astro-ph.CO', [2009, 2010, 2011, 2012, 2013, 2014, 2015, 2016, 2017, 2018], [12.38883222845487, 13.164737749

### Load data

#### For average number of images per article by year top-16 plot

Just get the first 16 categories from the catlist, e.g. if generating top-16 by image count.

In [88]:
data = []
selected_cats = []
for c in images_catlist[:16]:
    selected_cats.append(c[0])
for c in selected_cats:
    for d in load_data:
        if c == d[0]:
            data.append(d)
print(len(selected_cats))
print(selected_cats)

16
['cs.CV', 'hep-ph', 'astro-ph', 'astro-ph.GA', 'cs.LG', 'astro-ph.SR', 'astro-ph.CO', 'quant-ph', 'hep-th', 'astro-ph.HE', 'cond-mat.mes-hall', 'cond-mat.str-el', 'math.NA', 'hep-ex', 'gr-qc', 'cond-mat.stat-mech']


In [91]:
# full names for giving plot titles
selected_cats_full = ["Computer Science: Computer Vision",
                      "High Energy Physics - Phenomenology",
                      "Astrophysics",
                      "Astrophysics of Galaxies",
                      "Computer Science: Machine Learning",
                      "Solar and Stellar Astrophysics",
                      "Cosmology and Nongalactic Astrophysics",
                      "Quantum Physics",
                      "High Energy Physics - Theory",
                      "High Energy Astrophysical Phenomena",
                      "Mesoscale and Nanoscale Physics",
                      "Strongly Correlated Electrons",
                      "Mathematics: Numerical Analysis",
                      "High Energy Physics - Experiment",
                      "General Relativity", #  and Quantum Cosmology
                      "Condensed Matter: Statistical Mechanics"
                     ]

Or select specific categories to plot

In [86]:
selected_cats = ["hep-ph", "astro-ph", "cs.CV", "astro-ph.GA", "astro-ph.CO", "astro-ph.SR",
                "quant-ph", "hep-th", "astro-ph.HE", "cond-mat.mes-hall", "cond-mat.str-el",
                "hep-ex", "cond-mat.stat-mech", "nucl-th", "gr-qc", "cs.LG"]

data = []
for c in selected_cats:
    for d in load_data:
        if c == d[0]:
            data.append(d)
print(len(data))
print(selected_cats)

16
['hep-ph', 'astro-ph', 'cs.CV', 'astro-ph.GA', 'astro-ph.CO', 'astro-ph.SR', 'quant-ph', 'hep-th', 'astro-ph.HE', 'cond-mat.mes-hall', 'cond-mat.str-el', 'hep-ex', 'cond-mat.stat-mech', 'nucl-th', 'gr-qc', 'cs.LG']


In [90]:
for d in data:
    print(d)

['cs.CV', [1998, 1999, 2000, 2001, 2002, 2003, 2004, 2005, 2006, 2007, 2008, 2009, 2010, 2011, 2012, 2013, 2014, 2015, 2016, 2017, 2018], [9.0, 16.0, 7.4, 0.75, 2.5, 8.4375, 16.176470588235293, 6.636363636363637, 7.56, 14.131578947368421, 18.966666666666665, 9.42, 12.826446280991735, 16.010830324909747, 19.862559241706162, 22.673716012084594, 25.894160583941606, 32.72781065088758, 31.10045366169799, 26.131107491856678, 25.896034150371797]]
['hep-ph', [1992, 1993, 1994, 1995, 1996, 1997, 1998, 1999, 2000, 2001, 2002, 2003, 2004, 2005, 2006, 2007, 2008, 2009, 2010, 2011, 2012, 2013, 2014, 2015, 2016, 2017, 2018], [0.25165562913907286, 1.4387755102040816, 3.3506805444355483, 3.037742150968604, 3.909255351220983, 4.437188710570005, 4.421135646687697, 4.8400392541707555, 5.368802902055623, 5.337369420702754, 5.486448267735093, 5.882412313903608, 6.143686502177069, 6.5687675427404955, 6.867442448773084, 7.520507084265473, 7.652392344497608, 8.716900404088424, 8.737677725118484, 9.52466464733

#### Find max and min

Go through each value in the data to find the maximum and minimums for plotting

In [92]:
# get the maximums and minimums of year and no. articles for figuring out axes
minY = math.inf
maxY = -(math.inf)
minA = math.inf
maxA = -(math.inf)

for cat in data:
#     print(len(cat))
    print(cat[0])
    if min(cat[1]) < minY: minY = min(cat[1])
    if max(cat[1]) > maxY: maxY = max(cat[1])
    if min(cat[2]) < minA: minA = min(cat[2])
    if max(cat[2]) > maxA: maxA = max(cat[2])
    print("min year: " + str(min(cat[1])))
    print("max year: " + str(max(cat[1])))
    print("min articles/images: " + str(min(cat[2])))
    print("max articles/images: " + str(max(cat[2])))
    print("*" * 20)
    
print("minY: " + str(minY))
print("maxY: " + str(maxY))
print("minA: " + str(minA))
print("maxA: " + str(maxA))
              
print("done")

cs.CV
min year: 1998
max year: 2018
min articles/images: 0.75
max articles/images: 32.72781065088758
********************
hep-ph
min year: 1992
max year: 2018
min articles/images: 0.25165562913907286
max articles/images: 12.388970743001474
********************
astro-ph
min year: 1992
max year: 2008
min articles/images: 0
max articles/images: 10.600671803991306
********************
astro-ph.GA
min year: 2008
max year: 2018
min articles/images: 9.5
max articles/images: 19.13307493540052
********************
cs.LG
min year: 1998
max year: 2018
min articles/images: 0.09090909090909091
max articles/images: 17.636856368563684
********************
astro-ph.SR
min year: 2008
max year: 2018
min articles/images: 10.226545454545455
max articles/images: 15.465523156089194
********************
astro-ph.CO
min year: 2009
max year: 2018
min articles/images: 12.38883222845487
max articles/images: 15.429756489162429
********************
quant-ph
min year: 1994
max year: 2018
min articles/images: 0
max 

#### Save data in org format
Use org-friendly table format. This can be printed to console or written to a file. For posting to Github and rendered in Github markdown.

In [None]:
# write the data in an org-friendly format for posting on github
# single dimension
for cat in data:
    print("* " + cat[0])
    joined = list(zip(cat[1], cat[2]))
    #     print(joined)
    print("|-|-|")
    for j in joined:
        print('|' + str(j[0]) + "|" + str(j[1]) + "|")
    print("|-|-|")

In [93]:
# filename = "stats_images_cat_year.org"
filename = "stats_average_images_article_cat_year.org"

In [97]:
# write the data in an org-friendly format for posting on github
# with nested lists
with open(filename, "w") as write_file:
    for cat in data:
        print("* " + cat[0], file=write_file)
        joined = list(zip(cat[1], cat[2]))
        #     print(joined)
        print("|-|-|", file=write_file)
        for j in joined:
            print('|' + str(j[0]) + "|" + str(j[1]) + "|", file=write_file)
        print("|-|-|", file=write_file)
write_file.close()

#### Plotting matrix of scatterplots

Plot data in two formats
- with shared x and y axes, for comparison across data
- with individual x and y axes taken from min/max of each plot automatically, for individual trends
- finally, save as high resolution (300 dpi) image

In [100]:
bArticles = True
# bArticles = False
bAverage = True
# bAverage = False
bLog10 = False
# bLog10 = True

In [101]:
# manually set the maximum for the Y-axis to ignore large outliers
maxA = 32

#### Plot number of images per article per year, subplots for each category

In [None]:
# plot figures with shared x and y axes using the 
# min/max year/article numbers from the cleaning step

xdim = 15
ydim = 12

fig, ax = plt.subplots(ydim, xdim, sharex='col', sharey='row')
fig.set_size_inches(40, 30)

data_size = len(data)

for i in range(ydim):
    for j in range(xdim):
        idx = (i * xdim) + j
        if idx < data_size:
#             ax[i, j].plot(data[idx][1], data[idx][2], '--.')
#             ax[i, j].plot(data[idx][1], data[idx][2], '--r.')
            ax[i, j].plot(data[idx][1], data[idx][2], '--k.')
            ax[i, j].title.set_text(data[idx][0])
            # add one to the maximum year for alignment
            ax[i, j].axis([minY, maxY+1, minA, maxA])

In [None]:
# fig.savefig("plot_articles_cat_year_04.png", dpi=300)
fig.savefig("plot_images_cat_year_03.png", dpi=300)

In [None]:
# fig.savefig("plot_average_images_articles_cat_year_01.svg", dpi=300)
fig.savefig("plot_average_images_articles_cat_year_max32_01.svg", dpi=300)

In [None]:
# plot figures with individual x and y axes for the year and article/image totals
xdim = 15
ydim = 12

fig, ax = plt.subplots(ydim, xdim)
fig.subplots_adjust(hspace=0.4, wspace=0.4)
fig.set_size_inches(40, 30)

data_size = len(data)

for i in range(ydim):
    for j in range(xdim):
        idx = (i * xdim) + j
        if idx < data_size:
#             ax[i, j].plot(data[idx][1], data[idx][2], '--.')
            ax[i, j].plot(data[idx][1], data[idx][2], '--r.')
            ax[i, j].title.set_text(data[idx][0])
#             ax[i, j].axis([minY, maxY+1, minA, maxA])

In [None]:
# fig.savefig("plot_articles_cat_year_indax_01.png", dpi=300)
fig.savefig("plot_images_cat_year_indax_03.png", dpi=300)

### Additional plots

- Plot data with shared X axis from 1991-2018 but individual Y axes
- Log10 of Y axis
- Plot by individual categories

##### fixed time range, relative totals

In [None]:
# plot figures with shared x and y axes using the min/max year/article numbers from the cleaning step
# articles

xdim = 15
ydim = 12

fig, ax = plt.subplots(ydim, xdim, sharex='col')
fig.subplots_adjust(hspace=0.4, wspace=0.4)
fig.set_size_inches(40, 30)

if bArticles: fig.suptitle("arXiv relative number of articles per year between 1991 and 2018", x=0.5, y=0.92, size=28)
else: fig.suptitle("arXiv relative number of images per year between 1991 and 2018", x=0.5, y=0.92, size=28)
    
data_size = len(data)

for i in range(ydim):
    for j in range(xdim):
        idx = (i * xdim) + j
        if idx < data_size:
            if bArticles: ax[i, j].plot(data[idx][1], data[idx][2], '--.')
            else: ax[i, j].plot(data[idx][1], data[idx][2], '--r.')
            ax[i, j].title.set_text(data[idx][0])
            ax[i, j].axis([1991, 2018, 0, max(data[idx][2])])
            if bLog10: ax[i, j].set_yscale('log')

In [None]:
if bArticles: fig.savefig("plot_articles_cat_year_fixedtime.svg", dpi=300)
else: fig.savefig("plot_images_cat_year_fixedtime.svg", dpi=300)

### Plot for "Images of the arXiv" paper

#### Average number of images per article by year in each category.

Number of images published per year in each category.

In [None]:
# plot figures with shared x and y axes 
# using the min/max year/article numbers from the cleaning step

xdim = 4
ydim = 4

bLog10 = False
bArticles = False

fig, ax = plt.subplots(ydim, xdim, sharey='row') # sharex='col', 
fig.subplots_adjust(hspace=0.5, wspace=0.3)
fig.set_size_inches(16, 12)

# if bArticles: fig.suptitle("arXiv total articles per year between 1991 and 2018\nShared Axes", x=0.5, y=0.92, size=28)
# else: fig.suptitle("arXiv total images per year between 1991 and 2018\nShared Axes", x=0.5, y=0.92, size=28)
    
data_size = len(data)

for i in range(ydim):
    for j in range(xdim):
        idx = (i * xdim) + j
        if idx < data_size:
            if bArticles:
                ax[i, j].plot(data[idx][1], data[idx][2], '--.')
            else:
                ax[i, j].plot(data[idx][1], data[idx][2], '--k.')
#             ax[i, j].title.set_text(data[idx][0])
            title_string = f'{selected_cats_full[idx]}\ntotal: {(images_catlist[idx][1])}'
#             title_string = f'{selected_cats_full[idx]}'
            ax[i, j].title.set_text(title_string)
#             s = f'total: {(catlist[idx][1])}'
#             ax[i, j].text(0.025, 0.88, s, fontsize=12, transform=ax[i, j].transAxes)
            ax[i, j].axis([1991, 2018, 0, maxA])
            if bLog10: ax[i, j].set_yscale('log')

In [None]:
fig.savefig("plot_images_cat_year_indax_shareY_top16_v4.svg", dpi=300, bbox_inches='tight',
    pad_inches=0, transparent=False)
fig.savefig("plot_images_cat_year_indax_shareY_top16_v4.png", dpi=300, bbox_inches='tight',
    pad_inches=0 )

In [None]:
if bArticles: fig.savefig("plot_articles_cat_year_fixedtime_shareY.svg", dpi=300)
else: fig.savefig("plot_images_cat_year_indax_shareY.svg", dpi=300)

In [None]:
if bArticles: fig.savefig("plot_articles_cat_year_fixedtime_log10.svg", dpi=300)
else: fig.savefig("plot_images_cat_year_fixedtime_log10.svg", dpi=300)

##### log10

In [None]:
bArticles = True

In [None]:
bArticles = False

In [None]:
# plot figures with shared x and y axes using the min/max year/article numbers from the cleaning step

xdim = 15
ydim = 12

fig, ax = plt.subplots(ydim, xdim, sharex='col', sharey='row')
fig.subplots_adjust(hspace=0.4, wspace=0.4)
fig.set_size_inches(40, 30)

if bArticles: fig.suptitle("arXiv log10 of articles per year between 1991 and 2018\nShared Axes", x=0.5, y=0.92, size=28)
else: fig.suptitle("arXiv log10 of images per year between 1991 and 2018\nShared Axes", x=0.5, y=0.92, size=28)

data_size = len(data)

for i in range(ydim):
    for j in range(xdim):
        idx = (i * xdim) + j
        if idx < data_size:
            if bArticles:
                ax[i, j].plot(data[idx][1], data[idx][2], '--.')
            else:
                ax[i, j].plot(data[idx][1], data[idx][2], '--r.')
            ax[i, j].title.set_text(data[idx][0])
            ax[i, j].axis([1991, 2018, 0, maxA])

In [None]:
if bArticles: fig.savefig("plot_articles_cat_year_fixedtime_log10.svg", dpi=300)
else: fig.savefig("plot_images_cat_year_indax_log10.svg", dpi=300)

##### categories

- physics (including astro-ph, cond-mat)
- cs
- math
- q-bio
- q-fin
- stat

In [None]:
# testing for primary category search
article_count = 0
for cat in data:
    if "stat." in cat[0]:
        print(cat[0])
        article_count += 1
print(article_count)

In [None]:
print(data)

#### Grab data from only some categories

In [None]:
# get only computer science
select_data = []
for cat in data:
    if "cs." in cat[0] and "physics" not in cat[0]:
        select_data.append(cat)
print(select_data)
print(len(select_data))

data = select_data

In [None]:
# get only maths
select_data = []
for cat in data:
    if "math." in cat[0]:
        select_data.append(cat)
print(select_data)
print(len(select_data))

data = select_data

In [None]:
# get all physics related categories
select_data = []
for cat in data:
    if "ph" in cat[0] or "physics." in cat[0] or "cond-mat" in cat[0] or "nlin" in cat[0]:
        select_data.append(cat)
print(select_data)
print(len(select_data))

data = select_data

In [None]:
# get only quantitative biology
select_data = []
for cat in data:
    if "q-bio." in cat[0]:
        select_data.append(cat)
print(select_data)
print(len(select_data))

data = select_data

In [None]:
# get only quantitative finance
select_data = []
for cat in data:
    if "q-fin." in cat[0]:
        select_data.append(cat)
print(select_data)
print(len(select_data))

data = select_data

In [None]:
# get only statistics
select_data = []
for cat in data:
    if "stat." in cat[0]:
        select_data.append(cat)
print(select_data)
print(len(select_data))

data = select_data

#### set log10, category and find factors

In [None]:
bLog10 = False

In [None]:
bLog10 = True

In [None]:
# get the two factors closest to the square root

input = len(data)

test = int(math.sqrt(input))
# print(test)
while input % test != 0:
    test -= 1

xdim = max(test, int(input/test))
ydim = min(test, int(input/test))

print(xdim)
print(ydim)

In [None]:
# category = "computer science"
# category = "math"
# category = "physics"
# category = "q-bio"
# category = "q-fin"
category = "stats"

In [None]:
print(len(data))

#### Plot data

In [None]:
# plot figures with shared x and y axes using the min/max year/article numbers from the cleaning step

xdim = 3
ydim = 2

fig, ax = plt.subplots(ydim, xdim, sharex='col', sharey='row')
fig.subplots_adjust(hspace=0.4, wspace=0.4)
fig.set_size_inches(40, 30)

if bArticles: fig.suptitle("arXiv " + category + " articles per year between 1991 and 2018", x=0.5, y=0.92, size=28)
else: fig.suptitle("arXiv " + category + " images per year between 1991 and 2018", x=0.5, y=0.92, size=28)

data_size = len(data)

for i in range(ydim):
    for j in range(xdim):
        idx = (i * xdim) + j
        if idx < data_size:
            if bArticles:
                ax[i, j].plot(data[idx][1], data[idx][2], '--.')
            else:
                ax[i, j].plot(data[idx][1], data[idx][2], '--r.')
            ax[i, j].title.set_text(data[idx][0])
            ax[i, j].axis([1991, 2018, 0, maxA])
            if bLog10: ax[i, j].set_yscale('log')

In [None]:
if bArticles: fig.savefig("plot_cs_articles_year_fixedtime.svg", dpi=300)
else: fig.savefig("plot_cs_images_year_fixedtime.svg", dpi=300)

In [None]:
if bArticles: fig.savefig("plot_cs_articles_year_fixedtime_log10.svg", dpi=300)
else: fig.savefig("plot_cs_images_year_log10.svg", dpi=300)

In [None]:
if bArticles: fig.savefig("plot_" + category + "_articles_year_fixedtime.svg", dpi=300)
else: fig.savefig("plot_" + category + "_images_year.svg", dpi=300)

In [None]:
if bArticles: fig.savefig("plot_" + category + "_articles_year_fixedtime_log10.svg", dpi=300)
else: fig.savefig("plot_" + category + "_images_year_log10.svg", dpi=300)