# db_queries
#### Notebook for running python sqlite3 commands for querying database

In [1]:
import matplotlib.pyplot as plt
import numpy as np

In [2]:
import sqlite3

In [3]:
# Absolute filesystem location of the SQLite database file queried by this notebook.
# NOTE(review): this is a machine-specific path ("rte" host); adjust it when running elsewhere.
db_path = "/home/rte/data/db/arxiv_db_images.sqlite3"

In [4]:
# Open a connection to the SQLite database at `db_path` and create the cursor `c`
# that the query cells below use to execute statements and fetch results.
# NOTE(review): sqlite3.connect() creates a new empty database when the file is missing,
# so a wrong path surfaces later as "no such table" rather than here.

db = sqlite3.connect(db_path)
c = db.cursor()

Get the pragma table info for each table

In [5]:
# Show the column definitions of the metadata table, one tuple per column.
table_name = "metadata"
info = c.execute('PRAGMA TABLE_INFO({})'.format(table_name)).fetchall()

print("\nColumn Info:\nID, Name, Type, NotNull, DefaultVal, PrimaryKey")
for column in info:
    print(column)


Column Info:
ID, Name, Type, NotNull, DefaultVal, PrimaryKey
(0, 'id', 'INTEGER', 0, None, 1)
(1, 'identifier', 'TEXT', 0, None, 0)
(2, 'created', 'TEXT', 0, None, 0)
(3, 'cat', 'TEXT', 0, None, 0)
(4, 'authors', 'TEXT', 0, None, 0)
(5, 'title', 'TEXT', 0, None, 0)
(6, 'abstract', 'TEXT', 0, None, 0)
(7, 'licence', 'TEXT', 0, None, 0)


In [6]:
# Show the column definitions of the images table, one tuple per column.
table_name = "images"
info = c.execute('PRAGMA TABLE_INFO({})'.format(table_name)).fetchall()

print("\nColumn Info:\nID, Name, Type, NotNull, DefaultVal, PrimaryKey")
for column in info:
    print(column)


Column Info:
ID, Name, Type, NotNull, DefaultVal, PrimaryKey
(0, 'id', 'INTEGER', 0, None, 1)
(1, 'identifier', 'TEXT', 0, None, 0)
(2, 'filename', 'TEXT', 0, None, 0)
(3, 'filesize', 'INT', 0, None, 0)
(4, 'path', 'TEXT', 0, None, 0)
(5, 'x', 'INT', 0, None, 0)
(6, 'y', 'INT', 0, None, 0)
(7, 'imageformat', 'TEXT', 0, None, 0)


Quick test to retrieve one row

In [None]:
# Sanity check: fetch and print the images row with the lowest ROWID.
query = '''
    SELECT * 
    FROM images 
    ORDER BY ROWID ASC 
    LIMIT 1
'''
rows = c.execute(query).fetchall()
for record in rows:
    print(record)

In [None]:
# Sanity check: fetch and print the metadata row with the lowest ROWID.
query = '''
    SELECT * 
    FROM metadata 
    ORDER BY ROWID ASC 
    LIMIT 1
'''
rows = c.execute(query).fetchall()
for record in rows:
    print(record)

Find a few random entries, write to org mode

In [None]:
# Pick three random metadata rows and write them to an org-mode table,
# one "|field|field|...|" line per row.
c.execute('''
    SELECT *
    FROM metadata
    ORDER BY RANDOM()
    LIMIT 3
''')
rows = c.fetchall()

# The `with` block closes the file itself, so no explicit close() is needed.
with open("metadata_sample.org", "w") as write_file:
    for row in rows:
        # Newlines inside a field would break the table row, so flatten them to spaces.
        cells = [str(item).replace("\n", " ") for item in row]
        # Emitting each row as a complete line avoids the dangling "|" that the
        # previous incremental writes left after the final row.
        print("|" + "|".join(cells) + "|", file=write_file)

In [None]:
# Pick three random images rows and write them to an org-mode table,
# one "|field|field|...|" line per row.
c.execute('''
    SELECT *
    FROM images
    ORDER BY RANDOM()
    LIMIT 3
''')
rows = c.fetchall()

# The `with` block closes the file itself, so no explicit close() is needed.
with open("images_sample.org", "w") as write_file:
    for row in rows:
        # Newlines inside a field would break the table row, so flatten them to spaces.
        cells = [str(item).replace("\n", " ") for item in row]
        # Emitting each row as a complete line avoids the dangling "|" that the
        # previous incremental writes left after the final row.
        print("|" + "|".join(cells) + "|", file=write_file)

Find any duplicate identifiers

In [None]:
# List every metadata identifier that occurs more than once, together with its count.
query = '''
    SELECT identifier, COUNT(identifier)
    FROM metadata
    GROUP BY identifier
    HAVING COUNT(identifier) > 1
    '''
rows = c.execute(query).fetchall()
for record in rows:
    print(record)

In [None]:
# Count images per article identifier. The "> 0" threshold keeps every group that has
# a non-NULL identifier, so this lists image totals per article rather than duplicates only.
query = '''
    SELECT identifier, COUNT(identifier)
    FROM images
    GROUP BY identifier
    HAVING COUNT(identifier) > 0
    '''
rows = c.execute(query).fetchall()
for record in rows:
    print(record)

In [None]:
# Show the 20 articles with the smallest non-NULL `created` value
# (`created` is a TEXT column, so the ordering is textual).
query = '''
    SELECT created, identifier
    FROM metadata
    WHERE created IS NOT NULL
    ORDER BY created ASC
    LIMIT 20
    '''
rows = c.execute(query).fetchall()
for record in rows:
    print(record)

In [None]:
# Show the 20 image rows whose matching article has the smallest non-NULL `created` value,
# printing the identifier from both sides of the join.
query = '''
    SELECT images.identifier, metadata.created, metadata.identifier 
    FROM images
    LEFT JOIN metadata ON images.identifier = metadata.identifier
    WHERE metadata.created IS NOT NULL
    ORDER BY created ASC
    LIMIT 20
    '''

rows = c.execute(query).fetchall()
for record in rows:
    print(record)

In [None]:
# Total number of rows in the metadata table.
query = '''
    SELECT count(*)
    FROM metadata
    '''
rows = c.execute(query).fetchall()
for record in rows:
    print(record)

In [None]:
# Total number of rows in the images table.
query = '''
    SELECT count(*)
    FROM images
    '''
rows = c.execute(query).fetchall()
for record in rows:
    print(record)

In [None]:
# Count the images rows that were successfully identified, i.e. whose x, y and
# imageformat are all neither NULL nor the empty string.
# (The previous caption was copied from the plain row count and did not mention the filter.)

c.execute('''
    SELECT count(*)
    FROM images
    WHERE x IS NOT null AND x != ''
    AND y IS NOT null AND y != ''
    AND imageformat is not null AND imageformat != ''
    ''')
rows = c.fetchall()
# An aggregate without GROUP BY always yields exactly one row, so printing len(rows)
# (always 1) was misleading; the count itself is the single printed tuple.
for row in rows:
    print(row)

In [None]:
# Mean of the x column over all images rows (avg() skips NULL values).
query = '''
    SELECT avg(x)
    FROM images
    '''
rows = c.execute(query).fetchall()
for record in rows:
    print(record)

In [None]:
# Mean of the y column over all images rows (avg() skips NULL values).
query = '''
    SELECT avg(y)
    FROM images
    '''
rows = c.execute(query).fetchall()
for record in rows:
    print(record)

In [None]:
# Find images rows where ANY of x, y or imageformat is NULL or the empty string
# (not just x), i.e. the rows excluded by the "identified images" count.
# `rows` from this cell is reused by the next cells (error-list prototype and len()).

c.execute('''
    SELECT id, identifier, path, filename, filesize, x, y, imageformat
    FROM images
    WHERE x is null or x = ''
    OR y is null or y = ''
    OR imageformat is null or imageformat = ''
    ''')
rows = c.fetchall()
for row in rows:
    print(row)

In [None]:
# generate a text file listing paths of files that couldn't be identified
# NOTE(review): this cell is an unfinished prototype. It opens the file in append mode
# (creating it if absent) but the write is commented out, so nothing is written.
with open("identify_errors_db.txt", "a+") as f:
    # Only the first three rows are inspected.
    for row in rows[:3]:
        # Column 2 of each row, split on "." with the first piece dropped; the result is a list.
        # Presumably column 2 is `path` (as in the preceding SELECT) — confirm `rows` is still that result.
        filepath = row[2].split(".")[1:]
        print(filepath)
# NOTE(review): the disabled write below would fail as written: `filepath` is a list
# (cannot be concatenated with a str) and `image_id` is not defined in this cell.
#         f.write(filepath + "," + image_id + "\n")


In [None]:
# Number of tuples currently held in `rows`, i.e. the result of whichever query cell ran last.
print(len(rows))

In [None]:
# List images rows whose y dimension is NULL or the empty string.
query = '''
    SELECT id, identifier, path, filename, filesize, x, y, imageformat
    FROM images
    WHERE y is null or y = ''
    '''
rows = c.execute(query).fetchall()
for record in rows:
    print(record)

In [None]:
# Number of tuples currently held in `rows`, i.e. the result of whichever query cell ran last.
print(len(rows))

In [None]:
# List images rows whose identifier is NULL or the empty string.
query = '''
    SELECT id, identifier, path, filesize, x, y, imageformat
    FROM images
    WHERE identifier is null or identifier = ''
    '''
rows = c.execute(query).fetchall()
for record in rows:
    print(record)

# no result is a good result!

In [None]:
# List images rows whose filesize is NULL or the empty string.
query = '''
    SELECT id, identifier, path, filesize, x, y, imageformat
    FROM images
    WHERE filesize is null or filesize = ''
    '''
rows = c.execute(query).fetchall()
for record in rows:
    print(record)

In [None]:
# Number of tuples currently held in `rows`, i.e. the result of whichever query cell ran last.
print(len(rows))

In [None]:
# save (commit) changes
# NOTE(review): the cells visible in this notebook only run SELECT / PRAGMA statements,
# so there appears to be nothing pending to commit here — confirm before relying on it.

db.commit()

In [None]:
# Number of images per article identifier, with the article's category string,
# sorted from fewest to most images.
query = '''
    SELECT images.identifier, metadata.cat, count(images.identifier)    
    FROM images 
    LEFT JOIN metadata ON images.identifier = metadata.identifier 
    GROUP BY images.identifier
    ORDER BY count(images.identifier)
    '''
rows = c.execute(query).fetchall()
for record in rows:
    print(record)

In [None]:
# Number of images per full category string (all listed categories together),
# largest first, limited to the top 200 groups.
query = '''
    SELECT metadata.cat, count(images.identifier)
    FROM images
    LEFT JOIN metadata ON images.identifier = metadata.identifier 
    GROUP BY metadata.cat
    ORDER BY count(images.identifier) DESC    
    LIMIT 200
    '''
rows = c.execute(query).fetchall()
for record in rows:
    print(record)

In [None]:
# Number of images per primary category, largest first. The primary category is the
# text of `cat` up to the first space (the whole trimmed value when there is no space).
query = '''
    SELECT substr(trim(metadata.cat),1,instr(trim(metadata.cat)||' ',' ')-1), count(images.identifier)
    FROM images
    LEFT JOIN metadata ON images.identifier = metadata.identifier 
    GROUP BY substr(trim(metadata.cat),1,instr(trim(metadata.cat)||' ',' ')-1)
    ORDER BY count(images.identifier) DESC    
    '''
rows = c.execute(query).fetchall()
for record in rows:
    print(record)

In [None]:
# Write the current `rows` to a text file as "<first column>, <second column>" lines.
fname = "stats_images_by_category.txt"
# `with` guarantees the file is closed even if a row fails to format.
with open(fname, "w") as f:
    for row in rows:
        # Format both columns via str(): the first column is NULL (None) for images
        # without matching metadata, and None + ", " would raise TypeError.
        f.write("{}, {}\n".format(row[0], row[1]))

In [None]:
# Number of tuples currently held in `rows`, i.e. the result of whichever query cell ran last.
print(len(rows))

In [None]:
# Get total number of images for one specific primary category.
# The category to match is written as a single-quoted SQL string literal: in SQL,
# double quotes denote identifiers, and SQLite only accepts them as strings through a
# legacy fallback that can be disabled in some builds.

c.execute('''
    SELECT substr(trim(metadata.cat),1,instr(trim(metadata.cat)||' ',' ')-1), count(images.identifier)
    FROM images
    LEFT JOIN metadata ON images.identifier = metadata.identifier
    WHERE substr(trim(metadata.cat),1,instr(trim(metadata.cat)||' ',' ')-1) = 'cond-mat.mes-hall'
    GROUP BY substr(trim(metadata.cat),1,instr(trim(metadata.cat)||' ',' ')-1)
    ORDER BY count(images.identifier) DESC
    ''')
rows = c.fetchall()
for row in rows:
    print(row)

In [None]:
# List articles whose category is NULL, empty, or the literal text 'None'.
# (Author's note: this seems to give no results.)
query = '''
    SELECT identifier, cat
    FROM metadata
    WHERE cat IS NULL OR cat = '' OR cat = 'None'
    '''
rows = c.execute(query).fetchall()
for record in rows:
    print(record)

In [None]:
# Anti-join: for each image identifier that has no row in metadata, print
# (number of images, identifier).
query = '''
    SELECT count(images.identifier), images.identifier
    FROM images
    LEFT JOIN metadata ON metadata.identifier = images.identifier
    WHERE metadata.identifier IS NULL
    GROUP BY images.identifier
    '''
rows = c.execute(query).fetchall()
for record in rows:
    print(record)

In [None]:
# Number of tuples currently held in `rows`, i.e. the result of whichever query cell ran last.
print(len(rows))

In [None]:
# Sum of the first column over the current `rows` (whichever query cell ran last).
print(sum(row[0] for row in rows))

In [None]:
# List images rows that have no identifier (NULL or empty string), with their file details.
query = '''
    SELECT identifier, id, filename, filesize, path, x, y
    FROM images
    WHERE identifier IS NULL OR identifier = ''
    '''
rows = c.execute(query).fetchall()
for record in rows:
    print(record)

In [None]:
# Same check as above, but only print (identifier, id) for rows lacking an identifier.
query = '''
    SELECT identifier, id
    FROM images
    WHERE identifier IS NULL OR identifier = ''
    '''
rows = c.execute(query).fetchall()
for record in rows:
    print(record)

In [None]:
# Find number of images in each month/year, listed chronologically.
# - strftime format strings are single-quoted: double quotes denote identifiers in SQL
#   and SQLite only treats them as strings through a legacy fallback.
# - Ordering uses '%Y-%m' so months are sorted within each year (ordering by '%Y'
#   alone left the months of a year in arbitrary order).

c.execute('''
    SELECT count(images.identifier), strftime('%m-%Y', metadata.created) AS mY
    FROM images
    LEFT JOIN metadata ON images.identifier = metadata.identifier
    GROUP BY strftime('%m-%Y', metadata.created)
    ORDER BY strftime('%Y-%m', metadata.created) ASC
    ''')
rows = c.fetchall()
for row in rows:
    print(row)

In [None]:
# Find number of images by year of the article's `created` date, oldest year first.
# strftime format strings are single-quoted: double quotes denote identifiers in SQL
# and SQLite only treats them as strings through a legacy fallback.

c.execute('''
    SELECT count(images.identifier), strftime('%Y', metadata.created) AS Y
    FROM images
    LEFT JOIN metadata ON images.identifier = metadata.identifier
    GROUP BY strftime('%Y', metadata.created)
    ORDER BY strftime('%Y', metadata.created) ASC
    ''')
rows = c.fetchall()
for row in rows:
    print(row)

In [None]:
# Add up the first column of the current `rows` and show the grand total.
total = sum(record[0] for record in rows)
print(total)

In [None]:
# Find number of articles in each month/year, listed chronologically.
# - strftime format strings are single-quoted: double quotes denote identifiers in SQL
#   and SQLite only treats them as strings through a legacy fallback.
# - Ordering uses '%Y-%m' so months are sorted within each year (ordering by '%Y'
#   alone left the months of a year in arbitrary order).

c.execute('''
    SELECT count(metadata.identifier), strftime('%m-%Y', metadata.created) AS mY
    FROM metadata
    GROUP BY strftime('%m-%Y', metadata.created)
    ORDER BY strftime('%Y-%m', metadata.created) ASC
    ''')
rows = c.fetchall()
for row in rows:
    print(row)

In [None]:
# Find number of articles by year of `created`, oldest year first.
# strftime format strings are single-quoted: double quotes denote identifiers in SQL
# and SQLite only treats them as strings through a legacy fallback.

c.execute('''
    SELECT count(metadata.identifier), strftime('%Y', metadata.created) AS Y
    FROM metadata
    GROUP BY strftime('%Y', metadata.created)
    ORDER BY strftime('%Y', metadata.created) ASC
    ''')
rows = c.fetchall()
for row in rows:
    print(row)

In [None]:
# Add up the first column of the current `rows` and show the grand total.
total = sum(record[0] for record in rows)
print(total)

In [None]:
# Find number of articles per year whose primary (first-listed) category is cs.CV.
# strftime format strings are single-quoted: double quotes denote identifiers in SQL
# and SQLite only treats them as strings through a legacy fallback.

c.execute('''
    SELECT count(metadata.identifier), strftime('%Y', metadata.created) AS Y
    FROM metadata
    WHERE substr(trim(cat),1,instr(trim(cat)||' ',' ')-1) = 'cs.CV'
    GROUP BY strftime('%Y', metadata.created)
    ORDER BY strftime('%Y', metadata.created) ASC
    ''')
rows = c.fetchall()
for row in rows:
    print(row)

In [None]:
# Find number of articles created in 2018 whose primary (first-listed) category is cs.CV.
# strftime format strings are single-quoted: double quotes denote identifiers in SQL
# and SQLite only treats them as strings through a legacy fallback.

c.execute('''
    SELECT count(metadata.identifier), strftime('%Y', metadata.created) AS Y
    FROM metadata
    WHERE substr(trim(cat),1,instr(trim(cat)||' ',' ')-1) = 'cs.CV'
    AND strftime('%Y', metadata.created) = '2018'
    GROUP BY strftime('%Y', metadata.created)
    ORDER BY strftime('%Y', metadata.created) ASC
    ''')
rows = c.fetchall()
for row in rows:
    print(row)

In [None]:
# Number of tuples currently held in `rows`, i.e. the result of whichever query cell ran last.
print(len(rows))

In [None]:
# Plot the first column of the current `rows` against the row index.
values = []
for record in rows:
    values.append(record[0])
plt.plot(values)

In [None]:
# Find number of images per imageformat, most common first.
# COUNT(*) is used instead of COUNT(imageformat): COUNT(column) skips NULLs, so the
# group of images with a NULL imageformat was always reported as 0 instead of its size.

c.execute('''
    SELECT imageformat, count(*)
    FROM images
    GROUP BY imageformat
    ORDER BY count(*) DESC
    ''')
rows = c.fetchall()
for row in rows:
    print(row)

In [None]:
# Demonstrate the primary-category extraction on 20 rows: print the identifier, the
# full category string, and the part of it before the first space.
query = '''
    SELECT identifier, cat, substr(trim(cat),1,instr(trim(cat)||' ',' ')-1)
    FROM metadata
    LIMIT 20
    '''
rows = c.execute(query).fetchall()
for record in rows:
    print(record)

In [None]:
# Get the number of articles per licence.
# COUNT(*) is used instead of COUNT(licence): COUNT(column) skips NULLs, so articles
# without a licence were reported as a (None, 0) group instead of their real number.

c.execute('''
    SELECT licence, COUNT(*)
    FROM metadata
    GROUP BY licence
    HAVING COUNT(identifier) > 0
    ''')
rows = c.fetchall()
for row in rows:
    print(row)

In [None]:
# Get the number of images per licence of the owning article.
# COUNT(*) counts joined image rows; COUNT(metadata.licence) skipped NULLs, so images
# whose article has no licence (or no metadata at all) were reported as 0.

c.execute('''
    SELECT metadata.licence, COUNT(*)
    FROM images
    LEFT JOIN metadata ON images.identifier = metadata.identifier
    GROUP BY metadata.licence
    HAVING COUNT(images.identifier) > 0
    ''')
rows = c.fetchall()
for row in rows:
    print(row)

In [None]:
# Total number of rows in the images table.
query = '''
    SELECT COUNT(*)
    FROM images
    '''
rows = c.execute(query).fetchall()
for record in rows:
    print(record)

In [None]:
# Number of articles per primary category (text of `cat` before the first space),
# printed as (count, category).
query = '''
    SELECT count(substr(trim(cat),1,instr(trim(cat)||' ',' ')-1)), substr(trim(cat),1,instr(trim(cat)||' ',' ')-1)
    FROM metadata
    GROUP BY substr(trim(cat),1,instr(trim(cat)||' ',' ')-1)
    '''
rows = c.execute(query).fetchall()
for record in rows:
    print(record)

In [None]:
# Get the number of images per filename extension, printed as (extension, count).
# The previous query used SQL Server functions (left, charindex) that SQLite does not
# provide, so it could not run. This version uses SQLite's substr/instr instead.

# SQLite has no built-in reverse(); register a Python implementation on this connection.
# NULL filenames arrive as None and are passed through unchanged.
db.create_function("reverse", 1, lambda s: None if s is None else s[::-1])

# The extension is everything after the LAST dot: reverse the name, take the text
# before the first dot, and reverse that piece back.
c.execute('''
    SELECT reverse(substr(reverse(filename),1,instr(reverse(filename),'.')-1)) AS extension, COUNT(*)
    FROM images
    GROUP BY extension
    ''')
rows = c.fetchall()
for row in rows:
    print(row)

In [None]:
# Number of images per filename extension over ALL image rows (no filter on
# dimensions), printed as (count, extension).

def _reversed_text(s):
    """Return `s` back to front; registered below because SQLite lacks reverse()."""
    return s[::-1]

db.create_function("reverse", 1, _reversed_text)
query = '''
    SELECT COUNT(reverse(substr(reverse(filename),1,instr(reverse(filename),'.')-1))), reverse(substr(reverse(filename),1,instr(reverse(filename),'.')-1)) AS extension
    FROM images
    GROUP BY extension 
    '''

rows = c.execute(query).fetchall()
for record in rows:
    print(record)

In [9]:
# Number of images per filename extension, restricted to rows whose x, y and
# imageformat are all non-NULL and non-empty; printed as (count, extension).

def _reversed_text(s):
    """Return `s` back to front; registered below because SQLite lacks reverse()."""
    return s[::-1]

db.create_function("reverse", 1, _reversed_text)
query = '''
    SELECT COUNT(reverse(substr(reverse(filename),1,instr(reverse(filename),'.')-1))), reverse(substr(reverse(filename),1,instr(reverse(filename),'.')-1)) AS extension
    FROM images
    WHERE x is not null and x != ''
    AND y is not null and y != ''
    AND imageformat is not null and imageformat != ''
    GROUP BY extension
    '''

rows = c.execute(query).fetchall()
for record in rows:
    print(record)

(25131, 'EPS')
(653, 'EPSF')
(40, 'Eps')
(605, 'GIF')
(919, 'JPEG')
(7788, 'JPG')
(1386, 'PDF')
(11250, 'PNG')
(5226, 'PS')
(9, 'Ps')
(1, 'ePS')
(4197911, 'eps')
(3407, 'epsf')
(18449, 'gif')
(26162, 'jpeg')
(450583, 'jpg')
(1, 'pS')
(3297657, 'pdf')
(1065481, 'png')
(904078, 'ps')
(23922, 'pstex')
(12400, 'svg')


In [11]:
# Get the number of images per extension for articles created in 2018,
# restricted to rows whose x, y and imageformat are all non-NULL and non-empty.
# Printed as (extension, count).
# The strftime format is single-quoted: double quotes denote identifiers in SQL and
# SQLite only treats them as strings through a legacy fallback.

# SQLite has no built-in reverse(); register a Python implementation on this connection.
db.create_function("reverse", 1, lambda s: s[::-1])
c.execute('''
    SELECT reverse(substr(reverse(filename),1,instr(reverse(filename),'.')-1)) AS extension, COUNT(reverse(substr(reverse(filename),1,instr(reverse(filename),'.')-1)))
    FROM images
    LEFT JOIN metadata ON images.identifier = metadata.identifier
    WHERE x is not null and x != ''
    AND y is not null and y != ''
    AND imageformat is not null and imageformat != ''
    AND strftime('%Y', metadata.created) = '2018'
    GROUP BY extension
    ''')

rows = c.fetchall()
for row in rows:
    print(row)

('EPS', 441)
('JPEG', 254)
('JPG', 2004)
('PDF', 310)
('PNG', 4282)
('eps', 205067)
('gif', 162)
('jpeg', 6311)
('jpg', 111482)
('pdf', 690138)
('png', 334027)
('ps', 10665)
('pstex', 84)
('svg', 2660)


In [43]:
# Number of images per filename extension (rows with non-empty x, y and imageformat
# only), followed by the grand total and each extension's share of it.

def _reversed_text(s):
    """Return `s` back to front; registered below because SQLite lacks reverse()."""
    return s[::-1]

db.create_function("reverse", 1, _reversed_text)
query = '''
    SELECT reverse(substr(reverse(filename),1,instr(reverse(filename),'.')-1)) AS extension, COUNT(reverse(substr(reverse(filename),1,instr(reverse(filename),'.')-1)))
    FROM images
    LEFT JOIN metadata ON images.identifier = metadata.identifier
    WHERE x is not null and x != ''
    AND y is not null and y != ''
    AND imageformat is not null and imageformat != ''
    GROUP BY extension
    '''

rows = c.execute(query).fetchall()
for record in rows:
    print(record)

# Grand total of the per-extension counts (second column).
total = sum(record[1] for record in rows)
print("total:", total)

# One [extension, count, fraction of total] entry per extension.
alldata = [[record[0], record[1], record[1]/total] for record in rows]
for entry in alldata:
    print("{} | {} | {:2.2%}".format(entry[0], entry[1], entry[2]))

('EPS', 25131)
('EPSF', 653)
('Eps', 40)
('GIF', 605)
('JPEG', 919)
('JPG', 7788)
('PDF', 1386)
('PNG', 11250)
('PS', 5226)
('Ps', 9)
('ePS', 1)
('eps', 4197911)
('epsf', 3407)
('gif', 18449)
('jpeg', 26162)
('jpg', 450583)
('pS', 1)
('pdf', 3297657)
('png', 1065481)
('ps', 904078)
('pstex', 23922)
('svg', 12400)
total: 10053059
EPS | 25131 | 0.25%
EPSF | 653 | 0.01%
Eps | 40 | 0.00%
GIF | 605 | 0.01%
JPEG | 919 | 0.01%
JPG | 7788 | 0.08%
PDF | 1386 | 0.01%
PNG | 11250 | 0.11%
PS | 5226 | 0.05%
Ps | 9 | 0.00%
ePS | 1 | 0.00%
eps | 4197911 | 41.76%
epsf | 3407 | 0.03%
gif | 18449 | 0.18%
jpeg | 26162 | 0.26%
jpg | 450583 | 4.48%
pS | 1 | 0.00%
pdf | 3297657 | 32.80%
png | 1065481 | 10.60%
ps | 904078 | 8.99%
pstex | 23922 | 0.24%
svg | 12400 | 0.12%


In [40]:
# Get the list of years present in metadata.created, with the article count per year.
# - strftime format strings are single-quoted: double quotes denote identifiers in SQL
#   and SQLite only treats them as strings through a legacy fallback.
# - An explicit ORDER BY makes the chronological order of `years` guaranteed rather
#   than an accident of how GROUP BY happens to be evaluated.

c.execute('''
    SELECT strftime('%Y', metadata.created), COUNT(strftime('%Y', metadata.created))
    FROM metadata
    GROUP BY strftime('%Y', metadata.created)
    ORDER BY strftime('%Y', metadata.created) ASC
    ''')

# `years` collects the year strings (first column) for use by the per-year cell below.
years = []

rows = c.fetchall()
for row in rows:
    print(row)
    years.append(row[0])
print(years)

('1986', 1)
('1988', 1)
('1989', 8)
('1990', 25)
('1991', 370)
('1992', 3181)
('1993', 6728)
('1994', 10085)
('1995', 12994)
('1996', 15876)
('1997', 19621)
('1998', 24174)
('1999', 27694)
('2000', 30672)
('2001', 33127)
('2002', 36102)
('2003', 39389)
('2004', 43719)
('2005', 46863)
('2006', 50303)
('2007', 55768)
('2008', 58796)
('2009', 64077)
('2010', 70283)
('2011', 76604)
('2012', 84385)
('2013', 92864)
('2014', 97593)
('2015', 105124)
('2016', 113422)
('2017', 123750)
('2018', 140242)
('2019', 22721)
['1986', '1988', '1989', '1990', '1991', '1992', '1993', '1994', '1995', '1996', '1997', '1998', '1999', '2000', '2001', '2002', '2003', '2004', '2005', '2006', '2007', '2008', '2009', '2010', '2011', '2012', '2013', '2014', '2015', '2016', '2017', '2018', '2019']


In [41]:
# Get the number of images per extension for every year in `years`, restricted to rows
# whose x, y and imageformat are all non-NULL and non-empty. For each year this prints
# a banner, the year's total, and one "extension | count | share" line per extension.
# After the loop, `rows`, `total` and `data` hold the values of the LAST year processed.

data = []

# SQLite has no built-in reverse(); register a Python implementation on this connection.
db.create_function("reverse", 1, lambda s: s[::-1])

# The year is bound through the "?" placeholder. The strftime format is single-quoted:
# double quotes denote identifiers in SQL and SQLite only treats them as strings
# through a legacy fallback.
sql = ('''
    SELECT reverse(substr(reverse(filename),1,instr(reverse(filename),'.')-1)) AS extension, COUNT(reverse(substr(reverse(filename),1,instr(reverse(filename),'.')-1)))
    FROM images
    LEFT JOIN metadata ON images.identifier = metadata.identifier
    WHERE x is not null and x != ''
    AND y is not null and y != ''
    AND imageformat is not null and imageformat != ''
    AND strftime('%Y', metadata.created) = ?
    GROUP BY extension
    ''')

for year in years:
    print("*" * 20)
    print(year)
    print("*" * 20)
    c.execute(sql, (year, ))

    rows = c.fetchall()
    total = 0
    for row in rows:
        total += row[1]
    print("total:", total)

    # [extension, count, fraction of the year's total]; a year without images yields
    # no rows, so the division is never reached with total == 0.
    data = []
    for row in rows:
        data.append([row[0], row[1], row[1]/total])
    for d in data:
        print("{} | {} | {:2.2%}".format(d[0], d[1], d[2]))
        

********************
1986
********************
total: 0
********************
1988
********************
total: 11
eps | 10 | 90.91%
ps | 1 | 9.09%
********************
1989
********************
total: 0
********************
1990
********************
total: 130
eps | 64 | 49.23%
ps | 66 | 50.77%
********************
1991
********************
total: 146
eps | 40 | 27.40%
ps | 106 | 72.60%
********************
1992
********************
total: 942
PS | 5 | 0.53%
eps | 268 | 28.45%
ps | 669 | 71.02%
********************
1993
********************
total: 6863
PS | 9 | 0.13%
eps | 1302 | 18.97%
epsf | 6 | 0.09%
jpg | 3 | 0.04%
pdf | 4 | 0.06%
png | 2276 | 33.16%
ps | 3263 | 47.54%
********************
1994
********************
total: 19984
EPS | 22 | 0.11%
PS | 13 | 0.07%
eps | 2874 | 14.38%
epsf | 49 | 0.25%
gif | 11 | 0.06%
jpg | 1 | 0.01%
pdf | 5 | 0.03%
png | 9137 | 45.72%
ps | 7868 | 39.37%
pstex | 4 | 0.02%
********************
1995
********************
total: 26699
EPS | 7 | 0.03%
GIF | 

total: 1003563
EPS | 610 | 0.06%
JPEG | 100 | 0.01%
JPG | 965 | 0.10%
PDF | 140 | 0.01%
PNG | 1261 | 0.13%
PS | 34 | 0.00%
eps | 233264 | 23.24%
epsf | 5 | 0.00%
gif | 121 | 0.01%
jpeg | 4385 | 0.44%
jpg | 76346 | 7.61%
pS | 1 | 0.00%
pdf | 500624 | 49.88%
png | 164472 | 16.39%
ps | 19385 | 1.93%
pstex | 259 | 0.03%
svg | 1591 | 0.16%
********************
2017
********************
total: 1130570
EPS | 694 | 0.06%
JPEG | 370 | 0.03%
JPG | 1357 | 0.12%
PDF | 183 | 0.02%
PNG | 2266 | 0.20%
PS | 19 | 0.00%
eps | 211941 | 18.75%
gif | 122 | 0.01%
jpeg | 5745 | 0.51%
jpg | 80219 | 7.10%
pdf | 590684 | 52.25%
png | 219291 | 19.40%
ps | 15364 | 1.36%
pstex | 194 | 0.02%
svg | 2121 | 0.19%
********************
2018
********************
total: 1367887
EPS | 441 | 0.03%
JPEG | 254 | 0.02%
JPG | 2004 | 0.15%
PDF | 310 | 0.02%
PNG | 4282 | 0.31%
eps | 205067 | 14.99%
gif | 162 | 0.01%
jpeg | 6311 | 0.46%
jpg | 111482 | 8.15%
pdf | 690138 | 50.45%
png | 334027 | 24.42%
ps | 10665 | 0.78%
pstex | 84 

In [36]:
# Print each entry of `data` as left by whichever cell assigned it last.
# NOTE(review): the saved output below shows two-element entries, whereas the cells
# visible in this notebook build three-element entries — it looks stale; re-run to confirm.
for d in data:
    print(d)

['EPS', 0.00551675013264913]
['EPSF', 0.0006686969857756522]
['GIF', 0.0005524018578146691]
['JPEG', 7.268445497561436e-05]
['JPG', 0.0007123076587610208]
['PDF', 0.0001599058009463516]
['PNG', 0.0002471271469170888]
['PS', 0.004244772170575879]
['eps', 0.5939991713972133]
['epsf', 0.0031036262274587332]
['gif', 0.016455760606479092]
['jpeg', 0.0005305965213219849]
['jpg', 0.01103350026529826]
['pdf', 0.0006905023222683364]
['png', 0.0023404394502147827]
['ps', 0.35514351545634937]
['pstex', 0.004528241544980775]
['EPS', 0.008530474782322984]
['EPSF', 0.0001314276326597667]
['GIF', 0.00010267783801544274]
['JPG', 0.00023821258419582717]
['PDF', 4.10711352061771e-06]
['PNG', 4.517824872679481e-05]
['PS', 0.00039017578445868246]
['eps', 0.773012157056021]
['epsf', 0.00045999671430918353]
['gif', 0.0011417775587317233]
['jpeg', 0.00045999671430918353]
['jpg', 0.006526203384261541]
['pdf', 0.004530146213241334]
['png', 0.001458025299819287]
['ps', 0.18978971578774437]
['pstex', 0.013175620

In [18]:
# Express each row's count (second column) as a fraction of the overall total.

total = sum(record[1] for record in rows)
print(total)

# One [label, count, fraction] entry per row of the current `rows`.
data = [[record[0], record[1], record[1]/total] for record in rows]
print(data)

1367887
[['EPS', 441, 0.00032239505163803733], ['JPEG', 254, 0.00018568785287088774], ['JPG', 2004, 0.0014650332958789725], ['PDF', 310, 0.00022662690704714644], ['PNG', 4282, 0.003130375535406068], ['eps', 205067, 0.14991516112076508], ['gif', 162, 0.00011843083529560555], ['jpeg', 6311, 0.004613685194756584], ['jpg', 111482, 0.08149942210138703], ['pdf', 690138, 0.5045285173409791], ['png', 334027, 0.244191954452378], ['ps', 10665, 0.007796696656960699], ['pstex', 84, 6.140858126438807e-05], ['svg', 2660, 0.0019446050733722888]]


In [20]:
def takeSecond(elem):
    """Sort key: return the second item of `elem` (the count in a `data` entry)."""
    return elem[1]

# The helper used to be called takeThird although it returns the SECOND element;
# the old name is kept as an alias so existing references keep working.
takeThird = takeSecond

# Sort `data` in place by count, smallest first.
data.sort(key=takeSecond)

for d in data:
    print("{} | {} | {:2.2%}".format(d[0], d[1], d[2]))

pstex | 84 | 0.01%
gif | 162 | 0.01%
JPEG | 254 | 0.02%
PDF | 310 | 0.02%
EPS | 441 | 0.03%
JPG | 2004 | 0.15%
svg | 2660 | 0.19%
PNG | 4282 | 0.31%
jpeg | 6311 | 0.46%
ps | 10665 | 0.78%
jpg | 111482 | 8.15%
eps | 205067 | 14.99%
png | 334027 | 24.42%
pdf | 690138 | 50.45%


In [None]:
# Split the current `rows` into two parallel lists: first column -> values,
# second column -> names.
values = [record[0] for record in rows]
names = [record[1] for record in rows]

In [None]:
# Keep a backup of both lists. list(...) makes independent copies; a plain assignment
# would only alias the same list objects, so the later in-place edits (lower-casing,
# deleting entries) would silently change the "backup" as well.
values_save = list(values)
names_save = list(names)

In [None]:
# Restore both lists from the backup. Copies are handed out so that subsequent
# in-place edits of `values` / `names` cannot modify the backup itself.
values = list(values_save)
names = list(names_save)

In [None]:
# Lower-case every entry of `names`, updating the existing list object in place.
names[:] = [label.lower() for label in names]
print(names)

In [None]:
# Show both parallel lists and their lengths (the lengths should be equal).
print(values)
print(names)
print(len(values))
print(len(names))

In [None]:
# Merge entries that share the same name by summing their values.
# The previous nested-loop attempt indexed `values` with positions taken from sliced
# copies, deleted items while scanning, and stopped after its first pass, so it could
# add counts to the wrong entry and leave duplicates behind.
merged = {}
for name, value in zip(names, values):
    # dicts keep insertion order, so each name stays at its first-seen position.
    merged[name] = merged.get(name, 0) + value

names = list(merged.keys())
values = list(merged.values())

In [None]:
# Show both parallel lists and their lengths again, to compare against the earlier printout.
print(values)
print(names)
print(len(values))
print(len(names))

In [None]:
# Add up the first column of the current `rows` and show the grand total.
total = sum(record[0] for record in rows)
print(total)

In [None]:
# Write the current `rows` to an org-mode file (for display on GitHub): a heading,
# then a two-column table delimited by "|-|-|" rule lines, one row per tuple.
# The `with` block closes the file on exit, so the extra close() call was removed.
with open("stats_articles_by_cat.org", "w") as write_file:
    print("* number of articles by category", file=write_file)
    print("|-|-|", file=write_file)
    for cat in rows:
        # First two columns of each tuple, in query order.
        print('|' + str(cat[0]) + "|" + str(cat[1]) + "|", file=write_file)
    print("|-|-|", file=write_file)