Notebook for running Python sqlite3 commands to query the database

In [None]:
import sqlite3

# Absolute filesystem path of the SQLite database file; it is passed to
# sqlite3.connect() in a later cell. The path is machine-specific.
db_path = "/home/kt/rte/db/arxiv_db_images.sqlite3"

In [None]:
# def printsql():
    

Here we connect to the SQLite database and create a cursor

In [None]:
# Open a connection to the database file at `db_path`.
# NOTE(review): sqlite3.connect() creates a new empty file if the path does
# not exist, so a wrong path only shows up later as "no such table".
db = sqlite3.connect(db_path)
# Single cursor reused by the query cells below.
c = db.cursor()

Get the pragma table info for each table

In [None]:
# Inspect the schema of the `metadata` table; the result is kept in `info`.
info = c.execute('PRAGMA TABLE_INFO({})'.format("metadata")).fetchall()

# The header names the fields of each tuple printed underneath it.
print("\nColumn Info:\nID, Name, Type, NotNull, DefaultVal, PrimaryKey")
for column in info:
    print(column)

In [None]:
# Inspect the schema of the `images` table; the result is kept in `info`.
info = c.execute('PRAGMA TABLE_INFO({})'.format("images")).fetchall()

# The header names the fields of each tuple printed underneath it.
print("\nColumn Info:\nID, Name, Type, NotNull, DefaultVal, PrimaryKey")
for column in info:
    print(column)

Quick test to retrieve one row

In [None]:
# Smoke test: fetch the single `images` row with the lowest ROWID.
rows = c.execute('''
    SELECT * 
    FROM images 
    ORDER BY ROWID ASC 
    LIMIT 1
''').fetchall()
for record in rows:
    print(record)

Find any duplicate identifiers

In [None]:
# Identifiers that appear on more than one `metadata` row, with their counts.
rows = c.execute('''
    SELECT identifier, COUNT(identifier)
    FROM metadata
    GROUP BY identifier
    HAVING COUNT(identifier) > 1
    ''').fetchall()
for duplicate in rows:
    print(duplicate)

In [None]:
# Row count per identifier in `images`.
# NOTE(review): the threshold is `> 0`, so every non-NULL identifier is
# listed, not only repeated ones (the `metadata` query uses `> 1`) -- confirm
# this is intended.
rows = c.execute('''
    SELECT identifier, COUNT(identifier)
    FROM images
    GROUP BY identifier
    HAVING COUNT(identifier) > 0
    ''').fetchall()
for counted in rows:
    print(counted)

Find the earliest date of an article

In [None]:
# Earliest non-NULL `created` value in `metadata` and the identifier of the
# article it belongs to.
# LIMIT 1 keeps only the first row of the ascending sort; without it the cell
# fetched and printed every dated article instead of just the earliest one.
c.execute('''
    SELECT created, identifier
    FROM metadata
    WHERE created IS NOT NULL
    ORDER BY created ASC
    LIMIT 1
    ''')
rows = c.fetchall()
for row in rows:
    print(row)

Find total number of rows in metadata

In [82]:
# Total number of rows in `metadata`, printed as a one-element tuple.
rows = c.execute('''
    SELECT count(*)
    FROM metadata
    ''').fetchall()
for total in rows:
    print(total)

(1506177,)


Find total number of rows in images

In [83]:
# Total number of rows in `images`, printed as a one-element tuple.
rows = c.execute('''
    SELECT count(*)
    FROM images
    ''').fetchall()
for total in rows:
    print(total)

(10061232,)


Get average of the x size column

In [None]:
# Mean of the `x` column of `images`.
# SQLite's avg() skips NULLs but interprets non-numeric strings such as ''
# as 0, which would pull the mean down; empty strings are therefore excluded.
# (`x <> ''` is also not true for NULL, which avg() ignores anyway.)
c.execute('''
    SELECT avg(x)
    FROM images
    WHERE x <> ''
    ''')
rows = c.fetchall()
for row in rows:
    print(row)

Get average of the y size column

In [None]:
# Mean of the `y` column of `images`.
# SQLite's avg() skips NULLs but interprets non-numeric strings such as ''
# as 0, which would pull the mean down; empty strings are therefore excluded.
# (`y <> ''` is also not true for NULL, which avg() ignores anyway.)
c.execute('''
    SELECT avg(y)
    FROM images
    WHERE y <> ''
    ''')
rows = c.fetchall()
for row in rows:
    print(row)

Find where there are NULL values for x

In [None]:
# Image rows whose `x` is missing, i.e. NULL or an empty string.
# The result is kept in `rows` so that it can be inspected afterwards.
rows = c.execute('''
    SELECT identifier, id, path, filesize, x, y, imageformat
    FROM images
    WHERE x is null or x = ''
    ''').fetchall()
for incomplete in rows:
    print(incomplete)

In [None]:
# Number of rows currently held in `rows`, i.e. the result of whichever
# query cell was executed last.
print(len(rows))

Find where there are NULL values for identifier

In [None]:
# Image rows whose `identifier` is missing, i.e. NULL or an empty string.
rows = c.execute('''
    SELECT identifier, id, path, filesize, x, y, imageformat
    FROM images
    WHERE identifier is null or identifier = ''
    ''').fetchall()
for orphan in rows:
    print(orphan)

Get total number of images per article

In [None]:
# Image count per article identifier, together with the category of the
# joined `metadata` row, sorted from fewest to most images.
rows = c.execute('''
    SELECT images.identifier, metadata.cat, count(images.identifier)    
    FROM images 
    LEFT JOIN metadata ON images.identifier = metadata.identifier 
    GROUP BY images.identifier
    ORDER BY count(images.identifier)
    ''').fetchall()
for per_article in rows:
    print(per_article)

Get total number of images per category

In [81]:
# Image count per `metadata.cat` value, largest first, capped at 100 groups.
# Images with no matching metadata row end up in the None category.
rows = c.execute('''
    SELECT metadata.cat, count(images.identifier)
    FROM images
    LEFT JOIN metadata ON images.identifier = metadata.identifier 
    GROUP BY metadata.cat
    ORDER BY count(images.identifier) DESC    
    LIMIT 100
    ''').fetchall()
for per_category in rows:
    print(per_category)

(None, 1862732)
('cs.CV', 401985)
('hep-ph', 299442)
('astro-ph.GA', 286399)
('astro-ph.CO', 264151)
('astro-ph.SR', 258633)
('astro-ph', 170464)
('astro-ph.HE', 160563)
('quant-ph', 153522)
('math.NA', 132838)
('hep-ex', 119860)
('cond-mat.mes-hall', 117547)
('cs.IT math.IT', 109733)
('astro-ph.EP', 107692)
('cond-mat.str-el', 101510)
('hep-ph hep-ex', 98219)
('hep-th', 92500)
('cond-mat.mtrl-sci', 91265)
('physics.flu-dyn', 70619)
('gr-qc', 70218)
('nucl-th', 66227)
('cond-mat.stat-mech', 60862)
('cs.LG stat.ML', 59611)
('math.GT', 59363)
('astro-ph.IM', 58970)
('astro-ph.GA astro-ph.CO', 58936)
('cond-mat.soft', 58553)
('cs.NI', 57744)
('stat.ME', 51821)
('math.OC', 48722)
('cond-mat.supr-con', 44425)
('math.CO', 43947)
('physics.optics', 41141)
('astro-ph.SR astro-ph.GA', 40489)
('stat.ML cs.LG', 40372)
('nucl-ex', 39500)
('hep-lat', 39017)
('astro-ph.CO astro-ph.GA', 38669)
('cs.RO', 36855)
('cs.LG', 35298)
('cond-mat.quant-gas', 33831)
('cs.SE', 33765)
('physics.ins-det hep-ex', 

List the articles with no category

In [None]:
# Articles whose `cat` is NULL, an empty string, or the literal text 'None'.
rows = c.execute('''
    SELECT identifier, cat
    FROM metadata
    WHERE cat IS NULL OR cat = '' OR cat = 'None'
    ''').fetchall()
for uncategorised in rows:
    print(uncategorised)

In [None]:
# Metadata rows whose `identifier` is an empty string.
rows = c.execute('''
    SELECT id, identifier, cat, title
    FROM metadata
    WHERE identifier is ''
    ''').fetchall()
for unnamed in rows:
    print(unnamed)

In [None]:
# Find images that have no matching metadata row (first 100 only).
# The LEFT JOIN keeps every image; when no metadata row matches, the metadata
# columns come back as NULL, so filtering on `metadata.identifier IS NULL`
# keeps exactly the unmatched images. Without this filter the query simply
# returned the first 100 joined rows, matched or not.

c.execute('''
    SELECT metadata.identifier, images.identifier
    FROM images
    LEFT JOIN metadata ON images.identifier = metadata.identifier
    WHERE metadata.identifier IS NULL
    LIMIT 100
    ''')
rows = c.fetchall()
for row in rows:
    print(row)

In [102]:
# Find the number of images per month of the article's `created` date,
# printed as (count, 'MM-YYYY') in chronological order.
# Images without a usable `created` value are grouped under None.
#
# Ordering uses a 'YYYY-MM' key so months are sorted within each year; sorting
# on the year alone left the order of months inside a year unspecified.
# Format strings are single-quoted because double quotes denote identifiers
# in SQL and SQLite only accepts them as strings as a legacy fallback.

c.execute('''
    SELECT count(images.identifier), strftime('%m-%Y', metadata.created) AS mY
    FROM images
    LEFT JOIN metadata ON images.identifier = metadata.identifier
    GROUP BY strftime('%m-%Y', metadata.created)
    ORDER BY strftime('%Y-%m', metadata.created) ASC
    ''')
rows = c.fetchall()
for row in rows:
    print(row)

(1862732, None)
(169, '03-2007')
(22485, '04-2007')
(25668, '05-2007')
(24426, '06-2007')
(25360, '07-2007')
(24225, '08-2007')
(26571, '09-2007')
(31672, '10-2007')
(27463, '11-2007')
(25980, '12-2007')
(27561, '01-2008')
(25120, '02-2008')
(25970, '03-2008')
(27261, '04-2008')
(27428, '05-2008')
(28252, '06-2008')
(29978, '07-2008')
(25154, '08-2008')
(30985, '09-2008')
(35081, '10-2008')
(28507, '11-2008')
(30994, '12-2008')
(29999, '01-2009')
(27152, '02-2009')
(31566, '03-2009')
(28030, '04-2009')
(30822, '05-2009')
(34584, '06-2009')
(35045, '07-2009')
(31141, '08-2009')
(35056, '09-2009')
(36168, '10-2009')
(33965, '11-2009')
(34971, '12-2009')
(32916, '01-2010')
(30680, '02-2010')
(34933, '03-2010')
(34588, '04-2010')
(34520, '05-2010')
(37563, '06-2010')
(34320, '07-2010')
(33145, '08-2010')
(38881, '09-2010')
(39270, '10-2010')
(42457, '11-2010')
(38161, '12-2010')
(37872, '01-2011')
(35109, '02-2011')
(40708, '03-2011')
(35983, '04-2011')
(38638, '05-2011')
(40226, '06-2011'

In [108]:
# Count the images for every distinct `imageformat` value, most common first.

rows = c.execute('''
    SELECT imageformat, count(imageformat)
    FROM images
    GROUP BY imageformat
    ORDER BY count(imageformat) DESC
    ''').fetchall()
for per_format in rows:
    print(per_format)

('PS', 5149324)
('PDF', 3261411)
('PNG', 1079044)
('JPEG', 484113)
('GIF', 18742)
('PDF612', 13083)
('SVG', 12407)
('PDF595', 9874)
('', 8117)
('PS360', 1967)
('PS612', 1688)
('EPS', 1643)
('PS596', 1099)
('PDF504', 709)
('PDF360', 644)
('PDF842', 602)
('PS504', 563)
('PDF720', 537)
('PDF576', 471)
('PDF792', 344)
('PDF960', 329)
('PS595', 300)
('PS574', 236)
('PS576', 212)
('PDF1024', 182)
('PDF432', 173)
('PS567', 143)
('PDF288', 139)
('PS540', 124)
('PS468', 111)
('PDF144', 106)
('PDF596', 97)
('PS180', 96)
('PDF425', 95)
('PDF567', 90)
('PDF216', 89)
('PDF414', 89)
('PDF420', 82)
('PDF468', 76)
('PS586', 76)
('PDF252', 72)
('PDF255', 68)
('PS252', 66)
('PDF57', 63)
('PS288', 63)
('PDF2160', 62)
('PDF864', 60)
('EPT', 59)
('PS559', 58)
('PDF455', 57)
('PDF461', 55)
('PS335', 55)
('PDF71', 52)
('PDF283', 50)
('PS353', 50)
('PS453', 49)
('PS562', 49)
('PS216', 47)
('PDF29', 46)
('PDF227', 45)
('PDF454', 45)
('PS575', 45)
('PDF493', 44)
('PDF119', 42)
('PDF34', 42)
('PDF58', 42)
('PDF1