# Stackplot

Generates a stackplot showing, for each year, the share of each image file extension among the arXiv images, from inception through the end of 2018.

## Setup

Import the required libraries, connect to the SQLite database, create a cursor, and fetch the table info.

In [None]:
import matplotlib.pyplot as plt
import matplotlib.cm as cm
import seaborn as sns
import numpy as np
import sqlite3
import pickle
import copy
import json
import math
import pandas as pd
import os

In [None]:
# Open the SQLite database file (path is relative to the user's home
# directory, expanded from "~") and create a cursor for running queries.
db_path = os.path.expanduser("~/data/db/arxiv_db_images.sqlite3")
db = sqlite3.connect(db_path)
c = db.cursor()

In [None]:
# Ask SQLite for the column description of the "metadata" table and
# print one line per column.
info = c.execute('PRAGMA TABLE_INFO({})'.format("metadata")).fetchall()

print("\nColumn Info:\nID, Name, Type, NotNull, DefaultVal, PrimaryKey")
for col in info:
    print(col)

### Generate stackplot of image formats by year

In [None]:
# Fetch every image filename together with the year its paper was created,
# keeping only papers created before 2019 (i.e. through the end of 2018).
# The year is a 4-character string, so a plain string comparison acts as an
# upper bound. Rows whose creation date is missing or unparseable produce
# NULL, which fails the comparison and is therefore excluded.

c.execute('''
    SELECT images.filename, strftime("%Y", metadata.created)
    FROM images
    LEFT JOIN metadata ON images.identifier = metadata.identifier
    WHERE strftime("%Y", metadata.created) < '2019'
    ''')
rows = c.fetchall()
print(len(rows))
print("sample:\n", rows[:3])

In [None]:
# Collect the distinct years and the distinct lower-cased file extensions
# found in `rows`, each as a sorted list.
year_set = set()
ext_set = set()

# Iterate `rows` directly (no copy) and de-duplicate with sets.
for filename, year in rows:
    year_set.add(year)
    # The extension is the text after the last "."; a filename without
    # any "." has no extension and is skipped instead of raising.
    _, dot, fileext = filename.rpartition(".")
    if dot:
        ext_set.add(fileext.lower())

years = sorted(year_set)
print(years)
exts = sorted(ext_set)
print(exts)

In [None]:
# Drop the entries we don't want as separate layers on the plot.
# Removal is by value rather than by list position, so the result does not
# depend on which extensions happen to be in the database and re-running
# this cell is harmless.
unplotted_exts = {"svg", "pstex", "jpeg", "epsf"}
exts = [ext for ext in exts if ext not in unplotted_exts]

# Drop 1988 so the x-axis starts at 1990.
# NOTE(review): assumes 1988 is the only pre-1990 year in the data --
# confirm against the printed list of years.
years = [year for year in years if year != "1988"]

In [None]:
# Sanity check: show the length and contents of each list after pruning.
for axis_values in (years, exts):
    print(len(axis_values))
    print(axis_values)

In [None]:
# Allocate the count matrix, initially all zeros:
# one row per extension, one column per year.

ext_data = np.zeros(shape=(len(exts), len(years)), dtype=float)
print(ext_data)

In [None]:
# Count images per (extension, year) into `ext_data`.
#
# Row/column positions are looked up from `exts` and `years` themselves, so
# any year that is not being plotted is skipped outright. (Comparing the
# year with `is not` against a string literal never excludes anything, and
# a fixed "year - 1990" offset would turn an earlier year into a negative
# index that silently lands in one of the last columns.)
year_index = {year: i for i, year in enumerate(years)}
ext_index = {ext: i for i, ext in enumerate(exts)}

# Fold near-identical formats into one canonical extension.
ext_aliases = {"jpeg": "jpg", "epsf": "eps", "pstex": "ps"}

for filename, year in rows:
    iyear = year_index.get(year)
    if iyear is None:
        # year is not on the plot
        continue
    # The extension is the text after the last "."; skip names without one.
    _, dot, fileext = filename.rpartition(".")
    if not dot:
        continue
    fileext = fileext.lower()
    fileext = ext_aliases.get(fileext, fileext)
    iext = ext_index.get(fileext)
    if iext is not None:
        ext_data[iext, iyear] += 1

In [None]:
# Show the filled-in count matrix (rows: extensions, columns: years).
print(ext_data)

In [None]:
# Total number of counted images per year: sum the count matrix over its
# extension axis (rows), giving one value per year column. Done with NumPy
# so the builtin `sum` is not shadowed by a loop accumulator.
sums = ext_data.sum(axis=0).tolist()

In [None]:
# Show the per-year totals.
print(sums)

In [None]:
# Express each extension's yearly count as a percentage of that year's total.
ext_data = np.array(ext_data)
sums = np.array(sums)
# Broadcasting divides every row (one per extension) by the per-year totals.
ext_data_per = (ext_data / sums) * 100

In [None]:
# Display the raw count matrix.
ext_data

In [None]:
# Display the percentage matrix.
ext_data_per

In [None]:
# Draw the stacked-area chart: x is the year, and each layer is one file
# extension's percentage share of that year's images.
fig, ax = plt.subplots(figsize=(10, 8))

# One colour per plotted extension, so the palette always matches `exts`.
pal = sns.color_palette("deep", len(exts))
ax.stackplot(years, ext_data_per, labels=exts, colors=pal, alpha=1)
ax.margins(0, 0)
ax.set_ylabel("percentage image file extensions per year")
plt.xticks(years, years, rotation=300)

# Reverse the legend entries so they read top-to-bottom in the same order
# as the stacked layers (the first extension is drawn at the bottom).
# The handles come straight from the axes, so no throwaway legend is needed.
handles, labels = ax.get_legend_handles_labels()
ax.legend(handles[::-1], labels[::-1], loc='upper left')

# Hide every second year label to keep the x-axis readable.
for label in ax.xaxis.get_ticklabels()[1::2]:
    label.set_visible(False)

In [None]:
# Write the figure to a PNG in the current working directory, cropped
# tightly to the drawn content with no padding.
fig.savefig(
    "extensions_stackplot_smaller_v5_legend_nosvg.png",
    dpi=300,
    transparent=False,
    bbox_inches='tight',
    pad_inches=0,
)