In [None]:
import subprocess
import os
import sqlite3
import random

import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
from scipy.interpolate import BSpline
import scipy.interpolate as interpolate

import seaborn as sns
sns.set(style="ticks")

import joypy

In [None]:
# When True, the final cell launches the external `feh` image viewer on the matched images.
open_images = False

In [None]:
# Path of the SQLite database file that the query cell reads from.
db_path = "/home/rte/data/db/arxiv_db_images.sqlite3"

# Module-level connection and cursor shared by the following cells.
# NOTE(review): the connection is not closed anywhere in the visible cells.
db = sqlite3.connect(db_path)
c = db.cursor()

In [None]:
# Select every caption matching the LIKE pattern, joined with the metadata
# (category string and creation date) of the paper it belongs to.
sql = '''
    SELECT captions.image_ids, captions.caption, captions.fignum, metadata.cat, metadata.created
    FROM captions
    LEFT JOIN metadata ON captions.identifier = metadata.identifier
    WHERE caption LIKE ?
    '''

# The pattern is bound as a parameter rather than formatted into the SQL text.
like_pattern = "%monte carlo%"
rows = c.execute(sql, (like_pattern,)).fetchall()

print("number of rows:", len(rows))

In [None]:
# Print the category string and creation date of every matched caption.
for _image_ids, _caption, _fignum, cat_field, created_field in rows:
    print(cat_field, created_field)

In [None]:
# years = pd.period_range(start='1991-01-01', end='2018-01-01', freq='Y')
# years

In [None]:
# Year labels, as strings, for 1991 through 2018 inclusive.
years = list(map(str, range(1991, 2019)))
print(len(years))

In [None]:
# Distinct primary categories (first token of the space-separated category
# string), kept in order of first appearance.
cats = list(dict.fromkeys(row[3].split(" ")[0] for row in rows))
print(len(cats))
# cats

In [None]:
# One [primary category, year string] pair per matched caption.
data_long = [
    [row[3].split(" ")[0], row[4].split("-")[0]]
    for row in rows
]

In [None]:
# Long-format frame: one row per caption, with a categorical `cat` and an integer `year`.
df = (
    pd.DataFrame(data_long, columns=["cat", "year"])
    .astype({'cat': 'category', 'year': 'int32'})
)

In [None]:
# Preview the first rows of df.
df.head()

In [None]:
# Count matrix: one row per year label, one column per category, initialised to zero.
a = np.zeros((len(years), len(cats)), dtype='float64')
a.shape

In [None]:
# Tally caption hits into the (year, category) count matrix.
# The matrix is created in a different cell, so zero it first: otherwise
# re-running this cell would add every row a second time.
a.fill(0)

# Position lookups built once instead of scanning the lists for every row.
cat_index_of = {name: i for i, name in enumerate(cats)}
year_index_of = {label: i for i, label in enumerate(years)}

for row in rows:
    cat = row[3].split(" ")[0]
    year = row[4].split("-")[0]
    # A year outside `years` (or an unknown category) raises KeyError here.
    a[year_index_of[year], cat_index_of[cat]] += 1

In [None]:
# Spot-check: the row at year index 4 (label "1995"), for the categories from column 10 onward.
a[4:5, 10:]

In [None]:
# Wrap the count matrix in a DataFrame with one column per category.
data = pd.DataFrame(a, columns=cats)

In [None]:
# Display the category list; its order is the column order of `a` and `data`.
cats

In [None]:
# Label each row of the count matrix with its year, as the first column.
# Guarded so the cell can be re-run: DataFrame.insert raises ValueError when
# the column already exists.
if "year" not in data.columns:
    data.insert(0, "year", years)

In [None]:
# Preview the wide-format table (year column followed by one column per category).
data.head()

In [None]:
# Wide -> long: one row per (year, category) pair carrying its count.
# Reference: https://pandas.pydata.org/pandas-docs/stable/reshaping.html#reshaping-by-melt
activity = pd.melt(data, id_vars=['year'], var_name='cat', value_name='freq')

In [None]:
# Preview the last rows of the long-format (year, cat, freq) table.
activity.tail()

In [None]:
# Group the long-format counts by category so that each category's yearly
# series can be run through the spline function and recombined later.
# The melted frame's columns are year / cat / freq, so the grouping key is "cat".
activity_grouped = activity.groupby(by="cat", sort=False)
# activity_grouped.describe()

In [None]:
def to_spline(x, y, samples=300, k=4):
    """Interpolate (x, y) with a B-spline and resample it on a dense, even grid.

    Parameters
    ----------
    x, y : array-like
        Sample positions and values. Both are coerced to float arrays, so
        lists, pandas Series and numeric strings (e.g. year labels such as
        "1991") are accepted.
    samples : int, default 300
        Number of evenly spaced points between ``x.min()`` and ``x.max()``
        at which the spline is evaluated.
    k : int, default 4
        Degree of the spline passed to ``splrep``. The default keeps the
        degree this function has always used.

    Returns
    -------
    (x_new, y_new) : tuple of numpy.ndarray
        The dense x grid and the spline evaluated on it.
    """
    # Coerce up front: np.linspace and splrep both need numeric data, and
    # plain lists have no .min()/.max().
    x = np.asarray(x, dtype=float)
    y = np.asarray(y, dtype=float)

    x_new = np.linspace(x.min(), x.max(), samples)
    # s=0 requests an interpolating spline (no smoothing).
    knots, coeffs, degree = interpolate.splrep(x, y, s=0, k=k)
    y_spline = interpolate.BSpline(knots, coeffs, degree, extrapolate=False)
    y_new = y_spline(x_new)
    return (x_new, y_new)

In [None]:
# Send each category's yearly counts through the spline function and collect
# the densified curves into one long-format frame.
# The per-group frames are gathered in a list and concatenated once:
# DataFrame.append was removed in pandas 2.0 and is quadratic inside a loop.
splined_frames = []
for key, group in activity.groupby("cat", sort=False):
    # `year` holds string labels, so cast to float for the spline.
    # The counts live in the `freq` column of the melted frame.
    x, y = to_spline(group["year"].astype(float), group["freq"])

    # When interpolating, the end of a sharp curve goes below zero and then back above zero
    # While mathematically correct, it doesn't reflect my data. So cut off those small waves
    y[y < 0.5] = 0

    # Deliberately not named `df`: that name holds the cat/year frame that the
    # ridge-plot cells further down still need.
    splined_frames.append(pd.DataFrame({
        "year": x,
        "intensity": y,
        "activity": np.repeat(key, len(x))
    }))

if splined_frames:
    activity_splined = pd.concat(splined_frames, ignore_index=True)
else:
    # pd.concat rejects an empty list; keep the same schema with no rows.
    activity_splined = pd.DataFrame(columns=["year", "intensity", "activity"])

activity_splined.head()

In [None]:
# Rebuild the (cat, year) frame from data_long instead of re-casting `df`, so
# this cell does not depend on whatever `df` was last bound to by earlier cells.
df = pd.DataFrame(data_long, columns=["cat", "year"]).astype({'cat': 'category', 'year': 'int32'})

In [None]:
# Display the frame that the ridge plot below is drawn from.
df

In [None]:
# Check the column dtypes of df before plotting.
df.dtypes

In [None]:
from matplotlib import cm

# Ridge plot: one density ridge per category over the publication year.
# joyplot returns its own figure (sized through figsize=), so no separate
# plt.figure() call is made here; that would only leave an empty extra figure.
fig, axes = joypy.joyplot(
    df,
    by="cat",
    column="year",
    grid="y",
    linewidth=1,
    legend=False,
    fade=True,
    colormap=cm.Blues_r,
    title="Caption occurrences of 'monte carlo'",
    figsize=(20, 20),
)
# kind="counts", bins=30,
# range_style='own'

In [None]:
# Export the ridge plot as an SVG file; the relative path resolves against the current working directory.
fig.savefig("monte-carlo_cat_year_ridge.svg", dpi=300)

In [None]:
# random.shuffle(rows)

In [None]:
# Flatten the image_ids column into one list of individual image ids.
# "\\|" is the two-character string backslash + pipe, which is what the
# unescaped "\|" literal evaluated to; it is written out because unrecognised
# escape sequences are deprecated in Python.
# NOTE(review): confirm the database really separates ids with backslash-pipe
# rather than a bare "|".
id_separator = "\\|"

files = []
for image_ids, caption, fignum, cat, created in rows:
    print(image_ids, cat, created)
    if image_ids is not None:
        # str.split returns the whole string as a single item when the
        # separator is absent, so no separate membership check is needed.
        files.extend(image_ids.split(id_separator))

# files = [str(x[0]) + ".jpg" for x in rows[:]]
print("total number of results:", len(files))

In [None]:
if open_images:
    # Launch the feh viewer on every matched image. The file names are bare
    # "<id>.jpg" names, so feh is started inside the image directory via cwd=.
    # This replaces os.chdir(), which would change the kernel's working
    # directory for every cell run afterwards.
    cmd = ["feh"] + [name + ".jpg" for name in files]
    subprocess.run(cmd, cwd="/mnt/hd2/images/all")