In [None]:
import math
import os
import pickle
import sqlite3

import matplotlib.pyplot as plt
import numpy as np

In [None]:
# Location of the arXiv SQLite database.
# The default is the original path on rte; set the ARXIV_DB_PATH environment
# variable to point the notebook at a copy stored somewhere else.
db_path = os.environ.get("ARXIV_DB_PATH", "/home/rte/data/db/arxiv_db_images.sqlite3")

In [None]:
# Open a connection to the SQLite database file at `db_path` and create the
# cursor `c` through which the query cells below execute their SQL.

db = sqlite3.connect(db_path)
c = db.cursor()

In [None]:
# Inspect the column layout of the `metadata` table.
info = c.execute("PRAGMA TABLE_INFO(metadata)").fetchall()

# One tuple per column, in the field order given by the header line.
print("\nColumn Info:\nID, Name, Type, NotNull, DefaultVal, PrimaryKey")
for column in info:
    print(column)

In [None]:
# Inspect the column layout of the `images` table.
info = c.execute("PRAGMA TABLE_INFO(images)").fetchall()

# One tuple per column, in the field order given by the header line.
print("\nColumn Info:\nID, Name, Type, NotNull, DefaultVal, PrimaryKey")
for column in info:
    print(column)

In [None]:
# Count rows per primary category.
# The primary category is the first space-separated token of metadata.cat
# (the substr/instr expression cuts the trimmed string at its first space).
# Result rows are (primary category, count), largest count first.

sql = '''
    SELECT substr(trim(metadata.cat),1,instr(trim(metadata.cat)||' ',' ')-1), count(substr(trim(metadata.cat),1,instr(trim(metadata.cat)||' ',' ')-1))
    FROM metadata
    GROUP BY substr(trim(metadata.cat),1,instr(trim(metadata.cat)||' ',' ')-1)
    ORDER BY count(substr(trim(metadata.cat),1,instr(trim(metadata.cat)||' ',' ')-1)) DESC
    '''

cats = c.execute(sql, ()).fetchall()

In [None]:
# Fetch every distinct full category string (primary plus any secondary
# categories, exactly as stored in metadata.cat). Each result row is a 1-tuple.

sql = '''
    SELECT DISTINCT metadata.cat
    FROM metadata
    GROUP BY metadata.cat
    '''

all_cats = c.execute(sql, ()).fetchall()

In [None]:
# Number of distinct category strings, then each one (printed as a 1-tuple).
print(len(all_cats))
for cat_row in all_cats:
    print(cat_row)

In [None]:
# Collect every individual category token that occurs anywhere in a category
# string, keeping first-seen order.
# dict.fromkeys de-duplicates while preserving insertion order, so the growing
# result list is not rescanned for every token.
individual_cats = list(dict.fromkeys(
    token
    for cat in all_cats            # each row is a 1-tuple: (category string,)
    for token in cat[0].split(" ")
))
print(len(individual_cats))
for row in individual_cats:
    print(row)

There are 175 individual categories in use; 171 of them appear as a primary (first-listed) category.

In [None]:
# Size of `cats`, then every entry on its own line.
print(len(cats))
for entry in cats:
    print(entry)

In [None]:
# From here on `cats` is the flat list of individual category names (strings),
# replacing the (primary category, count) rows fetched by the earlier query.
cats = individual_cats

In [None]:
# Count how many metadata rows carry each exact category string.
# Result rows are (category string, count).
sql = '''
    SELECT metadata.cat, count(metadata.cat)
    FROM metadata
    GROUP BY metadata.cat
    '''

data = c.execute(sql, ()).fetchall()

print("number of rows: ", len(data))


In [None]:
# Preview the first 20 (category string, count) rows.
for sample in data[:20]:
    print(sample)

In [None]:
# Export the per-category totals as an org-mode table (for GitHub).
#
# Each entry of `data` is a (category string, count) pair; the table lists the
# count in the first column and the category string in the second.
# NOTE(review): this loop used to iterate over `rows`, a name that is not
# defined anywhere in this notebook. `data` is the (category, count) result it
# appears to refer to -- confirm this is the intended table.
with open("stats_all_category_totals.org", "w") as write_file:
    print("* totals of publications per categories", file=write_file)
    print("|-|-|", file=write_file)
    for category, total in data:
        print('|' + str(total) + "|" + str(category) + "|", file=write_file)
    print("|-|-|", file=write_file)
# No explicit close(): leaving the `with` block closes the file.

In [None]:
# Size of `cats`, then every entry on its own line.
print(len(cats))
for name in cats:
    print(name)

In [None]:
# Count how many times each category appears together with another category.
#
# heatmap_data[i, j] (for i > j only) is the summed count of all category
# strings in `data` that contain both cats[i] and cats[j]. The diagonal and
# the upper triangle are left at zero.
#
# Each category string is split once and only the pairs it actually contains
# are updated, rather than testing every (i, j) pair against every row.
# Assumes the entries of `cats` are unique (they come from a de-duplicated list).
cat_index = {name: position for position, name in enumerate(cats)}

heatmap_data = np.zeros((len(cats), len(cats)), dtype=int)

for cat_string, count in data:
    # Distinct indices of the known categories in this string, ascending, so
    # that every generated pair satisfies i > j.
    present = sorted({
        cat_index[token]
        for token in cat_string.split(" ")
        if token in cat_index
    })
    for rank, i in enumerate(present):
        for j in present[:rank]:
            heatmap_data[i, j] += count
        

In [None]:
# WRITE pickle
# Save the co-occurrence matrix to disk so it can be reloaded later.
# No explicit close(): leaving the `with` block closes the file.

with open("heatmap_data_175" + ".pickle", "wb") as write_file:
    pickle.dump(heatmap_data, write_file)

In [None]:
# READ pickle
# Reload the matrix saved by the WRITE cell. The file name now matches the one
# written there; the previous name, "heatmap_data.pickle", is not produced
# anywhere in this notebook.
# NOTE: pickle.load can execute arbitrary code -- only load files you created
# yourself.

with open("heatmap_data_175.pickle", "rb") as read_file:
    heatmap_data = pickle.load(read_file)

In [None]:
# testing string finding
# `in` on a list compares whole elements, so the parent category "astro-ph"
# does not match the sub-category "astro-ph.MD" and nothing is printed.
# The names are deliberately descriptive: this scratch cell must not rebind
# `c`, which is the database cursor used by the query cells.

parent_cat = "astro-ph"
candidates = ["astro-ph.MD", "something"]

if parent_cat in candidates:
    print("true")

In [None]:
# Peek at the first field (the category string) of the second row in `data`.
print(data[1][0])

In [None]:
# Sanity-check the co-occurrence matrix: its dimensions, its element type,
# and numpy's printed form of the array itself.
print(heatmap_data.shape)
print(heatmap_data.dtype)
print(heatmap_data)

In [None]:
# Print every strictly positive entry of the matrix, scanning row by row.
# The boolean mask selects the positive entries in that same row-major order.
for v in heatmap_data[heatmap_data > 0]:
    print(v)

In [None]:
# find the log of all values in the data
#
# Base-10 log of every positive count, computed in one vectorised call.
# Entries that are not positive are masked out by `where=` and keep the 0.0
# the output array starts with. A count of exactly 1 also maps to 0.0.
# The output takes its shape from heatmap_data itself, so it stays consistent
# even when the matrix was reloaded from disk.
heatmap_data_log = np.log10(
    heatmap_data,
    out=np.zeros(heatmap_data.shape, dtype=float),
    where=heatmap_data > 0,
)

In [None]:
# Natural-log version of the matrix: ln of every positive count, 0.0 elsewhere.
# The earlier attempt passed np.where(...) as the input and the data as the
# output argument; the mask belongs in `where=` and the result in a float
# `out` array.

heatmap_log = np.log(
    heatmap_data,
    out=np.zeros(heatmap_data.shape, dtype=float),
    where=heatmap_data > 0,
)

In [None]:
# attempting to sort the heatmap data so that we have top correlations first
#
# argsort works along the last axis, so `a[i]` is the column order that sorts
# row i in ascending order -- a per-row ordering, not a global ranking.

x = heatmap_data_log
a = np.argsort(x, axis=-1)

In [None]:
# Draw the log-scaled co-occurrence matrix as a heatmap, one tick per category.
fig, ax = plt.subplots(figsize=(24, 24))
im = ax.imshow(heatmap_data_log)

# Colour scale, so that colours can be read back as (log10) counts.
fig.colorbar(im, ax=ax, label="log10(co-occurrence count)")

# We want to show all ticks...
tick_positions = np.arange(len(cats))
ax.set_xticks(tick_positions)
ax.set_yticks(tick_positions)
# ... and label them with the respective list entries
ax.set_xticklabels(cats)
ax.set_yticklabels(cats)

# Rotate the tick labels and set their alignment.
plt.setp(ax.get_yticklabels(), size="small")
plt.setp(ax.get_xticklabels(), rotation=90, ha="right", size="small",
         rotation_mode="anchor", position=(-100,0))

ax.set_title("Category co-occurrence within arXiv")
plt.show()

In [None]:
# setp is called with no property arguments here -- presumably a leftover
# debugging aid for inspecting the x tick label properties (TODO confirm/remove).
plt.setp(ax.get_xticklabels())

In [None]:
# Write the current state of `fig` to "heatmap_co-category.svg"
# (a relative path, so it is resolved against the working directory).
fig.savefig("heatmap_co-category.svg", dpi=300)