# Generate heatmap diagrams

This notebook contains experiments towards producing heatmap diagrams of the co-occurrence of categories in the papers of the arXiv repository. Work in progress.

In [None]:
import matplotlib.pyplot as plt
import numpy as np

import sqlite3

import pickle

import math

In [None]:
# Path to the arXiv SQLite database on the "rte" machine.
# NOTE(review): absolute, machine-specific path - adjust when running elsewhere.
db_path = "/home/rte/data/db/arxiv_db_images.sqlite3"

In [None]:
# Open the SQLite database at `db_path` and create a cursor on it.
# `db` is the connection; `c` is the cursor that the query cells call execute() on.

db = sqlite3.connect(db_path)
c = db.cursor()

In [None]:
# Show the column layout of the `metadata` table.
info = c.execute('PRAGMA TABLE_INFO({})'.format("metadata")).fetchall()

print("\nColumn Info:\nID, Name, Type, NotNull, DefaultVal, PrimaryKey")
for column in info:
    print(column)

In [None]:
# Show the column layout of the `images` table.
info = c.execute('PRAGMA TABLE_INFO({})'.format("images")).fetchall()

print("\nColumn Info:\nID, Name, Type, NotNull, DefaultVal, PrimaryKey")
for column in info:
    print(column)

In [None]:
# OLD WAY
# Count papers per primary category, i.e. per first space-separated entry of
# the `cat` column, most frequent first.
# Only finds the 171 categories that are used as a primary category.

sql = ('''
    SELECT substr(trim(metadata.cat),1,instr(trim(metadata.cat)||' ',' ')-1), count(substr(trim(metadata.cat),1,instr(trim(metadata.cat)||' ',' ')-1))
    FROM metadata
    GROUP BY substr(trim(metadata.cat),1,instr(trim(metadata.cat)||' ',' ')-1)
    ORDER BY count(substr(trim(metadata.cat),1,instr(trim(metadata.cat)||' ',' ')-1)) DESC
    ''')

# Each result row is (primary_category, paper_count).
cats = c.execute(sql, ()).fetchall()

In [None]:
# Number of (primary category, count) rows returned by the primary-category query.
print(len(cats))

In [None]:
# Fetch every distinct value of the `cat` column.
# Each value is a full category string, which may list several categories
# separated by spaces; it is split into single categories in a later step.

sql = ('''
    SELECT DISTINCT metadata.cat
    FROM metadata
    GROUP BY metadata.cat
    ''')

# Each result row is a 1-tuple: (category_string,).
all_cats = c.execute(sql, ()).fetchall()

In [None]:
# Show how many distinct category strings exist, then list them all.
print(len(all_cats))
for combo_row in all_cats:
    print(combo_row)

In [None]:
# Split every category string into its individual categories and collect each
# category once, in order of first appearance.
# A set gives constant-time "seen before?" checks; the list preserves the order.
individual_cats = []
seen_cats = set()
for cat in all_cats:
    for s in cat[0].split(" "):
        if s not in seen_cats:
            seen_cats.add(s)
            individual_cats.append(s)

print(len(individual_cats))
for row in individual_cats:
    print(row)

There are 175 individual categories in use, and 171 of them appear as a primary category (the first one listed).

In [None]:
# From here on `cats` is the flat list of individual category names
# (before this it held the result rows of the "OLD WAY" query).
cats = individual_cats

In [None]:
# List every individual category name, preceded by the total count.
print(len(cats))
for name in cats:
    print(name)

In [None]:
# Count how many papers carry each exact category string.
# The result is kept in `data` as rows of (category_string, paper_count).

sql = ('''
    SELECT metadata.cat, count(metadata.cat)
    FROM metadata
    GROUP BY metadata.cat
    ''')

data = c.execute(sql, ()).fetchall()

print("number of rows: ", len(data))

# Leftover notes from earlier experiments:
# c.execute(sql, (targetDate, targetDate, cat[0], )))
# LEFT JOIN metadata ON images.identifier = metadata.identifier


In [None]:
# Peek at the first 20 (category_string, paper_count) rows.
for combo_count in data[:20]:
    print(combo_count)

In [None]:
# Write the per-primary-category publication totals to an org-mode table
# (for GitHub).

# Fetch the totals here so this cell does not rely on `rows` having been
# created by running a later cell first (it would raise NameError on a fresh
# kernel). Each row is (paper_count, primary_category).
c.execute('''
    SELECT count(substr(trim(cat),1,instr(trim(cat)||' ',' ')-1)), substr(trim(cat),1,instr(trim(cat)||' ',' ')-1)
    FROM metadata
    GROUP BY substr(trim(cat),1,instr(trim(cat)||' ',' ')-1)
    ''')
rows = c.fetchall()

# The `with` block closes the file on exit, so no explicit close() is needed.
with open("stats_all_category_totals.org", "w") as write_file:
    print("* totals of publications per categories", file=write_file)
    print("|-|-|", file=write_file)
    for paper_count, primary_cat in rows:
        # Table columns: category first, then its total.
        print('|' + str(primary_cat) + "|" + str(paper_count) + "|", file=write_file)
    print("|-|-|", file=write_file)

In [None]:
# Show the number of categories and the first 20 names.
print(len(cats))
for name in cats[:20]:
    print(name)

In [None]:
# Count how many papers list each pair of categories together.
#
# heatmap_data[i][j] is the number of papers whose category string contains
# both cats[i] and cats[j]; the diagonal (i == j) is left at 0.
#
# Each row of `data` is (category_string, paper_count). The string is split
# once per row and the count is added to every ordered pair of categories it
# contains, instead of re-scanning all of `data` for every (i, j) pair.

heatmap_data = np.zeros((len(cats), len(cats)), dtype=int)

# Map each category name to the index/indices it occupies in `cats`.
cat_positions = {}
for idx, name in enumerate(cats):
    cat_positions.setdefault(name, []).append(idx)

for row in data:
    substrings = set(row[0].split(" "))
    # Indices of all known categories that occur in this category string.
    present = sorted({idx for s in substrings for idx in cat_positions.get(s, ())})
    for i in present:
        for j in present:
            if i != j:
                heatmap_data[i, j] += row[1]
        

In [None]:
# Category listings, one multi-line string per top-level field.
# Every line has the form "    <category id>: <human-readable name>"
# (note the 4 leading spaces, which the parsing step strips by position).
# The six strings are, in order: physics-related archives, math, cs, q-bio,
# q-fin and stat.
f = ['''    astro-ph.GA: Astrophysics of Galaxies
    astro-ph.CO: Cosmology and Nongalactic Astrophysics
    astro-ph.EP: Earth and Planetary Astrophysics
    astro-ph.HE: High Energy Astrophysical Phenomena
    astro-ph.IM: Instrumentation and Methods for Astrophysics
    astro-ph.SR: Solar and Stellar Astrophysics
    cond-mat.dis-nn: Disordered Systems and Neural Networks
    cond-mat.mtrl-sci: Materials Science
    cond-mat.mes-hall: Mesoscale and Nanoscale Physics
    cond-mat.other: Other Condensed Matter
    cond-mat.quant-gas: Quantum Gases
    cond-mat.soft: Soft Condensed Matter
    cond-mat.stat-mech: Statistical Mechanics
    cond-mat.str-el: Strongly Correlated Electrons
    cond-mat.supr-con: Superconductivity
    gr-qc: General Relativity and Quantum Cosmology
    hep-ex: High Energy Physics - Experiment
    hep-lat: High Energy Physics - Lattice
    hep-ph: High Energy Physics - Phenomenology
    hep-th: High Energy Physics - Theory
    math-ph: Mathematical Physics
    nlin.AO: Adaptation and Self-Organizing Systems
    nlin.CG: Cellular Automata and Lattice Gases
    nlin.CD: Chaotic Dynamics
    nlin.SI: Exactly Solvable and Integrable Systems
    nlin.PS: Pattern Formation and Solitons
    nucl-ex: Nuclear Experiment
    nucl-th: Nuclear Theory
    physics.acc-ph: Accelerator Physics
    physics.app-ph: Applied Physics
    physics.ao-ph: Atmospheric and Oceanic Physics
    physics.atom-ph: Atomic Physics
    physics.atm-clus: Atomic and Molecular Clusters
    physics.bio-ph: Biological Physics
    physics.chem-ph: Chemical Physics
    physics.class-ph: Classical Physics
    physics.comp-ph: Computational Physics
    physics.data-an: Data Analysis, Statistics and Probability
    physics.flu-dyn: Fluid Dynamics
    physics.gen-ph: General Physics
    physics.geo-ph: Geophysics
    physics.hist-ph: History and Philosophy of Physics
    physics.ins-det: Instrumentation and Detectors
    physics.med-ph: Medical Physics
    physics.optics: Optics
    physics.ed-ph: Physics Education
    physics.soc-ph: Physics and Society
    physics.plasm-ph: Plasma Physics
    physics.pop-ph: Popular Physics
    physics.space-ph: Space Physics
    quant-ph: Quantum Physics''', '''    math.AG: Algebraic Geometry
    math.AT: Algebraic Topology
    math.AP: Analysis of PDEs
    math.CT: Category Theory
    math.CA: Classical Analysis and ODEs
    math.CO: Combinatorics
    math.AC: Commutative Algebra
    math.CV: Complex Variables
    math.DG: Differential Geometry
    math.DS: Dynamical Systems
    math.FA: Functional Analysis
    math.GM: General Mathematics
    math.GN: General Topology
    math.GT: Geometric Topology
    math.GR: Group Theory
    math.HO: History and Overview
    math.IT: Information Theory
    math.KT: K-Theory and Homology
    math.LO: Logic
    math.MP: Mathematical Physics
    math.MG: Metric Geometry
    math.NT: Number Theory
    math.NA: Numerical Analysis
    math.OA: Operator Algebras
    math.OC: Optimization and Control
    math.PR: Probability
    math.QA: Quantum Algebra
    math.RT: Representation Theory
    math.RA: Rings and Algebras
    math.SP: Spectral Theory
    math.ST: Statistics Theory
    math.SG: Symplectic Geometry''','''    cs.AI: Artificial Intelligence
    cs.CL: Computation and Language
    cs.CC: Computational Complexity
    cs.CE: Computational Engineering, Finance, and Science
    cs.CG: Computational Geometry
    cs.GT: Computer Science and Game Theory
    cs.CV: Computer Vision and Pattern Recognition
    cs.CY: Computers and Society
    cs.CR: Cryptography and Security
    cs.DS: Data Structures and Algorithms
    cs.DB: Databases
    cs.DL: Digital Libraries
    cs.DM: Discrete Mathematics
    cs.DC: Distributed, Parallel, and Cluster Computing
    cs.ET: Emerging Technologies
    cs.FL: Formal Languages and Automata Theory
    cs.GL: General Literature
    cs.GR: Graphics
    cs.AR: Hardware Architecture
    cs.HC: Human-Computer Interaction
    cs.IR: Information Retrieval
    cs.IT: Information Theory
    cs.LG: Learning
    cs.LO: Logic in Computer Science
    cs.MS: Mathematical Software
    cs.MA: Multiagent Systems
    cs.MM: Multimedia
    cs.NI: Networking and Internet Architecture
    cs.NE: Neural and Evolutionary Computing
    cs.NA: Numerical Analysis
    cs.OS: Operating Systems
    cs.OH: Other Computer Science
    cs.PF: Performance
    cs.PL: Programming Languages
    cs.RO: Robotics
    cs.SI: Social and Information Networks
    cs.SE: Software Engineering
    cs.SD: Sound
    cs.SC: Symbolic Computation
    cs.SY: Systems and Control''','''    q-bio.BM: Biomolecules
    q-bio.GN: Genomics
    q-bio.MN: Molecular Networks
    q-bio.SC: Subcellular Processes
    q-bio.CB: Cell Behavior
    q-bio.NC: Neurons and Cognition
    q-bio.TO: Tissues and Organs
    q-bio.PE: Populations and Evolution
    q-bio.QM: Quantitative Methods
    q-bio.OT: Other''', '''    q-fin.PR: Pricing of Securities
    q-fin.RM: Risk Management
    q-fin.PM: Portfolio Management
    q-fin.TR: Trading and Microstructure
    q-fin.MF: Mathematical Finance
    q-fin.CP: Computational Finance
    q-fin.ST: Statistical Finance
    q-fin.GN: General Finance
    q-fin.EC: Economics''','''    stat.AP: Applications
    stat.CO: Computation
    stat.ML: Machine Learning
    stat.ME: Methodology
    stat.OT: Other Statistics
    stat.TH: Theory''']

In [None]:
# Parse each field's listing into a list of category ids.
# Every listing line looks like "    <id>: <name>", so the id is whatever sits
# between the 4-character indent and the first colon.

field_cats = []

for counter, listing in enumerate(f):
    print(counter)
    field_cats.append(
        [entry[4:].split(":")[0] for entry in listing.splitlines()]
    )


In [None]:
# Inspect the parsed category ids: one inner list per field.
print(field_cats)

In [None]:
# Display names of the six top-level fields, used as heatmap tick labels.
fields_list = [
    "Physics",
    "Mathematics",
    "Computer Science",
    "Quantitative Biology",
    "Quantitative Finance",
    "Statistics",
]

In [None]:
# Same co-occurrence count as for single categories, but reduced to the 6
# "fields": Physics, Mathematics, Computer Science,
# Quantitative Biology, Quantitative Finance, Statistics.
#
# heatmap_data_field[i][j] is the number of papers whose category string
# contains at least one category of field i and at least one category of
# field j; the diagonal (i == j) is left at 0.
#
# No loop variable here is named `c`: that name is the sqlite cursor, and
# rebinding it to a category string would break every later c.execute(...).

heatmap_data_field = np.zeros((len(field_cats), len(field_cats)), dtype=int)

# One set of category ids per field, for fast overlap tests.
field_members = [set(members) for members in field_cats]

for row in data:
    # Each row of `data` is (category_string, paper_count).
    substrings = set(row[0].split(" "))
    # Fields that have at least one of their categories in this string.
    present = [idx for idx, members in enumerate(field_members)
               if substrings & members]
    for i in present:
        for j in present:
            if i != j:
                heatmap_data_field[i, j] += row[1]

In [None]:
# Display the field-level co-occurrence matrix.
heatmap_data_field

In [None]:
# Display the field-level co-occurrence matrix (same output as the previous cell).
heatmap_data_field

In [None]:
# Count the articles per primary category, i.e. per first space-separated
# entry of `cat`. Each result row is (article_count, primary_category).

rows = c.execute('''
    SELECT count(substr(trim(cat),1,instr(trim(cat)||' ',' ')-1)), substr(trim(cat),1,instr(trim(cat)||' ',' ')-1)
    FROM metadata
    GROUP BY substr(trim(cat),1,instr(trim(cat)||' ',' ')-1)
    ''').fetchall()

for primary_total in rows:
    print(primary_total)
    

In [None]:
# Display names of the six top-level fields (needed when the heatmap data is
# reloaded from pickle), followed by how many there are.
fields_list = [
    "Physics",
    "Mathematics",
    "Computer Science",
    "Quantitative Biology",
    "Quantitative Finance",
    "Statistics",
]
print(len(fields_list))

In [None]:
# WRITE pickle
# Persist the category co-occurrence matrix to "heatmap_data_175_full.pickle".
# The `with` block closes the file when it exits.

with open("heatmap_data_175_full" + ".pickle", "wb") as pickle_out:
    pickle.dump(heatmap_data, pickle_out)

In [None]:
# WRITE pickle
# Persist the category co-occurrence matrix to "heatmap_data_175.pickle".
# This dumps the same `heatmap_data` object as the "_full" file.
# The `with` block closes the file when it exits.

with open("heatmap_data_175" + ".pickle", "wb") as pickle_out:
    pickle.dump(heatmap_data, pickle_out)

In [None]:
# WRITE pickle field_cats
# Persist the field-level co-occurrence matrix.
# The `with` block closes the file when it exits.

with open("heatmap_data_field" + ".pickle", "wb") as pickle_out:
    pickle.dump(heatmap_data_field, pickle_out)

In [None]:
# READ pickle
# Restore `heatmap_data` from "heatmap_data_175.pickle".
# NOTE(review): pickle.load can execute arbitrary code - only load files
# written by this notebook.

with open("heatmap_data_175.pickle", "rb") as pickle_in:
    heatmap_data = pickle.load(pickle_in)

In [None]:
# READ pickle
# Restore `heatmap_data` from "heatmap_data_175_full.pickle".
# NOTE(review): pickle.load can execute arbitrary code - only load files
# written by this notebook.

with open("heatmap_data_175_full.pickle", "rb") as pickle_in:
    heatmap_data = pickle.load(pickle_in)

In [None]:
# READ pickle field_cats
# Restore `heatmap_data_field` from "heatmap_data_field.pickle".
# NOTE(review): pickle.load can execute arbitrary code - only load files
# written by this notebook.

with open("heatmap_data_field.pickle", "rb") as pickle_in:
    heatmap_data_field = pickle.load(pickle_in)

In [None]:
# testing string finding
# `in` on a list compares whole elements, so the prefix "astro-ph" does not
# match the entry "astro-ph.MD" and nothing is printed.

a = "astro-ph"
b = "astro-ph.MD"
# Deliberately not named `c`: that name is the sqlite cursor used by the
# query cells, and rebinding it to a string would break them on re-run.
other = "something"

li = [b, other]

if a in li:
    print("true")

In [None]:
# Peek at the category string of the second row in `data`.
print(data[1][0])

In [None]:
# Show the shape, dtype and contents of the category matrix, one per line.
print(heatmap_data.shape, heatmap_data.dtype, heatmap_data, sep="\n")

In [None]:
# Print every non-zero co-occurrence count, row by row.
# Boolean-mask indexing returns the selected entries in row-major order.
for v in heatmap_data[heatmap_data > 0]:
    print(v)

In [None]:
# Log-scale (base 10) the category co-occurrence counts.
# Entries that are not positive are skipped and stay at 0.

n_cats = len(cats)
heatmap_data_log = np.zeros((n_cats, n_cats), dtype=float)

for i in range(n_cats):
    print("i:", i)
    for j in range(n_cats):
        if not heatmap_data[i][j] > 0:
            continue
        heatmap_data_log[i][j] = math.log10(heatmap_data[i][j])
        print(heatmap_data_log[i][j])

In [None]:
# Log-scale (base 10) the field-level co-occurrence counts.
# Entries that are not positive are skipped and stay at 0.
# Note: this rebinds `heatmap_data_log`, replacing any per-category version.

n_fields = len(fields_list)
heatmap_data_log = np.zeros((n_fields, n_fields), dtype=float)

for i in range(n_fields):
    print("i:", i)
    for j in range(n_fields):
        if not heatmap_data_field[i][j] > 0:
            continue
        heatmap_data_log[i][j] = math.log10(heatmap_data_field[i][j])
        print(heatmap_data_log[i][j])

In [None]:
# Vectorised natural log of the positive entries of `heatmap_data`.
# `where=` restricts the computation to entries > 0; all other entries keep
# the value already in `out`, i.e. 0.0. This avoids log(0) = -inf.
_counts = np.asarray(heatmap_data)
heatmap_log = np.log(
    _counts,
    out=np.zeros(_counts.shape, dtype=float),
    where=_counts > 0,
)

In [None]:
# Attempt at ordering the heatmap data so that the top correlations come first.
# argsort works along the last axis by default, so `a` holds a separate
# ascending ordering of column indices for each row of `x`.

x = heatmap_data_log
a = np.argsort(x)
# print(x[a])

In [None]:
# Slice the category heatmap into square tiles and export each tile to its
# own SVG file, named "heatmap_co-category_<x0>-<x1>_<y0>-<y1>.svg".

tile_size = 25  # number of categories along each edge of a tile

# Take the extent from the matrix itself rather than hard-coding 175.
n_rows, n_cols = heatmap_data.shape

for xstart in range(0, n_rows, tile_size):
    # Clamp the last tile when the size is not a multiple of tile_size.
    xstop = min(xstart + tile_size, n_rows)
    for ystart in range(0, n_cols, tile_size):
        ystop = min(ystart + tile_size, n_cols)

        fig, ax = plt.subplots()
        im = ax.imshow(heatmap_data[xstart:xstop,ystart:ystop])

        fig.set_size_inches(12, 12)

        # Show all ticks and label them with the matching category names:
        # rows of the tile are on the y axis, columns on the x axis.
        ax.set_xticks(np.arange(len(cats[ystart:ystop])))
        ax.set_yticks(np.arange(len(cats[xstart:xstop])))
        ax.set_xticklabels(cats[ystart:ystop])
        ax.set_yticklabels(cats[xstart:xstop])

        # Rotate the x tick labels and set their alignment.
        plt.setp(ax.get_yticklabels(), size="small")
        plt.setp(ax.get_xticklabels(), rotation=90, ha="right", size="small", 
                 rotation_mode="anchor", position=(-100,0))

        ax.set_title("Category co-occurance within arXiv")

        filename = "heatmap_co-category_" + \
                    str(xstart) + "-" + str(xstop) + \
                    "_" + str(ystart) + "-" + str(ystop) + ".svg"
        fig.savefig(filename, dpi=300)

        # Release the figure once it is saved; otherwise every tile's figure
        # stays open in memory until the kernel is restarted.
        plt.close(fig)

In [None]:
# Sanity check: print the start offsets produced by stepping through 0..174 in steps of 25.
for i in range(0,175,25):
    print(i)

In [None]:
# field
# Draw the 6x6 field-level co-occurrence heatmap with a colour bar.

fig, ax = plt.subplots()
im = ax.imshow(heatmap_data_field)
plt.colorbar(im)

fig.set_size_inches(12, 12)

# One tick per field on both axes, labelled with the field names.
n_fields = len(fields_list)
ax.set_xticks(np.arange(n_fields))
ax.set_yticks(np.arange(n_fields))
ax.set_xticklabels(fields_list)
ax.set_yticklabels(fields_list)

# Small y labels; small, vertical, right-aligned x labels.
plt.setp(ax.get_yticklabels(), size="small")
x_label_style = dict(rotation=90, ha="right", size="small",
                     rotation_mode="anchor", position=(-100,0))
plt.setp(ax.get_xticklabels(), **x_label_style)

ax.set_title("Category co-occurance within arXiv (fields)")
plt.show()

In [None]:
# Called without property arguments, plt.setp lists the settable properties of the x tick labels.
plt.setp(ax.get_xticklabels())

In [None]:
# Save the current `fig` as an SVG file.
fig.savefig("heatmap_co-category_fields.svg", dpi=300)