# Get Captions (test notebook)

Notebook for getting a list of all tex files in a folder, parsing captions out of the tex files, formatting the data, and inserting it into SQLite.

!!! Note !!!

This notebook is used only for testing; the actual writing of data is done with a separate Python script.

In [None]:
import os
import re
import json
import sqlite3
import random
import time
import math

In [None]:
# Path to the SQLite database file. The commented-out line is an alternative
# database; swap which line is active to change the target.
# NOTE(review): db_path is not referenced by any of the cells visible in this
# notebook - presumably it is used by the insert step; confirm.
# db_path = "/home/rte/data/db/arxiv_db.sqlite3"
db_path = "/home/rte/data/db/arxiv_db_images.sqlite3"

In [None]:
# load a previously saved list of tex file paths
# (texs starts out empty so the name exists even if the load below fails)
texs = []
# read as UTF-8 explicitly: JSON is UTF-8 by specification, and the default
# encoding of open() otherwise depends on the machine's locale
with open("tex_list_all.json", encoding="utf-8") as f:
    texs = json.load(f)

In [None]:
# test cases - a hand-picked set of tex files that between them contain a
# range of different tex expressions for writing figures
# (running this cell rebinds `texs`, replacing any previously built list)

texs = [
    "/home/rte/arXiv/src_all/0606/math0606111/examples.tex",
    "/home/rte/arXiv/src_all/1205/1205.0252/ms.tex",
    "/home/rte/arXiv/src_all/0809/0809.4669/DK.tex",
    "/home/rte/arXiv/src_all/0803/0803.1154/benchmark.tex",
    "/home/rte/arXiv/src_all/1811/1811.02626/body/46-numiteration-fig.tex",
    "/home/rte/arXiv/src_all/1307/1307.3519/krux_paper_arxiv_v3.tex",
    "/home/rte/arXiv/src_all/1209/1209.1205/thesis_tarball/complex_images.tex",
    "/home/rte/arXiv/src_all/1605/1605.06652/bending.tex",
    "/home/rte/arXiv/src_all/1811/1811.00678/monahan_018.tex",
    "/home/rte/arXiv/src_all/1507/1507.06061/Revised_Oukil.tex",
    "/home/rte/arXiv/src_all/1001/1001.1420/cca_11.tex",
    "/home/rte/arXiv/src_all/1001/1001.0575/aas_macros.tex",
    "/home/rte/arXiv/src_all/1001/1001.0575/magstar.tex",
    "/home/rte/arXiv/src_all/1001/1001.0475/ppg6_ijmpd_DE-Gt.tex",
    "/home/rte/arXiv/src_all/1001/1001.0555/SimTreePathArXiv.tex",
]

In [None]:
# change starting folder to only search a subset
# (starting_folder is the root directory handed to os.walk() by the search
# cell; exactly one of the assignments below should be left uncommented)

# starting_folder = "/home/rte/arXiv/src_all/"
starting_folder = "/home/rte/arXiv/src_update/"

# subset of src
# starting_folder = "/home/rte/arXiv/src_all/0606/"
# starting_folder = "/home/rte/arXiv/src_all/9501/"
# starting_folder = "/home/rte/arXiv/src_all/0501/"
# starting_folder = "/home/rte/arXiv/src_all/1001/"
# starting_folder = "/home/rte/arXiv/src_all/1501/"
# starting_folder = "/home/rte/arXiv/src_all/1801/"
# starting_folder = "/home/kt/rte/src_small_sample/"
# starting_folder = "/home/kt/rte/src_small_sample/1804"

In [None]:
# search through folders and create a list of tex files
#
# Walks `starting_folder` recursively, printing every tex file name and every
# sub-directory it comes across, and collects the full tex paths in `texs`.

tex_counter = 0
num_dirs = 0
texs = []
# limit = 100
limit = math.inf

for root, dirs, files in os.walk(starting_folder):
    for name in files:
        # match on the extension only: a substring test (".tex" in name) would
        # also pick up files such as "paper.tex.bak" or "notes.texinfo"
        if name.endswith(".tex"):
            print(name)
            tex_counter += 1
            texs.append(os.path.join(root, name))
    for name in dirs:
        print("--- " + os.path.join(root, name))
        num_dirs += 1
    # the limit is only checked once per directory, so the final list can
    # contain somewhat more than `limit` entries
    if len(texs) >= limit:
        break

print("*" * 20)
print("tex_counter:",tex_counter)
print("num_dirs:",num_dirs)

In [None]:
# dump the collected list of tex paths to disk as json
# NOTE(review): the active filename has no "_update" suffix while the
# commented-out one does - check it matches the folder that was searched
# filename = 'tex_paths_src_update.json'
filename = 'tex_paths_src.json'
with open(filename, 'w') as f:
    f.write(json.dumps(texs))

In [None]:
# sanity check: total number of paths, then the first five of them
print(len(texs), texs[:5], sep="\n")

In [None]:
# file extensions regarded as images
# (not referenced by any of the cells shown in this part of the notebook)
image_extensions = [
    ".eps",
    ".pdf",
    ".png",
    ".ps",
    ".jpg",
    ".pstex",
    ".gif",
    ".svg",
    ".epsf",
]

In [None]:
# shuffle list, but set seed so it is reproducible
# random.seed() reseeds the module-level generator (so it also affects any
# later use of the random module) and random.shuffle() reorders `texs` in
# place; the resulting order is only reproducible if `texs` itself starts out
# with the same contents in the same order.

random.seed(5)
random.shuffle(texs)

In [None]:
# function to parse captions out of tex by matching square brackets and braces

def parse_caption(text, *, verbose=True):
    r"""Return the contents of the first mandatory ``{...}`` argument in *text*.

    *text* is expected to start at (or shortly before) a ``\caption`` command.
    An optional ``[short caption]`` argument placed before the first brace is
    skipped, nested braces are kept, and the result excludes the outermost
    pair of braces.

    Parsing rules:
      * square brackets are only tracked outside of braces, so brackets that
        are part of the caption text (e.g. ``[0,1)``) cannot unbalance the scan
      * a backslash escapes the following character, so ``\{`` and ``\}`` are
        treated as literal text rather than grouping
      * stray closing braces / brackets are ignored

    Parameters:
        text: string to parse.
        verbose: when true (the default) print the scan character by
            character, which is useful while debugging in the notebook.

    Returns:
        The caption text. ``""`` if *text* contains no opening brace. If the
        braces never balance, everything after the opening brace is returned.
    """
    first_brace = text.find("{")
    if first_brace == -1:
        # no braced argument at all, so there is no caption to extract
        return ""

    # an optional argument only counts if it appears before the first brace
    first_sq_bracket = text.find("[")
    if first_sq_bracket != -1 and first_sq_bracket < first_brace:
        start = first_sq_bracket
    else:
        start = first_brace

    if verbose:
        print("parsing caption from start:", start)
        print("skipping first part of string:", text[:start])

    level = 0          # current brace nesting depth
    sq_level = 0       # current square-bracket depth (tracked outside braces only)
    body_start = None  # index of the first character inside the caption braces
    escaped = False    # True when the previous character was an unescaped backslash

    for i in range(start, len(text)):
        char = text[i]
        if verbose:
            print(char, end=" ")

        if escaped:
            # this character was escaped by a backslash: plain text
            escaped = False
            continue

        if char == "\\":
            escaped = True
        elif char == "{":
            level += 1
            # a brace opened inside the optional [..] argument is not the caption
            if level == 1 and sq_level == 0:
                body_start = i + 1
        elif char == "}":
            if level == 0:
                # stray closing brace, nothing to close
                continue
            level -= 1
            if level == 0 and sq_level == 0 and body_start is not None:
                if verbose:
                    print("end loop")
                return text[body_start:i]
        elif level == 0:
            if char == "[":
                sq_level += 1
            elif char == "]" and sq_level > 0:
                sq_level -= 1

    # braces never balanced: fall back to everything after the opening delimiter
    if body_start is not None:
        return text[body_start:]
    return text[start + 1:]

In [None]:
# check if there is a caption in the text, by checking for a couple of different options

def check_caption(text):
    r"""Return *text* starting from its first real ``\caption`` command.

    The search is case-insensitive and skips ``\captionsetup``, which
    configures captions but does not contain one.

    Parameters:
        text: the tex source of a figure.

    Returns:
        The slice of *text* beginning at the first ``\caption`` that is not a
        ``\captionsetup``, or ``""`` if there is none (in which case *text*
        and a warning are printed).
    """
    # (?!setup) rejects \captionsetup; searching the original string directly
    # (instead of a lower-cased copy) keeps the index valid for slicing `text`
    match = re.search(r"\\caption(?!setup)", text, re.IGNORECASE | re.ASCII)
    if match is None:
        print(text)
        print(r"COULDN'T FIND ANOTHER \caption")
        return ""
    return text[match.start():]

In [None]:
# iterate over all tex files, grab the text between \begin{figure} and \end{figure},
# and pull the caption, label and image filenames out of each figure
#
# Every figure ends up as one row in `data`:
#     [article_id, fignum, figure text, label, filenames]
# NOTE(review): the parsed caption is printed but never stored in the row
# (the older regex-based code, since removed from the loop, appended it before
# the label) - confirm whether the caption should be added before inserting.

error_count = 0
article_count = 0
figure_count = 0

id_re = r'\/(\d{4}\.\d{4,5}|[^\/]+?\d{7})\/'
# caption_re is no longer used in this cell (captions are now extracted with
# check_caption / parse_caption); the pattern is kept for reference
caption_re = r'(?:\\caption[^{]*?{)(?:\s?\\label{[\S\s]*?\})?\s?([\s\S]+?)(?:\s+?\}?\s?)?(?:(?:\s*?\}?\s*?\\label)|(?:\}\s*?\\end))'
label_re = r'(?:\\label\{)([^}]+?)(?:\})'
imagecheck_re = r'(?:\\epsfbox|\\sfig|\\plotfiddle|\\plottwo|\\psfig|\\plotone|\\includegraphics|\\epsfig)[^\{]*?(?:\{file=|\{figure=|{)([^\}\,]+)'
subfigure_re = r'\\begin\{subfigure[\S\s]+?\\end\{subfigure\}'
remove_label_re = r'\s*\\label\{[^\}]*?\}\s*'

data = []

for ai, t in enumerate(texs):
    print("*" * 20)
    print("paper:",ai)
    print("-" * 20)

    # only the file read sits inside the try, so that errors raised by the
    # parsing further down are not accidentally swallowed
    try:
        with open(t, "rt", encoding="latin1") as f:
            content = f.readlines()
    except UnicodeDecodeError as error:
        print("decode error!",error)
        error_count += 1
        continue
    except OSError as error:
        # missing or unreadable file: count it and carry on with the next
        # paper instead of aborting the whole run
        print("read error!",error)
        error_count += 1
        continue

    article_count += 1

    print(t)
    match = re.search(id_re, t)
    if match:
        article_id = match.group(1)
    else:
        print("!!! no article id found!")
        # use an empty id rather than silently reusing the id of the previous
        # paper (or hitting a NameError on the very first one)
        article_id = ""
    print(article_id)

    # collect the text of every figure environment in this file
    figures = []
    for i, l1 in enumerate(content):
        # a figure starts on any non-comment line containing \begin{figure
        # (this also matches \begin{figure*})
        if r"\begin{figure" not in l1 or l1.lstrip().startswith("%"):
            continue

        # 1 indexed figures, i.e. no figure 0
        fignum = len(figures) + 1

        # gather the non-comment lines up to and including the line with
        # \end{figure; if the figure is never closed this runs to end of file
        figure_lines = []
        for l2 in content[i:]:
            if l2.lstrip().startswith("%"):
                continue
            figure_lines.append(l2)
            if r"\end{figure" in l2:
                break

        figures.append([article_id, fignum, "".join(figure_lines)])
        figure_count += 1

    # organise and print data
    for figure in figures:
        print("article-id:",figure[0])
        print("fignum:",figure[1])
        print("\nwhole figure text:")
        print(figure[2])

        # remove all subfigures, so that their captions and labels are not
        # mistaken for those of the enclosing figure
        figure_text = re.sub(subfigure_re, "", figure[2])

        print("\nfigure_text\n",figure_text)

        # check for captionsetup
        caption_text = check_caption(figure_text)
        print("\ncaption_text:\n",caption_text)

        caption = parse_caption(caption_text)
        print("\n>>>>> caption:")
        print(caption)

        # remove labels from caption
        caption = re.sub(remove_label_re, "", caption)
        print("\n>>>>> caption w labels removed:")
        print(caption)

        # get label
        match = re.search(label_re, figure_text)
        if match:
            print("\n>>>>> label:",match.group(1))
            figure.append(match.group(1))
        else:
            figure.append("")
            print("\n!!! no label")

        # get filenames (searched in the full figure text, subfigures included)
        filenames = re.findall(imagecheck_re, figure[2])
        print(">>>>> filenames:",filenames)
        figure.append(filenames)
        print("-" * 30)
        print("")

        data.append(figure)

# [r.pop(2) for r in data]

print("*" * 20)
print("error_count:",error_count)
print("article_count:",article_count)
print("figure_count:",figure_count)
print("*" * 20)
# print(data)