In [208]:
import os
os.chdir("../tree_rag")
from paper_tree import PaperTree

import torch
import numpy as np

import pickle
import regex as re

In [194]:
def clean_junk(tex):
    r"""Strip front matter, macro definitions, comments and bibliography from LaTeX source.

    The steps run in a fixed order, each one on the output of the previous one:

    1. Keep only the text from the last ``\maketitle``, ``\end{titlepage}``,
       ``\end{abstract}`` or ``\abstract`` marker onward.
    2. Delete a fixed list of layout commands.
    3. Delete ``\def``, ``\newcommand`` and ``\renewcommand`` definitions.
    4. Delete ``comment`` environments and ``%`` line comments.
    5. Collapse every newline plus the whitespace that follows it into one newline.
    6. Delete LHCb collaboration author/affiliation blocks.
    7. Delete bibliography entries and ``thebibliography`` environments.

    The macro-definition patterns use pattern recursion (``(?3)``), so ``re`` must
    be bound to the third-party ``regex`` module (``import regex as re``).

    Args:
        tex: LaTeX source of one paper, as a string.

    Returns:
        The cleaned LaTeX source, as a string.
    """
    # 1. Drop the header. str.rfind returns -1 for a missing marker, so the 0
    #    floor keeps the whole text when none of the markers is present.
    header_markers = ["\\maketitle", "\\end{titlepage}", "\\end{abstract}", "\\abstract"]
    body_start = max(0, *(tex.rfind(marker) for marker in header_markers))
    tex = tex[body_start:]

    # 2. Layout commands. Brace arguments are matched with [^}\n]* rather than a
    #    greedy .* so that a later "}" on the same line is not swallowed together
    #    with the text in between.
    layout_patterns = [
        r"\\end\{titlepage\}",
        r"\\newpage",
        r"\\setcounter",
        r"\\mbox",
        r"\\cleardoublepage",
        r"\\pagestyle\{[^}\n]*\}",
        r"\\pagenumbering\{[^}\n]*\}",
        # Must precede \bibliography{...}; also takes the style argument when
        # present so no stray "{style}" is left in the text.
        r"\\bibliographystyle(?:\{[^}\n]*\})?",
        r"\\bibliography\{[^}\n]*\}",
        r"\\clearpage",
        # Raw string on purpose: a non-raw "\\vfill" is the regex \v + "fill"
        # (vertical tab followed by "fill") and never matches the command.
        r"\\vfill",
    ]
    for pattern in layout_patterns:
        tex = re.sub(pattern, "", tex)

    # 3. Macro definitions. Group 3 matches a balanced {...} body by recursing
    #    into itself with (?3).
    macro_definition_patterns = [
        # \def definitions with or without arguments
        r"\\def\s*\\(\w+)\s*((?:#\d\s*)*)\s*({(?:[^{}]*+|(?3))*})",
        # \newcommand definitions
        r"\\newcommand\*?\s*{?\s*\\(\w+)\s*}?\s*((?:\[\s*\d+\s*\])*)\s*({(?:[^{}]*+|(?3))*})",
        # \renewcommand definitions
        r"\\renewcommand\*?\s*{?\s*\\(\w+)\s*}?\s*((?:\[\s*\d+\s*\])*)\s*({(?:[^{}]*+|(?3))*})",
    ]
    for pattern in macro_definition_patterns:
        tex = re.sub(pattern, "", tex)

    # 4a. \begin{comment} ... \end{comment} environments (DOTALL: they span lines).
    pattern = r"\\begin\s*\{\s*comment\s*\}(.*?)\\end\s*\{\s*comment\s*\}"
    tex = re.sub(pattern, "", tex, flags=re.DOTALL)

    # 4b. Line comments: from an unescaped % to the end of the line.
    pattern = r"(?<!\\)%.*"
    tex = re.sub(pattern, "", tex)

    # 5. A newline followed by any whitespace becomes a single newline. This
    #    also removes blank lines and leading indentation.
    pattern = r"\n\s+"
    tex = re.sub(pattern, "\n", tex)

    # 6. LHCb author lists. The "anything" parts are lazy so each match stops at
    #    the first closing \end{flushleft} instead of running to the last one in
    #    the document.
    pattern = r"\\centerline\s*\{\s*\\large\s*\\bf\s*LHCb\s*collaboration\s*\}\s*\\begin\s*\{\s*flushleft\s*\}(?:\n|.)*?\{\s*\\footnotesize(?:\n|.)*?\}\s*\\end\s*\{\s*flushleft\s*\}"
    tex = re.sub(pattern, "", tex)

    pattern = r"\\centerline\s*\{\s*\\large\s*\\bf\s*The\s*LHCb\s*[Cc]ollaboration\s*\}\s*\\begin\s*\{\s*flushleft\s*\}(?:\n|.)*?\\end\s*\{\s*flushleft\s*\}"
    tex = re.sub(pattern, "", tex)

    # Text of the form  Name~Name$^{refs}$,  -- presumably author entries with
    # affiliation superscripts that remain outside the blocks above.
    pattern = r"[a-zA-Z.-]+(?:~[a-zA-Z-\\ \{\}\"\'\`]*)+\$\^\{[a-zA-Z0-9,]+\}\$[\,.][\s\n]*"
    tex = re.sub(pattern, "", tex)

    # Whole lines starting with "$ ^{...}$" -- presumably affiliation lines.
    pattern = r"(?<=\n)\$ \^{[\d\w]+}\$.*\n"
    tex = re.sub(pattern, "", tex)

    # 7. Bibliography. This first pattern is greedy: one match spans from the
    #    first \bibitem{...} to the last \EndOfBibitem.
    pattern = r"\\bibitem\{.+\}(?:.|\n)*\\EndOfBibitem"
    tex = re.sub(pattern, "", tex)

    # Braces are escaped so they can never be parsed as a quantifier or as a
    # regex-module fuzzy-matching constraint.
    pattern = r"\\begin\{thebibliography\}(?:\n|.)*?\\end\{thebibliography\}"
    tex = re.sub(pattern, "", tex)

    return tex

In [192]:
# Clean every expanded .tex source and write the result, under the same file
# name, into the clean_tex directory. `papers` maps file name -> cleaned text.
dir_expanded_tex = "../data/expanded_tex/"
dir_clean_tex = "../data/clean_tex/"

# os.listdir returns entries in arbitrary order; sorting makes the processing
# order (and the key order of `papers`) reproducible between runs.
filenames = sorted(name for name in os.listdir(dir_expanded_tex) if name.endswith(".tex"))

# An explicit encoding keeps reading and writing independent of the platform default.
papers = {}
for filename in filenames:
    with open(os.path.join(dir_expanded_tex, filename), "r", encoding="utf-8") as f:
        papers[filename] = clean_junk(f.read())

# Create the output directory on a fresh checkout instead of failing on the first write.
os.makedirs(dir_clean_tex, exist_ok=True)
# Loop names deliberately avoid `id`, which would shadow the builtin for every later cell.
for filename, paper in papers.items():
    with open(os.path.join(dir_clean_tex, filename), "w", encoding="utf-8") as f:
        f.write(paper)

In [203]:
from tokenizers import BertWordPieceTokenizer

# Fit a lowercasing WordPiece vocabulary on the cleaned .tex files.
tokenizer = BertWordPieceTokenizer(lowercase=True)

# NOTE: despite its name, `expanded_tex` points at the *cleaned* corpus directory,
# and `filenames` holds full paths (directory prefix + file name) after this cell.
expanded_tex = "../data/clean_tex/"
filenames = [
    expanded_tex + entry
    for entry in os.listdir(expanded_tex)
    if entry.endswith(".tex")
]
tokenizer.train(filenames, vocab_size=30000, min_frequency=2)






In [207]:
# NOTE(review): AutoTokenizer is not imported in any cell shown here, so this line
# needs that name in scope first -- presumably `from transformers import AutoTokenizer`;
# confirm. The recorded AttributeError refers to an attribute lookup on a
# BertWordPieceTokenizer object, which does not match this line as written, so the
# saved output looks stale.
bert = AutoTokenizer.from_pretrained("thellert/physbert_cased")

AttributeError: 'BertWordPieceTokenizer' object has no attribute 'AutoTokenizer'

In [199]:
# Spot check: take characters 5000-9999 of one cleaned paper and print them.
sample_paper = papers["2411.12178.tex"]
chunk = sample_paper[5000:10000]
print(chunk)

a  recorded by LHCb 
at 13\tev  in 2016--2018 (Run\,2\xspace), corresponding to an integrated luminosity of $6 \invfb$. 
The \lhcb detector is a single-arm forward spectrometer covering the {pseudorapidity} range $2<\eta <5$, described in detail in Refs.~\cite{LHCb-DP-2008-001,LHCb-DP-2014-002}. 
The magnetic-field polarity is reversed periodically during data taking to mitigate the differences of reconstruction efficiencies of particles with opposite charges.
Data sets corresponding to about half of the total integrated luminosity are recorded with each magnetic-field configuration.
Samples of simulated events are used to study the properties of the signal mode
$\decay{\Bp}\jpsi{\decay({}{\mu^+\mu^-})\pip}$ and the control mode $\decay{\Bp}\jpsi{\decay({}{\mu^+\mu^-})\Kp}$. 
These simulated events are produced with the software described in Refs.~\cite{Sjostrand:2007gs, Lange:2001uf,Golonka:2005pn,Allison:2006ve, *Agostinelli:2002hh,LHCb-PROC-2011-006}.
The momentum and transverse mom

In [209]:
# Encode the sample chunk with the trained tokenizer and display its token strings.
chunk_encoding = tokenizer.encode(chunk)
chunk_encoding.tokens

['a',
 'recorded',
 'by',
 'lhcb',
 'at',
 '13',
 '\\',
 'tev',
 'in',
 '2016',
 '-',
 '-',
 '2018',
 '(',
 'run',
 '\\',
 ',',
 '2',
 '\\',
 'xspace',
 ')',
 ',',
 'corresponding',
 'to',
 'an',
 'integrated',
 'luminosity',
 'of',
 '$',
 '6',
 '\\',
 'invfb',
 '$',
 '.',
 'the',
 '\\',
 'lhcb',
 'detector',
 'is',
 'a',
 'single',
 '-',
 'arm',
 'forward',
 'spectrometer',
 'covering',
 'the',
 '{',
 'pseudorapidity',
 '}',
 'range',
 '$',
 '2',
 '<',
 '\\',
 'eta',
 '<',
 '5',
 '$',
 ',',
 'described',
 'in',
 'detail',
 'in',
 'refs',
 '.',
 '~',
 '\\',
 'cite',
 '{',
 'lhcb',
 '-',
 'dp',
 '-',
 '2008',
 '-',
 '001',
 ',',
 'lhcb',
 '-',
 'dp',
 '-',
 '2014',
 '-',
 '002',
 '}',
 '.',
 'the',
 'magnetic',
 '-',
 'field',
 'polarity',
 'is',
 'reversed',
 'periodically',
 'during',
 'data',
 'taking',
 'to',
 'mitigate',
 'the',
 'differences',
 'of',
 'reconstruction',
 'efficiencies',
 'of',
 'particles',
 'with',
 'opposite',
 'charges',
 '.',
 'data',
 'sets',
 'corresponding',