In [1]:
from langchain_openai import ChatOpenAI
from langchain.schema import HumanMessage

import getpass
import os
import regex as re

import json
import glob

### Custom Algorithm for LaTeX Corpus to Tree Database for RAG

In [11]:
llm = ChatOpenAI(temperature=0, model_name="gpt-4o-mini")

class Paper:
    def __init__(self, title: str, text: str, abstract: str = None, parent = None, depth: int = 0):
        self.title = title
        self.abstract = abstract
        self.text = text
        
        self.parent = parent
        self.depth = depth

        self.sections = self.split_to_sections(self.text)
        if len(self.sections) > 0 and self.abstract is None:
            self.abstract = self.generate_abstract()

    def __repr__(self):
        string = ""
        if self.depth > 0:
            string += "--"*(self.depth) + ">"
        string += self.title + "\n"
        for section in self.sections:
            string += section.__repr__()
        return string

    def split_to_sections(self, text: str):
        pattern = r"(\\" + "sub" * self.depth + r"section\s*({(?:[^{}]*+|(?2))*}))"
        matches = re.finditer(pattern, text)
        try:       
            sections = []
            start, title = 0, "Headers"
            for match in matches:
                end = match.start()
                section_text = text[start : end]
                if section_text.strip() == "":
                    continue
                sections.append(Paper(title = title, text = section_text, abstract = None, parent = self, depth = self.depth + 1))
                start, title = match.end(), match.group(2)[1:-1]
        except :
            sections = []
        return sections
        
    def generate_abstract(self):
        parent = self.parent
        abstract = ""
        while parent:
            abstract = parent.abstract + abstract
            parent = parent.parent

        prompt = f"""
        ======== Problem Statement ========
        You are an elite particle physics researcher. You will be given the abstract for a particle physics paper as well as a text excerpt from that paper. Your job is to, in as few words as possible, add on to the existing abstract a fully descriptive description of specifically the contents of the text excerpt. You may not modify the original abstract, and your output should start where the abstract ends.

        ======== Abstract =================
        {abstract}

        ======== Text Excerpt =============
        {self.text}
        """

        message = HumanMessage(content=prompt)
        response = llm(messages=[prompt])
        return response.content

In [34]:
papers = []

dir_expanded_tex = "../data/expanded_tex_nomacro/"
dir_abstracts = "../data/abstracts/"
i = 0
for filename in os.listdir(dir_expanded_tex):
    with open(dir_abstracts + filename) as file:
        abstract = file.read()
    with open(dir_expanded_tex + filename) as file:
        full_tex = file.read()

    papers.append(Paper(filename, full_tex, abstract = abstract))
    i += 1
    if i == 10:
        break
        

In [29]:
papers[0]

1304.4518.tex
-->Headers
-->Introduction
-->Detector and triggers
-->Signal candidate selection
-->Signal and background discrimination
-->Normalisation
-->Background studies

In [30]:
papers[1]

1509.00414.tex
-->Headers
-->Introduction
-->Detector and simulation
-->Event selection
-->Event yields
-->Results
---->Headers
---->Differential branching fraction
---->CKM matrix elements

In [33]:
papers[2]

1304.4530.tex
-->Headers
-->Introduction
-->\mbox{LHCb} detector
-->Event selection
-->Observation of {\boldmath\B_c^+{}\rightarrow{}{J\mskip -3mu/\mskip -2mu\psi\mskip 2mu}{}D^+_s}
-->Normalization to the {\boldmath\B_c^+{}\rightarrow{}{J\mskip -3mu/\mskip -2mu\psi\mskip 2mu}{}\pi^+} decay mode
-->Systematic uncertainties

In [24]:
papers[2].abstract

'The first measurement of ${C\\!P}$ asymmetries in the decay ${B_s^0\\to J/\\psi \\overline{K}^{*}(892)^{0}}$ and an updated measurement of its branching fraction and polarisation fractions are presented. The results are obtained using data corresponding to an integrated luminosity of $3.0\\,fb^{-1}$ of proton-proton collisions recorded with the LHCb detector at centre-of-mass energies of $7$ and $8\\,\\mathrm{TeV}$. Together with constraints from ${B^0\\to J/\\psi \\rho^0}$, the results are used to constrain additional contributions due to penguin diagrams in the ${C\\!P}$-violating phase ${{\\phi}_{s}}$, measured through ${B_s^0}$ decays to charmonium.'

In [27]:
papers[2].sections[7].abstract

'The branching fraction of the decay ${B_s^0\\to J/\\psi K^{*}(892)^{0}}$ is determined by normalising to the decay channels ${B_s^0\\to J/\\psi \\phi}$ and ${B^0\\to J/\\psi K^{*}}$, with detailed calculations of efficiency ratios and correction factors to account for background interference and angular acceptance. The results yield a branching fraction ratio of ${\\BRof\\BsJpsiKst/\\BRof\\BsJpsiPhi = (4.05 \\pm 0.19 \\, \\text{(stat)} \\pm 0.13 \\, \\text{(syst)})\\%}$ and ${\\BRof\\BsJpsiKst/\\BRof\\BdJpsiKst = (2.99 \\pm 0.14 \\, \\text{(stat)} \\pm 0.12 \\, \\text{(syst)} \\pm 0.17 \\, (f_d/f_s))\\%}$. The final averaged branching fraction is found to be ${\\BR{\\BsJpsiKst} = (4.14 \\pm 0.18 \\, \\text{(stat)} \\pm 0.26 \\, \\text{(syst)} \\pm 0.24 \\, (f_d/f_s))\\times 10^{-5}}$, consistent with previous measurements.'

In [220]:
abstract = papers[3].abstract
text = papers[3].sections[3].text

The analysis employs stringent selection criteria for charged particle tracks, utilizing a neural network to enhance track quality and suppress fake tracks. Duplicate particles are minimized through the Kullback-Leibler divergence, while muon and hadron candidates are identified based on likelihood ratios and momentum requirements. The selection of J/ψ candidates is based on oppositely-charged muon pairs, ensuring good vertex quality and separation from the primary interaction vertex. D_s mesons are reconstructed with specific mass and momentum criteria, and B_c candidates are formed from J/ψD_s pairs, subjected to kinematic fitting and decay time requirements to ensure accurate measurements.


In [227]:
print(abstract + "\n" + response.content)

The decays $B^+_c \rightarrow J/\psi D_s^+$ and $B^+_c \rightarrow J/\psi D_s^{*+}$ are observed for the first time using a dataset, corresponding to an integrated luminosity of 3$fb^{-1}$, collected by the LHCb experiment in proton-proton collisions at centre-of-mass energies of $\sqrt{s}$=7 and 8 TeV. The statistical significance for both signals is in excess of 9 standard deviations. The following ratios of branching fractions are measured to be $BR(B^+_c \rightarrow J/\psi D_s^+)/BR(B^+_c \rightarrow J/\psi \pi+) = 2.90 \pm 0.57 \pm 0.24$, $BR(B^+_c \rightarrow J/\psi D_s^{*+}) / BR (B^+_c \rightarrow J/\psi D_s^+) = 2.37 \pm 0.56 \pm 0.10$, where the first uncertainties are statistical and the second systematic. The mass of the \Bc meson is measured to be $m_{B^+_c} = 6276.28 \pm 1.44 (stat) \pm 0.36(syst) MeV/c^2$, using the $B^+_c \rightarrow J/\psi D_s^+$ decay mode.
The analysis employs stringent selection criteria for charged particle tracks, utilizing a neural network to enh