Load doc2json venv

In [None]:
import os

# Absolute path of the corpus directory, written as a WSL mount path
# (the Windows E: drive is reached through /mnt/e).
working_dir = r'/mnt/e/Google Drive/RuBase (1)/Corpora/RFSDP/Dissertations/ProQuest/240123'

# Make it the process-wide current directory: later cells open files such as
# the fastText model and the JSONL output through relative paths.
os.chdir(working_dir)

# Echo the resulting directory so a wrong or unmounted path is noticed early.
print(f"Current working directory: {os.getcwd()}")

In [None]:
import pickle
import os

# Relies on 'working_dir' having been defined by the first cell.
pickle_file_name = "results.pickle"  # The name of your pickle file

# Construct the full path to the pickle file
pickle_file_path = os.path.join(working_dir, pickle_file_name)

# Read 'results' back from the pickle file.
# NOTE(review): pickle.load executes arbitrary code embedded in the file; only
# load pickles that were produced by a trusted run of this pipeline.
with open(pickle_file_path, 'rb') as file:
    results_loaded = pickle.load(file)

# 'results_loaded' is iterated with .items() further down as
# (paper_id, TEI XML string) pairs.
print("Loaded 'results' from the pickle file.")

# Takes about 3.6s (when in cache)


In [None]:
import pandas as pd
import json
import bs4
import re
from bs4 import BeautifulSoup, NavigableString
from typing import List, Dict, Tuple
from datetime import datetime
from tqdm.auto import tqdm

from doc2json.s2orc import Paper

from doc2json.utils.grobid_util import parse_bib_entry, extract_paper_metadata_from_grobid_xml
from doc2json.utils.citation_util import SINGLE_BRACKET_REGEX, BRACKET_REGEX, BRACKET_STYLE_THRESHOLD
from doc2json.utils.citation_util import is_expansion_string, _clean_empty_and_duplicate_authors_from_grobid_parse
from doc2json.utils.refspan_util import sub_spans_and_update_indices


# Plain-text substitutions applied by table_to_html() to the serialized table
# markup: row/cell tags become their HTML tr/td counterparts and the "cols="
# attribute becomes "colspan=". The entries are applied one after another in
# insertion order via str.replace.
REPLACE_TABLE_TOKS = {
    "<row>": "<tr>",
    "<row/>": "<tr/>",
    "</row>": "</tr>",
    "<cell>": "<td>",
    "<cell/>": "<td/>",
    "</cell>": "</td>",
    "<cell ": "<td ",
    "cols=": "colspan="
}


class UniqTokenGenerator:
    """
    Endless source of placeholder tokens of the form ``<prefix><counter>``.

    The counter starts at 0 and grows by one for every token handed out, so
    tokens produced by one instance never repeat. The object is its own
    iterator and never raises StopIteration.
    """
    def __init__(self, tok_string):
        # prefix shared by every generated token
        self.tok_string = tok_string
        # counter value used for the next token
        self.ind = 0

    def __iter__(self):
        return self

    def next(self):
        """Return the next token and advance the counter."""
        current = self.ind
        self.ind = current + 1
        return '{}{}'.format(self.tok_string, current)

    def __next__(self):
        # iterator protocol entry point; same as calling .next() directly
        return self.next()


def normalize_grobid_id(grobid_id: str) -> str:
    """
    Normalize grobid object identifiers

    The id is upper-cased, '_' and '#' are removed, and a known leading prefix
    is rewritten: B -> BIBREF, TAB -> TABREF, FIG -> FIGREF, FORMULA -> EQREF
    (e.g. '#b12' -> 'BIBREF12', 'tab_0' -> 'TABREF0'). Only the leading prefix
    is rewritten; the same letters occurring later in the id are left alone.
    Ids with any other prefix are returned in their upper-cased, stripped form.

    :param grobid_id: raw identifier such as '#b0', 'fig_3' or 'formula_1'
    :return: normalized identifier
    """
    str_norm = grobid_id.upper().replace('_', '').replace('#', '')
    # Checked in the same order as before; none of these prefixes is a prefix
    # of another, so at most one can match.
    prefix_map = (
        ('B', 'BIBREF'),
        ('TAB', 'TABREF'),
        ('FIG', 'FIGREF'),
        ('FORMULA', 'EQREF'),
    )
    for prefix, normalized_prefix in prefix_map:
        if str_norm.startswith(prefix):
            # swap the prefix only, keep the remainder untouched
            return normalized_prefix + str_norm[len(prefix):]
    return str_norm


def parse_bibliography(soup: BeautifulSoup) -> List[Dict]:
    """
    Collect the bibliography entries of a grobid xml.

    Every biblStruct under the first listBibl element is parsed with
    parse_bib_entry; entries whose 'title' is empty are dropped. The listBibl
    element is removed from the soup afterwards.

    :param soup: parsed grobid xml (modified in place)
    :return: parsed entries that have a title, [] when there is no listBibl
    """
    bibl_list = soup.listBibl
    if bibl_list is None:
        return []

    parsed_entries = (parse_bib_entry(struct) for struct in bibl_list.find_all("biblStruct"))
    titled_entries = [item for item in parsed_entries if item['title']]

    # drop the bibliography so it does not leak into later text extraction
    bibl_list.decompose()

    return titled_entries


def extract_formulas_from_tei_xml(sp: BeautifulSoup) -> None:
    """
    Swap every formula element for a plain string holding its stripped text
    :param sp: soup to modify in place
    :return:
    """
    formula_tags = sp.find_all('formula')
    for formula in formula_tags:
        plain_text = formula.text.strip()
        formula.replace_with(sp.new_string(plain_text))


def table_to_html(table: bs4.element.Tag) -> str:
    """
    Sub table tags with html table tags

    Direct children of the table that are not 'row' elements are reported on
    stdout and removed from the table (the passed element is modified in
    place). The remaining markup is serialized and rewritten with the
    substitutions in REPLACE_TABLE_TOKS.

    :param table: table element taken from a grobid figure of type "table"
    :return: table markup as an html string
    """
    # Iterate over a snapshot of the children: decompose() removes the child
    # from the live child list, and looping over that list directly would skip
    # the element that follows every removed one.
    for tag in list(table):
        if tag.name != 'row':
            print(f'Unknown table subtag: {tag.name}')
            tag.decompose()
    table_str = str(table)
    for token, subtoken in REPLACE_TABLE_TOKS.items():
        table_str = table_str.replace(token, subtoken)
    return table_str


def extract_figures_and_tables_from_tei_xml(sp: BeautifulSoup) -> Dict[str, Dict]:
    """
    Generate figure and table dicts

    Each 'figure' element carrying an xml:id is recorded under its normalized
    id and then removed from the soup. Figures of type "table" keep their
    description and html content; all others are stored as plain figures.
    A figure whose processing raises AttributeError (for example because no
    'head' follows it) is skipped and left in the soup.

    :param sp: soup to read from and modify in place
    :return: mapping of normalized id -> dict with the keys
             text, latex, type, content, fig_num
    """
    ref_map = dict()

    for fig in sp.find_all('figure'):
        try:
            if fig.name and fig.get('xml:id'):
                if fig.get('type') == 'table':
                    ref_map[normalize_grobid_id(fig.get('xml:id'))] = {
                        "text": fig.figDesc.text.strip() if fig.figDesc else fig.head.text.strip() if fig.head else "",
                        "latex": None,
                        "type": "table",
                        "content": table_to_html(fig.table),
                        "fig_num": fig.get('xml:id')
                    }
                else:
                    # first <label> found after the first <head> that follows
                    # this figure tag; a missing head raises AttributeError
                    # and the figure is skipped as before
                    label = fig.findNext('head').findNext('label')
                    # A missing label used to raise an uncaught TypeError and
                    # abort the whole document; treat it as "no number".
                    # The number is kept when one of the label's children is
                    # an all-digit string.
                    if label is not None and any(child.isdigit() for child in label):
                        fig_num = label.contents[0]
                    else:
                        fig_num = None
                    ref_map[normalize_grobid_id(fig.get('xml:id'))] = {
                        "text": fig.figDesc.text.strip() if fig.figDesc else "",
                        "latex": None,
                        "type": "figure",
                        "content": "",
                        "fig_num": fig_num
                    }
        except AttributeError:
            continue
        fig.decompose()

    return ref_map


def check_if_citations_are_bracket_style(sp: BeautifulSoup) -> bool:
    """
    Decide whether the document cites in bracket style.

    Looks at the 'bibr' refs found in body divs that have no head element and
    counts how many surface strings match BRACKET_REGEX.

    :param sp:
    :return: True when the count is greater than BRACKET_STYLE_THRESHOLD
    """
    body = sp.body
    if not body:
        return False

    surfaces = [
        ref.text.strip()
        for section in body.find_all('div')
        if not section.head
        for ref in section.find_all('ref')
        if ref.get('type') == 'bibr'
    ]

    bracket_hits = sum(1 for surface in surfaces if BRACKET_REGEX.match(surface))
    return bracket_hits > BRACKET_STYLE_THRESHOLD


def sub_all_note_tags(sp: BeautifulSoup) -> BeautifulSoup:
    """
    Turn every note element into a p element holding the note's stripped text
    :param sp: soup to modify in place
    :return: the same soup object
    """
    for note in sp.find_all('note'):
        paragraph = sp.new_tag('p')
        # only the text survives; markup nested inside the note is dropped
        paragraph.string = note.text.strip()
        note.replace_with(paragraph)
    return sp


def process_formulas_in_paragraph(para_el: BeautifulSoup, sp: BeautifulSoup) -> None:
    """
    Replace each formula inside the paragraph with "<formula text> <label>"
    (just the formula text when the formula has no label)
    :param para_el: paragraph element, modified in place
    :param sp: soup used to create the replacement strings
    :return:
    """
    for formula in para_el.find_all('formula'):
        suffix = ''
        label_tag = formula.label
        if label_tag:
            # keep the label text, separated by one space, and take the label
            # element out so it is not repeated in the formula text
            suffix = ' ' + label_tag.text
            label_tag.decompose()
        formula_text = formula.text.strip()
        formula.replace_with(sp.new_string(f'{formula_text}{suffix}'))


def process_references_in_paragraph(para_el: BeautifulSoup, sp: BeautifulSoup, refs: Dict) -> Dict:
    """
    Replace table/figure refs in the paragraph with REFTOKEN placeholders.

    'bibr' refs are left untouched, table and figure refs become " REFTOKENn "
    and every other ref is replaced by its stripped surface text.

    :param para_el: paragraph element, modified in place
    :param sp: soup used to create the replacement strings
    :param refs: known figure/table entries keyed by normalized id
    :return: dict of token -> (ref_id or None, surface_form, ref_type)
    """
    tokens = UniqTokenGenerator('REFTOKEN')
    found = dict()
    for ref_tag in para_el.find_all('ref'):
        try:
            kind = ref_tag.get('type')
            # citations are handled separately
            if kind == 'bibr':
                continue
            if kind != 'table' and kind != 'figure':
                # anything else: keep only the surface form
                ref_tag.replace_with(sp.new_string(ref_tag.text.strip()))
                continue

            # the id is kept only when it resolves to a known figure/table
            target = ref_tag.get('target')
            linked_id = None
            if target and normalize_grobid_id(target) in refs:
                linked_id = normalize_grobid_id(target)

            placeholder = tokens.next()
            found[placeholder] = (linked_id, ref_tag.text.strip(), kind)
            ref_tag.replace_with(sp.new_string(f" {placeholder} "))
        except AttributeError:
            continue
    return found


def process_citations_in_paragraph(para_el: BeautifulSoup, sp: BeautifulSoup, bibs: Dict, bracket: bool) -> Dict:
    """
    Process all citations in paragraph and generate a dict for surface forms.

    Every 'ref' element still present in the paragraph is visited. Handled refs
    are replaced in place by a " CITETOKENn " placeholder string, and the
    returned dict maps each placeholder to (ref_id or None, surface_form).
    In bracket mode, refs whose surface form does not look like a bracket
    citation are replaced by their plain text, and ranges such as [1]-[3] are
    expanded into one placeholder per number.

    :param para_el: paragraph element, modified in place
    :param sp: soup used to create the replacement strings
    :param bibs: bibliography entries keyed by normalized ref id
    :param bracket: if True, apply the bracket-style clean-up and range expansion
    :return: dict of token -> (ref_id or None, surface_form)
    """
    # CHECK if range between two surface forms is appropriate for bracket style expansion
    def _get_surface_range(start_surface, end_surface):
        span1_match = SINGLE_BRACKET_REGEX.match(start_surface)
        span2_match = SINGLE_BRACKET_REGEX.match(end_surface)
        if span1_match and span2_match:
            # get numbers corresponding to citations
            span1_num = int(span1_match.group(1))
            span2_num = int(span2_match.group(1))
            # expand only if the difference is strictly between 1 and 20
            if 1 < span2_num - span1_num < 20:
                return span1_num, span2_num
        return None

    # CREATE BIBREF range between two reference ids, e.g. BIBREF1-BIBREF4 -> BIBREF1 BIBREF2 BIBREF3 BIBREF4
    # The [6:] slices skip the six characters of the 'BIBREF' prefix.
    # NOTE(review): int() raises ValueError when an id is not 'BIBREF<number>';
    # the surrounding try block only catches AttributeError -- confirm this cannot happen.
    def _create_ref_id_range(start_ref_id, end_ref_id):
        start_ref_num = int(start_ref_id[6:])
        end_ref_num = int(end_ref_id[6:])
        return [f'BIBREF{curr_ref_num}' for curr_ref_num in range(start_ref_num, end_ref_num + 1)]

    # CREATE surface form range between two bracket strings, e.g. [1]-[4] -> [1] [2] [3] [4]
    def _create_surface_range(start_number, end_number):
        return [f'[{n}]' for n in range(start_number, end_number + 1)]

    # create citation dict with keywords
    cite_map = dict()
    tokgen = UniqTokenGenerator('CITETOKEN')

    for rtag in para_el.find_all('ref'):
        try:
            # get surface span, e.g. [3]
            surface_span = rtag.text.strip()

            # check if target is available (#b2 -> BIBREF2)
            if rtag.get('target'):
                # normalize reference string
                rtag_ref_id = normalize_grobid_id(rtag.get('target'))

                # ref id not in bibliography: keep the span but without a link
                if rtag_ref_id not in bibs:
                    cite_key = tokgen.next()
                    rtag.replace_with(sp.new_string(f" {cite_key} "))
                    cite_map[cite_key] = (None, surface_span)
                    continue

                # if bracket style, only keep if surface form is bracket
                if bracket:
                    # valid bracket span
                    if surface_span and (surface_span[0] == '[' or surface_span[-1] == ']' or surface_span[-1] == ','):
                        pass
                    # invalid, replace tag with surface form and continue to next ref tag
                    else:
                        rtag.replace_with(sp.new_string(f" {surface_span} "))
                        continue
                # not bracket, add cite span and move on
                else:
                    cite_key = tokgen.next()
                    rtag.replace_with(sp.new_string(f" {cite_key} "))
                    cite_map[cite_key] = (rtag_ref_id, surface_span)
                    continue

                ### EXTRA PROCESSING FOR BRACKET STYLE CITATIONS; EXPAND RANGES ###
                # look backward for range marker, e.g. [1]-*[3]*
                # collects the text between this ref and the previous sibling
                # ref, stopping at the first sibling that is not a plain string
                backward_between_span = ""
                for sib in rtag.previous_siblings:
                    if sib.name == 'ref':
                        break
                    elif type(sib) == NavigableString:
                        backward_between_span += sib
                    else:
                        break

                # check if there's a backwards expansion, e.g. need to expand [1]-[3] -> [1] [2] [3]
                if is_expansion_string(backward_between_span):
                    # get surface number range
                    # (a missing previous ref raises AttributeError here and the
                    # tag is skipped by the except clause at the bottom)
                    surface_num_range = _get_surface_range(
                        rtag.find_previous_sibling('ref').text.strip(),
                        surface_span
                    )
                    # if the surface number range is reasonable (range < 20, in order), EXPAND
                    if surface_num_range:
                        # blank out the strings between the previous ref tag and this one (i.e. "-" and extra spaces)
                        for sib in rtag.previous_siblings:
                            if sib.name == 'ref':
                                break
                            elif type(sib) == NavigableString:
                                sib.replace_with(sp.new_string(""))
                            else:
                                break

                        # get ref id of previous ref, e.g. [1] (#b0 -> BIBREF0)
                        previous_rtag = rtag.find_previous_sibling('ref')
                        previous_rtag_ref_id = normalize_grobid_id(previous_rtag.get('target'))
                        previous_rtag.decompose()

                        # replace this ref tag with the full range expansion, one token per number
                        id_range = _create_ref_id_range(previous_rtag_ref_id, rtag_ref_id)
                        surface_range = _create_surface_range(surface_num_range[0], surface_num_range[1])
                        replace_string = ''
                        # zip stops at the shorter of the two ranges
                        for range_ref_id, range_surface_form in zip(id_range, surface_range):
                            # only link if ref id is in bibliography, else add none
                            if range_ref_id in bibs:
                                cite_key = tokgen.next()
                                cite_map[cite_key] = (range_ref_id, range_surface_form)
                            else:
                                cite_key = tokgen.next()
                                cite_map[cite_key] = (None, range_surface_form)
                            replace_string += cite_key + ' '
                        rtag.replace_with(sp.new_string(f" {replace_string} "))
                    # ELSE do not expand backwards and replace previous and current rtag with appropriate ref id
                    else:
                        # add mapping between ref id and surface form for previous ref tag
                        previous_rtag = rtag.find_previous_sibling('ref')
                        previous_rtag_ref_id = normalize_grobid_id(previous_rtag.get('target'))
                        previous_rtag_surface = previous_rtag.text.strip()
                        cite_key = tokgen.next()
                        previous_rtag.replace_with(sp.new_string(f" {cite_key} "))
                        cite_map[cite_key] = (previous_rtag_ref_id, previous_rtag_surface)

                        # add mapping between ref id and surface form for current reftag
                        cite_key = tokgen.next()
                        rtag.replace_with(sp.new_string(f" {cite_key} "))
                        cite_map[cite_key] = (rtag_ref_id, surface_span)
                else:
                    # look forward and see if expansion string, e.g. *[1]*-[3]
                    forward_between_span = ""
                    for sib in rtag.next_siblings:
                        if sib.name == 'ref':
                            break
                        elif type(sib) == NavigableString:
                            forward_between_span += sib
                        else:
                            break
                    # look forward for range marker (if is a range, continue -- range will be expanded
                    # when we get to the second value)
                    if is_expansion_string(forward_between_span):
                        continue
                    # else treat like normal reference
                    else:
                        cite_key = tokgen.next()
                        rtag.replace_with(sp.new_string(f" {cite_key} "))
                        cite_map[cite_key] = (rtag_ref_id, surface_span)

            # no target attribute: keep the span but without a link
            else:
                cite_key = tokgen.next()
                rtag.replace_with(sp.new_string(f" {cite_key} "))
                cite_map[cite_key] = (None, surface_span)
        except AttributeError:
            # best effort: leave this ref tag as it is and move on
            continue

    return cite_map


def process_paragraph(
        sp: BeautifulSoup,
        para_el: bs4.element.Tag,
        section_names: List[Tuple],
        bib_dict: Dict,
        ref_dict: Dict,
        bracket: bool
) -> Dict:
    """
    Turn one paragraph element into a text blob with citation and reference spans.

    Formulas, figure/table refs and citations inside the element are replaced
    in place, whitespace is collapsed to single spaces, and the placeholder
    tokens are swapped back for their surface forms while their character
    offsets are recorded.

    :param sp: soup the paragraph belongs to
    :param para_el: paragraph element, modified in place
    :param section_names: section path stored unchanged under 'section'
    :param bib_dict: bibliography entries keyed by normalized ref id
    :param ref_dict: figure/table entries keyed by normalized ref id
    :param bracket: if bracket style, expand and clean up citations
    :return: dict with the keys text, cite_spans, ref_spans, eq_spans, section
    """
    # nothing to do for an element without text
    if not para_el.text:
        return {
            'text': "",
            'cite_spans': [],
            'ref_spans': [],
            'eq_spans': [],
            'section': section_names
        }

    # order matters: formulas first, then figure/table refs, then citations
    process_formulas_in_paragraph(para_el, sp)
    ref_map = process_references_in_paragraph(para_el, sp, ref_dict)
    cite_map = process_citations_in_paragraph(para_el, sp, bib_dict, bracket)

    # collapse whitespace runs, then map any remaining whitespace char to a space
    para_text = re.sub(r'\s', ' ', re.sub(r'\s+', ' ', para_el.text))

    # locate every placeholder: citation tokens first, then reference tokens
    all_spans_to_replace = []
    token_sources = (
        (r'(CITETOKEN\d+)', cite_map),
        (r'(REFTOKEN\d+)', ref_map),
    )
    for pattern, token_map in token_sources:
        for match in re.finditer(pattern, para_text):
            uniq_token = match.group()
            # index 1 of both map values is the surface form
            surface_text = token_map[uniq_token][1]
            token_start = match.start()
            all_spans_to_replace.append(
                (token_start, token_start + len(uniq_token), uniq_token, surface_text)
            )

    # put the surface forms back and get the shifted offsets
    para_text, all_spans_to_replace = sub_spans_and_update_indices(all_spans_to_replace, para_text)

    cite_span_blobs = []
    ref_span_blobs = []
    for start, end, token, surface in all_spans_to_replace:
        if token.startswith('CITETOKEN'):
            cite_span_blobs.append({
                "start": start,
                "end": end,
                "text": surface,
                "ref_id": cite_map[token][0]
            })
        if token.startswith('REFTOKEN'):
            ref_span_blobs.append({
                "start": start,
                "end": end,
                "text": surface,
                "ref_id": ref_map[token][0]
            })

    # sanity check: every recorded span must point at its own surface form
    for blob in cite_span_blobs + ref_span_blobs:
        assert para_text[blob["start"]:blob["end"]] == blob["text"]

    return {
        'text': para_text,
        'cite_spans': cite_span_blobs,
        'ref_spans': ref_span_blobs,
        'eq_spans': [],
        'section': section_names
    }


def extract_abstract_from_tei_xml(
        sp: BeautifulSoup,
        bib_dict: Dict,
        ref_dict: Dict,
        cleanup_bracket: bool
) -> List[Dict]:
    """
    Build the abstract paragraphs and remove the abstract from the soup.

    Three layouts are handled: an abstract made of divs (each div contributes
    its p elements, or itself when it has none), an abstract made of bare p
    elements, and an abstract that only has text.

    :param sp: soup to read from and modify in place
    :param bib_dict:
    :param ref_dict:
    :param cleanup_bracket:
    :return: processed paragraphs, [] when there is no abstract
    """
    abstract = sp.abstract
    if not abstract:
        return []

    def _parse(element):
        # every abstract paragraph gets the same fixed section label
        return process_paragraph(sp, element, [(None, "Abstract")], bib_dict, ref_dict, cleanup_bracket)

    abstract_text = []
    if abstract.div:
        for div in abstract.find_all('div'):
            if not div.text:
                continue
            # prefer the div's paragraphs, fall back to the div itself
            elements = div.find_all('p') if div.p else [div]
            for element in elements:
                if element.text:
                    abstract_text.append(_parse(element))
    elif abstract.p:
        abstract_text.extend(_parse(para) for para in abstract.find_all('p') if para.text)
    elif abstract.text:
        abstract_text.append(_parse(abstract))

    abstract.decompose()
    return abstract_text


def extract_body_text_from_div(
        sp: BeautifulSoup,
        div: bs4.element.Tag,
        sections: List[Tuple],
        bib_dict: Dict,
        ref_dict: Dict,
        cleanup_bracket: bool
) -> List[Dict]:
    """
    Parse body text from one div, recursing into nested divs first.

    :param sp: soup the div belongs to
    :param div: element to process (sub-div heads are removed along the way)
    :param sections: section path as a list of (number, title) tuples leading to this div
    :param bib_dict: bibliography entries keyed by normalized ref id
    :param ref_dict: figure/table entries keyed by normalized ref id
    :param cleanup_bracket: passed through to process_paragraph
    :return: list of paragraph / equation dicts in the order they were produced
    """
    chunks = []
    # check if nested divs; recursively process
    if div.div:
        # NOTE(review): find_all() searches all descendants by default, so a div
        # nested more than one level deep is returned here and is visited again
        # inside the recursive call -- TODO confirm whether that is intended.
        for subdiv in div.find_all('div'):
            # has header, add to section list and process
            if subdiv.head:
                chunks += extract_body_text_from_div(
                    sp,
                    subdiv,
                    sections + [(subdiv.head.get('n', None), subdiv.head.text.strip())],
                    bib_dict,
                    ref_dict,
                    cleanup_bracket
                )
                # the head is removed only after the sub-div has been processed
                subdiv.head.decompose()
            # no header, process with same section list
            else:
                chunks += extract_body_text_from_div(
                    sp,
                    subdiv,
                    sections,
                    bib_dict,
                    ref_dict,
                    cleanup_bracket
                )
    # process the direct children of this div one by one
    for tag in div:
        try:
            if tag.name == 'p':
                if tag.text:
                    chunks.append(process_paragraph(
                        sp, tag, sections, bib_dict, ref_dict, cleanup_bracket
                    ))
            elif tag.name == 'formula':
                # e.g. <formula xml:id="formula_0">Y = W T X.<label>(1)</label></formula>
                # a formula without a label raises AttributeError on the next line
                label = tag.label.text
                tag.label.decompose()
                eq_text = tag.text
                # stand-alone formulas become an 'EQUATION' placeholder chunk
                # whose single eq_span covers the whole 8-character text
                chunks.append({
                    'text': 'EQUATION',
                    'cite_spans': [],
                    'ref_spans': [],
                    'eq_spans': [
                        {
                            "start": 0,
                            "end": 8,
                            "text": "EQUATION",
                            "ref_id": "EQREF",
                            "raw_str": eq_text,
                            "eq_num": label
                        }
                    ],
                    'section': sections
                })
        except AttributeError:
            # fallback: treat the child as an ordinary paragraph if it has text
            if tag.text:
                chunks.append(process_paragraph(
                    sp, tag, sections, bib_dict, ref_dict, cleanup_bracket
                ))

    return chunks


def extract_body_text_from_tei_xml(
        sp: BeautifulSoup,
        bib_dict: Dict,
        ref_dict: Dict,
        cleanup_bracket: bool
) -> List[Dict]:
    """
    Extract the body paragraphs and remove the body element from the soup
    :param sp: soup to read from and modify in place
    :param bib_dict:
    :param ref_dict:
    :param cleanup_bracket:
    :return: processed body chunks, [] when there is no body
    """
    if not sp.body:
        return []

    # the body is the root div and starts with an empty section path
    paragraphs = extract_body_text_from_div(sp, sp.body, [], bib_dict, ref_dict, cleanup_bracket)
    sp.body.decompose()
    return paragraphs


def extract_back_matter_from_tei_xml(
        sp: BeautifulSoup,
        bib_dict: Dict,
        ref_dict: Dict,
        cleanup_bracket: bool
) -> List[Dict]:
    """
    Extract the back matter and remove the back element from the soup.

    Only divs nested inside another div of the back element are turned into
    paragraphs. Each one is labelled with its own head (which is removed), or
    with the 'type' attribute of the enclosing div when it has no head.

    :param sp: soup to read from and modify in place
    :param bib_dict:
    :param ref_dict:
    :param cleanup_bracket:
    :return: processed back matter paragraphs, [] when there is no back element
    """
    back = sp.back
    if not back:
        return []

    back_text = []
    for div in back.find_all('div'):
        section_type = div.get('type') or ''

        for child_div in div.find_all('div'):
            head = child_div.head
            if head:
                section = (head.get('n', None), head.text.strip())
                # take the title out so it is not repeated in the paragraph text
                head.decompose()
            else:
                section = (None, section_type)

            if child_div.text:
                back_text.append(
                    process_paragraph(sp, child_div, [section], bib_dict, ref_dict, cleanup_bracket)
                )

    back.decompose()
    return back_text


def convert_tei_xml_soup_to_s2orc_json(soup: BeautifulSoup, paper_id: str, pdf_hash: str) -> Paper:
    """
    Build an S2ORC Paper from a parsed Grobid TEI XML document.

    The steps run in a fixed order because most of them remove what they have
    consumed from the soup.

    :param soup: BeautifulSoup of XML file content (modified in place)
    :param paper_id: name of file
    :param pdf_hash: hash of PDF
    :return: Paper assembled from the extracted pieces
    """
    # header metadata, with empty / duplicate authors cleaned up
    metadata = extract_paper_metadata_from_grobid_xml(soup.fileDesc)
    metadata['authors'] = _clean_empty_and_duplicate_authors_from_grobid_parse(metadata['authors'])

    # bibliography entries keyed by normalized id (entries without a title are already dropped)
    bibkey_map = {
        normalize_grobid_id(entry['ref_id']): entry for entry in parse_bibliography(soup)
    }

    # formulas are not replaced globally here; process_paragraph handles them per paragraph

    # figure and table captions keyed by normalized id
    refkey_map = extract_figures_and_tables_from_tei_xml(soup)

    # citation style is detected before the note tags are rewritten
    is_bracket_style = check_if_citations_are_bracket_style(soup)
    soup = sub_all_note_tags(soup)

    # abstract, body and back matter share the same lookup tables
    shared_args = (bibkey_map, refkey_map, is_bracket_style)
    abstract_entries = extract_abstract_from_tei_xml(soup, *shared_args)
    body_entries = extract_body_text_from_tei_xml(soup, *shared_args)
    back_matter = extract_back_matter_from_tei_xml(soup, *shared_args)

    return Paper(
        paper_id=paper_id,
        pdf_hash=pdf_hash,
        metadata=metadata,
        abstract=abstract_entries,
        body_text=body_entries,
        back_matter=back_matter,
        bib_entries=bibkey_map,
        ref_entries=refkey_map
    )

def convert_tei_xml_string_to_s2orc_json(tei_xml_string: str, paper_id: str, pdf_hash: str = "") -> Paper:
    """
    Parse a TEI XML string with the "xml" parser and convert it to S2ORC format.
    :param tei_xml_string: XML content as a string.
    :param paper_id: Identifier for the paper.
    :param pdf_hash: Hash of the corresponding PDF, if available.
    :return: Paper object in S2ORC format.
    """
    parsed_xml = BeautifulSoup(tei_xml_string, "xml")
    converted = convert_tei_xml_soup_to_s2orc_json(parsed_xml, paper_id, pdf_hash)
    return converted

def process_paper(paper):
    """
    Flatten a Paper object into a plain dict, one key per future DataFrame column.

    The metadata object and the items of the list attributes are converted with
    vars() (one level deep); empty or missing values become {} / [].

    :param paper: object exposing paper_id, pdf_hash, metadata, abstract,
                  body_text, back_matter, bib_entries, ref_entries,
                  raw_abstract_text and raw_body_text
    :return: dict with those ten keys, in that order
    """
    def _as_dicts(items):
        # attribute dict of every item, or [] for an empty / missing list
        return [vars(item) for item in items] if items else []

    paper_data = {
        'paper_id': str(paper.paper_id),
        'pdf_hash': paper.pdf_hash,
    }
    paper_data['metadata'] = vars(paper.metadata) if paper.metadata else {}
    for field in ('abstract', 'body_text', 'back_matter', 'bib_entries', 'ref_entries'):
        paper_data[field] = _as_dicts(getattr(paper, field))
    paper_data['raw_abstract_text'] = paper.raw_abstract_text
    paper_data['raw_body_text'] = paper.raw_body_text
    return paper_data

# Convert XML strings to Paper objects.
# 'results_loaded' comes from the pickle cell above and is iterated as
# (paper_id, TEI XML string) pairs; pdf_hash keeps its default "".
# An exception raised for any single document stops the whole loop.
papers = []
for paper_id, xml_content in tqdm(results_loaded.items(), desc="Processing XML"):
    paper = convert_tei_xml_string_to_s2orc_json(xml_content, paper_id)
    papers.append(paper)

# Convert the papers list to a list of dictionaries using the custom processing function
papers_processed = []
for paper in tqdm(papers, desc="Processing Papers"):
    processed_paper = process_paper(paper)
    papers_processed.append(processed_paper)

# Create a DataFrame from the list of dictionaries (one row per paper)
df_papers = pd.DataFrame(papers_processed)

# Write the DataFrame to a JSONL file.
# The file name carries today's date as _YYMMDD and is relative to the current
# working directory; force_ascii=False keeps non-ASCII characters unescaped.
output_file_suffix = datetime.now().strftime("_%y%m%d")
output_jsonl_path = f'grobid2json{output_file_suffix}.jsonl'  # Adjust the path as needed
df_papers.to_json(output_jsonl_path, orient='records', lines=True, force_ascii=False)

print(f"Data written to JSONL file at: {output_jsonl_path}")

# Takes ~9 minutes


In [None]:
df_papers.info()

In [None]:
df_papers.head()

In [None]:
import plotly.graph_objs as go
from plotly.subplots import make_subplots

# Character count of a string; anything that is not a str counts as 0
def text_length(text):
    if isinstance(text, str):
        return len(text)
    return 0

# Total characters over all paragraph dicts of one abstract / body_text cell
def _paragraphs_length(paragraphs):
    return sum(text_length(item['text']) for item in paragraphs)

# (new length column, source column, measuring function), in plotting order
length_columns = [
    ('raw_abstract_text_length', 'raw_abstract_text', text_length),
    ('abstract_text_length', 'abstract', _paragraphs_length),
    ('body_text_length', 'body_text', _paragraphs_length),
    ('raw_body_text_length', 'raw_body_text', text_length),
]

# Add one length column per text column
for length_column, source_column, measure in length_columns:
    df_papers[length_column] = df_papers[source_column].apply(measure)

# One subplot row per length column
fig = make_subplots(rows=4, cols=1, subplot_titles=("Raw Abstract Text Length", "Abstract Text Length", "Body Text Length", "Raw Body Text Length"))

# Histogram of each length column, bins of 10,000 characters from 0 to the column maximum
for row_number, (length_column, _source_column, _measure) in enumerate(length_columns, start=1):
    lengths = df_papers[length_column]
    fig.add_trace(
        go.Histogram(x=lengths, xbins=dict(start=0, end=max(lengths), size=10000)),
        row=row_number,
        col=1,
    )

# Overall figure size and title; the legend is hidden
fig.update_layout(height=800, width=700, title_text="Distribution of Characters per 10,000 in Text Columns", showlegend=False)

# Render the figure
fig.show()

In [None]:
# !wget https://dl.fbaipublicfiles.com/fasttext/supervised-models/lid.176.ftz


In [None]:
# %pip install fasttext

In [None]:
import fasttext
import pandas as pd
from tqdm.auto import tqdm

# Load the pre-trained language detection model.
# The path is relative, so the file is looked up in the current working directory.
model = fasttext.load_model("lid.176.ftz")

# Return the top predicted language label for `text`, without the
# "__label__" prefix. Reads the module-level `model`.
def detect_language(text):
    # line breaks are turned into spaces before the text is passed to predict
    single_line = text.replace('\n', ' ').replace('\r', ' ')
    # k=1 asks for the best label only; [0][0] picks it from the label list
    top_label = model.predict(single_line, k=1)[0][0]
    return top_label.replace("__label__", "")

# Detect language for each entry in 'raw_body_text' column.
# detect_language calls str.replace, so every entry has to be a string.
df_papers['lang'] = [detect_language(text) for text in tqdm(df_papers['raw_body_text'], desc='Detecting Language')]

# Now df_papers has a new column 'lang' with the language codes


In [None]:
df_papers.head(2)

In [None]:
import pandas as pd

# Build a compact preview: the first 2000 characters of each body text next to
# its detected language
truncated_body = df_papers['raw_body_text'].str[:2000]
df_preview = pd.DataFrame({'raw_body_text_truncated': truncated_body, 'lang': df_papers['lang']})

# Display the DataFrame
print(df_preview)


In [None]:
import pandas as pd

# Requires df_papers with its 'lang' column already populated

# Count how many rows carry each language code
counts_by_lang = df_papers['lang'].value_counts()

# Turn the counts into a two-column table with readable headers
lang_counts = counts_by_lang.reset_index()
lang_counts.columns = ['Language', 'Count']

# Order the table from the most to the least frequent language
lang_counts = lang_counts.sort_values(by='Count', ascending=False)

# Display the result
print(lang_counts)


In [None]:
# %pip install torch torchvision torchaudio --extra-index-url https://download.pytorch.org/whl/cu113
# %pip install transformers
# %pip install sentencepiece

In [None]:
%pip install protobuf


In [None]:
!sudo apt update


In [None]:
import pandas as pd
import torch
from transformers import MBart50TokenizerFast, MBartForConditionalGeneration
from tqdm.auto import tqdm

# Requires df_papers (with its 'lang' column) from the earlier cells
print("Initializing the model and tokenizer...")
model_name = 'facebook/mbart-large-50-many-to-many-mmt'

# Tokenizer and model come from the same checkpoint; the model is moved to the GPU
tokenizer = MBart50TokenizerFast.from_pretrained(model_name)
model = MBartForConditionalGeneration.from_pretrained(model_name)
model = model.to('cuda')

def _resolve_mbart_lang_code(lang_code, tokenizer):
    """Map a bare language code (e.g. 'ru') to the tokenizer's own code (e.g. 'ru_RU').

    Returns `lang_code` unchanged when the tokenizer already knows it, the
    first tokenizer code whose prefix before '_' equals `lang_code`
    otherwise, and None when nothing matches.
    """
    known_codes = getattr(tokenizer, "lang_code_to_id", {})
    if lang_code in known_codes:
        return lang_code
    for known_code in known_codes:
        if known_code.split("_", 1)[0] == lang_code:
            return known_code
    return None


def translate_mbart_batch(texts, src_lang_code, model, tokenizer):
    """Translate a batch of texts into English with an mBART-50 model.

    Args:
        texts: List of source strings.
        src_lang_code: Source language, either a tokenizer code such as
            'ru_RU' or a bare code such as 'ru' (resolved against
            `tokenizer.lang_code_to_id`).
        model: The generation model; inputs are moved to `model.device`.
        tokenizer: The matching tokenizer; its `src_lang` attribute is
            overwritten by this call.

    Returns:
        List of decoded English strings, one per input text.

    Note:
        Inputs are truncated to 1024 tokens (truncation=True,
        max_length=1024), so only the beginning of a longer text is
        translated.
    """
    torch.cuda.empty_cache()
    print(f"Translating {len(texts)} texts from {src_lang_code} to English...")

    # The tokenizer's language codes look like "en_XX"; a bare code such as
    # "ru" has to be mapped onto that form before it is used as src_lang.
    resolved_code = _resolve_mbart_lang_code(src_lang_code, tokenizer)
    if resolved_code is None:
        # Best effort: keep the previous behaviour for unknown languages
        print(f"Warning: no mBART language code found for '{src_lang_code}'; using it as-is.")
        resolved_code = src_lang_code
    tokenizer.src_lang = resolved_code

    # Keep the inputs on whatever device the model lives on
    encoded = tokenizer(texts, return_tensors="pt", padding=True, truncation=True, max_length=1024).to(model.device)
    # Forcing the first generated token to "en_XX" selects English as the target
    generated_tokens = model.generate(**encoded, forced_bos_token_id=tokenizer.lang_code_to_id["en_XX"])
    translations = [tokenizer.decode(tokens, skip_special_tokens=True) for tokens in generated_tokens]
    print(f"Completed translating {len(texts)} texts.")
    return translations

# Prepare a new column for the translations
df_papers['raw_body_text_en'] = None

batch_size = 4  # Adjust based on your system's capabilities
print("Starting the translation process...")

# Select the rows to translate up front: non-English rows with a non-empty
# string body. Filtering the rows (rather than a detached list of texts)
# keeps every text tied to its own index label, so each translation is
# written back to the row it came from.
non_english = df_papers[df_papers['lang'] != 'en']
has_text = non_english['raw_body_text'].apply(lambda body: isinstance(body, str) and bool(body))
to_translate = non_english[has_text]

# The progress bar counts only the rows that will actually be translated
with tqdm(total=len(to_translate), desc="Translating raw body texts") as pbar:
    for lang, group in to_translate.groupby('lang'):
        print(f"Processing {len(group)} texts for language: {lang}...")

        for i in range(0, len(group), batch_size):
            batch = group['raw_body_text'].iloc[i:i+batch_size]
            print(f"Translating batch {i//batch_size + 1}: {len(batch)} texts...")
            translated_texts = translate_mbart_batch(batch.tolist(), lang, model, tokenizer)
            # NOTE(review): assumes df_papers has unique index labels
            df_papers.loc[batch.index, 'raw_body_text_en'] = translated_texts
            pbar.update(len(batch))

print("Translation process completed.")


In [None]:
# Print df_papers' column names, dtypes and non-null counts
df_papers.info()

In [None]:
# Display the first three rows of df_papers
df_papers.head(3)

In [None]:
import pandas as pd

# Requires df_papers with its 'lang' column

# Keep only the rows detected as English
en_rows = df_papers.loc[df_papers['lang'] == 'en']

# Take the first 10 of them
first_10_en_rows = en_rows.iloc[:10]

# Show the opening 2000 characters of each body text, separated by a dashed rule
separator = '-' * 50
for index, row in first_10_en_rows.iterrows():
    body_preview = row['raw_body_text'][:2000]
    print(f"Row {index}:\n{body_preview}\n{separator}\n")


In [None]:
import pandas as pd

# Collect one example (the first row) per detected language, showing the
# truncated original text next to its truncated translation
max_chars_body = 2000  # Adjust as needed
examples = []

for lang in df_papers['lang'].unique():
    example_row = df_papers[df_papers['lang'] == lang].iloc[0]
    original_text = example_row['raw_body_text']
    translated_text = example_row['raw_body_text_en']
    # Falsy values (None, empty string) are shown as an empty string
    examples.append({
        'Language': lang,
        'Truncated Original Body Text': original_text[:max_chars_body] if original_text else "",
        'Truncated Translated Body Text': translated_text[:max_chars_body] if translated_text else "",
    })

examples_df = pd.DataFrame(examples)
print(examples_df)


In [None]:
import pandas as pd

# Requires df_papers with 'lang', 'raw_body_text' and 'raw_body_text_en' columns

# English rows need no translation: where the body text is present and
# non-empty, carry it over to 'raw_body_text_en' unchanged
has_body_text = df_papers['raw_body_text'].notna() & (df_papers['raw_body_text'] != '')
english_with_text = (df_papers['lang'] == 'en') & has_body_text
df_papers.loc[english_with_text, 'raw_body_text_en'] = df_papers.loc[english_with_text, 'raw_body_text']


In [None]:
import pandas as pd

# Requires df_papers with 'lang', 'raw_body_text' and 'raw_body_text_en' columns

# Filter to only English language papers
english_papers = df_papers[df_papers['lang'] == 'en']

# Take a random sample of up to 10 English papers. DataFrame.sample raises
# when asked for more rows than exist, so the size is capped at what is
# available.
sample_size = min(10, len(english_papers))
sampled_english_papers = english_papers.sample(n=sample_size)

# Create the examples DataFrame with truncated text
max_chars_body = 2000  # Adjust as needed
examples = []

for index, row in sampled_english_papers.iterrows():
    # Missing values are shown as empty strings
    truncated_body_original = row['raw_body_text'][:max_chars_body] if pd.notna(row['raw_body_text']) else ""
    truncated_body_translated = row['raw_body_text_en'][:max_chars_body] if pd.notna(row['raw_body_text_en']) else ""
    examples.append({
        'Language': row['lang'],
        'Truncated Original Body Text': truncated_body_original,
        'Truncated Translated Body Text': truncated_body_translated
    })

examples_df = pd.DataFrame(examples)
print(examples_df)


In [None]:
import pandas as pd

# Function to count paragraphs in a text
def count_paragraphs(text):
    """Count the newline-separated paragraphs in `text`.

    Blank or whitespace-only segments (consecutive or trailing newlines) are
    not counted, so an empty string has 0 paragraphs.

    Args:
        text: The text to analyse; None/NaN is treated as having no paragraphs.

    Returns:
        The number of non-blank paragraphs as an int.
    """
    if pd.isna(text):
        return 0
    return sum(1 for paragraph in text.split('\n') if paragraph.strip())

# Count the paragraphs of every English (or translated) body text
df_papers['paragraph_count'] = df_papers['raw_body_text_en'].apply(count_paragraphs)

# Summarise the counts: mean, smallest and largest
paragraph_counts = df_papers['paragraph_count']
avg_paragraphs = paragraph_counts.mean()
min_paragraphs = paragraph_counts.min()
max_paragraphs = paragraph_counts.max()

print(f"Average number of paragraphs per dissertation: {avg_paragraphs}")
print(f"Minimum number of paragraphs in a dissertation: {min_paragraphs}")
print(f"Maximum number of paragraphs in a dissertation: {max_paragraphs}")


In [None]:
import pandas as pd
import re

# Requires the 'paragraph_count' column computed in the previous cell
# Flag dissertations with fewer than 50 paragraphs, then count the flags
has_fewer_than_50 = df_papers['paragraph_count'] < 50
dissertations_fewer_than_50_paragraphs = has_fewer_than_50.sum()
print(f"Dissertations with fewer than 50 paragraphs: {dissertations_fewer_than_50_paragraphs}")

# Function to count sentences in a text
def count_sentences(text):
    """Roughly count the sentences in `text`.

    The text is split on a period followed by whitespace, or a period at the
    very end of the string. Empty pieces, such as the one left after a final
    period, are not counted.

    Args:
        text: The text to analyse; None/NaN is treated as having no sentences.

    Returns:
        The number of non-empty sentence fragments as an int.
    """
    if pd.isna(text):
        return 0
    # Simple sentence split based on period followed by space or end of string
    fragments = re.split(r'\.\s+|\.$', text)
    # Without this filter "A. B." would count 3 sentences and "" would count 1
    return sum(1 for fragment in fragments if fragment.strip())

# Count the sentences of every English (or translated) body text
df_papers['sentence_count'] = df_papers['raw_body_text_en'].apply(count_sentences)

# Summarise the counts: mean, smallest and largest
sentence_counts = df_papers['sentence_count']
avg_sentences = sentence_counts.mean()
min_sentences = sentence_counts.min()
max_sentences = sentence_counts.max()

print(f"Average number of sentences per dissertation: {avg_sentences}")
print(f"Minimum number of sentences in a dissertation: {min_sentences}")
print(f"Maximum number of sentences in a dissertation: {max_sentences}")


In [None]:
!pip install spacy
!python -m spacy download en_core_web_lg

In [None]:
import spacy
import pandas as pd
from tqdm.auto import tqdm
from datetime import datetime

# Load the spaCy pipeline with the 'ner' and 'parser' components disabled,
# then switch on the 'senter' component for sentence segmentation
nlp = spacy.load('en_core_web_lg', disable=['ner', 'parser'])
nlp.enable_pipe("senter")

# Raise the maximum accepted text length to 3 million characters (adjust as needed)
nlp.max_length = 3_000_000

# Function to split text into sentences using spaCy
def split_into_sentences(text):
    """Split `text` into sentences with the global spaCy pipeline `nlp`.

    Returns a list of whitespace-stripped sentence strings, or an empty list
    when `text` is missing (None/NaN) or has zero length.
    """
    if pd.isna(text) or not len(text):
        return []
    stripped_sentences = []
    for sentence_span in nlp(text).sents:
        stripped_sentences.append(sentence_span.text.strip())
    return stripped_sentences

# Requires df_papers with a 'raw_body_text_en' column

# Collect (paper_id, sentence) pairs in plain lists and build the DataFrame
# once at the end. Concatenating onto a growing DataFrame inside the loop
# re-copies all previously collected rows on every iteration.
paper_ids = []
sentences_flat = []

# Process each row in df_papers; the row's index label serves as the paper id
for index, row in tqdm(df_papers.iterrows(), total=len(df_papers), desc="Processing dissertations"):
    sentences = split_into_sentences(row['raw_body_text_en'])
    paper_ids.extend([index] * len(sentences))
    sentences_flat.extend(sentences)

df_papers_sents = pd.DataFrame({'paper_id': paper_ids, 'sentence': sentences_flat})

# Write out the results to a JSONL file, stamped with today's date (YYMMDD)
output_filename = f'df_papers_sents_{datetime.now().strftime("%y%m%d")}.jsonl'
df_papers_sents.to_json(output_filename, orient='records', lines=True)

print(f"Data written to {output_filename}")


In [None]:
# %pip install 'urllib3<2' 

In [None]:
# %pip install -U --pre "weaviate-client==v4.4b2"
# %pip install --upgrade requests

In [None]:
# Print df_papers_sents' column names, dtypes and non-null counts
df_papers_sents.info()

In [None]:
import pandas as pd
import torch
import uuid
from transformers import AutoTokenizer, AutoModel
from tqdm.auto import tqdm
import weaviate
import json

# Load the embedding tokenizer and model from the same checkpoint, then move
# the model to the GPU when one is available (CPU otherwise)
embedding_checkpoint = "BAAI/bge-large-en-v1.5"
tokenizer = AutoTokenizer.from_pretrained(embedding_checkpoint)
model = AutoModel.from_pretrained(embedding_checkpoint)
embedding_device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(embedding_device)

# Function to Compute Embeddings
def compute_embeddings(text):
    """Embed `text` by mean-pooling the model's last hidden state.

    Args:
        text: A string, or a list of strings, to embed. Inputs are truncated
            to 512 tokens.

    Returns:
        The pooled embedding as (nested) Python lists of floats; singleton
        dimensions are squeezed out, so a single string yields a flat list.

    The average is weighted by the attention mask so that padding tokens,
    which appear when several texts of different lengths are passed at once,
    do not dilute the embedding. For a single text the mask is all ones and
    this equals a plain mean over tokens.
    """
    # NOTE(review): the BGE model card reportedly recommends CLS pooling plus
    # L2 normalisation — confirm before relying on these vectors for retrieval.
    inputs = tokenizer(text, return_tensors="pt", padding=True, truncation=True, max_length=512).to(model.device)
    with torch.no_grad():
        outputs = model(**inputs)
    hidden_states = outputs.last_hidden_state
    token_mask = inputs["attention_mask"].unsqueeze(-1).to(hidden_states.dtype)
    summed = (hidden_states * token_mask).sum(dim=1)
    token_counts = token_mask.sum(dim=1).clamp(min=1)
    pooled = summed / token_counts
    # Return a list of floats for the embedding
    return pooled.squeeze().cpu().numpy().tolist()

# Initialize Weaviate Client against the instance at http://localhost:8080
client = weaviate.Client("http://localhost:8080")

# Validation Function
def is_valid_data(data_object):
    """Placeholder validator for objects about to be stored.

    No checks are implemented yet, so every `data_object` is accepted.
    """
    # TODO: add validation logic here and return False for invalid objects
    is_valid = True
    return is_valid

# Trial run on the first 15 sentences. The sample lives in its own variable
# so that df_papers_sents keeps all of its rows for the full load later on.
df_sample = df_papers_sents.head(15).copy()

# The loop below needs a 'uuid' per row; create one for the sample when the
# column has not been added to df_papers_sents yet
if 'uuid' not in df_sample.columns:
    df_sample['uuid'] = [str(uuid.uuid4()) for _ in range(len(df_sample))]

# Check for Empty 'sentence' Column
empty_sentences = df_sample[df_sample['sentence'].isna() | (df_sample['sentence'] == '')]
if not empty_sentences.empty:
    print(f"Empty sentences found in rows: {empty_sentences.index.tolist()}")
else:
    print("No empty sentences found.")

# Process Data and Store in Weaviate
for index, row in tqdm(df_sample.iterrows(), total=df_sample.shape[0], desc="Processing Rows"):
    text = row['sentence']
    if pd.notna(text) and text.strip():  # Check if the text is not empty or NaN
        embedding = compute_embeddings(text)

        # Debugging: Print the type and a portion of the embedding
        print(f"Embedding type: {type(embedding)}")
        print(f"Embedding sample: {embedding[:10]}")

        # NOTE(review): 'reference' stores the sentence row index, not the
        # 'paper_id' column — confirm which one is intended.
        data_object = {
            "text": text,
            "reference": index
        }
        unique_id = row['uuid']
        try:
            # The embedding is passed as the object's vector (not as a
            # regular property) so that Weaviate indexes it for similarity search
            client.data_object.create(data_object, "RFSDPDissesEmbeddingEn", unique_id, vector=embedding)
            print(f"Added document embedding with UUID: {unique_id}")
        except weaviate.UnexpectedStatusCodeException as e:
            error_details = str(e)
            print(f"Error adding document embedding at index {index}: {error_details}")

print("Completed processing and storing embeddings.")


In [None]:
import pandas as pd
import torch
import uuid
from transformers import AutoTokenizer, AutoModel
from tqdm.auto import tqdm
import weaviate

# Load the embedding tokenizer and model from the same checkpoint, then move
# the model to the GPU when one is available (CPU otherwise)
embedding_checkpoint = "BAAI/bge-large-en-v1.5"
tokenizer = AutoTokenizer.from_pretrained(embedding_checkpoint)
model = AutoModel.from_pretrained(embedding_checkpoint)
embedding_device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(embedding_device)

# Function to Compute Embeddings
def compute_embeddings(text):
    """Embed `text` by mean-pooling the model's last hidden state.

    Args:
        text: A string, or a list of strings, to embed. Inputs are truncated
            to 512 tokens.

    Returns:
        The pooled embedding as (nested) Python lists of floats; singleton
        dimensions are squeezed out, so a single string yields a flat list.

    The average is weighted by the attention mask so that padding tokens,
    which appear when several texts of different lengths are passed at once,
    do not dilute the embedding. For a single text the mask is all ones and
    this equals a plain mean over tokens.
    """
    # NOTE(review): the BGE model card reportedly recommends CLS pooling plus
    # L2 normalisation — confirm before relying on these vectors for retrieval.
    inputs = tokenizer(text, return_tensors="pt", padding=True, truncation=True, max_length=512).to(model.device)
    with torch.no_grad():
        outputs = model(**inputs)
    hidden_states = outputs.last_hidden_state
    token_mask = inputs["attention_mask"].unsqueeze(-1).to(hidden_states.dtype)
    summed = (hidden_states * token_mask).sum(dim=1)
    token_counts = token_mask.sum(dim=1).clamp(min=1)
    pooled = summed / token_counts
    return pooled.squeeze().cpu().numpy().tolist()

# Initialize Weaviate Client against the instance at http://localhost:8080
client = weaviate.Client("http://localhost:8080")

# Delete existing class (if needed). This is best effort: a failure (for
# example because the class does not exist yet) is reported and the cell
# carries on.
try:
    client.schema.delete_class("RFSDPDissesEmbeddingEn")
    print("Class RFSDPDissesEmbeddingEn deleted successfully.")
except Exception as e:
    print(f"Error deleting class: {e}")

# Create new class schema.
# "vectorizer" and "vectorIndexType" are class-level settings, so they sit
# next to "class" rather than inside a property. The "embedding" property
# holds a list of floats, hence the array type "number[]".
class_schema = {
    "class": "RFSDPDissesEmbeddingEn",
    "description": "Store embeddings of sentences from dissertations",
    "vectorizer": "none",  # vectors are computed in this notebook, not by Weaviate
    "vectorIndexType": "hnsw",
    "properties": [
        {
            "name": "text",
            "dataType": ["text"],
            "description": "The text of the sentence",
        },
        {
            "name": "embedding",
            "dataType": ["number[]"],
            "description": "The embedding vector",
        },
        {
            "name": "reference",
            "dataType": ["int"],
            "description": "A reference to the original paper",
        }
    ]
}

# Create new class
try:
    client.schema.create_class(class_schema)
    print("Class RFSDPDissesEmbeddingEn created successfully.")
except Exception as e:
    print(f"Error creating class: {e}")

# Load DataFrame (Assuming it's already available as df_papers_sents)
# df_papers_sents = pd.read_csv('your_csv_file.csv') # Uncomment if needed

# Add a UUID column to the DataFrame (fresh random ids on every run)
df_papers_sents['uuid'] = [str(uuid.uuid4()) for _ in range(len(df_papers_sents))]

# Process Data and Store in Weaviate
batch_size = 256  # Adjust as needed; it only sets the progress-bar granularity
for start_idx in tqdm(range(0, len(df_papers_sents), batch_size), desc="Processing Batches"):
    end_idx = min(start_idx + batch_size, len(df_papers_sents))
    batch_df = df_papers_sents.iloc[start_idx:end_idx]

    for index, row in batch_df.iterrows():
        text = row['sentence']
        # Skip missing or blank sentences
        if pd.isna(text) or not text.strip():
            continue
        embedding = compute_embeddings(text)
        # NOTE(review): 'reference' stores the sentence row index, not the
        # 'paper_id' column — confirm which one is intended.
        data_object = {
            "text": text,
            "reference": index
        }
        unique_id = row['uuid']
        try:
            # The embedding is passed as the object's vector (not as a
            # regular property) so that Weaviate indexes it for similarity search
            client.data_object.create(data_object, "RFSDPDissesEmbeddingEn", unique_id, vector=embedding)
            print(f"Added document embedding with UUID: {unique_id}")
        except Exception as e:
            print(f"Error adding document embedding: {e}")

print("Completed processing and storing embeddings.")
