In [1]:
# Latex match Grobid
import os
import random
import regex as re
from thefuzz import fuzz
from thefuzz import process
from collections import OrderedDict
from grobid_parser import GParser
import xml.etree.ElementTree as ET
import json

# 查询一段区间是否完全在另一段区间里面
def if_inside(span1, span2, strict=False):
    if strict:
        if span1[0] > span2[0] and span1[1] < span2[1]:
            return True
        else:
            return False   
    else:   
        if span1[0] >= span2[0] and span1[1] <= span2[1]:
            return True
        else:
            return False

def clean_equ_sentences(equ_sentence_spans, clean_data):
    equ_sentence_lst = []
    for equ_sentence_span in equ_sentence_spans:
        equ_sentence = clean_data[equ_sentence_span[0]:equ_sentence_span[1]].replace('\n',' ').strip()# 初步清洗
        equ_sentence = re.sub(r'\s+', ' ', equ_sentence)
        equ_sentence = re.sub(r'(?!\\begin\{equation\})(?!\\end\{equation\})(?!\\begin\{multline\})(?!\\end\{multline\})((\\begin\{.*?\})|(\\end\{.*?\})|(\\label\{.*?\})|\\item|\\qqq|\\ee|\\ref{.*?}|\\eqref{.*?}|\\footnote{.*?}|\\cite{.*?})', '', equ_sentence)
        equ_sentence = equ_sentence.replace('\n',' ').strip()# again
        equ_sentence = re.sub(r'\s+', ' ', equ_sentence)
        equ_sentence_lst.append(equ_sentence)
    return equ_sentence_lst

def merge_span(span1, span2):
    return (min(span1[0], span1[1], span2[0], span2[1]), max(span1[0], span1[1], span2[0], span2[1]))


In [2]:
# fetch data
tex_base_path = "/home/jihuawei/jhw_pdf_parser/grobid_base/datasetgenius/formula/tmp/tex"
xml_base_path = "/home/jihuawei/jhw_pdf_parser/grobid_base/datasetgenius/formula/tmp/xmls"
tex_folds_rand_sample = os.listdir(tex_base_path)

pdf_parser = GParser(parse_imgs=False)


In [3]:
# 提取latex种包含公式的句子
# 思路：
# 获得公式及位于的index
# 获得段落句点的index
# 看公式落在哪两个句点之间
# 提取出来

"""
# 找出句点
(?<!No)(?<!\.g)(?<!\.e)(?<!\{)\.(?!g\.)(?!e\.)(?!\d)(?=\s)
# 提取公式
(?<!\\)(?:((?<!\$)\${1,2}(?!\$))|(\\begin\{equation\})|(\\\()|(\\\[)|(\\begin\{multline\}))(?(1)(.*?)(?<!\\)(?<!\$)\1(?!\$)|(?:([\s\S]*?)(?<!\\)(?:(?(2)\\end\{equation\}|(?(3)\\\)|(?(4)\\\]|(?(5)\\end\{multline\})))))))
# 清洗公式
(?!\\begin\{equation\})(?!\\end\{equation\})(?!\\begin\{multline\})(?!\\end\{multline\})((\\begin\{.*?\})|(\\end\{.*?\})|(\\label\{.*?\})|\\item|\\qqq|\\ee|\\ref{.*?}|\\eqref{.*?}|\\footnote{.*?}|\\cite{.*?})
"""
matched_pairs_all = []
for tex_name in tex_folds_rand_sample:
    equ_sentence_lst_all = []
    xml_sentence_lst_all = []
    for tex_file in os.listdir(os.path.join(tex_base_path, tex_name)):
        with open(os.path.join(tex_base_path, tex_name, tex_file), encoding='ISO-8859-1') as fp:
            data = fp.read()
        # 注释去掉
        clean_data = re.sub(r'(?<!\\)%.*\n?', '', data)
        # \begin{figure}...\end{figure}去掉
        clean_data = re.sub(r'\\begin\{figure\}[\s\S]*?\\end\{figure\}', '', clean_data)
        # \begin{algorithm}...\end{algorithm}去掉
        clean_data = re.sub(r'\\begin\{algorithm\}[\s\S]*?\\end\{algorithm\}', '', clean_data)
        # \begin{lstlisting}...\end{lstlisting}去掉
        clean_data = re.sub(r'\\begin\{lstlisting\}[\s\S]*?\\end\{lstlisting\}', '', clean_data)
        clean_data = clean_data.replace('\n', ' ')
        # 提取公式
        reg1 = r"(?<!\\)(?:((?<!\$)\${1,2}(?!\$))|(\\begin\{equation\})|(\\\()|(\\\[)|(\\begin\{multline\}))(?(1)(.*?)(?<!\\)(?<!\$)\1(?!\$)|(?:([\s\S]*?)(?<!\\)(?:(?(2)\\end\{equation\}|(?(3)\\\)|(?(4)\\\]|(?(5)\\end\{multline\})))))))"
        pattern1 = re.compile(reg1, re.MULTILINE)
        matches1 = pattern1.finditer(clean_data)   
        equ_spans = [match.span() for match in matches1]
        # 提取句点
        reg2 = r"(?<!No)(?<!\.g)(?<!\.e)(?<!\{)\.(?!g\.)(?!e\.)(?!\d)(?=\s)"
        pattern2 = re.compile(reg2, re.MULTILINE)
        matches2 = pattern2.finditer(clean_data)
        period_spans = [match.span() for match in matches2]

        # 对period_spans进行筛选，先把跨equation的span合起来
        drop_per_lst = []
        for equ_span in equ_spans:
            equ_right = equ_span[1]
            for period_span in period_spans:
                per_right = period_span[1]
                if per_right <= equ_right:
                    if if_inside((per_right,per_right), equ_span):
                        drop_per_lst.append(period_span)
                        
                else:
                    break
        for drop_per in drop_per_lst:
            period_spans.remove(drop_per)# 去掉这个句点就是合起来了

        # 把包含公式的句子找出来
        sentence_spans = []
        for i in range(len(period_spans)):
            if i == 0:
                sentence_spans.append((0, period_spans[i][1]))
            elif i == len(period_spans):
                equ_spans.append((period_spans[i][1], len(period_spans)))
            else:
                sentence_spans.append((period_spans[i-1][1], period_spans[i][1]))
        equ_sentence_spans = []
        for equ_span in equ_spans:
            for sentence_span in sentence_spans:
                if if_inside(equ_span, sentence_span):
                    equ_sentence_spans.append(sentence_span)
                    break
        equ_sentence_spans = list(OrderedDict.fromkeys(equ_sentence_spans).keys())
        equ_sentence_spans = [equ_sentence for equ_sentence in equ_sentence_spans if equ_sentence[1] - equ_sentence[0]<= 800]# 去掉极长的
        equ_sentence_lst = clean_equ_sentences(equ_sentence_spans, clean_data)
        equ_sentence_lst_all = equ_sentence_lst_all + equ_sentence_lst
    

    # 清洗xml
    clean_data = ""
    xml_path = os.path.join(xml_base_path, tex_name+'.tei.xml')
    pdf_name = os.path.basename(xml_path).replace(".tei.xml", ".pdf")
    pdf_parser.pdf_name = pdf_name
    tree = ET.parse(xml_path)
    root = tree.getroot()
    result = pdf_parser.parse_all(root, pdf_name, f"{pdf_name.strip('.pdf')}.json")
    result.pop('back')
    for name in result.keys():
        top_el = result[name]
        for el in top_el:
            if 'tail' not in el.keys():
                clean_data = clean_data + el['txt'] + ' '
            else:
                clean_data = clean_data + el['txt'] + ' '
                clean_data = clean_data + el['tail'] + ' '
    clean_data= clean_data.replace('\n', ' ')
    # 提取句点
    reg2 = r"(?<!No)(?<!\.g)(?<!\.e)(?<!\{)\.(?!g\.)(?!e\.)(?!\d)(?=\s)"
    pattern2 = re.compile(reg2, re.MULTILINE)
    matches2 = pattern2.finditer(clean_data)
    period_spans = [match.span() for match in matches2]
    sentence_spans = []
    for i in range(len(period_spans)):
        if i == 0:
            sentence_spans.append((0, period_spans[i][1]))
        elif i == len(period_spans):
            equ_spans.append((period_spans[i][1], len(period_spans)))
        else:
            sentence_spans.append((period_spans[i-1][1], period_spans[i][1]))
    sentence_spans_clean = []
    for i in range(len(sentence_spans)):
        if i == len(sentence_spans)-1:
            sentence_spans_clean.append(sentence_spans[i])
        else:
            if sentence_spans[i][1]-sentence_spans[i][0] <= 12:
                sentence_spans[i+1] = merge_span(sentence_spans[i], sentence_spans[i+1])
            else:
                sentence_spans_clean.append(sentence_spans[i])
    sentence_spans = [sentence_span for sentence_span in sentence_spans_clean if sentence_span[1]-sentence_span[0] <= 800]
    xml_sentence_lst_all = clean_equ_sentences(sentence_spans, clean_data)

    # 匹配
    matched_pairs = []
    choices = equ_sentence_lst_all
    for i in range(len(xml_sentence_lst_all)):
        xml_sentence = xml_sentence_lst_all[i]
        matched = process.extractOne(xml_sentence, choices, scorer=fuzz.ratio)
        if matched is not None:
            equ_text = matched[0]
            score = matched[1]
            if score >= 80:# and len(xml_sentence)/len(equ_text) >= 0.85 and len(xml_sentence)/len(equ_text) <= 1.18
                matched_pairs.append((xml_sentence, equ_text, score))
        else:
            continue
    matched_pairs_all = matched_pairs_all + matched_pairs
with open("11.03.json", "w") as fp:
    json.dump(matched_pairs_all, fp, indent=4)

Applied processor reduces input query to empty string, all comparisons will have score 0. [Query: '. . . . . .']
Applied processor reduces input query to empty string, all comparisons will have score 0. [Query: '. . . . . . .']
Applied processor reduces input query to empty string, all comparisons will have score 0. [Query: '. . . . . . .']
Applied processor reduces input query to empty string, all comparisons will have score 0. [Query: '. .) = (. . .']
Applied processor reduces input query to empty string, all comparisons will have score 0. [Query: '. . . . . . .']
Applied processor reduces input query to empty string, all comparisons will have score 0. [Query: '. . . . . . .']
Applied processor reduces input query to empty string, all comparisons will have score 0. [Query: '. . . . . . .']
Applied processor reduces input query to empty string, all comparisons will have score 0. [Query: '. . . . . . .']
Applied processor reduces input query to empty string, all comparisons will have s

In [1]:
import json
with open('./11.03.json') as fp:
    data = json.load(fp)

In [4]:
len(data)

52895

In [3]:
import random

data_sample = random.sample(data, 200)
with open("dataset.json", "w") as fp:
    json.dump(data_sample, fp, indent=4)