In [1]:
import json

import numpy as np
import pandas as pd
from pandas import DataFrame
from tqdm import tqdm

# Register tqdm's pandas integration so progress-bar variants of `apply` are available.
tqdm.pandas()

In [None]:
# Load the data
# NOTE(review): unpickling can execute arbitrary code; only load this file from a trusted source.
documents_to_process_reduced = pd.read_pickle('documents_to_process_reduced.pkl')

# Construcción del conjunto de datos de entrenamiento

In [9]:
import spacy

# Shared spaCy pipeline: the pattern/matcher builders and the training-data helpers
# defined later in this notebook all read this global.
nlp = spacy.load("es_core_news_lg")

In [10]:
from functools import reduce
from typing import Any
from spacy.tokens import Span, Doc
from spacy.matcher import Matcher
from spacy import displacy
import numpy as np
import random

# Custom Doc attributes (`doc._.pdf_path`, `doc._.xml_path`) that the training-data
# helpers fill in from each row. `force=True` allows re-running this cell even though
# the extensions are then already registered.
Doc.set_extension('pdf_path', default=None, force=True)
Doc.set_extension('xml_path', default=None, force=True)

def build_patterns(text: str) -> list[list[dict[str, Any]]]:
    """Build Matcher token patterns for ``text``.

    The text is tokenized with the global ``nlp`` pipeline. Two patterns are
    returned: one matching the tokens in their original order and one matching
    them in reversed order. Each token is matched case-insensitively (``LOWER``)
    and an optional whitespace token is allowed between consecutive tokens.

    Returns an empty list when the text produces no tokens.
    """
    tokens = list(nlp(text))
    if not tokens:
        return []

    def to_pattern(ordered_tokens):
        # Interleave the token rules with optional whitespace rules (none after the last token).
        pattern = []
        for position, token in enumerate(ordered_tokens):
            if position > 0:
                pattern.append({'IS_SPACE': True, 'OP': '?'})
            pattern.append({'LOWER': token.text.lower()})
        return pattern

    return [to_pattern(tokens), to_pattern(tokens[::-1])]

def build_matcher(entities: list[dict]):
    """Create a spaCy ``Matcher`` with one rule per entity.

    Args:
        entities: dicts with an ``'ENTITY_TYPE'`` key (used as the match key)
            and a ``'TEXT'`` key (the text to look for). Entries whose text is
            ``None`` or a float NaN are skipped.

    Returns:
        A ``Matcher`` bound to the global ``nlp`` vocabulary.
    """
    matcher = Matcher(nlp.vocab)
    for entity in entities:
        text = entity['TEXT']
        # isinstance (rather than an exact type comparison) also recognises float
        # subclasses such as numpy.float64, so those NaNs are skipped instead of
        # being handed to the pipeline as if they were text.
        if text is None or (isinstance(text, float) and np.isnan(text)):
            continue
        matcher.add(entity['ENTITY_TYPE'], build_patterns(text))
    return matcher

def is_overlapping(match1, match2):
    """Return True when two matches share at least one token.

    Each match is a sequence whose items 1 and 2 are the start and end token
    indices. The end index is exclusive (the match covers ``doc[start:end]``),
    so two matches that merely touch (one ends where the other starts) do NOT
    overlap.
    """
    start_1, end_1 = match1[1], match1[2]
    start_2, end_2 = match2[1], match2[2]
    # Strict comparisons: with half-open spans, end == start means adjacency, not overlap.
    return start_1 < end_2 and start_2 < end_1
    
def resolve_matches_conflicts(matches: list[tuple[int,int, int]]):
    """Drop overlapping matches, favouring higher-priority entity types.

    Matches are visited label by label in the fixed priority order below; a
    match is kept only if it does not overlap (per ``is_overlapping``) any
    match that has already been kept. Matches whose label is not in the
    priority list are discarded.
    """
    priority_order = ['TITLE', 'AUTHOR', 'ADVISOR', 'YEAR', 'FACULTY', 'DEPARTMENT']
    accepted = []
    for label in priority_order:
        for candidate in matches:
            # The match id is a hash; look up its string form to compare with the label.
            if nlp.vocab.strings[candidate[0]] != label:
                continue
            if any(is_overlapping(kept, candidate) for kept in accepted):
                continue
            accepted.append(candidate)
    return accepted
            
def get_training_data(row, entities: list[dict]):
    """Annotate the cover page of ``row`` with the given entities.

    Args:
        row: mapping providing ``'pdf_cover_page'``, ``'pdf_path'`` and ``'xml_path'``.
        entities: entity descriptions accepted by ``build_matcher``.

    Returns:
        The processed ``Doc`` with its entities set from the conflict-resolved
        matches, or ``np.nan`` when a ``ValueError`` whose message contains
        ``'[E1010]'`` is raised (a warning is printed in that case).

    Raises:
        ValueError: any other ``ValueError`` is re-raised unchanged.
    """
    try:
        entity_matcher = build_matcher(entities)
        cover_doc = nlp(row['pdf_cover_page'])
        # Remember which source files this document was built from.
        cover_doc._.pdf_path = row['pdf_path']
        cover_doc._.xml_path = row['xml_path']
        kept_matches = resolve_matches_conflicts(entity_matcher(cover_doc))
        spans = [
            Span(cover_doc, start=start, end=end, label=nlp.vocab.strings[match_id])
            for match_id, start, end in kept_matches
        ]
        cover_doc.set_ents(entities=spans)
        return cover_doc
    except ValueError as e:
        if '[E1010]' not in str(e):
            raise e
        print("WARNING", e)
        return np.nan

def get_training_data_for_all_entities(row):
    """Build the training Doc for ``row`` annotating every entity type.

    Entities are TITLE, FACULTY, DEPARTMENT, YEAR, plus one AUTHOR per entry of
    ``row['authors']`` and one ADVISOR per entry of ``row['advisors']`` (full
    name = given names + last names).

    Returns whatever ``get_training_data`` returns for these entities.
    """
    year = row['year']
    if year is None or (isinstance(year, float) and np.isnan(year)):
        # Keep a missing year missing: str() would turn it into the literal
        # text 'None'/'nan', which the matcher would then search for.
        year_text = None
    elif isinstance(year, float) and year.is_integer():
        # A float such as 2015.0 must be searched for as '2015', not '2015.0'.
        year_text = str(int(year))
    else:
        year_text = str(year)

    entities = [
        {'ENTITY_TYPE': 'TITLE', 'TEXT': row['title']},
        {'ENTITY_TYPE': 'FACULTY', 'TEXT': row['faculty']},
        {'ENTITY_TYPE': 'DEPARTMENT', 'TEXT': row['department']},
        # The year can overlap with the title, since many titles contain a year.
        {'ENTITY_TYPE': 'YEAR', 'TEXT': year_text},
        *[{'ENTITY_TYPE': 'AUTHOR', 'TEXT': f'{author["given_names"]} {author["last_names"]}'} for author in row['authors']],
        *[{'ENTITY_TYPE': 'ADVISOR', 'TEXT': f'{author["given_names"]} {author["last_names"]}'} for author in row['advisors']],
    ]
    return get_training_data(row, entities)

def get_training_data_for_advisors(row):
    """Build the training Doc for ``row`` annotating only ADVISOR entities.

    One entity is created per entry of ``row['advisors']``, using the given
    names followed by the last names as the text to search for.
    """
    advisor_entities = []
    for advisor in row['advisors']:
        full_name = f'{advisor["given_names"]} {advisor["last_names"]}'
        advisor_entities.append({'ENTITY_TYPE': 'ADVISOR', 'TEXT': full_name})
    return get_training_data(row, advisor_entities)

def get_training_data_for_authors(row):
    """Build the training Doc for ``row`` annotating only AUTHOR entities.

    One entity is created per entry of ``row['authors']``, using the given
    names followed by the last names as the text to search for.
    """
    author_entities = []
    for author in row['authors']:
        full_name = f'{author["given_names"]} {author["last_names"]}'
        author_entities.append({'ENTITY_TYPE': 'AUTHOR', 'TEXT': full_name})
    return get_training_data(row, author_entities)

def get_training_data_for_title(row):
    """Build the training Doc for ``row`` annotating only the TITLE entity."""
    title_entity = {'ENTITY_TYPE': 'TITLE', 'TEXT': row['title']}
    return get_training_data(row, [title_entity])

def get_training_data_for_faculty(row):
    """Build the training Doc for ``row`` annotating only the FACULTY entity."""
    faculty_entity = {'ENTITY_TYPE': 'FACULTY', 'TEXT': row['faculty']}
    return get_training_data(row, [faculty_entity])

def get_training_data_for_department(row):
    """Build the training Doc for ``row`` annotating only the DEPARTMENT entity."""
    department_entity = {'ENTITY_TYPE': 'DEPARTMENT', 'TEXT': row['department']}
    return get_training_data(row, [department_entity])

def get_training_data_for_names(row):
    """Build a synthetic Doc of author names labelled LAST_NAME / GIVEN_NAME.

    For every author in ``row['authors']`` the values of the author dict are
    shuffled and joined with a space; the authors are then joined with
    newlines and the resulting text is processed and annotated with all
    matcher hits (no conflict resolution is applied here).

    NOTE(review): joining ``author.values()`` assumes every value of the author
    dict is a string — confirm against the data.

    Returns:
        The annotated ``Doc``, or ``np.nan`` when a ``ValueError`` whose message
        contains ``'[E1010]'`` is raised (a warning is printed in that case).

    Raises:
        ValueError: any other ``ValueError`` raised while processing is re-raised.
    """
    authors = row['authors']
    name_entities = [{'ENTITY_TYPE': 'LAST_NAME', 'TEXT': f'{author["last_names"]}'} for author in authors]
    name_entities += [{'ENTITY_TYPE': 'GIVEN_NAME', 'TEXT': f'{author["given_names"]}'} for author in authors]
    name_matcher = build_matcher(name_entities)

    # One line per author, with that author's name parts in random order.
    shuffled_lines = []
    for author in authors:
        parts = list(author.values())
        random.shuffle(parts)
        shuffled_lines.append(' '.join(parts))
    text = '\n'.join(shuffled_lines)

    try:
        names_doc = nlp(text)
        names_doc._.pdf_path = row['pdf_path']
        names_doc._.xml_path = row['xml_path']
        spans = [
            Span(names_doc, start=start, end=end, label=nlp.vocab.strings[match_id])
            for match_id, start, end in name_matcher(names_doc)
        ]
        names_doc.set_ents(entities=spans)
        return names_doc
    except ValueError as e:
        if '[E1010]' not in str(e):
            raise e
        print("WARNING", e)
        return np.nan
    

In [11]:
from spacy.tokens import DocBin
from random import seed, shuffle
from pandarallel import pandarallel

# Set up pandarallel (with a progress bar) so `parallel_apply` can be used on DataFrames.
pandarallel.initialize(progress_bar=True)

def generate_training_files(generator, output_train_file, ouput_test_file):
    """Generate annotated Docs for every document and write train/test DocBins.

    ``generator`` is applied row-wise (in parallel) to the global
    ``documents_to_process_reduced`` frame. Rows that produced no Doc (NaN) or
    a Doc without entities are discarded; the remaining Docs are shuffled with
    a fixed seed and split 80% / 20%.

    Args:
        generator: callable taking a row and returning a ``Doc`` or NaN.
        output_train_file: path the training ``DocBin`` is written to.
        ouput_test_file: path the test ``DocBin`` is written to.

    Returns:
        The unfiltered Series of generator results (one per input row).
    """
    docs = documents_to_process_reduced.parallel_apply(generator, axis='columns')

    # Keep only real Docs that carry at least one entity. A plain list filter
    # also works when nothing survives, unlike expanding each Doc into a row.
    annotated_docs = [doc for doc in docs.dropna() if len(doc.ents) > 0]

    # Fixed seed so the train/test split is reproducible between runs.
    seed(42)
    shuffle(annotated_docs)

    split_at = len(annotated_docs) * 4 // 5  # first 80% -> train, remaining 20% -> test
    DocBin(docs=annotated_docs[:split_at]).to_disk(output_train_file)
    DocBin(docs=annotated_docs[split_at:]).to_disk(ouput_test_file)
    return docs

INFO: Pandarallel will run on 6 workers.
INFO: Pandarallel will use Memory file system to transfer data between the main process and workers.


In [12]:
# Maps the prefix of the output files to the function that builds the annotated Docs.
generator_per_entity = {
    'all': get_training_data_for_all_entities,
    'title': get_training_data_for_title,
    'authors': get_training_data_for_authors,
    'advisors': get_training_data_for_advisors,
    'faculty': get_training_data_for_faculty,
    'department': get_training_data_for_department,
    'names': get_training_data_for_names,
}

# Keep the result of every run instead of overwriting a single variable on each iteration.
training_docs_per_entity = {}
for entity, generator in generator_per_entity.items():
    print(f"Generating training files for {entity}")
    training_docs_per_entity[entity] = generate_training_files(generator, f'{entity}_train.spacy', f'{entity}_test.spacy')

# Later cells inspect the cover-page documents annotated with all entity types,
# not whichever entity happened to be generated last.
training_docs = training_docs_per_entity['all']

VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=2681), Label(value='0 / 2681'))), …

In [13]:
# Load the best checkpoint of the trained NER model for the chosen entity set.
entity = "all_year"
nlp_test = spacy.load(f'./{entity}_ner_model/model-best')

In [33]:
import random

# Pick a random generated document with fewer than 3 gold entities and compare
# the trained model's predictions (rendered) with the gold annotations (printed).
test_docs: pd.Series = training_docs.dropna()
test_docs: pd.DataFrame = test_docs.apply(lambda doc: pd.Series([len(doc.ents), doc])).rename(columns={0: 'entities_count', 1: 'doc'})
test_docs = test_docs[test_docs['entities_count'] < 3]
# randrange excludes its upper bound; randint(0, len) could return len itself,
# which is one past the last valid position for .iloc.
test_doc = test_docs.iloc[random.randrange(len(test_docs))]['doc']

displacy.render(nlp_test(test_doc.text), style='ent')

for ent in test_doc.ents:
    print(f"{ent.label_}: {repr(ent.text)}")
print("###")

DEPARTMENT: 'CARRERA DE BIOQUIMICA'
AUTHOR: 'MARILYN DAYSI MORALES LOPEZ'
###
