In [3]:
# Load IPython's autoreload extension and switch it to mode 2, so edited
# modules are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [None]:
# Configuration text parsed further down with `confit.Config.from_str`.
# Sections: [vars] (paths and shared values), [nlp] / [components.*] (pipeline
# with "ner" and "qualifier" components), [scorer.*], data readers and
# dataloaders, and the [train] / [evaluate] script parameters.
# `${...}` entries are references to other sections/values of this same config.
confit_str = """
# --- Global variables ---

[vars]
train = "/export/home/pwajsburt/data/eds-biomedic/train/"
val = "/export/home/pwajsburt/data/eds-biomedic/val/"
test = "/export/home/pwajsburt/data/eds-biomedic/test/"
gold_span_group = {"gold_spans": ["dosage", "form", "strength", "Chemical_and_drugs", "BIO", "BIO_comp"]}
default_attributes = {"Certainty": "Certain", "Temporality": "Present", "Family": False, "Negation": False, "Allergie": False}

# --- Pipeline ---

[nlp]
lang = "eds"
pipeline = [
           "normalizer",
           "sentencizer",
           "ner",
           "qualifier",
           ]
batch_size = 8
components = ${components}
tokenizer = {"@tokenizers": "eds.tokenizer"}

[components.normalizer]
@factory = "eds.normalizer"

[components.sentencizer]
@factory = "eds.sentences"

# NER component
[components.ner]
@factory = "eds.ner_crf"
mode = "joint"
window = 40
target_span_getter = ${vars.gold_span_group}
# Set spans as both to ents and in separate `ent.label` groups
span_setter = [ "ents", "*" ]
infer_span_setter = true

[components.ner.embedding]
@factory = "eds.text_cnn"
kernel_sizes = [3, 5, 7]

[components.ner.embedding.embedding]
@factory = "eds.transformer"
model = "/export/home/share/datascientists/models/finetuning-camembert-2021-07-29/"
#model = "/export/home/pwajsburt/data/models/embedding-whole-word/checkpoint-250000/"
window = 128
stride = 96

# Qualifier component
[components.qualifier]
@factory = "eds.span_qualifier"
keep_none = true

[components.qualifier.qualifiers]
Allergie = ["Chemical_and_drugs"]
Action = ["Chemical_and_drugs"]
Certainty = ["Chemical_and_drugs"]
Temporality = ["Chemical_and_drugs"]
Negation = ["Chemical_and_drugs"]
Family = ["Chemical_and_drugs"]

[components.qualifier.embedding]
@factory = "eds.span_pooler"
span_getter = {"Chemical_and_drugs": True, "gold_spans": "Chemical_and_drugs"}

[components.qualifier.embedding.embedding]
@factory = "eds.text_cnn"
kernel_sizes = [3, 5, 7]
#embedding = ${components.ner.embedding.embedding}

[components.qualifier.embedding.embedding.embedding]
@factory = "eds.transformer"
model = "/export/home/share/datascientists/models/finetuning-camembert-2021-07-29/"
#model = "/export/home/pwajsburt/data/models/embedding-whole-word/checkpoint-250000/"
window = 255
stride = 128
span_getter = {
    "@misc": "eds.span_sentence_getter",
    "span_getter": ${components.qualifier.embedding.span_getter},
    "min_context_words": 30
    }

# --- Scorers ---

[scorer.ner.exact_ner]
@scorers = "eds.ner_exact_scorer"
span_getter = ${vars.gold_span_group}

[scorer.qualifier.qualifier]
@scorers = "eds.span_classification_scorer"
span_getter = ${components.qualifier.embedding.span_getter}
qualifiers = ${components.qualifier.qualifiers}
default_values = ${vars.default_attributes}

# --- Data ---

[val_data]
[val_data.source]
@readers = "standoff"
path = ${vars.val}
span_setter = ${vars.gold_span_group}
default_attributes = ${vars.default_attributes}

[test_data]
[test_data.source]
@readers = "standoff"
path = ${vars.test}
span_setter = ${vars.gold_span_group}
default_attributes = ${vars.default_attributes}

[ner_train_dataloader]
batch_size = 2000 words
grad_accumulation_max_tokens = ${512 * 128}
pipe_names = ["ner"]
[ner_train_dataloader.data]
randomize = true
max_length = 128
multi_sentence = true
[ner_train_dataloader.data.source]
@readers = "standoff"
path = ${vars.train}
span_setter = ${vars.gold_span_group}
default_attributes = ${vars.default_attributes}

[qlf_train_dataloader]
batch_size = 64 spans
grad_accumulation_max_tokens = ${512 * 128}
pipe_names = ["qualifier"]
[qlf_train_dataloader.data]
randomize = true
max_length = 128
multi_sentence = true
filter_expr = "sum([e.label_ == 'Chemical_and_drugs' for e in doc.spans['gold_spans']]) > 0"
[qlf_train_dataloader.data.source]
@readers = "standoff"
path = ${vars.train}
span_setter = ${vars.gold_span_group}
default_attributes = ${vars.default_attributes}

# --- Scripts ---

[train]
nlp = ${nlp}
max_steps = 4000
validation_interval = ${train.max_steps//10}
warmup_rate = 0.1
transformer_lr = 5e-5
task_lr = 5e-5
scorer = ${scorer}
train_dataloader = [${ner_train_dataloader}, ${qlf_train_dataloader}]
val_data = ${val_data}
test_data = ${test_data}
loss_scales = {"ner": 1, "qualifier": 300}

[evaluate]
scorer = ${scorer}
model_path = "artifacts/model-last"
data = ${test_data}
"""

In [None]:
import yaml

In [None]:
import confit

In [None]:
from io import StringIO
# Build a dumper writing to an in-memory buffer, to inspect its attributes.
# NOTE(review): ConfitDumper is defined in a later cell of this notebook, so this
# line fails with NameError on a fresh kernel executed top to bottom.
dumper = ConfitDumper(StringIO())

In [None]:
# Inspect `flow_level` on the freshly constructed dumper (the saved output is 0).
dumper.flow_level

0

In [41]:
import yaml

# Dedicated dumper subclass: the custom representers defined in this cell are
# registered on it.
class ConfitDumper(yaml.Dumper):
    """YAML dumper used to serialize confit configs with custom representers."""

# Lists are always written inline (flow style), e.g. `[a, b, c]`
def custom_list_representer(dumper, data):
    """Represent ``data`` as a YAML sequence node forced to flow style.

    Args:
        dumper: the dumper instance invoking this representer.
        data: the list to represent.

    Returns:
        Whatever ``dumper.represent_sequence`` returns for the standard
        sequence tag with ``flow_style=True``.
    """
    sequence_tag = 'tag:yaml.org,2002:seq'
    node = dumper.represent_sequence(sequence_tag, data, flow_style=True)
    return node

# Custom representer for dicts: an "@"-prefixed key is folded into the node tag,
# and the mapping is dumped inline only if nested in a flow context and "simple".
def custom_dict_representer(dumper, data):
    """Represent a mapping, turning its first ``@``-prefixed key into the YAML tag.

    If ``data`` has a string key starting with ``"@"`` (e.g. ``"@factory"``), the
    node tag becomes ``"!<key without @>: <value>"`` and that key is removed from
    a *copy* of the mapping (the caller's mapping is left untouched). Otherwise
    the standard map tag is used.

    Args:
        dumper: the dumper instance invoking this representer; its
            ``flow_level`` attribute and ``represent_mapping`` method are used.
        data: the mapping to represent. Keys may be of any type; only string
            keys are considered for the ``@`` convention.

    Returns:
        Whatever ``dumper.represent_mapping`` returns.

    Notes:
        - The space inside the tag is kept as-is; in the dumped text it shows up
          percent-encoded (``!factory:%20eds.normalizer``).
        - NOTE(review): ``flow_level`` was 0 on a freshly built dumper; confirm
          it is ever > 0 while representers run, otherwise ``is_inline`` is
          always False.
    """

    def _is_at_key(key):
        # Keys are not guaranteed to be strings (e.g. int keys): calling
        # `.startswith` on them unconditionally would raise AttributeError.
        return isinstance(key, str) and key.startswith("@")

    is_inline = dumper.flow_level > 0 and not any(
        _is_at_key(key) or isinstance(value, dict) for key, value in data.items()
    )
    at_key = next((k for k in data if _is_at_key(k)), None)
    if at_key:
        tag = "!" + at_key[1:] + ': ' + data[at_key]
        # Work on a copy so the caller's mapping keeps its "@" key
        data = dict(data)
        data.pop(at_key)
    else:
        tag = 'tag:yaml.org,2002:map'
    # An empty mapping lets the dumper pick its default style (flow_style=None)
    return dumper.represent_mapping(tag, data, flow_style=is_inline if len(data) else None)

# Register the custom representers on ConfitDumper:
# - lists go through the inline list representer,
# - plain dicts and confit.Config objects share the dict representer,
# - confit references are written as plain strings (their `str()` form).
ConfitDumper.add_representer(list, custom_list_representer)
for mapping_cls in (dict, confit.Config):
    ConfitDumper.add_representer(mapping_cls, custom_dict_representer)
ConfitDumper.add_representer(
    confit.config.Reference,
    lambda dumper, data: dumper.represent_scalar("tag:yaml.org,2002:str", str(data)),
)

# Small example payload (a list, a plain dict, and a dict with an "@" key).
# It is not passed to the dump call below, which serializes the parsed config.
data = {
    "list_example": [1, 2, 3],
    "dict_example_inline": {"key1": "value1", "key2": "value2"},
    "dict_example_block": {"@key1": "value1", "key2": "value2"},
}

# Parse the config text, then dump it with the custom dumper, keeping key order
# and disabling line wrapping
parsed_config = confit.Config.from_str(confit_str)
yaml_str = yaml.dump(
    parsed_config,
    Dumper=ConfitDumper,
    sort_keys=False,
    width=float("inf"),
)
print(yaml_str)

vars:
  train: /export/home/pwajsburt/data/eds-biomedic/train/
  val: /export/home/pwajsburt/data/eds-biomedic/val/
  test: /export/home/pwajsburt/data/eds-biomedic/test/
  gold_span_group:
    gold_spans: [dosage, form, strength, Chemical_and_drugs, BIO, BIO_comp]
  default_attributes:
    Certainty: Certain
    Temporality: Present
    Family: false
    Negation: false
    Allergie: false
nlp:
  lang: eds
  pipeline: [normalizer, sentencizer, ner, qualifier]
  batch_size: 8
  components: ${components}
  tokenizer: !tokenizers:%20eds.tokenizer {}
components:
  normalizer: !factory:%20eds.normalizer {}
  sentencizer: !factory:%20eds.sentences {}
  ner: !factory:%20eds.ner_crf
    mode: joint
    window: 40
    target_span_getter: ${vars.gold_span_group}
    span_setter: [ents, '*']
    infer_span_setter: true
    embedding: !factory:%20eds.text_cnn
      kernel_sizes: [3, 5, 7]
      embedding: !factory:%20eds.transformer
        model: /export/home/share/datascientists/models/finetuni

In [31]:
import ruamel.yaml
# Bare reference to the `ruamel.yaml.load` function (not a call).
ruamel.yaml.load

In [38]:
# Try loading a document holding a custom `!test` tag with ruamel's `Loader`.
# As saved, this cell raises ConstructorError: no constructor is known for the
# `!test` tag.
ruamel.yaml.load("""
train:
  nlp: ${nlp}
  max_steps: 4000
  validation_interval: ${train.max_steps//10}
  warmup_rate: 0.1
  transformer_lr: 5.0e-05
  task_lr: 5.0e-05
  scorer: ${scorer}
  train_dataloader:
      !test
      - ${ner_train_dataloader}
      - ${qlf_train_dataloader}
  val_data: ${val_data}
  test_data: ${test_data}
  loss_scales:
    ner: 1
    qualifier: 300
""", ruamel.yaml.Loader)

ConstructorError: could not determine a constructor for the tag '!test'
  in "<unicode string>", line 11, column 7:
          !test
          ^ (line: 11)

In [53]:
from ruamel.yaml import YAML
from ruamel.yaml import YAMLError as _YAMLError

# Round-trip ("rt") ruamel YAML instance.
# NOTE(review): this rebinds the name `yaml`, which earlier cells bind to the
# imported `yaml` module (used there as `yaml.Dumper` / `yaml.dump`).
yaml = YAML(typ="rt")

In [62]:
# Load a document whose first list entry carries a custom `!test` tag, using the
# `yaml` instance (expected to be the ruamel round-trip YAML object here), with
# no constructor registered for that tag.
# NOTE(review): the saved output below shows a `_target_` key while this cell's
# text uses `_factory_`, so the output looks like it predates the current text.
#yaml.add_constructor('!test', lambda loader, item: loader.construct_mapping(item), yaml.Loader)
res = yaml.load("""
authors:
- !test
  author: Yukihiro Matsumoto
  title: Ruby in a Nutshell
  year: 2002
  isbn: 0-596-00214-9
# test
-
  _factory_: test:app
  author: Yukihiro Matsumoto
  title: Ruby in a Nutshell
  year: 2002
  isbn: 0-596-00214-9
""")
print(type(res))
res

<class 'ruamel.yaml.comments.CommentedMap'>


{'authors': [{'author': 'Yukihiro Matsumoto', 'title': 'Ruby in a Nutshell', 'year': 2002, 'isbn': '0-596-00214-9'}, {'_target_': 'test:app', 'author': 'Yukihiro Matsumoto', 'title': 'Ruby in a Nutshell', 'year': 2002, 'isbn': '0-596-00214-9'}]}

In [None]:
# First entry of `authors`, i.e. the mapping written with the `!test` tag.
res['authors'][0]

{'author': 'Yukihiro Matsumoto', 'title': 'Ruby in a Nutshell', 'year': 2002, 'isbn': '0-596-00214-9'}

In [None]:
# Tag attached to the first `authors` entry. Spelled out explicitly rather than
# through IPython's `_` (last output), which only works when the previous cell
# was the last one executed.
res['authors'][0].tag

Tag('!test')

In [None]:
import os

In [46]:
# Local import so this cell does not depend on a later cell having run first
from pathlib import Path

# Read each annotated PDF as raw bytes; after the loop, `content` holds the
# bytes of the last file visited.
for name in Path("/Users/perceval/Downloads/PDFs_annotés").glob("*.pdf"):
    content = name.read_bytes()

In [19]:
# pip install edsnlp edspdf edspdf-mupdf python-docx
import edsnlp
import edspdf
import edspdf.utils.alignment
import rich.console
import os
from collections import defaultdict
from pathlib import Path
import docx  # `from docx.shared import ...` alone does not bind the name `docx` used below
from fitz import Document
from docx.shared import RGBColor

# Blank "eds" pipeline with only a sentence splitter, used to re-flow the
# extracted text one sentence per line
nlp = edsnlp.blank('eds')
nlp.add_pipe('eds.sentences')

# PDF pipeline providing the text boxes of each page
pipeline = edspdf.Pipeline()
pipeline.add_pipe("mupdf-extractor")

pdf_dir = Path("/Users/perceval/Downloads/PDFs_annotés")
out_dir = pdf_dir / "PDF_extraction"
# Create the output folder up front so saving cannot fail on a missing directory
out_dir.mkdir(parents=True, exist_ok=True)

for name in pdf_dir.glob("*.pdf"):
    content = Path(name).read_bytes()

    # The same bytes are opened twice: once with fitz (to read the drawings)
    # and once with the edspdf pipeline (to get the text boxes)
    fitz_doc = Document(stream=content)
    pdf_doc = pipeline(content)

    # Each new drawing color gets the next "label-<n>" name on first lookup
    colors = defaultdict(lambda: f"label-{len(colors)}")
    by_label = defaultdict(list)
    for page, fitz_page in zip(pdf_doc.pages, fitz_page_iter := fitz_doc.pages()) if False else zip(pdf_doc.pages, fitz_doc.pages()):
        w, h = page.width, page.height
        boxes = []
        for annot in list(fitz_page.get_drawings()):
            for item in annot['items']:
                # Only "c" items of drawings having a color are kept
                # (presumably curve items whose 1st/4th points are the ends — TODO confirm)
                if item[0] == "c" and annot['color']:
                    color = colors[annot['color']]
                    # Coordinates are divided by the page size; the box is
                    # padded by 1 unit above and below
                    x0, x1, y0, y1 = (
                        (item[1][0]) / w,
                        (item[4][0]) / w,
                        (item[1][1] - 1) / h,
                        (item[4][1] + 1) / h,
                    )
                    boxes.append(edspdf.Box(x0=x0, x1=x1, y0=y0, y1=y1, label=color, page_num=page.page_num))

        # Group consecutive labelled lines: a new cluster starts when the label
        # changes or when the vertical gap with the previous line exceeds 6 / page height
        clusters = []
        for line in edspdf.utils.alignment.align_box_labels(boxes, page.text_boxes):
            if line.label is None:
                continue
            if (
                not len(clusters)
                or clusters[-1][-1].label != line.label
                or abs(clusters[-1][-1].y1 - line.y0) > (6 / page.height)
            ):
                clusters.append([line])
            else:
                clusters[-1].append(line)

        for cluster in clusters:
            by_label[cluster[0].label].append(cluster)

    out_filename = str(out_dir / (name.stem + ".docx"))
    doc = docx.Document()
    doc.add_heading(name.stem, level=0)
    # label -> RGBColor, built from the color tuples seen in the drawings
    # (assumes float components in [0, 1] — TODO confirm)
    colors_by_label = {label: RGBColor(*(int(c * 255) for c in color)) for color, label in colors.items() if isinstance(color, tuple)}
    for label, clusters in by_label.items():
        # The first line of a label's first cluster is used as the section title
        title = clusters[0][0].text
        clusters = [clusters[0][1:], *clusters[1:]]
        heading = doc.add_heading(level=1)
        run = heading.add_run(title)
        run.font.color.rgb = colors_by_label[label]
        print("--------- " + title + " ----------")
        for cluster in clusters:
            paragraph = "\n".join(line.text for line in cluster) + "\n"
            # One sentence per line, with in-sentence line breaks flattened to spaces
            paragraph = "\n".join(sent.text.replace("\n", " ") for sent in nlp(paragraph).sents)
            doc.add_paragraph(paragraph)
            doc.add_paragraph()
            print(paragraph)
    doc.save(out_filename)

--------- NLP ----------
 
--------- Réglementation ----------
 
--------- Communication ----------
 
à la défi quelqu'un qui connaît bien pilote puis je veux dire comment je fais comment elle s'appelle elle était à la dst avant verra verra je vais appeler verra je vais lui mettre un sms sur son 06 salut verra c'est christophe j'ai telle question qu'est ce que je peux faire c'est pas c'est pas formellement probablement le bon circuit et après on fait marcher le réseau 
--------- Fonctionnalités ----------
 
J'avoue que s'il y avait des requêteurs plus faciles à utiliser, plus simples, probablement que je regarderais ces flux aussi sur d'autres sites, 
On ne peut malheureusement pas, en termes d'activité des urgences, tout construire dans des rapports préétablis, et on a besoin d'une grande réactivité pour aller voir les passages par rapport à un code diagnostique dans un RPU, les passages par rapport à une tranche d'âge, par rapport à un certain nombre d'horaires, d'horaires complexes,

In [51]:
# Create the output folder for the generated .docx files. Safe to re-run:
# missing parents are created and an existing folder is not an error.
from pathlib import Path
Path("/Users/perceval/Downloads/PDFs_annotés/PDF_extraction/").mkdir(parents=True, exist_ok=True)

In [14]:
# Display the label -> color mapping left by the extraction loop (last PDF processed).
# NOTE(review): the saved output shows 'rgb(r,g,b)' strings, whereas the
# extraction cell builds RGBColor objects — the output looks stale; confirm.
colors_by_label

{'label-0': 'rgb(255,161,55)',
 'label-1': 'rgb(0,255,255)',
 'label-2': 'rgb(255,183,230)',
 'label-3': 'rgb(228,183,255)',
 'label-4': 'rgb(255,255,0)',
 'label-5': 'rgb(0,255,0)',
 'label-6': 'rgb(91,103,255)',
 'label-7': 'rgb(126,0,212)',
 'label-8': 'rgb(0,132,164)',
 'label-9': 'rgb(205,255,78)'}