In [1]:
import os, sys
from pathlib import Path

ENV_IS_GOOGLE_COLAB = True if 'google.colab' in str(get_ipython()) else False
os.environ["ENV_IS_GOOGLE_COLAB"] = str(ENV_IS_GOOGLE_COLAB)

if ENV_IS_GOOGLE_COLAB:
  from google.colab import drive
  mountpoint = Path("/content/drive")
  drive.mount(str(mountpoint)) # Mount gdrive to BASE
  base = mountpoint / "MyDrive/SODUCO/article_das_2022" # Adapt this to your situation
  sys.path.append(str(base)) # Add BASE to Python Path
  BASE = Path(base).resolve() # Make BASE absolute
  DATASETS =  BASE / "dataset"
else:
  BASE = Path(os.path.dirname(os.path.realpath("__file__"))).resolve() # If not on GColab, BASE will be the directory of this notebook
  DATASETS = BASE.parent.parent / "dataset"

print(sys.path)
print(BASE)
print(DATASETS)

['/home/bertrand/dev/paper-ner-bench-das22/src/ner', '/home/bertrand/anaconda3/lib/python39.zip', '/home/bertrand/anaconda3/lib/python3.9', '/home/bertrand/anaconda3/lib/python3.9/lib-dynload', '', '/home/bertrand/anaconda3/lib/python3.9/site-packages', '/home/bertrand/anaconda3/lib/python3.9/site-packages/locket-0.2.1-py3.9.egg', '/home/bertrand/anaconda3/lib/python3.9/site-packages/IPython/extensions', '/home/bertrand/.ipython']
/home/bertrand/dev/paper-ner-bench-das22/src/ner
/home/bertrand/dev/paper-ner-bench-das22/dataset


In [2]:
import csv
from transformers import AutoTokenizer, AutoModelForTokenClassification

LIMIT = 100

tokenizer = AutoTokenizer.from_pretrained(BASE / "43-camembert_pretrained_finetuned_pero")
model = AutoModelForTokenClassification.from_pretrained(BASE / "43-camembert_pretrained_finetuned_pero")

def xmlize(text, annot):
    txt_l = list(text)
    annot_r = annot.copy()
    annot_r.reverse()
    for elem in annot_r:
        s_ix = elem["start"]
        e_ix = elem["end"]
        txt_l.insert(e_ix,f'</{elem["entity_group"]}>')
        if txt_l[s_ix] == ' ': # Patch entities starting with a whitespace
            s_ix += 1
        txt_l.insert(s_ix,f'<{elem["entity_group"]}>')
    return "".join(txt_l)


##### Process text sample (from wikipedia)

from transformers import pipeline

nlp = pipeline('ner', model=model, tokenizer=tokenizer, aggregation_strategy="simple", use_fast=True)

In [5]:
samples = """☞  T Dufant (Victor), libraire, r. du Gros-Che- net. 2. JO \n
Dutay, essayeur du commerce, place Dau-  - píině, 5. DOB \n
Dulay, chandronnier, r. du Pont- aux- Chuux, 15. SI \n
Dufay (V.e), grenetière, r. du Fauh.-S. Denis, 20. 372 \n
Y ☞ Dnten,charentier, 1. Montmartre, 89. 318 \n
Dufey fils, bijoutier, passage de la Réunion A \n
Dnley, boucher, r. S.&quot;Louis-au-Marais,3. * \n
☞ Duffand, bijonteer, r. S.-Martin, 107. 284 \n
Duftaud, maçon, r. S-Honore  353. O \n
Dullaut, chandronnier, r. de la Sourdière I- ( ☞T4 \n
lukils. limonauier, 1. des Colonnes, 6. J9 \n
— T Dullocu, architecte, r. Basse-Porte-S.De- — Ris, 28. \n
Duflot, loueur de carrosses, r. de Paradis- 505 Poissonnière, 22. \n
Dufort, bottier, Palais-R., gal. vitrée, 215. A \n
Dufort fils, bottier (exp. 1819.), r. J.-J.. Rousscan. 18. 290 \n
* Dufort, tabletier, r. Jean-Pain-Mollet, 10. 4O \n
Dufossé, teiniurier, r. du Faub.-Montmar- tre , 63. 483 \n
Duſour el Besnard, march, de bois à bruler, quai de la Tournelle, 17. etr. des Fossés- SBernard. 11. Dí \n
Dufour jeune, march, de bois à brüler de la Pépinière, 53. 7 r. \n
Dufour (Mad.), march. de chapeaux de paitle, passage dn Panorama, 11. IO \n
Dutour, thapelier, r. S.-Honore 383. 3og \n
Dufour, cirier, r. de PArbre-Sec, 6o. 316 \n
Dufour et co., commiss., r. des Contures-  DAO S. Dervajs, 18. \n
Dulour, corropeir, 1. des Vertusy1d 3 \n
Dufour (Charles), épicier, r. S.-Denis, Jo7. ☞ 332 \n
Dufour, fab. d&apos;éventails (erp. 1819) Beaubourg, 48.  360 \n
Dufour, faiencter, r. S.-Anne , 50. 302 \n
Dufour, A. Sarnt,r. Ventadour, ☞7 \n
Dufour (Gabriel), libraire, r. de Vaugirard, A \n
Dufour, maçon, r. S.-Jean-Baptiste, 4 402 \n
Dufour, architecte, r. de Chartres-S.-Ho- 2781 40a nore, 12. (Elig.) \n
Dufour (Mad.), nouveautes, 1. Neuve-des- H12 Petits-Champs, 4. \n
Dufour Chabrol, papetier, r. S.-Martin ☞ 4 \n
Dufour (J.) et co,, fab. de papiers peints, ((1819); r. Beauvean-S. Antoine, 19."""


'☞\u2029\u2029T\u2029Dufant (Victor), libraire, r. du Gros-Che-\u2029net. 2.\u2029JO \n\nDutay, essayeur du commerce, place Dau-\u2029\u2029-\u2029píině, 5.\u2029DOB \n\nDulay, chandronnier, r. du Pont- aux-\u2029Chuux, 15.\u2029SI \n\nDufay (V.e), grenetière, r. du Fauh.-S.\u2029Denis, 20.\u2029372 \n\nY\u2029☞\u2029Dnten,charentier, 1. Montmartre, 89. 318 \n\nDufey fils, bijoutier, passage de la Réunion\u2029A \n\nDnley, boucher, r. S.&quot;Louis-au-Marais,3.\u2029* \n\n☞\u2029Duffand, bijonteer, r. S.-Martin, 107. 284 \n\nDuftaud, maçon, r. S-Honore\u2029\u2029353.\u2029O \n\nDullaut, chandronnier, r. de la Sourdière\u2029I- (\u2029☞T4 \n\nlukils. limonauier, 1. des Colonnes, 6. J9 \n\n—\u2029T\u2029Dullocu, architecte, r. Basse-Porte-S.De-\u2029—\u2029Ris, 28. \n\nDuflot, loueur de carrosses, r. de Paradis-\u2029505\u2029Poissonnière, 22. \n\nDufort, bottier, Palais-R., gal. vitrée, 215.\u2029A \n\nDufort fils, bottier (exp. 1819.), r. J.-J..\u2029Rousscan. 18.\u2029290 \n\n*\u2029

In [8]:
for s in samples.split("\n"):
        print(xmlize(s, nlp(s)))

☞  T<PER> Dufant (Victor),</PER> <ACT>libraire</ACT>, <LOC>r. du Gros-Che- net</LOC>. <CARDINAL>2</CARDINAL>. JO 

<PER>Dutay</PER><ACT>, essayeur du commerce</ACT>, <LOC>place Dau-  - píin</LOC>ě, <CARDINAL>5</CARDINAL>. DOB 

<PER>Dulay</PER>, <ACT>chandronnier</ACT>, <LOC>r. du Pont- aux- Chuux</LOC>, <CARDINAL>15</CARDINAL>. SI 

<PER>Dufay (V.e),</PER> <ACT>grenetière</ACT>, <LOC>r. du Fauh.-S. Denis</LOC>, <CARDINAL>20</CARDINAL>. 372 

Y ☞<PER> Dnten</PER>,<ACT>charentier</ACT>, <LOC>1. Montmartre</LOC>, <CARDINAL>89</CARDINAL>. 318 

<PER>Dufey fils</PER><ACT>, bijoutier</ACT>, <LOC>passage de la Réunion</LOC><CARDINAL> A</CARDINAL> 

<PER>Dnley</PER>, <ACT>boucher</ACT>, <LOC>r. S.&quot;Louis-au-Marais</LOC>,<CARDINAL>3.</CARDINAL><TITRE> *</TITRE> 

☞<PER> Duffand</PER>, <ACT>bijonteer</ACT>, <LOC>r. S.-Martin</LOC>, <CARDINAL>107</CARDINAL>. 284 

<PER>Duftaud</PER><ACT>, maçon</ACT>, <LOC>r. S-Honore</LOC> <CARDINAL> 353.</CARDINAL> O 

<PER>Dullaut</PER>, <ACT>chandronnier