# 01 - Create dataset for NER

This notebook aims to create a dataset using NER annotation files (Brat Standoff) format to train a NER model.

In [17]:
import os

os.environ["CUDA_VISIBLE_DEVICES"] = "1" #Numéro GPU
print(os.environ["CUDA_VISIBLE_DEVICES"])

1


In [18]:
import os, sys
from pathlib import Path

BASE = Path(os.path.dirname(os.path.realpath("__file__"))).resolve() # If not on GColab, BASE will be the directory of this notebook
DATASETS = Path('/home/STual/DAN-cadastre/data').resolve()
OUT_BASE = Path('/home/STual/DAN-cadastre/outputs/NER_inference').resolve()

print(sys.path)
print(BASE)
print(DATASETS)
print(OUT_BASE)

['/usr/lib/python310.zip', '/usr/lib/python3.10', '/usr/lib/python3.10/lib-dynload', '', '/home/STual/DAN-cadastre/.venv_dan/lib/python3.10/site-packages']
/home/STual/DAN-cadastre/scripts/NER
/home/STual/DAN-cadastre/data
/home/STual/DAN-cadastre/outputs/NER_inference


### Parameters

In [20]:
MODEL_NAME = 'camembert_ner'
#camembert_ner
#pretrained_camembert_ner

In [21]:
import glob
import re
import json
import pandas as pd
import config
import os
from pathlib import Path

config.SPLIT_SEED = 42 # Random seed used in train/dev/test. Do not change it if you want to recreate the paper results.
config.DEBUG = False # If true, text versions of the spacy datasets will be saved along with the .spacy files.

## Load data

In [1]:
DATA = DATASETS / "NER/NER_dataset"

# Retrieve files (Brat Standoff annotations)
txts = sorted(glob.glob(f"{DATA}/*.txt"))
anns = sorted(glob.glob(f"{DATA}/*.ann"))

assert len(txts) == len(anns)

NameError: name 'DATASETS' is not defined

In [None]:
# Re-structure annotations
id_ = []
text_ocr = []
brat = []
for i in range(0,len(txts)):
    name = txts[i].replace(str(DATA) + "/","")
    name = name.replace('.txt',"")
    id_.append(name)
    with open(txts[i], 'r', encoding="utf-8") as file:
        txt = file.read()
        text_ocr.append(txt)
    with open(anns[i], 'r', encoding="utf-8") as file:
        ann = file.readlines()
        entities = []
        for elem in ann:
            ls1 = elem.split('\t')
            ls2 = ls1[1].split(' ')
            entities.append([ls1[0],ls2[0],ls1[2][:-1],int(ls2[1]),int(ls2[2])])
        entities.insert(0,txt)
        brat.append(entities)

In [37]:
gold_reference = pd.DataFrame(id_,columns=["id"])
gold_reference["text_ocr"] = text_ocr
gold_reference["brat"] = brat
gold_reference

Unnamed: 0,id,text_ocr,brat
0,001f99ca-b851-4b23-864d-563288dfba1f,Le Prince de neufchâtel→×Vauguyon maire±,"[Le Prince de neufchâtel→×Vauguyon maire±, [T0..."
1,0048e1d8-e348-4fa3-bc5c-10a8daaf1800,Queru P↑re↓ marie,"[Queru P↑re↓ marie, [T0, name, Queru, 0, 5], [..."
2,004ac7cc-9030-47b1-9136-bb5f8ea15ffa,Penain julien,"[Penain julien, [T0, name, Penain, 0, 6], [T1,..."
3,007458b4-e647-47c1-9a46-a0527566e668,Maignan françois→à marolles,"[Maignan françois→à marolles, [T0, name, Maign..."
4,0075e078-e13c-4be0-8f6d-9382087da197,Nul ×Bourdillat pierre gilles±,"[Nul ×Bourdillat pierre gilles±, [T0, name, Nu..."
...,...,...,...
2620,ff842312-b51f-4e7e-9e99-05ab7f2a52f8,Mérillon saturnin veuve,"[Mérillon saturnin veuve, [T0, name, Mérillon,..."
2621,ff91ad3f-fda0-46b6-b7e1-1811f425fde1,Raban philippe,"[Raban philippe, [T0, name, Raban, 0, 5], [T1,..."
2622,ffa866a3-a5ae-46b0-8f04-ca81f56fcc1d,"Defresne honoré frédéric dit Bizet, Vitry","[Defresne honoré frédéric dit Bizet, Vitry, [T..."
2623,ffc91c4a-816b-4874-9e29-935749159300,Montebello la Duchesse,"[Montebello la Duchesse, [T0, name, Montebello..."


In [38]:
# Filter the annotations (drop duplicates)
gold_reference_valid = gold_reference.drop_duplicates(subset=["text_ocr"])
tmp  = ["ETS"] * len(gold_reference_valid)
gold_reference_valid["book"] = tmp
gold_reference_valid = gold_reference_valid[["brat","book"]] # TODO for the final dataset (without duplicated owners) : replace book default value by register reference

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  gold_reference_valid["book"] = tmp


In [39]:
# Print an example
gold_reference.iloc[1512]

id                       8f80f546-9a27-4ec9-8db1-8bdb2eb25c29
text_ocr                          Guérin Louis→voiturier à id
brat        [Guérin Louis→voiturier à id, [T0, name, Guéri...
Name: 1512, dtype: object

## 1.1 Create train/val/test subsets

In [40]:
import numpy as np
from sklearn.model_selection import train_test_split

train_tmp_df, test_df = train_test_split(
    gold_reference_valid, test_size=0.2, random_state=config.SPLIT_SEED)

train_df, val_df = train_test_split(
    train_tmp_df, test_size=0.1, random_state=config.SPLIT_SEED)

In [41]:
len(train_df), len(val_df), len(test_df)

(1534, 171, 427)

In [43]:
train = train_df.to_numpy()
val = val_df.to_numpy()
test = test_df.to_numpy()

In [44]:
from util_io import save_dataset_io

RES = OUT_BASE / f"m2-experiment_1_prepared_dataset_ref_io_{MODEL_NAME}"
if not os.path.exists(RES):
    os.makedirs(RES)
NAMES = ["train","dev","test"]

datasets = [train, val, test]
save_dataset_io(RES, datasets, NAMES, suffix=len(train))

0
Dodun de Keroman, le comte
['I-name', 'I-name', 'I-title', 'I-title'] [[0, 5], [5, 16], [18, 20], [20, 26]]
1
Pillet Jean
['I-name', 'I-firstnames'] [[0, 6], [7, 11]]
2
thomas→Bonfils V↑e↑ Bourg à Brie
['I-name', 'I-firstnames', 'I-address', 'I-familystatus'] [[7, 14], [0, 6], [28, 32], [15, 19]]
3
Gachet à Paris
['I-name', 'I-address'] [[0, 6], [9, 14]]
4
Macréau J↑n↓ B↑te↓→militaire
['I-name', 'I-firstnames', 'I-firstnames', 'I-title'] [[0, 7], [8, 12], [12, 18], [19, 28]]
5
Veillard Augustin→Berger à Valenton
['I-name', 'I-firstnames', 'I-activity', 'I-address'] [[0, 8], [9, 17], [18, 24], [27, 35]]
6
Le Prince de Neufchâtel→×Vauguyon maire→de marolles±
['I-name', 'I-name', 'I-name', 'I-activity', 'I-activity'] [[0, 2], [2, 23], [25, 33], [34, 42], [42, 51]]
7
Livernet Etienne
['I-name', 'I-firstnames'] [[0, 8], [9, 16]]
8
Lecouteure pierre Marthe
['I-name', 'I-firstnames', 'I-firstnames'] [[0, 10], [11, 17], [17, 24]]
9
Nul ×Bourdillat pierre gilles±
['I-name', 'I-name', 'I-first

Map: 100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 1534/1534 [00:00<00:00, 26185.59 examples/s]


0
Longhé pierre
['I-name', 'I-firstnames'] [[0, 6], [7, 13]]
1
Vigoureux Gabriel→fab↑t↓ de bas à id
['I-name', 'I-firstnames', 'I-activity', 'I-activity', 'I-address'] [[0, 9], [10, 17], [18, 24], [24, 31], [34, 36]]
2
Paloque Jean
['I-name', 'I-firstnames'] [[0, 7], [8, 12]]
3
Duval prosper
['I-name', 'I-firstnames'] [[0, 5], [6, 13]]
4
Ramelert Côme
['I-name', 'I-firstnames'] [[0, 8], [9, 13]]
5
Duchène ❌↑an↓ Vig. Valenton
['I-name', 'I-firstnames', 'I-activity', 'I-address'] [[0, 7], [8, 13], [14, 18], [19, 27]]
6
Desnets
['I-name'] [[0, 7]]
7
↑Raguet↓Picaré→J↑n↓ L.↑is↓
['I-name', 'I-name', 'I-firstnames', 'I-firstnames'] [[1, 7], [8, 14], [15, 19], [19, 26]]
8
Samson pierre
['I-name', 'I-firstnames'] [[0, 6], [7, 13]]
9
Beuzeville henri
['I-name', 'I-firstnames'] [[0, 10], [11, 16]]
10
Defeux françois alphonse
['I-name', 'I-firstnames', 'I-firstnames'] [[0, 6], [7, 15], [15, 24]]
11
Caron
['I-name'] [[0, 5]]
12
Duchange Charles
['I-name', 'I-firstnames'] [[0, 8], [9, 16]]
13
houill

Map: 100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 171/171 [00:00<00:00, 18901.73 examples/s]


0
Chaumont Loc↑re↓
['I-name'] [[0, 8]]
1
Foucault Pierre Louis alphonse
['I-name', 'I-firstnames', 'I-firstnames'] [[0, 8], [9, 15], [15, 30]]
2
Mabilat simon
['I-name', 'I-firstnames'] [[0, 7], [8, 13]]
3
Leblanc Jean Joseph (les héritiers)→à id
['I-name', 'I-firstnames', 'I-firstnames', 'I-address', 'I-familystatus', 'I-familystatus'] [[0, 7], [8, 12], [12, 19], [38, 40], [21, 24], [24, 34]]
4
Savart françois nicolas
['I-name', 'I-firstnames', 'I-firstnames'] [[0, 6], [7, 15], [15, 23]]
5
Bérault michel
['I-name', 'I-firstnames'] [[0, 7], [8, 14]]
6
Le Prince de Neuchâtel→×Vauguyon, maire→de marolles±
['I-name', 'I-name', 'I-name', 'I-activity', 'I-activity'] [[0, 2], [2, 22], [24, 32], [34, 42], [42, 51]]
7
Dodun de Kéoman léonie, dame de→Bouteillier
['I-name', 'I-name', 'I-firstnames', 'I-title', 'I-title'] [[0, 5], [5, 15], [16, 22], [24, 28], [28, 43]]
8
Dubois Eugène J↑h↓ Napoléon à Vitry
['I-name', 'I-firstnames', 'I-firstnames', 'I-address'] [[0, 6], [7, 13], [13, 27], [30, 35

Map: 100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 427/427 [00:00<00:00, 20778.82 examples/s]
Saving the dataset (1/1 shards): 100%|████████████████████████████████████████████████████████████████████████████████████████████████████████| 1534/1534 [00:00<00:00, 338367.73 examples/s]
Saving the dataset (1/1 shards): 100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████| 171/171 [00:00<00:00, 46746.14 examples/s]
Saving the dataset (1/1 shards): 100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████| 427/427 [00:00<00:00, 119024.91 examples/s]


## 2. Dataset description

In [45]:
from collections import Counter

def entities_per_entries(df):
    entities = []
    for ix, row in df.iterrows():
        txt = row[0][0]
        ann = row[0][1:]
        ents = []
        for elem in ann:
            ents.append(elem[1])
        entities.append(ents)
    return entities

In [46]:
all_entities = entities_per_entries(gold_reference_valid)
train_entities = entities_per_entries(train_df)
val_entities = entities_per_entries(val_df)
test_entities = entities_per_entries(test_df)

  txt = row[0][0]
  ann = row[0][1:]


### 2.1 Patterns

In [47]:
def get_patterns_count(entities):
    # Convert each list to a tuple for counting
    list_as_tuples = [tuple(sublist) for sublist in entities]
    
    # Count occurrences of each tuple
    counts = Counter(list_as_tuples)
    
    # Sort by count in descending order
    sorted_counts = sorted(counts.items(), key=lambda x: x[1], reverse=True)
    
    # Print the results
    for unique_list, count in sorted_counts:
        print(f"{list(unique_list)}: {count}")

In [48]:
all_entities_pat = get_patterns_count(all_entities)

['name', 'firstnames']: 1057
['name']: 217
['name', 'firstnames', 'activity', 'address']: 134
['name', 'firstnames', 'address']: 128
['name', 'firstnames', 'familystatus']: 126
['name', 'familystatus']: 73
['name', 'firstnames', 'activity']: 46
['name', 'address']: 32
['name', 'firstnames', 'name', 'firstnames']: 31
['name', 'firstnames', 'address', 'familystatus']: 30
['name', 'activity']: 19
['name', 'firstnames', 'activity', 'address', 'familystatus']: 18
['name', 'firstnames', 'title']: 17
['name', 'name']: 16
['name', 'name', 'activity']: 13
['name', 'title']: 11
['name', 'firstnames', 'name']: 10
['name', 'firstnames', 'familystatus', 'name']: 10
['name', 'name', 'firstnames']: 9
['name', 'address', 'familystatus']: 9
['name', 'firstnames', 'familystatus', 'name', 'firstnames']: 7
['name', 'activity', 'address']: 6
['name', 'activity', 'name']: 6
['name', 'firstnames', 'familystatus', 'birthname']: 5
['name', 'firstnames', 'name', 'firstnames', 'familystatus']: 4
['name', 'family

In [49]:
all_entities_pat = get_patterns_count(train_entities)

['name', 'firstnames']: 745
['name']: 168
['name', 'firstnames', 'address']: 101
['name', 'firstnames', 'activity', 'address']: 95
['name', 'firstnames', 'familystatus']: 94
['name', 'familystatus']: 54
['name', 'firstnames', 'activity']: 31
['name', 'firstnames', 'name', 'firstnames']: 23
['name', 'firstnames', 'address', 'familystatus']: 21
['name', 'address']: 21
['name', 'activity']: 16
['name', 'firstnames', 'activity', 'address', 'familystatus']: 12
['name', 'firstnames', 'title']: 11
['name', 'name']: 10
['name', 'title']: 9
['name', 'name', 'activity']: 9
['name', 'firstnames', 'name']: 9
['name', 'name', 'firstnames']: 7
['name', 'firstnames', 'familystatus', 'name']: 7
['name', 'firstnames', 'familystatus', 'name', 'firstnames']: 6
['name', 'address', 'familystatus']: 5
['name', 'activity', 'address']: 5
['name', 'activity', 'name']: 5
['name', 'firstnames', 'familystatus', 'familystatus']: 4
['name', 'firstnames', 'familystatus', 'birthname']: 4
['name', 'activity', 'address

In [50]:
all_entities_pat = get_patterns_count(val_entities)

['name', 'firstnames']: 96
['name', 'firstnames', 'activity', 'address']: 14
['name']: 10
['name', 'firstnames', 'familystatus']: 9
['name', 'firstnames', 'address']: 8
['name', 'familystatus']: 5
['name', 'firstnames', 'activity', 'address', 'familystatus']: 3
['name', 'firstnames', 'name', 'firstnames']: 3
['name', 'firstnames', 'activity']: 3
['name', 'firstnames', 'title']: 2
['name', 'address']: 2
['name', 'name']: 2
['name', 'name', 'firstnames']: 1
['name', 'firstnames', 'address', 'familystatus', 'name', 'address']: 1
['name', 'firstnames', 'name']: 1
['name', 'firstnames', 'activity', 'familystatus']: 1
['name', 'address', 'name', 'firstnames']: 1
['name', 'title', 'title']: 1
['name', 'firstnames', 'name', 'firstnames', 'familystatus']: 1
['name', 'familystatus', 'name', 'address', 'familystatus']: 1
['name', 'activity', 'address']: 1
['name', 'firstnames', 'familystatus', 'birthname']: 1
['name', 'familystatus', 'name', 'title']: 1
['name', 'activity']: 1
['name', 'name', 'a

In [51]:
all_entities_pat = get_patterns_count(test_entities)

['name', 'firstnames']: 216
['name']: 39
['name', 'firstnames', 'activity', 'address']: 25
['name', 'firstnames', 'familystatus']: 23
['name', 'firstnames', 'address']: 19
['name', 'familystatus']: 14
['name', 'firstnames', 'activity']: 12
['name', 'address']: 9
['name', 'firstnames', 'address', 'familystatus']: 8
['name', 'firstnames', 'name', 'firstnames']: 5
['name', 'firstnames', 'title']: 4
['name', 'name']: 4
['name', 'address', 'familystatus']: 4
['name', 'name', 'activity']: 3
['name', 'firstnames', 'familystatus', 'name']: 3
['name', 'firstnames', 'activity', 'address', 'familystatus']: 3
['name', 'firstnames', 'name', 'address']: 2
['name', 'activity']: 2
['name', 'title']: 2
['name', 'firstnames', 'name', 'firstnames', 'activity', 'address']: 2
['name', 'firstnames', 'activity', 'familystatus']: 2
['name', 'title', 'familystatus']: 2
['name', 'firstnames', 'name', 'firstnames', 'familystatus']: 2
['name', 'firstnames', 'activity', 'address', 'name', 'firstnames', 'name', 'fi

### 2.2 Entities

In [52]:
def count_by_entities_types(entities):
    # Flatten the list of lists into a single list
    flattened_list = [item for sublist in entities for item in sublist]
    
    # Count occurrences of each value
    value_counts = Counter(flattened_list)
    
    # Sort by descending order of count
    sorted_value_counts = sorted(value_counts.items(), key=lambda x: x[1], reverse=True)
    
    # Print the results
    for value, count in sorted_value_counts:
        print(f"{value}: {count}")

In [53]:
count_by_entities_types(all_entities)

name: 2310
firstnames: 1763
address: 416
familystatus: 339
activity: 279
title: 48
birthname: 13


In [54]:
count_by_entities_types(train_entities)

name: 1661
firstnames: 1256
address: 294
familystatus: 246
activity: 197
title: 31
birthname: 8


In [55]:
count_by_entities_types(val_entities)

name: 184
firstnames: 150
address: 33
familystatus: 25
activity: 24
title: 5
birthname: 1


In [56]:
count_by_entities_types(test_entities)

name: 465
firstnames: 357
address: 89
familystatus: 68
activity: 58
title: 12
birthname: 4
