In [None]:
# Mount Google Drive at /content/drive so later cells can read and write files there.
# (Original Colab prompt: "code to connect to my drive".)

from google.colab import drive
drive.mount('/content/drive')


Mounted at /content/drive


In [None]:
pip install loguru

Collecting loguru
  Downloading loguru-0.7.3-py3-none-any.whl.metadata (22 kB)
Downloading loguru-0.7.3-py3-none-any.whl (61 kB)
[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/61.6 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m61.6/61.6 kB[0m [31m4.8 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: loguru
Successfully installed loguru-0.7.3


In [None]:
from loguru import logger
from os import environ
from pathlib import Path

In [None]:
# Project root on the mounted Google Drive.
# For a shared drive, point this at '/content/drive/Shareddrives/YOUR_SHARED_DRIVE_NAME' instead.
ROOT_DIR = '/content/drive/MyDrive'

# Working folders, all located directly under the root.
DATA_DIR = Path(ROOT_DIR) / "data_ner"           # datasets and .spacy corpora
CONFIG_DIR = Path(ROOT_DIR) / "config_ner"       # spaCy config files
OUTPUT_DIR = Path(ROOT_DIR) / "ner_output_proj"  # trained pipelines and evaluation files

# Create the folders on first run; existing folders are left untouched.
for directory in (DATA_DIR, CONFIG_DIR, OUTPUT_DIR):
    directory.mkdir(parents=True, exist_ok=True)

# Export the locations so the shell cells further down can reference them.
environ.update(
    {
        "SPACY_BINS_DIR": str(CONFIG_DIR),
        "SPACY_DATA_DIR": str(DATA_DIR),
        "SPACY_OUTPUT_DIR": str(OUTPUT_DIR),
    }
)

logger.info(f"\nRoot directory: {ROOT_DIR} \nData directory: {DATA_DIR} \nConfig directory: {CONFIG_DIR} \nOutput directory: {OUTPUT_DIR}")

[32m2025-04-21 17:48:26.657[0m | [1mINFO    [0m | [36m__main__[0m:[36m<cell line: 0>[0m:[36m16[0m - [1m
Root directory: /content/drive/MyDrive 
Data directory: /content/drive/MyDrive/data_ner 
Config directory: /content/drive/MyDrive/config_ner 
Output directory: /content/drive/MyDrive/ner_output_proj[0m


# Prétraitement des données NER depuis un fichier CSV :
# - Le fichier contient une colonne 'Sentence #' indiquant à quelle phrase appartient chaque mot.
# - Certaines lignes n'ont pas cette info car une phrase est indiquée une seule fois, puis les mots suivent sur plusieurs lignes.
# - On utilise `fillna(method='ffill')` pour propager l'identifiant de la phrase à toutes ses lignes.
# - Ensuite, on regroupe les lignes par phrase ('Sentence #'), pour reconstruire chaque phrase sous forme de :
#     - une liste de tokens (mots)
#     - une liste de tags NER correspondants
# - Le résultat est une liste de tuples (tokens, tags), format standard pour l'entraînement des modèles NER avec SpaCy, Flair ou Transformers.


In [None]:
import pandas as pd

# Load the token-level NER dataset (one row per word).
# The path is derived from DATA_DIR rather than repeated as an absolute string,
# so changing ROOT_DIR in the configuration cell also moves the dataset lookup.
base_ner = pd.read_csv(DATA_DIR / "NER_dataset.csv", encoding='utf-8')

# Preview the first rows to check the column layout.
base_ner.head(40)

Unnamed: 0,Sentence #,Word,POS,Tag
0,Sentence: 1,Thousands,NNS,O
1,,of,IN,O
2,,demonstrators,NNS,O
3,,have,VBP,O
4,,marched,VBN,O
5,,through,IN,O
6,,London,NNP,B-geo
7,,to,TO,O
8,,protest,VB,O
9,,the,DT,O


In [None]:
# Propagate each 'Sentence #' value down onto the following NaN rows, so every
# word row carries the identifier of the sentence it belongs to.
# `Series.ffill()` replaces the deprecated `fillna(method='ffill')` spelling,
# which emits a FutureWarning on recent pandas versions.
base_ner['Sentence #'] = base_ner['Sentence #'].ffill()

base_ner.head(40)

  base_ner['Sentence #'] = base_ner['Sentence #'].fillna(method='ffill')


Unnamed: 0,Sentence #,Word,POS,Tag
0,Sentence: 1,Thousands,NNS,O
1,Sentence: 1,of,IN,O
2,Sentence: 1,demonstrators,NNS,O
3,Sentence: 1,have,VBP,O
4,Sentence: 1,marched,VBN,O
5,Sentence: 1,through,IN,O
6,Sentence: 1,London,NNP,B-geo
7,Sentence: 1,to,TO,O
8,Sentence: 1,protest,VB,O
9,Sentence: 1,the,DT,O


In [None]:

# Rebuild one (tokens, tags) pair per sentence.
# sort=False keeps the sentences in the order they appear in the file; the
# default sorts the string keys lexicographically ("Sentence: 1",
# "Sentence: 10", "Sentence: 100", ...), which scrambles the corpus order.
# Row order inside each group is preserved either way.
# Note: the sentence order changes, so the seeded train/valid/test split below
# selects different sentences than it did with the sorted grouping.
sentences = [
    (group["Word"].tolist(), group["Tag"].tolist())
    for _, group in base_ner.groupby("Sentence #", sort=False)
]

sentences[0:4]

[(['Thousands',
   'of',
   'demonstrators',
   'have',
   'marched',
   'through',
   'London',
   'to',
   'protest',
   'the',
   'war',
   'in',
   'Iraq',
   'and',
   'demand',
   'the',
   'withdrawal',
   'of',
   'British',
   'troops',
   'from',
   'that',
   'country',
   '.'],
  ['O',
   'O',
   'O',
   'O',
   'O',
   'O',
   'B-geo',
   'O',
   'O',
   'O',
   'O',
   'O',
   'B-geo',
   'O',
   'O',
   'O',
   'O',
   'O',
   'B-gpe',
   'O',
   'O',
   'O',
   'O',
   'O']),
 (['Iranian',
   'officials',
   'say',
   'they',
   'expect',
   'to',
   'get',
   'access',
   'to',
   'sealed',
   'sensitive',
   'parts',
   'of',
   'the',
   'plant',
   'Wednesday',
   ',',
   'after',
   'an',
   'IAEA',
   'surveillance',
   'system',
   'begins',
   'functioning',
   '.'],
  ['B-gpe',
   'O',
   'O',
   'O',
   'O',
   'O',
   'O',
   'O',
   'O',
   'O',
   'O',
   'O',
   'O',
   'O',
   'O',
   'B-tim',
   'O',
   'O',
   'O',
   'B-org',
   'O',
   'O',
   'O',
  

# Modèle NER avec spaCy

In [None]:
import spacy
from spacy.tokens import DocBin
from pathlib import Path
from tqdm import tqdm
from sklearn.model_selection import train_test_split

# `sentences` is expected to hold the whole corpus as [(tokens, tags), ...].
# Two-stage split with a fixed seed: 70% train, then the remaining 30% is
# halved into 15% validation and 15% test.
sentences_train, temp_sentences = train_test_split(sentences, test_size=0.3, random_state=42)
sentences_valid, sentences_test = train_test_split(temp_sentences, test_size=0.5, random_state=42)

# Unique tags found in the corpus (including 'O' for "outside").
# Sorted so the list order is the same on every run; iterating a plain set of
# strings can yield a different order from one kernel session to the next.
# key=str keeps the sort safe if a non-string value (e.g. NaN) is present.
label_list = sorted({tag for _, tags in sentences for tag in tags}, key=str)

def _bio_to_spans(tags):
    """Collapse a BIO tag sequence into (start, end, label) token-index triples.

    `end` is exclusive. A "B-" tag opens a new entity (closing any entity that
    is still open). An "I-" tag extends the open entity, whatever its own
    label suffix is. Any other tag, including an "I-" tag with no open entity,
    closes the current entity.
    """
    spans = []
    start = None
    label = None

    for index, tag in enumerate(tags):
        if tag.startswith("B-"):
            if start is not None:
                spans.append((start, index, label))
            start = index
            label = tag[2:]
        elif tag.startswith("I-") and label is not None:
            continue  # still inside the open entity
        else:
            if start is not None:
                spans.append((start, index, label))
            start = None
            label = None

    # Entity still open at the end of the sentence.
    if start is not None:
        spans.append((start, len(tags), label))

    return spans


def create_spacy_docs(sentences, label_names=None):
    """Convert (tokens, tags) sentences into a spaCy DocBin with entity spans.

    Args:
        sentences: iterable of (tokens, tags) pairs, where tags are BIO labels
            such as "B-geo", "I-geo" or "O".
        label_names: unused; kept so existing calls that pass the label list
            keep working.

    Returns:
        A DocBin holding one Doc per sentence, with `doc.ents` set from the tags.
    """
    nlp = spacy.blank("en")  # blank pipeline, used only for its vocab
    doc_bin = DocBin()

    for tokens, tags in tqdm(sentences):
        # Filter tokens and tags as pairs. Dropping a missing token without
        # dropping its tag would shift every following tag onto the wrong word.
        # Non-string tags (e.g. NaN) are treated as 'O'.
        aligned = [
            (token, tag if isinstance(tag, str) else 'O')
            for token, tag in zip(tokens, tags)
            if str(token) != 'nan'
        ]
        words = [token for token, _ in aligned]
        clean_tags = [tag for _, tag in aligned]

        doc = spacy.tokens.Doc(nlp.vocab, words=words)
        # Spans are built from token indices, so no character-offset
        # arithmetic is needed to line entities up with tokens.
        doc.ents = [
            spacy.tokens.Span(doc, start, end, label=label)
            for start, end, label in _bio_to_spans(clean_tags)
        ]
        doc_bin.add(doc)

    return doc_bin




In [None]:
# Inputs expected from earlier cells: sentences_train, sentences_valid and
# sentences_test as lists of (tokens, tags) tuples, plus label_list.

# Build one DocBin per split.
train_db, valid_db, test_db = (
    create_spacy_docs(split_sentences, label_list)
    for split_sentences in (sentences_train, sentences_valid, sentences_test)
)

# Write each DocBin as a .spacy file in the data directory.
for filename, doc_bin in (
    ("train.spacy", train_db),
    ("valid.spacy", valid_db),
    ("test.spacy", test_db),
):
    doc_bin.to_disk(DATA_DIR / filename)

100%|██████████| 33571/33571 [00:08<00:00, 3799.55it/s]
100%|██████████| 7194/7194 [00:01<00:00, 4401.04it/s]
100%|██████████| 7194/7194 [00:01<00:00, 4120.54it/s]


# Entraînement du modèle NER avec spaCy

In [None]:
# Download and install the en_core_web_lg pipeline package.
# NOTE(review): presumably needed for the word vectors referenced by the generated
# training config — confirm against base_config.cfg.
!python -m spacy download en_core_web_lg

Collecting en-core-web-lg==3.8.0
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_lg-3.8.0/en_core_web_lg-3.8.0-py3-none-any.whl (400.7 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m400.7/400.7 MB[0m [31m2.1 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: en-core-web-lg
Successfully installed en-core-web-lg-3.8.0
[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_lg')
[38;5;3m⚠ Restart to reload dependencies[0m
If you are in a Jupyter or Colab notebook, you may need to restart Python in
order to load all the package's dependencies. You can do this by selecting the
'Restart kernel' or 'Restart runtime' option.


In [None]:
# Generate a base training config (English, NER-only pipeline, optimized for accuracy)
# and save it as base_config.cfg in CONFIG_DIR ({CONFIG_DIR} is interpolated from Python).
!python -m spacy init config {CONFIG_DIR}/base_config.cfg --lang en --pipeline ner --optimize accuracy


[38;5;3m⚠ To generate a more effective transformer-based config (GPU-only),
install the spacy-transformers package and re-run this command. The config
generated now does not use transformers.[0m
[38;5;4mℹ Generated config template specific for your use case[0m
- Language: en
- Pipeline: ner
- Optimize for: accuracy
- Hardware: CPU
- Transformer: None
[38;5;2m✔ Auto-filled config with all values[0m
[38;5;2m✔ Saved config[0m
/content/drive/MyDrive/config_ner/base_config.cfg
You can now add your data and train your pipeline:
python -m spacy train base_config.cfg --paths.train ./train.spacy --paths.dev ./dev.spacy


In [None]:
# Fill any missing settings in base_config.cfg and save the result as comp_config.cfg;
# both files live in the directory exported as SPACY_BINS_DIR.
!python -m spacy init fill-config ${SPACY_BINS_DIR}/base_config.cfg ${SPACY_BINS_DIR}/comp_config.cfg

[38;5;3m⚠ Nothing to auto-fill: base config is already complete[0m
[38;5;2m✔ Saved config[0m
/content/drive/MyDrive/config_ner/comp_config.cfg
You can now add your data and train your pipeline:
python -m spacy train comp_config.cfg --paths.train ./train.spacy --paths.dev ./dev.spacy


In [None]:
# Train the pipeline from comp_config.cfg on train.spacy, using valid.spacy as the
# dev set, and write the trained pipeline to SPACY_OUTPUT_DIR.
!python -m spacy train $SPACY_BINS_DIR/comp_config.cfg \
--paths.train $SPACY_DATA_DIR/train.spacy --paths.dev $SPACY_DATA_DIR/valid.spacy \
--output $SPACY_OUTPUT_DIR

[38;5;4mℹ Saving to output directory:
/content/drive/MyDrive/ner_output_proj[0m
[38;5;4mℹ Using CPU[0m
[1m
[38;5;2m✔ Initialized pipeline[0m
[1m
[38;5;4mℹ Pipeline: ['tok2vec', 'ner'][0m
[38;5;4mℹ Initial learn rate: 0.001[0m
E    #       LOSS TOK2VEC  LOSS NER  ENTS_F  ENTS_P  ENTS_R  SCORE 
---  ------  ------------  --------  ------  ------  ------  ------
  0       0          0.00     49.38    0.14    4.29    0.07    0.00
  0     200         57.67   2539.65   61.50   63.45   59.66    0.61
  0     400         79.43   1955.08   68.66   71.57   65.98    0.69
  0     600         57.29   1977.49   72.18   76.03   68.71    0.72
  0     800         64.24   2028.09   78.75   81.15   76.48    0.79
  0    1000         80.90   2389.32   79.40   78.95   79.85    0.79
  0    1200         99.67   2753.55   80.29   83.96   76.94    0.80
  0    1400        127.80   3274.11   80.95   82.39   79.56    0.81
  0    1600        135.22   3635.11   81.42   82.21   80.64    0.81
  0    1800   

# Optimisation du modèle NER avec spaCy

In [None]:
# Same en_core_web_lg download as in the training section above.
# NOTE(review): likely redundant when the package is already installed in this
# runtime — confirm whether this cell is still needed.
!python -m spacy download en_core_web_lg

In [None]:
# Train again, this time from comp_config_opt.cfg, with the same train/dev corpora.
# NOTE(review): no cell in this notebook creates comp_config_opt.cfg — presumably a
# hand-edited copy of comp_config.cfg placed in SPACY_BINS_DIR; confirm it exists.
# NOTE(review): --output is the same SPACY_OUTPUT_DIR as the previous training run,
# so this run presumably replaces that run's saved pipeline — confirm this is intended.
!python -m spacy train $SPACY_BINS_DIR/comp_config_opt.cfg --output ${SPACY_OUTPUT_DIR} --paths.train ${SPACY_DATA_DIR}/train.spacy --paths.dev ${SPACY_DATA_DIR}/valid.spacy


[38;5;4mℹ Saving to output directory: /content/drive/MyDrive/ner_output[0m
[38;5;4mℹ Using CPU[0m
[1m
[38;5;2m✔ Initialized pipeline[0m
[1m
[38;5;4mℹ Pipeline: ['tok2vec', 'ner'][0m
[38;5;4mℹ Initial learn rate: 0.001[0m
E    #       LOSS TOK2VEC  LOSS NER  ENTS_F  ENTS_P  ENTS_R  SCORE 
---  ------  ------------  --------  ------  ------  ------  ------
  0       0          0.00     13.24    5.35    2.96   27.61    0.10
  0     200        210.59    964.23   38.89   40.13   37.73    0.39
  0     400         39.62    625.01   55.48   57.26   53.80    0.56
  0     600         72.77    716.56   63.53   64.24   62.84    0.64
  0     800         62.82    861.35   70.52   70.56   70.49    0.71
  0    1000         79.70   1076.69   72.03   72.59   71.48    0.72
  0    1200        148.74   1128.62   73.38   73.02   73.75    0.73
  0    1400        110.45   1285.08   76.16   76.52   75.81    0.76
  0    1600        155.45   1586.71   77.05   78.28   75.87    0.77
  0    1800        

In [None]:
%%bash

# Evaluate the best checkpoint on the train, validation and test splits.
# Every path expansion is double-quoted so that a directory containing spaces
# (for example a shared-drive name) reaches spaCy as a single argument.
START=$(date '+%Y-%m-%d %H:%M:%S')
echo "Start time: $START"

# Print a banner, score model-best on one split, and save the metrics as JSON.
#   $1: banner label (e.g. "Training")
#   $2: split name, i.e. the basename of the .spacy file (train | valid | test)
evaluate_split() {
    printf "\n\n***************************************** %s performances\n" "$1"
    python -m spacy evaluate "${SPACY_OUTPUT_DIR}/model-best" "${SPACY_DATA_DIR}/${2}.spacy" --output "${SPACY_OUTPUT_DIR}/${2}_evaluation.json"
}

evaluate_split "Training" train
evaluate_split "Validation" valid
evaluate_split "Test" test

END=$(date '+%Y-%m-%d %H:%M:%S')
echo "End time: $END"

Start time: 2025-04-21 19:52:15


***************************************** Training performances
[38;5;4mℹ Using CPU[0m
[1m

TOK     -    
NER P   89.95
NER R   88.63
NER F   89.28
SPEED   2654 

[1m

          P       R       F
per   82.80   84.88   83.83
gpe   96.54   95.30   95.91
geo   90.71   92.25   91.47
org   86.99   78.98   82.79
tim   93.41   91.15   92.26
art   81.76   43.53   56.81
nat   36.53   72.79   48.65
eve   83.33   35.55   49.83

[38;5;2m✔ Saved results to
/content/drive/MyDrive/ner_output_proj/train_evaluation.json[0m


***************************************** Validation performances
[38;5;4mℹ Using CPU[0m
[1m

TOK     -    
NER P   85.66
NER R   84.13
NER F   84.89
SPEED   2718 

[1m

          P       R       F
tim   89.81   87.28   88.53
gpe   95.04   94.26   94.64
geo   87.39   89.30   88.34
org   78.85   69.22   73.72
per   77.23   80.27   78.72
art   36.84   10.94   16.87
nat   29.03   48.65   36.36
eve   61.90   24.07   34.67

[38;5;2m✔ Saved re

# Prédiction

In [None]:
# Load the best checkpoint written by training into SPACY_OUTPUT_DIR.
best_model_path = Path(environ["SPACY_OUTPUT_DIR"]) / "model-best"
custom_nlp_ner = spacy.load(best_model_path)

# Run the pipeline on a sample sentence.
text_test = "Ousmane Sonko visited Dakar in 2020."
doc_test = custom_nlp_ner(text_test)

# Print each detected entity with its label.
for ent in doc_test.ents:
    print(ent.text, ent.label_)

Ousmane Sonko per
Dakar geo
2020 tim


In [None]:
# Render the entity highlights inline in the notebook output.
# displacy.serve starts a web server and blocks the cell until it is
# interrupted, and the served page is not shown inside the notebook;
# displacy.render with jupyter=True displays the markup directly.
spacy.displacy.render(doc_test, style="ent", jupyter=True)




Using the 'ent' visualizer
Serving on http://0.0.0.0:5000 ...

Shutting down server on port 5000.
