In [13]:

import re

def preprocess_contract(text):
    # Supprimer les sections de la table des matières (y compris les points et numéros)
    text = re.sub(r"(Article\s+\d+.*?\.{3,}\s+\d+\n?)", "", text, flags=re.DOTALL)

    # Supprimer les lignes contenant seulement des points 
    text = re.sub(r"\.{3,}", "", text)  # Supprime les lignes contenant des points répétés
    text = re.sub(r"Articl e \d+.*", "", text)  # Supprime les erreurs de format type "Articl e"

    # Supprimer les symboles comme "•", "✓"
    text = re.sub(r"[•✓✔]", "", text)

    # Supprimer explicitement les caractères inconnus affichés comme "?" dans un carré
    text = re.sub(r"[\ufffd\u3000]", "", text)  # Supprime les caractères problématiques

    # Supprimer les caractères non imprimables 
    text = re.sub(r"[^\w\s,.!?;:()\[\]{}'\"-éèêàùçÉÈÊÀÙÇ]", "", text, flags=re.UNICODE)

    # Normaliser les espaces et les lignes vides
    text = re.sub(r"[ ]{2,}", " ", text)  
    text = re.sub(r"\n{3,}", "\n\n", text)  

    return text


#input_path = r"C:\Users\hp\Downloads\clever contact\clevercontracts/Contracts/2MAINS/CCAP_2MAINS_BPM031594_vF.txt"
#output_path = r"C:\Users\hp\Downloads\clever contact\clevercontracts/Contracts/2MAINS/cleaned_contract_fixed_nchalla.txt"
#input_path = r"C:\Users\hp\Downloads\clever contact\clevercontracts/Contracts/2MAINS/CCTP_2MAINS_BPM031594_vF.txt"
#output_path = r"C:\Users\hp\Downloads\clever contact\cleanedcontract2.txt"
#input_path = r"C:\Users\hp\Downloads\clever contact\clevercontracts\Contracts\DATAVAL\REGLEMENT DE LA CONSULTATION.txt"
#output_path =  r"C:\Users\hp\Downloads\clever contact\cleanedcontract3.txt"
input_path = r"C:\Users\hp\Downloads\clever contact\clevercontracts\Contracts\DATAVAL\CCAP MARCHE DATAVAL.txt"
output_path =  r"C:\Users\hp\Downloads\clever contact\cleanedcontract4.txt"




with open(input_path, "r", encoding="utf-8") as file:
    original_text = file.read()


cleaned_text = preprocess_contract(original_text)

with open(output_path, "w", encoding="utf-8") as file:
    file.write(cleaned_text)

print(f"Le contrat nettoyé a été sauvegardé dans : {output_path}")


Le contrat nettoyé a été sauvegardé dans : C:\Users\hp\Downloads\clever contact\cleanedcontract4.txt


In [14]:

#input_path = r"C:\Users\hp\Downloads\clever contact\clevercontracts/Contracts/2MAINS/cleaned_contract_fixed_nchalla.txt"
#input_path =  r"C:\Users\hp\Downloads\clever contact\cleanedcontract2.txt"
#input_path =  r"C:\Users\hp\Downloads\clever contact\cleanedcontract3.txt"
input_path = r"C:\Users\hp\Downloads\clever contact\cleanedcontract4.txt"




from transformers import CamembertTokenizerFast, CamembertForTokenClassification, pipeline
import spacy


model_checkpoint = "Jean-Baptiste/camembert-ner"
tokenizer = CamembertTokenizerFast.from_pretrained(model_checkpoint)
model = CamembertForTokenClassification.from_pretrained(model_checkpoint)
ner_pipeline = pipeline("ner", model=model, tokenizer=tokenizer, framework="pt")
nlp = spacy.load("fr_core_news_sm")  # model pour la segmentation des phrase

def split_text_into_sentence_chunks(text, sentences_per_chunk=2):
    
    doc = nlp(text)  
    sentences = [sent.text for sent in doc.sents]  # Extraction des phrases 

    # sentences_per_chunk
    chunks = []
    for i in range(0, len(sentences), sentences_per_chunk):
        chunk = " ".join(sentences[i:i + sentences_per_chunk])
        chunks.append(chunk)
    return chunks
def group_entities(entities, allowed_types={"B-ORG", "I-ORG", "B-LOC", "I-LOC"}):
    
    grouped_entities = []
    current_entity = None

    for entity in entities:
        entity_type = entity["entity"]
        word = entity["word"].replace("▁", "")  
        start = entity.get("start", 0)
        end = entity.get("end", 0)

        
        if not any(allowed in entity_type for allowed in allowed_types):
            continue

         # gerer la cotinuation du meme type d'entites
        if current_entity and (entity_type.startswith("I-") or entity_type == current_entity["entity"]):
            current_entity["text"] += " " + word
            current_entity["end"] = end
        else:
            if current_entity:
                grouped_entities.append(current_entity)
            current_entity = {"entity": entity_type, "text": word, "start": start, "end": end}

    if current_entity:
        grouped_entities.append(current_entity)

    return grouped_entities


with open(input_path, "r", encoding="utf-8") as file:
    cleaned_text = file.read()

sentence_chunks = split_text_into_sentence_chunks(cleaned_text, sentences_per_chunk=2)


sample_chunks = sentence_chunks[16:30]

#  Application de NER sur les chunks
for i, chunk in enumerate(sample_chunks):
    print(f"\nChunk {i + 1}:\n{chunk}\n")
    entities = ner_pipeline(chunk)
    grouped_entities = group_entities(entities)
    for entity in grouped_entities:
        print(f"Entity: {entity['text']} | Type: {entity['entity']} | Start: {entity['start']} | End: {entity['end']}")


Device set to use cpu



Chunk 1:
, Réversibilité et Transférabilité 16

CCAP BPM060225_DATAVAL BPM060225
3/48 2.14Prix de laccord-cadre et des marches subséquents
16
2.14.1

Entity: et | Type: I-LOC | Start: 15 | End: 18

Chunk 2:
Forme et contenu des prix de la partie à bons de commande de laccord-cadre 16
2.14.2 Révision des prix de la partie à bons de commande de laccord-cadre 16
2.14.3


Chunk 3:
Etablissement des prix plafonds de laccord-cadre pour les marchés subséquents 18
2.14.4 Révision des prix plafonds de laccord-cadre 18
3.1


Chunk 4:
Emission et exécution des bons de commande 18
3.2 Délai dexécution des prestations 20
4.1

Entity: E | Type: I-LOC | Start: 0 | End: 1

Chunk 5:
Les pièces constitutives des marchés subséquents 20
4.2 Forme des marchés subséquents 20
4.3 Modalités de passation des marchés subséquents 20
4.3.1

Entity: Les | Type: I-LOC | Start: 0 | End: 3

Chunk 6:
Modalités de passation 20
4.3.2 Obligation de réponse 21
4.4 Durée des marchés subséquents 21
4.5 Délai dexécution des

to save in a json file delete the previous if all okay

In [15]:
import spacy
import json
from transformers import CamembertTokenizerFast, CamembertForTokenClassification, pipeline
model_checkpoint = "Jean-Baptiste/camembert-ner"
tokenizer = CamembertTokenizerFast.from_pretrained(model_checkpoint)
model = CamembertForTokenClassification.from_pretrained(model_checkpoint)
ner_pipeline = pipeline("ner", model=model, tokenizer=tokenizer, framework="pt")
nlp = spacy.load("fr_core_news_lg")  # model pour la segmentation des phrase
#input_path = r"C:\Users\hp\Downloads\clever contact\clevercontracts/Contracts/2MAINS/cleaned_contract_fixed_nchalla.txt"
#input_path = r"C:\Users\hp\Downloads\clever contact\cleanedcontract2.txt"
#input_path = r"C:\Users\hp\Downloads\clever contact\cleanedcontract3.txt"
input_path = r"C:\Users\hp\Downloads\clever contact\cleanedcontract4.txt"

def split_text_into_sentence_chunks(text, sentences_per_chunk=2):
    doc = nlp(text)  
    sentences = [sent.text for sent in doc.sents]  # Extract sentences

    # Group sentences into chunks
    chunks = []
    for i in range(0, len(sentences), sentences_per_chunk):
        chunk = " ".join(sentences[i:i + sentences_per_chunk])
        chunks.append(chunk)
    
    return chunks
def group_entities(entities, allowed_types={"B-ORG", "I-ORG", "B-LOC", "I-LOC"}):
    
    grouped_entities = []
    current_entity = None

    for entity in entities:
        entity_type = entity["entity"]
        word = entity["word"].replace("▁", "")  
        start = entity.get("start", 0)
        end = entity.get("end", 0)

        
        if not any(allowed in entity_type for allowed in allowed_types):
            continue

         # gerer la cotinuation du meme type d'entites
        if current_entity and (entity_type.startswith("I-") or entity_type == current_entity["entity"]):
            current_entity["text"] += " " + word
            current_entity["end"] = end
        else:
            if current_entity:
                grouped_entities.append(current_entity)
            current_entity = {"entity": entity_type, "text": word, "start": start, "end": end}

    if current_entity:
        grouped_entities.append(current_entity)

    return grouped_entities
with open(input_path, "r", encoding="utf-8") as file:
    cleaned_text = file.read()

sentence_chunks = split_text_into_sentence_chunks(cleaned_text, sentences_per_chunk=2)
sample_chunks = sentence_chunks[16:50]

extracted_entities = [] 
#  Application de NER sur les chunks
for i, chunk in enumerate(sample_chunks):
    print(f"\nChunk {i + 1}:\n{chunk}\n")
    entities = ner_pipeline(chunk)
    grouped_entities = group_entities(entities)
    for entity in grouped_entities:
          extracted_entities.append({
            "text": entity['text'],
            "type": entity['entity'],
            "start": entity['start'],
            "end": entity['end'],
            "chunk_id": i + 1  # Store which chunk it came from
        })
# Save to a JSON file
with open("extracted_entities_jeanbaptiste4.json", "w", encoding="utf-8") as f:
    json.dump(extracted_entities, f, indent=4, ensure_ascii=False)

print(" Entities saved successfully!")

Device set to use cpu



Chunk 1:
Obligation de respect déontologique 15
2.13.8 État de lart 15
2.13.9 Initialisation


Chunk 2:
, Réversibilité et Transférabilité 16

CCAP BPM060225_DATAVAL BPM060225
3/48


Chunk 3:
2.14Prix de laccord -


Chunk 4:
cadre et des marches subséquents
16
2.14.1 Forme et contenu des prix de la partie à bons de commande de laccord


Chunk 5:
-cadre 16
2.14.2 Révision des prix de la partie à bons de commande de laccord


Chunk 6:
- cadre 16
2.14.3


Chunk 7:
Etablissement des prix plafonds de laccord -


Chunk 8:
cadre pour les marchés subséquents 18
2.14.4 Révision des prix plafonds de laccord


Chunk 9:
-cadre 18
3.1 Emission et exécution des bons de commande 18
3.2


Chunk 10:
Délai dexécution des prestations 20
4.1 Les pièces constitutives des marchés subséquents 20
4.2


Chunk 11:
Forme des marchés subséquents 20
4.3 Modalités de passation des marchés subséquents 20
4.3.1


Chunk 12:
Modalités de passation 20
4.3.2 Obligation de réponse


Chunk 13:
21
4.4 Durée des marchés sub

In [16]:
!python -m spacy download fr_core_news_md


Collecting fr-core-news-md==3.8.0
  Downloading https://github.com/explosion/spacy-models/releases/download/fr_core_news_md-3.8.0/fr_core_news_md-3.8.0-py3-none-any.whl (45.8 MB)
     ---------------------------------------- 0.0/45.8 MB ? eta -:--:--
     ---------- ---------------------------- 11.8/45.8 MB 61.7 MB/s eta 0:00:01
     ------------------- ------------------- 23.1/45.8 MB 56.2 MB/s eta 0:00:01
     --------------------------------- ----- 39.6/45.8 MB 62.9 MB/s eta 0:00:01
     --------------------------------------  45.6/45.8 MB 64.5 MB/s eta 0:00:01
     --------------------------------------- 45.8/45.8 MB 53.1 MB/s eta 0:00:00
[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('fr_core_news_md')



[notice] A new release of pip is available: 24.3.1 -> 25.0.1
[notice] To update, run: python.exe -m pip install --upgrade pip


trial of improvement of jean-Baptiste/camembert-ner

In [17]:
import spacy
from transformers import CamembertTokenizerFast, CamembertForTokenClassification, pipeline
model_checkpoint = "Jean-Baptiste/camembert-ner"
tokenizer = CamembertTokenizerFast.from_pretrained(model_checkpoint)
model = CamembertForTokenClassification.from_pretrained(model_checkpoint)
ner_pipeline = pipeline("ner", model=model, tokenizer=tokenizer, framework="pt")
nlp = spacy.load("fr_core_news_md")  
#input_path = r"C:\Users\hp\Downloads\clever contact\clevercontracts/Contracts/2MAINS/cleaned_contract_fixed_nchalla.txt"
#input_path = r"C:\Users\hp\Downloads\clever contact\cleanedcontract2.txt"
#input_path = r"C:\Users\hp\Downloads\clever contact\cleanedcontract3.txt"
input_path = r"C:\Users\hp\Downloads\clever contact\cleanedcontract4.txt"

def split_text_into_sentence_chunks(text, sentences_per_chunk=2):
    doc = nlp(text)  
    sentences = [sent.text for sent in doc.sents]  # Extract sentences

    # Group sentences into chunks
    chunks = []
    for i in range(0, len(sentences), sentences_per_chunk):
        chunk = " ".join(sentences[i:i + sentences_per_chunk])
        chunks.append(chunk)
    
    return chunks
def group_entities(entities, allowed_types={"B-ORG", "I-ORG", "B-LOC", "I-LOC"}):
    grouped_entities = []
    current_entity = None

    for entity in entities:
        entity_type = entity["entity"]
        word = entity["word"].replace("▁", "").strip()  # Remove artifacts and spaces
        start = entity.get("start", 0)
        end = entity.get("end", 0)

        # Ignore unwanted entity types
        if not any(allowed in entity_type for allowed in allowed_types):
            continue

        # If there's an ongoing entity, check for merging conditions
        if current_entity:
            if (entity_type.startswith("I-") or entity_type[2:] == current_entity["entity"][2:]) and (
                entity["start"] == current_entity["end"] + 1 or word.islower()
            ):
                # Merge if it's a continuation
                current_entity["text"] += " " + word
                current_entity["end"] = end
            else:
                # Otherwise, save and start a new entity
                grouped_entities.append(current_entity)
                current_entity = {"entity": entity_type, "text": word, "start": start, "end": end}
        else:
            # Initialize first entity
            current_entity = {"entity": entity_type, "text": word, "start": start, "end": end}

    # Append last entity
    if current_entity:
        grouped_entities.append(current_entity)

    return grouped_entities
with open(input_path, "r", encoding="utf-8") as file:
    cleaned_text = file.read()

sentence_chunks = split_text_into_sentence_chunks(cleaned_text, sentences_per_chunk=2)
sample_chunks = sentence_chunks[16:50]


for i, chunk in enumerate(sample_chunks):
    print(f"\nChunk {i + 1}:\n{chunk}\n")
    entities = ner_pipeline(chunk)
    grouped_entities = group_entities(entities)
    for entity in grouped_entities:
        print(f"Entity: {entity['text']} | Type: {entity['entity']} | Start: {entity['start']} | End: {entity['end']}")



Device set to use cpu



Chunk 1:
Obligation de respect déontologique 15
2.13.8 État de lart 15
2.13.9 Initialisation, Réversibilité et Transférabilité 16

CCAP BPM060225_DATAVAL BPM060225
3/48


Chunk 2:
2.14Prix de laccord-cadre et des marches subséquents
16
2.14.1 Forme


Chunk 3:
et contenu des prix de la partie à bons de commande de laccord -cadre 16
2.14.2


Chunk 4:
Révision des prix de la partie à bons de commande de laccord -


Chunk 5:
cadre 16
2.14.3 Etablissement des prix plafonds de laccord-cadre pour les marchés subséquents 18
2.14.4


Chunk 6:
Révision des prix plafonds de laccord-cadre 18
3.1 Emission et exécution des bons de commande 18
3.2 Délai dexécution des prestations 20
4.1

Entity: Ré | Type: I-LOC | Start: 0 | End: 2

Chunk 7:
Les pièces constitutives des marchés subséquents 20
4.2 Forme des marchés subséquents 20
4.3

Entity: Les | Type: I-LOC | Start: 0 | End: 3

Chunk 8:
Modalités de passation des marchés subséquents 20
4.3.1 Modalités de passation 20
4.3.2

Entity: s | Type: I-LOC

In [18]:
import spacy
from transformers import CamembertTokenizerFast, CamembertForTokenClassification, pipeline
model_checkpoint = "Jean-Baptiste/camembert-ner"
tokenizer = CamembertTokenizerFast.from_pretrained(model_checkpoint)
model = CamembertForTokenClassification.from_pretrained(model_checkpoint)
ner_pipeline = pipeline("ner", model=model, tokenizer=tokenizer, framework="pt")
nlp = spacy.load("fr_core_news_md")  
#input_path = r"C:\Users\hp\Downloads\clever contact\clevercontracts/Contracts/2MAINS/cleaned_contract_fixed_nchalla.txt"
#input_path =  r"C:\Users\hp\Downloads\clever contact\cleanedcontract2.txt"
#input_path = r"C:\Users\hp\Downloads\clever contact\cleanedcontract3.txt"
input_path = r"C:\Users\hp\Downloads\clever contact\cleanedcontract4.txt"
def split_text_into_sentence_chunks(text, sentences_per_chunk=2):
    doc = nlp(text)  
    sentences = [sent.text for sent in doc.sents]  # Extract sentences

    # Group sentences into chunks
    chunks = []
    for i in range(0, len(sentences), sentences_per_chunk):
        chunk = " ".join(sentences[i:i + sentences_per_chunk])
        chunks.append(chunk)
    
    return chunks
def group_entities(entities, allowed_types={"B-ORG", "I-ORG", "B-LOC", "I-LOC"}):
    grouped_entities = []
    current_entity = None

    for entity in entities:
        entity_type = entity["entity"]
        word = entity["word"].replace("▁", "").strip()  # Remove artifacts and spaces
        start = entity.get("start", 0)
        end = entity.get("end", 0)

        # Ignore unwanted entity types
        if not any(allowed in entity_type for allowed in allowed_types):
            continue

        # If there's an ongoing entity, check for merging conditions
        if current_entity:
            if (entity_type.startswith("I-") or entity_type[2:] == current_entity["entity"][2:]) and (
                entity["start"] == current_entity["end"] + 1 or word.islower()
            ):
                # Merge if it's a continuation
                current_entity["text"] += " " + word
                current_entity["end"] = end
            else:
                # Otherwise, save and start a new entity
                grouped_entities.append(current_entity)
                current_entity = {"entity": entity_type, "text": word, "start": start, "end": end}
        else:
            # Initialize first entity
            current_entity = {"entity": entity_type, "text": word, "start": start, "end": end}

    # Append last entity
    if current_entity:
        grouped_entities.append(current_entity)

    return grouped_entities
with open(input_path, "r", encoding="utf-8") as file:
    cleaned_text = file.read()

sentence_chunks = split_text_into_sentence_chunks(cleaned_text, sentences_per_chunk=2)
sample_chunks = sentence_chunks[16:50]


for i, chunk in enumerate(sample_chunks):
    print(f"\nChunk {i + 1}:\n{chunk}\n")
    entities = ner_pipeline(chunk)
    grouped_entities = group_entities(entities)
    for entity in grouped_entities:
         extracted_entities.append({
            "text": entity['text'],
            "type": entity['entity'],
            "start": entity['start'],
            "end": entity['end'],
            "chunk_id": i + 1  # Store which chunk it came from
        })
         print(f"Entity: {entity['text']} | Type: {entity['entity']} | Start: {entity['start']} | End: {entity['end']}")

# Save to a JSON file
with open("extracted_entities_jeanBaptisteimprovement4.json", "w", encoding="utf-8") as f:
    json.dump(extracted_entities, f, indent=4, ensure_ascii=False)

print(" Entities saved successfully!")



Device set to use cpu



Chunk 1:
Obligation de respect déontologique 15
2.13.8 État de lart 15
2.13.9 Initialisation, Réversibilité et Transférabilité 16

CCAP BPM060225_DATAVAL BPM060225
3/48


Chunk 2:
2.14Prix de laccord-cadre et des marches subséquents
16
2.14.1 Forme


Chunk 3:
et contenu des prix de la partie à bons de commande de laccord -cadre 16
2.14.2


Chunk 4:
Révision des prix de la partie à bons de commande de laccord -


Chunk 5:
cadre 16
2.14.3 Etablissement des prix plafonds de laccord-cadre pour les marchés subséquents 18
2.14.4


Chunk 6:
Révision des prix plafonds de laccord-cadre 18
3.1 Emission et exécution des bons de commande 18
3.2 Délai dexécution des prestations 20
4.1

Entity: Ré | Type: I-LOC | Start: 0 | End: 2

Chunk 7:
Les pièces constitutives des marchés subséquents 20
4.2 Forme des marchés subséquents 20
4.3

Entity: Les | Type: I-LOC | Start: 0 | End: 3

Chunk 8:
Modalités de passation des marchés subséquents 20
4.3.1 Modalités de passation 20
4.3.2

Entity: s | Type: I-LOC

In [19]:
! pip install flair





[notice] A new release of pip is available: 24.3.1 -> 25.0.1
[notice] To update, run: python.exe -m pip install --upgrade pip


trial with flair 

In [20]:
import spacy
import json
import pandas as pd
from flair.data import Sentence
from flair.models import SequenceTagger

# Load spaCy for sentence splitting
nlp = spacy.load("fr_core_news_sm")

def split_text_into_sentence_chunks(text, sentences_per_chunk=2):
    """Splits text into chunks of sentences."""
    doc = nlp(text)
    sentences = [sent.text for sent in doc.sents]
    chunks = [" ".join(sentences[i:i + sentences_per_chunk]) for i in range(0, len(sentences), sentences_per_chunk)]
    return chunks

# Load Flair NER tagger
tagger = SequenceTagger.load('flair/ner-french')

# Load text file
#input_path = r"C:\Users\hp\Downloads\clever contact\clevercontracts/Contracts/2MAINS/cleaned_contract_fixed_nchalla.txt"
#input_path= r"C:\Users\hp\Downloads\clever contact\cleanedcontract2.txt"
#input_path = r"C:\Users\hp\Downloads\clever contact\cleanedcontract3.txt"
input_path = r"C:\Users\hp\Downloads\clever contact\cleanedcontract4.txt"
with open(input_path, "r", encoding="utf-8") as file:
    cleaned_text = file.read()

# Split text into chunks
sentence_chunks = split_text_into_sentence_chunks(cleaned_text, sentences_per_chunk=2)

# Extract entities
extracted_entities = []
for i, chunk in enumerate(sentence_chunks):
    sentence = Sentence(chunk)
    tagger.predict(sentence)
    
    for entity in sentence.get_spans('ner'):
        extracted_entities.append({
            "text": entity.text,
            "type": entity.tag,
            "start": entity.start_position,
            "end": entity.end_position,
            "chunk_id": i + 1  # Track which chunk the entity belongs to
        })

# Save to JSON file
json_output_file = "extracted_entities_flair4.json"
with open(json_output_file, "w", encoding="utf-8") as f:
    json.dump(extracted_entities, f, indent=4, ensure_ascii=False)

print(f" Entities saved successfully to {json_output_file}!")


2025-04-05 02:54:08,555 SequenceTagger predicts: Dictionary with 19 tags: O, S-LOC, B-LOC, E-LOC, I-LOC, S-PER, B-PER, E-PER, I-PER, S-MISC, B-MISC, E-MISC, I-MISC, S-ORG, B-ORG, E-ORG, I-ORG, <START>, <STOP>
 Entities saved successfully to extracted_entities_flair4.json!


First trial using Alizee/xlm-roberta-large-finetuned-wikiner-fr

In [21]:
import spacy
from transformers import AutoTokenizer, AutoModelForTokenClassification, pipeline

# Load SpaCy for sentence segmentation
nlp = spacy.load("fr_core_news_sm")

def split_text_into_sentence_chunks(text, sentences_per_chunk=2):
    doc = nlp(text)  
    sentences = [sent.text for sent in doc.sents]
    chunks = []
    for i in range(0, len(sentences), sentences_per_chunk):
        chunk = " ".join(sentences[i:i + sentences_per_chunk])
        chunks.append(chunk)
    return chunks


model_name = "Alizee/xlm-roberta-large-finetuned-wikiner-fr"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForTokenClassification.from_pretrained(model_name)


ner_pipeline = pipeline("ner", model=model, tokenizer=tokenizer)


#input_path = r"C:\Users\hp\Downloads\clever contact\clevercontracts/Contracts/2MAINS/cleaned_contract_fixed_nchalla.txt"
#input_path =  r"C:\Users\hp\Downloads\clever contact\cleanedcontract2.txt"
#input_path = r"C:\Users\hp\Downloads\clever contact\cleanedcontract3.txt"
input_path = r"C:\Users\hp\Downloads\clever contact\cleanedcontract4.txt"

# Read and process the file
with open(input_path, "r", encoding="utf-8") as file:
    cleaned_text = file.read()

# Split text into chunks
sentence_chunks = split_text_into_sentence_chunks(cleaned_text, sentences_per_chunk=2)

# Apply NER on the chunks using the loaded xlm-roberta model
for i, chunk in enumerate(sentence_chunks):
    print(f"\nChunk {i + 1}:\n{chunk}\n")
    results = ner_pipeline(chunk)
    for result in results:
        entity = result['word']
        entity_type = result['entity']
        score = result['score']
        print(f"Entity: {entity} | Type: {entity_type} | Score: {score}")


Device set to use cpu



Chunk 1:
SECRETARIAT GENERAL
DIRECTION DES FINANCES


Chunk 2:
,
DES ACHATS ET DES SERVICES


Entity: ATS | Type: I-MISC | Score: 0.5736098289489746
Entity: S | Type: I-MISC | Score: 0.6170268058776855

Chunk 3:
Numéro de consultation : BPM060225
ACCORD

Entity: ▁B | Type: I-MISC | Score: 0.5275194644927979
Entity: PM | Type: I-MISC | Score: 0.5919273495674133
Entity: 060 | Type: I-MISC | Score: 0.45134592056274414

Chunk 4:
- CADRE RELATIF


Chunk 5:
A DES PRESTATIONS DE VALORISATION DES
DONNEES
 CAHIER DES CLAUSES ADMINISTRATIVES


Chunk 6:
PARTICULIÈRES
 Numéro de consultation :

Entity: ▁PARTI | Type: I-LOC | Score: 0.44765371084213257
Entity: C | Type: I-LOC | Score: 0.41455554962158203
Entity: ULI | Type: I-LOC | Score: 0.48105278611183167
Entity: È | Type: I-LOC | Score: 0.6068423390388489
Entity: RES | Type: I-LOC | Score: 0.6377915143966675

Chunk 7:
BPM060225
Procédure de passation : Appel doffres ouvert, en application des articles L2124-1, L2124-2, R2124-2
et R2161-2 à R21

trial : Alizee/xlm-roberta-large-finetuned-wikiner-fr , with grouped_entities

In [22]:
import spacy
from transformers import AutoTokenizer, AutoModelForTokenClassification, pipeline


nlp = spacy.load("fr_core_news_sm")

def split_text_into_sentence_chunks(text, sentences_per_chunk=2):
    doc = nlp(text)  
    #This line creates a list of all sentences in the text as strings.

    sentences = [sent.text for sent in doc.sents]
    chunks = []
    for i in range(0, len(sentences), sentences_per_chunk):
        chunk = " ".join(sentences[i:i + sentences_per_chunk])
        chunks.append(chunk)
    return chunks
#if current entity is in the allowed , it adds it , if not it resets it . then the function return the list of the grouped entities
def group_entities(results, allowed_types={"ORG", "LOC"}):
    grouped_entities = []
    current_entity = None

    for result in results:
        entity_type = result['entity'].split('-')[1]  # Assumes format B-ORG, I-LOC 
        word = result['word'].replace("▁", " ").strip()
        start = result['start']
        end = result['end']

        if entity_type not in allowed_types:
            if current_entity:
                grouped_entities.append(current_entity)
                current_entity = None
            continue

        if current_entity and entity_type == current_entity["type"]:
            current_entity["text"] += " " + word
            current_entity["end"] = end
        else:
            if current_entity:
                grouped_entities.append(current_entity)
            current_entity = {"text": word, "type": entity_type, "start": start, "end": end}

    if current_entity:
        grouped_entities.append(current_entity)

    return grouped_entities

# Load tokenizer and model 
model_name = "Alizee/xlm-roberta-large-finetuned-wikiner-fr"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForTokenClassification.from_pretrained(model_name)


ner_pipeline = pipeline("ner", model=model, tokenizer=tokenizer)
#input_path = r"C:\Users\hp\Downloads\clever contact\clevercontracts/Contracts/2MAINS/cleaned_contract_fixed_nchalla.txt"
#input_path =  r"C:\Users\hp\Downloads\clever contact\cleanedcontract2.txt"
#input_path = r"C:\Users\hp\Downloads\clever contact\cleanedcontract3.txt"
input_path = r"C:\Users\hp\Downloads\clever contact\cleanedcontract4.txt"

# Read and process the file
with open(input_path, "r", encoding="utf-8") as file:
    cleaned_text = file.read()


sentence_chunks = split_text_into_sentence_chunks(cleaned_text, sentences_per_chunk=2)
extracted_entities = []
# Apply NER on the chunks using the loaded model and group results
for i, chunk in enumerate(sentence_chunks):
    print(f"\nChunk {i + 1}:\n{chunk}\n")
    results = ner_pipeline(chunk)
    grouped_results = group_entities(results, allowed_types={"ORG", "LOC"})
    for entity in grouped_results:
         extracted_entities.append({
            "text": entity['text'],
            "type": entity['type'],
            "start": entity['start'],
            "end": entity['end'],
            "chunk_id": i + 1  # Track the chunk for reference
        })
         
        
    
# Save to a JSON file
with open("extracted_entities_Roberta4.json", "w", encoding="utf-8") as f:
    json.dump(extracted_entities, f, indent=4, ensure_ascii=False)

print(" Entities saved successfully!")


Device set to use cpu



Chunk 1:
SECRETARIAT GENERAL
DIRECTION DES FINANCES


Chunk 2:
,
DES ACHATS ET DES SERVICES



Chunk 3:
Numéro de consultation : BPM060225
ACCORD


Chunk 4:
- CADRE RELATIF


Chunk 5:
A DES PRESTATIONS DE VALORISATION DES
DONNEES
 CAHIER DES CLAUSES ADMINISTRATIVES


Chunk 6:
PARTICULIÈRES
 Numéro de consultation :


Chunk 7:
BPM060225
Procédure de passation : Appel doffres ouvert, en application des articles L2124-1, L2124-2, R2124-2
et R2161-2 à R2161-5 du Code de la commande publique
ACHETEUR
MINISTERES SOCIAUX
SECRETARIAT GENERAL
DIRECTION DU NUMERIQUE

CCAP BPM060225_DATAVAL BPM060225
2/48 Table des matières
2.6


Chunk 8:
Forme et étendue de laccord-cadre 10
2.6.1 Forme de chaque lot 10
2.6.2


Chunk 9:
Etendue de laccord-cadre 11
 2.7 Documents contractuels de laccord-cadre 11
2.8


Chunk 10:
Représentation des parties de laccord-cadre 12
2.8.1 Représentation du pouvoir adjudicateur 12



Chunk 11:
2.8.2 Représentation du titulaire 12



Chunk 12:
2.9 Durée de laccord-cadre 12
