In [8]:
import spacy
import json

# Load spaCy French NER model
# The pipeline is bound to the module-level name `nlp`; the helper functions
# defined below read this global directly instead of taking it as an argument.
nlp = spacy.load("fr_core_news_lg")

def split_text_into_sentence_chunks(text, sentences_per_chunk=2):
    """Break ``text`` into chunks of ``sentences_per_chunk`` sentences each.

    The text is parsed with the module-level spaCy pipeline ``nlp`` and its
    sentence segmentation is used as-is. The sentence strings belonging to
    one chunk are joined with a single space; the final chunk may contain
    fewer sentences than requested.

    Args:
        text: The raw text to segment.
        sentences_per_chunk: Number of sentences grouped into one chunk.

    Returns:
        A list of chunk strings, in document order.
    """
    sentence_texts = [span.text for span in nlp(text).sents]

    # Step through the sentence list one window at a time.
    return [
        " ".join(sentence_texts[offset:offset + sentences_per_chunk])
        for offset in range(0, len(sentence_texts), sentences_per_chunk)
    ]

def extract_entities_spacy(text):
    """Run the module-level spaCy pipeline on ``text`` and list its entities.

    Args:
        text: The string to analyse.

    Returns:
        A list with one dict per entity found, each holding:
        ``"text"`` (entity surface text), ``"type"`` (entity label),
        ``"start"`` and ``"end"`` (character offsets within ``text``).
    """
    return [
        {
            "text": span.text,
            "type": span.label_,
            "start": span.start_char,
            "end": span.end_char,
        }
        for span in nlp(text).ents
    ]

def group_entities(entities, allowed_types={"ORG", "LOC"}, max_gap=None):
    """Group consecutive entities of the same type into single entities.

    Entities are scanned in order. A run of neighbouring entities that share
    the same allowed type is collapsed into one entity whose ``"text"`` is the
    member texts joined with a single space, whose ``"start"`` is that of the
    first member and whose ``"end"`` is that of the last member. An entity
    whose type is not in ``allowed_types`` is dropped and also terminates the
    current run.

    The input dicts are never modified: every group is built on a shallow
    copy of its first member.

    Args:
        entities: Sequence of dicts with at least ``"text"``, ``"type"`` and
            ``"end"`` keys (``"start"`` is additionally required when
            ``max_gap`` is given).
        allowed_types: Container of type labels to keep. It is only read,
            never mutated.
        max_gap: Optional maximum number of characters allowed between the
            ``"end"`` of the current group and the ``"start"`` of the next
            entity for them to be merged. ``None`` (the default) merges
            same-type neighbours regardless of the distance between them.

    Returns:
        A new list of grouped entity dicts, in input order.
    """
    grouped_entities = []
    current_entity = None

    for entity in entities:
        entity_type = entity["type"]

        # A disallowed type is skipped, but it still breaks the current run so
        # that entities on either side of it are not glued together.
        if entity_type not in allowed_types:
            if current_entity is not None:
                grouped_entities.append(current_entity)
                current_entity = None
            continue

        mergeable = (
            current_entity is not None
            and entity_type == current_entity["type"]
            # Offsets are only consulted when a gap limit was requested.
            and (
                max_gap is None
                or entity["start"] - current_entity["end"] <= max_gap
            )
        )

        if mergeable:
            # Extend the running group with this entity.
            current_entity["text"] += " " + entity["text"]
            current_entity["end"] = entity["end"]
        else:
            # Close the previous group (if any) and open a new one.
            if current_entity is not None:
                grouped_entities.append(current_entity)
            # Copy so that later merges do not rewrite the caller's dict.
            current_entity = dict(entity)

    # Flush the group that was still open when the input ended.
    if current_entity is not None:
        grouped_entities.append(current_entity)

    return grouped_entities

# Input selection: the commented-out assignments are alternative input files
# that are currently disabled; only the last, uncommented path is used.
# NOTE(review): the active path is an absolute, machine-specific Windows path.
#contract_path = "cleaned_contract_fixed_final.txt"
#contract_path = r"C:\Users\hp\Downloads\clever contact\cleanedcontract2.txt"
#contract_path = r"C:\Users\hp\Downloads\clever contact\cleanedcontract3.txt"
contract_path = r"C:\Users\hp\Downloads\clever contact\cleanedcontract4.txt"


# Load contract text
with open(contract_path, "r", encoding="utf-8") as file:
    contract_text = file.read()

# Step 1: Split into chunks
# Each chunk is two sentences joined by a single space.
chunks = split_text_into_sentence_chunks(contract_text, sentences_per_chunk=2)

# Step 2: Apply NER and group entities
# Each chunk is parsed again on its own, and group_entities is called with its
# default allowed types, so only ORG and LOC entities are kept.
# NOTE(review): because NER runs per chunk, the "start"/"end" values collected
# here are character offsets inside the individual chunk string, not inside
# contract_text.
all_entities = []
for chunk in chunks:
    raw_entities = extract_entities_spacy(chunk)
    grouped = group_entities(raw_entities)
    all_entities.extend(grouped)
print(f"Entities extracted: {len(all_entities)}")

# Step 3: Save results to JSON
# The output name is relative, so the file lands in the current working
# directory; ensure_ascii=False keeps accented characters readable.
with open("spacy_extracted_entities4.json", "w", encoding="utf-8") as f:
    json.dump(all_entities, f, indent=4, ensure_ascii=False)

# Step 4: Print results
# Prints every extracted entity (one line each), not just a sample.
print("Extracted Entities:")
for entity in all_entities:
    print(f"Entity: {entity['text']} | Type: {entity['type']} | Start: {entity['start']} | End: {entity['end']}")


Entities extracted: 285
Extracted Entities:
Entity: BPM060225 | Type: LOC | Start: 97 | End: 106
Entity: R2161-2 R2161-5 | Type: LOC | Start: 79 | End: 96
Entity: ACHETEUR | Type: ORG | Start: 129 | End: 137
Entity: BPM060225_DATAVAL BPM060225 | Type: LOC | Start: 206 | End: 233
Entity: laccord | Type: LOC | Start: 43 | End: 50
Entity: laccord | Type: ORG | Start: 40 | End: 47
Entity: laccord | Type: LOC | Start: 30 | End: 37
Entity: État | Type: LOC | Start: 46 | End: 50
Entity: BPM060225_DATAVAL BPM060225 | Type: LOC | Start: 44 | End: 71
Entity: Exigences | Type: LOC | Start: 20 | End: 29
Entity: BPM060225_DATAVAL BPM060225 | Type: LOC | Start: 120 | End: 147
Entity: Décision | Type: LOC | Start: 42 | End: 50
Entity: Pénalités | Type: LOC | Start: 160 | End: 169
Entity: TVA | Type: ORG | Start: 59 | End: 62
Entity: Monnaie | Type: LOC | Start: 75 | End: 82
Entity: Régime des droits sur les résultats 42
5.15.2 Régime des droits sur les connaissances antérieures | Type: ORG | Start: 0

In [1]:
import spacy
# Loads the same pipeline name as the first code cell and rebinds the global
# `nlp`; the print only confirms that loading finished without raising.
# NOTE(review): this cell carries execution count [1] but sits after the cell
# numbered [8] — presumably it was run first in the session; confirm and
# consider moving it to the top or dropping the duplicate load.
nlp = spacy.load("fr_core_news_lg")
print("Model loaded!")


Model loaded!


In [9]:
import os
# Change working directory to the correct path
# The relative JSON file names used by the cells below resolve against this
# directory once it is set.
# NOTE(review): absolute, machine-specific path. Also, judging by execution
# counts, the extraction cell ([8]) wrote its JSON before this chdir ([9])
# ran — confirm that file actually lives in this directory.
os.chdir(r"C:\Users\hp\Downloads\clever contact")

# Verify the new working directory
print("Current working directory:", os.getcwd())

Current working directory: C:\Users\hp\Downloads\clever contact


In [10]:
import json

def merge_ner_results(spacy_results, flair_results):
    """Merge spaCy and Flair entity lists, keeping one entry per entity text.

    Entities are visited in order — all of ``spacy_results`` first, then all
    of ``flair_results`` — and keyed by their exact ``"text"`` value (the
    match is case-sensitive and ignores offsets).

    When a text has already been seen:
      * same ``"type"``      -> the later entity replaces the stored one;
      * different ``"type"`` -> the first-seen entity is kept and the later
        one is discarded.

    Args:
        spacy_results: Iterable of entity dicts with ``"text"`` and
            ``"type"`` keys.
        flair_results: Iterable of entity dicts with the same keys.

    Returns:
        A list of entity dicts (the input objects themselves, not copies),
        ordered by the first appearance of each text.
    """
    entity_dict = {}

    # Walk the two sources one after the other instead of concatenating them
    # with ``+``: no temporary list is built, and the two arguments no longer
    # have to be the same sequence type.
    for results in (spacy_results, flair_results):
        for entity in results:
            key = entity["text"]  # Match by entity text only
            existing = entity_dict.get(key)

            # Store new texts, and refresh the entry when the type agrees;
            # on a type conflict the first-seen entity wins.
            if existing is None or entity["type"] == existing["type"]:
                entity_dict[key] = entity

    return list(entity_dict.values())

# File paths
# All three are relative names, resolved against the current working directory.
# spacy_json_path matches the file name written by the spaCy extraction cell;
# NOTE(review): the Flair file is not produced anywhere in the cells visible
# here — presumably it comes from another notebook; confirm it exists.
spacy_json_path = "spacy_extracted_entities4.json"
flair_json_path = "flair_finetuned_entities_fulltext4.json"
merged_output_path = "merged_spacy_flair_entities4.json"

# Load extracted entities from JSON files
with open(spacy_json_path, "r", encoding="utf-8") as f:
    spacy_entities = json.load(f)
print(f" Loaded {len(spacy_entities)} entities from SpaCy output!")

with open(flair_json_path, "r", encoding="utf-8") as f:
    flair_entities = json.load(f)
print(f" Loaded {len(flair_entities)} entities from Flair output!")

# Merge SpaCy & Flair results
# The result holds one entry per distinct entity text.
final_entities = merge_ner_results(spacy_entities, flair_entities)

# Save merged results to a new JSON file
with open(merged_output_path, "w", encoding="utf-8") as f:
    json.dump(final_entities, f, ensure_ascii=False, indent=4)

print(f"\n Merged NER results saved in {merged_output_path}!")

# Print some merged entities for verification
print("\n Final Merged Entities Sample:")
for entity in final_entities[:10]:  # Print only first 10
    print(f"Entity: {entity['text']} | Type: {entity['type']}")


 Loaded 285 entities from SpaCy output!
 Loaded 295 entities from Flair output!

 Merged NER results saved in merged_spacy_flair_entities4.json!

 Final Merged Entities Sample:
Entity: BPM060225 | Type: LOC
Entity: R2161-2 R2161-5 | Type: LOC
Entity: ACHETEUR | Type: ORG
Entity: BPM060225_DATAVAL BPM060225 | Type: LOC
Entity: laccord | Type: LOC
Entity: État | Type: LOC
Entity: Exigences | Type: LOC
Entity: Décision | Type: LOC
Entity: Pénalités | Type: LOC
Entity: TVA | Type: ORG


In [11]:
import json

def get_common_entities(spacy_results, flair_results):
    """Return the spaCy entities that Flair also reported.

    Two entities count as the same when their lower-cased ``"text"`` and
    their ``"type"`` are both equal. The returned items are the original
    spaCy dicts, in their original order; duplicates in ``spacy_results``
    are preserved.
    """
    # Set of (lower-cased text, type) pairs seen by Flair, for O(1) lookup.
    flair_keys = {(item["text"].lower(), item["type"]) for item in flair_results}

    return [
        candidate
        for candidate in spacy_results
        if (candidate["text"].lower(), candidate["type"]) in flair_keys
    ]

# File paths
# Relative names, resolved against the current working directory. The two
# input paths are the same ones the merge cell above reads.
spacy_json_path = "spacy_extracted_entities4.json"
flair_json_path = "flair_finetuned_entities_fulltext4.json"
common_output_path = "common_spacy_flair_entities4.json"

# Load extracted entities from JSON files
# Re-reads both files from disk, so this cell does not depend on variables
# left behind by the merge cell.
with open(spacy_json_path, "r", encoding="utf-8") as f:
    spacy_entities = json.load(f)
print(f" Loaded {len(spacy_entities)} entities from SpaCy output!")

with open(flair_json_path, "r", encoding="utf-8") as f:
    flair_entities = json.load(f)
print(f" Loaded {len(flair_entities)} entities from Flair output!")

# Find common entities
# Keeps the spaCy entries whose (case-insensitive text, type) pair also
# appears in the Flair output; repeated spaCy entries are all kept.
common_entities = get_common_entities(spacy_entities, flair_entities)

# Save common results to a new JSON file
with open(common_output_path, "w", encoding="utf-8") as f:
    json.dump(common_entities, f, ensure_ascii=False, indent=4)

print(f"\n Common NER results saved in {common_output_path}!")

# Print some common entities for verification
print("\n Common Entities Found in Both SpaCy & Flair:")
for entity in common_entities[:10]:  # Print only first 10
    print(f"Entity: {entity['text']} | Type: {entity['type']}")


 Loaded 285 entities from SpaCy output!
 Loaded 295 entities from Flair output!

 Common NER results saved in common_spacy_flair_entities4.json!

 Common Entities Found in Both SpaCy & Flair:
Entity: État | Type: LOC
Entity: Direction du numérique | Type: ORG
Entity: Directions des Ministères sociaux | Type: ORG
Entity: Direction du Numérique | Type: ORG
Entity: Direction du numérique | Type: ORG
Entity: Direction du NUMérique | Type: ORG
Entity: Lot | Type: LOC
Entity: Lot | Type: LOC
Entity: Lot | Type: LOC
Entity: Ministères Sociaux | Type: ORG
