In [None]:
! pip install datasets pandas pyarrow flair torch



In [None]:
import pandas as pd
from datasets import load_dataset

# Define the mapping from numerical labels to BIO tags.

label_map = {
    0: "O",  
    1: "B-PER", 2: "I-PER",
    3: "B-ORG", 4: "I-ORG",
    5: "B-LOC", 6: "I-LOC"
}

def convert_labels(tags):
    """
    Convert a list of tags 
    into a list of BIO labels using the label_map.
    """
    new_tags = []
    for t in tags:
        try:
            # Try to convert the tag to an integer and use the label map.
            t_int = int(t)
            new_tags.append(label_map.get(t_int, "O"))
        except ValueError:
            new_tags.append(t)
    return new_tags


# Step 1: Load and process Adminset-NER from Parquet

df_adminset = pd.read_parquet("adminset_ner.parquet")
# Convert numeric ner_tags to BIO strings
df_adminset["ner_tags"] = df_adminset["ner_tags"].apply(convert_labels)


# Step 2: Load and process wikiner_fr from Hugging Face
#The dataset falls within the 100K - 1M size range / Approximately 170,634 sentences
wikiner_dataset = load_dataset("Jean-Baptiste/wikiner_fr", split="train")
df_wikiner = pd.DataFrame(wikiner_dataset)
# Convert wikiner_fr ner_tags to BIO strings
df_wikiner["ner_tags"] = df_wikiner["ner_tags"].apply(convert_labels)

#combine the 2
combined_df = pd.concat([
    df_adminset[["tokens", "ner_tags"]],
    df_wikiner[["tokens", "ner_tags"]]
], ignore_index=True)


# Step 4: Save combined dataset in Flair train format

output_filename = "train_combined.txt"
with open(output_filename, "w", encoding="utf-8") as f:
    for tokens, tags in zip(combined_df["tokens"], combined_df["ner_tags"]):
        for word, tag in zip(tokens, tags):
            f.write(f"{word} {tag}\n")
        f.write("\n")  # Separate sentences by an empty line

print(f" Combined dataset saved as {output_filename}")



  from .autonotebook import tqdm as notebook_tqdm


 Combined dataset saved as train_combined.txt


In [1]:
from flair.datasets import ColumnCorpus
from flair.models import SequenceTagger
from flair.trainers import ModelTrainer


columns = {0: 'text', 1: 'ner'}

# Load dataset
corpus = ColumnCorpus('.', columns, train_file='train_combined.txt')

tagger = SequenceTagger.load("flair/ner-french")

# at first i tried minibatchsize= 16 but my kernel crahsed since im on cpu so i used minibatchsize= 6
trainer = ModelTrainer(tagger, corpus)
trainer.train("flair_output_combined",
              learning_rate=0.05,
              mini_batch_size=6,
              max_epochs=5)


  from .autonotebook import tqdm as notebook_tqdm


2025-03-26 16:31:22,330 Reading data from .
2025-03-26 16:31:22,331 Train: train_combined.txt
2025-03-26 16:31:22,332 Dev: None
2025-03-26 16:31:22,332 Test: None
2025-03-26 16:31:51,862 No test split found. Using 10% (i.e. 12141 samples) of the train split as test data
2025-03-26 16:31:51,939 No dev split found. Using 10% (i.e. 10927 samples) of the train split as dev data
2025-03-26 16:31:57,747 SequenceTagger predicts: Dictionary with 19 tags: O, S-LOC, B-LOC, E-LOC, I-LOC, S-PER, B-PER, E-PER, I-PER, S-MISC, B-MISC, E-MISC, I-MISC, S-ORG, B-ORG, E-ORG, I-ORG, <START>, <STOP>
2025-03-26 16:31:58,153 ----------------------------------------------------------------------------------------------------
2025-03-26 16:31:58,155 Model: "SequenceTagger(
  (embeddings): StackedEmbeddings(
    (list_embedding_0): WordEmbeddings(
      '0-/vol/home-vol2/ml/akbikala/.flair/embeddings/fr-wiki-fasttext-300d-1M'
      (embedding): Embedding(1000000, 300)
    )
    (list_embedding_1): FlairEmbeddin

  scaler = torch.cuda.amp.GradScaler(enabled=use_amp and flair.device.type != "cpu")


2025-03-26 16:48:04,262 epoch 1 - iter 1639/16391 - loss 0.10721415 - time (sec): 966.06 - samples/sec: 268.28 - lr: 0.050000 - momentum: 0.000000


: 

In [None]:
import os
import pandas as pd
from datasets import load_dataset
import spacy
from spacy.tokens import DocBin

# Define a label mapping from numerical tags to BIO tags.
label_map = {
    0: "O",  
    1: "B-PER", 2: "I-PER",
    3: "B-ORG", 4: "I-ORG",
    5: "B-LOC", 6: "I-LOC"
}

# convert a list of tags (which may be numbers or strings) to BIO strings.
def convert_labels(tags):
    new_tags = []
    for t in tags:
        try:
            # If t is a number, convert it to int and then map
            new_tags.append(label_map[int(t)])
        except ValueError:
            # Otherwise, assume it's already a string (like "O") and keep it.
            new_tags.append(t)
    return new_tags


adminset_dataset = load_dataset("taln-ls2n/Adminset-NER")['train']
df_adminset = pd.DataFrame(adminset_dataset)
df_adminset['ner_tags'] = df_adminset['ner_tags'].apply(convert_labels)


wikiner_dataset = load_dataset("Jean-Baptiste/wikiner_fr", split="train")
df_wikiner = pd.DataFrame(wikiner_dataset)
df_wikiner['ner_tags'] = df_wikiner['ner_tags'].apply(convert_labels)


combined_df = pd.concat([
    df_adminset[['tokens', 'ner_tags']],
    df_wikiner[['tokens', 'ner_tags']]
], ignore_index=True)

# ----------------------------
# Define a function to create a spaCy Doc with entities from tokens and BIO tags.
nlp = spacy.blank("fr")

def create_spacy_doc(tokens, ner_tags):
    # Create a doc from the tokens (join them with spaces)
    doc = nlp.make_doc(" ".join(tokens))
    ents = []
    current_ent = None
    offset = 0
    # Iterate over each token and its corresponding tag
    for token, tag_label in zip(tokens, ner_tags):
        token_len = len(token)
        if tag_label.startswith("B-"):
            if current_ent:
                ents.append(current_ent)
            current_ent = [offset, offset + token_len, tag_label[2:]]  # remove "B-"
        elif tag_label.startswith("I-") and current_ent and current_ent[2] == tag_label[2:]:
            current_ent[1] = offset + token_len
        else:
            if current_ent:
                ents.append(current_ent)
                current_ent = None
        offset += token_len + 1  # account for the space
    if current_ent:
        ents.append(current_ent)
    
    # Convert the collected entity spans into spaCy spans
    spacy_ents = []
    for start, end, label in ents:
        span = doc.char_span(start, end, label=label, alignment_mode='expand')
        if span is not None:
            spacy_ents.append(span)
    doc.ents = spacy_ents
    return doc


# Function to convert a DataFrame into a DocBin file
def convert_dataframe_to_docbin(df, output_path):
    db = DocBin()
    for _, row in df.iterrows():
        doc = create_spacy_doc(row['tokens'], row['ner_tags'])
        db.add(doc)
    db.to_disk(output_path)
    print(f" Combined spaCy training file created at {output_path}")

output_dir = r"C:\Users\hp\Downloads\clever contact"
output_path = os.path.join(output_dir, "combined_all.spacy")

# Convert the combined DataFrame to spaCy's binary format
convert_dataframe_to_docbin(combined_df, output_path)


 Combined spaCy training file created at C:\Users\hp\Downloads\clever contact\combined_all.spacy


In [None]:
import os
import random
import spacy
from spacy.tokens import DocBin

combined_all_path = r"C:\Users\hp\Downloads\clever contact\combined_all.spacy"

# Load the combined dataset using a blank French model.
nlp = spacy.blank("fr")
db = DocBin().from_disk(combined_all_path)
docs = list(db.get_docs(nlp.vocab))

# Shuffle the docs to randomize the split.
random.shuffle(docs)

# Split ratio: 90% for training, 10% for dev.
split_idx = int(0.9 * len(docs))
train_docs = docs[:split_idx]
dev_docs = docs[split_idx:]

# Create new DocBin objects for training and dev.
train_db = DocBin(docs=train_docs)
dev_db = DocBin(docs=dev_docs)

# Define output paths.
output_dir = r"C:\Users\hp\Downloads\clever contact"
train_output_path = os.path.join(output_dir, "combined_train.spacy")
dev_output_path = os.path.join(output_dir, "combined_dev.spacy")

# Save the split datasets.
train_db.to_disk(train_output_path)
dev_db.to_disk(dev_output_path)

print(f"Saved training set with {len(train_docs)} docs to {train_output_path}")
print(f"Saved dev set with {len(dev_docs)} docs to {dev_output_path}")
i

Saved training set with 109269 docs to C:\Users\hp\Downloads\clever contact\combined_train.spacy
Saved dev set with 12142 docs to C:\Users\hp\Downloads\clever contact\combined_dev.spacy


In [2]:
! python -m spacy init config config.cfg --lang fr --pipeline ner --optimize accuracy --pretrained-model fr_core_news_sm


Usage: python -m spacy init config [OPTIONS] OUTPUT_FILE
Try 'python -m spacy init config --help' for help.
┌─ Error ─────────────────────────────────────────────────────────────────────┐
│ No such option: --pretrained-model Did you mean --pretraining?              │
└─────────────────────────────────────────────────────────────────────────────┘


In [4]:
! python -m spacy init config spacyconfig.cfg --lang fr --pipeline ner --optimize accuracy


[38;5;3m⚠ To generate a more effective transformer-based config (GPU-only),
install the spacy-transformers package and re-run this command. The config
generated now does not use transformers.[0m
[38;5;4mℹ Generated config template specific for your use case[0m
- Language: fr
- Pipeline: ner
- Optimize for: accuracy
- Hardware: CPU
- Transformer: None
[38;5;2m✔ Auto-filled config with all values[0m
[38;5;2m✔ Saved config[0m
spacyconfig.cfg
You can now add your data and train your pipeline:
python -m spacy train spacyconfig.cfg --paths.train ./train.spacy --paths.dev ./dev.spacy


In [18]:
config_str = r"""
[paths]
train = "C:\Users\hp\Downloads\clever contact\combined_train.spacy"
dev = "C:\Users\hp\Downloads\clever contact\combined_dev.spacy"

[system]
gpu_allocator = null
seed = 42

[nlp]
lang = "fr"
pipeline = ["tok2vec", "ner"]
batch_size = 100

[components]

[components.ner]
factory = "ner"

[components.tok2vec]
factory = "tok2vec"

[initialize]
vectors = null
init_tok2vec = "fr_core_news_sm/tok2vec"

[training]
seed = ${system.seed}
gpu_allocator = ${system.gpu_allocator}
max_epochs = 10
patience = 1000

[training.optimizer]
@optimizers = "Adam.v1"
learn_rate = 0.001
L2 = 0.01
grad_clip = 1.0
"""


In [19]:
import spacy
from spacy.cli.train import train

# Write your configuration to a file (your config.cfg as above)
with open("spacyconfig.cfg", "w", encoding="utf-8") as f:
    f.write(config_str)

# Now, run training with overrides for the training and dev paths,
# and also override max_epochs if needed:
train("spacyconfig.cfg", output_path="spacy_my_finetuned_model",
      overrides={
          "paths.train": r"C:\Users\hp\Downloads\clever contact\combined_train.spacy",
          "paths.dev": r"C:\Users\hp\Downloads\clever contact\combined_dev.spacy",
          "training.max_epochs": 10  # override max_epochs to a positive number
      })


[38;5;4mℹ Saving to output directory: spacy_my_finetuned_model[0m
[38;5;4mℹ Using CPU[0m
[1m
[38;5;2m✔ Initialized pipeline[0m
[1m
[38;5;4mℹ Pipeline: ['tok2vec', 'ner'][0m
[38;5;4mℹ Initial learn rate: 0.001[0m
E    #       LOSS TOK2VEC  LOSS NER  ENTS_F  ENTS_P  ENTS_R  SCORE 
---  ------  ------------  --------  ------  ------  ------  ------
  0       0          0.00     43.21    0.00    0.00    0.00    0.00
  0     200          0.00   2335.84   46.33   54.16   40.48    0.46
  0     400          0.00   1822.37   48.14   69.52   36.82    0.48
  0     600          0.00   2145.25   56.31   67.61   48.24    0.56
  0     800          0.00   2436.33   58.49   72.84   48.86    0.58
  0    1000          0.00   2779.94   63.87   73.69   56.36    0.64
  0    1200          0.00   3068.44   67.02   67.50   66.54    0.67
  0    1400          0.00   3547.89   68.29   73.13   64.05    0.68
  0    1600          0.00   4367.91   71.14   76.68   66.35    0.71
  0    1800          0.00   

In [3]:
import spacy
import json

# Loading your fine-tuned spaCy model 
nlp_trained = spacy.load('spacy_my_finetuned_model\model-best')

#file_path = r"C:\Users\hp\Downloads\clever contact\cleanedcontract2.txt"
file_path = r"C:\Users\hp\Downloads\clever contact\cleanedcontract4.txt"
with open(file_path, "r", encoding="utf-8") as f:
    contract_text = f.read()

# Process the text to get a spaCy Doc with entities
doc = nlp_trained(contract_text)

# Extract entities into a list of dictionaries
extracted_entities = []
for ent in doc.ents:
    extracted_entities.append({
        "text": ent.text,
        "type": ent.label_,
        "start": ent.start_char,
        "end": ent.end_char
    })


output_file = "predicted_entitiees_spacy_finetuned_with2datasets4.json"
with open(output_file, "w", encoding="utf-8") as f:
    json.dump(extracted_entities, f, indent=4, ensure_ascii=False)

print(f" Extracted entities saved to {output_file}!")


  nlp_trained = spacy.load('spacy_my_finetuned_model\model-best')


 Extracted entities saved to predicted_entitiees_spacy_finetuned_with2datasets4.json!
