In [None]:
%pip install spacy

In [None]:
!python -m spacy download fr_core_news_md

Converting json files generated by mistral to spacy format  

In [None]:
import spacy
from spacy.tokens import DocBin
import os
import json
import random
from collections import Counter

# === CONFIG ===
input_folder = "/content/drive/MyDrive/merged_jsons"  # your merged JSONs
output_folder = "/content/drive/MyDrive/spacy_corpus_finale1"
os.makedirs(output_folder, exist_ok=True)

# Load blank French model
nlp = spacy.blank("fr")

# Label normalization and mapping
def map_label(label):
    label_lower = label.strip().lower()
    label_map = {
    # PHONE related
    "phone_number": "phone",
    "phone": "phone",
    "skype": "phone",

    # EXPERIENCE related
    "experiences": "experience",
    "experience": "experience",
    "freelance_work": "experience",

    # PROJECTS related
    "projects_esprit": "projects",
    "academic_projects": "projects",
    "projects": "projects",

    # TECHNICAL SKILLS related
    "technical_skills": "technical_skills",
    "office_skills": "technical_skills",
    "additional_skills": "technical_skills",
    "modeling": "technical_skills",
    "versioning": "technical_skills",
    "database_administration": "technical_skills",

    # CERTIFICATIONS
    "certifications": "certifications",
    "certificates": "certifications",
    "certification": "certifications",
    "technical_certifications": "certifications",

    # EDUCATION
    "education": "education",
    "high_school": "education",

    # INTERNSHIPS
    "internships": "internships",

    # SOFT SKILLS
    "soft_skills": "soft_skills",

    # PERSONAL INFO
    "name": "name",
    "email": "email",
    "country": "country",
    "city": "country",
    "address": "country",
    "location": "country",

    # LANGUAGES
    "languages": "languages",
    "language": "languages"
   }

    return label_map.get(label_lower, None)  # None = skip unwanted labels

# Collect examples
all_examples = []

for filename in os.listdir(input_folder):
    if filename.endswith(".json"):
        with open(os.path.join(input_folder, filename), "r", encoding="utf-8") as f:
            cv = json.load(f)

        for key, value in cv.items():
            if not value:
                continue

            new_label = map_label(key)
            if new_label is None:
                continue  # skip unwanted labels

            # Merge lists/dicts into single string
            if isinstance(value, list):
                text_parts = []
                for item in value:
                    if isinstance(item, dict):
                        text_parts.append(", ".join(str(v) for v in item.values()))
                    else:
                        text_parts.append(str(item))
                text = " | ".join(text_parts)
            elif isinstance(value, dict):
                text = ", ".join(str(v) for v in value.values())
            else:
                text = str(value)

            all_examples.append((text, new_label))

# Shuffle and split train/dev
random.shuffle(all_examples)
split = int(0.8 * len(all_examples))
train_data = all_examples[:split]
dev_data = all_examples[split:]

# Convert to DocBin
def to_docbin(examples, out_path):
    db = DocBin()
    for text, label in examples:
        doc = nlp.make_doc(text)
        doc.cats = {label: 1.0}
        db.add(doc)
    db.to_disk(out_path)

# Save
to_docbin(train_data, os.path.join(output_folder, "train.spacy"))
to_docbin(dev_data, os.path.join(output_folder, "dev.spacy"))

print("‚úÖ Conversion done! Check folder:", output_folder)
print(f"Total examples: {len(all_examples)}")

# Optional: show label counts
counter = Counter([label for _, label in all_examples])
print("Example counts per label:")
for k, v in counter.most_common():
    print(f"{k}: {v}")


‚úÖ Conversion done! Check folder: /content/drive/MyDrive/spacy_corpus_finale1
Total examples: 3161
Example counts per label:
country: 426
technical_skills: 416
name: 411
email: 408
education: 391
internships: 333
phone: 319
projects: 306
soft_skills: 118
languages: 15
certifications: 11
experience: 7


Augmentation of train data

In [None]:
import spacy
from spacy.tokens import DocBin
import random
from pathlib import Path

# ===============================
# CONFIG
# ===============================
TRAIN_FILE = "/content/drive/MyDrive/spacy_corpus_finale1/train.spacy"
DEV_FILE = "/content/drive/MyDrive/spacy_corpus_finale1/dev.spacy"
OUTPUT_FILE = "/content/drive/MyDrive/spacy_corpus_finale1/train_augmented.spacy"

# Small labels to oversample
SMALL_LABELS = ["experience", "internships", "projects", "languages", "soft_skills", "certifications", "profile"]

# ===============================
# GENERATORS
# ===============================
def make_experience():
    companies = ["Capgemini", "Deloitte", "Sopra Steria", "BNP Paribas", "Vermeg", "Ooredoo", "Microsoft", "IBM"]
    roles = ["D√©veloppeur Full Stack", "Data Scientist", "Consultant Big Data", "Ing√©nieur Cloud", "Analyste Cybers√©curit√©"]
    tasks = [
        "d√©veloppement et maintenance d‚Äôapplications",
        "mise en place de pipelines de donn√©es",
        "d√©ploiement de solutions cloud s√©curis√©es",
        "optimisation de mod√®les d‚ÄôIA",
        "gestion d‚Äô√©quipes agiles et reporting",
        "analyse des besoins clients et r√©daction des sp√©cifications",
        "conception d‚Äôarchitectures logicielles √©volutives"
    ]
    years = ["2020‚Äì2022", "2019‚Äì2021", "2021‚Äì2023", "2022‚Äì2024"]
    num_tasks = random.randint(2, 4)
    responsibilities = " ; ".join(random.sample(tasks, num_tasks))
    return f"{random.choice(roles)} chez {random.choice(companies)} ({random.choice(years)}) ‚Äì Responsabilit√©s : {responsibilities}"

def make_internship():
    companies = ["Orange", "Eni", "STMicroelectronics", "Sagemcom", "SNCFT", "Tunisie T√©l√©com"]
    roles = ["Stagiaire D√©veloppement Web", "Stagiaire Data Analyst", "Stagiaire DevOps", "Stagiaire S√©curit√© Informatique"]
    tasks = [
        "d√©veloppement d‚Äôun module interne",
        "analyse de donn√©es clients",
        "mise en place d‚Äôun pipeline CI/CD",
        "r√©daction de documentation technique",
        "tests et validation des fonctionnalit√©s"
    ]
    periods = ["Juin‚ÄìAo√ªt 2023", "Janvier‚ÄìJuin 2022", "F√©vrier‚ÄìAvril 2021", "Mars‚ÄìAo√ªt 2020"]
    return f"{random.choice(roles)} chez {random.choice(companies)} ({random.choice(periods)}) ‚Äì Missions : {random.choice(tasks)}"

def make_project():
    projects = [
        "Projet universitaire : cr√©ation d‚Äôune application e-commerce en Django",
        "Projet acad√©mique : d√©veloppement d‚Äôun chatbot NLP avec Python",
        "Projet personnel : site web de gestion de t√¢ches avec React et Node.js",
        "Projet de fin d‚Äô√©tudes : plateforme de recommandation musicale avec IA",
        "Projet open-source : contribution √† une librairie Python de machine learning",
        "Projet scientifique : analyse pr√©dictive des ventes avec Scikit-learn"
    ]
    return random.choice(projects)

def make_languages():
    langs = [
        "Fran√ßais (courant), Anglais (avanc√©), Arabe (natif)",
        "Anglais (TOEFL 95), Allemand (interm√©diaire)",
        "Italien (d√©butant), Fran√ßais (C2), Anglais (C1)",
        "Espagnol (B2), Arabe (langue maternelle)"
    ]
    return random.choice(langs)

def make_soft_skills():
    skills = [
        "Esprit d‚Äô√©quipe et sens de la communication",
        "Leadership et capacit√© de prise de d√©cision",
        "R√©solution de probl√®mes complexes",
        "Gestion du temps et organisation",
        "Cr√©ativit√© et pens√©e critique"
    ]
    return random.choice(skills)

def make_certification():
    certs = [
        "Certification AWS Solutions Architect ‚Äì Associate",
        "Certification Cisco CCNA Routing & Switching",
        "Certification PMP ‚Äì Project Management Professional",
        "Certification Microsoft Azure Fundamentals",
        "Certification Scrum Master (PSM I)"
    ]
    return random.choice(certs)

def make_profile():
    profiles = [
        "Ing√©nieur logiciel passionn√© avec 3 ans d'exp√©rience en d√©veloppement full stack et en gestion de projets agiles.",
        "Data Scientist sp√©cialis√© en machine learning et intelligence artificielle, avec un solide parcours acad√©mique et professionnel.",
        "√âtudiant en informatique motiv√© par le d√©veloppement web et la cr√©ation d'applications innovantes.",
        "Consultant Big Data orient√© r√©sultats, expert en analyse de donn√©es et optimisation des processus m√©tier.",
        "D√©veloppeur polyvalent ma√Ætrisant Python, JavaScript et les technologies cloud, avec une forte capacit√© √† r√©soudre des probl√®mes complexes."
    ]
    return random.choice(profiles)

GENERATORS = {
    "experience": make_experience,
    "internships": make_internship,
    "projects": make_project,
    "languages": make_languages,
    "soft_skills": make_soft_skills,
    "certifications": make_certification,
    "profile": make_profile
}

# ===============================
# MAIN SCRIPT
# ===============================
def main():
    nlp = spacy.blank("fr")

    # Load train and dev
    print("üîπ Loading datasets...")
    train_docs = list(DocBin().from_disk(TRAIN_FILE).get_docs(nlp.vocab))
    dev_docs = list(DocBin().from_disk(DEV_FILE).get_docs(nlp.vocab))
    print(f"‚úÖ Loaded {len(train_docs)} train docs, {len(dev_docs)} dev docs")

    # Merge all docs for counting
    all_docs = train_docs + dev_docs

    # Count existing examples per class
    counts = {}
    for d in all_docs:
        for k, v in d.cats.items():
            if v == 1.0:
                counts[k] = counts.get(k, 0) + 1
    print("Current counts per class:", counts)

    max_count = max(counts.values())
    print("Target count per small class:", max_count)

    # Generate new docs efficiently
    new_docs = []
    for label in SMALL_LABELS:
        n_to_generate = max_count - counts.get(label, 0)
        print(f"Generating {n_to_generate} examples for {label}...")
        if n_to_generate <= 0:
            continue
        gen_func = GENERATORS[label]

        # Bulk create docs
        texts = [gen_func() for _ in range(n_to_generate)]
        for text in texts:
            doc = nlp.make_doc(text)
            doc.cats = {k: 0.0 for k in counts.keys()}
            doc.cats[label] = 1.0
            new_docs.append(doc)

    print(f"‚úÖ Generated a total of {len(new_docs)} new docs")

    # Merge with train only (dev remains untouched)
    final_docs = train_docs + new_docs
    out_bin = DocBin(docs=final_docs)
    out_bin.to_disk(OUTPUT_FILE)
    print(f"üéâ Augmented dataset saved! Total train examples: {len(final_docs)}")

    # Optional: final counts
    final_counts = {}
    for d in final_docs:
        for k, v in d.cats.items():
            if v == 1.0:
                final_counts[k] = final_counts.get(k, 0) + 1
    print("Final counts per class:", final_counts)

if __name__ == "__main__":
    main()


üîπ Loading datasets...
‚úÖ Loaded 2528 train docs, 633 dev docs
Current counts per class: {'country': 426, 'phone': 319, 'soft_skills': 118, 'education': 391, 'projects': 306, 'email': 408, 'technical_skills': 416, 'name': 411, 'internships': 333, 'languages': 15, 'experience': 7, 'certifications': 11}
Target count per small class: 426
Generating 419 examples for experience...
Generating 93 examples for internships...
Generating 120 examples for projects...
Generating 411 examples for languages...
Generating 308 examples for soft_skills...
Generating 415 examples for certifications...
Generating 426 examples for profile...
‚úÖ Generated a total of 2192 new docs
üéâ Augmented dataset saved! Total train examples: 4720
Final counts per class: {'country': 341, 'phone': 259, 'soft_skills': 399, 'education': 301, 'projects': 370, 'email': 322, 'technical_skills': 324, 'name': 341, 'internships': 362, 'languages': 424, 'experience': 426, 'certifications': 425, 'profile': 426}


Augmentation of dev data

In [None]:
import spacy
from spacy.tokens import DocBin
import random
from pathlib import Path

# ===============================
# CONFIG
# ===============================
DEV_FILE = "/content/drive/MyDrive/spacy_corpus_finale1/dev.spacy"
OUTPUT_FILE = "/content/drive/MyDrive/spacy_corpus_finale1/deva.spacy"

# Small labels to oversample
SMALL_LABELS = ["experience", "internships", "projects", "languages", "soft_skills", "certifications", "profile"]

# ===============================
# GENERATORS
# ===============================
def make_experience():
    companies = ["Capgemini", "Deloitte", "Sopra Steria", "BNP Paribas", "Vermeg", "Ooredoo", "Microsoft", "IBM"]
    roles = ["D√©veloppeur Full Stack", "Data Scientist", "Consultant Big Data", "Ing√©nieur Cloud", "Analyste Cybers√©curit√©"]
    tasks = [
        "d√©veloppement et maintenance d‚Äôapplications",
        "mise en place de pipelines de donn√©es",
        "d√©ploiement de solutions cloud s√©curis√©es",
        "optimisation de mod√®les d‚ÄôIA",
        "gestion d‚Äô√©quipes agiles et reporting",
        "analyse des besoins clients et r√©daction des sp√©cifications",
        "conception d‚Äôarchitectures logicielles √©volutives"
    ]
    years = ["2020‚Äì2022", "2019‚Äì2021", "2021‚Äì2023", "2022‚Äì2024"]
    num_tasks = random.randint(2, 4)
    responsibilities = " ; ".join(random.sample(tasks, num_tasks))
    return f"{random.choice(roles)} chez {random.choice(companies)} ({random.choice(years)}) ‚Äì Responsabilit√©s : {responsibilities}"

def make_internship():
    companies = ["Orange", "Eni", "STMicroelectronics", "Sagemcom", "SNCFT", "Tunisie T√©l√©com"]
    roles = ["Stagiaire D√©veloppement Web", "Stagiaire Data Analyst", "Stagiaire DevOps", "Stagiaire S√©curit√© Informatique"]
    tasks = [
        "d√©veloppement d‚Äôun module interne",
        "analyse de donn√©es clients",
        "mise en place d‚Äôun pipeline CI/CD",
        "r√©daction de documentation technique",
        "tests et validation des fonctionnalit√©s"
    ]
    periods = ["Juin‚ÄìAo√ªt 2023", "Janvier‚ÄìJuin 2022", "F√©vrier‚ÄìAvril 2021", "Mars‚ÄìAo√ªt 2020"]
    return f"{random.choice(roles)} chez {random.choice(companies)} ({random.choice(periods)}) ‚Äì Missions : {random.choice(tasks)}"

def make_project():
    projects = [
        "Projet universitaire : cr√©ation d‚Äôune application e-commerce en Django",
        "Projet acad√©mique : d√©veloppement d‚Äôun chatbot NLP avec Python",
        "Projet personnel : site web de gestion de t√¢ches avec React et Node.js",
        "Projet de fin d‚Äô√©tudes : plateforme de recommandation musicale avec IA",
        "Projet open-source : contribution √† une librairie Python de machine learning",
        "Projet scientifique : analyse pr√©dictive des ventes avec Scikit-learn"
    ]
    return random.choice(projects)

def make_languages():
    langs = [
        "Fran√ßais (courant), Anglais (avanc√©), Arabe (natif)",
        "Anglais (TOEFL 95), Allemand (interm√©diaire)",
        "Italien (d√©butant), Fran√ßais (C2), Anglais (C1)",
        "Espagnol (B2), Arabe (langue maternelle)"
    ]
    return random.choice(langs)

def make_soft_skills():
    skills = [
        "Esprit d‚Äô√©quipe et sens de la communication",
        "Leadership et capacit√© de prise de d√©cision",
        "R√©solution de probl√®mes complexes",
        "Gestion du temps et organisation",
        "Cr√©ativit√© et pens√©e critique"
    ]
    return random.choice(skills)

def make_certification():
    certs = [
        "Certification AWS Solutions Architect ‚Äì Associate",
        "Certification Cisco CCNA Routing & Switching",
        "Certification PMP ‚Äì Project Management Professional",
        "Certification Microsoft Azure Fundamentals",
        "Certification Scrum Master (PSM I)"
    ]
    return random.choice(certs)

def make_profile():
    profiles = [
        "Ing√©nieur logiciel passionn√© avec 3 ans d'exp√©rience en d√©veloppement full stack et en gestion de projets agiles.",
        "Data Scientist sp√©cialis√© en machine learning et intelligence artificielle, avec un solide parcours acad√©mique et professionnel.",
        "√âtudiant en informatique motiv√© par le d√©veloppement web et la cr√©ation d'applications innovantes.",
        "Consultant Big Data orient√© r√©sultats, expert en analyse de donn√©es et optimisation des processus m√©tier.",
        "D√©veloppeur polyvalent ma√Ætrisant Python, JavaScript et les technologies cloud, avec une forte capacit√© √† r√©soudre des probl√®mes complexes."
    ]
    return random.choice(profiles)

GENERATORS = {
    "experience": make_experience,
    "internships": make_internship,
    "projects": make_project,
    "languages": make_languages,
    "soft_skills": make_soft_skills,
    "certifications": make_certification,
    "profile": make_profile
}

# ===============================
# MAIN SCRIPT
# ===============================
def main():
    nlp = spacy.blank("fr")
    print("üîπ Loading dev dataset...")
    doc_bin = DocBin().from_disk(DEV_FILE)
    docs = list(doc_bin.get_docs(nlp.vocab))
    print(f"‚úÖ Loaded {len(docs)} dev examples")

    # Count current examples
    counts = {}
    for d in docs:
        for label, v in d.cats.items():
            if v == 1.0:
                counts[label] = counts.get(label, 0) + 1

    max_count = max(counts.values())
    print("Largest class in dev:", max_count)

    # Generate synthetic examples
    new_docs = []
    for label in SMALL_LABELS:
        n_to_generate = max_count - counts.get(label, 0)
        gen = GENERATORS[label]
        for _ in range(n_to_generate):
            text = gen()
            doc = nlp.make_doc(text)
            doc.cats[label] = 1.0
            # other labels = 0
            for other_label in counts.keys():
                if other_label != label:
                    doc.cats[other_label] = 0.0
            new_docs.append(doc)
        print(f"‚úÖ Generated {n_to_generate} new examples for {label}")

    # Merge and save
    all_docs = docs + new_docs
    DocBin(docs=all_docs).to_disk(OUTPUT_FILE)
    print(f"üéâ Dev augmentation complete! Total examples: {len(all_docs)}")

if __name__ == "__main__":
    main()


üîπ Loading dev dataset...
‚úÖ Loaded 633 dev examples
Largest class in dev: 92
‚úÖ Generated 92 new examples for experience
‚úÖ Generated 28 new examples for internships
‚úÖ Generated 36 new examples for projects
‚úÖ Generated 90 new examples for languages
‚úÖ Generated 65 new examples for soft_skills
‚úÖ Generated 91 new examples for certifications
‚úÖ Generated 92 new examples for profile
üéâ Dev augmentation complete! Total examples: 1127


In [None]:
!python -m spacy init config config.cfg --lang fr --pipeline textcat --optimize accuracy


[38;5;3m‚ö† To generate a more effective transformer-based config (GPU-only),
install the spacy-transformers package and re-run this command. The config
generated now does not use transformers.[0m
[38;5;4m‚Ñπ Generated config template specific for your use case[0m
- Language: fr
- Pipeline: textcat
- Optimize for: accuracy
- Hardware: CPU
- Transformer: None
[38;5;2m‚úî Auto-filled config with all values[0m
[38;5;2m‚úî Saved config[0m
config.cfg
You can now add your data and train your pipeline:
python -m spacy train config.cfg --paths.train ./train.spacy --paths.dev ./dev.spacy


Trainnig process

In [None]:
!python -m spacy train config.cfg \
    --paths.train /content/drive/MyDrive/spacy_corpus_finale1/train_augmented.spacy \
    --paths.dev /content/drive/MyDrive/spacy_corpus_finale1/deva.spacy \
    --output  /content/drive/MyDrive/spacy_augmented_model \
    --gpu-id -1

[38;5;4m‚Ñπ Saving to output directory:
/content/drive/MyDrive/spacy_augmented_model[0m
[38;5;4m‚Ñπ Using CPU[0m
[1m
[38;5;2m‚úî Initialized pipeline[0m
[1m
[38;5;4m‚Ñπ Pipeline: ['tok2vec', 'textcat'][0m
[38;5;4m‚Ñπ Initial learn rate: 0.001[0m
E    #       LOSS TOK2VEC  LOSS TEXTCAT  CATS_SCORE  SCORE 
---  ------  ------------  ------------  ----------  ------
  0       0          0.00          0.07       15.78    0.16
  0     200         88.41          6.37       85.21    0.85
  0     400        271.72          4.47       89.37    0.89
  0     600       1044.02          3.69       83.42    0.83
  1     800       2517.88          3.35       92.12    0.92
  1    1000       5670.21          2.18       90.84    0.91
  1    1200       6218.41          2.14       89.74    0.90
  2    1400      18515.68          2.25       94.35    0.94
  3    1600      36444.35          1.98       95.40    0.95
  4    1800      58762.74          1.40       95.01    0.95
  5    2000      63853

Evaluation of first model

In [None]:
!python -m spacy evaluate /content/drive/MyDrive/spacy_augmented_model/model-best /content/drive/MyDrive/spacy_corpus_finale1/deva.spacy --gpu-id -1


[38;5;4m‚Ñπ Using CPU[0m
[1m

TOK                 100.00
TEXTCAT (macro F)   96.67 
SPEED               1901  

[1m

                        P        R        F
country             98.80    96.47    97.62
phone               98.28    95.00    96.61
soft_skills         97.75    94.57    96.13
education          100.00    97.78    98.88
projects            92.05    88.04    90.00
email              100.00    98.84    99.42
technical_skills    93.68    96.74    95.19
name                87.34    98.57    92.62
internships         89.47    93.41    91.40
languages          100.00    98.91    99.45
experience         100.00   100.00   100.00
certifications     100.00    98.91    99.45
profile            100.00   100.00   100.00

[1m

                   ROC AUC
country               1.00
phone                 1.00
soft_skills           0.99
education             0.99
projects              0.99
email                 1.00
technical_skills      1.00
name                  1.00
internships  

Augmentation of dev data

In [None]:
import spacy
from spacy.tokens import DocBin
import random
from pathlib import Path

# ===============================
# CONFIG
# ===============================
DEV_FILE = "/content/drive/MyDrive/spacy_corpus_finale1/deva.spacy"
OUTPUT_FILE = "/content/drive/MyDrive/spacy_corpus_finale1/dev_augmented.spacy"

# Labels to oversample
SMALL_LABELS = ["experience", "internships", "projects", "languages", "soft_skills", "certifications", "profile"]

# ===============================
# SECTION GENERATORS
# ===============================

def make_experience():
    companies = ["Capgemini", "Deloitte", "Sopra Steria", "BNP Paribas", "Vermeg", "Ooredoo", "Microsoft", "IBM"]
    roles = ["D√©veloppeur Full Stack", "Data Scientist", "Consultant Big Data", "Ing√©nieur Cloud", "Analyste Cybers√©curit√©"]
    tasks = [
        "d√©veloppement et maintenance d‚Äôapplications",
        "mise en place de pipelines de donn√©es",
        "d√©ploiement de solutions cloud s√©curis√©es",
        "optimisation de mod√®les d‚ÄôIA",
        "gestion d‚Äô√©quipes agiles et reporting",
        "analyse des besoins clients et r√©daction des sp√©cifications",
        "conception d‚Äôarchitectures logicielles √©volutives"
    ]
    years = ["2020‚Äì2022", "2019‚Äì2021", "2021‚Äì2023", "2022‚Äì2024"]
    num_tasks = random.randint(2, 4)
    responsibilities = " ; ".join(random.sample(tasks, num_tasks))
    return f"{random.choice(roles)} chez {random.choice(companies)} ({random.choice(years)}) ‚Äì Responsabilit√©s : {responsibilities}"

def make_experience_paragraph():
    n = random.randint(2, 4)
    connectors = [
        "Durant ces exp√©riences professionnelles, j'ai travaill√© sur plusieurs projets significatifs :",
        "Ces missions m'ont permis de d√©velopper des comp√©tences cl√©s dans le domaine :",
        "Au cours de ces exp√©riences, j'ai acquis de solides connaissances en :"
    ]
    base_text = " ".join(make_experience() for _ in range(n))
    return f"{random.choice(connectors)} {base_text}"

def make_internship_paragraph():
    companies = ["Orange", "Eni", "STMicroelectronics", "Sagemcom", "SNCFT", "Tunisie T√©l√©com"]
    roles = ["Stagiaire D√©veloppement Web", "Stagiaire Data Analyst", "Stagiaire DevOps", "Stagiaire S√©curit√© Informatique"]
    tasks = [
        "d√©veloppement d‚Äôun module interne",
        "analyse de donn√©es clients",
        "mise en place d‚Äôun pipeline CI/CD",
        "r√©daction de documentation technique",
        "tests et validation des fonctionnalit√©s",
        "participation √† des r√©unions de planification et d'√©valuation"
    ]
    periods = ["Juin‚ÄìAo√ªt 2023", "Janvier‚ÄìJuin 2022", "F√©vrier‚ÄìAvril 2021", "Mars‚ÄìAo√ªt 2020"]

    n = random.randint(3, 5)
    sentences = []
    for _ in range(n):
        sentences.append(f"{random.choice(roles)} chez {random.choice(companies)} ({random.choice(periods)}), o√π j'ai effectu√© {random.choice(tasks)}.")
    connector = "Au cours de mes stages, j'ai r√©alis√© plusieurs missions importantes :"
    return f"{connector} {' '.join(sentences)}"

def make_project_paragraph():
    projects = [
        "un projet universitaire de cr√©ation d‚Äôune application e-commerce en Django",
        "un projet acad√©mique de d√©veloppement d‚Äôun chatbot NLP avec Python",
        "un projet personnel de site web de gestion de t√¢ches avec React et Node.js",
        "un projet de fin d‚Äô√©tudes : plateforme de recommandation musicale avec IA",
        "une contribution open-source √† une librairie Python de machine learning",
        "un projet scientifique d‚Äôanalyse pr√©dictive des ventes avec Scikit-learn"
    ]
    n = random.randint(3, 5)
    sentences = [f"J'ai men√© {random.choice(projects)}." for _ in range(n)]
    connector = "Parmi les projets que j'ai r√©alis√©s, on peut citer :"
    return f"{connector} {' '.join(sentences)}"

def make_languages():
    langs = [
        "Fran√ßais (courant), Anglais (avanc√©), Arabe (natif)",
        "Anglais (TOEFL 95), Allemand (interm√©diaire)",
        "Italien (d√©butant), Fran√ßais (C2), Anglais (C1)",
        "Espagnol (B2), Arabe (langue maternelle)"
    ]
    return random.choice(langs)

def make_soft_skills():
    skills = [
        "Esprit d‚Äô√©quipe et sens de la communication",
        "Leadership et capacit√© de prise de d√©cision",
        "R√©solution de probl√®mes complexes",
        "Gestion du temps et organisation",
        "Cr√©ativit√© et pens√©e critique"
    ]
    return random.choice(skills)

def make_certification():
    certs = [
        "Certification AWS Solutions Architect ‚Äì Associate",
        "Certification Cisco CCNA Routing & Switching",
        "Certification PMP ‚Äì Project Management Professional",
        "Certification Microsoft Azure Fundamentals",
        "Certification Scrum Master (PSM I)"
    ]
    return random.choice(certs)

def make_profile():
    profiles = [
        "Ing√©nieur logiciel passionn√© avec 3 ans d'exp√©rience en d√©veloppement full stack et en gestion de projets agiles.",
        "Data Scientist sp√©cialis√© en machine learning et intelligence artificielle, avec un solide parcours acad√©mique et professionnel.",
        "√âtudiant en informatique motiv√© par le d√©veloppement web et la cr√©ation d'applications innovantes.",
        "Consultant Big Data orient√© r√©sultats, expert en analyse de donn√©es et optimisation des processus m√©tier.",
        "D√©veloppeur polyvalent ma√Ætrisant Python, JavaScript et les technologies cloud, avec une forte capacit√© √† r√©soudre des probl√®mes complexes."
    ]
    return random.choice(profiles)

# ===============================
# SECTION MAPPER
# ===============================
def make_section_paragraph(label):
    if label == "experience":
        return make_experience_paragraph()
    elif label == "internships":
        return make_internship_paragraph()
    elif label == "projects":
        return make_project_paragraph()
    elif label == "languages":
        return make_languages()
    elif label == "soft_skills":
        return make_soft_skills()
    elif label == "certifications":
        return make_certification()
    elif label == "profile":
        return make_profile()
    else:
        return ""

def make_full_cv():
    """Generate a full CV with all sections concatenated naturally."""
    sections = SMALL_LABELS.copy()
    random.shuffle(sections)
    return " ".join(make_section_paragraph(label) for label in sections)

# ===============================
# MAIN SCRIPT
# ===============================
def main():
    nlp = spacy.blank("fr")
    print("üîπ Loading dev dataset...")
    doc_bin = DocBin().from_disk(DEV_FILE)
    docs = list(doc_bin.get_docs(nlp.vocab))
    print(f"‚úÖ Loaded {len(docs)} dev examples")

    # Count current examples per label
    counts = {}
    for d in docs:
        for label, v in d.cats.items():
            if v == 1.0:
                counts[label] = counts.get(label, 0) + 1

    max_count = max(counts.values())
    print("Largest class in dev:", max_count)

    # Generate synthetic full CVs
    new_docs = []
    for label in SMALL_LABELS:
        n_to_generate = max_count - counts.get(label, 0)
        for _ in range(n_to_generate):
            text = make_full_cv()
            doc = nlp.make_doc(text)
            doc.cats = {lbl: float(lbl==label) for lbl in counts.keys()}
            new_docs.append(doc)
        print(f"‚úÖ Generated {n_to_generate} synthetic full CVs for {label}")

    # Merge and save
    all_docs = docs + new_docs
    DocBin(docs=all_docs).to_disk(OUTPUT_FILE)
    print(f"üéâ Dev augmentation complete! Total examples: {len(all_docs)}")

if __name__ == "__main__":
    main()


üîπ Loading dev dataset...
‚úÖ Loaded 1127 dev examples
Largest class in dev: 92
‚úÖ Generated 0 synthetic full CVs for experience
‚úÖ Generated 0 synthetic full CVs for internships
‚úÖ Generated 0 synthetic full CVs for projects
‚úÖ Generated 0 synthetic full CVs for languages
‚úÖ Generated 0 synthetic full CVs for soft_skills
‚úÖ Generated 0 synthetic full CVs for certifications
‚úÖ Generated 0 synthetic full CVs for profile
üéâ Dev augmentation complete! Total examples: 1127


Augmentation of train data

In [None]:
import spacy
from spacy.tokens import DocBin
import random
from pathlib import Path

# ===============================
# CONFIG
# ===============================
OUTPUT_FILE = "/content/drive/MyDrive/spacy_corpus_finale1/train_augmented1.spacy"
TRAIN_FILE = "/content/drive/MyDrive/spacy_corpus_finale1/train_augmented.spacy"

# Labels to oversample
SMALL_LABELS = ["experience", "internships", "projects", "languages", "soft_skills", "certifications", "profile"]

# ===============================
# SECTION GENERATORS
# ===============================

# Experience generator
def make_experience():
    companies = ["Capgemini", "Deloitte", "Sopra Steria", "BNP Paribas", "Vermeg", "Ooredoo", "Microsoft", "IBM"]
    roles = ["D√©veloppeur Full Stack", "Data Scientist", "Consultant Big Data", "Ing√©nieur Cloud", "Analyste Cybers√©curit√©"]
    tasks = [
        "d√©veloppement et maintenance d‚Äôapplications",
        "mise en place de pipelines de donn√©es",
        "d√©ploiement de solutions cloud s√©curis√©es",
        "optimisation de mod√®les d‚ÄôIA",
        "gestion d‚Äô√©quipes agiles et reporting",
        "analyse des besoins clients et r√©daction des sp√©cifications",
        "conception d‚Äôarchitectures logicielles √©volutives"
    ]
    years = ["2020‚Äì2022", "2019‚Äì2021", "2021‚Äì2023", "2022‚Äì2024"]
    num_tasks = random.randint(2, 4)
    responsibilities = " ; ".join(random.sample(tasks, num_tasks))
    return f"{random.choice(roles)} chez {random.choice(companies)} ({random.choice(years)}) ‚Äì Responsabilit√©s : {responsibilities}"

def make_experience_paragraph():
    n = random.randint(2, 4)
    connectors = [
        "Durant ces exp√©riences professionnelles, j'ai travaill√© sur plusieurs projets significatifs :",
        "Ces missions m'ont permis de d√©velopper des comp√©tences cl√©s dans le domaine :",
        "Au cours de ces exp√©riences, j'ai acquis de solides connaissances en :"
    ]
    base_text = " ".join(make_experience() for _ in range(n))
    return f"{random.choice(connectors)} {base_text}"

# Long internship paragraph generator
def make_internship_paragraph():
    companies = ["Orange", "Eni", "STMicroelectronics", "Sagemcom", "SNCFT", "Tunisie T√©l√©com"]
    roles = ["Stagiaire D√©veloppement Web", "Stagiaire Data Analyst", "Stagiaire DevOps", "Stagiaire S√©curit√© Informatique"]
    tasks = [
        "d√©veloppement d‚Äôun module interne",
        "analyse de donn√©es clients",
        "mise en place d‚Äôun pipeline CI/CD",
        "r√©daction de documentation technique",
        "tests et validation des fonctionnalit√©s",
        "participation √† des r√©unions de planification et d'√©valuation"
    ]
    periods = ["Juin‚ÄìAo√ªt 2023", "Janvier‚ÄìJuin 2022", "F√©vrier‚ÄìAvril 2021", "Mars‚ÄìAo√ªt 2020"]

    n = random.randint(3, 5)  # number of internships in the paragraph
    sentences = []
    for _ in range(n):
        sentences.append(f"{random.choice(roles)} chez {random.choice(companies)} ({random.choice(periods)}), o√π j'ai effectu√© {random.choice(tasks)}.")
    connector = "Au cours de mes stages, j'ai r√©alis√© plusieurs missions importantes :"
    return f"{connector} {' '.join(sentences)}"

# Long project paragraph generator
def make_project_paragraph():
    projects = [
        "un projet universitaire de cr√©ation d‚Äôune application e-commerce en Django",
        "un projet acad√©mique de d√©veloppement d‚Äôun chatbot NLP avec Python",
        "un projet personnel de site web de gestion de t√¢ches avec React et Node.js",
        "un projet de fin d‚Äô√©tudes : plateforme de recommandation musicale avec IA",
        "une contribution open-source √† une librairie Python de machine learning",
        "un projet scientifique d‚Äôanalyse pr√©dictive des ventes avec Scikit-learn"
    ]
    n = random.randint(3, 5)
    sentences = [f"J'ai men√© {random.choice(projects)}." for _ in range(n)]
    connector = "Parmi les projets que j'ai r√©alis√©s, on peut citer :"
    return f"{connector} {' '.join(sentences)}"

# Languages
def make_languages():
    langs = [
        "Fran√ßais (courant), Anglais (avanc√©), Arabe (natif)",
        "Anglais (TOEFL 95), Allemand (interm√©diaire)",
        "Italien (d√©butant), Fran√ßais (C2), Anglais (C1)",
        "Espagnol (B2), Arabe (langue maternelle)"
    ]
    return random.choice(langs)

# Soft skills
def make_soft_skills():
    skills = [
        "Esprit d‚Äô√©quipe et sens de la communication",
        "Leadership et capacit√© de prise de d√©cision",
        "R√©solution de probl√®mes complexes",
        "Gestion du temps et organisation",
        "Cr√©ativit√© et pens√©e critique"
    ]
    return random.choice(skills)

# Certifications
def make_certification():
    certs = [
        "Certification AWS Solutions Architect ‚Äì Associate",
        "Certification Cisco CCNA Routing & Switching",
        "Certification PMP ‚Äì Project Management Professional",
        "Certification Microsoft Azure Fundamentals",
        "Certification Scrum Master (PSM I)"
    ]
    return random.choice(certs)

# Profile
def make_profile():
    profiles = [
        "Ing√©nieur logiciel passionn√© avec 3 ans d'exp√©rience en d√©veloppement full stack et en gestion de projets agiles.",
        "Data Scientist sp√©cialis√© en machine learning et intelligence artificielle, avec un solide parcours acad√©mique et professionnel.",
        "√âtudiant en informatique motiv√© par le d√©veloppement web et la cr√©ation d'applications innovantes.",
        "Consultant Big Data orient√© r√©sultats, expert en analyse de donn√©es et optimisation des processus m√©tier.",
        "D√©veloppeur polyvalent ma√Ætrisant Python, JavaScript et les technologies cloud, avec une forte capacit√© √† r√©soudre des probl√®mes complexes."
    ]
    return random.choice(profiles)

# ===============================
# SECTION MAPPER
# ===============================
def make_section_paragraph(label):
    if label == "experience":
        return make_experience_paragraph()
    elif label == "internships":
        return make_internship_paragraph()
    elif label == "projects":
        return make_project_paragraph()
    elif label == "languages":
        return make_languages()
    elif label == "soft_skills":
        return make_soft_skills()
    elif label == "certifications":
        return make_certification()
    elif label == "profile":
        return make_profile()
    else:
        return ""

def make_full_cv():
    """Generate a full CV with all sections concatenated naturally."""
    sections = SMALL_LABELS.copy()
    random.shuffle(sections)
    return " ".join(make_section_paragraph(label) for label in sections)

# ===============================
# MAIN SCRIPT
# ===============================
def main():
    nlp = spacy.blank("fr")
    print("üîπ Loading train dataset...")
    doc_bin = DocBin().from_disk(TRAIN_FILE)
    docs = list(doc_bin.get_docs(nlp.vocab))
    print(f"‚úÖ Loaded {len(docs)} train examples")

    # Count current examples per label
    counts = {}
    for d in docs:
        for label, v in d.cats.items():
            if v == 1.0:
                counts[label] = counts.get(label, 0) + 1

    max_count = max(counts.values())
    print("Largest class in train:", max_count)

    # Generate synthetic full CVs
    new_docs = []
    for label in SMALL_LABELS:
        n_to_generate = max_count - counts.get(label, 0)
        for _ in range(n_to_generate):
            text = make_full_cv()
            doc = nlp.make_doc(text)
            doc.cats = {lbl: float(lbl==label) for lbl in counts.keys()}
            new_docs.append(doc)
        print(f"‚úÖ Generated {n_to_generate} synthetic full CVs for {label}")

    # Merge and save
    all_docs = docs + new_docs
    DocBin(docs=all_docs).to_disk(OUTPUT_FILE)
    print(f"üéâ Train augmentation complete! Total examples: {len(all_docs)}")

if __name__ == "__main__":
    main()


üîπ Loading train dataset...
‚úÖ Loaded 4720 train examples
Largest class in train: 426
‚úÖ Generated 0 synthetic full CVs for experience
‚úÖ Generated 64 synthetic full CVs for internships
‚úÖ Generated 56 synthetic full CVs for projects
‚úÖ Generated 2 synthetic full CVs for languages
‚úÖ Generated 27 synthetic full CVs for soft_skills
‚úÖ Generated 1 synthetic full CVs for certifications
‚úÖ Generated 0 synthetic full CVs for profile
üéâ Train augmentation complete! Total examples: 4870


Creating second model after data augmentation

In [None]:
!python -m spacy init config ./config.cfg --lang fr --pipeline textcat --optimize accuracy


[38;5;3m‚ö† To generate a more effective transformer-based config (GPU-only),
install the spacy-transformers package and re-run this command. The config
generated now does not use transformers.[0m
[38;5;4m‚Ñπ Generated config template specific for your use case[0m
- Language: fr
- Pipeline: textcat
- Optimize for: accuracy
- Hardware: CPU
- Transformer: None
[38;5;2m‚úî Auto-filled config with all values[0m
[38;5;2m‚úî Saved config[0m
config.cfg
You can now add your data and train your pipeline:
python -m spacy train config.cfg --paths.train ./train.spacy --paths.dev ./dev.spacy


In [None]:
!python -m spacy train --output /content/drive/MyDrive/spacy_corpus_finale1/output_model \
    /content/config.cfg \
    --paths.train /content/drive/MyDrive/spacy_corpus_finale1/train_augmented1.spacy \
    --paths.dev /content/drive/MyDrive/spacy_corpus_finale1/dev_augmented.spacy \
    --gpu-id -1


[38;5;2m‚úî Created output directory:
/content/drive/MyDrive/spacy_corpus_finale1/output_model[0m
[38;5;4m‚Ñπ Saving to output directory:
/content/drive/MyDrive/spacy_corpus_finale1/output_model[0m
[38;5;4m‚Ñπ Using CPU[0m
[1m
[38;5;2m‚úî Initialized pipeline[0m
[1m
[38;5;4m‚Ñπ Pipeline: ['tok2vec', 'textcat'][0m
[38;5;4m‚Ñπ Initial learn rate: 0.001[0m
E    #       LOSS TOK2VEC  LOSS TEXTCAT  CATS_SCORE  SCORE 
---  ------  ------------  ------------  ----------  ------
  0       0          0.00          0.07        7.52    0.08
  0     200         69.00          7.67       73.43    0.73
  0     400        228.15          6.44       87.09    0.87
  0     600        342.48          6.70       85.17    0.85
  0     800        823.70          5.29       82.68    0.83
  1    1000       1619.14          6.18       93.77    0.94
  1    1200       1608.95          6.12       92.58    0.93
  1    1400       2470.53          4.00       91.64    0.92


Evaluation of second model

In [9]:
!python -m spacy evaluate /content/drive/MyDrive/model-best /content/drive/MyDrive/spacy_corpus_finale1/deva.spacy --gpu-id -1


[38;5;4m‚Ñπ Using CPU[0m
[38;5;4m‚Ñπ To switch to GPU 0, use the option: --gpu-id 0[0m
[1m

TOK                 100.00
TEXTCAT (macro F)   93.77 
SPEED               2681  

[1m

                        P        R       F
country             96.51    97.65   97.08
phone               98.36   100.00   99.17
soft_skills         97.47    83.70   90.06
education           96.51    92.22   94.32
projects            75.47    86.96   80.81
email              100.00    98.84   99.42
technical_skills    96.59    92.39   94.44
name                92.11   100.00   95.89
internships         86.67    71.43   78.31
languages           97.85    98.91   98.38
experience          96.84   100.00   98.40
certifications      88.35    98.91   93.33
profile             98.92   100.00   99.46

[1m

                   ROC AUC
country               0.99
phone                 1.00
soft_skills           0.98
education             0.99
projects              0.99
email                 1.00
technical_skills 