In [1]:
# Complete spaCy NER Training - OPTIMIZED FOR SMALL DATASET
# Fixed training parameters for 75 documents

import json
import spacy
from spacy.tokens import Doc
from spacy.training import Example
from spacy.tokens import DocBin
from spacy.util import filter_spans
import os
import random
from tqdm import tqdm
from spacy.scorer import Scorer
import subprocess
import sys

# --- Step 0: best-effort installation of the pinned package versions ---
# pip is invoked through the interpreter that is running this script.
# NOTE(review): this runs after the `import spacy` at the top of the file, so
# an upgrade performed here presumably only affects processes started later
# (and later runs of this script), not the modules already imported -- confirm.
for banner_line in ("=" * 70, "--- Step 0: Ensuring Correct Libraries ---", "=" * 70):
    print(banner_line)

pinned_requirements = ['spacy==3.7.2', 'typer<0.10.0', 'click<8.2.0', 'tqdm']
pip_install_cmd = [sys.executable, '-m', 'pip', 'install', '--upgrade'] + pinned_requirements

try:
    subprocess.check_call(pip_install_cmd)
    print("‚úì Libraries installed successfully\n")
except Exception as e:
    # A failed install is only reported; the script carries on with whatever
    # versions are already available.
    print(f"‚ö† Warning: Could not install libraries. Error: {e}\n")

print("=" * 70)
print("--- Step 1: Loading and Cleaning Data ---")
print("=" * 70)

ANNOTATIONS_PATH = "../training_data/annotations.json"
SOURCE_FILES_DIR = "../training_data/source_files/"
training_data = []


def _extract_entities(task):
    """Return the labelled character spans stored in one annotation task.

    Only the first entry of ``task['annotations']`` is read, and within it
    only ``result`` items whose ``type`` is ``'labels'``. An item is skipped
    with a printed warning (instead of raising and aborting the whole load)
    when its ``start``/``end`` are not integers, when ``start >= end``, or
    when its ``labels`` list is missing or empty.

    Args:
        task: One task dict from the annotations JSON file.

    Returns:
        A list of ``{"start": int, "end": int, "label": str}`` dicts; empty
        when the task carries no usable annotation.
    """
    annotation_sets = task.get('annotations') or []
    if not annotation_sets or not isinstance(annotation_sets[0], dict):
        return []

    entities = []
    for item in annotation_sets[0].get('result') or []:
        if not isinstance(item, dict) or item.get('type') != 'labels':
            continue
        value = item.get('value') or {}
        start, end = value.get('start'), value.get('end')
        labels = value.get('labels') or []
        offsets_ok = isinstance(start, int) and isinstance(end, int) and start < end
        if not offsets_ok or not labels:
            print(f"WARNING: Task {task.get('id')}: skipping malformed entity {value!r}")
            continue
        # Only the first label of an item is used.
        entities.append({"start": start, "end": end, "label": labels[0]})
    return entities


# Both inputs must exist before anything else is attempted.
for required_path, description in (
    (ANNOTATIONS_PATH, "annotations.json"),
    (SOURCE_FILES_DIR, "source_files directory"),
):
    if not os.path.exists(required_path):
        print(f"‚ùå ERROR: {description} not found at {required_path}")
        sys.exit(1)

with open(ANNOTATIONS_PATH, 'r', encoding='utf-8') as annotations_file:
    tasks = json.load(annotations_file)

print(f"Found {len(tasks)} tasks in annotations.json\n")

for task in tasks:
    task_id = task.get("id")
    if task_id is None:
        continue

    # The source text of a task is expected in "<task id>.txt".
    original_filename = f"{task_id}.txt"
    text_filepath = os.path.join(SOURCE_FILES_DIR, original_filename)

    if not os.path.exists(text_filepath):
        print(f"‚ö† Skipping task {task_id}: {original_filename} not found")
        continue

    with open(text_filepath, 'r', encoding='utf-8') as source_file:
        text = source_file.read()

    annotations = _extract_entities(task)
    if not annotations:
        # Previously such tasks vanished without a trace, which made the
        # final document count hard to reconcile with the task count.
        print(f"WARNING: Task {task_id}: no usable entity annotations, skipped")
        continue

    training_data.append({
        "text": text,
        "entities": annotations
    })
    print(f"‚úì Task {task_id}: {len(annotations)} entities found")

print(f"\n‚úì Successfully loaded {len(training_data)} documents with annotations\n")

if not training_data:
    print("‚ùå ERROR: No training data loaded!")
    sys.exit(1)

# --- Step 2 (setup): pipeline object, DocBin containers, train/validation split ---
for banner_line in ("=" * 70, "--- Step 2: Creating .spacy Files ---", "=" * 70):
    print(banner_line)

# Blank pipeline for language code "xx"; it is used below to tokenize texts.
nlp = spacy.blank("xx")
db_train, db_valid = DocBin(), DocBin()

# Fixed seed so the shuffle, and therefore the split, is the same on every run.
random.seed(42)
random.shuffle(training_data)

# First 80% of the shuffled records (rounded down) train, the rest validate.
split_point = int(len(training_data) * 0.8)
train_set, valid_set = training_data[:split_point], training_data[split_point:]

for split_label, split_records, trailer in (
    ("Train", train_set, ""),
    ("Valid", valid_set, "\n"),
):
    print(f"{split_label} set: {len(split_records)} documents{trailer}")

def create_spacy_docs(data_set, set_name="", alignment_mode="contract"):
    """Convert annotated records into spaCy ``Doc`` objects with entities set.

    Args:
        data_set: Iterable of dicts with a ``"text"`` string and an
            ``"entities"`` list of ``{"start", "end", "label"}`` dicts whose
            offsets are character positions in that text.
        set_name: Name used for the progress bar and in printed messages.
        alignment_mode: Forwarded to ``Doc.char_span``; controls what happens
            when offsets do not fall on token boundaries. Defaults to
            ``"contract"``, the value this function always used.

    Returns:
        A list with one ``Doc`` per record, tokenized with the module-level
        ``nlp`` pipeline.

    Entities can be lost in two places: ``char_span`` returns ``None`` when
    offsets cannot be aligned to tokens, and ``filter_spans`` removes
    overlapping or duplicate spans. Both losses are counted and reported once
    at the end instead of happening silently, because on a small dataset a
    handful of dropped labels noticeably changes what the model can learn.
    """
    docs = []
    total_entities = 0
    unaligned = 0
    filtered_out = 0
    print(f"Converting {set_name} documents to spaCy format...")

    for item in tqdm(data_set, desc=set_name):
        doc = nlp.make_doc(item['text'])
        ents = []

        for entity in item['entities']:
            total_entities += 1
            span = doc.char_span(
                entity['start'],
                entity['end'],
                label=entity['label'],
                alignment_mode=alignment_mode
            )
            if span is None:
                unaligned += 1
                continue
            ents.append(span)

        kept = filter_spans(ents)
        filtered_out += len(ents) - len(kept)
        doc.ents = kept
        docs.append(doc)

    dropped = unaligned + filtered_out
    if dropped:
        # Printed after the loop so the message does not break the progress bar.
        print(
            f"WARNING: {set_name}: dropped {dropped} of {total_entities} entities "
            f"({unaligned} not alignable to tokens, {filtered_out} overlapping/duplicate)"
        )

    return docs

# Build the Doc objects for both splits.
train_docs = create_spacy_docs(train_set, "Training")
valid_docs = create_spacy_docs(valid_set, "Validation")

# Fill each DocBin from its matching list of docs (training first).
for doc_bin, split_docs in ((db_train, train_docs), (db_valid, valid_docs)):
    for doc in split_docs:
        doc_bin.add(doc)

# Serialize both DocBins into ./training, creating the directory if needed.
os.makedirs("./training", exist_ok=True)
for doc_bin, target_path in (
    (db_train, "./training/train.spacy"),
    (db_valid, "./training/valid.spacy"),
):
    doc_bin.to_disk(target_path)

print(f"\n‚úì Saved train.spacy ({len(train_docs)} documents)")
print(f"‚úì Saved valid.spacy ({len(valid_docs)} documents)\n")

# --- Step 2.5: write the spaCy training configuration to disk ---
print("=" * 70)
print("--- Step 2.5: Creating spaCy Configuration File ---")
print("=" * 70)

# Location of the generated config; the same path is handed to the
# `spacy train` command in Step 3.
CONFIG_PATH = "./training/config.cfg"

# Training configuration, written verbatim to CONFIG_PATH.
# Settings chosen for the small dataset (75 docs):
# - max_epochs = 100 (more training time)
# - patience = 1600 (don't stop early)
# - eval_frequency = 50 (evaluate more often)
# - batcher "spacy.batch_by_words.v1" with size = 1000
# Other points visible in the text below:
# - the pipeline consists of a single "ner" component for lang "xx"
# - [paths] train/dev are null here; Step 3 supplies them on the command
#   line via --paths.train / --paths.dev
# - the learning rate uses "warmup_linear.v1" (initial_rate 0.001,
#   250 warmup steps, 20000 total steps)
# NOTE: the whole literal is runtime data; do not reformat it.
config_content = """[paths]
train = null
dev = null

[system]
seed = 0
gpu_allocator = null
allow_alloc_mb = 5000

[nlp]
lang = "xx"
pipeline = ["ner"]
disabled = []
batch_size = 128

[components]

[components.ner]
factory = "ner"
moves = null
update_with_oracle_cut_size = 0

[components.ner.model]
@architectures = "spacy.TransitionBasedParser.v2"
state_type = "ner"
extra_state_tokens = false
hidden_width = 64
maxout_pieces = 2
use_upper = true

[components.ner.model.tok2vec]
@architectures = "spacy.Tok2Vec.v2"

[components.ner.model.tok2vec.embed]
@architectures = "spacy.CharacterEmbed.v2"
width = 96
rows = 7000
nM = 64
nC = 8
include_static_vectors = false

[components.ner.model.tok2vec.encode]
@architectures = "spacy.MaxoutWindowEncoder.v2"
width = 96
depth = 4
window_size = 1
maxout_pieces = 3

[training]
dev_corpus = "corpora.dev"
train_corpus = "corpora.train"
seed = ${system.seed}
gpu_allocator = ${system.gpu_allocator}
dropout = 0.1
max_epochs = 100
patience = 1600
max_steps = 0
eval_frequency = 50
accumulate_gradient = 1
score_weights = {"ents_f": 1.0, "ents_p": 0.0, "ents_r": 0.0}

[training.batcher]
@batchers = "spacy.batch_by_words.v1"
size = 1000
discard_oversize = false
tolerance = 0.2

[training.logger]
@loggers = "spacy.ConsoleLogger.v1"
progress_bar = true

[training.optimizer]
@optimizers = "Adam.v1"

[training.optimizer.learn_rate]
@schedules = "warmup_linear.v1"
warmup_steps = 250
total_steps = 20000
initial_rate = 0.001

[corpora]

[corpora.train]
@readers = "spacy.Corpus.v1"
path = ${paths.train}
max_length = 0
gold_preproc = false
limit = 0
augmenter = null

[corpora.dev]
@readers = "spacy.Corpus.v1"
path = ${paths.dev}
max_length = 0
gold_preproc = false
limit = 0
augmenter = null

[initialize]
components = {}
before_init = null
after_init = null
"""

# Overwrites any existing file at CONFIG_PATH. The ./training directory is
# created earlier in the script, before the .spacy files are saved.
with open(CONFIG_PATH, 'w', encoding='utf-8') as f:
    f.write(config_content)

print(f"‚úì Config file created at {CONFIG_PATH}\n")

# --- Step 3: run `spacy train` as a child process ---
for banner_line in (
    "=" * 70,
    "--- Step 3: Training NER Model (Persian & English) ---",
    "=" * 70,
):
    print(banner_line)

OUTPUT_DIR = "../models/librarian_ner_model_v1"
os.makedirs(OUTPUT_DIR, exist_ok=True)

# Equivalent to:
#   python -m spacy train <config> --output <dir> --paths.train ... --paths.dev ...
# run with the same interpreter that executes this script.
train_cmd = [sys.executable, '-m', 'spacy', 'train', CONFIG_PATH]
train_cmd += ['--output', OUTPUT_DIR]
train_cmd += ['--paths.train', './training/train.spacy']
train_cmd += ['--paths.dev', './training/valid.spacy']

print(f"Output directory: {OUTPUT_DIR}\n")
print("Starting training (this may take 5-15 minutes)...\n")

try:
    # Output is not captured, so the trainer's own progress table is shown
    # directly; check=True turns a non-zero exit status into an exception.
    result = subprocess.run(train_cmd, check=True, capture_output=False)
except subprocess.CalledProcessError as e:
    print(f"\n‚ùå Training failed with error code {e.returncode}")
    print(f"Error: {e}\n")
    sys.exit(1)
else:
    print("\n‚úì Training completed successfully!\n")

# --- Step 4: load the best checkpoint and score it on the validation docs ---
print("=" * 70)
print("--- Step 4: Evaluating Model on Validation Set ---")
print("=" * 70)

MODEL_PATH = os.path.join(OUTPUT_DIR, "model-best")

if not os.path.exists(MODEL_PATH):
    print(f"‚ùå ERROR: Model not found at {MODEL_PATH}")
    print("Training may have failed. Check the output above.\n")
    sys.exit(1)

print(f"Loading model from: {MODEL_PATH}\n")

try:
    nlp_custom = spacy.load(MODEL_PATH)
    print(f"‚úì Model loaded successfully\n")
except Exception as e:
    print(f"‚ùå Could not load model: {e}\n")
    sys.exit(1)


def _as_score(value):
    """Return *value* as a float, mapping a missing metric (``None``) to 0.0.

    ``dict.get(key, 0)`` only applies its default when the key is absent; a
    key that is present with a ``None`` value would otherwise reach the
    ``:.4f`` format spec and raise ``TypeError``.
    """
    return 0.0 if value is None else float(value)


print("Creating evaluation examples...\n")
# Each Example pairs the model's prediction for a text with the gold doc
# built from the annotations for that same text.
eval_examples = []
for doc in valid_docs:
    pred_doc = nlp_custom(doc.text)
    eval_examples.append(Example(pred_doc, doc))

print("Scoring model on validation set...\n")
# Score with the loaded pipeline's own components rather than letting Scorer
# construct an unrelated default pipeline.
scorer = Scorer(nlp_custom)
scores = scorer.score(eval_examples)

print("=" * 70)
print("--- EVALUATION RESULTS ---")
print("=" * 70)

if scores:
    print(f"\nüìä Overall F1-Score: {_as_score(scores.get('ents_f')):.4f}")
    print(f"   Precision: {_as_score(scores.get('ents_p')):.4f}")
    print(f"   Recall: {_as_score(scores.get('ents_r')):.4f}")

    per_type_scores = scores.get('ents_per_type')
    if per_type_scores:
        print("\nüìã Scores per Entity Type:")
        print("-" * 70)
        for label, metrics in sorted(per_type_scores.items()):
            f_score = _as_score(metrics.get('f'))
            precision = _as_score(metrics.get('p'))
            recall = _as_score(metrics.get('r'))
            print(f"  {label.upper():20} | F1: {f_score:.4f} | P: {precision:.4f} | R: {recall:.4f}")
        print("-" * 70)

    print(f"\n‚úì Validation set size: {len(eval_examples)} documents")
    print(f"‚úì Training set size: {len(train_docs)} documents")
else:
    print("‚ùå No scores returned from evaluation")

print("\n" + "=" * 70)
print("‚úì TRAINING AND EVALUATION COMPLETE!")
print("=" * 70)
print(f"\nModel saved at: {OUTPUT_DIR}")
print(f"Best model at: {MODEL_PATH}")
print("\nYou can now use the model with:")
print(f"  nlp = spacy.load('{MODEL_PATH}')")
print(f"  doc = nlp('your text here')")
print(f"  for ent in doc.ents:")
print(f"      print(ent.text, ent.label_)\n")


--- Step 0: Ensuring Correct Libraries ---
‚úì Libraries installed successfully

--- Step 1: Loading and Cleaning Data ---
Found 75 tasks in annotations.json

‚úì Task 1: 3 entities found
‚úì Task 2: 6 entities found
‚úì Task 3: 4 entities found
‚úì Task 4: 4 entities found
‚úì Task 5: 7 entities found
‚úì Task 6: 3 entities found
‚úì Task 7: 6 entities found
‚úì Task 8: 8 entities found
‚úì Task 9: 5 entities found
‚úì Task 10: 6 entities found
‚úì Task 11: 6 entities found
‚úì Task 12: 3 entities found
‚úì Task 13: 3 entities found
‚úì Task 14: 3 entities found
‚úì Task 15: 4 entities found
‚úì Task 16: 6 entities found
‚úì Task 17: 4 entities found
‚úì Task 18: 6 entities found
‚úì Task 19: 5 entities found
‚úì Task 20: 5 entities found
‚úì Task 21: 5 entities found
‚úì Task 22: 3 entities found
‚úì Task 23: 6 entities found
‚úì Task 24: 3 entities found
‚úì Task 25: 6 entities found
‚úì Task 26: 5 entities found
‚úì Task 27: 4 entities found
‚úì Task 28: 5 entities found
‚úì Task 2

  from .autonotebook import tqdm as notebook_tqdm
  torch.utils._pytree._register_pytree_node(


Train set: 60 documents
Valid set: 15 documents

Converting Training documents to spaCy format...


Training: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 60/60 [00:00<00:00, 166.60it/s]


Converting Validation documents to spaCy format...


Validation: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 15/15 [00:00<00:00, 169.26it/s]



‚úì Saved train.spacy (60 documents)
‚úì Saved valid.spacy (15 documents)

--- Step 2.5: Creating spaCy Configuration File ---
‚úì Config file created at ./training/config.cfg

--- Step 3: Training NER Model (Persian & English) ---
Output directory: ../models/librarian_ner_model_v1

Starting training (this may take several minutes)...



  torch.utils._pytree._register_pytree_node(


[38;5;4m‚Ñπ Saving to output directory: ../models/librarian_ner_model_v1[0m
[38;5;4m‚Ñπ Using CPU[0m
[1m
[38;5;2m‚úî Initialized pipeline[0m
[1m
[38;5;4m‚Ñπ Pipeline: ['ner'][0m
[38;5;4m‚Ñπ Initial learn rate: 0.0[0m
E    #       LOSS NER  ENTS_F  ENTS_P  ENTS_R  SCORE 
---  ------  --------  ------  ------  ------  ------
  0       0    173.71    0.00    0.00    0.00    0.00
[38;5;2m‚úî Saved pipeline to output directory[0m
../models/librarian_ner_model_v1/model-last

‚úì Training completed successfully!

--- Step 4: Evaluating Model on Validation Set ---
Loading model from: ../models/librarian_ner_model_v1/model-best

‚úì Model loaded successfully

Creating evaluation examples...

Scoring model on validation set...

--- EVALUATION RESULTS ---

üìä Overall F1-Score: 0.0000
   Precision: 0.0000
   Recall: 0.0000

üìã Scores per Entity Type:
----------------------------------------------------------------------
  AUTHOR               | F1: 0.0000 | P: 0.0000 | R: 0.0000
