In [1]:
# 📦 Install spaCy and download the English model
!pip install -U spacy
!python -m spacy download en_core_web_sm

Collecting spacy
  Downloading spacy-3.8.7-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (27 kB)
Downloading spacy-3.8.7-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (33.0 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m33.0/33.0 MB[0m [31m44.5 MB/s[0m eta [36m0:00:00[0m:00:01[0m00:01[0m
[?25hInstalling collected packages: spacy
  Attempting uninstall: spacy
    Found existing installation: spacy 3.8.5
    Uninstalling spacy-3.8.5:
      Successfully uninstalled spacy-3.8.5
Successfully installed spacy-3.8.7
Collecting en-core-web-sm==3.8.0
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.8.0/en_core_web_sm-3.8.0-py3-none-any.whl (12.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m12.8/12.8 MB[0m [31m73.8 MB/s[0m eta [36m0:00:00[0m:00:01[0m0:01[0m
[?25h[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_cor

In [2]:
# ✅ Imports
import spacy
from pathlib import Path
from spacy.training import Example
from spacy.tokens import DocBin
import random
from spacy.util import minibatch, compounding

# ✅ Define paths (using Kaggle directories)
INPUT_SPACY_PATH = Path("/kaggle/input/upd-spacy/spacy_train_data.spacy")
MODEL_OUTPUT_DIR = Path("/kaggle/working/fine_tuned_model")
MODEL_OUTPUT_DIR.mkdir(exist_ok=True)

# ✅ Load blank spaCy model and add NER pipe
nlp = spacy.blank("en")
ner = nlp.add_pipe("ner")

# ✅ Load training data
print(f"📂 Loading training data from: {INPUT_SPACY_PATH}")
doc_bin = DocBin().from_disk(str(INPUT_SPACY_PATH))
docs = list(doc_bin.get_docs(nlp.vocab))

# ✅ Convert docs to training examples
examples = [
    Example.from_dict(doc, {
        "entities": [(ent.start_char, ent.end_char, ent.label_) for ent in doc.ents]
    })
    for doc in docs
]

# ✅ Add entity labels to NER
for example in examples:
    for ent in example.reference.ents:
        ner.add_label(ent.label_)

# ✅ Train the model
n_iter = 30
optimizer = nlp.initialize()

print("\n🚀 Starting training...")
for i in range(n_iter):
    random.shuffle(examples)
    losses = {}
    batches = minibatch(examples, size=compounding(4.0, 32.0, 1.5))
    for batch in batches:
        nlp.update(batch, drop=0.35, losses=losses)
    print(f"📉 Iteration {i+1}/{n_iter}, Loss: {losses['ner']:.4f}")

# ✅ Save fine-tuned model
nlp.to_disk(MODEL_OUTPUT_DIR)
print(f"\n✅ Trained model saved to: {MODEL_OUTPUT_DIR}")


[2025-06-22 13:23:34,410] [INFO] Created vocabulary
[2025-06-22 13:23:34,412] [INFO] Finished initializing nlp object


📂 Loading training data from: /kaggle/input/upd-spacy/spacy_train_data.spacy

🚀 Starting training...


  d_xhat = N * dY - sum_dy - dist * var ** (-1.0) * sum_dy_dist


📉 Iteration 1/30, Loss: 6877.2378
📉 Iteration 2/30, Loss: 6403.3179
📉 Iteration 3/30, Loss: 5649.2274
📉 Iteration 4/30, Loss: 3821.7261
📉 Iteration 5/30, Loss: 907.0033
📉 Iteration 6/30, Loss: 97.6902
📉 Iteration 7/30, Loss: 33.2743
📉 Iteration 8/30, Loss: 32.0012
📉 Iteration 9/30, Loss: 31.9931
📉 Iteration 10/30, Loss: 31.9858
📉 Iteration 11/30, Loss: 31.8638
📉 Iteration 12/30, Loss: 29.3040
📉 Iteration 13/30, Loss: 20.5486
📉 Iteration 14/30, Loss: 17.1410
📉 Iteration 15/30, Loss: 17.3488
📉 Iteration 16/30, Loss: 16.6650
📉 Iteration 17/30, Loss: 16.2616
📉 Iteration 18/30, Loss: 15.7492
📉 Iteration 19/30, Loss: 14.6385
📉 Iteration 20/30, Loss: 15.0073
📉 Iteration 21/30, Loss: 14.2382
📉 Iteration 22/30, Loss: 13.5338
📉 Iteration 23/30, Loss: 13.7539
📉 Iteration 24/30, Loss: 12.1412
📉 Iteration 25/30, Loss: 11.6197
📉 Iteration 26/30, Loss: 11.3523
📉 Iteration 27/30, Loss: 11.3980
📉 Iteration 28/30, Loss: 10.5204
📉 Iteration 29/30, Loss: 9.0801
📉 Iteration 30/30, Loss: 11.1067

✅ Trained 

In [3]:
import spacy

# 🔄 Load the fine-tuned model
nlp = spacy.load("/kaggle/working/fine_tuned_model")

# 🧪 Sample text
text = """
This Rental Agreement is made between John Doe and Jane Smith. The Agreement starts on March 15, 2024 and ends on March 14, 2025. A renewal notice of 30 days is required.

"""

# 🔎 Run NER prediction
doc = nlp(text)
print("🔍 Named Entities Detected:")
for ent in doc.ents:
    print(f"{ent.text} ({ent.label_})")


🔍 Named Entities Detected:


In [4]:
nlp = spacy.load("/kaggle/working/fine_tuned_model")
print("📋 Entity Labels in Model:", nlp.get_pipe("ner").labels)


📋 Entity Labels in Model: ('Agreement End Date', 'Agreement Start Date', 'Party Two')


In [5]:
import shutil

# Define source and destination
model_dir = "/kaggle/working/fine_tuned_model"
zip_file = "/kaggle/working/fine_tuned_model.zip"

# Create zip archive
shutil.make_archive(base_name=zip_file.replace('.zip', ''), format='zip', root_dir=model_dir)

print("✅ Model zipped successfully!")


✅ Model zipped successfully!


In [6]:
# ==============================================================================
# SCRIPT: 04_evaluate_on_test_data.py (Robust Final Version)
# PURPOSE:
#   - Evaluate fine-tuned spaCy model on test set (.spacy format)
#   - Show Precision, Recall, F1 safely
#   - Handle cases with no predictions gracefully
# ==============================================================================

import spacy
from spacy.tokens import DocBin
from spacy.training import Example
from pathlib import Path

# --- Paths ---
MODEL_PATH = Path("/kaggle/working/fine_tuned_model")
TEST_SPACY_PATH = Path("/kaggle/input/upd-test/spacy_test_data.spacy")

# --- Load model ---
print("📦 Loading fine-tuned model...")
nlp = spacy.load(MODEL_PATH)

# --- Load test data ---
print("📂 Loading test data...")
doc_bin = DocBin().from_disk(TEST_SPACY_PATH)
test_docs = list(doc_bin.get_docs(nlp.vocab))
examples = [
    Example.from_dict(doc, {
        "entities": [(ent.start_char, ent.end_char, ent.label_) for ent in doc.ents]
    })
    for doc in test_docs
]

# --- Evaluate ---
scores = nlp.evaluate(examples)

# --- Safe print helper ---
def safe_fmt(val):
    return f"{val:.2f}" if isinstance(val, float) else "N/A"

# --- Display scores ---
print("\n🔍 Evaluation Results:")
print(f"✅ Precision: {safe_fmt(scores.get('ents_p'))}")
print(f"✅ Recall:    {safe_fmt(scores.get('ents_r'))}")
print(f"✅ F1 Score:  {safe_fmt(scores.get('ents_f'))}")
print(f"📊 Number of Examples Evaluated: {len(examples)}")

# --- Entity label breakdown ---
ents_per_type = scores.get("ents_per_type")
if isinstance(ents_per_type, dict) and ents_per_type:
    print("\n📋 Entity-wise Breakdown:")
    for label, stats in ents_per_type.items():
        p = safe_fmt(stats.get("p"))
        r = safe_fmt(stats.get("r"))
        f = safe_fmt(stats.get("f"))
        print(f" - {label}: Precision={p}, Recall={r}, F1={f}")
else:
    print("\n⚠️ No entity-wise stats available (possibly no predictions made).")

# --- Optional: Show predicted entities ---
print("\n🔎 Sample Predictions:")
for i, doc in enumerate(test_docs):
    print(f"\n📄 Example {i+1}")
    if not doc.ents:
        print("  - No entities predicted.")
    else:
        for ent in doc.ents:
            print(f"  - {ent.text} ({ent.label_})")


📦 Loading fine-tuned model...
📂 Loading test data...

🔍 Evaluation Results:
✅ Precision: 1.00
✅ Recall:    1.00
✅ F1 Score:  1.00
📊 Number of Examples Evaluated: 4

📋 Entity-wise Breakdown:
 - Agreement End Date: Precision=1.00, Recall=1.00, F1=1.00

🔎 Sample Predictions:

📄 Example 1
  - up to end of March 2009 (Agreement End Date)

📄 Example 2
  - No entities predicted.

📄 Example 3
  - No entities predicted.

📄 Example 4
  - No entities predicted.
