In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [6]:
!pip install -q spacy==3.8.7
!pip install -q fuzzywuzzy
!pip install -q tqdm

In [7]:
# ======================================================================
# SCRIPT: 02_convert_to_spacy_format_fuzzy.py (Fixed Overlapping Spans)
# PURPOSE:
#   - Improved entity matching using multiple answer variants
#   - Convert to spaCy DocBin format for training
#   - Handles overlapping entity spans safely
# ======================================================================

import json
from pathlib import Path
import spacy
from spacy.tokens import DocBin
from fuzzywuzzy import fuzz
from tqdm import tqdm
import re

# --- Paths ---
INPUT_JSON_PATH = Path('/kaggle/input/normalized-processes-data/train_data.json')
OUTPUT_SPACY_PATH = Path('/kaggle/working/spacy_train_data.spacy')

# --- Config ---
FUZZY_MATCH_THRESHOLD = 85

# --- Load data ---
with open(INPUT_JSON_PATH, 'r', encoding='utf-8') as f:
    dataset = json.load(f)

print(f"📂 Loaded {len(dataset)} training samples")

# --- NLP pipeline ---
nlp = spacy.blank("en")
doc_bin = DocBin()
label_counts = {}

# --- Helper: Generate multiple forms of answer ---
def generate_variants(answer):
    answer = answer.strip()
    variants = set()

    variants.add(answer.lower())
    variants.add(answer.replace(",", ""))
    variants.add(re.sub(r"[^\w\s]", "", answer))
    variants.add(" ".join(answer.split()))
    variants.add(answer.strip(' .').title())

    words_only = re.sub(r"[^a-zA-Z\s]", "", answer)
    if words_only.strip():
        variants.add(words_only.strip().lower())

    return list(variants)

# --- Fuzzy Matcher ---
def find_entity_spans(text, answer, label):
    candidates = []
    variants = generate_variants(answer)

    for variant in variants:
        for i in range(len(text)):
            window = text[i:i + len(variant) + 10]
            score = fuzz.partial_ratio(variant.lower(), window.lower())
            if score >= FUZZY_MATCH_THRESHOLD:
                start = text.lower().find(window.lower())
                if start != -1:
                    end = start + len(window)
                    candidates.append((start, end, label))
                    return candidates
    return candidates

# --- Build DocBin ---
for item in tqdm(dataset):
    text = item['context']
    entities = []

    for label, answers in item['ground_truth'].items():
        if isinstance(answers, str):
            answers = [answers]

        found = False
        for answer in answers:
            if not answer.strip():
                continue
            match = find_entity_spans(text, answer, label)
            if match:
                entities.extend(match)
                label_counts[label] = label_counts.get(label, 0) + 1
                found = True
                break

        if not found:
            print(f"❌ No match for '{' / '.join(answers)}' ({label}) in file {item['file_name']}")

    doc = nlp.make_doc(text)
    spans = []
    used_tokens = set()

    for start, end, label in entities:
        span = doc.char_span(start, end, label=label)
        if span is not None:
            span_tokens = set(range(span.start, span.end))
            if not span_tokens & used_tokens:
                spans.append(span)
                used_tokens.update(span_tokens)

    doc.ents = spans
    doc_bin.add(doc)

# --- Save Output ---
doc_bin.to_disk(OUTPUT_SPACY_PATH)
print(f"\n✅ Saved .spacy training file to: {OUTPUT_SPACY_PATH}")
print(f"📊 Entity Label Counts: {label_counts}")


📂 Loaded 9 training samples


 11%|█         | 1/9 [00:00<00:06,  1.17it/s]

❌ No match for '15' (Renewal Notice (Days)) in file 6683127-House-Rental-Contract-GERALDINE-GALINATO-v2-Page-1


 33%|███▎      | 3/9 [00:04<00:08,  1.37s/it]

❌ No match for '90' (Renewal Notice (Days)) in file 18325926-Rental-Agreement-1
❌ No match for '30' (Renewal Notice (Days)) in file 36199312-Rental-Agreement


 44%|████▍     | 4/9 [00:04<00:04,  1.13it/s]

❌ No match for '3000 / 3,000.00 / Three Thousand' (Agreement Value) in file 44737744-Maddireddy-Bhargava-Reddy-Rental-Agreement
❌ No match for '20 September 2010' (Agreement Start Date) in file 44737744-Maddireddy-Bhargava-Reddy-Rental-Agreement
❌ No match for '19 July 2011' (Agreement End Date) in file 44737744-Maddireddy-Bhargava-Reddy-Rental-Agreement
❌ No match for '' (Renewal Notice (Days)) in file 44737744-Maddireddy-Bhargava-Reddy-Rental-Agreement


 56%|█████▌    | 5/9 [00:06<00:05,  1.26s/it]

❌ No match for 'M.V.V. Vijaya Shankar' (Party One) in file 44737744-Maddireddy-Bhargava-Reddy-Rental-Agreement


 67%|██████▋   | 6/9 [00:06<00:02,  1.08it/s]

❌ No match for '31.02.2011' (Agreement End Date) in file 47854715-RENTAL-AGREEMENT
❌ No match for '1 April 2010' (Agreement Start Date) in file 50070534-RENTAL-AGREEMENT


 78%|███████▊  | 7/9 [00:08<00:02,  1.11s/it]

❌ No match for '30 March 2011' (Agreement End Date) in file 50070534-RENTAL-AGREEMENT
❌ No match for '90' (Renewal Notice (Days)) in file 50070534-RENTAL-AGREEMENT
❌ No match for '31 March 2012' (Agreement End Date) in file 54770958-Rental-Agreement
❌ No match for '90' (Renewal Notice (Days)) in file 54770958-Rental-Agreement
❌ No match for 'K. Parthasarathy' (Party One) in file 54770958-Rental-Agreement


100%|██████████| 9/9 [00:17<00:00,  1.96s/it]


✅ Saved .spacy training file to: /kaggle/working/spacy_train_data.spacy
📊 Entity Label Counts: {'Agreement Value': 8, 'Agreement Start Date': 7, 'Agreement End Date': 5, 'Party One': 7, 'Party Two': 9, 'Renewal Notice (Days)': 3}





In [8]:
# ======================================================================
# SCRIPT: 02_convert_to_spacy_format_fuzzy_test.py
# PURPOSE:
#   - Convert normalized test_data.json to spaCy DocBin format
#   - Uses fuzzy matching to find entity spans
# ======================================================================

import json
from pathlib import Path
import spacy
from spacy.tokens import DocBin
from fuzzywuzzy import fuzz
from tqdm import tqdm
import re

# --- Paths (Updated for test set) ---
INPUT_JSON_PATH = Path('/kaggle/input/normalised-test-processed-data/test_data.json')
OUTPUT_SPACY_PATH = Path('/kaggle/working/spacy_test_data.spacy')

# --- Config ---
FUZZY_MATCH_THRESHOLD = 85

# --- Load data ---
with open(INPUT_JSON_PATH, 'r', encoding='utf-8') as f:
    dataset = json.load(f)

print(f"📂 Loaded {len(dataset)} test samples")

# --- NLP pipeline ---
nlp = spacy.blank("en")
doc_bin = DocBin()
label_counts = {}

# --- Helper: Generate multiple forms of answer ---
def generate_variants(answer):
    answer = answer.strip()
    variants = set()

    # Strip and lowercase
    variants.add(answer.lower())

    # Remove commas
    variants.add(answer.replace(",", ""))

    # Remove currency symbols and punctuation
    variants.add(re.sub(r"[^\w\s]", "", answer))

    # Normalize whitespace
    variants.add(" ".join(answer.split()))

    # Remove leading/trailing punctuations and normalize case
    variants.add(answer.strip(' .').title())

    # Remove non-letter chars for verbal forms
    words_only = re.sub(r"[^a-zA-Z\s]", "", answer)
    if words_only.strip():
        variants.add(words_only.strip().lower())

    return list(variants)

# --- Fuzzy Matcher ---
def find_entity_spans(text, answer, label):
    candidates = []
    variants = generate_variants(answer)

    for variant in variants:
        for i in range(len(text)):
            window = text[i:i + len(variant) + 10]
            score = fuzz.partial_ratio(variant.lower(), window.lower())
            if score >= FUZZY_MATCH_THRESHOLD:
                start = text.lower().find(window.lower())
                if start != -1:
                    end = start + len(window)
                    candidates.append((start, end, label))
                    return candidates  # return first match
    return candidates

# --- Build DocBin ---
for item in tqdm(dataset):
    text = item['context']
    entities = []

    for label, answers in item['ground_truth'].items():
        if isinstance(answers, str):
            answers = [answers]

        found = False
        for answer in answers:
            if not answer.strip():
                continue
            match = find_entity_spans(text, answer, label)
            if match:
                entities.extend(match)
                label_counts[label] = label_counts.get(label, 0) + 1
                found = True
                break

        if not found:
            print(f"❌ No match for '{' / '.join(answers)}' ({label}) in file {item['file_name']}")

    doc = nlp.make_doc(text)
    spans = [doc.char_span(start, end, label=label) for start, end, label in entities if doc.char_span(start, end, label=label)]
    doc.ents = spans
    doc_bin.add(doc)

# --- Save Output ---
doc_bin.to_disk(OUTPUT_SPACY_PATH)
print(f"\n✅ Saved .spacy test file to: {OUTPUT_SPACY_PATH}")
print(f"📊 Entity Label Counts: {label_counts}")


📂 Loaded 4 test samples


 50%|█████     | 2/4 [00:01<00:01,  1.10it/s]

❌ No match for '31 March 2011' (Agreement End Date) in file 95980236-Rental-Agreement
❌ No match for '14 November 2013' (Agreement End Date) in file 156155545-Rental-Agreement-Kns-Home
❌ No match for '30' (Renewal Notice (Days)) in file 156155545-Rental-Agreement-Kns-Home


100%|██████████| 4/4 [00:03<00:00,  1.02it/s]


✅ Saved .spacy test file to: /kaggle/working/spacy_test_data.spacy
📊 Entity Label Counts: {'Agreement Value': 4, 'Agreement Start Date': 4, 'Agreement End Date': 2, 'Renewal Notice (Days)': 3, 'Party One': 4, 'Party Two': 4}



