In [None]:
# 📦 Import Required Libraries
import json
import time
import random
import re
import os
from typing import List, Dict, Tuple, Any
from dataclasses import dataclass, asdict
from collections import defaultdict, Counter
from datetime import datetime
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import warnings
warnings.filterwarnings('ignore')

# Seed both Python's `random` module and NumPy's global RNG with the same fixed
# value so that repeated runs draw the same random sequence.
random.seed(42)
np.random.seed(42)

print("📦 All libraries imported successfully!")


In [None]:
# ⚙️ Configuration
# Interactive setup cell. From user prompts it defines the globals RUN_OPENAI,
# SAMPLE_SIZE and ENTITY_LABELS (and OPENAI_API_KEY when a key is requested),
# which the benchmark class and the benchmark/analysis cells read.
print("🚀 GLiNER vs OpenAI NER Benchmark Configuration")
print("=" * 60)

# Choose benchmark mode
print("\n🤖 Available benchmark modes:")
print("1. 🆓 GLiNER Large only (FREE - no API key required)")
print("2. 🔥 GLiNER Large vs OpenAI (requires API key)")

# An empty answer falls back to "1".
choice = input("\nChoose your mode (1-2, default=1): ").strip() or "1"

# Only the exact answer "2" enables OpenAI; anything else selects GLiNER-only.
if choice == "2":
    RUN_OPENAI = True
    print("✅ Selected: GLiNER Large vs OpenAI comparison")
else:
    RUN_OPENAI = False
    print("✅ Selected: GLiNER Large only (FREE mode)")

# Sample size configuration
# Keep prompting until the answer parses as an integer within [50, 1000].
# An empty answer is treated as "100"; non-numeric text raises ValueError in
# int() and triggers another prompt.
while True:
    try:
        SAMPLE_SIZE = int(input("\n📊 How many samples to test? (50-1000, default 100): ") or "100")
        if 50 <= SAMPLE_SIZE <= 1000:
            break
        else:
            print("⚠️ Please enter a number between 50 and 1000")
    except ValueError:
        print("⚠️ Please enter a valid number")

# Performance tier guidance
# Purely informational: the tier message does not change any setting.
if SAMPLE_SIZE <= 100:
    print("🔍 Quick Test Mode: Fast evaluation for initial testing")
elif SAMPLE_SIZE <= 500:
    print("📊 Standard Evaluation: Balanced performance assessment")
else:
    print("🏆 Comprehensive Benchmark: Full production-grade evaluation")

# Get OpenAI API key if needed
if RUN_OPENAI:
    print(f"\n💰 Note: OpenAI comparison will use API calls (small cost)")
    import getpass
    try:
        # getpass is used so the key is prompted for without being echoed.
        OPENAI_API_KEY = getpass.getpass("🔑 Enter your OpenAI API key: ")
        if not OPENAI_API_KEY.strip():
            # A blank key silently downgrades the run to GLiNER-only mode.
            print("❌ No API key provided. Switching to GLiNER-only mode.")
            RUN_OPENAI = False
        else:
            # Exported via the environment; presumably picked up by the
            # argument-less OpenAI() client built later — confirm against the
            # openai package documentation.
            os.environ['OPENAI_API_KEY'] = OPENAI_API_KEY.strip()
            print("✅ OpenAI API key set successfully!")
    except Exception as e:
        # Any failure while prompting also downgrades to GLiNER-only mode.
        print(f"❌ OpenAI initialization failed: {e}")
        print("🔄 Falling back to GLiNER-only mode...")
        RUN_OPENAI = False

# Entity labels for business card NER
ENTITY_LABELS = ["person", "email", "phone", "organization"]

# Echo the effective settings (RUN_OPENAI may have been reset above).
print(f"\n🎯 FINAL CONFIGURATION:")
print(f"   📊 Sample size: {SAMPLE_SIZE}")
print(f"   🤖 GLiNER Large: ✅ Enabled")
print(f"   🔥 OpenAI: {'✅ Enabled' if RUN_OPENAI else '❌ Disabled'}")
print(f"   🏷️ Entities: {', '.join(ENTITY_LABELS)}")
print("=" * 60)


In [None]:
# 🏗️ Data Structures
@dataclass
class GroundTruth:
    """Expected entity values for one synthetic business card."""
    name: str     # scored against the "person" predictions
    company: str  # scored against the "organization" predictions
    email: str    # scored against the "email" predictions
    phone: str    # scored against the "phone" predictions

@dataclass
class BusinessCardSample:
    """One generated business card: its OCR text lines plus the expected entities."""
    sample_id: int             # index assigned by the generator
    scenario: str              # "clean" or "noisy", as set by the generator
    ocr_lines: List[str]       # card text, one entry per line; joined with "\n" before extraction
    ground_truth: GroundTruth  # values the extracted entities are scored against

@dataclass
class BenchmarkResult:
    """Per-sample outcome of the benchmark for both extractors."""
    sample_id: int
    scenario: str
    gliner_accuracy: Dict[str, float]  # entity label -> 1.0 (matched) or 0.0
    openai_accuracy: Dict[str, float]  # same shape; all 0.0 when OpenAI is disabled
    gliner_time: float                 # wall-clock seconds spent in GLiNER extraction
    openai_time: float                 # wall-clock seconds for OpenAI; 0.0 when disabled

# Confirmation message once the three dataclasses above have been defined.
print("🏗️ Data structures defined successfully!")


In [None]:
# 🎲 Synthetic Business Card Data Generator
class BusinessCardGenerator:
    """Generate synthetic business-card OCR samples with known ground truth.

    All randomness comes from the module-level ``random`` generator, so results
    are reproducible when that generator has been seeded.
    """

    # Character substitutions applied to a line that receives OCR noise.
    _OCR_NOISE = str.maketrans({'o': '0', 'l': '1', 'S': '5'})

    def __init__(self):
        """Set up the fixed pools of names, companies and e-mail domains."""
        self.names = [
            "John Smith", "Sarah Johnson", "Michael Brown", "Emily Davis", "David Wilson",
            "Lisa Anderson", "Robert Taylor", "Jennifer Martinez", "William Garcia", "Maria Rodriguez"
        ]

        self.companies = [
            "TechCorp Solutions", "Global Dynamics Inc", "Innovation Labs", "Digital Ventures",
            "Future Systems", "Smart Technologies", "Advanced Analytics", "Cloud Solutions"
        ]

        self.domains = ["gmail.com", "company.com", "business.org", "corp.net", "tech.io"]

    def generate_phone(self):
        """Return a random phone string formatted as ``+1-AAA-BBB-CCCC``."""
        return f"+1-{random.randint(200,999)}-{random.randint(200,999)}-{random.randint(1000,9999)}"

    @staticmethod
    def _build_email(name: str, domain: str) -> str:
        """Return ``first.last@domain`` for a space-separated *name*."""
        local_part = name.lower().replace(' ', '.')
        # The local part and the domain must be joined with "@"; joining them
        # with "." produced strings that are not e-mail addresses at all.
        return f"{local_part}@{domain}"

    def create_clean_sample(self, sample_id: int) -> "BusinessCardSample":
        """Build a noise-free sample whose OCR lines contain the ground truth verbatim.

        Args:
            sample_id: Identifier stored on the returned sample.

        Returns:
            A ``BusinessCardSample`` with ``scenario == "clean"``.
        """
        # The order of the random draws (name, company, domain, phone) is kept
        # stable so seeded runs keep selecting the same values.
        name = random.choice(self.names)
        company = random.choice(self.companies)
        email = self._build_email(name, random.choice(self.domains))
        phone = self.generate_phone()

        ocr_lines = [
            name,
            "Senior Manager",
            company,
            email,
            phone,
            "www.company.com"
        ]

        return BusinessCardSample(
            sample_id=sample_id,
            scenario="clean",
            ocr_lines=ocr_lines,
            ground_truth=GroundTruth(name=name, company=company, email=email, phone=phone)
        )

    def create_noisy_sample(self, sample_id: int) -> "BusinessCardSample":
        """Build a sample whose OCR lines may contain character confusions.

        Each line independently has a 30% chance of having ``o``→``0``,
        ``l``→``1`` and ``S``→``5`` substituted. The ground truth keeps the
        clean values.

        Args:
            sample_id: Identifier stored on the returned sample.

        Returns:
            A ``BusinessCardSample`` with ``scenario == "noisy"``.
        """
        sample = self.create_clean_sample(sample_id)

        # Add OCR noise
        noisy_lines = []
        for line in sample.ocr_lines:
            if random.random() < 0.3:  # 30% chance of noise
                line = line.translate(self._OCR_NOISE)
            noisy_lines.append(line)

        sample.ocr_lines = noisy_lines
        sample.scenario = "noisy"
        return sample

    def generate_dataset(self, size: int) -> List["BusinessCardSample"]:
        """Return *size* samples, each clean with probability 0.7 and noisy otherwise.

        Sample ids are the indices ``0 .. size - 1``.
        """
        dataset = []
        for i in range(size):
            if random.random() < 0.7:  # 70% clean, 30% noisy
                sample = self.create_clean_sample(i)
            else:
                sample = self.create_noisy_sample(i)
            dataset.append(sample)
        return dataset

# Shared generator instance used by the quick-test and benchmark cells.
generator = BusinessCardGenerator()
print("🎲 Business card data generator ready!")


In [None]:
# 📦 Install Required Packages
# In Google Colab the required packages are pip-installed one by one; on a
# local machine nothing is installed and the cell only reports which of the
# key packages can be imported.
print("📦 Installing required packages...")

# Check if we're in Colab
# Importing google.colab is the detection mechanism: an ImportError means the
# notebook is running somewhere else.
try:
    import google.colab
    IN_COLAB = True
    print("📍 Running in Google Colab")

    # Install packages in Colab
    import subprocess
    import sys

    def install_package(package):
        """Install *package* with pip for the running interpreter.

        Returns True on success and False when pip exits with a non-zero status.
        """
        print(f"🔧 Installing {package}...")
        try:
            # sys.executable targets the interpreter running this notebook.
            subprocess.check_call([sys.executable, "-m", "pip", "install", package, "--quiet"])
            print(f"✅ {package} installed successfully!")
            return True
        except subprocess.CalledProcessError as e:
            print(f"❌ Failed to install {package}: {e}")
            return False

    # Install required packages
    packages = [
        "torch",  # PyTorch for GLiNER
        "gliner",  # GLiNER model
        "openai",  # OpenAI API (optional)
        "transformers",  # For model loading
        "accelerate"  # For GPU optimization
    ]

    success_count = 0
    for package in packages:
        if install_package(package):
            success_count += 1

    print(f"\n🎯 Installation Summary: {success_count}/{len(packages)} packages installed")

    # A failed install is reported but does not stop the notebook.
    if success_count == len(packages):
        print("✅ All packages installed successfully!")
    else:
        print("⚠️ Some packages failed to install - continuing anyway...")

except ImportError:
    # Record the negative result too, matching the other Colab-detection cells
    # in this notebook; otherwise IN_COLAB would be undefined (or stale from a
    # previous run) on local machines.
    IN_COLAB = False
    print("📍 Running locally")
    print("💡 Please ensure you have installed the required packages:")
    print("   pip install torch gliner openai transformers accelerate")
    print("📋 Checking if packages are available...")

    # Check local packages
    # Only torch, gliner and openai are probed here.
    missing_packages = []
    try:
        import torch
        print("✅ PyTorch available")
    except ImportError:
        missing_packages.append("torch")
        print("❌ PyTorch missing")

    try:
        import gliner
        print("✅ GLiNER available")
    except ImportError:
        missing_packages.append("gliner")
        print("❌ GLiNER missing")

    try:
        import openai
        print("✅ OpenAI available")
    except ImportError:
        missing_packages.append("openai")
        print("⚠️ OpenAI missing (optional for comparison mode)")

    if missing_packages:
        print(f"\n⚠️ Missing packages: {', '.join(missing_packages)}")
        print("📥 Install with: pip install " + " ".join(missing_packages))
    else:
        print("\n✅ All required packages are available!")

print("\n" + "="*60)
print("🚀 Ready to proceed with GLiNER setup!")
print("="*60)


In [None]:
# 🤖 GLiNER Setup
# Loads the GLiNER model into the global `gliner_model` and records the chosen
# torch device in the global `DEVICE`; both are read by the benchmark class.
print("🤖 Setting up GLiNER Large model...")

try:
    import torch
    from gliner import GLiNER

    # Check for GPU
    # Use CUDA when torch reports it as available, otherwise fall back to CPU.
    DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    print(f"🔧 Device: {DEVICE}")

    if torch.cuda.is_available():
        gpu_name = torch.cuda.get_device_name(0)
        # total_memory is divided by 1024**3 to display the value in GB.
        gpu_memory = torch.cuda.get_device_properties(0).total_memory / 1024**3
        print(f"🚀 GPU: {gpu_name} ({gpu_memory:.1f} GB)")

    # Load GLiNER Large model
    print("📥 Loading GLiNER Large model...")
    gliner_model = GLiNER.from_pretrained("urchade/gliner_large-v2.1")
    gliner_model.to(DEVICE)
    # Put the model in evaluation mode; this notebook only runs predictions.
    gliner_model.eval()

    print("✅ GLiNER Large model loaded successfully!")

    if torch.cuda.is_available():
        # Clear the CUDA cache before reading the allocated-memory figure.
        torch.cuda.empty_cache()
        memory_used = torch.cuda.memory_allocated(0) / 1024**3
        print(f"📊 GPU memory used: {memory_used:.2f} GB")

except Exception as e:
    # Print installation hints, then re-raise: later cells cannot run without
    # `gliner_model` and `DEVICE`.
    print(f"❌ GLiNER setup failed: {e}")
    print("💡 Install with: pip install gliner torch")
    print("💡 For Colab: !pip install gliner torch")
    raise


In [None]:
# 🧠 NER Benchmark Class
class NERBenchmark:
    """Run GLiNER (and optionally OpenAI) entity extraction and score the output.

    The class relies on module-level names defined by earlier cells:
    ``ENTITY_LABELS``, ``DEVICE``, ``RUN_OPENAI`` and the loaded ``gliner_model``.
    """

    def __init__(self):
        """Capture the global configuration and, if enabled, build an OpenAI client."""
        self.entity_labels = ENTITY_LABELS
        self.device = DEVICE
        # Always define the attribute so extract_with_openai() can test it even
        # when OpenAI mode is disabled; previously that path raised
        # AttributeError because the attribute was never created.
        self.openai_client = None

        # Initialize OpenAI client if needed
        if RUN_OPENAI:
            try:
                from openai import OpenAI
                self.openai_client = OpenAI()
                print("✅ OpenAI client initialized")
            except Exception as e:
                print(f"❌ OpenAI initialization failed: {e}")
                self.openai_client = None

    def _empty_results(self) -> Dict[str, List[str]]:
        """Return a fresh ``{label: []}`` mapping for every configured label."""
        return {label: [] for label in self.entity_labels}

    @staticmethod
    def _as_string_list(value: Any) -> List[str]:
        """Coerce *value* into a list of non-empty, stripped strings."""
        if value is None:
            return []
        # A bare string must be treated as ONE prediction; iterating it would
        # yield single characters that match almost any ground truth.
        if isinstance(value, str) or not isinstance(value, (list, tuple, set)):
            value = [value]
        cleaned = []
        for item in value:
            if item is None:
                continue
            text = str(item).strip()
            if text:
                cleaned.append(text)
        return cleaned

    @staticmethod
    def _strip_code_fence(text: str) -> str:
        """Remove a surrounding Markdown code fence (``` or ```json) if present."""
        text = text.strip()
        if text.startswith("```"):
            text = text[3:]
            if text[:4].lower() == "json":
                text = text[4:]
            # Only drop the closing fence when it is really there, so a reply
            # without one does not lose its last three characters.
            text = text.rstrip()
            if text.endswith("```"):
                text = text[:-3]
        return text.strip()

    def _normalize_predictions(self, raw: Any) -> Dict[str, List[str]]:
        """Turn parsed model output into ``{label: [str, ...]}`` for the known labels."""
        if not isinstance(raw, dict):
            return self._empty_results()
        return {label: self._as_string_list(raw.get(label)) for label in self.entity_labels}

    def extract_with_gliner(self, text: str) -> Tuple[Dict[str, List[str]], float]:
        """Extract entities from *text* with the global GLiNER model.

        Each entity type is queried separately with several synonym labels and
        every hit is filed under that entity type.

        Returns:
            ``(results, elapsed_seconds)`` where ``results`` maps each label to
            its de-duplicated entity texts. On any error the lists are empty.
        """
        start_time = time.time()

        try:
            # Enhanced labels for better person detection
            enhanced_labels = {
                "person": ["person", "name", "individual", "contact name", "full name"],
                "email": ["email", "email address"],
                "phone": ["phone", "telephone", "phone number"],
                "organization": ["organization", "company", "business"]
            }

            results = self._empty_results()

            for entity_type, labels in enhanced_labels.items():
                entities = gliner_model.predict_entities(text, labels)
                for entity in entities:
                    results[entity_type].append(entity["text"])

            # Remove duplicates while keeping first-seen order, so the output
            # is deterministic from run to run.
            for key in results:
                results[key] = list(dict.fromkeys(results[key]))

        except Exception as e:
            print(f"GLiNER error: {e}")
            results = self._empty_results()

        elapsed_time = time.time() - start_time
        return results, elapsed_time

    def extract_with_openai(self, text: str) -> Tuple[Dict[str, List[str]], float]:
        """Extract entities from *text* with the OpenAI chat completions API.

        Returns:
            ``(results, elapsed_seconds)``. Without a client the result is empty
            lists and ``0.0``. API or parsing errors are printed and also yield
            empty lists.
        """
        if not self.openai_client:
            return self._empty_results(), 0.0

        start_time = time.time()

        prompt = f"""Extract named entities from this business card text. Return ONLY a JSON object with these exact keys: person, email, phone, organization. Each value should be a list of strings.

Text: {text}

JSON:"""

        try:
            response = self.openai_client.chat.completions.create(
                model="gpt-4o-mini",
                messages=[{"role": "user", "content": prompt}],
                temperature=0,
                max_tokens=200
            )

            # A missing message body becomes "", which fails JSON parsing below
            # and is reported through the except branch.
            result_text = response.choices[0].message.content or ""
            parsed = json.loads(self._strip_code_fence(result_text))
            # Validate the shape so scoring never sees strings or nulls where
            # lists of strings are expected.
            results = self._normalize_predictions(parsed)

        except Exception as e:
            print(f"OpenAI error: {e}")
            results = self._empty_results()

        elapsed_time = time.time() - start_time
        return results, elapsed_time

    def calculate_accuracy(self, predictions: Dict[str, List[str]], ground_truth: "GroundTruth") -> Dict[str, float]:
        """Score *predictions* against *ground_truth*, one value per entity label.

        A label scores 1.0 when any non-empty prediction contains the expected
        value or is contained in it (case-insensitive), otherwise 0.0. Empty or
        whitespace-only predictions are ignored; they used to count as a match
        because the empty string is a substring of everything.

        Args:
            predictions: Mapping of entity label to predicted strings.
            ground_truth: Object exposing ``name``, ``email``, ``phone`` and
                ``company`` attributes.

        Returns:
            Mapping of every configured entity label to 0.0 or 1.0.
        """
        gt_map = {
            "person": ground_truth.name,
            "email": ground_truth.email,
            "phone": ground_truth.phone,
            "organization": ground_truth.company
        }

        accuracy = {}
        for entity_type in self.entity_labels:
            predicted = self._as_string_list(predictions.get(entity_type))
            gt_value = gt_map[entity_type]
            expected = gt_value.strip().lower() if isinstance(gt_value, str) else ""

            # Nothing predicted, or nothing to compare against: no credit.
            if not predicted or not expected:
                accuracy[entity_type] = 0.0
                continue

            # Check if any prediction matches (partial match for flexibility)
            matches = any(expected in pred.lower() or pred.lower() in expected
                          for pred in predicted)
            accuracy[entity_type] = 1.0 if matches else 0.0

        return accuracy

# Shared benchmark instance used by the quick-test and benchmark cells.
benchmark = NERBenchmark()
print("🧠 NER Benchmark class initialized!")


In [None]:
# 🔄 Pull Latest Changes from GitHub (Colab Setup)
import os
import subprocess

def run_command(cmd):
    """Execute *cmd* through the shell and report how it went.

    Returns a ``(succeeded, stdout, stderr)`` tuple. ``succeeded`` is True only
    for a zero exit status. If running the command raises, the tuple is
    ``(False, "", <error text>)``.
    """
    try:
        completed = subprocess.run(cmd, shell=True, capture_output=True, text=True)
    except Exception as exc:
        return False, "", str(exc)
    succeeded = completed.returncode == 0
    return succeeded, completed.stdout, completed.stderr

# In Colab, make sure a checkout of the benchmark repository exists under
# /content (clone it, or pull if it is already there) and leave the working
# directory inside it. Outside Colab no git command is run.
# NOTE(review): identical copies of this cell appear again later in the file.
print("🚀 Setting up latest version from GitHub...")

# Repository details
REPO_URL = "https://github.com/shubhamhackz/ner_benchmark.git"
REPO_NAME = "ner_benchmark"

# Check if we're in Colab
# Importing google.colab is the detection mechanism.
try:
    import google.colab
    IN_COLAB = True
    print("📍 Running in Google Colab")
except ImportError:
    IN_COLAB = False
    print("📍 Running locally")

if IN_COLAB:
    # Change to content directory in Colab
    # Also makes the relative REPO_NAME checks below independent of wherever a
    # previous run of this cell left the working directory.
    os.chdir('/content')

    # Check if repository already exists
    if os.path.exists(REPO_NAME):
        print(f"📂 Repository '{REPO_NAME}' found - pulling latest changes...")
        os.chdir(REPO_NAME)

        # Pull latest changes
        success, stdout, stderr = run_command("git pull origin main")
        if success:
            print("✅ Successfully pulled latest changes!")
            if stdout.strip():
                print(f"📄 Git output: {stdout.strip()}")
        else:
            # One retry: discard local modifications to tracked files, then
            # pull again. WARNING: the hard reset throws away uncommitted edits.
            print(f"⚠️ Pull failed: {stderr}")
            print("🔄 Trying to reset and pull again...")
            run_command("git reset --hard HEAD")
            success, stdout, stderr = run_command("git pull origin main")
            if success:
                print("✅ Successfully pulled after reset!")
            else:
                print(f"❌ Still failed: {stderr}")
    else:
        print(f"📥 Cloning repository '{REPO_NAME}'...")
        success, stdout, stderr = run_command(f"git clone {REPO_URL}")
        if success:
            print("✅ Successfully cloned repository!")
            os.chdir(REPO_NAME)
        else:
            # On a failed clone the working directory stays at /content.
            print(f"❌ Clone failed: {stderr}")

    # Show current status
    # Only when the current directory is a git checkout (i.e. the steps above
    # ended inside the repository).
    if os.path.exists('.git'):
        success, commit_hash, _ = run_command("git rev-parse --short HEAD")
        success2, branch, _ = run_command("git rev-parse --abbrev-ref HEAD")

        if success and success2:
            print(f"📍 Current: {branch.strip()} @ {commit_hash.strip()}")

        # Show recent commits
        success, log_output, _ = run_command("git log --oneline -3")
        if success:
            print(f"📋 Recent commits:")
            for line in log_output.strip().split('\n')[:3]:
                if line.strip():
                    print(f"   • {line.strip()}")

    print(f"📁 Working directory: {os.getcwd()}")
    print("🎯 Ready to run the NER benchmark notebook!")

else:
    print("💻 Running locally - skipping git operations")
    print("💡 Make sure you've pulled the latest changes manually if needed")

print("=" * 60)


In [None]:
# 🔄 Pull Latest Changes from GitHub (Colab Setup)
import os
import subprocess

def run_command(cmd):
    """Execute *cmd* through the shell and report how it went.

    Returns a ``(succeeded, stdout, stderr)`` tuple. ``succeeded`` is True only
    for a zero exit status. If running the command raises, the tuple is
    ``(False, "", <error text>)``.
    """
    try:
        completed = subprocess.run(cmd, shell=True, capture_output=True, text=True)
    except Exception as exc:
        return False, "", str(exc)
    succeeded = completed.returncode == 0
    return succeeded, completed.stdout, completed.stderr

# In Colab, make sure a checkout of the benchmark repository exists under
# /content (clone it, or pull if it is already there) and leave the working
# directory inside it. Outside Colab no git command is run.
# NOTE(review): this cell's code is identical to other "Pull Latest Changes"
# cells in this file.
print("🚀 Setting up latest version from GitHub...")

# Repository details
REPO_URL = "https://github.com/shubhamhackz/ner_benchmark.git"
REPO_NAME = "ner_benchmark"

# Check if we're in Colab
# Importing google.colab is the detection mechanism.
try:
    import google.colab
    IN_COLAB = True
    print("📍 Running in Google Colab")
except ImportError:
    IN_COLAB = False
    print("📍 Running locally")

if IN_COLAB:
    # Change to content directory in Colab
    # Also makes the relative REPO_NAME checks below independent of wherever a
    # previous run of this cell left the working directory.
    os.chdir('/content')

    # Check if repository already exists
    if os.path.exists(REPO_NAME):
        print(f"📂 Repository '{REPO_NAME}' found - pulling latest changes...")
        os.chdir(REPO_NAME)

        # Pull latest changes
        success, stdout, stderr = run_command("git pull origin main")
        if success:
            print("✅ Successfully pulled latest changes!")
            if stdout.strip():
                print(f"📄 Git output: {stdout.strip()}")
        else:
            # One retry: discard local modifications to tracked files, then
            # pull again. WARNING: the hard reset throws away uncommitted edits.
            print(f"⚠️ Pull failed: {stderr}")
            print("🔄 Trying to reset and pull again...")
            run_command("git reset --hard HEAD")
            success, stdout, stderr = run_command("git pull origin main")
            if success:
                print("✅ Successfully pulled after reset!")
            else:
                print(f"❌ Still failed: {stderr}")
    else:
        print(f"📥 Cloning repository '{REPO_NAME}'...")
        success, stdout, stderr = run_command(f"git clone {REPO_URL}")
        if success:
            print("✅ Successfully cloned repository!")
            os.chdir(REPO_NAME)
        else:
            # On a failed clone the working directory stays at /content.
            print(f"❌ Clone failed: {stderr}")

    # Show current status
    # Only when the current directory is a git checkout (i.e. the steps above
    # ended inside the repository).
    if os.path.exists('.git'):
        success, commit_hash, _ = run_command("git rev-parse --short HEAD")
        success2, branch, _ = run_command("git rev-parse --abbrev-ref HEAD")

        if success and success2:
            print(f"📍 Current: {branch.strip()} @ {commit_hash.strip()}")

        # Show recent commits
        success, log_output, _ = run_command("git log --oneline -3")
        if success:
            print(f"📋 Recent commits:")
            for line in log_output.strip().split('\n')[:3]:
                if line.strip():
                    print(f"   • {line.strip()}")

    print(f"📁 Working directory: {os.getcwd()}")
    print("🎯 Ready to run the NER benchmark notebook!")

else:
    print("💻 Running locally - skipping git operations")
    print("💡 Make sure you've pulled the latest changes manually if needed")

print("=" * 60)


In [None]:
# 🧪 Quick Test
# Smoke test: run the extractor(s) on a single clean sample and print the
# predictions next to the ground truth. Nothing is scored here.
print("🧪 Running quick test...")

test_sample = generator.create_clean_sample(0)
# The extractors receive the card as one newline-separated string.
test_text = "\n".join(test_sample.ocr_lines)

print("📝 Test Sample:")
print(test_text)
print("\n" + "="*50)

# Test GLiNER
print("\n🤖 GLiNER Large Results:")
gliner_results, gliner_time = benchmark.extract_with_gliner(test_text)
# Only entity types with at least one prediction are printed.
for entity_type, entities in gliner_results.items():
    if entities:
        print(f"  {entity_type}: {entities}")
print(f"⏱️ Time: {gliner_time:.4f}s")

# Test OpenAI if enabled
if RUN_OPENAI:
    print("\n🔥 OpenAI Results:")
    openai_results, openai_time = benchmark.extract_with_openai(test_text)
    for entity_type, entities in openai_results.items():
        if entities:
            print(f"  {entity_type}: {entities}")
    print(f"⏱️ Time: {openai_time:.4f}s")

    # The factor is openai_time / gliner_time, so a value below 1.0 means
    # OpenAI was actually the faster one despite the wording of the message.
    if gliner_time > 0:
        print(f"\n⚡ Speed: GLiNER is {openai_time/gliner_time:.1f}x faster")

print("\n✅ Ground Truth:")
print(f"  Name: {test_sample.ground_truth.name}")
print(f"  Company: {test_sample.ground_truth.company}")
print(f"  Email: {test_sample.ground_truth.email}")
print(f"  Phone: {test_sample.ground_truth.phone}")

print("\n✅ Quick test completed!")


In [None]:
# 📊 Generate Dataset and Run Benchmark
# Builds SAMPLE_SIZE synthetic cards, runs the extractor(s) on each one and
# collects one BenchmarkResult per sample in the global list `results`.
print(f"📊 Generating {SAMPLE_SIZE} test samples...")
test_samples = generator.generate_dataset(SAMPLE_SIZE)

print(f"✅ Generated {len(test_samples)} test samples")
# Count how many samples fall into each scenario.
scenario_counts = Counter(sample.scenario for sample in test_samples)
print(f"   📊 Scenarios: {dict(scenario_counts)}")

# Run benchmark
print(f"\n🚀 Running benchmark on {len(test_samples)} samples...")
results = []

for i, sample in enumerate(test_samples):
    # Report progress on the first sample and then on every 50th.
    if (i + 1) % 50 == 0 or i == 0:
        print(f"   📈 Progress: {i + 1}/{len(test_samples)} samples")

    # The extractors receive the card as one newline-separated string.
    text = "\n".join(sample.ocr_lines)

    # GLiNER extraction
    gliner_predictions, gliner_time = benchmark.extract_with_gliner(text)
    gliner_accuracy = benchmark.calculate_accuracy(gliner_predictions, sample.ground_truth)

    # OpenAI extraction (if enabled)
    if RUN_OPENAI:
        openai_predictions, openai_time = benchmark.extract_with_openai(text)
        openai_accuracy = benchmark.calculate_accuracy(openai_predictions, sample.ground_truth)
    else:
        # Placeholders only: these zeros are not measurements.
        openai_accuracy = {label: 0.0 for label in ENTITY_LABELS}
        openai_time = 0.0

    # Store results
    result = BenchmarkResult(
        sample_id=sample.sample_id,
        scenario=sample.scenario,
        gliner_accuracy=gliner_accuracy,
        openai_accuracy=openai_accuracy,
        gliner_time=gliner_time,
        openai_time=openai_time
    )
    results.append(result)

print(f"\n✅ Benchmark completed! Processed {len(results)} samples")


In [None]:
# 📈 FINAL BENCHMARK RESULTS ANALYSIS
print("🔥 BENCHMARK RESULTS ANALYSIS")
print("=" * 80)

# Convert to DataFrame for analysis
# Long format: one row per (sample, entity type). The per-sample timings are
# therefore repeated on every row belonging to that sample, so they must not be
# summed across rows as if each row were a separate sample.
data = []
for r in results:
    for entity_type in ENTITY_LABELS:
        data.append({
            'sample_id': r.sample_id,
            'scenario': r.scenario,
            'entity_type': entity_type,
            'gliner_accuracy': r.gliner_accuracy.get(entity_type, 0),
            'openai_accuracy': r.openai_accuracy.get(entity_type, 0),
            'gliner_time': r.gliner_time,
            'openai_time': r.openai_time
        })

df = pd.DataFrame(data)
print(f"📊 Analysis dataset: {len(df)} rows")

# Overall Performance
# Accuracy is the mean of the 0/1 scores over all (sample, entity type) rows.
# Every sample contributes the same number of rows, so the mean of the time
# column equals the mean time per sample.
print("\n🏆 OVERALL PERFORMANCE:")
gliner_overall = df['gliner_accuracy'].mean()
avg_gliner_time = df['gliner_time'].mean()

print(f"   🤖 GLiNER Large: {gliner_overall:.3f} accuracy, {avg_gliner_time:.4f}s per sample")

if RUN_OPENAI:
    openai_overall = df['openai_accuracy'].mean()
    avg_openai_time = df['openai_time'].mean()
    print(f"   🔥 OpenAI: {openai_overall:.3f} accuracy, {avg_openai_time:.4f}s per sample")

    # Winner determination
    # Decided on overall accuracy alone; speed and cost are not considered.
    if gliner_overall > openai_overall:
        diff = gliner_overall - openai_overall
        print(f"   🏆 WINNER: GLiNER Large (+{diff:.3f} accuracy advantage)")
    elif openai_overall > gliner_overall:
        diff = openai_overall - gliner_overall
        print(f"   🏆 WINNER: OpenAI (+{diff:.3f} accuracy advantage)")
    else:
        print(f"   🤝 TIE: Both models perform equally")

# Performance by Entity Type
# Mean accuracy per entity label for both extractors.
print("\n📊 PERFORMANCE BY ENTITY TYPE:")
entity_performance = df.groupby('entity_type')[['gliner_accuracy', 'openai_accuracy']].mean()

for entity in ENTITY_LABELS:
    gliner_acc = entity_performance.loc[entity, 'gliner_accuracy']
    # Status buckets: below 0.5 🔴, below 0.7 🟡, below 0.9 🟢, otherwise ✅.
    status = "🔴" if gliner_acc < 0.5 else "🟡" if gliner_acc < 0.7 else "🟢" if gliner_acc < 0.9 else "✅"

    # end="" keeps the line open so the OpenAI columns can be appended to it.
    print(f"   {entity:12}: GLiNER {gliner_acc:.3f} {status}", end="")

    if RUN_OPENAI:
        openai_acc = entity_performance.loc[entity, 'openai_accuracy']
        openai_status = "🔴" if openai_acc < 0.5 else "🟡" if openai_acc < 0.7 else "🟢" if openai_acc < 0.9 else "✅"
        winner = "GLiNER" if gliner_acc > openai_acc else "OpenAI" if openai_acc > gliner_acc else "Tie"
        print(f" | OpenAI {openai_acc:.3f} {openai_status} | Winner: {winner}")
    else:
        # Terminate the line that was left open above.
        print()

# Performance by Scenario
# Mean accuracy per scenario value present in the data.
print("\n🎭 PERFORMANCE BY SCENARIO:")
scenario_performance = df.groupby('scenario')[['gliner_accuracy', 'openai_accuracy']].mean()

for scenario in scenario_performance.index:
    gliner_acc = scenario_performance.loc[scenario, 'gliner_accuracy']
    # end="" keeps the line open so the OpenAI column can be appended to it.
    print(f"   {scenario:8}: GLiNER {gliner_acc:.3f}", end="")

    if RUN_OPENAI:
        openai_acc = scenario_performance.loc[scenario, 'openai_accuracy']
        print(f" | OpenAI {openai_acc:.3f}")
    else:
        # Terminate the line that was left open above.
        print()

# Speed Analysis
# Throughput is samples processed divided by total extraction time.
print("\n⚡ SPEED ANALYSIS:")
# Sum the per-sample timings from `results`, not from `df`: `df` holds one row
# per (sample, entity type), so summing its time column counts every sample's
# latency once per entity label and understates throughput by that factor.
total_gliner_time = sum(r.gliner_time for r in results)
throughput_gliner = len(results) / total_gliner_time if total_gliner_time > 0 else 0

print(f"   🤖 GLiNER Large: {throughput_gliner:.1f} samples/second")

if RUN_OPENAI:
    total_openai_time = sum(r.openai_time for r in results)
    throughput_openai = len(results) / total_openai_time if total_openai_time > 0 else 0

    print(f"   🔥 OpenAI: {throughput_openai:.1f} samples/second")

    # Skip the ratio when OpenAI recorded no time (avoids dividing by zero).
    if throughput_openai > 0:
        speedup = throughput_gliner / throughput_openai
        print(f"   📈 GLiNER is {speedup:.1f}x faster than OpenAI")

# Cost Analysis (if OpenAI enabled)
# The figures below are hard-coded estimates, not values measured from the run.
if RUN_OPENAI:
    print("\n💰 COST ANALYSIS (per 1000 samples):")

    # Rough OpenAI cost estimate
    openai_cost_1000 = 0.15  # Approximate cost for GPT-4o-mini
    # NOTE(review): gliner_cost_1000 is assigned but not read in the visible
    # code; the prints below hard-code "$0.00".
    gliner_cost_1000 = 0.0   # Free local model

    print(f"   🤖 GLiNER Large: $0.00 (FREE)")
    print(f"   🔥 OpenAI: ~${openai_cost_1000:.2f}")
    print(f"   💡 GLiNER saves ~${openai_cost_1000:.2f} per 1000 samples")

# 📊 VISUAL BENCHMARK CHARTS
# Draws one 2x2 figure. With OpenAI enabled the panels compare both models
# (accuracy, per-entity accuracy, throughput, cost); otherwise they show GLiNER
# alone (accuracy, per-entity accuracy, throughput, per-scenario accuracy).
# Relies on values computed by the analysis code above: gliner_overall,
# entity_performance, scenario_performance, throughput_gliner and, in
# comparison mode, openai_overall, throughput_openai and openai_cost_1000.
print("\n" + "=" * 80)
print("📊 VISUAL BENCHMARK CHARTS")
print("=" * 80)

# Set up the plotting style
plt.style.use('default')
plt.rcParams['figure.figsize'] = (15, 12)
plt.rcParams['font.size'] = 10

if RUN_OPENAI:
    # Create comprehensive comparison charts
    fig, ((ax1, ax2), (ax3, ax4)) = plt.subplots(2, 2, figsize=(16, 12))
    fig.suptitle('🔥 GLiNER vs OpenAI Benchmark Results', fontsize=16, fontweight='bold')

    # 1. Overall Accuracy Comparison
    # `models` and `colors` are reused by the speed and cost panels below.
    models = ['GLiNER Large', 'OpenAI GPT-4o-mini']
    accuracies = [gliner_overall, openai_overall]
    colors = ['#2E8B57', '#FF6B35']

    bars1 = ax1.bar(models, accuracies, color=colors, alpha=0.8, edgecolor='black', linewidth=1)
    ax1.set_title('🏆 Overall Accuracy Comparison', fontweight='bold', pad=20)
    ax1.set_ylabel('Accuracy Score')
    ax1.set_ylim(0, 1)
    ax1.grid(axis='y', alpha=0.3)

    # Add value labels on bars
    for bar, acc in zip(bars1, accuracies):
        height = bar.get_height()
        ax1.text(bar.get_x() + bar.get_width()/2., height + 0.01,
                f'{acc:.3f}', ha='center', va='bottom', fontweight='bold')

    # 2. Entity Type Performance
    # Grouped bars: GLiNER to the left of each tick, OpenAI to the right.
    entities = entity_performance.index
    gliner_accs = entity_performance['gliner_accuracy'].values
    openai_accs = entity_performance['openai_accuracy'].values

    x = np.arange(len(entities))
    width = 0.35

    bars2 = ax2.bar(x - width/2, gliner_accs, width, label='GLiNER Large', 
                    color='#2E8B57', alpha=0.8, edgecolor='black', linewidth=1)
    bars3 = ax2.bar(x + width/2, openai_accs, width, label='OpenAI GPT-4o-mini', 
                    color='#FF6B35', alpha=0.8, edgecolor='black', linewidth=1)

    ax2.set_title('📊 Performance by Entity Type', fontweight='bold', pad=20)
    ax2.set_ylabel('Accuracy Score')
    ax2.set_xlabel('Entity Types')
    ax2.set_xticks(x)
    ax2.set_xticklabels(entities, rotation=45)
    ax2.legend()
    ax2.grid(axis='y', alpha=0.3)
    ax2.set_ylim(0, 1)

    # Add value labels
    for bars in [bars2, bars3]:
        for bar in bars:
            height = bar.get_height()
            ax2.text(bar.get_x() + bar.get_width()/2., height + 0.01,
                    f'{height:.2f}', ha='center', va='bottom', fontsize=8)

    # 3. Speed Comparison
    speeds = [throughput_gliner, throughput_openai]
    bars4 = ax3.bar(models, speeds, color=colors, alpha=0.8, edgecolor='black', linewidth=1)
    ax3.set_title('⚡ Speed Comparison (Samples/Second)', fontweight='bold', pad=20)
    ax3.set_ylabel('Throughput (samples/sec)')
    ax3.grid(axis='y', alpha=0.3)

    # Label offset is relative (2% above the bar) because this axis has no
    # fixed upper limit.
    for bar, speed in zip(bars4, speeds):
        height = bar.get_height()
        ax3.text(bar.get_x() + bar.get_width()/2., height * 1.02,
                f'{speed:.1f}', ha='center', va='bottom', fontweight='bold')

    # 4. Cost Analysis
    # GLiNER's cost is hard-coded to 0.0; the OpenAI figure is the estimate
    # defined in the cost analysis above.
    costs = [0.0, openai_cost_1000]
    bars5 = ax4.bar(models, costs, color=colors, alpha=0.8, edgecolor='black', linewidth=1)
    ax4.set_title('💰 Cost per 1000 Samples (USD)', fontweight='bold', pad=20)
    ax4.set_ylabel('Cost ($)')
    ax4.grid(axis='y', alpha=0.3)

    for bar, cost in zip(bars5, costs):
        height = bar.get_height()
        ax4.text(bar.get_x() + bar.get_width()/2., height + 0.005,
                f'${cost:.2f}', ha='center', va='bottom', fontweight='bold')

else:
    # GLiNER-only visualization
    fig, ((ax1, ax2), (ax3, ax4)) = plt.subplots(2, 2, figsize=(16, 12))
    fig.suptitle('🤖 GLiNER Large Performance Analysis', fontsize=16, fontweight='bold')

    # 1. Overall Performance
    ax1.bar(['GLiNER Large'], [gliner_overall], color='#2E8B57', alpha=0.8, 
            edgecolor='black', linewidth=1)
    ax1.set_title('🏆 Overall Accuracy', fontweight='bold', pad=20)
    ax1.set_ylabel('Accuracy Score')
    ax1.set_ylim(0, 1)
    ax1.grid(axis='y', alpha=0.3)
    ax1.text(0, gliner_overall + 0.02, f'{gliner_overall:.3f}', 
            ha='center', va='bottom', fontweight='bold')

    # 2. Entity Type Performance
    entities = entity_performance.index
    gliner_accs = entity_performance['gliner_accuracy'].values

    # One colour per entity type; the list has four entries, matching the four
    # configured entity labels.
    colors_entities = ['#FF6B6B', '#4ECDC4', '#45B7D1', '#96CEB4']
    bars = ax2.bar(entities, gliner_accs, color=colors_entities, alpha=0.8, 
                  edgecolor='black', linewidth=1)
    ax2.set_title('📊 Performance by Entity Type', fontweight='bold', pad=20)
    ax2.set_ylabel('Accuracy Score')
    ax2.set_xlabel('Entity Types')
    ax2.tick_params(axis='x', rotation=45)
    ax2.grid(axis='y', alpha=0.3)
    ax2.set_ylim(0, 1)

    for bar, acc in zip(bars, gliner_accs):
        height = bar.get_height()
        ax2.text(bar.get_x() + bar.get_width()/2., height + 0.01,
                f'{acc:.2f}', ha='center', va='bottom', fontsize=9)

    # 3. Speed Analysis
    ax3.bar(['GLiNER Large'], [throughput_gliner], color='#2E8B57', alpha=0.8,
            edgecolor='black', linewidth=1)
    ax3.set_title('⚡ Processing Speed', fontweight='bold', pad=20)
    ax3.set_ylabel('Throughput (samples/sec)')
    ax3.grid(axis='y', alpha=0.3)
    ax3.text(0, throughput_gliner * 1.02, f'{throughput_gliner:.1f}', 
            ha='center', va='bottom', fontweight='bold')

    # 4. Scenario Performance
    scenarios = scenario_performance.index
    scenario_accs = scenario_performance['gliner_accuracy'].values

    ax4.bar(scenarios, scenario_accs, color=['#FFB6C1', '#98FB98'], alpha=0.8,
            edgecolor='black', linewidth=1)
    ax4.set_title('🎭 Performance by Scenario', fontweight='bold', pad=20)
    ax4.set_ylabel('Accuracy Score')
    ax4.set_xlabel('Scenarios')
    ax4.grid(axis='y', alpha=0.3)
    ax4.set_ylim(0, 1)

    # Value labels are placed by bar position (0, 1, ...).
    for i, (scenario, acc) in enumerate(zip(scenarios, scenario_accs)):
        ax4.text(i, acc + 0.01, f'{acc:.2f}', ha='center', va='bottom', fontsize=9)

plt.tight_layout()
plt.show()

# Final Recommendation
# In comparison mode GLiNER is recommended whenever its overall accuracy is at
# least 95% of OpenAI's; otherwise the accuracy gap is reported and the choice
# is left to the reader. In GLiNER-only mode a summary is printed instead.
print("\n🎯 FINAL RECOMMENDATION:")
if RUN_OPENAI:
    if gliner_overall >= openai_overall * 0.95:  # Within 5%
        print("   🏆 RECOMMENDATION: Use GLiNER Large")
        print("   💡 Reasons: Comparable accuracy + FREE + Faster + Privacy")
    else:
        accuracy_gap = openai_overall - gliner_overall
        print("   🤔 RECOMMENDATION: Consider your priorities")
        print(f"   📊 OpenAI has {accuracy_gap:.3f} better accuracy but costs money")
        print(f"   💰 GLiNER is free and faster but {accuracy_gap:.3f} lower accuracy")
else:
    print("   🏆 GLiNER Large Performance Summary:")
    print(f"   📊 Overall Accuracy: {gliner_overall:.3f}")
    print(f"   ⚡ Speed: {throughput_gliner:.1f} samples/second")
    print(f"   💰 Cost: FREE")
    print(f"   🔒 Privacy: Complete (local processing)")

# Closing banner; these lines are printed regardless of the results above.
print("\n" + "=" * 80)
print("✅ Benchmark analysis completed!")
print("📊 Visual charts displayed above!")
print("🚀 Ready for production deployment!")


In [None]:
# 🔄 Pull Latest Changes from GitHub (Colab Setup)
import os
import subprocess

def run_command(cmd):
    """Execute ``cmd`` through the shell and report the outcome as data.

    Args:
        cmd: Command line handed to ``subprocess.run`` with ``shell=True``.

    Returns:
        A ``(ok, stdout, stderr)`` tuple. ``ok`` is True only when the process
        exited with status 0; ``stdout`` and ``stderr`` are the captured text
        streams. If running the command raises, the tuple is
        ``(False, "", <exception message>)`` instead.
    """
    try:
        completed = subprocess.run(cmd, shell=True, capture_output=True, text=True)
    except Exception as exc:
        # Callers branch on the returned flag, so failures are never raised.
        return False, "", str(exc)

    exited_cleanly = completed.returncode == 0
    return exited_cleanly, completed.stdout, completed.stderr

print("🚀 Setting up latest version from GitHub...")

# Repository details
REPO_URL = "https://github.com/shubhamhackz/ner_benchmark.git"
REPO_NAME = "ner_benchmark"

# Check if we're in Colab: the google.colab import is used purely as a probe.
try:
    import google.colab
    IN_COLAB = True
    print("📍 Running in Google Colab")
except ImportError:
    IN_COLAB = False
    print("📍 Running locally")

if IN_COLAB:
    # Work from /content so the relative REPO_NAME path below resolves
    # against it, including when this cell is re-run from inside the checkout.
    os.chdir('/content')

    # Check if repository already exists
    if os.path.exists(REPO_NAME):
        print(f"📂 Repository '{REPO_NAME}' found - pulling latest changes...")
        os.chdir(REPO_NAME)

        # Pull latest changes
        success, stdout, stderr = run_command("git pull origin main")
        if success:
            print("✅ Successfully pulled latest changes!")
            if stdout.strip():
                print(f"📄 Git output: {stdout.strip()}")
        else:
            print(f"⚠️ Pull failed: {stderr}")
            print("🔄 Trying to reset and pull again...")
            # NOTE: a hard reset discards uncommitted changes to tracked files;
            # it is a best-effort retry, so its own result is not checked.
            run_command("git reset --hard HEAD")
            success, stdout, stderr = run_command("git pull origin main")
            if success:
                print("✅ Successfully pulled after reset!")
            else:
                print(f"❌ Still failed: {stderr}")
    else:
        print(f"📥 Cloning repository '{REPO_NAME}'...")
        success, stdout, stderr = run_command(f"git clone {REPO_URL}")
        if success:
            print("✅ Successfully cloned repository!")
            os.chdir(REPO_NAME)
        else:
            print(f"❌ Clone failed: {stderr}")

    # A checkout is usable only if the current directory is a git repository.
    # After a failed clone we are still in /content, so this is False.
    repo_ready = os.path.exists('.git')

    # Show current status
    if repo_ready:
        success, commit_hash, _ = run_command("git rev-parse --short HEAD")
        success2, branch, _ = run_command("git rev-parse --abbrev-ref HEAD")

        if success and success2:
            print(f"📍 Current: {branch.strip()} @ {commit_hash.strip()}")

        # Show recent commits (the -3 flag already limits the output)
        success, log_output, _ = run_command("git log --oneline -3")
        if success:
            print("📋 Recent commits:")
            for line in log_output.strip().splitlines():
                if line.strip():
                    print(f"   • {line.strip()}")

    print(f"📁 Working directory: {os.getcwd()}")
    if repo_ready:
        print("🎯 Ready to run the NER benchmark notebook!")
    else:
        # Do not claim readiness when no checkout is present.
        print("⚠️ Repository is not available - resolve the errors above before running the benchmark")

else:
    print("💻 Running locally - skipping git operations")
    print("💡 Make sure you've pulled the latest changes manually if needed")

print("=" * 60)


In [None]:
# 🧪 Interactive Entity Extraction Testing
# Intro banner for the interactive cell, emitted as one newline-separated print.
print(
    "🧪 INTERACTIVE ENTITY EXTRACTION TESTING",
    "=" * 80,
    "💡 Test the models with your own text!",
    "📝 Enter any text and see both GLiNER and OpenAI extract entities",
    "🏷️ Entities: person, email, phone, organization",
    "=" * 80,
    sep="\n",
)

def format_extraction_results(results, model_name, extraction_time):
    """Print one model's extracted entities as a short report.

    Args:
        results: Mapping of entity type to a collection of extracted values.
            Types whose collection is empty are not printed.
        model_name: Label shown in the report header.
        extraction_time: Elapsed seconds, printed with four decimals.

    Returns:
        None. The report is written to stdout; if no entity type has any
        values, a single "no entities found" line is printed instead.
    """
    icon_by_type = {"person": "👤", "email": "📧", "phone": "📞", "organization": "🏢"}

    print(f"\n🤖 {model_name} Results (⏱️ {extraction_time:.4f}s):")
    print("─" * 50)

    printed_rows = 0
    for label, values in results.items():
        if not values:
            continue
        quoted_values = ", ".join(f"'{value}'" for value in values)
        # Entity types without a dedicated icon fall back to a generic tag.
        marker = icon_by_type.get(label, "🏷️")
        print(f"   {marker} {label.title()}: {quoted_values}")
        printed_rows += 1

    if printed_rows == 0:
        print("   ❌ No entities found")

def interactive_test():
    """Run an interactive read-extract-print loop over user-supplied text.

    Repeatedly reads a line with ``input``. Entering 'quit', 'exit', 'q' or an
    empty line ends the loop; input shorter than three characters is rejected
    and the prompt repeats. Each accepted text is passed to
    ``benchmark.extract_with_gliner`` and, when the module-level
    ``RUN_OPENAI`` flag is true, to ``benchmark.extract_with_openai``. Results
    are printed with ``format_extraction_results``.

    A failure in either extraction is reported on stdout and does not stop the
    loop. The speed comparison is printed only when both extractions for the
    *current* text produced a positive timing.

    Returns:
        None.
    """
    test_count = 0

    while True:
        test_count += 1
        print(f"\n🔍 TEST #{test_count}")
        print("─" * 30)

        # Get user input
        print("📝 Enter text to analyze (or 'quit' to exit):")
        user_text = input("➤ ").strip()

        if user_text.lower() in ['quit', 'exit', 'q', '']:
            print("👋 Goodbye! Thanks for testing!")
            break

        if len(user_text) < 3:
            print("⚠️ Please enter more text (at least 3 characters)")
            continue

        print(f"\n📄 Input Text:")
        print(f"   \"{user_text}\"")
        print("\n🚀 Extracting entities...")

        # Reset per iteration: if the GLiNER call fails, the comparison below
        # must not see an unbound name or a timing from a previous text.
        gliner_time = None

        # Extract with GLiNER
        try:
            gliner_results, gliner_time = benchmark.extract_with_gliner(user_text)
            format_extraction_results(gliner_results, "GLiNER Large", gliner_time)
        except Exception as e:
            print(f"❌ GLiNER extraction failed: {e}")

        # Extract with OpenAI (if enabled)
        if RUN_OPENAI:
            try:
                openai_results, openai_time = benchmark.extract_with_openai(user_text)
                format_extraction_results(openai_results, "OpenAI GPT-4o-mini", openai_time)

                # Speed comparison - skipped when GLiNER produced no timing
                if gliner_time is not None and gliner_time > 0 and openai_time > 0:
                    speedup = openai_time / gliner_time
                    if speedup >= 1:
                        print(f"\n⚡ Speed: GLiNER is {speedup:.1f}x faster than OpenAI")
                    else:
                        # Report the winner truthfully when OpenAI was quicker
                        print(f"\n⚡ Speed: OpenAI is {1 / speedup:.1f}x faster than GLiNER")

            except Exception as e:
                print(f"❌ OpenAI extraction failed: {e}")
        else:
            print(f"\n💡 OpenAI comparison disabled - running GLiNER only mode")

        print("\n" + "="*50)

# Sample inputs shown to the user before the prompt loop starts
print("\n💡 Example test cases you can try:")
examples = [
    "Dr. Sarah Johnson from TechCorp Inc. Contact: sarah.j@techcorp.com or +1-555-0123",
    "Michael Brown, Senior Developer at Innovation Labs. Email: m.brown@innolabs.org Phone: 555-0987",
    "Contact Lisa Wilson at Future Systems (lisa@future-sys.net) for support. Call 555-1234.",
    "John Smith works at Global Dynamics. Reach him at john.smith@globaldyn.com or 555-5678",
]

for i, example in enumerate(examples, start=1):
    print(f"   {i}. {example}")

print("\n🎯 Choose one of the examples above, or enter your own text!")

# Launch the prompt loop. Ctrl-C ends it quietly; any other error is reported
# together with the manual calls that remain available.
try:
    interactive_test()
except KeyboardInterrupt:
    print("\n\n🛑 Testing interrupted by user")
except Exception as exc:
    print(
        f"\n❌ Error during interactive testing: {exc}",
        "💡 You can still run individual extractions manually using:",
        "   benchmark.extract_with_gliner('your text here')",
        sep="\n",
    )
    if RUN_OPENAI:
        print("   benchmark.extract_with_openai('your text here')")

print("\n✅ Interactive testing session completed!")
