In [1]:
# ============================================================
#  Named Entity Recognition from Dataset Files — SpaCy
# ============================================================
# Tasks:
# 1️⃣ Load CoNLL-style dataset files from Google Drive
# 2️⃣ Convert them into readable sentences
# 3️⃣ Perform NER using two spaCy models (sm + trf)
# 4️⃣ Visualize & compare entity extraction
# ------------------------------------------------------------

# --- 1. Install & Import Libraries ---
# The "!" lines are notebook shell escapes: they run in IPython/Colab only,
# not under a plain Python interpreter.
# NOTE(review): the recorded download output warns "Restart to reload
# dependencies" — a runtime restart may be needed before spacy.load() works.
!pip install -q spacy pandas
!python -m spacy download en_core_web_sm
!python -m spacy download en_core_web_trf

import pandas as pd
import spacy
from spacy import displacy
from IPython.display import display, HTML

# --- 2. Mount Google Drive ---
# Requires the Colab runtime (google.colab). The dataset paths defined below
# all live under this '/content/drive' mount point.
from google.colab import drive
drive.mount('/content/drive')

# --- 3. Dataset Paths ---
# NOTE(review): metadata_path is not read anywhere in the code visible here;
# only the three split files (train/test/valid) are loaded.
metadata_path = '/content/drive/MyDrive/Datasets/metadata'
train_path = '/content/drive/MyDrive/Datasets/train.txt'
test_path = '/content/drive/MyDrive/Datasets/test.txt'
valid_path = '/content/drive/MyDrive/Datasets/valid.txt'

# --- 4. Function: Convert CoNLL-like data to sentences ---
def load_and_clean_conll(path):
    """Read a CoNLL-style file and return its sentences as plain strings.

    Each non-blank line contributes its first whitespace-separated column
    (the token). A blank line or a line starting with "-DOCSTART-" ends the
    sentence in progress; tokens of a sentence are joined by single spaces.

    Args:
        path: Location of the UTF-8 encoded text file to read.

    Returns:
        A list of sentence strings in file order. Empty sentences (for
        example, consecutive blank lines) are not emitted.
    """
    collected = []
    tokens = []
    with open(path, 'r', encoding='utf-8') as handle:
        for raw in handle:
            stripped = raw.strip()
            is_boundary = stripped == "" or stripped.startswith("-DOCSTART-")
            if not is_boundary:
                # A stripped, non-empty line always has a first column.
                tokens.append(stripped.split()[0])
                continue
            if tokens:
                collected.append(" ".join(tokens))
                tokens = []
    # Flush the last sentence when the file does not end with a blank line.
    if tokens:
        collected.append(" ".join(tokens))
    return collected

# --- 5. Load and Clean Data ---
# Each split becomes a list of sentence strings (tokens joined by spaces).
train = load_and_clean_conll(train_path)
test = load_and_clean_conll(test_path)
valid = load_and_clean_conll(valid_path)

# Combine datasets for NER
# Concatenation order is train, test, valid, so any slice taken from the
# front of all_texts is drawn from the train split first.
all_texts = train + test + valid

# --- 6. Load SpaCy Models ---
nlp_small = spacy.load("en_core_web_sm")   # Small trained statistical pipeline (not rule-based)
nlp_trf   = spacy.load("en_core_web_trf")  # Transformer-based model

# --- 7. Function to Extract & Visualize Entities ---
def extract_entities(texts, nlp_model, model_name="Model"):
    """Run a spaCy pipeline over ``texts`` and tabulate the entities found.

    Prints a banner naming the model, renders the highlighted entities of the
    first two documents (only those that contain at least one entity), shows
    the first five rows of the result table, and returns the full table.

    Args:
        texts: Sequence of input strings. It is indexed by position
            (``texts[i]``), so it must be a list-like, not a bare iterator.
        nlp_model: Loaded spaCy pipeline; its ``pipe`` method is used with a
            batch size of 20.
        model_name: Label used only in the printed banner.

    Returns:
        A pandas DataFrame with one row per input text and two columns:
        "Text" (the input, cut to 200 characters plus "..." when longer) and
        "Entities" (a list of ``(entity_text, entity_label)`` tuples). The
        columns are present even when ``texts`` is empty.
    """
    data = []
    print(f"\n=== Extracting entities using {model_name} ===\n")
    for i, doc in enumerate(nlp_model.pipe(texts, batch_size=20)):
        ents = [(ent.text, ent.label_) for ent in doc.ents]
        text = texts[i]
        data.append({
            "Text": text[:200] + "..." if len(text) > 200 else text,
            "Entities": ents,
        })
        # Display first 2 processed examples with highlighted entities
        if i < 2 and ents:
            # With jupyter=True, render() displays the markup itself and
            # returns None; wrapping that return value in HTML() and
            # displaying it again only emitted an empty HTML object.
            displacy.render(doc, style="ent", jupyter=True)
    # Explicit columns keep df["Entities"] valid when no texts were given.
    df = pd.DataFrame(data, columns=["Text", "Entities"])
    display(df.head(5))
    return df

# --- 8. Extract Entities (Model Comparison) ---
# Both models receive the same first 50 sentences, so the two result tables
# line up row by row.
df_small = extract_entities(all_texts[:50], nlp_small, "en_core_web_sm")   # limit to 50 for speed
df_trf   = extract_entities(all_texts[:50], nlp_trf, "en_core_web_trf")

# --- 9. Compare Entity Counts ---
def compare_entity_counts(df1, df2):
    """Print how many entities of each label the two result tables contain.

    Args:
        df1: Table for the small model; its 'Entities' column holds lists of
            ``(entity_text, entity_label)`` pairs.
        df2: Table for the transformer model, same layout as ``df1``.

    Returns:
        None. The per-label totals are printed as dicts, with labels in
        order of first appearance.
    """
    from collections import Counter

    def _tally(frame):
        # Count labels across every row's entity list.
        label_totals = Counter()
        for row_entities in frame['Entities']:
            for _text, label in row_entities:
                label_totals[label] += 1
        return dict(label_totals)

    small_totals = _tally(df1)
    trf_totals = _tally(df2)

    print("\n=== Entity Counts Comparison ===")
    print("\nSmall Model:", small_totals)
    print("\nTransformer Model:", trf_totals)

# Print per-label entity totals for the small and transformer result tables.
compare_entity_counts(df_small, df_trf)

print("\n✅ Done! Entities extracted, visualized, and compared successfully.")


Collecting en-core-web-sm==3.8.0
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.8.0/en_core_web_sm-3.8.0-py3-none-any.whl (12.8 MB)
     ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 12.8/12.8 MB 107.3 MB/s eta 0:00:00
✔ Download and installation successful
You can now load the package via spacy.load('en_core_web_sm')
⚠ Restart to reload dependencies
If you are in a Jupyter or Colab notebook, you may need to restart Python in
order to load all the package's dependencies. You can do this by selecting the
'Restart kernel' or 'Restart runtime' option.

<IPython.core.display.HTML object>

<IPython.core.display.HTML object>

Unnamed: 0,Text,Entities
0,EU rejects German call to boycott British lamb .,"[(EU, ORG), (German, NORP), (British, NORP)]"
1,Peter Blackburn,"[(Peter Blackburn, PERSON)]"
2,BRUSSELS 1996-08-22,"[(BRUSSELS, GPE), (1996-08-22, DATE)]"
3,The European Commission said on Thursday it di...,"[(The European Commission, ORG), (Thursday, DA..."
4,Germany 's representative to the European Unio...,"[(Germany, GPE), (the European Union 's, ORG),..."



=== Extracting entities using en_core_web_trf ===



<IPython.core.display.HTML object>

<IPython.core.display.HTML object>

Unnamed: 0,Text,Entities
0,EU rejects German call to boycott British lamb .,"[(EU, ORG), (German, NORP), (British, NORP)]"
1,Peter Blackburn,"[(Peter Blackburn, PERSON)]"
2,BRUSSELS 1996-08-22,"[(BRUSSELS, GPE), (1996-08-22, DATE)]"
3,The European Commission said on Thursday it di...,"[(The European Commission, ORG), (Thursday, DA..."
4,Germany 's representative to the European Unio...,"[(Germany, GPE), (the European Union 's, ORG),..."



=== Entity Counts Comparison ===

Small Model: {'ORG': 22, 'NORP': 20, 'PERSON': 24, 'GPE': 36, 'DATE': 29, 'LOC': 3, 'CARDINAL': 9, 'QUANTITY': 1, 'PERCENT': 5, 'MONEY': 6, 'LANGUAGE': 1, 'TIME': 1, 'ORDINAL': 1}

Transformer Model: {'ORG': 21, 'NORP': 20, 'PERSON': 24, 'GPE': 38, 'DATE': 28, 'LOC': 2, 'CARDINAL': 9, 'QUANTITY': 1, 'PERCENT': 6, 'MONEY': 6, 'WORK_OF_ART': 1, 'TIME': 2}

✅ Done! Entities extracted, visualized, and compared successfully.
