# **Task 4**

## **Import Libraries**

In [None]:
import kagglehub
import os
import pandas as pd

In [None]:
!pip install -q spacy
!python -m spacy download en_core_web_sm

Collecting en-core-web-sm==3.8.0
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.8.0/en_core_web_sm-3.8.0-py3-none-any.whl (12.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m12.8/12.8 MB[0m [31m80.5 MB/s[0m eta [36m0:00:00[0m
[?25h[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_sm')
[38;5;3m⚠ Restart to reload dependencies[0m
If you are in a Jupyter or Colab notebook, you may need to restart Python in
order to load all the package's dependencies. You can do this by selecting the
'Restart kernel' or 'Restart runtime' option.


In [None]:
!pip install -q spacy
!python -m spacy download en_core_web_sm
!python -m spacy download en_core_web_md

import spacy
from spacy import displacy

In [None]:
import spacy
nlp = spacy.load("en_core_web_sm")

In [None]:
from IPython.display import display, HTML

## **Load the data and Exploration**

In [None]:
path = kagglehub.dataset_download("alaakhaled/conll003-englishversion")

print("Path to dataset files:", path)
print("Files in directory:", os.listdir(path))


Downloading from https://www.kaggle.com/api/v1/datasets/download/alaakhaled/conll003-englishversion?dataset_version_number=1...


100%|██████████| 960k/960k [00:00<00:00, 30.6MB/s]

Extracting files...
Path to dataset files: /root/.cache/kagglehub/datasets/alaakhaled/conll003-englishversion/versions/1
Files in directory: ['train.txt', 'test.txt', 'metadata', 'valid.txt']





In [None]:
# Load CoNLL-2003 train set
file_path = os.path.join(path, 'train.txt')

In [None]:
# Parse CoNLL-2003 file
data = []
with open(file_path, 'r') as f:
    sentence = []
    for line in f:
        line = line.strip()
        if line:
            parts = line.split()
            if len(parts) == 4:
                word, pos, chunk, ner = parts
                sentence.append((word, ner))
        else:
            if sentence:
                data.append(sentence)
                sentence = []
    if sentence:
        data.append(sentence)

print(f"\nLoaded {len(data)} sentences.")



Loaded 14987 sentences.


## **Rule-based NER**

In [None]:
def rule_based_ner(sentence):
    entities = []
    for i, (word, ner) in enumerate(sentence):
        # Rule: Capitalized words (not first in sentence) are entities
        if word[0].isupper() and i != 0:
            entities.append((word, "POTENTIAL_ENTITY"))
    return entities

processed_data = []
for sentence in data:
    processed_sentence = []
    for word, ner in sentence:
        processed_sentence.append((word, ner))
    processed_data.append(processed_sentence)


In [None]:
print("\nRule-based NER results (first 2 sentences):")
for i, sent in enumerate(data[:5]):
    print(f"Sentence {i+1}: {rule_based_ner(sent)}")



Rule-based NER results (first 2 sentences):
Sentence 1: []
Sentence 2: [('German', 'POTENTIAL_ENTITY'), ('British', 'POTENTIAL_ENTITY')]
Sentence 3: [('Blackburn', 'POTENTIAL_ENTITY')]
Sentence 4: []
Sentence 5: [('European', 'POTENTIAL_ENTITY'), ('Commission', 'POTENTIAL_ENTITY'), ('Thursday', 'POTENTIAL_ENTITY'), ('German', 'POTENTIAL_ENTITY'), ('British', 'POTENTIAL_ENTITY')]


## **Model-based NER (spaCy)**

In [None]:
# Function to perform model-based NER
def model_based_ner(sentence_tokens):
    text = " ".join([word for word, tag in sentence_tokens])
    doc = nlp(text)
    entities = []
    for ent in doc.ents:
        entities.append((ent.text, ent.label_))
    return entities

# Apply model-based NER to the first few processed sentences
print("Model-based NER results for the first 5 sentences:")
for i, sent in enumerate(processed_data[:5]):
    model_entities = model_based_ner(sent)
    print(f"Sentence {i+1}: {model_entities}")

Model-based NER results for the first 5 sentences:
Sentence 1: []
Sentence 2: [('EU', 'ORG'), ('German', 'NORP'), ('British', 'NORP')]
Sentence 3: [('Peter Blackburn', 'PERSON')]
Sentence 4: [('BRUSSELS', 'GPE'), ('1996-08-22', 'DATE')]
Sentence 5: [('The European Commission', 'ORG'), ('Thursday', 'DATE'), ('German', 'NORP'), ('British', 'NORP')]


## **Highlight and categorize entities**

In [None]:
def highlight_entities(sentence_tokens, entities):
    text = " ".join([word for word, tag in sentence_tokens])
    html_text = text

    # Sort entities by their order in text
    sorted_entities = sorted(entities, key=lambda x: text.find(x[0]))

    for entity_text, entity_type in sorted_entities:
        html_text = html_text.replace(
            entity_text,
            f'<mark style="background-color: #FFD700" data-entity="{entity_type}">'
            f'{entity_text} ({entity_type})</mark>'
        )
    return html_text

# Example of highlighting using model-based NER results for the first 5 sentences
print("Highlighted entities using Model-based NER:")
for i, sent in enumerate(processed_data[:5]):
    model_entities = model_based_ner(sent)
    highlighted_html = highlight_entities(sent, model_entities)
    print(f"Sentence {i+1}:")
    display(HTML(highlighted_html))

Highlighted entities using Model-based NER:
Sentence 1:


Sentence 2:


Sentence 3:


Sentence 4:


Sentence 5:


# **Bouns**

In [None]:
# Load two spaCy models
nlp_sm = spacy.load("en_core_web_sm")
nlp_md = spacy.load("en_core_web_md")

In [None]:
def model_based_ner(sentence_tokens, nlp_model):
    text = " ".join([word for word, tag in sentence_tokens])
    doc = nlp_model(text)
    return doc

In [None]:
# Pick first 2 sentences for visualization
sample_sentences = data[:2]

print("\nModel-based NER comparison:")

for i, sent in enumerate(sample_sentences):
    print(f"\n----- Sentence {i+1} -----")
    text = " ".join([w for w, t in sent])
    print("Text:", text)

    # Small model
    doc_sm = model_based_ner(sent, nlp_sm)
    print("Entities (sm):", [(ent.text, ent.label_) for ent in doc_sm.ents])

    # Medium model
    doc_md = model_based_ner(sent, nlp_md)
    print("Entities (md):", [(ent.text, ent.label_) for ent in doc_md.ents])

    # Visualize with displaCy
    print("\nVisualization (sm):")
    displacy.render(doc_sm, style="ent", jupyter=True)
    print("\nVisualization (md):")
    displacy.render(doc_md, style="ent", jupyter=True)


Model-based NER comparison:

----- Sentence 1 -----
Text: -DOCSTART-
Entities (sm): []
Entities (md): [('-DOCSTART-', 'ORG')]

Visualization (sm):



Visualization (md):



----- Sentence 2 -----
Text: EU rejects German call to boycott British lamb .
Entities (sm): [('EU', 'ORG'), ('German', 'NORP'), ('British', 'NORP')]
Entities (md): [('EU', 'ORG'), ('German', 'NORP'), ('British', 'NORP')]

Visualization (sm):



Visualization (md):
