In [1]:
!pip install spacy pandas




In [2]:
import pandas as pd

# Read the raw NER dataset (path is relative to the current working directory)
df = pd.read_csv(filepath_or_buffer="ner_large_unprocessed_dataset.csv")

# Preview: first rows, then the column index, each on its own line
print(df.head(), df.columns, sep="\n")


                                            Sentence  \
0  Kindle was launched by Microsoft at the World ...   
1          Sarah visited Mumbai during the Olympics.   
2  The Last Supper is displayed at a museum in Pa...   
3                  Olivia donated €1 billion to WHO.   
4  The Sydney Opera House is a famous monument in...   

                                            Entities  
0  [(23, 32, 'ORG'), (0, 6, 'PRODUCT'), (40, 49, ...  
1  [(0, 5, 'PER'), (14, 20, 'LOC'), (32, 40, 'EVE...  
2          [(44, 49, 'LOC'), (0, 15, 'WORK_OF_ART')]  
3  [(0, 6, 'PER'), (29, 32, 'ORG'), (15, 25, 'MON...  
4  [(47, 52, 'LOC'), (4, 10, 'LOC'), (4, 22, 'FAC...  
Index(['Sentence', 'Entities'], dtype='object')


In [3]:
import ast

# Convert the 'Entities' column from its string form into real Python lists of
# (start, end, label) tuples. ast.literal_eval only evaluates literals, so no
# arbitrary code from the CSV is executed.
# The isinstance guard keeps this cell idempotent: once the column has been
# parsed, re-running the cell leaves the lists alone instead of raising
# ValueError from literal_eval being handed a non-string.
df['Entities'] = df['Entities'].apply(
    lambda value: ast.literal_eval(value) if isinstance(value, str) else value
)

# Check example
print(df.iloc[0])


Sentence    Kindle was launched by Microsoft at the World ...
Entities    [(23, 32, ORG), (0, 6, PRODUCT), (40, 49, EVENT)]
Name: 0, dtype: object


In [4]:
import spacy
from spacy.tokens import DocBin

nlp = spacy.blank("en")  # blank English pipeline
doc_bin = DocBin()

# Convert every (Sentence, Entities) row into a spaCy Doc with entity spans and
# collect the docs in a DocBin for serialization.
for i, row in df.iterrows():
    text = row['Sentence']
    entities = row['Entities']

    doc = nlp.make_doc(text)
    ents = []   # accepted (start, end, label) tuples, used for the overlap check
    spans = []  # the Span objects that were validated and will become doc.ents

    # Process entities in order of start offset, so that on overlap the
    # earlier-starting annotation is the one that is kept
    entities = sorted(entities, key=lambda x: x[0])

    for start, end, label in entities:
        # Two character ranges overlap when each one starts before the other ends
        is_overlapping = any(
            start < existing_end and end > existing_start
            for existing_start, existing_end, _ in ents
        )
        if is_overlapping:
            print(f"⚠️ Skipping overlapping span: {(start, end, label)} in '{text}'")
            continue

        # "contract" snaps misaligned character offsets inward to token
        # boundaries; None means no token lies fully inside the range
        span = doc.char_span(start, end, label=label, alignment_mode="contract")
        if span is None:
            print(f"⚠️ Skipping invalid span: {(start, end, label)} in '{text}'")
            continue

        ents.append((start, end, label))
        spans.append(span)

    # Assign the spans that were validated above. Re-creating them with the
    # default (strict) alignment would return None for any span that only
    # aligned in "contract" mode and make the doc.ents assignment fail.
    doc.ents = spans
    doc_bin.add(doc)

# Save to file
doc_bin.to_disk("data.spacy")


⚠️ Skipping overlapping span: (4, 22, 'FACILITY') in 'The Sydney Opera House is a famous monument in Paris.'
⚠️ Skipping overlapping span: (4, 22, 'FACILITY') in 'The Sydney Opera House is a famous monument in Cairo.'
⚠️ Skipping overlapping span: (24, 26, 'ORG') in 'Michael donated $100 to UNICEF.'
⚠️ Skipping overlapping span: (23, 25, 'ORG') in 'iPhone was launched by UNICEF at the Super Bowl.'
⚠️ Skipping overlapping span: (4, 22, 'FACILITY') in 'The Sydney Opera House is a famous monument in Berlin.'
⚠️ Skipping overlapping span: (0, 2, 'ORG') in 'UNICEF announced a new iPhone on January 1st.'
⚠️ Skipping overlapping span: (29, 31, 'ORG') in 'Olivia donated $5 million to UNICEF.'
⚠️ Skipping overlapping span: (28, 30, 'ORG') in 'PlayStation was launched by UNICEF at the Comic-Con.'
⚠️ Skipping overlapping span: (23, 25, 'ORG') in 'iPhone was launched by UNICEF at the Super Bowl.'
⚠️ Skipping overlapping span: (23, 25, 'ORG') in 'Olivia donated €200 to UNICEF.'
⚠️ Skipping overlapp

In [5]:
import spacy
from spacy.tokens import DocBin
from sklearn.model_selection import train_test_split

# Train-test split (80% train / 20% dev). The fixed random_state makes the
# shuffle deterministic, so the same rows land in each split on every
# Restart & Run All.
train, dev = train_test_split(df, test_size=0.2, random_state=42)

# Initialize spaCy blank model
nlp = spacy.blank("en")

def create_spacy_file(df, filename):
    """Serialize annotated sentences to a spaCy DocBin file.

    Each row's "Sentence" is tokenized with the module-level ``nlp`` pipeline
    and its "Entities" (an iterable of ``(start, end, label)`` character-offset
    tuples) are attached as entity spans. Spans that overlap an already
    accepted span, or that cannot be aligned to token boundaries, are skipped
    and reported on stdout.

    Args:
        df: DataFrame with "Sentence" and "Entities" columns.
        filename: Destination path passed to ``DocBin.to_disk``.

    Returns:
        None. The DocBin is written to ``filename``.
    """
    doc_bin = DocBin()
    for i, row in df.iterrows():
        text = row["Sentence"]
        entities = row["Entities"]

        doc = nlp.make_doc(text)
        ents = []   # accepted (start, end, label) tuples, used for the overlap check
        spans = []  # the validated Span objects that will become doc.ents

        # Process entities in order of start offset, so that on overlap the
        # earlier-starting annotation is the one that is kept
        entities = sorted(entities, key=lambda x: x[0])

        for start, end, label in entities:
            # Two character ranges overlap when each starts before the other ends
            if any(
                start < existing_end and end > existing_start
                for existing_start, existing_end, _ in ents
            ):
                print(f"⚠️ Skipping overlapping span: {(start, end, label)} in '{text}'")
                continue

            # "contract" snaps misaligned character offsets inward to token
            # boundaries; a falsy result means nothing usable was found
            span = doc.char_span(start, end, label=label, alignment_mode="contract")
            if not span:
                print(f"⚠️ Skipping invalid span: {(start, end, label)} in '{text}'")
                continue

            ents.append((start, end, label))
            spans.append(span)

        # Assign the spans validated above. Re-creating them with the default
        # (strict) alignment would return None for any span that only aligned
        # in "contract" mode and make the doc.ents assignment fail.
        doc.ents = spans
        doc_bin.add(doc)

    doc_bin.to_disk(filename)

# Serialize each split to its own DocBin file: training data first, then dev
create_spacy_file(df=train, filename="train.spacy")
create_spacy_file(df=dev, filename="dev.spacy")


⚠️ Skipping overlapping span: (0, 2, 'ORG') in 'UNICEF raised €200 in April.'
⚠️ Skipping overlapping span: (30, 36, 'NATIONALITY') in 'Alice wrote about Guernica in French.'
⚠️ Skipping overlapping span: (27, 29, 'ORG') in 'John donated €1 billion to UNICEF.'
⚠️ Skipping overlapping span: (12, 18, 'NATIONALITY') in 'David is an French citizen working in Paris.'
⚠️ Skipping overlapping span: (0, 2, 'ORG') in 'UNICEF announced a new iPhone on December 2024.'
⚠️ Skipping overlapping span: (0, 2, 'ORG') in 'UNICEF announced a new iPhone on January 1st.'
⚠️ Skipping overlapping span: (0, 2, 'ORG') in 'UNICEF announced a new iPhone on April.'
⚠️ Skipping overlapping span: (29, 31, 'ORG') in 'Robert donated €1 billion to UNICEF.'
⚠️ Skipping overlapping span: (13, 19, 'NATIONALITY') in 'Sophia is an French citizen working in Tokyo.'
⚠️ Skipping overlapping span: (29, 35, 'NATIONALITY') in 'Liam wrote about Guernica in French.'
⚠️ Skipping overlapping span: (0, 2, 'ORG') in 'UNICEF announced 

In [7]:
import spacy

# Load the pipeline stored under output/model-best
# NOTE(review): presumably produced by a training run that is not shown in
# this notebook — confirm the directory exists before running this cell.
nlp = spacy.load("output/model-best")

# Run the loaded pipeline on a sentence that is not part of the dataset
doc = nlp("Apple is looking at buying U.K. startup for $1 billion")

# One line per predicted entity: "<entity text> <label>"
for ent in doc.ents:
    print(f"{ent.text} {ent.label_}")


Apple ORG
U.K. NATIONALITY
$1 billion MONEY


In [8]:
import spacy
# Load the pipeline registered under the name "en_pipeline"
# NOTE(review): presumably an installed package or local directory of the
# trained model — confirm it is available in this environment.
nlp = spacy.load(name="en_pipeline")
