In [1]:
import pandas as pd
import spacy
import tensorflow as tf
import pickle
import re
from tqdm import tqdm
from spacy.training.example import Example
from spacy.pipeline.ner import EntityRecognizer
import random

In [2]:
# Load the NER dataset (one word per row, as shown by df.head below); the file is decoded as latin-1.
df = pd.read_csv("ner_datasetreference.csv",encoding='latin-1')

In [3]:
# Forward-fill missing values down every column so each word row carries a "Sentence #".
# NOTE(review): presumably only the first row of each sentence has "Sentence #" set in the
# raw CSV - confirm. Because ffill is applied to all columns, a missing Word/POS/Tag value
# would also inherit the value from the previous row.
df = df.ffill(axis = 0)
df.head(30)

Unnamed: 0,Sentence #,Word,POS,Tag
0,Sentence: 1,Thousands,NNS,O
1,Sentence: 1,of,IN,O
2,Sentence: 1,demonstrators,NNS,O
3,Sentence: 1,have,VBP,O
4,Sentence: 1,marched,VBN,O
5,Sentence: 1,through,IN,O
6,Sentence: 1,London,NNP,B-geo
7,Sentence: 1,to,TO,O
8,Sentence: 1,protest,VB,O
9,Sentence: 1,the,DT,O


In [4]:
# List the distinct values found in the Tag column.
df['Tag'].unique()

array(['O', 'B-geo', 'B-gpe', 'B-per', 'I-geo', 'B-org', 'I-org', 'B-tim',
       'B-art', 'I-art', 'I-per', 'I-gpe', 'I-tim', 'B-nat', 'B-eve',
       'I-eve', 'I-nat'], dtype=object)

In [5]:
# Goal: build one (text, annotations) pair per sentence, with character offsets into the text, e.g.
#("Apple is looking to buy a startup called XYZ Corp in New York.", {"entities": [(0, 5, "ORG"), (41, 49, "ORG"), (53, 61, "GPE")]}),
# Collect the distinct sentence identifiers that sentence_entities will be called with.
unique_sent_num =  df['Sentence #'].unique()

In [6]:
def sentence_entities(id, data=None):
    """Build a ``(text, {"entities": [...]})`` pair for one sentence.

    The rows whose ``'Sentence #'`` equals ``id`` are selected, every word is
    lower-cased, and each word that can be found as a whole word (regex ``\\b``
    on both sides) in the remaining, period-stripped sentence is kept. Words
    that cannot be matched that way (e.g. bare punctuation such as ``,`` or
    tokens containing a period) are dropped from the output.

    Character offsets are computed against the *returned* text, so
    ``text[start:end]`` is always exactly the word the tag belongs to, even
    when earlier tokens of the sentence were dropped.

    Parameters
    ----------
    id : value compared against the ``'Sentence #'`` column.
        (The name shadows the builtin ``id``; it is kept for compatibility.)
    data : pandas.DataFrame, optional
        Frame with ``'Sentence #'``, ``'Word'`` and ``'Tag'`` columns.
        Defaults to the module-level ``df``.

    Returns
    -------
    tuple
        ``(text, {"entities": [(start, end, tag), ...]})`` with one entry per
        kept word, in sentence order. Every tag is emitted, including ``'O'``.
    """
    frame = df if data is None else data
    rows = frame[frame['Sentence #'] == id].reset_index(drop=True)

    # Search space used only to decide which words are kept; periods are removed
    # so the sentence-final "." (and any token containing one) never matches.
    remaining = " ".join(rows['Word']).lower().replace(".", "").strip()

    kept_words = []
    entities = []
    cursor = 0  # start offset of the next kept word inside the returned text
    for raw_word, tag in zip(rows['Word'], rows['Tag']):
        word = raw_word.lower()
        match = re.search(r'\b' + re.escape(word) + r'\b', remaining)
        if not match:
            continue
        start, end = cursor, cursor + len(word)
        entities.append((start, end, tag))
        kept_words.append(word)
        cursor = end + 1  # +1 for the single space inserted by " ".join below
        remaining = remaining[match.end():]

    return (" ".join(kept_words), {'entities': entities})

In [7]:
# Set `create` to True to rebuild the (text, annotations) pairs from `df` and
# refresh the cache file; leave it False to reuse the pickle from an earlier run.
# Note: the name `input` shadows the builtin; it is kept because later cells use it.
create=False
if create:
    input = [sentence_entities(i) for i in tqdm(unique_sent_num)]
    with open('ner_input_sep14.pkl', 'wb') as file:
        pickle.dump(input, file)
else:
    # WARNING: pickle.load can execute arbitrary code - only load a file you created yourself.
    # The context manager guarantees the handle is closed after loading.
    with open("ner_input_sep14.pkl", 'rb') as file:
        input = pickle.load(file)

These labels follow the IOB tagging scheme: the prefixes "B-" and "I-" mark the beginning and the inside of an entity, respectively, while "O" marks a token that is outside any entity. Here's what each of the prefixed labels represents:

B-geo: Beginning of a geographical entity.
B-gpe: Beginning of a geopolitical entity.
B-per: Beginning of a person's name.
I-geo: Inside a geographical entity.
B-org: Beginning of an organization's name.
I-org: Inside an organization's name.
B-tim: Beginning of a time or date expression.
B-art: Beginning of an artifact (e.g., works of art).
I-art: Inside an artifact.
I-per: Inside a person's name.
I-gpe: Inside a geopolitical entity.
I-tim: Inside a time or date expression.
B-nat: Beginning of a natural phenomenon.
B-eve: Beginning of an event.
I-eve: Inside an event.
I-nat: Inside a natural phenomenon.

This labeling scheme allows you to represent and recognize a wide range of named entities with different roles and structures within text data.

In [8]:
# Step 2: start from a blank English spaCy pipeline
nlp = spacy.blank("en")

# Step 3: add an "ner" component to the pipeline and keep a handle on it
ner = nlp.add_pipe("ner")

# Register the dataset's tags as entity labels.
# The raw IOB tags are used verbatim as label names, and 'O' is registered too.
labels = [
    'O',
    'B-geo', 'B-gpe', 'B-per', 'I-geo',
    'B-org', 'I-org',
    'B-tim',
    'B-art', 'I-art',
    'I-per', 'I-gpe', 'I-tim',
    'B-nat',
    'B-eve', 'I-eve',
    'I-nat',
]
for label in labels:
    ner.add_label(label)

In [9]:
# Step 4: Data Preprocessing
def preprocess_data(data):
    """Turn ``(text, annotations)`` pairs into spaCy ``Example`` objects.

    Each text is tokenized with ``nlp.make_doc`` and paired with the
    ``'entities'`` list taken from its annotations dict. Progress is shown
    with ``tqdm``. Returns the examples as a list, in input order.
    """
    return [
        Example.from_dict(nlp.make_doc(sentence), {"entities": annotations['entities']})
        for sentence, annotations in tqdm(data)
    ]

In [10]:
# Number of (text, annotations) pairs available.
len(input)

47959

In [11]:
# Keep only the first 2000 pairs; this rebinds `input`, so the full list must be reloaded to get it back.
input = input[0:2000]

In [12]:
# Step 5: Convert the (text, annotations) pairs into spaCy Example objects.
# Note: no train/validation split is made here - every converted example goes into train_data.
train_data = preprocess_data(input)

100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████| 2000/2000 [00:02<00:00, 791.28it/s]


In [13]:
# Inspect the first converted Example to check how the tags were aligned to tokens.
train_data[0]

{'doc_annotation': {'cats': {}, 'entities': ['U-O', 'U-O', 'U-O', 'U-O', 'U-O', 'U-O', 'U-B-geo', 'U-O', 'U-O', 'U-O', 'U-O', 'U-O', 'U-B-geo', 'U-O', 'U-O', 'U-O', 'U-O', 'U-O', 'U-B-gpe', 'U-O', 'U-O', 'U-O', 'U-O'], 'spans': {}, 'links': {}}, 'token_annotation': {'ORTH': ['thousands', 'of', 'demonstrators', 'have', 'marched', 'through', 'london', 'to', 'protest', 'the', 'war', 'in', 'iraq', 'and', 'demand', 'the', 'withdrawal', 'of', 'british', 'troops', 'from', 'that', 'country'], 'SPACY': [True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, False], 'TAG': ['', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', ''], 'LEMMA': ['', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', ''], 'POS': ['', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', ''], 'MORPH': ['', '', '', '', '', '', '', '', '', '', '', '

In [14]:
# Take a shallow copy so this list keeps its original order and contents even if
# train_data is later shuffled or mutated in place (plain assignment would only alias it).
train_data_copy = list(train_data)

In [15]:
# Cap the training set at 2000 examples; slicing creates a new list object bound to train_data.
train_data = train_data[0:2000]

In [16]:
# Step 6: Training
N_EPOCHS = 10   # number of passes over the training examples
DROPOUT = 0.5   # dropout rate passed to every update

random.seed(1)  # makes the shuffling order below reproducible

# `initialize` is the spaCy v3 replacement for the deprecated `begin_training`;
# it returns the optimizer, which is now passed to `update` explicitly.
optimizer = nlp.initialize()

for epoch in range(N_EPOCHS):
    losses = {}  # reset per epoch so the reported loss is not a running total over all epochs
    random.shuffle(train_data)
    for example in train_data:
        nlp.update([example], sgd=optimizer, drop=DROPOUT, losses=losses)
    print(f"epoch {epoch + 1}/{N_EPOCHS} losses: {losses}")

In [19]:
# Step 7: Validation and Metric Calculation
# TODO: these lists are only initialized here; nothing visible in this notebook fills them
# or computes a metric from them.
true_labels = []
predicted_labels = []

In [20]:
# Step 8: Save the trained model
# Writes the pipeline to the "custom_ner_model" directory.
nlp.to_disk("custom_ner_model")

In [21]:
# Step 9: Inference
# Reload the pipeline from the "custom_ner_model" directory and run it on a sample sentence.
# The sample is lower-case, matching the lower-cased text produced by sentence_entities.
loaded_nlp = spacy.load("custom_ner_model")
text = "thousands of demonstrators have marched through india to protest the war in india and demand the withdrawal of indian troops from that country"
doc = loaded_nlp(text)
# Print every predicted entity span together with its label.
for ent in doc.ents:
    print(f"Entity: {ent.text}, Label: {ent.label_}")

Entity: thousands, Label: O
Entity: of, Label: O
Entity: demonstrators, Label: O
Entity: have, Label: O
Entity: marched, Label: O
Entity: through, Label: O
Entity: india, Label: B-org
Entity: to, Label: O
Entity: protest, Label: O
Entity: the, Label: O
Entity: war, Label: O
Entity: in, Label: O
Entity: india, Label: B-geo
Entity: and, Label: O
Entity: demand, Label: O
Entity: the, Label: O
Entity: withdrawal, Label: O
Entity: of, Label: O
Entity: indian, Label: B-gpe
Entity: troops, Label: O
Entity: from, Label: O
Entity: that, Label: O
Entity: country, Label: O


In [27]:
from spacy import displacy

In [28]:
# Highlight colours for three of the labels; note that "B-geo" is mapped to white (#FFFFFF).
colors = {"B-org": "#F67DE3", "O": "#7DF6D9", "B-geo":"#FFFFFF"}
options = {"colors": colors} 
# Render the entities of `doc` inline in the notebook.
spacy.displacy.render(doc, style="ent", options= options, jupyter=True)