## TP3: POS & NER tagging

In [1]:
import spacy
from collections import Counter
from spacy.training.example import Example

In [2]:
# Start from a blank English pipeline; the custom tagging component below is attached to it
nlp = spacy.blank("en")
# Coarse-grained POS tags that can never label a plural common noun.
# "CONJ" is kept for backward compatibility; "CCONJ" is the tag current
# Universal Dependencies models emit for coordinating conjunctions.
_NON_NOUN_POS = frozenset({
    "PROPN", "VERB", "ADJ", "ADV", "ADP", "AUX", "CONJ", "CCONJ", "DET",
    "INTJ", "NUM", "PART", "PRON", "PUNCT", "SCONJ", "SPACE", "SYM", "X",
})

# Frequent words that end in "s" without being plural nouns.
_S_ENDING_EXCEPTIONS = frozenset({"is", "was", "has", "does", "always"})


def is_plural_noun(token):
    """Heuristically decide whether *token* is a plural common noun.

    Args:
        token: Object exposing ``pos_`` (coarse POS tag, may be empty when no
            tagger has run) and ``text`` (the token string).

    Returns:
        True when the token is not excluded by its POS tag, is longer than one
        character, ends in a lowercase "s", is not one of the known
        exceptions (compared case-insensitively) and is not a possessive
        marker; False otherwise.
    """
    if token.pos_ in _NON_NOUN_POS:
        return False

    text = token.text
    if len(text) <= 1 or not text.endswith("s"):
        return False

    if text.lower() in _S_ENDING_EXCEPTIONS:  # exception
        return False

    # possessive
    return not text.endswith(("'s", "s'"))

# Custom pipeline component: marks plural nouns with the tag "NNS"
@nlp.component("custom_pos_tagger")
def custom_pos_tagger(doc):
    """Set ``tag_`` to "NNS" on every token accepted by ``is_plural_noun``.

    The doc is modified in place and returned, as pipeline components must.
    """
    for plural in filter(is_plural_noun, doc):
        plural.tag_ = "NNS"  # replaces whatever tag the token carried
    return doc

# Register the component; with no position given it goes last in the pipeline
nlp.add_pipe("custom_pos_tagger")


<function __main__.custom_pos_tagger(doc)>

In [3]:
 # Prepare a training dataset with annotated POS tags
# Gold tags use the Penn Treebank tag set, one tag per token
# (possessive markers and the final period are separate tokens).
# Cardinal numbers are "CD" and the plural present-tense "are" is "VBP".
training_data = [
    ("Cats are interesting animals.", {"tags": ["NNS", "VBP", "JJ", "NNS", "."]}),
    ("Dogs and cats are friends.", {"tags": ["NNS", "CC", "NNS", "VBP", "NNS", "."]}),
    ("Cats are from Venus.", {"tags": ["NNS", "VBP", "IN", "NNP", "."]}),
    ("Dogs are from Mars.", {"tags": ["NNS", "VBP", "IN", "NNP", "."]}),
    ("I have three dogs.", {"tags": ["PRP", "VBP", "CD", "NNS", "."]}),
    ("I have five cars.", {"tags": ["PRP", "VBP", "CD", "NNS", "."]}),
    ("I have ten cats and a dog.", {"tags": ["PRP", "VBP", "CD", "NNS", "CC", "DT", "NN", "."]}),
    ("My cat's toys are green.", {"tags": ["PRP$", "NN", "POS", "NNS", "VBP", "JJ", "."]}),
    ("I thought dogs' toys were green.", {"tags": ["PRP", "VBD", "NNS", "POS", "NNS", "VBD", "JJ", "."]}),
]

In [4]:
# Train the model
# `Language.initialize` is the spaCy v3 replacement for the deprecated
# `begin_training`; it returns the optimizer used for the updates below.
optimizer = nlp.initialize()
losses = {}  # filled in by nlp.update, keyed by component name
# NOTE(review): the pipeline built above only contains the rule-based
# "custom_pos_tagger" component, so these updates presumably train nothing;
# add a trainable "tagger" if the tags in training_data should be learned.
for text, annotations in training_data:
    example = Example.from_dict(nlp.make_doc(text), annotations)
    nlp.update([example], sgd=optimizer, drop=0.35, losses=losses)

In [5]:
# Test the custom POS tagger
doc = nlp("The city's parks are always bustling with people and their dogs. Children run and play, while adults enjoy picnics under the trees. The flowers in the gardens bloom in many colors, attracting bees and butterflies. It's a peaceful and lively place for everyone to relax and unwind")

# Keep only the tokens that the custom component tagged as plural nouns
test_results = [(token.text, token.tag_) for token in doc if token.tag_ == "NNS"]
print(test_results)

[('parks', 'NNS'), ('dogs', 'NNS'), ('adults', 'NNS'), ('picnics', 'NNS'), ('trees', 'NNS'), ('flowers', 'NNS'), ('gardens', 'NNS'), ('colors', 'NNS'), ('bees', 'NNS'), ('butterflies', 'NNS')]


In [8]:
from spacy.training.example import offsets_to_biluo_tags
import random
# to ignore warnings
import warnings
from spacy import displacy
# NOTE(review): with no category or module filter this hides every warning for
# the rest of the session, presumably including spaCy's own warnings about the
# training data — consider narrowing the filter.
warnings.filterwarnings("ignore")
# Create a spaCy model with the "ner" component
# (this rebinds `nlp`, replacing the POS pipeline built earlier)
nlp = spacy.blank("en")
ner = nlp.add_pipe("ner")

# Add the new entity label "COLOR" to the NER model
ner.add_label("COLOR")

1

In [9]:

# NER training examples: (text, {"entities": [(start_char, end_char, label)]}).
# Offsets are 0-based character positions with an exclusive end, so that
# text[start_char:end_char] is exactly the colour word.
TRAIN_DATA = [
    ("The sky is blue.", {"entities": [(11, 15, "COLOR")]}),
    ("The ocean is deep blue.", {"entities": [(18, 22, "COLOR")]}),
    ("His favorite color is purple.", {"entities": [(22, 28, "COLOR")]}),
    ("The ripe apple is red.", {"entities": [(18, 21, "COLOR")]}),
    ("The card is red.", {"entities": [(12, 15, "COLOR")]}),
    ("The leaves turned golden.", {"entities": [(18, 24, "COLOR")]}),
    ("The sky at sunset is orange.", {"entities": [(21, 27, "COLOR")]}),
    ("The flag is red, white, and blue.", {"entities": [(12, 15, "COLOR"), (17, 22, "COLOR"), (28, 32, "COLOR")]}),
    ("The walls are green.", {"entities": [(14, 19, "COLOR")]}),
    ("The flowers are pink.", {"entities": [(16, 20, "COLOR")]}),
    ("The book has a brown cover.", {"entities": [(15, 20, "COLOR")]}),
    ("The car is silver.", {"entities": [(11, 17, "COLOR")]}),
    ("The shirt is black.", {"entities": [(13, 18, "COLOR")]}),
    ("The clouds are gray.", {"entities": [(15, 19, "COLOR")]}),
    ("The banana is yellow.", {"entities": [(14, 20, "COLOR")]}),
    ("The sun is yellow.", {"entities": [(11, 17, "COLOR")]}),
    ("The grapes are purple.", {"entities": [(15, 21, "COLOR")]}),
    ("The door is white.", {"entities": [(12, 17, "COLOR")]}),
    ("The sky at dawn is pink.", {"entities": [(19, 23, "COLOR")]}),
    ("The sky at dusk is pink.", {"entities": [(19, 23, "COLOR")]}),
    ("The sky at midnight is black.", {"entities": [(23, 28, "COLOR")]}),
    ("The sky at noon is blue.", {"entities": [(19, 23, "COLOR")]}),
    ("The sky is blue and sometimes has clouds.", {"entities": [(11, 15, "COLOR")]}),
    ("The sky is blue and the sun is bright.", {"entities": [(11, 15, "COLOR")]}),
    ("The sky is pink and clear.", {"entities": [(11, 15, "COLOR")]}),
    ("The sky is blue and cloudless.", {"entities": [(11, 15, "COLOR")]}),
]

In [10]:

# Validate the character offsets before training.
# An entity is usable only when its (start, end) span falls exactly on token
# boundaries; `Doc.char_span` returns None otherwise. Examples containing a
# span that does not line up are reported and left out, rather than being
# passed on with annotations that cannot be matched to tokens.
adjusted_train_data = []
for text, annotations in TRAIN_DATA:
    doc = nlp.make_doc(text)
    spans = annotations.get("entities", [])

    misaligned = [
        (start, end, label)
        for start, end, label in spans
        # Range check first so char_span is never called with offsets outside the text
        if not (0 <= start < end <= len(text))
        or doc.char_span(start, end, label=label) is None
    ]
    if misaligned:
        print(f"Skipping {text!r}: entity offsets {misaligned} do not match token boundaries")
        continue

    adjusted_train_data.append((text, {"entities": list(spans)}))
TRAIN_DATA = adjusted_train_data

# `Language.initialize` is the spaCy v3 replacement for the deprecated
# `begin_training`; it sets up the pipeline's models and returns the optimizer.
optimizer = nlp.initialize()
# Start the training loop
for _ in range(20):  # Number of epochs
    # Shuffle the training data (in place) so each epoch sees a different order
    random.shuffle(TRAIN_DATA)
    for text, annotations in TRAIN_DATA:
        example = Example.from_dict(nlp.make_doc(text), annotations)
        # One example per update, 20% dropout, using the optimizer created above
        nlp.update([example], sgd=optimizer, drop=0.2)

# Save the model
nlp.to_disk("custom_ner_model")

In [11]:
# Load the custom NER model
# (reads the "custom_ner_model" directory and rebinds `nlp` to the loaded pipeline)
nlp = spacy.load("custom_ner_model")

# Test sentences
# NOTE(review): "he sunflower" looks like a typo for "the sunflower" — confirm before changing the input.
test_sentences="The sky is blue, he sunflower is yellow, His favorite color is purple, The leaves turned golden, The walls are green and the flowers are pink"

# Highlight colour used by displaCy for the COLOR label
colors = {"COLOR": "#7DF6D9"}

doc = nlp(test_sentences)
# Character offsets and label of every predicted entity.
# NOTE(review): `entities` is not used again in this cell — confirm whether a later cell needs it.
entities = [{"start": ent.start_char, "end": ent.end_char, "label": ent.label_} for ent in doc.ents]
options = {"colors": colors}
# Render the recognised entities inline in the notebook
displacy.render(doc, style="ent", options=options, jupyter=True)