In [None]:
# 1. Use the standard spaCy pipeline and show NER working (video).
# 2. Modify #1: train spaCy to support a new entity called "fruit" and look at the results.
# 3. Set up spaCy with a CRF model and show NER working (same as #1, just with the CRF model).
# 4. Modify #3: train spaCy with the CRF model to support the new "fruit" entity.
# 5. The paper says NER with a CRF is better than NER without one, so #4 should come out better than #2.

#6. ["text": "thththththththth "]

In [45]:
# Demo text for the NER cells; it uses "Apple" as a company name and "Apples" as a food.
sample_txt = "Apple is a huge software company that has an average stock price of 200 dollars that was hit this past November. Oh and a fun fact about Apple, CEO Steve Job's favorite food is also Apples"

In [47]:
# Step 1: run the stock "en_core_web_sm" pipeline on the sample text and display its entities.
import spacy
from spacy import displacy

# Load the packaged pipeline and process the sample text into a Doc.
nlp = spacy.load("en_core_web_sm")
doc = nlp(sample_txt)

# Render the Doc's named entities with displaCy's 'ent' style.
displacy.render(doc, style='ent')


In [133]:
from spacy.training.example import Example
import random

# Train a blank English pipeline to recognise a custom fruit entity.
# One constant keeps the label identical in the NER component, the training
# annotations and the displaCy options. Model labels are case-sensitive, so
# mixing "Fruit" and "fruit" would create two unrelated labels.
FRUIT_LABEL = "Fruit"

nlp = spacy.blank("en")
ner = nlp.add_pipe("ner")
ner.add_label(FRUIT_LABEL)

# Entity annotations are character offsets: (start, end_exclusive, label).
training_data = [
    ("I love apples and bananas.", {"entities": [(7, 13, FRUIT_LABEL), (18, 25, FRUIT_LABEL)]}),
    ("Oranges are a great source of vitamin C.", {"entities": [(0, 7, FRUIT_LABEL)]}),
]

# Wrap every (text, annotations) pair in the Example object that nlp.update expects.
collected_training_data = [
    Example.from_dict(nlp.make_doc(text), entity_map_item)
    for text, entity_map_item in training_data
]

# `begin_training` is deprecated in spaCy v3. `initialize` is its replacement;
# passing the examples lets the pipeline set itself up from the real data,
# and the returned optimizer is reused for every update below.
optimizer = nlp.initialize(lambda: collected_training_data)
for i in range(100):
    random.shuffle(collected_training_data)
    for data in collected_training_data:
        nlp.update([data], drop=0.5, sgd=optimizer)

nlp.to_disk("custom_ner_model")

# Test the model: reload it from disk and run it on the sample text.
custom_ner_modl = spacy.load("custom_ner_model")
doc = custom_ner_modl(sample_txt)

# Only show the custom entity, in a solid colour.
style = {
    "ents": [FRUIT_LABEL],
    "colors": {FRUIT_LABEL: "green"},
}

# Render the visualization with the custom colours.
displacy.render(doc, style='ent', options=style)




In [156]:
import spacy
import ast
from spacy.training.example import Example

# load the training dataset
import pandas as pd
# Location of the NER training CSV. It can be overridden with the NER_CSV_PATH
# environment variable so the notebook is not tied to one machine; the original
# location stays the default.
import os

training_data_file_path = os.environ.get("NER_CSV_PATH", "/Users/shubham/Desktop/ner.csv")

# Read the CSV into a pandas DataFrame.
df = pd.read_csv(training_data_file_path)
# Useful quick checks at this point: df.head(), summary statistics, a look at a specific column, etc.

'''
One thing to note: the NER dataset used here is split into the following columns:
    - Sentence #, Sentence, POS (part-of-speech tag per word: NOUN, etc.), Tag (IOB named-entity tag per word, e.g. 'O' or 'B-gpe')
spaCy does not understand data in this format; it expects each training item as a tuple
    - (string, dict) where dict = {"entities": [(start, end, tag), ...]} and start/end are the character offsets of the tagged token
So, in order to train the model with spaCy as the paper suggests, we first need to reformat the DataFrame
into this form, which is what convert_to_spacy_Part_One does.
'''
def convert_to_spacy_Part_One(df):
    """Turn a token-tagged DataFrame into spaCy-style ``(text, annotations)`` tuples.

    Each row must provide a ``Sentence`` string and a ``Tag`` value holding one
    tag per whitespace-separated token, either as a list or as the string
    representation of a list (e.g. ``"['O', 'B-gpe']"``).

    Args:
        df: pandas DataFrame with ``Sentence`` and ``Tag`` columns.

    Returns:
        A list of ``(sentence, {"entities": [(start, end, tag), ...]})`` tuples.
        ``start``/``end`` are character offsets into ``sentence`` (end exclusive)
        and ``tag`` is the raw tag from the row. Tokens tagged ``'O'`` are
        skipped, and sentences without any tagged token are left out.

    Raises:
        ValueError, SyntaxError: if a ``Tag`` string is not a valid Python literal.
    """
    tagged_spacy_data = []
    for sentence, ner_tags in zip(df['Sentence'], df['Tag']):
        # Tags read from CSV arrive as the string form of a list; parse them.
        # Already-parsed sequences are used as they are.
        if isinstance(ner_tags, str):
            ner_tags = ast.literal_eval(ner_tags)

        entities = []
        cursor = 0
        # zip() stops at the shorter sequence, so tokens without a tag are ignored.
        for word, tag in zip(sentence.split(), ner_tags):
            # Locate the token in the text instead of assuming a single space
            # between tokens: split() collapses any whitespace run, so counting
            # "+ 1" per token drifts on double spaces or leading whitespace.
            start = sentence.find(word, cursor)
            end = start + len(word)
            cursor = end

            if tag != 'O':
                entities.append((start, end, tag))

        if entities:
            tagged_spacy_data.append((sentence, {"entities": entities}))

    return tagged_spacy_data

'''
convert_to_spacy_Part_One gives output like this:
    ('The German firm works as a sub-contractor for Shell .', {'entities': [(4, 10, 'B-gpe'), (46, 51, 'B-org')]})
To train a spaCy model we need it in this form:
    ('The German firm works as a sub-contractor for Shell .', {'entities': [(4, 10, 'GPE'), (46, 51, 'ORG')]})
So convert_to_spacy_Part_Two drops the first two characters of each tag (the 'B-' / 'I-' prefix) and upper-cases the rest, i.e. tag[2:].upper().
'''
def convert_to_spacy_Part_Two(spacy_data):
    """Convert per-token IOB tags into upper-case spaCy entity labels.

    Args:
        spacy_data: list of ``(sentence, {"entities": [(start, end, tag), ...]})``
            tuples whose tags look like ``'B-gpe'`` or ``'I-per'``.

    Returns:
        A new list with the same sentences and
        ``{"entities": [(start, end, LABEL), ...]}`` dicts, where:

        * the ``B-`` / ``I-`` prefix is removed and the label upper-cased
          (``'B-gpe'`` -> ``'GPE'``); a tag without such a prefix is only
          upper-cased instead of losing its first two characters;
        * an ``I-`` token that directly follows (only whitespace in between)
          an entity with the same label is merged into that entity, so a
          multi-token name becomes one span rather than several one-token ones.

        The input list and its dicts are not modified.
    """
    spacy_formatted_data = []

    for sentence, entities_dict in spacy_data:
        modified_entities = []

        for start, end, tag in entities_dict["entities"]:
            prefix = tag[:2].upper()
            has_iob_prefix = prefix in ("B-", "I-")
            label = (tag[2:] if has_iob_prefix else tag).upper()  # 'B-geo' -> 'GEO'

            if prefix == "I-" and modified_entities:
                prev_start, prev_end, prev_label = modified_entities[-1]
                gap_is_blank = prev_end <= start and not sentence[prev_end:start].strip()
                if prev_label == label and gap_is_blank:
                    # Continuation of the previous entity: extend its span.
                    modified_entities[-1] = (prev_start, end, label)
                    continue

            modified_entities.append((start, end, label))

        spacy_formatted_data.append((sentence, {"entities": modified_entities}))

    return spacy_formatted_data


# First pass: (sentence, {"entities": [...]}) tuples with per-token character spans that still carry the raw tags (e.g. 'B-gpe')
spacy_training_data = convert_to_spacy_Part_One(df)

# Second pass: rewrite each raw tag as an upper-case label without its IOB prefix; spacy_formatted_data is the list used for training
spacy_formatted_data = convert_to_spacy_Part_Two(spacy_training_data)


# TESTING: random testing statements I used to make sure I get the right entities, types, etc...
# print(f"({start}, {end}, {ner_tags[tagIndex]} | {tagIndex}")
# for i in range(len(spacy_data)):
#   print(spacy_data[i])
# print(type(spacy_formatted_data))
# for i in range(len(spacy_formatted_data)):
#         print(spacy_formatted_data[i])
# print(spacy_training_data[0])
# print(spacy_training_data[0][0][48:54], spacy_training_data[0][0][77:81], spacy_training_data[0][0][111:118])

<class 'list'>


In [157]:
from spacy.training.example import Example
import random

# Text used to try out the model once training has finished.
sample_txt = "Albert Einstein was born on March 14, 1879, in Ulm, in the Kingdom of Württemberg in the German Empire. He made significant contributions to the field of theoretical physics, especially in the development of the theory of relativity. Einstein received the Nobel Prize in Physics in 1921 for his explanation of the photoelectric effect. He later moved to the United States, where he continued his scientific work and became a prominent figure in academia."

# Training hyper-parameters, kept together so they are easy to tune.
N_EPOCHS = 100
BATCH_SIZE = 32
DROPOUT = 0.5

nlp = spacy.blank("en")
ner = nlp.add_pipe("ner")

# Wrap every (text, annotations) pair in the Example object that nlp.update expects.
collected_training_data = [
    Example.from_dict(nlp.make_doc(text), entity_map_item)
    for text, entity_map_item in spacy_formatted_data
]

# `begin_training` is deprecated in spaCy v3. `initialize` is its replacement;
# passing the examples lets the NER component pick up the labels present in
# the data (none are added by hand here), and the returned optimizer is
# reused for every update below.
optimizer = nlp.initialize(lambda: collected_training_data)
for i in range(N_EPOCHS):
    random.shuffle(collected_training_data)
    # Update on minibatches rather than one example at a time: far fewer
    # optimizer steps per epoch, which keeps a full pass over the dataset feasible.
    for batch in spacy.util.minibatch(collected_training_data, size=BATCH_SIZE):
        nlp.update(batch, drop=DROPOUT, sgd=optimizer)

nlp.to_disk("custom_ner_model")

# Test the model: reload it from disk and run it on the sample text.
custom_ner_modl = spacy.load("custom_ner_model")
doc = custom_ner_modl(sample_txt)

# Render the recognised entities.
displacy.render(doc, style='ent')

KeyboardInterrupt: 