In [1]:
import spacy
import ast
from spacy.training.example import Example

# load the training dataset
import pandas as pd
training_data_file_path = "/Users/shubham/Desktop/ner.csv"

# read csv into pandas DataFrame
df = pd.read_csv(training_data_file_path)
# Grab any basic information df.head() or: Summary statistics, Access a specific column, etc...

'''
One thing to note here: is that this NER dataset that I found is split into the following columns:
    - Sentence #, Sentence, POS(Word type description, NOUN, etc..), Tag(O-per with IOB-named entity)
But spaCy doesn't understand the data in this format. spaCy understands data in the format of a tuple
    - (string, dict) where dict = {entities : [tuples of (start, end, entity)]}
So in order to train the model with spaCy like the paper suggests we first need to reformat the dataFrame to match
into this form which is what taggedFormat(df) is doing
'''
def taggedFormat(df):
    tagged_spacy_data = []
    for index, row in df.iterrows():
        sentence = row['Sentence']
        ner_tags = row['Tag']

        # right now here ner_tags is of type string, so in order to get the elements lets convert to a list
        ner_tags = ast.literal_eval(ner_tags)

        entities = []
        start = 0
        end = 0
        for tagIndex, word in enumerate(sentence.split()):
            end = start + len(word)

            if tagIndex >= len(ner_tags): 
                break
            if(ner_tags[tagIndex] != 'O'):
                entities.append((start, end, ner_tags[tagIndex]))

            start = end + 1
        
        if len(entities) > 0:
            tagged_spacy_data.append((sentence, {"entities": entities}))

    return tagged_spacy_data

'''
1st Function(taggedFormat(df)): gives you a line of output like this, ignoring the sentence part in the beginning:
    - ({'entities': [(0, 4, 'B-gpe'), (12, 21, 'B-per'), (22, 29, 'I-per'), (30, 41, 'I-per'), (47, 54, 'B-tim'), (60, 68, 'B-gpe'), (100, 104, 'B-gpe'), (158, 165, 'B-gpe')]})

2nd Function(combineTaggedData(tagged_spacy_data)): looking at the above example notice how (12, 41) are all B,I,I-per, they are talking about the same named entity PER, so we need to combine these somehow to turn it into this
    - ({'entities': [(0, 4, 'B-gpe'), (12, 41, 'B-per???'), (47, 54, 'B-tim'), (60, 68, 'B-gpe'), (100, 104, 'B-gpe'), (158, 165, 'B-gpe')]})

3rd Function. But when we make our training data we dont include the tags: 'B-', 'I-', etc.. and we also capitalize the 'per' -> 'PER' to follow a standard so we need to do some processing of this formatted data
and put in this form(using .upper() and [2:] we can do this):
    - {'entities': [(0, 4, 'GPE'), (12, 41, 'PER'), (47, 54, 'TIM'), (60, 68, 'GPE'), (100, 104, 'GPE'), (158, 165, 'GPE')]}

This is one of the main techniques that the research paper proposes in the methodology section and how to extract sentence data to create a proper training data to build our model.
Obviously in the paper they go directly from HTML to sentence parsing my usecase is a bit different since I am using a dataset from kaggle but the technique here is still the same.
'''

def combineTaggedData(tagged_spacy_data):
    to_combine = []

    for sentence, entities_dict in tagged_spacy_data:
        entities_list = entities_dict["entities"]

        i = 0
        j = 1
        currArr = []
        found = False
        while j < len(entities_list):
            start1, end1, entity_type1 = entities_list[i]
            start2, end2, entity_type2 = entities_list[j]

            splitI = entity_type1.split('-')
            splitJ = entity_type2.split('-')

            if splitI[0] == 'B' and splitJ[0] == 'I':
                if j == len(entities_list) - 1:
                    currArr.append((start1, end2, entity_type1))
                    found = True
                j += 1
            else:
                _, endPrev, _ = entities_list[j-1]
                currArr.append((start1, endPrev, entity_type1))
                i = j
                j += 1
        
        if i == len(entities_list) - 1:
            currArr.extend(entities_list[i:])

        to_combine.append((sentence, {"entities" : currArr}))
    
    return to_combine

def convertSpacyFormat(to_combine):
    spacy_formatted_data = []

    for sentence, entities_dict in to_combine:

        entities_list = entities_dict["entities"]
        modified_entities = []

        for start, end, type in entities_list:
            currType = type
            formattedType = currType[2:].upper() # take 'B-Geo' -> 'GEO'
            modified_entities.append((start, end, formattedType))
        
        spacy_formatted_data.append((sentence, {"entities" : modified_entities}))

    return spacy_formatted_data


# Convert the DataFrame to spaCy format but still the tags are in there
spacy_training_data = taggedFormat(df)
# print(spacy_training_data[45][1])

# remove the tags and now we are officially ready to train our model, spacy_formatted_data is a list of training data
combined_spacy_data = combineTaggedData(spacy_training_data)
# print(combined_spacy_data[45])
# print(f"{combined_spacy_data[10][0][12:21]}, {combined_spacy_data[10][0][22:29]}, {combined_spacy_data[10][0][30:41]}")

spacy_formatted_data = convertSpacyFormat(combined_spacy_data)
# print(spacy_training_data[45][1])
# print(combined_spacy_data[45])

# print(spacy_training_data[6][1])
# print(combined_spacy_data[6])

# print(spacy_training_data[13][1])
# print(combined_spacy_data[13])

# print(spacy_training_data[1381][1])
# print(combined_spacy_data[1381])

# for rows in spacy_formatted_data:
#     print(f"{rows}")

# TESTING: make sure I get the right entities, types, etc...
# for i in range(len(spacy_data)):
#   print(spacy_data[i])
# print(type(spacy_formatted_data))
# for i in range(len(spacy_formatted_data)):
#         print(spacy_formatted_data[i])
# print(spacy_training_data[0])
# print(spacy_training_data[0][0][48:54], spacy_training_data[0][0][77:81], spacy_training_data[0][0][111:118])

In [130]:
def convert_to_spacy(df):
    spacy_data = []
    # for index, row in df.iterrows():
    #     sentence = row['Sentence']
    #     ner_tags = row['Tag']

    #     # print(ner_tags)
    #     # convert string -> list
    #     ner_tags = ast.literal_eval(ner_tags)

    #     # print(index, sentence)

    sentence = df['Sentence'][47591]
    print(sentence)

    totalWords = sentence.split()
    print(f"Words: {totalWords}")
    print(f"Word Count: {len(totalWords)}")


    ner_tags = df['Tag'][47591]
    ner_tags = ast.literal_eval(ner_tags)
    print(len(ner_tags))

    entities = []
    start = 0
    end = 0
    for tagIndex, word in enumerate(sentence.split()):
        end = start + len(word)
        if tagIndex >= len(ner_tags):
            break
        if(ner_tags[tagIndex] != 'O'):
            print(f"({start}, {end}, {ner_tags[tagIndex]} | {tagIndex}")
            entities.append((start, end, ner_tags[tagIndex]))

        start = end + 1
        

        # entities.append((start, end, ner_tags[tagIndex])) if(ner_tags[tagIndex] != 'O')
        

    return spacy_data

# Convert the DataFrame to spaCy format
spacy_training_data = convert_to_spacy(df)

U.S. weather forecasters say Hurricane Wilma has strengthened to a powerful category 5 storm and a key low-pressure measurement indicates it is the most powerful storm of the year .
Words: ['U.S.', 'weather', 'forecasters', 'say', 'Hurricane', 'Wilma', 'has', 'strengthened', 'to', 'a', 'powerful', 'category', '5', 'storm', 'and', 'a', 'key', 'low-pressure', 'measurement', 'indicates', 'it', 'is', 'the', 'most', 'powerful', 'storm', 'of', 'the', 'year', '.']
Word Count: 30
29
(0, 4, B-geo | 0


In [6]:
from spacy.training.example import Example
import random
import spacy
from spacy import displacy

sample_txt = "Albert Einstein was born on March 14, 1879, in Ulm, in the Kingdom of Württemberg in the German Empire. He made significant contributions to the field of theoretical physics, especially in the development of the theory of relativity. Einstein received the Nobel Prize in Physics in 1921 for his explanation of the photoelectric effect. He later moved to the United States, where he continued his scientific work and became a prominent figure in academia."

nlp = spacy.blank("en")
ner = nlp.add_pipe("ner")

# midpoint = len(spacy_formatted_data) // 2
# dataset1 = spacy_formatted_data[:midpoint]
# dataset2 = spacy_formatted_data[midpoint:]

collected_training_data = []
for text, entity_map_item in spacy_formatted_data:
    converted_data = Example.from_dict(nlp.make_doc(text), entity_map_item)
    collected_training_data.append(converted_data)

nlp.begin_training()
for i in range(30):
    random.shuffle(collected_training_data)
    for data in collected_training_data:
        nlp.update([data], drop=0.4)

nlp.to_disk("custom_ner_model")

# Step 4: Test the Model
custom_ner_modl = spacy.load("custom_ner_model")
doc = custom_ner_modl(sample_txt)

# Render the visualization with custom colors
displacy.render(doc, style='ent')

In [38]:
import spacy
from collections import defaultdict

texts = ["John works at Microsoft."]

# Number of alternate analyses to consider. More is slower, and not necessarily better -- you need to experiment on your problem.
beam_width = 16

# This clips solutions at each step. 
# We multiply the score of the top-ranked action by this value, and use the result as a threshold.
# This prevents the parser from exploring options that look very unlikely, saving a bit of efficiency. 
# Accuracy may also improve, because we've trained on greedy objective.
beam_density = 0.0001 
nlp = spacy.load('en_core_web_sm')

docs = list(nlp.pipe(texts))
beams = nlp.get_pipe('ner').beam_parse(docs, beam_width=beam_width, beam_density=beam_density)

# Calculate entity scores
entity_scores = defaultdict(float)
for doc, beam in zip(docs, beams):
    for score, ents in nlp.get_pipe('ner').moves.get_beam_parses(beam):
        total_score = score
        print(ents)
        for start, end, label in ents:
            entity_scores[(start, end, label)] += score / total_score

# # Print entity scores
# for key, score in entity_scores.items():
#     print(f'Entity: {key}, Score: {score}')

# # # Calculate the total sum of probabilities
# total_prob_sum = sum(entity_scores.values())
# print(total_prob_sum)

# # # Normalize entity scores
# normalized_entity_scores = {key: prob / total_prob_sum for key, prob in entity_scores.items()}
# print(normalized_entity_scores)

# # Convert to a list of dictionaries
# entity_probabilities = [{'start': start, 'end': end, 'label': label, 'prob': prob}
#                         for (start, end, label), prob in normalized_entity_scores.items()]

# # Sort and print the normalized results
# for entity in sorted(entity_probabilities, key=lambda x: x['start']):
#     print(entity)


[(0, 1, 'PERSON'), (3, 4, 'ORG')]


In [26]:
from spacy.training.example import Example
import random
import spacy
from spacy import displacy

texts = 'John works at Microsoft.'
custom_ner_modl = spacy.load("en_core_web_sm")
doc = custom_ner_modl(texts)
displacy.render(doc, style='ent')



# sample_txt = "Albert Einstein was born on March 14, 1879, in Ulm, in the Kingdom of Württemberg in the German Empire. He made significant contributions to the field of theoretical physics, especially in the development of the theory of relativity. Einstein received the Nobel Prize in Physics in 1921 for his explanation of the photoelectric effect. He later moved to the United States, where he continued his scientific work and became a prominent figure in academia."


# custom_ner_modl = spacy.load("custom_ner_model")
# doc = custom_ner_modl(sample_txt)

# # Render the visualization with custom colors
# displacy.render(doc, style='ent')

In [47]:
import spacy
from collections import defaultdict

text = "John works at Microsoft."

# Number of alternate analyses to consider. More is slower, and not necessarily better -- you need to experiment on your problem.
beam_width = 16

# This clips solutions at each step.
# We multiply the score of the top-ranked action by this value, and use the result as a threshold.
# This prevents the parser from exploring options that look very unlikely, saving a bit of efficiency.
# Accuracy may also improve, because we've trained on greedy objective.
beam_density = 0.0001
nlp = spacy.load('en_core_web_sm')

doc = nlp(text)
beams = nlp.get_pipe('ner').beam_parse([doc], beam_width=beam_width, beam_density=beam_density)

# Calculate entity scores
entity_scores = defaultdict(float)

for beam in beams:
    for score, ents in nlp.get_pipe('ner').moves.get_beam_parses(beam):
        total_score = score
        print(ents)
        for ents in doc.ents:
            entity_scores[(ents.start_char, ents.end_char, label)] += score / total_score

# # Print entity scores
# for (start, end, label), score in entity_scores.items():
#     print(f'Entity: {label}, Start Char: {start}, End Char: {end}, Score: {score}')

# Print entity scores
for key, score in entity_scores.items():
    print(f'Entity: {key}, Score: {score}')

# # Calculate the total sum of probabilities
total_prob_sum = sum(entity_scores.values())
print(total_prob_sum)

# # Normalize entity scores
normalized_entity_scores = {key: prob / total_prob_sum for key, prob in entity_scores.items()}
print(normalized_entity_scores)

# # Convert to a list of dictionaries
# entity_probabilities = [{'start': start, 'end': end, 'label': label, 'prob': prob}
#                         for (start, end, label), prob in normalized_entity_scores.items()]

# # Sort and print the normalized results
# for entity in sorted(entity_probabilities, key=lambda x: x['start']):
#     print(entity)


[(0, 1, 'PERSON'), (3, 4, 'ORG')]
Entity: (0, 4, 'ORG'), Score: 1.0
Entity: (14, 23, 'ORG'), Score: 1.0
2.0
{(0, 4, 'ORG'): 0.5, (14, 23, 'ORG'): 0.5}


In [94]:
import spacy
from collections import defaultdict

text = "Shubham is a really cool guy who loves to eat apples and he likes to work at Google. Shubham is a really cool guy who loves to eat apples and he likes to work at Google. Shubham is a really cool guy who loves to eat apples and he likes to work at Google. Shubham is a really cool guy who loves to eat apples and he likes to work at Google."
beam_width = 16
beam_density = 0.0001

custom_ner_model_path = "/Users/shubham/ForwardDataLabTask1/task2_my_custom_ner_modelV2"
nlp = spacy.load(custom_ner_model_path)

doc = nlp(text)
beams = nlp.get_pipe('ner').beam_parse([doc], beam_width=beam_width, beam_density=beam_density)

# Calculate entity scores
entityScore = []
currEntityScore = []

for beam in beams:
    for score, ents in nlp.get_pipe('ner').moves.get_beam_parses(beam):
        for ent in doc.ents:
            ent_info = {'label' : ent.label_, 'start' : ent.start_char, 'end' : ent.end_char, 'score': score}
            currEntityScore.append(ent_info)

entityScore.append(currEntityScore)
print(entityScore)

# print(entityScore[0]['entities'])

# # Print entity scores
# for key, score in entity_scores.items():
#     print(f'Entity: {key}, Score: {score}')

# print(entities)

# # # Calculate the total sum of probabilities
# total_prob_sum = sum(entity_scores.values())
# print(total_prob_sum)

# # # Normalize entity scores
# normalized_entity_scores = {key: prob / total_prob_sum for key, prob in entity_scores.items()}
# print(normalized_entity_scores)



[[{'label': 'PER', 'start': 0, 'end': 7, 'score': 1.0}, {'label': 'ORG', 'start': 77, 'end': 83, 'score': 1.0}, {'label': 'GEO', 'start': 85, 'end': 92, 'score': 1.0}, {'label': 'ORG', 'start': 162, 'end': 168, 'score': 1.0}, {'label': 'GEO', 'start': 170, 'end': 177, 'score': 1.0}, {'label': 'ORG', 'start': 247, 'end': 253, 'score': 1.0}, {'label': 'GEO', 'start': 255, 'end': 262, 'score': 1.0}, {'label': 'ORG', 'start': 332, 'end': 338, 'score': 1.0}]]


In [97]:
import spacy
from spacy.training import Example

# Load the spaCy model
nlp = spacy.load("en_core_web_sm")

# Example training data
train_data = [
    ("Apple is a company.", {'entities': [(0, 5, 'ORG')]})
]

# Training loop
for epoch in range(10):  # Adjust the number of epochs as needed
    for text, annotations in train_data:
        doc = nlp.make_doc(text)
        gold = Example(doc, **annotations)
        loss, scores = nlp.update([gold.doc], [gold], drop=0.5)

        # Access scores for each token
        for token, score_dict in zip(doc, scores):
            print(f"Token: {token.text}, Entity Scores: {score_dict}")

# Now you can access the entity scores during training for each token


TypeError: __init__() takes exactly 2 positional arguments (1 given)

In [92]:
from spacy.training.example import Example
import random
import spacy
from spacy import displacy

text = "Shubham is a really cool guy who loves to eat apples and he likes to work at Google. Shubham is a really cool guy who loves to eat apples and he likes to work at Google. Shubham is a really cool guy who loves to eat apples and he likes to work at Google. Shubham is a really cool guy who loves to eat apples and he likes to work at Google."
custom_ner_model_path = "/Users/shubham/ForwardDataLabTask1/task2_my_custom_ner_modelV2"
nlp = spacy.load(custom_ner_model_path)
doc = nlp(text)
displacy.render(doc, style='ent')
