In [3]:
import spacy
import ast
from spacy.training.example import Example

# load the training dataset
import pandas as pd
# NOTE(review): absolute path on the author's machine -- point this at your own copy of ner.csv.
training_data_file_path = "/Users/shubham/Desktop/ner.csv"

# Read the CSV into a pandas DataFrame; the conversion functions below use its 'Sentence' and 'Tag' columns.
df = pd.read_csv(training_data_file_path)
# From here you can inspect the data, e.g. df.head(), summary statistics, or a specific column.

'''
One thing to note here: the NER dataset that I found is split into the following columns:
    - Sentence #, Sentence, POS (word type description: NOUN, etc.), Tag (IOB named-entity tags such as O or B-per)
But spaCy doesn't understand the data in this format. spaCy expects each training example as a tuple:
    - (string, dict) where dict = {"entities": [(start, end, label), ...]} and start/end are the character
      offsets of the token inside the string
So in order to train the model with spaCy like the paper suggests, we first need to reformat the DataFrame
into this form, which is what convert_to_spacy_Part_One and convert_to_spacy_Part_Two are doing.
'''
def convert_to_spacy_Part_One(df):
    """Convert the NER DataFrame into spaCy-style ``(text, annotations)`` tuples.

    Each row must provide a ``'Sentence'`` string and a ``'Tag'`` value holding
    one IOB tag per whitespace-separated token. The tags may be a list or the
    string representation of a list (which is what a CSV round-trip produces).

    Args:
        df: DataFrame with ``'Sentence'`` and ``'Tag'`` columns.

    Returns:
        A list of ``(sentence, {"entities": [(start, end, tag), ...]})`` tuples.
        ``start``/``end`` are character offsets into ``sentence`` and ``tag`` is
        the raw IOB tag (e.g. ``'B-gpe'``). Every token whose tag is not ``'O'``
        yields its own span; rows with no tagged token are dropped.
    """
    tagged_spacy_data = []
    for _, row in df.iterrows():
        sentence = row['Sentence']
        ner_tags = row['Tag']

        # Tags read from a CSV arrive as the string form of a list; parse them.
        # Values that are already sequences are used as they are.
        if isinstance(ner_tags, str):
            ner_tags = ast.literal_eval(ner_tags)

        entities = []
        cursor = 0
        # zip() stops at the shorter sequence, so tokens without a tag are ignored.
        for word, tag in zip(sentence.split(), ner_tags):
            # Locate the token in the sentence itself instead of assuming a
            # single space between tokens; this keeps offsets correct when
            # the text has leading or repeated whitespace.
            start = sentence.index(word, cursor)
            end = start + len(word)
            cursor = end

            if tag != 'O':
                entities.append((start, end, tag))

        if entities:
            tagged_spacy_data.append((sentence, {"entities": entities}))

    return tagged_spacy_data

'''
So basically convert_to_spacy_Part_One gives you an output like this:
    ('The German firm works as a sub-contractor for Shell .', {'entities': [(4, 10, 'B-gpe'), (46, 51, 'B-org')]})
When we train a spaCy model we need the labels to look like this:
    ('The German firm works as a sub-contractor for Shell .', {'entities': [(4, 10, 'GPE'), (46, 51, 'ORG')]})
So the function convert_to_spacy_Part_Two drops the two-character IOB prefix (label[2:]) and upper-cases what is left.

'''
def convert_to_spacy_Part_Two(spacy_data):
    """Turn IOB-prefixed entity tags into plain upper-case labels.

    Args:
        spacy_data: Iterable of ``(sentence, {"entities": [(start, end, tag), ...]})``
            tuples, where ``tag`` looks like ``'B-gpe'`` or ``'I-per'``.

    Returns:
        A new list with the same sentences and offsets, where each tag has had
        its ``X-`` prefix removed and been upper-cased (``'B-gpe'`` -> ``'GPE'``).
        Tags without such a prefix are only upper-cased. The input is not
        modified.
    """
    spacy_formatted_data = []

    for sentence, entities_dict in spacy_data:
        modified_entities = []

        for start, end, label in entities_dict["entities"]:
            # Strip the prefix only when it is really there ('B-geo' -> 'geo');
            # a blind [2:] slice would mangle an unprefixed label such as 'ORG'.
            if label[1:2] == "-":
                label = label[2:]
            modified_entities.append((start, end, label.upper()))

        spacy_formatted_data.append((sentence, {"entities": modified_entities}))

    return spacy_formatted_data


# Convert the DataFrame to spaCy format; the labels still carry their IOB prefixes (e.g. 'B-gpe')
spacy_training_data = convert_to_spacy_Part_One(df)

# Strip the IOB prefixes and upper-case the labels. spacy_formatted_data is now a list of
# (sentence, {"entities": [...]}) tuples, ready to be used as training data.
spacy_formatted_data = convert_to_spacy_Part_Two(spacy_training_data)

# Print the first converted example as a quick sanity check
print(spacy_formatted_data[0])

('Thousands of demonstrators have marched through London to protest the war in Iraq and demand the withdrawal of British troops from that country .', {'entities': [(48, 54, 'GEO'), (77, 81, 'GEO'), (111, 118, 'GPE')]})
