In [None]:
import json
import random
import logging
from sklearn.metrics import precision_recall_fscore_support, accuracy_score
from spacy.gold import GoldParse
from spacy.scorer import Scorer
import spacy

In [None]:
# Function to convert Dataturks JSON format to Spacy training data format
def convert_dataturks_to_spacy(dataturks_JSON_FilePath):
    """Convert a Dataturks JSON-lines export into spaCy NER training tuples.

    Every non-blank line of the file is parsed as one JSON object that carries
    the document text under ``'content'`` and its annotations under
    ``'annotation'``. For each annotation the first entry of ``'points'``
    supplies the character offsets, and one entity is produced per label
    (``'label'`` may be a single string or a list of strings).

    Args:
        dataturks_JSON_FilePath: Path of the file to read, one JSON object per
            line.

    Returns:
        A list of ``(text, {"entities": [(start, end, label), ...]})`` tuples,
        or ``None`` when the file cannot be read or parsed (the error is
        logged with its traceback).
    """
    try:
        training_data = []
        # Read as UTF-8 explicitly so the result does not depend on the
        # platform's default encoding.
        with open(dataturks_JSON_FilePath, 'r', encoding='utf-8') as f:
            for line in f:
                # A blank or trailing empty line is not a record; skipping it
                # keeps one stray newline from invalidating the whole file.
                if not line.strip():
                    continue

                data = json.loads(line)
                text = data['content']
                entities = []
                # 'annotation' can be null for documents nobody labelled;
                # treat that as "no entities" rather than failing.
                for annotation in data['annotation'] or []:
                    point = annotation['points'][0]
                    labels = annotation['label']
                    if not isinstance(labels, list):
                        labels = [labels]

                    for label in labels:
                        # End offset is shifted by one; presumably the export
                        # uses inclusive ends while spaCy expects exclusive.
                        entities.append((point['start'], point['end'] + 1, label))

                training_data.append((text, {"entities": entities}))

        return training_data
    except Exception as e:
        # str() keeps the message building from failing on path-like objects.
        logging.exception("Unable to process " + str(dataturks_JSON_FilePath) + "\n" + "error = " + str(e))
        return None

In [None]:
# Function to train Spacy NER model
def train_spacy(train_path="/Users/shivangidhiman/Documents/traindata.json",
                test_path="/Users/shivangidhiman/Documents/testdata.json",
                n_iter=10,
                drop=0.2,
                seed=None):
    """Train a blank English NER model and report per-entity test metrics.

    The training file is converted with ``convert_dataturks_to_spacy``, every
    label found in it is registered on the NER pipe, and the model is updated
    one example at a time for ``n_iter`` shuffled passes. Each test document is
    then run through the model: its predicted entities are written to
    ``resume<index>.txt`` in the current directory, and token-level precision,
    recall, F-score and accuracy are computed per predicted label and averaged
    over all test documents in which that label was predicted.

    Args:
        train_path: Dataturks JSON-lines file used for training.
        test_path: Dataturks JSON-lines file used for evaluation.
        n_iter: Number of passes over the training data.
        drop: Dropout rate passed to ``nlp.update``.
        seed: Optional seed for the ``random`` module; it only fixes the
            shuffle order of the training examples.

    Raises:
        ValueError: If either data file could not be converted.
    """
    train_data = _load_examples_or_raise(train_path)
    if seed is not None:
        random.seed(seed)

    # Create a blank Spacy model and add NER pipeline
    nlp = spacy.blank('en')
    if 'ner' not in nlp.pipe_names:
        ner = nlp.create_pipe('ner')
        nlp.add_pipe(ner, last=True)
    else:
        # Keep `ner` bound even when the pipeline already provides the pipe.
        ner = nlp.get_pipe('ner')

    # Add labels to the NER model
    for _, annotations in train_data:
        for ent in annotations.get('entities'):
            ner.add_label(ent[2])

    # Disable other pipelines and start training the NER model
    other_pipes = [pipe for pipe in nlp.pipe_names if pipe != 'ner']
    with nlp.disable_pipes(*other_pipes):
        optimizer = nlp.begin_training()
        for itn in range(n_iter):
            print("Starting iteration " + str(itn))
            random.shuffle(train_data)
            losses = {}
            for text, annotations in train_data:
                nlp.update(
                    [text],
                    [annotations],
                    drop=drop,
                    sgd=optimizer,
                    losses=losses)
            print(losses)

    # Load test data and evaluate the model's performance
    examples = _load_examples_or_raise(test_path)
    # label -> [precision_sum, recall_sum, fscore_sum, accuracy_sum, n_docs].
    # Kept separate from the per-document entity listing and never reset, so
    # the printed figures are averages over the whole test set.
    totals = {}
    for index, (text, annot) in enumerate(examples):
        doc_to_test = nlp(text)
        _write_predicted_entities("resume" + str(index) + ".txt", doc_to_test)
        _accumulate_label_scores(totals, nlp, doc_to_test, text, annot)

    # Print evaluation metrics for each entity label
    for label, (p_sum, r_sum, f_sum, a_sum, n_docs) in totals.items():
        print("\nFor Entity " + label + "\n")
        print("Accuracy : " + str((a_sum / n_docs) * 100) + "%")
        print("Precision : " + str(p_sum / n_docs))
        print("Recall : " + str(r_sum / n_docs))
        print("F-score : " + str(f_sum / n_docs))


def _load_examples_or_raise(path):
    """Convert the file at ``path``, raising ValueError if conversion failed."""
    examples = convert_dataturks_to_spacy(path)
    if examples is None:
        raise ValueError("Could not load annotated examples from " + str(path))
    return examples


def _write_predicted_entities(out_path, doc):
    """Write the distinct predicted entity texts of ``doc``, grouped by label."""
    texts_by_label = {}
    for ent in doc.ents:
        texts_by_label.setdefault(ent.label_, []).append(ent.text)

    # The context manager closes the file even if writing fails part-way.
    with open(out_path, "w", encoding="utf-8") as out:
        for label, texts in texts_by_label.items():
            out.write("\n\n")
            out.write(label + ":" + "\n")
            # dict.fromkeys drops duplicates while keeping first-seen order,
            # which makes the output file reproducible between runs.
            for entity_text in dict.fromkeys(texts):
                out.write(entity_text.replace('\n', '') + "\n")


def _gold_tag_label(tag):
    """Return the entity label of a gold NER tag such as ``'B-Name'``.

    Tags without a ``-`` separator (e.g. ``'O'``) are returned unchanged and a
    missing tag becomes the empty string.
    """
    if not tag:
        return ''
    if '-' in tag:
        return tag.split('-', 1)[1]
    return tag


def _accumulate_label_scores(totals, nlp, doc, text, annot):
    """Add this document's token-level scores to ``totals`` for each predicted label."""
    # The gold parse depends only on the document, so build it once here
    # instead of once per predicted entity.
    gold = GoldParse(nlp.make_doc(text), entities=annot.get("entities"))
    gold_labels = [_gold_tag_label(tag) for tag in gold.ner]

    # Score every label once per document, in first-predicted order.
    for label in dict.fromkeys(ent.label_ for ent in doc.ents):
        negative = 'Not ' + label
        # Compare whole labels: a substring test would count a "College Name"
        # tag as a hit for the label "Name".
        y_true = [label if gold_label == label else negative for gold_label in gold_labels]
        y_pred = [token.ent_type_ if token.ent_type_ == label else negative for token in doc]

        precision, recall, fscore, _ = precision_recall_fscore_support(
            y_true, y_pred, average='weighted')
        accuracy = accuracy_score(y_true, y_pred)

        sums = totals.setdefault(label, [0, 0, 0, 0, 0])
        sums[0] += precision
        sums[1] += recall
        sums[2] += fscore
        sums[3] += accuracy
        sums[4] += 1

In [None]:
# Run the whole pipeline: train the NER model, write the predicted entities of
# each test document to a resume<N>.txt file, and print per-entity metrics.
train_spacy()