In [5]:
import json
import spacy
import random
from pathlib import Path

model_dir = 'C:/Users/prati/source/repos/RnD/SkillSetModel'

def train_SkillSet_NER(model=None, output_dir=model_dir, n_iter=10):
                       
    """Load the model, set up the pipeline and train the entity recognizer."""
    if model is not None:
        nlp = spacy.load(model)  # load existing spaCy model
        print("Loaded model '%s'" % model)
    else:
        nlp = spacy.blank("en")  # create blank Language class
        print("Created blank 'en' model")

    # create the built-in pipeline components and add them to the pipeline
    # nlp.create_pipe works for built-ins that are registered with spaCy
    if "ner" not in nlp.pipe_names:
        ner = nlp.create_pipe("ner")
        nlp.add_pipe(ner, last=True)
    # otherwise, get it so we can add labels
    else:
        ner = nlp.get_pipe("ner")
                       
    
    training_data = []
    lines=[]

    with open("SkillSet.json", encoding="utf8") as f:
        lines = f.readlines()
        #print (lines)

    for line in lines:
        print("Inside Lines {line}")
        data = json.loads(line)
        text = data['content']
        entities = []
        for annotation in data['annotation']:
            #only a single point in text annotation.
            point = annotation['points'][0]
            labels = annotation['label']
            # handle both list of labels or a single label.
            if not isinstance(labels, list):
                labels = [labels]

            for label in labels:
               #dataturks indices are both inclusive [start, end] but spacy is not [start, end)
               entities.append((point['start'], point['end'] + 1 ,label))

            training_data.append((text, {"entities" : entities}))

    #print(nlp.pipe_names)

    if 'ner' not in nlp.pipe_names:
            ner = nlp.create_pipe('ner')
            nlp.add_pipe(ner, last=True)

    # add labels
    for _, annotations in training_data:
         for ent in annotations.get('entities'):
             ner.add_label(ent[2])

     # get names of other pipes to disable them during training
    other_pipes = [pipe for pipe in nlp.pipe_names if pipe != 'ner']

    with nlp.disable_pipes(*other_pipes):  # only train NER
        optimizer = nlp.begin_training()

        for itn in range(n_iter):
            print("Statring iteration " + str(itn))
            random.shuffle(training_data)
            losses = {}

            for text, annotations in training_data:
               nlp.update(
                   [text],  # batch of texts
                   [annotations],  # batch of annotations
                   drop=0.2,  # dropout - make it harder to memorise data
                   sgd=optimizer,  # callable to update weights
                   losses=losses)

               print(losses)   
                       
                       
                       
     # save model to output directory
    if output_dir is not None:
        output_dir = Path(output_dir)
        if not output_dir.exists():
            output_dir.mkdir()
        nlp.to_disk(output_dir)
        print("Saved model to", output_dir)

        # test the saved model

In [6]:
train_SkillSet_NER()

Created blank 'en' model
Inside Lines {line}
Inside Lines {line}
Inside Lines {line}
Inside Lines {line}
Inside Lines {line}
Inside Lines {line}
Inside Lines {line}
Inside Lines {line}
Inside Lines {line}
Inside Lines {line}
Inside Lines {line}
Inside Lines {line}
Inside Lines {line}
Inside Lines {line}
Inside Lines {line}
Inside Lines {line}
Inside Lines {line}
Inside Lines {line}
Inside Lines {line}
Inside Lines {line}
Inside Lines {line}
Inside Lines {line}
Inside Lines {line}
Inside Lines {line}
Statring iteration 0
{'ner': 16.71428418159485}
{'ner': 36.00258030742407}
{'ner': 56.58012779802084}
{'ner': 74.15323778986931}
{'ner': 93.33000287413597}
{'ner': 106.44837513566017}
{'ner': 122.35092070698738}
{'ner': 140.10013329982758}
{'ner': 150.3291722163558}
{'ner': 161.93349238485098}
{'ner': 169.07436922937632}
{'ner': 179.31172242760658}
{'ner': 185.7125839088112}
{'ner': 189.75262790359557}
{'ner': 191.76769285462797}
{'ner': 193.88012078932206}
{'ner': 206.60249129063868}
{'ner

{'ner': 54.00385360716792}
{'ner': 54.003853825876966}
{'ner': 54.00387741016831}
{'ner': 54.00387742063073}
{'ner': 54.003877445601056}
{'ner': 54.00387750498424}
{'ner': 54.0109772065593}
{'ner': 54.0110065144043}
{'ner': 54.01100651614111}
{'ner': 54.011033887898606}
{'ner': 54.01112593585752}
{'ner': 58.16399639144652}
{'ner': 58.16400110098047}
{'ner': 58.16400135151621}
{'ner': 58.16436488034282}
{'ner': 76.51676817664327}
{'ner': 76.51729059718296}
{'ner': 76.5172906093852}
{'ner': 79.30265111631763}
{'ner': 79.30278124036089}
{'ner': 79.30278525174442}
Statring iteration 4
{'ner': 3.134905189636608e-06}
{'ner': 0.0015930687908872045}
{'ner': 0.0015931414709309396}
{'ner': 0.0016051445955958992}
{'ner': 0.0016194038067082381}
{'ner': 0.0016195437382055754}
{'ner': 0.0016221151832054408}
{'ner': 0.0016221184014582352}
{'ner': 0.002563196828612038}
{'ner': 0.004684285777421729}
{'ner': 0.004707184255248322}
{'ner': 0.008670507507896733}
{'ner': 2.441009161539996}
{'ner': 2.4416458

{'ner': 13.103217133126428}
{'ner': 13.122661845033077}
{'ner': 13.12346088460561}
{'ner': 13.251317033976015}
{'ner': 13.251319443703022}
{'ner': 13.251319724227395}
{'ner': 13.251691625113068}
{'ner': 15.249282413068224}
{'ner': 15.249282413076228}
{'ner': 15.249309711992206}
{'ner': 15.249858365740701}
{'ner': 15.249858491150333}
{'ner': 15.329141547215412}
{'ner': 15.329141547256935}
{'ner': 15.329589583698626}
{'ner': 15.329934243797759}
{'ner': 15.329935526728349}
{'ner': 15.32993559283413}
{'ner': 15.32993559452584}
{'ner': 15.32994075327026}
{'ner': 15.329940753569492}
{'ner': 15.329940825049542}
{'ner': 16.315310023581215}
{'ner': 16.31531005640203}
{'ner': 16.315310056402335}
{'ner': 16.31531006518312}
{'ner': 16.31531046994967}
{'ner': 16.315311631569514}
{'ner': 16.315311631569646}
{'ner': 16.315311631576293}
Statring iteration 8
{'ner': 1.9406394415830974e-12}
{'ner': 1.909552271112702}
{'ner': 1.9095523689961262}
{'ner': 1.9095525345391924}
{'ner': 1.9095525435736607}
{'n

In [8]:
print("Loading trained model from:", model_dir)
nlp2 = spacy.load(model_dir)
doc2 = nlp2("""
JPA Hibernate framework, RESTful web services, JDK, SQL and No-SQL
""")

print ("Entities= " + str(["" + str(ent.text) + " -> " + str(ent.label_) for ent in doc2.ents]))

for token in doc2:
    print(token, token.ent_type_)

Loading trained model from: C:/Users/prati/source/repos/RnD/SkillSetModel
Entities= ['Hibernate -> Skillset', 'RESTful -> Skillset', 'JDK -> Skillset', 'SQL -> Skillset', 'SQL -> Skillset']

 
JPA 
Hibernate Skillset
framework 
, 
RESTful Skillset
web 
services 
, 
JDK Skillset
, 
SQL Skillset
and 
No 
- 
SQL Skillset

 
