In [4]:
import spacy
import random
# Toy NER training set: each item is (text, {'entities': [(start, end, label)]})
# where start/end are character offsets into the text.
TRAIN_DATA = [
    ("Facebook has been accused for leaking personal data of users.", {'entities': [(0, 8, 'ORG')]}),
    ("Tinder uses sophisticated algorithms to find the perfect match.", {'entities': [(0, 6, "ORG")]}),
]

nlp = spacy.blank('en')

# A blank pipeline contains no components, so nlp.update() would have nothing
# to train. Add an entity recognizer and register every label that occurs in
# the training data before initialising the weights.
ner = nlp.create_pipe('ner')
nlp.add_pipe(ner)
for entity_label in sorted({lbl for _, ann in TRAIN_DATA for _, _, lbl in ann['entities']}):
    ner.add_label(entity_label)

optimizer = nlp.begin_training()


In [6]:
# Run 20 passes over the data, reshuffling before each pass and updating the
# pipeline with one example at a time.
for _epoch in range(20):
    random.shuffle(TRAIN_DATA)
    for sample_text, sample_annotation in TRAIN_DATA:
        nlp.update([sample_text], [sample_annotation], sgd=optimizer)

# Serialize the pipeline to disk.
nlp.to_disk('/content')

In [7]:
#A simple example for training a part-of-speech tagger with a custom tag map.


import plac
import random
import spacy
from spacy.util import minibatch, compounding
from pathlib import Path

In [9]:
# Custom tag map: each tag used in the training data maps to a dict giving its
# coarse-grained "pos" value.
TAG_MAP = {
    "N": {"pos": "NOUN"},
    "V": {"pos": "VERB"},
    "J": {"pos": "ADJ"},
}
# Alternatively, a training example can list its tokens explicitly under a
# 'words' key alongside the tags, in the format below:
# ("Eatblueham", {'words': ['Eat', 'blue', 'ham'], 'tags': ['V', 'J', 'N']})

In [10]:
# Tagger training set: each item is (text, {"tags": [...]}) with one tag per
# whitespace-separated word of the text.
TRAIN_DATA = [
    (sentence, {"tags": tag_string.split()})
    for sentence, tag_string in (
        ("I like green eggs", "N V J N"),
        ("Eat blue ham", "V J N"),
    )
]

In [25]:
# NOTE(review): plac.annotations(...) only builds a decorator. Called as a bare
# expression like this, the result is discarded (the cell merely echoes the
# returned function), so these command-line annotations are never attached to
# any function. To take effect it must be applied as `@plac.annotations(...)`
# directly above `def main`.
plac.annotations( lang=("ISO Code of language to use", "option", "l", str),
              output_dir=("Optional output directory", "option", "o", Path), 
              n_iter=("Number of training iterations", "option", "n", int),
              )

<function plac_core.annotations.<locals>.annotate>

In [44]:
def main(lang="en", output_dir=None, n_iter=25):
    """Create a blank model, add a tagger using the custom tag map, and train it.

    After training, the model is run on a short test sentence and the
    predicted tags are printed. If ``output_dir`` is given, the model is saved
    there, loaded back, and run on the same sentence again as a sanity check.

    Args:
        lang: Language code passed to ``spacy.blank``.
        output_dir: Optional directory (str or Path) to save the model to.
            Missing directories are created.
        n_iter: Number of passes over ``TRAIN_DATA``.
    """
    nlp = spacy.blank(lang)
    tagger = nlp.create_pipe("tagger")

    # Register every custom tag together with its attribute mapping.
    for tag, values in TAG_MAP.items():
        tagger.add_label(tag, values)

    nlp.add_pipe(tagger)
    optimizer = nlp.begin_training()

    for _ in range(n_iter):
        # Shuffles the module-level TRAIN_DATA list in place.
        random.shuffle(TRAIN_DATA)
        losses = {}

        batches = minibatch(TRAIN_DATA, size=compounding(4.0, 32.0, 1.001))
        for batch in batches:
            texts, annotations = zip(*batch)
            nlp.update(texts, annotations, sgd=optimizer, losses=losses)
        print('looses', losses)

    test_text = "I like blue eggs"
    doc = nlp(test_text)
    print("Tags", [(t.text, t.tag_, t.pos_) for t in doc])

    if output_dir is not None:
        # Must be pathlib's `Path`; the lowercase `path` is undefined and
        # raised NameError whenever an output directory was supplied.
        output_dir = Path(output_dir)
        # Create the directory (and any missing parents); no error if present.
        output_dir.mkdir(parents=True, exist_ok=True)
        nlp.to_disk(output_dir)
        print("Saved model to", output_dir)

        # Test the saved model by loading it back and tagging the same text.
        print("Loading from", output_dir)
        nlp2 = spacy.load(output_dir)
        doc = nlp2(test_text)
        print("Tags", [(t.text, t.tag_, t.pos_) for t in doc])

In [45]:
# Run the training entry point when this code is executed directly.
if __name__ == "__main__":
    # The plac-based command-line entry point is left disabled; main() is
    # called with its default arguments instead.
    #plac.call(main)
    main()

  "__main__", mod_spec)


looses {'tagger': 6.883332252502441}
looses {'tagger': 6.835843086242676}
looses {'tagger': 6.685646057128906}
looses {'tagger': 6.38253116607666}
looses {'tagger': 5.830503463745117}
looses {'tagger': 4.940542221069336}
looses {'tagger': 3.8143503665924072}
looses {'tagger': 2.6712522506713867}
looses {'tagger': 1.645768642425537}
looses {'tagger': 0.8605381846427917}
looses {'tagger': 0.3807932436466217}
looses {'tagger': 0.15221552550792694}
looses {'tagger': 0.05709473043680191}
looses {'tagger': 0.020614877343177795}
looses {'tagger': 0.0072023263201117516}
looses {'tagger': 0.0024954171385616064}
looses {'tagger': 0.0008693357813172042}
looses {'tagger': 0.00030711828731000423}
looses {'tagger': 0.00010984686377923936}
looses {'tagger': 4.0378774428972974e-05}
looses {'tagger': 1.5559327948722057e-05}
looses {'tagger': 6.347153430397157e-06}
looses {'tagger': 2.7576165848586243e-06}
looses {'tagger': 1.2761089465129771e-06}
looses {'tagger': 6.28177986072842e-07}
Tags [('I', 'N',