In [1]:
import plac
import random
from pathlib import Path
import spacy
from spacy.util import minibatch, compounding

In [2]:
# Build a blank English Language object; components are added in later cells.
nlp = spacy.blank("en")
print("Created blank 'en' model")

Created blank 'en' model


In [3]:
# Custom entity types registered with the recognizer (order preserved).
# NOTE(review): 'AGENT' has no annotated example in TRAIN_DATA.
labels = [
    'REFNO', 'PNR', 'TTIME', 'SRC',
    'DES', 'AGENT', 'PASSENGER', 'FLTNO',
]

In [4]:
# Only the entity recognizer is trained in this notebook, so a dependency
# parser is deliberately NOT created here. A newly created parser would never
# receive labels or updates (every non-NER pipe is disabled in the training
# cell), yet it would sit first in the pipeline and run on every nlp(text)
# call with untrained weights.
# If the pipeline already contains a parser, keep a handle to it; otherwise
# `parser` stays defined as None.
parser = nlp.get_pipe('parser') if 'parser' in nlp.pipe_names else None

In [5]:
# Fetch the entity recognizer if the pipeline already has one; otherwise
# create it and add it to the pipeline, so labels can be registered on it.
if 'ner' in nlp.pipe_names:
    ner = nlp.get_pipe('ner')
else:
    ner = nlp.create_pipe('ner')
    nlp.add_pipe(ner)

In [6]:
# No existing model is being resumed; the optimizer cell branches on this.
model = None

# Register every custom entity type with the recognizer.
for label in labels:
    ner.add_label(label)

In [7]:
if model is not None:
    # Resuming an existing model: take the optimizer from the NER component
    # only. 'begin_training' initializes the models, so it would zero out
    # existing entity types.
    optimizer = nlp.entity.create_optimizer()
else:
    # Starting from scratch: initialize the models and get the optimizer.
    optimizer = nlp.begin_training()



In [25]:
# Training samples in the (text, {'entities': [...]}) shape passed to
# nlp.update. Each span is (start_char, end_char, label) with end_char
# exclusive, so text[start_char:end_char] is exactly the entity text.
# Sample order (including repeated samples) is kept as originally written.
TRAIN_DATA = [
    (text, {'entities': spans})
    for text, spans in [
        ## Booking ID samples
        ("Booking Id: NF22696100810981", [(12, 28, 'REFNO')]),
        ("Booking Id: NF22696100810981", [(12, 28, 'REFNO')]),
        ("Booking Id: NF22695116846185", [(12, 28, 'REFNO')]),
        ("Booking Id: NF22995132466090", [(12, 28, 'REFNO')]),
        ("Booking Id: NF22996135146142", [(12, 28, 'REFNO')]),

        ## Time and date samples
        ("17:00 hrs, 18 Mar", [(0, 17, 'TTIME')]),
        ("19:45 hrs, 18 Mar", [(0, 17, 'TTIME')]),
        ("07:15 hrs, 03 Feb", [(0, 17, 'TTIME')]),
        ("10:05 hrs, 03 Feb", [(0, 17, 'TTIME')]),
        ("08:35 hrs, 18 Aug", [(0, 17, 'TTIME')]),
        ("10:55 hrs, 18 Aug", [(0, 17, 'TTIME')]),

        ## Flight number samples
        ("G8-118", [(0, 6, 'FLTNO')]),
        ("G8-557", [(0, 6, 'FLTNO')]),
        ("G8-2847", [(0, 7, 'FLTNO')]),
        ("G8-284/ 4427", [(0, 12, 'FLTNO')]),
        ("6E-3794/ 154", [(0, 12, 'FLTNO')]),
        ("6E-606", [(0, 6, 'FLTNO')]),
        ("G8-557", [(0, 6, 'FLTNO')]),
        ("SG-185", [(0, 6, 'FLTNO')]),
        ("6E-528", [(0, 6, 'FLTNO')]),
        ("I5-247", [(0, 6, 'FLTNO')]),
        ("I5-722", [(0, 6, 'FLTNO')]),
        ("G6-142", [(0, 6, 'FLTNO')]),
        ("G6-411", [(0, 6, 'FLTNO')]),
        ("UK-817", [(0, 6, 'FLTNO')]),
        ("AI-503", [(0, 6, 'FLTNO')]),

        ## Passenger samples (the span covers the name only, not the
        ## leading "N. " index or the trailing ", Adult"/", Child")
        ("1. Shahrukh Khan, Adult", [(3, 16, 'PASSENGER')]),
        ("2. Ramesh Natarajan, Adult", [(3, 19, 'PASSENGER')]),
        ("1. Ranbir Kapoor, Adult", [(3, 16, 'PASSENGER')]),
        ("2. Vivek Kumar, Child", [(3, 14, 'PASSENGER')]),
        ("2. Manohar P, Adult", [(3, 12, 'PASSENGER')]),
        ("1. Vinay I, Child", [(3, 10, 'PASSENGER')]),
        ("3. Kavita Venkat, Adult", [(3, 16, 'PASSENGER')]),
        ("3. Nitin Datta, Adult", [(3, 14, 'PASSENGER')]),
        ("1. Aditya Khar, Child", [(3, 14, 'PASSENGER')]),
        ("2. Abhishek Khare, Adult", [(3, 17, 'PASSENGER')]),
        ("3. Manish Sharma, Adult", [(3, 16, 'PASSENGER')]),
        ("4. Virat Kohli, Child", [(3, 14, 'PASSENGER')]),
        ("2. Vivek Kumar, Child", [(3, 14, 'PASSENGER')]),
        ("2. Mr. MS Dhoni, Adult", [(3, 15, 'PASSENGER')]),

        ## Source and destination samples
        ("DELHI TO PUNE", [(0, 5, 'SRC'), (9, 13, 'DES')]),
        ("PUNE TO DELHI", [(0, 4, 'SRC'), (8, 13, 'DES')]),
        ("BANGALORE TO CHENNAI", [(0, 9, 'SRC'), (13, 20, 'DES')]),
        # The trailing space in the next text is part of the sample.
        ("DELHI TO CHENNAI ", [(0, 5, 'SRC'), (9, 16, 'DES')]),
        ("CHENNAI TO DELHI", [(0, 7, 'SRC'), (11, 16, 'DES')]),
        # A PNR sample that sits among the routes; position kept as-is.
        ("EDYTGT", [(0, 6, 'PNR')]),
        ("AGARTALA TO JAIPUR", [(0, 8, 'SRC'), (12, 18, 'DES')]),

        ## PNR no. samples
        ("ZH17NW", [(0, 6, 'PNR')]),
        ("E758YP", [(0, 6, 'PNR')]),
        ("MI7WUQ", [(0, 6, 'PNR')]),
    ]
]

In [26]:
n_iter = 20
SHUFFLE_SEED = 0  # fixed so the per-epoch example order is reproducible

# Names of every pipe except 'ner'; they are disabled while training.
other_pipes = [pipe for pipe in nlp.pipe_names if pipe != 'ner']

# Private RNG: reproducible shuffling without touching the global `random`
# state. This pins only the example order, not any randomness inside spaCy.
shuffle_rng = random.Random(SHUFFLE_SEED)

# NOTE: re-running this cell keeps updating the same `nlp` object, so the
# model accumulates another n_iter passes each time the cell is executed.
with nlp.disable_pipes(*other_pipes):  # only train NER
    for itn in range(n_iter):
        # Shuffle a copy so TRAIN_DATA keeps the order it was defined in.
        epoch_examples = list(TRAIN_DATA)
        shuffle_rng.shuffle(epoch_examples)
        losses = {}
        # batch up the examples using spaCy's minibatch; the compounding
        # schedule is recreated for every pass over the data
        batches = minibatch(epoch_examples, size=compounding(4., 32., 1.001))
        for batch in batches:
            texts, annotations = zip(*batch)
            nlp.update(texts, annotations, sgd=optimizer, drop=0.35,
                       losses=losses)
        print('Losses', losses)


Losses {'ner': 0.4865384628035122}
Losses {'ner': 0.25775649883419605}
Losses {'ner': 0.0646258525680429}
Losses {'ner': 2.8932435333819317e-05}
Losses {'ner': 0.07439277803996651}
Losses {'ner': 0.0002331116880010124}
Losses {'ner': 0.11948947655922801}
Losses {'ner': 2.0293938753382582e-05}
Losses {'ner': 0.011089605017763017}
Losses {'ner': 3.1763118646344514e-08}
Losses {'ner': 2.1275904832153723e-11}
Losses {'ner': 0.001316615893689609}
Losses {'ner': 0.09674176801769263}
Losses {'ner': 0.0027293836074012484}
Losses {'ner': 0.12499685592427243}
Losses {'ner': 0.007722765623325879}
Losses {'ner': 6.514743453469244e-13}
Losses {'ner': 0.009697374375991606}
Losses {'ner': 0.10343323699448624}
Losses {'ner': 6.410270944430178e-09}


In [27]:
# Probe: a flight number that does not appear in TRAIN_DATA.
doc1 = nlp("AI-48")
for ent in doc1.ents:
    print('%s %s' % (ent.label_, ent.text))

FLTNO AI-48


In [28]:
# Probe: a flight number that appears verbatim in TRAIN_DATA (seen example).
doc2 = nlp("SG-185")
for ent in doc2.ents:
    print('%s %s' % (ent.label_, ent.text))

FLTNO SG-185


In [29]:
# Probe: an unseen flight number with a four-digit suffix.
doc3 = nlp("6E-1854")
for ent in doc3.ents:
    print('%s %s' % (ent.label_, ent.text))

FLTNO 6E-1854


In [30]:
# Probe: both cities occur in TRAIN_DATA, but not as this pair.
doc4 = nlp("PUNE TO CHENNAI")
for ent in doc4.ents:
    print('%s %s' % (ent.label_, ent.text))

SRC PUNE
DES CHENNAI


In [31]:
# Probe: neither city occurs in TRAIN_DATA.
doc5 = nlp("CHANDIGARH TO HYDERABAD")
for ent in doc5.ents:
    print('%s %s' % (ent.label_, ent.text))

SRC CHANDIGARH
DES HYDERABAD


In [32]:
# Probe: BANGALORE is only ever a source (SRC) in TRAIN_DATA; here it is the
# destination.
doc6 = nlp("DELHI TO BANGALORE")
for ent in doc6.ents:
    print('%s %s' % (ent.label_, ent.text))

SRC DELHI
DES BANGALORE


In [33]:
# Probe: an unseen passenger line in the same "N. Name, Adult" layout as the
# PASSENGER training samples.
# NOTE(review): the recorded output for this cell is "TTIME Brian", not a
# PASSENGER span covering the full name, so passenger extraction does not
# generalise to this input.
doc7 = nlp("2. Brian Lara, Adult")
for ent in doc7.ents:
    print('%s %s' % (ent.label_, ent.text))

TTIME Brian


In [34]:
# Probe: another unseen passenger line.
# NOTE(review): the recorded output for this cell is "PNR Ricky" rather than
# a PASSENGER span covering the full name.
doc8 = nlp("1. Ricky Ponting, Adult")
for ent in doc8.ents:
    print('%s %s' % (ent.label_, ent.text))

PNR Ricky


In [35]:
# Probe: a third unseen passenger line.
# NOTE(review): the recorded output for this cell is "PNR Sachin" rather than
# a PASSENGER span covering the full name.
doc9 = nlp("3. Sachin Tendulkar, Adult")
for ent in doc9.ents:
    print('%s %s' % (ent.label_, ent.text))

PNR Sachin


In [36]:
# Probe: a six-character PNR that does not appear in TRAIN_DATA.
doc10 = nlp("KB6RAE")
for ent in doc10.ents:
    print('%s %s' % (ent.label_, ent.text))

PNR KB6RAE


In [20]:
# Probe: an unseen, letters-only six-character PNR.
doc11 = nlp("PWRHSO")
for ent in doc11.ents:
    print('%s %s' % (ent.label_, ent.text))

PNR PWRHSO


In [41]:
# Probe: same input text as the doc10 cell (repeated check).
doc12 = nlp("KB6RAE")
for ent in doc12.ents:
    print('%s %s' % (ent.label_, ent.text))

PNR KB6RAE


In [42]:
# Probe: a booking line that appears verbatim in TRAIN_DATA (seen example).
doc13 = nlp("Booking Id: NF22696100810981")
for ent in doc13.ents:
    print('%s %s' % (ent.label_, ent.text))

REFNO NF22696100810981


In [43]:
# Probe: an unseen booking ID; it is 15 characters long, whereas every REFNO
# in TRAIN_DATA is 16 characters.
doc14 = nlp("Booking Id: NF2204240022047")
for ent in doc14.ents:
    print('%s %s' % (ent.label_, ent.text))

REFNO NF2204240022047


In [44]:
# Probe: a second unseen 15-character booking ID.
doc15 = nlp("Booking Id: NF2202139868170")
for ent in doc15.ents:
    print('%s %s' % (ent.label_, ent.text))

REFNO NF2202139868170
