## Installing and importing MITIE
- Clone the MITIE repository
- Generate the binaries with `make`
```
git clone https://github.com/mit-nlp/MITIE.git
cd MITIE
make
cd mitielib && echo $PWD
```

- Add the path of `mitielib` to `sys.path` before importing `mitie`
- Download and extract models.
```
wget https://github.com/mit-nlp/MITIE/releases/download/v0.4/MITIE-models-v0.2.tar.bz2
tar xvf MITIE-models-v0.2.tar.bz2
```

In [2]:
import sys, os

# Root of the local MITIE checkout (the directory created by `git clone`).
# Set the MITIE_ROOT environment variable to point somewhere else; the
# fallback is the location this notebook was originally written against.
MITIE_ROOT = os.environ.get("MITIE_ROOT", "/home/b/gitpository/MITIE")

# The Python bindings are imported from the checkout's `mitielib` directory,
# so that directory has to be on sys.path before `import mitie`.
sys.path.append(os.path.join(MITIE_ROOT, "mitielib"))

# The trailing "" keeps the final path separator: later cells build model
# file names by plain string concatenation (MODELS_PATH + "<name>.dat").
MODELS_PATH = os.path.join(MITIE_ROOT, "MITIE-models", "english", "")
import mitie

## Importing and displaying data

In [3]:
from utils import fetch_data, read_method
import pandas as pd
import numpy as np
import random

# Load the ATIS training file. fetch_data comes from the local `utils` module;
# its three results are indexed per sentence by the cells below
# (sentences, slot labels, intents).
tr_sents,tr_labels,tr_intents = fetch_data('data2/atis.train.w-intent.iob')

def display(n,intents,sents,labels):
    """Print the intent of sentence `n` and return its words and labels as a table.

    n       -- index of the sentence to show
    intents -- sequence of intents, indexed by sentence
    sents   -- sequence of sentences, each a sequence of words
    labels  -- sequence of label sequences, indexed like `sents`

    Returns a pandas DataFrame with one row per word of the sentence and
    the columns "word" and "label".
    """
    print ("INTENT : ",intents[n])
    words = sents[n]
    rows = [{"word": words[pos], "label": labels[n][pos]} for pos in range(len(words))]
    return pd.DataFrame(rows)

# Sanity check: how many sentences were loaded from the training file
print ("Number of sentences :",len(tr_sents))

Number of sentences : 4978


In [4]:
# Show one randomly chosen training sentence.
# random.randint includes BOTH endpoints, so randint(0, len(tr_sents)) could
# return len(tr_sents) and raise IndexError; randrange excludes the upper bound.
display(random.randrange(len(tr_sents)),tr_intents,tr_sents,tr_labels)

INTENT :  atis_flight


Unnamed: 0,label,word
0,O,show
1,O,me
2,O,flights
3,O,from
4,B-fromloc.city_name,boston
5,O,to
6,B-toloc.city_name,philadelphia
7,O,on
8,O,a
9,B-depart_date.day_name,monday


## Getting entity tuples from label lists

In [5]:
def get_entities(labels):
    """Convert a list of IOB labels into (start, end, entity) tuples.

    labels -- list of label strings: 'O', 'B-<entity>' or 'I-<entity>'

    Returns a list of tuples (start, end, entity) in order of appearance,
    where `entity` is the label text after the 'B-' prefix and
    range(start, end) covers the entity.

    Both indices are shifted down by one relative to positions in `labels`:
    an entity whose B- label sits at position p is reported as starting at
    p - 1 (presumably because the label list carries one leading entry with
    no counterpart in the token list -- confirm against fetch_data).
    Consequently an entity that begins at position 0 is not reported, since
    it would map to index -1.

    An entity is closed by the next 'O' label, by the next 'B-' label, or by
    the end of the list. Closing on 'B-' matters for back-to-back entities
    (e.g. ['B-a', 'B-b']), which were previously lost.
    """
    entities = []
    last_begin = -1   # position of the currently open B- label, -1 if none
    entity = ""
    for idx, label in enumerate(labels):
        starts_entity = label.startswith('B')
        if starts_entity or label.startswith('O'):
            # Either kind of label ends whatever entity is currently open.
            if last_begin > 0:
                entities.append((last_begin - 1, idx - 1, entity))
            last_begin = -1
        if starts_entity:
            last_begin = idx
            entity = label[2:]   # drop the "B-" prefix
    # An entity still open at the end of the list runs up to the last label.
    if last_begin > 0:
        entities.append((last_begin - 1, len(labels) - 1, entity))

    return entities

In [6]:
# Check get_entities on the labels of the first training sentence
label_sample = tr_labels[0]
print ("List of label :\n",label_sample)
print ("List of entities :\n",get_entities(label_sample))

List of label :
 ['O', 'O', 'O', 'O', 'O', 'O', 'B-fromloc.city_name', 'O', 'B-depart_time.time', 'I-depart_time.time', 'O', 'O', 'O', 'B-toloc.city_name', 'O', 'B-arrive_time.time', 'O', 'O', 'B-arrive_time.period_of_day']
List of entities :
 [(5, 6, 'fromloc.city_name'), (7, 9, 'depart_time.time'), (12, 13, 'toloc.city_name'), (14, 15, 'arrive_time.time'), (17, 18, 'arrive_time.period_of_day')]


In [7]:
# Create a trainer backed by the word feature extractor shipped with the models
feature_extractor_file = MODELS_PATH+"total_word_feature_extractor.dat"
trainer = mitie.ner_trainer(feature_extractor_file)

# Register the first `limit` training sentences together with their entities
limit = 10
for tokens, tags in zip(tr_sents[:limit], tr_labels):
    instance = mitie.ner_training_instance(tokens)
    for start, end, tag in get_entities(tags):
        # range(start, end) lists the token positions the entity covers
        instance.add_entity(range(start, end), tag)
    trainer.add(instance)

# Train the ATIS-specific NER model
trainer.num_threads = 4
ner_atis = trainer.train()

## Comparing NER learnt on ATIS dataset with generic NER

In [8]:
# Load the generic NER model (ner_model.dat under MODELS_PATH) that the
# ATIS-trained model is compared against
ner_gen = mitie.named_entity_extractor(MODELS_PATH+"ner_model.dat")

### Labels 

In [9]:
# List the tag set each extractor can emit
print("Generic NER tags:\n", ner_gen.get_possible_ner_tags())
print("ATIS-trained NER tags:\n", ner_atis.get_possible_ner_tags())

Generic NER tags:
 ['PERSON', 'LOCATION', 'ORGANIZATION', 'MISC']
ATIS-trained NER tags:
 ['fromloc.city_name', 'depart_time.time', 'toloc.city_name', 'arrive_time.time', 'arrive_time.period_of_day', 'depart_time.period_of_day', 'flight_time', 'fare_amount', 'depart_date.today_relative', 'depart_date.day_name', 'city_name']


### Entities Recognised

In [14]:
# IPython's display is imported under an alias: an earlier cell defines the
# notebook's own `display(n, intents, sents, labels)` helper, and importing
# the IPython function under the same name would replace it, so re-running
# that earlier cell after this one would fail.
from IPython.display import HTML, display as ipy_display
pd.set_option('display.max_colwidth',300)

# To print multi-line columns in DataFrame
def pretty_print(df):
    """Render `df` as an HTML table with line breaks inside cells.

    The two-character sequence backslash + n in the generated HTML
    (presumably how to_html writes newlines embedded in cell text) is
    replaced with <br> so multi-line cell values show one entry per line.

    Returns whatever IPython's display returns.
    """
    return ipy_display(HTML(df.to_html().replace("\\n","<br>")))

# Loading test dataset
# NOTE(review): this currently reads the *training* file, so the comparison
# below runs on sentences ner_atis may have been trained on. The line that is
# commented out underneath points at the test file -- confirm which is intended.
te_sents,te_labels,te_intents = fetch_data('data2/atis.train.w-intent.iob')
# te_sents,te_labels,te_intents = fetch_data('data2/atis.test.w-intent.iob')

# Returns string of `\n`-separated entities detected by given ner on sentence
def get_ner_results(ner,sentence):
    """Describe the entities `ner` finds in `sentence`, one per line.

    ner      -- object with an extract_entities(sentence) method; each result
                is indexable, with the token positions at [0] and the tag at [1]
    sentence -- list of word strings

    Returns a string with one "<tag>: <words>" entry per entity, joined by
    newlines (empty string when nothing is detected).
    """
    def describe(found):
        positions, tag = found[0], found[1]
        return tag + ": " + " ".join(sentence[i] for i in positions)

    return '\n'.join(describe(found) for found in ner.extract_entities(sentence))

# Run both extractors over the first `limit` sentences and show them side by side
limit = 1
column_order = ["Sentence","Generic-NER : Entities","ATIS-NER : Entities"]

results = []
for sentence, labels in zip(te_sents[:limit], te_labels):
    row = {
        "Sentence": ' '.join(sentence),
        "Generic-NER : Entities": get_ner_results(ner_gen,sentence),
        "ATIS-NER : Entities": get_ner_results(ner_atis,sentence),
    }
    results.append(row)

# Select the columns explicitly so they appear in a fixed order
pretty_print(pd.DataFrame(results)[column_order])

Unnamed: 0,Sentence,Generic-NER : Entities,ATIS-NER : Entities
0,i want to fly from boston at 838 am and arrive in denver at 1110 in the morning,LOCATION: boston,fromloc.city_name: boston depart_time.time: 838 am toloc.city_name: denver arrive_time.time: 1110 arrive_time.period_of_day: morning


## Conclusion
- Trained NER is capable of identifying more types of entities
    - Generic NER is only capable of detecting PERSON, LOCATION, ORGANIZATION and MISC
    - Trained NER is capable of detecting City Names, Date & Time and Fare Amount
- Trained NER is capable of identifying more refined entities
    - Generic NER can only detect `LOCATION`
        - For example : *i want to fly from boston at 838 am and arrive in denver at 1110 in the morning*
        - Generic NER considers *boston* and *denver* to be the same entity (`LOCATION`)
    - Trained NER can detect `fromloc.city_name`, `toloc.city_name` and `city_name`
        - For example : *i would like to find a flight from charlotte to las vegas that makes a stop in st. louis* mentions three cities in three different roles
        - In the earlier example, Trained NER considers *boston* to be `fromloc.city_name` and *denver* to be `toloc.city_name`

#### Reference
- https://github.com/mit-nlp/MITIE/blob/master/examples/python/train_ner.py