# Name Entity Recognition
## Bidirectional-LSTM-CRF model

In [1]:
import pandas as pd
import numpy as np
import os

### Download Entity-Annotated-Corpus

#### Option 1: through Google Drive
* Download corpus dataset from link: https://drive.google.com/file/d/1JZ4JXuJrEG1e9OiM1PEoRVtd9wcAIVz9/view?usp=sharing
* Upload corpus dataset back to your Google Colab under **/content** directory


#### Option 2: using Kaggle API 
* Generate and download Kaggle API token as **kaggle.json** file to **/content** directory of Google Colab
* Move **kaggle.json** to **~/.kaggle/kaggle.json** by command: **!mv kaggle.json ~/.kaggle/kaggle.json**
* Provide access by command: **!chmod 600 ~/.kaggle/kaggle.json**
* Download corpus dataset: **!kaggle datasets download -d abhinavwalia95/entity-annotated-corpus**

In [3]:
try:
    !mv kaggle.json ~/.kaggle.json
    !chmod 600 ~/.kaggle/kaggle.json
    !kaggle datasets download -d abhinavwalia95/entity-annotated-corpus --unzip --force
except:
    print("Please see Option 1 to get Entity-Annotated-Corpus")

entity-annotated-corpus.zip: Skipping, found more recently modified local copy (use --force to force download)


### Import corpus

In [26]:
data = pd.read_csv('ner_dataset.csv', encoding = 'latin1')
data = data.fillna(method="ffill")

In [27]:
data.head()

Unnamed: 0,Sentence #,Word,POS,Tag
0,Sentence: 1,Thousands,NNS,O
1,Sentence: 1,of,IN,O
2,Sentence: 1,demonstrators,NNS,O
3,Sentence: 1,have,VBP,O
4,Sentence: 1,marched,VBN,O


In [28]:
class SentenceGetter(object):
    
    def __init__(self, data):
        self.n_sent = 1
        self.data = data
        self.empty = False
        agg_func = lambda s: [(w, p, t) for w, p, t in zip(s["Word"].values.tolist(),
                                                           s["POS"].values.tolist(),
                                                           s["Tag"].values.tolist())]
        self.grouped = self.data.groupby("Sentence #").apply(agg_func)
        self.sentences = [s for s in self.grouped]
    
    def get_next(self):
        try:
            s = self.grouped["Sentence: {}".format(self.n_sent)]
            self.n_sent += 1
            return s
        except:
            return None

In [29]:
getter = SentenceGetter(data)
sent = getter.get_next()

In [30]:
print(sent)

[('Families', 'NNS', 'O'), ('of', 'IN', 'O'), ('soldiers', 'NNS', 'O'), ('killed', 'VBN', 'O'), ('in', 'IN', 'O'), ('the', 'DT', 'O'), ('conflict', 'NN', 'O'), ('joined', 'VBD', 'O'), ('the', 'DT', 'O'), ('protesters', 'NNS', 'O'), ('who', 'WP', 'O'), ('carried', 'VBD', 'O'), ('banners', 'NNS', 'O'), ('with', 'IN', 'O'), ('such', 'JJ', 'O'), ('slogans', 'NNS', 'O'), ('as', 'IN', 'O'), ('"', '``', 'O'), ('Bush', 'NNP', 'B-per'), ('Number', 'NN', 'O'), ('One', 'CD', 'O'), ('Terrorist', 'NN', 'O'), ('"', '``', 'O'), ('and', 'CC', 'O'), ('"', '``', 'O'), ('Stop', 'VB', 'O'), ('the', 'DT', 'O'), ('Bombings', 'NNS', 'O'), ('.', '.', 'O'), ('"', '``', 'O')]


In [37]:
from sklearn.utils import shuffle
shuffle(getter.grouped).reset_index().rename(columns = {0 : 'sentence'})

Unnamed: 0,Sentence #,sentence
0,Sentence: 846,"[(A, DT, O), (short, JJ, O), (time, NN, O), (l..."
1,Sentence: 43712,"[(Experts, NNS, O), (say, VBP, O), (global, JJ..."
2,Sentence: 33758,"[(Ms., NNP, B-per), (Aslam, NNP, I-per), (warn..."
3,Sentence: 8220,"[(He, PRP, O), (denied, VBD, O), (accusations,..."
4,Sentence: 20757,"[(The, DT, O), (migrants, NNS, O), (had, VBD, ..."
5,Sentence: 35823,"[(Afghanistan, NNP, B-geo), ('s, POS, O), (eco..."
6,Sentence: 45064,"[(He, PRP, O), (said, VBD, O), (Japan, NNP, B-..."
7,Sentence: 20127,"[(They, PRP, O), (also, RB, O), (called, VBD, ..."
8,Sentence: 26782,"[(At, IN, O), (least, JJS, O), (38, CD, O), (p..."
9,Sentence: 36987,"[(On, IN, O), (the, DT, O), (economy, NN, O), ..."


In [17]:
getter.sentences

[[('Thousands', 'NNS', 'O')],
 [('Iranian', 'JJ', 'B-gpe')],
 [('Helicopter', 'NN', 'O')],
 [('They', 'PRP', 'O')],
 [('U.N.', 'NNP', 'B-geo')],
 [('Mr.', 'NNP', 'B-per')],
 [('He', 'PRP', 'O')],
 [('Some', 'DT', 'O')],
 [('Aid', 'NNP', 'O')],
 [('Lebanese', 'JJ', 'B-gpe')],
 [('In', 'IN', 'O')],
 [('One', 'CD', 'O')],
 [('Lebanon', 'NNP', 'B-geo')],
 [('Syria', 'NNP', 'B-geo')],
 [('The', 'DT', 'O')],
 [('Israeli', 'JJ', 'B-gpe')],
 [('Doctors', 'NNS', 'O')],
 [('The', 'DT', 'O')],
 [('Doctors', 'NNS', 'O')],
 [('Mr.', 'NNP', 'B-per')],
 [('Doctors', 'NNS', 'O')],
 [('The', 'DT', 'O')],
 [('SpaceShipOne', 'NNP', 'B-art')],
 [('To', 'TO', 'O')],
 [('The', 'DT', 'O')],
 [('Three', 'CD', 'O')],
 [('The', 'DT', 'O')],
 [('SpaceShipOne', 'NNP', 'B-art')],
 [('North', 'NNP', 'B-geo')],
 [('The', 'DT', 'O')],
 [('It', 'PRP', 'O')],
 [('The', 'DT', 'O')],
 [('Most', 'JJS', 'O')],
 [('Last', 'JJ', 'O')],
 [('A', 'DT', 'O')],
 [('The', 'DT', 'O')],
 [('Prime', 'JJ', 'O')],
 [('The', 'DT', 'O')]