In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt


In [None]:
data = pd.read_csv("ner_dataset.csv", encoding="latin1")

In [None]:
data.tail(10)

Unnamed: 0,Sentence #,Word,POS,Tag
1048565,,impact,NN,O
1048566,,.,.,O
1048567,Sentence: 47959,Indian,JJ,B-gpe
1048568,,forces,NNS,O
1048569,,said,VBD,O
1048570,,they,PRP,O
1048571,,responded,VBD,O
1048572,,to,TO,O
1048573,,the,DT,O
1048574,,attack,NN,O


In [None]:
# Only the first token of each sentence carries its "Sentence #" label (see the
# tail() output above); propagate the last seen value down to the following rows.
# DataFrame.ffill() replaces the deprecated fillna(method='ffill') spelling.
data = data.ffill()

In [None]:
data.tail(10)

Unnamed: 0,Sentence #,Word,POS,Tag
1048565,Sentence: 47958,impact,NN,O
1048566,Sentence: 47958,.,.,O
1048567,Sentence: 47959,Indian,JJ,B-gpe
1048568,Sentence: 47959,forces,NNS,O
1048569,Sentence: 47959,said,VBD,O
1048570,Sentence: 47959,they,PRP,O
1048571,Sentence: 47959,responded,VBD,O
1048572,Sentence: 47959,to,TO,O
1048573,Sentence: 47959,the,DT,O
1048574,Sentence: 47959,attack,NN,O


In [None]:
data.tail()

Unnamed: 0,Sentence #,Word,POS,Tag
1048570,Sentence: 47959,they,PRP,O
1048571,Sentence: 47959,responded,VBD,O
1048572,Sentence: 47959,to,TO,O
1048573,Sentence: 47959,the,DT,O
1048574,Sentence: 47959,attack,NN,O


In [None]:
data.isnull().sum()

Sentence #    0
Word          0
POS           0
Tag           0
dtype: int64

In [None]:
# number of distinct word forms in the dataset (vocabulary size)
data["Word"].nunique()

35178

In [None]:
class Sentencegetter(object):
    """Walk through a token-level DataFrame one sentence at a time.

    The frame must provide the columns "Sentence #", "Word", "POS" and "Tag",
    with "Sentence #" holding labels of the form "Sentence: <n>".

    Attributes:
        n_sent: number of the sentence the next get_next() call will look up
            (starts at 1).
        data: the DataFrame handed to the constructor.
        empty: becomes True once get_next() finds no rows for the requested
            sentence.
    """

    def __init__(self, data):
        self.n_sent = 1
        self.data = data
        self.empty = False

    def get_next(self):
        """Return the next sentence, or None when there is none left.

        Returns:
            A tuple (words, pos_tags, tags) of three equally long lists for
            sentence number ``n_sent``, after which ``n_sent`` is advanced.
            If no row carries that sentence label, ``empty`` is set to True,
            ``n_sent`` is left unchanged and None is returned.

        Raises:
            KeyError: if one of the required columns is missing from the frame.
        """
        label = "Sentence: {}".format(self.n_sent)
        # Boolean mask over the whole frame; every call rescans all rows.
        s = self.data[self.data["Sentence #"] == label]
        # A missing sentence yields an empty selection rather than an
        # exception, so exhaustion has to be detected explicitly.
        if s.empty:
            self.empty = True
            return None
        self.n_sent += 1
        return s["Word"].values.tolist(), s["POS"].values.tolist(), s["Tag"].values.tolist()


In [None]:
getter = Sentencegetter(data)
sent, pos, tag = getter.get_next()

In [None]:
print(sent); print(pos); print(tag)

['Thousands', 'of', 'demonstrators', 'have', 'marched', 'through', 'London', 'to', 'protest', 'the', 'war', 'in', 'Iraq', 'and', 'demand', 'the', 'withdrawal', 'of', 'British', 'troops', 'from', 'that', 'country', '.']
['NNS', 'IN', 'NNS', 'VBP', 'VBN', 'IN', 'NNP', 'TO', 'VB', 'DT', 'NN', 'IN', 'NNP', 'CC', 'VB', 'DT', 'NN', 'IN', 'JJ', 'NNS', 'IN', 'DT', 'NN', '.']
['O', 'O', 'O', 'O', 'O', 'O', 'B-geo', 'O', 'O', 'O', 'O', 'O', 'B-geo', 'O', 'O', 'O', 'O', 'O', 'B-gpe', 'O', 'O', 'O', 'O', 'O']


In [None]:
from sklearn.base import BaseEstimator, TransformerMixin


class MemoryTagger(BaseEstimator, TransformerMixin):
    '''
    Baseline tagger that memorises, for every word seen in fit(), the tag it
    was most often paired with.

    Attributes set by fit():
        tags: list of the distinct tags, in order of first appearance.
        memory: dict mapping each word to its most frequent tag.
    '''

    def fit(self, X, y):
        '''
        Expects a list of words as X and a list of tags as y.

        Counts how often each tag occurs with each word and keeps the most
        frequent tag per word (on a tie, the tag seen first for that word).
        Returns self, following the scikit-learn estimator convention.
        '''
        counts = {}     # word -> {tag: number of occurrences}
        seen_tags = {}  # dict used as an insertion-ordered set of tags
        for word, tag in zip(X, y):
            seen_tags.setdefault(tag, None)
            per_word = counts.setdefault(word, {})
            per_word[tag] = per_word.get(tag, 0) + 1
        self.tags = list(seen_tags)
        # max() returns the first key with the highest count, so ties resolve
        # to the tag that was encountered first for the word.
        self.memory = {word: max(tag_counts, key=tag_counts.get)
                       for word, tag_counts in counts.items()}
        return self

    def predict(self, X, y=None):
        '''
        Predict the tag from memory. If word is unknown, predict 'O'.

        X is an iterable of words; y is ignored. Returns a list with one tag
        per word.
        '''
        return [self.memory.get(x, 'O') for x in X]

In [None]:
tagger = MemoryTagger()

In [None]:
tagger.fit(sent,tag)

In [None]:
print(tagger.predict(sent))

['O', 'O', 'O', 'O', 'O', 'O', 'B-geo', 'O', 'O', 'O', 'O', 'O', 'B-geo', 'O', 'O', 'O', 'O', 'O', 'B-gpe', 'O', 'O', 'O', 'O', 'O']


In [None]:
tagger.memory

{'.': 'O',
 'British': 'B-gpe',
 'Iraq': 'B-geo',
 'London': 'B-geo',
 'Thousands': 'O',
 'and': 'O',
 'country': 'O',
 'demand': 'O',
 'demonstrators': 'O',
 'from': 'O',
 'have': 'O',
 'in': 'O',
 'marched': 'O',
 'of': 'O',
 'protest': 'O',
 'that': 'O',
 'the': 'O',
 'through': 'O',
 'to': 'O',
 'troops': 'O',
 'war': 'O',
 'withdrawal': 'O'}

In [None]:
import sklearn
from sklearn.model_selection import cross_val_predict
from sklearn.metrics import classification_report

In [None]:
words = data["Word"].values.tolist()
tags = data["Tag"].values.tolist()

In [None]:
len(words)

1048575

In [None]:
words[:3]

['Thousands', 'of', 'demonstrators']

In [None]:
tags[:10]

['O', 'O', 'O', 'O', 'O', 'O', 'B-geo', 'O', 'O', 'O']

In [None]:
pred = cross_val_predict(estimator=MemoryTagger(), X=words, y=tags, cv=5)


In [None]:
report = classification_report(y_true=tags, y_pred=pred)
print(report)



              precision    recall  f1-score   support

       B-art       0.20      0.05      0.09       402
       B-eve       0.54      0.25      0.34       308
       B-geo       0.78      0.85      0.81     37644
       B-gpe       0.94      0.93      0.94     15870
       B-nat       0.42      0.28      0.33       201
       B-org       0.67      0.49      0.56     20143
       B-per       0.78      0.65      0.71     16990
       B-tim       0.87      0.77      0.82     20333
       I-art       0.04      0.01      0.01       297
       I-eve       0.39      0.12      0.18       253
       I-geo       0.73      0.58      0.65      7414
       I-gpe       0.62      0.45      0.52       198
       I-nat       0.00      0.00      0.00        51
       I-org       0.69      0.53      0.60     16784
       I-per       0.73      0.65      0.69     17251
       I-tim       0.58      0.13      0.21      6528
           O       0.97      0.99      0.98    887908

    accuracy              

In [None]:
def feature_map(word):
    '''
    Encode a single token as a 6-element numeric vector of surface features.

    Element order: title-cased, all lower-case, all upper-case, character
    count, digits only, letters only.
    '''
    casing = (word.istitle(), word.islower(), word.isupper())
    content = (word.isdigit(), word.isalpha())
    return np.array([*casing, len(word), *content])


In [None]:
words = [feature_map(w) for w in data["Word"].values.tolist()]


In [None]:
words[:3]

[array([1, 0, 0, 9, 0, 1]),
 array([0, 1, 0, 2, 0, 1]),
 array([ 0,  1,  0, 13,  0,  1])]

In [None]:
from sklearn.ensemble import RandomForestClassifier

In [None]:
# Fix the forest's random_state so the cross-validated predictions (and the
# report printed below) are reproducible from run to run.
pred = cross_val_predict(RandomForestClassifier(n_estimators=20, random_state=42), X=words, y=tags, cv=5)

In [None]:
pred

array(['O', 'O', 'O', ..., 'O', 'O', 'O'], dtype='<U5')

In [None]:
# Several classes are never predicted by this model; zero_division=0 reports
# their precision/F1 as 0.00 without emitting the undefined-metric warning.
report = classification_report(y_pred=pred, y_true=tags, zero_division=0)
print(report)


  _warn_prf(average, modifier, msg_start, len(result))


              precision    recall  f1-score   support

       B-art       0.00      0.00      0.00       402
       B-eve       0.00      0.00      0.00       308
       B-geo       0.26      0.79      0.40     37644
       B-gpe       0.26      0.06      0.09     15870
       B-nat       0.00      0.00      0.00       201
       B-org       0.65      0.17      0.27     20143
       B-per       0.97      0.20      0.33     16990
       B-tim       0.29      0.32      0.30     20333
       I-art       0.00      0.00      0.00       297
       I-eve       0.00      0.00      0.00       253
       I-geo       0.00      0.00      0.00      7414
       I-gpe       0.00      0.00      0.00       198
       I-nat       0.00      0.00      0.00        51
       I-org       0.36      0.03      0.06     16784
       I-per       0.46      0.02      0.04     17251
       I-tim       0.50      0.06      0.11      6528
           O       0.97      0.98      0.97    887908

    accuracy              

In [None]:
data["POS"].value_counts()

NN      145807
NNP     131426
IN      120996
DT       98454
JJ       78412
NNS      75840
.        47831
VBD      39379
,        32757
VBN      32328
VBZ      24960
CD       24695
VB       24211
CC       23716
TO       23061
RB       20252
VBG      19125
VBP      16158
PRP      13318
POS      11257
PRP$      8655
MD        6973
``        3728
WDT       3698
JJS       3034
JJR       2967
WP        2542
NNPS      2521
RP        2490
WRB       2184
$         1149
RBR       1055
:          795
RRB        679
LRB        678
EX         663
RBS        296
;          214
PDT        147
WP$         99
UH          24
FW           1
Name: POS, dtype: int64

In [None]:
from sklearn.preprocessing import LabelEncoder

class FeatureTransformer(BaseEstimator, TransformerMixin):
    """Turn a token-level DataFrame into one numeric feature vector per token.

    Each vector has 12 entries: six surface features of the word (title-case,
    lower-case, upper-case, length, digits-only, letters-only), the encoded
    memory-tagger tag of the word, the encoded POS of the word, and the encoded
    memory tag / POS of the next and of the previous token.
    """

    def __init__(self):
        self.memory_tagger = MemoryTagger()
        self.tag_encoder = LabelEncoder()
        self.pos_encoder = LabelEncoder()

    def fit(self, X, y=None):
        """Fit the memory tagger and both label encoders.

        X is a DataFrame with "Word", "POS" and "Tag" columns; the tags are
        read from X itself and y is ignored. Returns self.
        """
        words = X["Word"].values.tolist()
        self.pos = X["POS"].values.tolist()
        tags = X["Tag"].values.tolist()
        self.memory_tagger.fit(words, tags)
        self.tag_encoder.fit(tags)
        self.pos_encoder.fit(self.pos)
        return self

    def transform(self, X, y=None):
        """Build the feature vectors for every row of X.

        X is a DataFrame with "Word" and "POS" columns; y is ignored.
        Returns a list of 1-D numpy arrays, one per row, in row order.
        POS values not seen during fit() are encoded as -1. The first token,
        the last token and any token following a "." use the code of tag 'O'
        and the code of POS "." for the missing/ignored neighbour.

        Raises:
            ValueError: if X is non-empty and the tag 'O' was not seen in fit().
        """
        pos = X["POS"].values.tolist()
        words = X["Word"].values.tolist()
        if not words:
            return []

        # Encode every known label once instead of calling the encoders (and
        # scanning the training POS list) for each individual token.
        tag_classes = self.tag_encoder.classes_
        tag_code = dict(zip(tag_classes.tolist(), self.tag_encoder.transform(tag_classes)))
        pos_classes = self.pos_encoder.classes_
        pos_code = dict(zip(pos_classes.tolist(), self.pos_encoder.transform(pos_classes)))

        # Fallback codes used at sequence edges and after a sentence-final ".".
        outside_code = self.tag_encoder.transform(['O'])[0]
        boundary_pos = pos_code.get(".", -1)

        # Memory tags come from the training tags (or the 'O' default checked
        # above), so every one of them has an entry in tag_code.
        word_codes = [tag_code[t] for t in self.memory_tagger.predict(words)]
        pos_codes = [pos_code.get(p, -1) for p in pos]

        last = len(words) - 1
        out = []
        for i, w in enumerate(words):
            if i < last:
                wp, posp = word_codes[i + 1], pos_codes[i + 1]
            else:
                wp, posp = outside_code, boundary_pos
            if i > 0 and words[i - 1] != ".":
                wm, posm = word_codes[i - 1], pos_codes[i - 1]
            else:
                wm, posm = outside_code, boundary_pos
            out.append(np.array([w.istitle(), w.islower(), w.isupper(), len(w), w.isdigit(), w.isalpha(),
                                 word_codes[i], pos_codes[i], wp, wm, posp, posm]))
        return out


In [None]:
ft = FeatureTransformer()


In [None]:
#ft.fit_transform(X=data, y=tags)

In [None]:
from sklearn.pipeline import Pipeline


In [None]:
# Cross-validate the full pipeline: context features followed by a random
# forest. random_state is fixed so the predictions are reproducible.
pred = cross_val_predict(Pipeline([('feature_map', FeatureTransformer()),
                                   ('clf', RandomForestClassifier(n_estimators=20, n_jobs=3,
                                                                  random_state=42))]),
                         X=data, y=tags, cv=5)

[1;30;43mStreaming output truncated to the last 5000 lines.[0m
cafes
16
in
16
the
16
capital
16
's
2
Old
10
Town
16
quarter
16
that
16
is
16
popular
16
with
16
tourists
16
.
16
Drunken
16
fans
16
hurled
16
stones
16
at
16
police
16
,
16
who
16
responded
16
with
16
tear
16
gas
16
and
16
water
16
cannon
16
.
16
Football-related
16
violence
16
has
16
been
16
a
16
growing
16
problem
16
in
2
Poland
16
,
16
prompting
16
concerns
16
that
3
Polish
16
fans
16
could
16
cause
16
trouble
16
at
16
next
16
month
16
's
16
World
16
Cup
16
in
2
Germany
16
.
16
Top
16
aides
16
of
16
the
14
Bush
16
administration
16
have
16
met
16
with
3
African
3
American
16
leaders
16
amid
16
criticism
16
of
16
the
16
federal
16
government
16
's
16
response
16
to
16
Hurricane
16
Katrina
16
.
16
The
16
meeting
16
,
16
held
16
at
16
the
5
White
13
House
16
,
16
came
16
a
16
day
16
after
2
U.S.
16
congressman
6
Elijah
14
Cummings
16
,
16
a
16
Democrat
16
from
2
Maryland
16
,
16
complained
16
about
16
a
16
slow
16
respon