In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
sns.set(color_codes=True)
sns.set(font_scale=1)
%matplotlib inline
%config InlineBackend.figure_format = 'svg'
from sklearn.model_selection import cross_val_predict, cross_val_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, make_scorer
from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import GridSearchCV
import scipy.stats

In [2]:
df = pd.read_csv('data/simple_ner.csv', encoding="latin1")

In [3]:
df = df.rename(columns={"Sentence #": "sentence#"})

In [4]:
df.head()

Unnamed: 0,sentence#,Word,POS,Tag
0,Sentence: 1,Thousands,NNS,O
1,,of,IN,O
2,,demonstrators,NNS,O
3,,have,VBP,O
4,,marched,VBN,O


In [5]:
hmm = df.drop(['POS'], axis=1)
hmm = df.groupby('sentence#').agg(list)
hmm = df.reset_index(drop=True)
hmm.head()

Unnamed: 0,sentence#,Word,POS,Tag
0,Sentence: 1,Thousands,NNS,O
1,,of,IN,O
2,,demonstrators,NNS,O
3,,have,VBP,O
4,,marched,VBN,O


In [6]:
df.shape

(1048575, 4)

In [7]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1048575 entries, 0 to 1048574
Data columns (total 4 columns):
 #   Column     Non-Null Count    Dtype 
---  ------     --------------    ----- 
 0   sentence#  47959 non-null    object
 1   Word       1048575 non-null  object
 2   POS        1048575 non-null  object
 3   Tag        1048575 non-null  object
dtypes: object(4)
memory usage: 32.0+ MB


In [8]:
#so we are basically having only those rows where sentence column is not null
#data = df[df['sentence#'].notnull()]
data = df

In [9]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1048575 entries, 0 to 1048574
Data columns (total 4 columns):
 #   Column     Non-Null Count    Dtype 
---  ------     --------------    ----- 
 0   sentence#  47959 non-null    object
 1   Word       1048575 non-null  object
 2   POS        1048575 non-null  object
 3   Tag        1048575 non-null  object
dtypes: object(4)
memory usage: 32.0+ MB


## Exploring / Visualizing our data

In [10]:
data.head()

Unnamed: 0,sentence#,Word,POS,Tag
0,Sentence: 1,Thousands,NNS,O
1,,of,IN,O
2,,demonstrators,NNS,O
3,,have,VBP,O
4,,marched,VBN,O


In [11]:
# A class to retrieve the sentences from the dataset
class getsentence(object):
    
    def __init__(self, data):
        self.n_sent = 1.0
        self.data = data
        self.empty = False
        agg_func = lambda s: [(w, p, t) for w, p, t in zip(s["Word"].values.tolist(),
                                                           s["POS"].values.tolist(),
                                                           s["Tag"].values.tolist())]
        self.grouped = self.data.groupby("sentence#").apply(agg_func)
        self.sentences = [s for s in self.grouped]

In [12]:
data.head()

Unnamed: 0,sentence#,Word,POS,Tag
0,Sentence: 1,Thousands,NNS,O
1,,of,IN,O
2,,demonstrators,NNS,O
3,,have,VBP,O
4,,marched,VBN,O


In [13]:
getter = getsentence(data)

In [14]:
#Words tagged as B-org
data.loc[data['Tag'] == 'B-org', 'Word'].head()

97             Labor
154    International
215             IAEA
234         European
248             U.N.
Name: Word, dtype: object

In [15]:
#Words tagged as I-org
data.loc[data['Tag'] == 'I-org', 'Word'].head()

98      Party
155    Atomic
156    Energy
157    Agency
235     Union
Name: Word, dtype: object

In [16]:
#Words tagged as B-per
data.loc[data['Tag'] == 'B-per', 'Word'].head()

42          Bush
270    President
331       Thomas
443    President
965      Prophet
Name: Word, dtype: object

In [17]:
#Words tagged as I-per
data.loc[data['Tag'] == 'I-per', 'Word'].head()

271        Mahmoud
272    Ahmadinejad
332        Horbach
444      Abdullahi
445          Yusuf
Name: Word, dtype: object

In [18]:
#Words tagged as B-geo
data.loc[data['Tag'] == 'B-geo', 'Word'].head()


6        London
12         Iraq
65         Hyde
94      Britain
106    Brighton
Name: Word, dtype: object

In [19]:
#Words tagged as I-geo
data.loc[data['Tag'] == 'I-geo', 'Word'].head()

66      Park
347    State
350    State
381    Delta
561     Arab
Name: Word, dtype: object

In [20]:
#Words tagged as I-geo
data.loc[data['Tag'] == 'O', 'Word'].head()

0        Thousands
1               of
2    demonstrators
3             have
4          marched
Name: Word, dtype: object

## **Modeling the Data**

In [21]:
#Simple feature map to feed arrays into the classifier. 
def feature_map(word):
    return np.array([word.istitle(), word.islower(), word.isupper(), len(word),
                     word.isdigit(),  word.isalpha()])

In [22]:
words = [feature_map(w) for w in data["Word"].values.tolist()]
tags = data["Tag"].values.tolist()

In [23]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(words, tags, test_size=0.33, random_state=0)

In [24]:
rf = RandomForestClassifier(n_estimators=20)

In [25]:
#Random Forest classifier
pred = cross_val_predict(rf,X=words, y=tags, cv=5)

In [26]:
all_tags = df.Tag.values
classes = np.unique(all_tags)
classes = classes.tolist()

In [27]:
#Lets check the performance 
from sklearn.metrics import classification_report
report = classification_report(y_pred=pred, y_true=tags, labels=classes)
print(report)

  _warn_prf(average, modifier, msg_start, len(result))


              precision    recall  f1-score   support

       B-art       0.00      0.00      0.00       402
       B-eve       0.00      0.00      0.00       308
       B-geo       0.26      0.80      0.40     37644
       B-gpe       0.25      0.03      0.05     15870
       B-nat       0.00      0.00      0.00       201
       B-org       0.65      0.17      0.27     20143
       B-per       0.97      0.20      0.33     16990
       B-tim       0.29      0.32      0.30     20333
       I-art       0.00      0.00      0.00       297
       I-eve       0.00      0.00      0.00       253
       I-geo       0.00      0.00      0.00      7414
       I-gpe       0.00      0.00      0.00       198
       I-nat       0.00      0.00      0.00        51
       I-org       0.36      0.03      0.06     16784
       I-per       0.47      0.02      0.04     17251
       I-tim       0.50      0.06      0.11      6528
           O       0.97      0.98      0.97    887908

    accuracy              

In [28]:
new_classes = classes.copy()
new_classes.pop()

'O'

In [29]:
#Lets check the performance 
from sklearn.metrics import classification_report
report = classification_report(y_pred=pred, y_true=tags, labels=new_classes)
print(report)

              precision    recall  f1-score   support

       B-art       0.00      0.00      0.00       402
       B-eve       0.00      0.00      0.00       308
       B-geo       0.26      0.80      0.40     37644
       B-gpe       0.25      0.03      0.05     15870
       B-nat       0.00      0.00      0.00       201
       B-org       0.65      0.17      0.27     20143
       B-per       0.97      0.20      0.33     16990
       B-tim       0.29      0.32      0.30     20333
       I-art       0.00      0.00      0.00       297
       I-eve       0.00      0.00      0.00       253
       I-geo       0.00      0.00      0.00      7414
       I-gpe       0.00      0.00      0.00       198
       I-nat       0.00      0.00      0.00        51
       I-org       0.36      0.03      0.06     16784
       I-per       0.47      0.02      0.04     17251
       I-tim       0.50      0.06      0.11      6528

   micro avg       0.30      0.28      0.29    160667
   macro avg       0.23   

In [30]:
rf.fit(X_train, y_train)

RandomForestClassifier(n_estimators=20)

In [31]:
import eli5
from eli5.sklearn import *
explain_rf_feature_importance(rf, top=10)

Weight,Feature
0.5053  ± 0.5741,x0
0.2103  ± 0.5037,x1
0.1671  ± 0.1397,x3
0.0507  ± 0.0909,x5
0.0401  ± 0.0292,x2
0.0266  ± 0.0219,x4
