In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
sns.set(color_codes=True)
sns.set(font_scale=1)
%matplotlib inline
%config InlineBackend.figure_format = 'svg'
from sklearn.model_selection import cross_val_predict, cross_val_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, make_scorer
from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import GridSearchCV
import scipy.stats

In [3]:
df = pd.read_csv('data/simple_ner.csv', encoding="latin1")

In [4]:
df = df.rename(columns={"Sentence #": "sentence#"})

In [5]:
df.head()

Unnamed: 0,sentence#,Word,POS,Tag
0,Sentence: 1,Thousands,NNS,O
1,,of,IN,O
2,,demonstrators,NNS,O
3,,have,VBP,O
4,,marched,VBN,O


In [6]:
hmm = df.drop(['POS'], axis=1)
hmm = df.groupby('sentence#').agg(list)
hmm = df.reset_index(drop=True)
hmm.head()

Unnamed: 0,sentence#,Word,POS,Tag
0,Sentence: 1,Thousands,NNS,O
1,,of,IN,O
2,,demonstrators,NNS,O
3,,have,VBP,O
4,,marched,VBN,O


In [7]:
df.shape

(1048575, 4)

In [8]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1048575 entries, 0 to 1048574
Data columns (total 4 columns):
 #   Column     Non-Null Count    Dtype 
---  ------     --------------    ----- 
 0   sentence#  47959 non-null    object
 1   Word       1048575 non-null  object
 2   POS        1048575 non-null  object
 3   Tag        1048575 non-null  object
dtypes: object(4)
memory usage: 32.0+ MB


In [9]:
#so we are basically having only those rows where sentence column is not null
data = df[df['sentence#'].notnull()]

In [10]:
data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 47959 entries, 0 to 1048567
Data columns (total 4 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   sentence#  47959 non-null  object
 1   Word       47959 non-null  object
 2   POS        47959 non-null  object
 3   Tag        47959 non-null  object
dtypes: object(4)
memory usage: 1.8+ MB


## Exploring / Visualizing our data

In [10]:
data.head()

Unnamed: 0,sentence#,Word,POS,Tag
0,Sentence: 1,Thousands,NNS,O
24,Sentence: 2,Families,NNS,O
54,Sentence: 3,They,PRP,O
68,Sentence: 4,Police,NNS,O
83,Sentence: 5,The,DT,O


In [11]:
# A class to retrieve the sentences from the dataset
class getsentence(object):
    
    def __init__(self, data):
        self.n_sent = 1.0
        self.data = data
        self.empty = False
        agg_func = lambda s: [(w, p, t) for w, p, t in zip(s["Word"].values.tolist(),
                                                           s["POS"].values.tolist(),
                                                           s["Tag"].values.tolist())]
        self.grouped = self.data.groupby("sentence#").apply(agg_func)
        self.sentences = [s for s in self.grouped]

In [12]:
data.head()

Unnamed: 0,sentence#,Word,POS,Tag
0,Sentence: 1,Thousands,NNS,O
24,Sentence: 2,Families,NNS,O
54,Sentence: 3,They,PRP,O
68,Sentence: 4,Police,NNS,O
83,Sentence: 5,The,DT,O


In [13]:
getter = getsentence(data)

In [21]:
#Words tagged as B-org
data.loc[data['Tag'] == 'B-org', 'Word'].head()

1371    Halliburton
1482            VOA
1623           ABAC
1682           ABAC
1800           ABAC
Name: Word, dtype: object

In [22]:
#Words tagged as I-org
data.loc[data['Tag'] == 'I-org', 'Word'].head()

Series([], Name: Word, dtype: object)

In [23]:
#Words tagged as B-per
data.loc[data['Tag'] == 'B-per', 'Word'].head()

1296        Khan
1531         Tim
1711    Harcourt
1860     Foreign
1890         Mr.
Name: Word, dtype: object

In [24]:
#Words tagged as I-per
data.loc[data['Tag'] == 'I-per', 'Word'].head()

Series([], Name: Word, dtype: object)

In [25]:
#Words tagged as B-geo
data.loc[data['Tag'] == 'B-geo', 'Word'].head()


617        Mosul
2589      Russia
3316    Pakistan
3747        U.S.
4819       Japan
Name: Word, dtype: object

In [26]:
#Words tagged as I-geo
data.loc[data['Tag'] == 'I-geo', 'Word'].head()

Series([], Name: Word, dtype: object)

In [27]:
#Words tagged as I-geo
data.loc[data['Tag'] == 'O', 'Word'].head()

23     Families
53         They
67       Police
82          The
107         The
Name: Word, dtype: object

## **Modeling the Data**

In [11]:
#Simple feature map to feed arrays into the classifier. 
def feature_map(word):
    return np.array([word.istitle(), word.islower(), word.isupper(), len(word),
                     word.isdigit(),  word.isalpha()])

In [12]:
words = [feature_map(w) for w in data["Word"].values.tolist()]
tags = data["Tag"].values.tolist()

In [14]:
#Random Forest classifier
pred = cross_val_predict(RandomForestClassifier(n_estimators=20),X=words, y=tags, cv=5)

In [18]:
#Lets check the performance 
from sklearn.metrics import classification_report
report = classification_report(y_pred=pred, y_true=tags)
print(report)

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


              precision    recall  f1-score   support

       B-art       0.00      0.00      0.00        18
       B-eve       0.00      0.00      0.00        10
       B-geo       0.52      0.10      0.16      3335
       B-gpe       0.00      0.00      0.00      2989
       B-nat       0.00      0.00      0.00        11
       B-org       0.80      0.20      0.33      2752
       B-per       0.99      0.39      0.56      4019
       B-tim       0.00      0.00      0.00       515
           O       0.76      1.00      0.86     34310

    accuracy                           0.76     47959
   macro avg       0.34      0.19      0.21     47959
weighted avg       0.71      0.76      0.69     47959



  _warn_prf(average, modifier, msg_start, len(result))
