### NER with scikit learn

In [20]:
import os
import pandas as pd
import numpy as np
import joblib
from sklearn.feature_extraction import DictVectorizer
from sklearn.feature_extraction.text import HashingVectorizer
from sklearn.linear_model import Perceptron
from sklearn.model_selection import train_test_split
from sklearn.linear_model import SGDClassifier
from sklearn.linear_model import PassiveAggressiveClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import classification_report
from sklearn.pipeline import Pipeline # Very important for Production step
from tqdm import tqdm # Progress bar

In [21]:
os.listdir("../data/")

['trivia10k13.csv', 'trivia10k13.bio']

In [22]:
dataset_file = "../data/trivia10k13.bio"
dataset_file_csv = "../data/trivia10k13.csv"

In [23]:
# Build dataset
dataset = []
sentence_idx = 0

if not os.path.exists(dataset_file_csv):
    with open(dataset_file, encoding="utf-8") as f:
        flines = f.readlines()

        for line in tqdm(flines, desc="Building dataset..."):
            line = line.strip()
            if line == "":
                sentence_idx += 1
                continue
            targ, word = line.split("\t")
            dataset.append(["Sentence: {}".format(sentence_idx), word, targ])
            

    df = pd.DataFrame(dataset, columns=["Sentence #", "Word", "Tag"])
    df.to_csv(dataset_file_csv, index=False)


In [24]:
max_samples_rate = 25

In [25]:
df = pd.read_csv(dataset_file_csv)
df = df[: int(len(df) * max_samples_rate / 100)]
df.isnull().sum()
df = df.fillna(method='ffill')

In [26]:
df['Sentence #'].nunique(), df.Word.nunique(), df.Tag.nunique()

(1960, 5617, 25)

In [27]:
df.head()

Unnamed: 0,Sentence #,Word,Tag
0,Sentence: 0,steve,B-Actor
1,Sentence: 0,mcqueen,I-Actor
2,Sentence: 0,provided,O
3,Sentence: 0,a,O
4,Sentence: 0,thrilling,B-Plot


In [28]:
X = df.drop('Tag', axis=1)
v = DictVectorizer(sparse=False)
X = v.fit_transform(X.to_dict('records'))
X.shape

(39705, 7577)

In [29]:
y = df.Tag.values
classes = np.unique(y)
classes = classes.tolist()
classes

['B-Actor',
 'B-Award',
 'B-Character_Name',
 'B-Director',
 'B-Genre',
 'B-Opinion',
 'B-Origin',
 'B-Plot',
 'B-Quote',
 'B-Relationship',
 'B-Soundtrack',
 'B-Year',
 'I-Actor',
 'I-Award',
 'I-Character_Name',
 'I-Director',
 'I-Genre',
 'I-Opinion',
 'I-Origin',
 'I-Plot',
 'I-Quote',
 'I-Relationship',
 'I-Soundtrack',
 'I-Year',
 'O']

In [30]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.33, random_state=0)

In [31]:
new_classes = classes.copy()
new_classes.pop()
new_classes

['B-Actor',
 'B-Award',
 'B-Character_Name',
 'B-Director',
 'B-Genre',
 'B-Opinion',
 'B-Origin',
 'B-Plot',
 'B-Quote',
 'B-Relationship',
 'B-Soundtrack',
 'B-Year',
 'I-Actor',
 'I-Award',
 'I-Character_Name',
 'I-Director',
 'I-Genre',
 'I-Opinion',
 'I-Origin',
 'I-Plot',
 'I-Quote',
 'I-Relationship',
 'I-Soundtrack',
 'I-Year']

### Check best model from sklearn

In [35]:
models = [
    ('SGDClassifier', SGDClassifier()),
    ('PassiveAggressiveClassifier', PassiveAggressiveClassifier()),
    ('MultinomialNB', MultinomialNB())
]

for name, model in models:
    vectorizer = DictVectorizer(sparse=False)
    clf = model
    clf.partial_fit(X_train, y_train, classes)
    y_pred = clf.predict(X_test)
    print(f"{name} Accuracy: {clf.score(X_test, y_test)}")

SGDClassifier Accuracy: 0.7051820193848737
PassiveAggressiveClassifier Accuracy: 0.6982370449515378
MultinomialNB Accuracy: 0.6681675952072045


In [36]:
sgd = SGDClassifier()
sgd.partial_fit(X_train, y_train, classes)
print(classification_report(y_pred=sgd.predict(X_test), y_true=y_test, labels=new_classes))

                  precision    recall  f1-score   support

         B-Actor       0.64      0.58      0.61       383
         B-Award       0.65      0.37      0.47        30
B-Character_Name       0.08      0.02      0.03        56
      B-Director       0.66      0.46      0.54       138
         B-Genre       0.50      0.63      0.56       237
       B-Opinion       0.25      0.28      0.26        68
        B-Origin       0.00      0.00      0.00        54
          B-Plot       0.09      0.02      0.03       536
         B-Quote       0.00      0.00      0.00        12
  B-Relationship       0.58      0.36      0.44        42
    B-Soundtrack       0.00      0.00      0.00         9
          B-Year       0.91      0.68      0.78       214
         I-Actor       0.84      0.45      0.58       455
         I-Award       0.54      0.53      0.53        57
I-Character_Name       0.50      0.03      0.06        60
      I-Director       0.87      0.40      0.55       129
         I-Ge

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [15]:
pipe = Pipeline([
    ("embedding", v),
    ("model", sgd)
])

In [16]:
# Predict for single word
pipe.predict([{"Sentence #": "Sentence: 1", "Word": "Avengers"}, {"Sentence #": "Sentence: 1", "Word": "Endgame"}])

array(['I-Plot', 'I-Plot'], dtype='<U16')

In [17]:
# Save model and vectorizer
joblib.dump(pipe, "../models/pipe.joblib")
joblib.dump(v, "../models/vectorizer.joblib")

['../models/vectorizer.joblib']

In [18]:
# Load pipe and test for single word
pipe = joblib.load("../models/pipe.joblib")
pipe.predict([{"Sentence #": "Sentence: 1", "Word": "Avengers"}])

array(['I-Plot'], dtype='<U16')