## Necessary Imports

In [9]:
import pandas as pd
import numpy as np
import sklearn_crfsuite
from sklearn_crfsuite import scorers
from sklearn_crfsuite import metrics
from collections import Counter

import os
import sys
import nltk
nltk.download('averaged_perceptron_tagger')

[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /home/ilke/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


True

## Read Data

In [2]:
# Directory holding the NER spreadsheets. Set the NER_DATA_PATH environment
# variable to run this notebook on another machine; the original author's
# location remains the default so existing setups keep working unchanged.
DATA_PATH = os.environ.get("NER_DATA_PATH", "/home/ilke/Desktop/ner-interview-question/data/")
# os.path.join copes with DATA_PATH given with or without a trailing separator.
train = pd.read_excel(os.path.join(DATA_PATH, "ner_train.xlsx"))
test = pd.read_excel(os.path.join(DATA_PATH, "ner_test.xlsx"))

In [3]:
# Peek at the first five training rows to confirm the expected columns loaded.
train.head(n=5)

Unnamed: 0.1,Unnamed: 0,Sentence Number,Word,Tag,POS
0,0,Sentence: 0,what,O,WP
1,1,Sentence: 0,movies,O,NNS
2,2,Sentence: 0,star,O,VBP
3,3,Sentence: 0,bruce,B-ACTOR,NN
4,4,Sentence: 0,willis,I-ACTOR,NN


In [4]:
# Peek at the first five test rows; the columns should mirror the training frame.
test.head(n=5)

Unnamed: 0.1,Unnamed: 0,Sentence Number,Word,Tag,POS
0,0,Sentence: 0,are,O,VBP
1,1,Sentence: 0,there,O,RB
2,2,Sentence: 0,any,O,DT
3,3,Sentence: 0,good,O,JJ
4,4,Sentence: 0,romantic,B-GENRE,JJ


## Data Characteristics

In [5]:
# Number of training tokens per tag, as a two-column frame (Tag, counts).
(
    train
    .groupby('Tag')
    .size()
    .reset_index(name='counts')
)

Unnamed: 0,Tag,counts
0,B-ACTOR,8230
1,B-AWARD,309
2,B-CHARACTER,1409
3,B-DIRECTOR,3507
4,B-GENRE,7738
5,B-OPINION,810
6,B-ORIGIN,779
7,B-PLOT,8395
8,B-QUOTE,126
9,B-RATING,3876


In [7]:
# Number of test tokens per tag, as a two-column frame (Tag, counts).
(
    test
    .groupby('Tag')
    .size()
    .reset_index(name='counts')
)

Unnamed: 0,Tag,counts
0,B-ACTOR,2086
1,B-AWARD,66
2,B-CHARACTER,373
3,B-DIRECTOR,881
4,B-GENRE,1906
5,B-OPINION,195
6,B-ORIGIN,190
7,B-PLOT,2068
8,B-QUOTE,47
9,B-RATING,951


## Data Preparation

In [10]:
# Token-level frames with the label column removed.
X_train = train.drop(columns='Tag')
X_test = test.drop(columns='Tag')

# Token-level label arrays.
y_train = train['Tag'].values
y_test = test['Tag'].values

# Sorted list of the distinct tags that occur in the training labels.
classes = np.unique(y_train).tolist()

In [11]:
# Labels to score in the classification report: every tag except 'O'.
# Filtering by value (rather than popping the last element of the sorted list)
# stays correct even if 'O' is not the final entry of `classes`.
new_classes = [tag for tag in classes if tag != 'O']
new_classes

['B-ACTOR',
 'B-AWARD',
 'B-CHARACTER',
 'B-DIRECTOR',
 'B-GENRE',
 'B-OPINION',
 'B-ORIGIN',
 'B-PLOT',
 'B-QUOTE',
 'B-RATING',
 'B-RELATIONSHIP',
 'B-REVIEW',
 'B-SONG',
 'B-TITLE',
 'B-TRAILER',
 'B-YEAR',
 'I-ACTOR',
 'I-AWARD',
 'I-CHARACTER',
 'I-DIRECTOR',
 'I-GENRE',
 'I-OPINION',
 'I-ORIGIN',
 'I-PLOT',
 'I-QUOTE',
 'I-RATING',
 'I-RELATIONSHIP',
 'I-REVIEW',
 'I-SONG',
 'I-TITLE',
 'I-TRAILER',
 'I-YEAR']

### Helper Class for Feature Extraction

In [15]:
class SentenceGetter(object):
    """Group a token-per-row frame into sentences of (word, POS, tag) triples.

    Parameters
    ----------
    data : pandas.DataFrame
        Must contain the columns 'Sentence Number', 'Word', 'POS' and 'Tag'.

    Attributes
    ----------
    data : the frame passed in, unmodified.
    grouped : Series indexed by 'Sentence Number'; each value is that
        sentence's list of (word, POS, tag) tuples.
    sentences : plain list of those per-sentence lists, in the order the
        groupby yields them.
    n_sent : number of the sentence that `get_next` will look up next.
    empty : set to False on construction; not otherwise used by this class.
    """

    def __init__(self, data):
        # NOTE(review): the frames shown in this notebook start at
        # 'Sentence: 0', so with a start value of 1 get_next() never returns
        # the first sentence. Kept as-is for compatibility -- confirm intent.
        self.n_sent = 1
        self.data = data
        self.empty = False

        def agg_func(s):
            """Zip one sentence's Word / POS / Tag columns into a list of triples."""
            return list(zip(s['Word'].values.tolist(),
                            s['POS'].values.tolist(),
                            s['Tag'].values.tolist()))

        # Select only the three columns agg_func reads so apply() never has to
        # operate on the grouping column itself.
        self.grouped = (
            self.data
            .groupby('Sentence Number')[['Word', 'POS', 'Tag']]
            .apply(agg_func)
        )
        self.sentences = [s for s in self.grouped]

    def get_next(self):
        """Return the sentence labelled 'Sentence: <n_sent>' and advance n_sent.

        Returns None (without advancing) when no sentence carries that label.
        Only the missing-label KeyError is treated as "no more sentences";
        any other error propagates instead of being silently swallowed.
        """
        try:
            s = self.grouped['Sentence: {}'.format(self.n_sent)]
        except KeyError:
            return None
        self.n_sent += 1
        return s

In [16]:
# Build one list of (word, POS, tag) triples per sentence of the training frame.
getter = SentenceGetter(train)
train_sentences = getter.sentences

In [17]:
# Same grouping for the test frame. Note that `getter` is rebound here, so
# after this cell it refers to the test-set SentenceGetter.
getter = SentenceGetter(test)
test_sentences = getter.sentences

### Helper Functions for Feature Extraction

In [19]:
def word2features(sent, i):
    """Build the feature dict for the token at position ``i`` of ``sent``.

    ``sent`` is a sequence whose items expose the word at index 0 and the POS
    tag at index 1. The result holds features of the token itself plus the
    same word/POS features of its left ('-1:') and right ('+1:') neighbours;
    where a neighbour does not exist, 'BOS' or 'EOS' is set to True instead.
    """
    token = sent[i][0]
    pos = sent[i][1]

    feats = {}
    feats['bias'] = 1.0
    feats['word.lower()'] = token.lower()
    feats['word[-3:]'] = token[-3:]
    feats['word[-2:]'] = token[-2:]
    feats['word.isupper()'] = token.isupper()
    feats['word.istitle()'] = token.istitle()
    feats['word.isdigit()'] = token.isdigit()
    feats['postag'] = pos
    feats['postag[:2]'] = pos[:2]

    # (key prefix, neighbour index, neighbour exists?, marker used when it does not)
    neighbours = (
        ('-1', i - 1, i > 0, 'BOS'),
        ('+1', i + 1, i < len(sent) - 1, 'EOS'),
    )
    for prefix, j, exists, boundary_key in neighbours:
        if not exists:
            feats[boundary_key] = True
            continue
        near_token = sent[j][0]
        near_pos = sent[j][1]
        feats[prefix + ':word.lower()'] = near_token.lower()
        feats[prefix + ':word.istitle()'] = near_token.istitle()
        feats[prefix + ':word.isupper()'] = near_token.isupper()
        feats[prefix + ':postag'] = near_pos
        feats[prefix + ':postag[:2]'] = near_pos[:2]

    return feats

def sent2features(sent):
    """Return the list of per-token feature dicts for every position in ``sent``."""
    features = []
    for position in range(len(sent)):
        features.append(word2features(sent, position))
    return features
def sent2labels(sent):
    """Return the third element (the label) of each (token, postag, label) triple."""
    labels = []
    for _word, _pos, tag in sent:
        labels.append(tag)
    return labels
def sent2tokens(sent):
    """Return the first element (the token) of each (token, postag, label) triple."""
    tokens = []
    for word, _pos, _tag in sent:
        tokens.append(word)
    return tokens

## Feature Extraction

In [20]:
# Per-sentence feature dicts and label sequences for the training split.
# These rebind X_train / y_train, which previously held token-level data.
X_train = list(map(sent2features, train_sentences))
y_train = list(map(sent2labels, train_sentences))

# Same transformation for the test split.
X_test = list(map(sent2features, test_sentences))
y_test = list(map(sent2labels, test_sentences))

## Training

In [22]:
# Configure the CRF tagger and fit it on the training sentences only.
# NOTE(review): c1 / c2 are presumably the L1 / L2 regularisation coefficients
# (see the sklearn-crfsuite CRF docs) -- confirm before tuning; the values
# below are hard-coded rather than searched.
crf = sklearn_crfsuite.CRF(
    algorithm='lbfgs',
    c1=0.1,
    c2=0.1,
    max_iterations=100,
    all_possible_transitions=True
)
# fit() is the last expression of the cell, so its return value is what the
# notebook displays as the cell output.
crf.fit(X_train, y_train)

CRF(algorithm='lbfgs', all_possible_states=None, all_possible_transitions=True,
    averaging=None, c=None, c1=0.1, c2=0.1, calibration_candidates=None,
    calibration_eta=None, calibration_max_trials=None, calibration_rate=None,
    calibration_samples=None, delta=None, epsilon=None, error_sensitive=None,
    gamma=None, keep_tempfiles=None, linesearch=None, max_iterations=100,
    max_linesearch=None, min_freq=None, model_filename=None, num_memories=None,
    pa_type=None, period=None, trainer_cls=None, variance=None, verbose=False)

In [23]:
# Predict tag sequences for the test sentences and print a per-label report.
# Only the labels in `new_classes` are reported, i.e. `classes` minus the
# entry removed when `new_classes` was built.
y_pred = crf.predict(X_test)
print(metrics.flat_classification_report(y_test, y_pred, labels = new_classes))

  'precision', 'predicted', average, warn_for)


                precision    recall  f1-score   support

       B-ACTOR       0.92      0.93      0.93      2086
       B-AWARD       0.65      0.65      0.65        66
   B-CHARACTER       0.70      0.48      0.57       373
    B-DIRECTOR       0.86      0.85      0.86       881
       B-GENRE       0.89      0.90      0.89      1906
     B-OPINION       0.45      0.35      0.39       195
      B-ORIGIN       0.46      0.38      0.42       190
        B-PLOT       0.51      0.47      0.49      2068
       B-QUOTE       0.78      0.30      0.43        47
      B-RATING       0.94      0.92      0.93       951
B-RELATIONSHIP       0.75      0.55      0.63       171
      B-REVIEW       0.33      0.11      0.16        56
        B-SONG       0.68      0.48      0.57        62
       B-TITLE       0.74      0.69      0.72       562
     B-TRAILER       0.82      0.77      0.79        30
        B-YEAR       0.95      0.96      0.96      1381
       I-ACTOR       0.92      0.93      0.93  

In [24]:
# sklearn.externals.joblib was deprecated and later removed from scikit-learn;
# prefer the standalone joblib package and fall back to the vendored copy only
# on old scikit-learn installs where joblib is not importable on its own.
try:
    import joblib
except ImportError:
    from sklearn.externals import joblib



In [26]:
# Concatenate the train and test feature lists (train sentences first).
final = X_train + X_test

In [27]:
# Concatenate the label lists in the same train-then-test order as `final`.
final_y = y_train + y_test

In [28]:
# Refit the same `crf` object on train + test combined. The report printed
# earlier came from the train-only fit; after this cell `crf` has seen the
# test sentences, so re-running the evaluation cell would no longer be a
# held-out measurement.
crf.fit(final, final_y)

CRF(algorithm='lbfgs', all_possible_states=None, all_possible_transitions=True,
    averaging=None, c=None, c1=0.1, c2=0.1, calibration_candidates=None,
    calibration_eta=None, calibration_max_trials=None, calibration_rate=None,
    calibration_samples=None, delta=None, epsilon=None, error_sensitive=None,
    gamma=None, keep_tempfiles=None, linesearch=None, max_iterations=100,
    max_linesearch=None, min_freq=None, model_filename=None, num_memories=None,
    pa_type=None, period=None, trainer_cls=None, variance=None, verbose=False)

In [29]:
# Persist the refitted model; the relative path resolves against the kernel's
# current working directory.
joblib.dump(crf, "ner_crf.joblib")

['ner_crf.joblib']