In [1]:
import numpy as np
import pandas as pd

from sklearn.metrics import make_scorer
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import RandomizedSearchCV

import sklearn_crfsuite
from sklearn_crfsuite import scorers
from sklearn_crfsuite import metrics

import scipy
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV

In [2]:
training_data_path = './conll003-en/train.txt'
testing_data_path = './conll003-en/test.txt'

In [3]:
import requests

def get_url_content(url):
    content = None
    if not url:
        return content
    else:
        content = requests.get(url).text
        return content
    
#get_file(training_data_url)

In [4]:
def get_file(file_path):
    content = None
    if not file_path:
        return content
    else:
        with open(file_path, 'r') as f:
            content = f.read()
        return content

### Question 2.1: Read the tagged ConLL 2003 Data from this link:

In [5]:
training_data_content = get_file(training_data_path)
testing_data_content = get_file(testing_data_path)

In [6]:
def split_data(text):
    data = []
    if not text:
        return list()
    else:
        split_text = text.split('\n')[2:]
        sub_data = []
        for item in split_text:
            if item == '':
                data.append(sub_data)
                sub_data = []
            else:
                sub_data.append(tuple(item.split()))
        return data

In [7]:
training_data = split_data(training_data_content)
testing_data = split_data(testing_data_content)
print(training_data[0])
print(testing_data[0])

[('EU', 'NNP', 'B-NP', 'B-ORG'), ('rejects', 'VBZ', 'B-VP', 'O'), ('German', 'JJ', 'B-NP', 'B-MISC'), ('call', 'NN', 'I-NP', 'O'), ('to', 'TO', 'B-VP', 'O'), ('boycott', 'VB', 'I-VP', 'O'), ('British', 'JJ', 'B-NP', 'B-MISC'), ('lamb', 'NN', 'I-NP', 'O'), ('.', '.', 'O', 'O')]
[('SOCCER', 'NN', 'B-NP', 'O'), ('-', ':', 'O', 'O'), ('JAPAN', 'NNP', 'B-NP', 'B-LOC'), ('GET', 'VB', 'B-VP', 'O'), ('LUCKY', 'NNP', 'B-NP', 'O'), ('WIN', 'NNP', 'I-NP', 'O'), (',', ',', 'O', 'O'), ('CHINA', 'NNP', 'B-NP', 'B-PER'), ('IN', 'IN', 'B-PP', 'O'), ('SURPRISE', 'DT', 'B-NP', 'O'), ('DEFEAT', 'NN', 'I-NP', 'O'), ('.', '.', 'O', 'O')]


In [8]:
training_data[:3]

[[('EU', 'NNP', 'B-NP', 'B-ORG'),
  ('rejects', 'VBZ', 'B-VP', 'O'),
  ('German', 'JJ', 'B-NP', 'B-MISC'),
  ('call', 'NN', 'I-NP', 'O'),
  ('to', 'TO', 'B-VP', 'O'),
  ('boycott', 'VB', 'I-VP', 'O'),
  ('British', 'JJ', 'B-NP', 'B-MISC'),
  ('lamb', 'NN', 'I-NP', 'O'),
  ('.', '.', 'O', 'O')],
 [('Peter', 'NNP', 'B-NP', 'B-PER'), ('Blackburn', 'NNP', 'I-NP', 'I-PER')],
 [('BRUSSELS', 'NNP', 'B-NP', 'B-LOC'), ('1996-08-22', 'CD', 'I-NP', 'O')]]

In [9]:
def create_data_frame(data):
    data_frames = []
    for sentences in data:
        df = pd.DataFrame(data=sentences, columns=['word', 'pos', 'phrase', 'ner'])
        data_frames.append(df)
    return data_frames

### Question 2.2: Define a CRF model with following parameters:

In [10]:
training_dfs = create_data_frame(training_data)
testing_dfs = create_data_frame(testing_data)

In [11]:
training_dfs[0]

Unnamed: 0,word,pos,phrase,ner
0,EU,NNP,B-NP,B-ORG
1,rejects,VBZ,B-VP,O
2,German,JJ,B-NP,B-MISC
3,call,NN,I-NP,O
4,to,TO,B-VP,O
5,boycott,VB,I-VP,O
6,British,JJ,B-NP,B-MISC
7,lamb,NN,I-NP,O
8,.,.,O,O


In [12]:
def get_labels(sentences):
    all_labels = []
    
    for s_df in sentences:
        labels = s_df.loc[:, 'ner'].tolist()
        all_labels.append(labels)
        
    return all_labels 


def word2features(i, single_sent_df):
    
    word, postag = single_sent_df.iloc[i].loc[['word', 'pos']]
    
    features = {
        'bias': 1.0,
        'word.lower()': word.lower(),
        'word[-3:]': word[-3:],
        'word[-2:]': word[-2:],
        'word.isupper()': word.isupper(),
        'word.istitle()': word.istitle(),
        'word.isdigit()': word.isdigit(),
        'postag': postag,
        'postag[:2]': postag[:2],
    }
    if i > 0:
        word1, postag1 = single_sent_df.iloc[i-1].loc[['word', 'pos']]
        
        features.update({
            '-1:word.lower()': word1.lower(),
            '-1:word.istitle()': word1.istitle(),
            '-1:word.isupper()': word1.isupper(),
            '-1:postag': postag1,
            '-1:postag[:2]': postag1[:2],
        })
    else:
        features['BOS'] = True

    if i < (single_sent_df.shape[0] - 1):
        word1, postag1 = single_sent_df.iloc[i+1].loc[['word', 'pos']]
        
        features.update({
            '+1:word.lower()': word1.lower(),
            '+1:word.istitle()': word1.istitle(),
            '+1:word.isupper()': word1.isupper(),
            '+1:postag': postag1,
            '+1:postag[:2]': postag1[:2],
        })
    else:
        features['EOS'] = True

    return features

def sent2features(s_df):
    features = s_df.index.map(lambda word_idx: word2features(word_idx, s_df))
    return features.tolist()

def get_feature_values(all_sents):
    
    all_features = [sent2features(s) for s in all_sents]    
    return all_features

In [13]:
X_train = get_feature_values(training_dfs)
X_test = get_feature_values(testing_dfs)

In [14]:
y_train, y_test = get_labels(training_dfs), get_labels(testing_dfs)

In [16]:
%%time

crf = sklearn_crfsuite.CRF(
    algorithm='lbfgs',
    c1=0.1,
    c2=0.1,
    max_iterations=400
)

crf.fit(X_train, y_train)

Wall time: 1min 58s




CRF(algorithm='lbfgs', c1=0.1, c2=0.1, keep_tempfiles=None, max_iterations=400)

In [17]:
labels = list(crf.classes_)
labels.remove('O')
labels

['B-ORG', 'B-MISC', 'B-PER', 'I-PER', 'B-LOC', 'I-ORG', 'I-MISC', 'I-LOC']

In [18]:
y_pred = crf.predict(X_test)
metrics.flat_f1_score(y_test, y_pred, average='weighted', labels=labels)

0.8010650592398317

In [19]:
sorted_labels = sorted(
    labels,
    key=lambda name: (name[1:], name[0])
)
report = metrics.flat_classification_report(y_test, y_pred, labels=sorted_labels, digits=3)
print(report)



              precision    recall  f1-score   support

       B-LOC      0.856     0.809     0.832      1668
       I-LOC      0.761     0.619     0.682       257
      B-MISC      0.820     0.758     0.788       702
      I-MISC      0.686     0.667     0.676       216
       B-ORG      0.768     0.731     0.749      1661
       I-ORG      0.671     0.738     0.703       835
       B-PER      0.823     0.860     0.841      1617
       I-PER      0.862     0.951     0.904      1156

   micro avg      0.802     0.802     0.802      8112
   macro avg      0.781     0.766     0.772      8112
weighted avg      0.803     0.802     0.801      8112



### Question 2.3:
    Now that you have done a single model - try using Gridsearch to get the best parameters for our data. Use this 
    param grid:

In [20]:
params_space = {
 'c1': [scipy.stats.expon(scale=0.5)],
 'c2': [scipy.stats.expon(scale=0.05)],
}

In [21]:
f1_scorer = make_scorer(metrics.flat_f1_score,
                        average='weighted', labels=labels)

rs = RandomizedSearchCV(crf, params_space, 
 cv=3, 
 verbose=1, 
 scoring=f1_scorer)

In [22]:
%%time

rs.fit(X_train, y_train)

Fitting 3 folds for each of 1 candidates, totalling 3 fits


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   3 out of   3 | elapsed:  3.4min finished


Wall time: 5min 7s


RandomizedSearchCV(cv=3,
                   estimator=CRF(algorithm='lbfgs', c1=0.1, c2=0.1,
                                 keep_tempfiles=None, max_iterations=400),
                   param_distributions={'c1': [<scipy.stats._distn_infrastructure.rv_frozen object at 0x0000021413C6C040>],
                                        'c2': [<scipy.stats._distn_infrastructure.rv_frozen object at 0x0000021413C6C2E0>]},
                   scoring=make_scorer(flat_f1_score, average=weighted, labels=['B-ORG', 'B-MISC', 'B-PER', 'I-PER', 'B-LOC', 'I-ORG', 'I-MISC', 'I-LOC']),
                   verbose=1)

In [23]:
print('best params:', rs.best_params_)
print('best CV score:', rs.best_score_)

best params: {'c2': <scipy.stats._distn_infrastructure.rv_frozen object at 0x0000021413C6C2E0>, 'c1': <scipy.stats._distn_infrastructure.rv_frozen object at 0x0000021413C6C040>}
best CV score: 0.8394004263171778


### Question 2.4: 
For the best model that you have picked with Grid/Random Search, try getting the most important transition 
(top 5) features by using the transition_features_ method of the model class. Also, get the topmost 
(5) state feature using state_features_ method that tells you the most important words for specific NER. 

In [24]:
x = crf.transition_features_
{k: v for k, v in sorted(x.items(), key=lambda item: item[1], reverse=True)[:10]}

{('B-PER', 'I-PER'): 11.137087,
 ('B-ORG', 'I-ORG'): 10.160162,
 ('I-ORG', 'I-ORG'): 9.075419,
 ('B-LOC', 'I-LOC'): 8.954269,
 ('B-MISC', 'I-MISC'): 8.901813,
 ('I-MISC', 'I-MISC'): 8.620332,
 ('I-LOC', 'I-LOC'): 8.141421,
 ('I-PER', 'I-PER'): 7.972248,
 ('O', 'B-PER'): 5.069268,
 ('O', 'O'): 3.633627}

In [25]:
x = crf.state_features_
{k: v for k, v in sorted(x.items(), key=lambda item: item[1], reverse=True)[:10]}

{('bias', 'O'): 5.86177,
 ('word.lower():clinton', 'B-PER'): 5.716927,
 ('word[-3:]:day', 'O'): 5.713592,
 ('-1:word.lower():v', 'B-ORG'): 5.660281,
 ('+1:word.lower():1996-08-26', 'B-LOC'): 5.525331,
 ('-1:word.lower():colo', 'I-LOC'): 5.371632,
 ('-1:word.lower():wisc', 'I-LOC'): 5.359923,
 ('word.lower():minister', 'O'): 5.300604,
 ('+1:word.lower():1996-08-27', 'B-LOC'): 5.270305,
 ('word.lower():hungary', 'B-LOC'): 5.166564}