<a href="https://colab.research.google.com/github/nandinib1999/edureka-assignments/blob/main/Edureka_Sequence_Learning_Test.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

### **DESCRIPTION**

Predict named entity from given data and showcase how entities like persons, locations, organizations, and 
other miscellaneous entity names of that do not belong to the previous three groups can be predicted from 
the tagged dataset. 

### **Problem Statement**

Using CoNLL 2003 data, predict tagged NER using CRF Algorithm. Also, tune the algorithm and explore the 
learnings that have been done by the CRF Model.

In [1]:
# Reading the input files for training and evaluation of CRF model
with open("/content/drive/MyDrive/sl_train.txt") as fp:
  train_data = fp.readlines()

with open("/content/drive/MyDrive/sl_test.txt") as fp:
  test_data = fp.readlines()

In [2]:
test_data=test_data[2:]
test_data[:20]

['SOCCER NN B-NP O\n',
 '- : O O\n',
 'JAPAN NNP B-NP B-LOC\n',
 'GET VB B-VP O\n',
 'LUCKY NNP B-NP O\n',
 'WIN NNP I-NP O\n',
 ', , O O\n',
 'CHINA NNP B-NP B-PER\n',
 'IN IN B-PP O\n',
 'SURPRISE DT B-NP O\n',
 'DEFEAT NN I-NP O\n',
 '. . O O\n',
 '\n',
 'Nadim NNP B-NP B-PER\n',
 'Ladki NNP I-NP I-PER\n',
 '\n',
 'AL-AIN NNP B-NP B-LOC\n',
 ', , O O\n',
 'United NNP B-NP B-LOC\n',
 'Arab NNP I-NP I-LOC\n']

In [3]:
train_data=train_data[2:]
train_data[:20]

['EU NNP B-NP B-ORG\n',
 'rejects VBZ B-VP O\n',
 'German JJ B-NP B-MISC\n',
 'call NN I-NP O\n',
 'to TO B-VP O\n',
 'boycott VB I-VP O\n',
 'British JJ B-NP B-MISC\n',
 'lamb NN I-NP O\n',
 '. . O O\n',
 '\n',
 'Peter NNP B-NP B-PER\n',
 'Blackburn NNP I-NP I-PER\n',
 '\n',
 'BRUSSELS NNP B-NP B-LOC\n',
 '1996-08-22 CD I-NP O\n',
 '\n',
 'The DT B-NP O\n',
 'European NNP I-NP B-ORG\n',
 'Commission NNP I-NP I-ORG\n',
 'said VBD B-VP O\n']

In [4]:
# Splitting the text from file into sentences based on newline character
def split_into_sentences(data_lst):
  final_lst = []
  tmp = []
  for token in data_lst:
    if token == "\n":
      final_lst.append(tmp)
      tmp = []
    else:
      tmp.append(tuple(token.strip().split(' ')))
  if len(tmp) > 0:
    final_lst.append(tmp)
  return final_lst

In [5]:
train_sentences = split_into_sentences(train_data)
train_sentences[:5]

[[('EU', 'NNP', 'B-NP', 'B-ORG'),
  ('rejects', 'VBZ', 'B-VP', 'O'),
  ('German', 'JJ', 'B-NP', 'B-MISC'),
  ('call', 'NN', 'I-NP', 'O'),
  ('to', 'TO', 'B-VP', 'O'),
  ('boycott', 'VB', 'I-VP', 'O'),
  ('British', 'JJ', 'B-NP', 'B-MISC'),
  ('lamb', 'NN', 'I-NP', 'O'),
  ('.', '.', 'O', 'O')],
 [('Peter', 'NNP', 'B-NP', 'B-PER'), ('Blackburn', 'NNP', 'I-NP', 'I-PER')],
 [('BRUSSELS', 'NNP', 'B-NP', 'B-LOC'), ('1996-08-22', 'CD', 'I-NP', 'O')],
 [('The', 'DT', 'B-NP', 'O'),
  ('European', 'NNP', 'I-NP', 'B-ORG'),
  ('Commission', 'NNP', 'I-NP', 'I-ORG'),
  ('said', 'VBD', 'B-VP', 'O'),
  ('on', 'IN', 'B-PP', 'O'),
  ('Thursday', 'NNP', 'B-NP', 'O'),
  ('it', 'PRP', 'B-NP', 'O'),
  ('disagreed', 'VBD', 'B-VP', 'O'),
  ('with', 'IN', 'B-PP', 'O'),
  ('German', 'JJ', 'B-NP', 'B-MISC'),
  ('advice', 'NN', 'I-NP', 'O'),
  ('to', 'TO', 'B-PP', 'O'),
  ('consumers', 'NNS', 'B-NP', 'O'),
  ('to', 'TO', 'B-VP', 'O'),
  ('shun', 'VB', 'I-VP', 'O'),
  ('British', 'JJ', 'B-NP', 'B-MISC'),
  ('lamb

In [6]:
test_sentences = split_into_sentences(test_data)
test_sentences[:5]

[[('SOCCER', 'NN', 'B-NP', 'O'),
  ('-', ':', 'O', 'O'),
  ('JAPAN', 'NNP', 'B-NP', 'B-LOC'),
  ('GET', 'VB', 'B-VP', 'O'),
  ('LUCKY', 'NNP', 'B-NP', 'O'),
  ('WIN', 'NNP', 'I-NP', 'O'),
  (',', ',', 'O', 'O'),
  ('CHINA', 'NNP', 'B-NP', 'B-PER'),
  ('IN', 'IN', 'B-PP', 'O'),
  ('SURPRISE', 'DT', 'B-NP', 'O'),
  ('DEFEAT', 'NN', 'I-NP', 'O'),
  ('.', '.', 'O', 'O')],
 [('Nadim', 'NNP', 'B-NP', 'B-PER'), ('Ladki', 'NNP', 'I-NP', 'I-PER')],
 [('AL-AIN', 'NNP', 'B-NP', 'B-LOC'),
  (',', ',', 'O', 'O'),
  ('United', 'NNP', 'B-NP', 'B-LOC'),
  ('Arab', 'NNP', 'I-NP', 'I-LOC'),
  ('Emirates', 'NNPS', 'I-NP', 'I-LOC'),
  ('1996-12-06', 'CD', 'I-NP', 'O')],
 [('Japan', 'NNP', 'B-NP', 'B-LOC'),
  ('began', 'VBD', 'B-VP', 'O'),
  ('the', 'DT', 'B-NP', 'O'),
  ('defence', 'NN', 'I-NP', 'O'),
  ('of', 'IN', 'B-PP', 'O'),
  ('their', 'PRP$', 'B-NP', 'O'),
  ('Asian', 'JJ', 'I-NP', 'B-MISC'),
  ('Cup', 'NNP', 'I-NP', 'I-MISC'),
  ('title', 'NN', 'I-NP', 'O'),
  ('with', 'IN', 'B-PP', 'O'),
  ('a', 

In [7]:
import pandas as pd

def sent2df(sent):
  df = pd.DataFrame(data=sent, columns=['word', 'pos', 'parse', 'ner'])
  df.index.name = 'seq_num'
  return df
    
def all_sentences(sents):
  total_df = [sent2df(s) for s in sents]
  return total_df

In [8]:
train_sents = all_sentences(train_sentences)
test_sents  = all_sentences(test_sentences)

In [9]:
train_sents[0]

Unnamed: 0_level_0,word,pos,parse,ner
word_seq_num,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
0,EU,NNP,B-NP,B-ORG
1,rejects,VBZ,B-VP,O
2,German,JJ,B-NP,B-MISC
3,call,NN,I-NP,O
4,to,TO,B-VP,O
5,boycott,VB,I-VP,O
6,British,JJ,B-NP,B-MISC
7,lamb,NN,I-NP,O
8,.,.,O,O


In [10]:
def get_labels(all_sents):
    all_labels = []
    
    for sent_df in all_sents:
        labels = sent_df['ner'].tolist()
        all_labels.append(labels)
        
    return all_labels 


def word2features(i, sent_df):    
    word, postag = sent_df.iloc[i].loc[['word', 'pos']]
    
    features = {
        'bias': 1.0,
        'word.lower()': word.lower(),
        'word[-3:]': word[-3:],
        'word[-2:]': word[-2:],
        'word.isupper()': word.isupper(),
        'word.istitle()': word.istitle(),
        'word.isdigit()': word.isdigit(),
        'postag': postag,
        'postag[:2]': postag[:2],
    }
    if i > 0:
        word1, postag1 = sent_df.iloc[i-1].loc[['word', 'pos']]
        
        features.update({
            '-1:word.lower()': word1.lower(),
            '-1:word.istitle()': word1.istitle(),
            '-1:word.isupper()': word1.isupper(),
            '-1:postag': postag1,
            '-1:postag[:2]': postag1[:2],
        })
    else:
        features['BOS'] = True

    if i < (sent_df.shape[0] - 1):
        word1, postag1 = sent_df.iloc[i+1].loc[['word', 'pos']]
        
        features.update({
            '+1:word.lower()': word1.lower(),
            '+1:word.istitle()': word1.istitle(),
            '+1:word.isupper()': word1.isupper(),
            '+1:postag': postag1,
            '+1:postag[:2]': postag1[:2],
        })
    else:
        features['EOS'] = True

    return features

def sent2features(df):
    features = df.index.map(lambda x: word2features(x, df))
    return features.tolist()

def get_feature_values(all_sents):
    all_features = [sent2features(s) for s in all_sents]    
    return all_features

In [11]:
X_train = get_feature_values(train_sents)
y_train = get_labels(train_sents)

X_test = get_feature_values(test_sents)
y_test = get_labels(test_sents)

In [12]:
!pip install sklearn-crfsuite

Collecting sklearn-crfsuite
  Downloading https://files.pythonhosted.org/packages/25/74/5b7befa513482e6dee1f3dd68171a6c9dfc14c0eaa00f885ffeba54fe9b0/sklearn_crfsuite-0.3.6-py2.py3-none-any.whl
Collecting python-crfsuite>=0.8.3
[?25l  Downloading https://files.pythonhosted.org/packages/79/47/58f16c46506139f17de4630dbcfb877ce41a6355a1bbf3c443edb9708429/python_crfsuite-0.9.7-cp37-cp37m-manylinux1_x86_64.whl (743kB)
[K     |████████████████████████████████| 747kB 6.9MB/s 
Installing collected packages: python-crfsuite, sklearn-crfsuite
Successfully installed python-crfsuite-0.9.7 sklearn-crfsuite-0.3.6


### **Simple CRF Model**

In [13]:
import sklearn_crfsuite

crf = sklearn_crfsuite.CRF(
    algorithm='lbfgs',
    c1=0.1,
    c2=0.1,
    max_iterations=100,
    all_possible_transitions=True
)

In [14]:
%%time
crf.fit(X_train, y_train)

CPU times: user 34.6 s, sys: 217 ms, total: 34.8 s
Wall time: 34.7 s




CRF(algorithm='lbfgs', all_possible_states=None, all_possible_transitions=True,
    averaging=None, c=None, c1=0.1, c2=0.1, calibration_candidates=None,
    calibration_eta=None, calibration_max_trials=None, calibration_rate=None,
    calibration_samples=None, delta=None, epsilon=None, error_sensitive=None,
    gamma=None, keep_tempfiles=None, linesearch=None, max_iterations=100,
    max_linesearch=None, min_freq=None, model_filename=None, num_memories=None,
    pa_type=None, period=None, trainer_cls=None, variance=None, verbose=False)

In [15]:
labels = list(crf.classes_)
labels.remove('O')
labels

['B-ORG', 'B-MISC', 'B-PER', 'I-PER', 'B-LOC', 'I-ORG', 'I-MISC', 'I-LOC']

#### **Evaluation of model performance**

In [16]:
from sklearn_crfsuite import metrics

y_pred = crf.predict(X_test)
metrics.flat_f1_score(y_test, y_pred, average='weighted', labels=labels)

0.7994617872929973

In [17]:
print(metrics.flat_classification_report(y_test, y_pred, labels=labels, digits=3))

              precision    recall  f1-score   support

       B-ORG      0.764     0.727     0.745      1661
      B-MISC      0.816     0.756     0.785       702
       B-PER      0.826     0.857     0.842      1617
       I-PER      0.865     0.951     0.906      1156
       B-LOC      0.853     0.808     0.830      1668
       I-ORG      0.670     0.738     0.702       835
      I-MISC      0.687     0.671     0.679       216
       I-LOC      0.754     0.595     0.665       257

   micro avg      0.801     0.800     0.800      8112
   macro avg      0.779     0.763     0.769      8112
weighted avg      0.802     0.800     0.799      8112



#### **Hyperparameter Tuning**

- GridSearchCV

In [None]:
%%time

import scipy
from sklearn.metrics import make_scorer
from sklearn.model_selection import GridSearchCV

# define fixed parameters and parameters to search
crf = sklearn_crfsuite.CRF(
    algorithm='lbfgs',
    max_iterations=100,
    all_possible_transitions=True
)
params_space = {
    'c1': [0.5, 0.25, 0.125, 0.625],
    'c2': [0.05, 0.025, 0.0125, 0.0625],
}

# use the same metric for evaluation
f1_scorer = make_scorer(metrics.flat_f1_score, average='weighted', labels=labels)

# search
rs = GridSearchCV(crf, param_grid=params_space, cv=3, verbose=1, n_jobs=-1, scoring=f1_scorer)
rs.fit(X_train, y_train)

Fitting 3 folds for each of 16 candidates, totalling 48 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 2 concurrent workers.
[Parallel(n_jobs=-1)]: Done  48 out of  48 | elapsed: 26.8min finished


CPU times: user 23min 47s, sys: 3.6 s, total: 23min 51s
Wall time: 27min 19s


In [None]:
print('best params:', rs.best_params_)
print('best CV score:', rs.best_score_)

best params: {'c1': 0.125, 'c2': 0.0625}
best CV score: 0.8430630416854501


In [None]:
crf = rs.best_estimator_
y_pred = crf.predict(X_test)
print(metrics.flat_classification_report(
    y_test, y_pred, labels=labels, digits=3
))

              precision    recall  f1-score   support

      B-MISC      0.819     0.759     0.788       702
       B-PER      0.820     0.855     0.837      1617
       I-PER      0.864     0.948     0.904      1156
       B-LOC      0.853     0.813     0.832      1668
       B-ORG      0.772     0.722     0.746      1661
       I-ORG      0.680     0.734     0.706       835
      I-MISC      0.670     0.667     0.668       216
       I-LOC      0.746     0.607     0.670       257

   micro avg      0.802     0.799     0.801      8112
   macro avg      0.778     0.763     0.769      8112
weighted avg      0.802     0.799     0.799      8112



In [None]:
from collections import Counter

def print_transitions(trans_features):
    for (label_from, label_to), weight in trans_features:
        print("%-6s -> %-7s %0.6f" % (label_from, label_to, weight))

print("Top likely transitions:")
print_transitions(Counter(crf.transition_features_).most_common(5))

Top likely transitions:
I-MISC -> I-MISC  6.193969
B-MISC -> I-MISC  5.909604
B-LOC  -> I-LOC   5.390707
I-LOC  -> I-LOC   4.975298
B-PER  -> I-PER   4.664758


In [None]:
def print_state_features(state_features):
    for (attr, label), weight in state_features:
        print("%0.6f %-8s %s" % (weight, label, attr))

print("Top positive:")
print_state_features(Counter(crf.state_features_).most_common(5))

Top positive:
6.726481 B-LOC    word.lower():chester-le-street
6.300068 I-LOC    -1:word.lower():wisc
6.295602 I-LOC    -1:word.lower():colo
6.230998 B-PER    word.lower():clinton
6.048541 B-LOC    +1:word.lower():1996-08-26


- RandomizedSearchCV

In [18]:
from sklearn.model_selection import RandomizedSearchCV
from sklearn.metrics import make_scorer

crf = sklearn_crfsuite.CRF(
    algorithm='lbfgs',
    max_iterations=100,
    all_possible_transitions=True
)

params_space = {
    'c1': [0.5, 0.25, 0.125, 0.625],
    'c2': [0.05, 0.025, 0.0125, 0.0625],
}

# use the same metric for evaluation
f1_scorer = make_scorer(metrics.flat_f1_score,
                        average='weighted', labels=labels)

# search
rs = RandomizedSearchCV(crf, params_space,
                        cv=3,
                        verbose=1,
                        n_jobs=-1,
                        n_iter=50,
                        scoring=f1_scorer)
rs.fit(X_train, y_train)

Fitting 3 folds for each of 16 candidates, totalling 48 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 2 concurrent workers.
[Parallel(n_jobs=-1)]: Done  48 out of  48 | elapsed: 28.2min finished


RandomizedSearchCV(cv=3, error_score=nan,
                   estimator=CRF(algorithm='lbfgs', all_possible_states=None,
                                 all_possible_transitions=True, averaging=None,
                                 c=None, c1=None, c2=None,
                                 calibration_candidates=None,
                                 calibration_eta=None,
                                 calibration_max_trials=None,
                                 calibration_rate=None,
                                 calibration_samples=None, delta=None,
                                 epsilon=None, error_sensitive=None, gamma=None,
                                 keep_...
                                 trainer_cls=None, variance=None,
                                 verbose=False),
                   iid='deprecated', n_iter=50, n_jobs=-1,
                   param_distributions={'c1': [0.5, 0.25, 0.125, 0.625],
                                        'c2': [0.05, 0.025, 0.012

In [19]:
print('best params:', rs.best_params_)
print('best CV score:', rs.best_score_)

best params: {'c2': 0.025, 'c1': 0.125}
best CV score: 0.8427853831671083


In [20]:
crf = rs.best_estimator_
y_pred = crf.predict(X_test)
print(metrics.flat_classification_report(
    y_test, y_pred, labels=labels, digits=3
))

              precision    recall  f1-score   support

       B-ORG      0.767     0.722     0.744      1661
      B-MISC      0.804     0.754     0.778       702
       B-PER      0.822     0.852     0.836      1617
       I-PER      0.870     0.949     0.908      1156
       B-LOC      0.854     0.811     0.832      1668
       I-ORG      0.673     0.735     0.703       835
      I-MISC      0.664     0.657     0.660       216
       I-LOC      0.742     0.603     0.665       257

   micro avg      0.800     0.797     0.799      8112
   macro avg      0.774     0.760     0.766      8112
weighted avg      0.800     0.797     0.798      8112



In [21]:
from collections import Counter

def print_transitions(trans_features):
    for (label_from, label_to), weight in trans_features:
        print("%-6s -> %-7s %0.6f" % (label_from, label_to, weight))

print("Top likely transitions:")
print_transitions(Counter(crf.transition_features_).most_common(5))

Top likely transitions:
I-MISC -> I-MISC  6.167986
B-MISC -> I-MISC  5.679625
I-ORG  -> I-ORG   4.966390
B-LOC  -> I-LOC   4.908322
B-ORG  -> I-ORG   4.836225


In [23]:
def print_state_features(state_features):
    for (attr, label), weight in state_features:
        print("%0.6f %-8s %s" % (weight, label, attr))

print("Top positive:")
print_state_features(Counter(crf.state_features_).most_common(5))

Top positive:
10.704585 B-LOC    word.lower():chester-le-street
8.677321 B-ORG    word.lower():sungard
8.176832 I-LOC    -1:word.lower():colo
8.140300 I-LOC    -1:word.lower():wisc
8.034788 O        word.lower():minister
