In [4]:
import pandas as pds
import eli5 as el
from sklearn_crfsuite import CRF as crf
from sklearn.model_selection import cross_val_predict as cvp
from sklearn_crfsuite.metrics import flat_classification_report as fcr



In [28]:
# Read the POS-tagged corpus and keep only its first 10,000 rows.
text_df = pds.read_csv("crf_pos_dataset.csv", encoding="latin1").head(10000)

In [29]:
# Preview the first rows to check the column layout.
text_df.head()

Unnamed: 0,Num,Word,Tag_POS
0,1.0,A,DT
1,,37-year-old,JJ
2,,woman,NN
3,,has,VBZ
4,,become,VBN


Perform appropriate data cleaning & preprocessing steps

In [30]:
# Forward-fill missing cells. In the preview above only the first row of a
# sentence has a value in "Num", so the following rows inherit it.
# DataFrame.ffill() is the supported spelling of the deprecated
# fillna(method="ffill") and fills exactly the same cells.
text_df = text_df.ffill()

In [31]:
# Distinct word forms in the corpus. dict.fromkeys de-duplicates like set()
# but keeps first-appearance order, so the list is identical on every run.
list_of_words = list(dict.fromkeys(text_df["Word"].values))

In [32]:
# Vocabulary size: number of distinct word forms in list_of_words.
num_words = len(list_of_words)

 Then, tokenize the dataset.

In [33]:
class get_tokenised_text(object):
    """Group a token-per-row corpus into sentences of ``(word, POS tag)`` pairs.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with the columns ``"Num"`` (sentence identifier), ``"Word"`` and
        ``"Tag_POS"``. Rows are grouped on ``"Num"``, so every row that should
        belong to a sentence needs a non-missing identifier.

    Attributes
    ----------
    index : int
        Sentence cursor, initialised to 1. This class never advances it.
    df : pandas.DataFrame
        The frame passed to the constructor (kept by reference).
    nulls : bool
        Flag initialised to ``False``. This class never sets it.
    grouped_sentence : pandas.Series or pandas.DataFrame
        Result of the group-wise apply: one list of ``(word, tag)`` tuples per
        sentence identifier. For an empty input frame this is an empty object.
    full_text : list
        The sentences as a plain list, in the order they occur in ``df``.
    """

    def __init__(self, df):
        self.index = 1
        self.df = df
        self.nulls = False

        def pair_words_with_tags(sentence_rows):
            # Zip the two columns into tuples, e.g. [('A', 'DT'), ('woman', 'NN')].
            return list(zip(sentence_rows["Word"].values.tolist(),
                            sentence_rows["Tag_POS"].values.tolist()))

        # sort=False keeps sentences in corpus order; the default key sort would
        # put "Sentence: 10" before "Sentence: 2" when identifiers are strings.
        # Selecting the two value columns first keeps the grouping column out of
        # the frames handed to apply.
        self.grouped_sentence = (
            df.groupby("Num", sort=False)[["Word", "Tag_POS"]].apply(pair_words_with_tags)
        )
        # Guard the empty case so iteration never yields column labels instead
        # of sentences.
        if len(self.grouped_sentence):
            self.full_text = list(self.grouped_sentence)
        else:
            self.full_text = []

In [34]:
def get_text(dummy):
    """Return the words and POS tags of the sentence at the cursor, then advance it.

    Parameters
    ----------
    dummy : object
        Must expose ``df`` (frame with "Num", "Word" and "Tag_POS" columns),
        ``index`` (sentence cursor) and ``nulls`` (flag), as instances of
        ``get_tokenised_text`` do.

    Returns
    -------
    tuple
        ``(words, tags)`` as two lists when the sentence exists. ``(None, None)``
        when no row matches the cursor or the lookup fails; in that case
        ``dummy.nulls`` is set to ``True`` and the cursor is left unchanged.
    """
    try:
        sentence_ids = dummy.df["Num"]
        text = dummy.df[sentence_ids == "Sentence: {}".format(dummy.index)]
        if text.empty:
            # Identifiers may also be plain numbers (e.g. 1.0) rather than
            # "Sentence: N" labels, so retry with the bare cursor value.
            text = dummy.df[sentence_ids == dummy.index]
        words = text["Word"].values.tolist()
        tags = text["Tag_POS"].values.tolist()
    except Exception:
        # Deliberately best-effort: any lookup failure is reported through the
        # flag instead of propagating. KeyboardInterrupt/SystemExit still pass.
        dummy.nulls = True
        return None, None

    if not words:
        # Filtering never raises on a missing sentence, so exhaustion has to be
        # detected explicitly.
        dummy.nulls = True
        return None, None

    dummy.index += 1
    return words, tags

In [35]:
# Group the token rows of text_df into sentences.
text_df_interim = get_tokenised_text(text_df)
# get_tokenised_text defines no __repr__/__str__, so this prints only the
# default object representation.
print(text_df_interim)

<__main__.get_tokenised_text object at 0x7f345b699810>


In [36]:
# One list of (word, POS tag) tuples per sentence.
tokenised_text = text_df_interim.full_text
# Prints every sentence; slice (e.g. tokenised_text[:3]) for a shorter preview.
print(tokenised_text)

[[('A', 'DT'), ('37-year-old', 'JJ'), ('woman', 'NN'), ('has', 'VBZ'), ('become', 'VBN'), ('the', 'DT'), ('13th', 'JJ'), ('person', 'NN'), ('in', 'IN'), ('Egypt', 'NNP'), ('to', 'TO'), ('die', 'VB'), ('of', 'IN'), ('the', 'DT'), ('H5N1', 'NNP'), ('strain', 'NN'), ('of', 'IN'), ('bird', 'NN'), ('flu', 'NN'), ('.', '.')], [('Nadia', 'NNP'), ('Mohammed', 'NNP'), ('Abdel', 'NNP'), ('Hafez', 'NNP'), ('died', 'VBD'), ('in', 'IN'), ('a', 'DT'), ('hospital', 'NN'), ('in', 'IN'), ('Cairo', 'NNP'), ('early', 'JJ'), ('Friday', 'NNP'), ('.', '.')], [('Health', 'NNP'), ('officials', 'NNS'), ('initially', 'RB'), ('reported', 'VBD'), ('that', 'IN'), ('her', 'PRP$'), ('condition', 'NN'), ('was', 'VBD'), ('stable', 'JJ'), ('and', 'CC'), ('that', 'IN'), ('she', 'PRP'), ('was', 'VBD'), ('being', 'VBG'), ('treated', 'VBN'), ('with', 'IN'), ('the', 'DT'), ('drug', 'NN'), ('Tamiflu', 'NNP'), ('.', '.')], [('The', 'DT'), ('woman', 'NN'), ('raised', 'VBD'), ('poultry', 'NN'), ('in', 'IN'), ('her', 'PRP$'), ('

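Extract features from the text. Add features such as whether the word is in lower case, whether it is
title-cased, whether it is a digit, what its POS tag is, and whether it is at the beginning or at the
end of the sentence. These features form the X variable, and Y is the target variable, i.e., the POS tag
of the particular word. Note that because Y is the POS tag, using the current word's own POS tag as a
feature exposes the target to the model, so the scores reported below are likely to be optimistic.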

In [38]:
def text2features(text, index, include_pos_tags=True):
    """Build the feature dict for the token at ``index`` of one sentence.

    Parameters
    ----------
    text : sequence
        The sentence. Each token is indexable with the word at position 0 and,
        when ``include_pos_tags`` is true, the POS tag at position 1.
    index : int
        Position of the token to describe.
    include_pos_tags : bool, default True
        When true (the original behaviour) the POS tags of the token and of its
        neighbours are emitted as features. WARNING: if the POS tag is also the
        label being predicted, the ``'pos_tag'`` feature hands the answer to
        the model. Pass ``False`` to get leakage-free features; in that mode
        the tag is never read, so untagged tokens such as ``("word",)`` work.

    Returns
    -------
    dict
        Always ``'bias'`` and the word-shape features of the token; the same
        word features prefixed ``'-1:'`` / ``'+1:'`` for the previous / next
        token when they exist; ``'BOS'`` / ``'EOS'`` set to ``True`` when they
        do not; plus the ``pos_tag`` features when requested.
    """
    word = text[index][0]

    features = {
        'bias': 1.0,
        'word.lower()': word.lower(),
        'word[-3:]': word[-3:],
        'word[-2:]': word[-2:],
        'word.isupper()': word.isupper(),
        'word.istitle()': word.istitle(),
        'word.isdigit()': word.isdigit(),
    }
    if include_pos_tags:
        pos_tag = text[index][1]
        features['pos_tag'] = pos_tag
        features['pos_tag[:2]'] = pos_tag[:2]

    def add_neighbour(prefix, neighbour):
        # Same feature set for either side; only the key prefix differs.
        neighbour_word = neighbour[0]
        features[prefix + 'word.lower()'] = neighbour_word.lower()
        features[prefix + 'word.istitle()'] = neighbour_word.istitle()
        features[prefix + 'word.isupper()'] = neighbour_word.isupper()
        if include_pos_tags:
            neighbour_tag = neighbour[1]
            features[prefix + 'pos_tag'] = neighbour_tag
            features[prefix + 'pos_tag[:2]'] = neighbour_tag[:2]

    if index > 0:
        add_neighbour('-1:', text[index - 1])
    else:
        features['BOS'] = True

    if index < len(text) - 1:
        add_neighbour('+1:', text[index + 1])
    else:
        features['EOS'] = True

    return features

In [39]:
def text_to_features_all(text):
    """Return one feature dict per token of ``text``, in sentence order."""
    per_token_features = []
    for position in range(len(text)):
        per_token_features.append(text2features(text, position))
    return per_token_features

In [42]:
def text_to_labels(text):
    """Return the POS tag of every ``(token, tag)`` pair in ``text``, in order."""
    labels = []
    for _token, tag in text:
        labels.append(tag)
    return labels

In [41]:
# Features: for every sentence, the list of its per-token feature dicts.
X = list(map(text_to_features_all, tokenised_text))
X

[[{'+1:pos_tag': 'JJ',
   '+1:pos_tag[:2]': 'JJ',
   '+1:word.istitle()': False,
   '+1:word.isupper()': False,
   '+1:word.lower()': '37-year-old',
   'BOS': True,
   'bias': 1.0,
   'pos_tag': 'DT',
   'pos_tag[:2]': 'DT',
   'word.isdigit()': False,
   'word.istitle()': True,
   'word.isupper()': True,
   'word.lower()': 'a',
   'word[-2:]': 'A',
   'word[-3:]': 'A'},
  {'+1:pos_tag': 'NN',
   '+1:pos_tag[:2]': 'NN',
   '+1:word.istitle()': False,
   '+1:word.isupper()': False,
   '+1:word.lower()': 'woman',
   '-1:pos_tag': 'DT',
   '-1:pos_tag[:2]': 'DT',
   '-1:word.istitle()': True,
   '-1:word.isupper()': True,
   '-1:word.lower()': 'a',
   'bias': 1.0,
   'pos_tag': 'JJ',
   'pos_tag[:2]': 'JJ',
   'word.isdigit()': False,
   'word.istitle()': False,
   'word.isupper()': False,
   'word.lower()': '37-year-old',
   'word[-2:]': 'ld',
   'word[-3:]': 'old'},
  {'+1:pos_tag': 'VBZ',
   '+1:pos_tag[:2]': 'VB',
   '+1:word.istitle()': False,
   '+1:word.isupper()': False,
   '+1:wo

In [47]:
# Targets: for every sentence, its POS-tag sequence, in the same sentence order as X.
y = list(map(text_to_labels, tokenised_text))
y

[['DT',
  'JJ',
  'NN',
  'VBZ',
  'VBN',
  'DT',
  'JJ',
  'NN',
  'IN',
  'NNP',
  'TO',
  'VB',
  'IN',
  'DT',
  'NNP',
  'NN',
  'IN',
  'NN',
  'NN',
  '.'],
 ['NNP',
  'NNP',
  'NNP',
  'NNP',
  'VBD',
  'IN',
  'DT',
  'NN',
  'IN',
  'NNP',
  'JJ',
  'NNP',
  '.'],
 ['NNP',
  'NNS',
  'RB',
  'VBD',
  'IN',
  'PRP$',
  'NN',
  'VBD',
  'JJ',
  'CC',
  'IN',
  'PRP',
  'VBD',
  'VBG',
  'VBN',
  'IN',
  'DT',
  'NN',
  'NNP',
  '.'],
 ['DT',
  'NN',
  'VBD',
  'NN',
  'IN',
  'PRP$',
  'NN',
  'IN',
  'DT',
  'NN',
  'IN',
  'NNP',
  ',',
  'NN',
  'IN',
  'NNP',
  ',',
  'WRB',
  'DT',
  'NN',
  'NN',
  'VBD',
  'IN',
  'NN',
  'NN',
  'RBR',
  'DT',
  'NN',
  '.'],
 ['NNP',
  'NNS',
  'VBD',
  'RB',
  'NNP',
  'IN',
  'DT',
  'JJ',
  'NN',
  'VBZ',
  'VBN',
  'JJ',
  'IN',
  'NN',
  'NN',
  '.'],
 ['PRP',
  'VBZ',
  'DT',
  'NN',
  'VBD',
  'JJ',
  'NN',
  'IN',
  'NN',
  'NN',
  'IN',
  'NNP',
  '.'],
 ['NNP',
  'VBZ',
  'VBN',
  'DT',
  'JJS',
  'NN',
  'IN',
  'JJ',
  'NNS

Use sklearn_crfsuite and build a CRF model. 

In [48]:
# Configure the CRF tagger; nothing is fitted in this cell.
# Parameter meanings below follow the sklearn-crfsuite documentation:
#   algorithm='lbfgs'               - train with the L-BFGS optimiser
#   c1 / c2                         - L1 / L2 regularisation coefficients
#   max_iterations                  - upper bound on optimiser iterations
#   all_possible_transitions=False  - no transition features for tag pairs
#                                     that never occur in the training data
crf_model = crf(algorithm='lbfgs',
          c1=0.1,
          c2=0.1,
          max_iterations=100,
          all_possible_transitions=False)

c. Also, run predictions on the same dataset.

In [49]:
# 5-fold cross-validated predictions: per scikit-learn's cross_val_predict,
# each sentence is predicted by a model fitted on the folds that exclude it.
# The metrics themselves are computed in a later cell.
prediction = cvp(estimator=crf_model, X=X, y=y, cv=5)



Build a flat classification report for each POS tag that is present in the corpus, and report the
accuracy and F1 score.

In [50]:
# Text report comparing the cross-validated predictions with the gold tags,
# one row per tag (precision / recall / f1-score / support, as printed below).
classification_analysis = fcr(y_pred=prediction, y_true=y)
print(classification_analysis)

              precision    recall  f1-score   support

           $       0.71      1.00      0.83        10
           ,       1.00      1.00      1.00       335
           .       1.00      1.00      1.00       450
           :       1.00      1.00      1.00        19
           ;       0.00      0.00      0.00         1
          CC       1.00      1.00      1.00       207
          CD       1.00      1.00      1.00       270
          DT       1.00      1.00      1.00       912
          EX       1.00      1.00      1.00         3
          IN       0.99      1.00      1.00      1112
          JJ       1.00      1.00      1.00       725
         JJR       1.00      1.00      1.00        28
         JJS       1.00      1.00      1.00        31
         LRB       0.00      0.00      0.00        13
          MD       0.98      1.00      0.99        59
          NN       1.00      1.00      1.00      1383
         NNP       1.00      1.00      1.00      1331
        NNPS       1.00    

  _warn_prf(average, modifier, msg_start, len(result))
