In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
!pip install sklearn_crfsuite

Collecting sklearn_crfsuite
  Downloading https://files.pythonhosted.org/packages/25/74/5b7befa513482e6dee1f3dd68171a6c9dfc14c0eaa00f885ffeba54fe9b0/sklearn_crfsuite-0.3.6-py2.py3-none-any.whl
Collecting python-crfsuite>=0.8.3
[?25l  Downloading https://files.pythonhosted.org/packages/95/99/869dde6dbf3e0d07a013c8eebfb0a3d30776334e0097f8432b631a9a3a19/python_crfsuite-0.9.7-cp36-cp36m-manylinux1_x86_64.whl (743kB)
[K     |████████████████████████████████| 747kB 4.1MB/s 
Installing collected packages: python-crfsuite, sklearn-crfsuite
Successfully installed python-crfsuite-0.9.7 sklearn-crfsuite-0.3.6


In [3]:
## import libraries
import nltk

## download pos tagging annotated dataset from nltk
nltk.download('treebank')

## load datset 
from nltk.corpus import treebank
import re
import warnings 
warnings.filterwarnings('ignore')
from sklearn_crfsuite import CRF
from sklearn_crfsuite import metrics, scorers

[nltk_data] Downloading package treebank to /root/nltk_data...
[nltk_data]   Unzipping corpora/treebank.zip.


## Popular datasets for pos tagging 


1.  Penn Treebank 
2.  Universal Dependencies




In [4]:
## extract words and associated tags from penn treebank dataset
penn_treebank_pos_dataset = []
for fileid in treebank.fileids():
  tokens = []
  tags = []
  for word, tag in treebank.tagged_words(fileid):
    tokens.append(word)
    tags.append(tag)
  penn_treebank_pos_dataset.append((tokens,tags))
print(f"First sentence :- \n {penn_treebank_pos_dataset[0]}")

First sentence :- 
 (['Pierre', 'Vinken', ',', '61', 'years', 'old', ',', 'will', 'join', 'the', 'board', 'as', 'a', 'nonexecutive', 'director', 'Nov.', '29', '.', 'Mr.', 'Vinken', 'is', 'chairman', 'of', 'Elsevier', 'N.V.', ',', 'the', 'Dutch', 'publishing', 'group', '.'], ['NNP', 'NNP', ',', 'CD', 'NNS', 'JJ', ',', 'MD', 'VB', 'DT', 'NN', 'IN', 'DT', 'JJ', 'NN', 'NNP', 'CD', '.', 'NNP', 'NNP', 'VBZ', 'NN', 'IN', 'NNP', 'NNP', ',', 'DT', 'NNP', 'VBG', 'NN', '.'])


In [5]:
## split whole dataset into train and test datasets
train_dataset_size = int(0.8*len(penn_treebank_pos_dataset)) ## taken 80% of data as traning dataset
penn_treebank_training_dataset = penn_treebank_pos_dataset[:train_dataset_size]
penn_treebank_testing_dataset = penn_treebank_pos_dataset[train_dataset_size:]

print(f"Size of Penn treebank training dataset :- {len(penn_treebank_training_dataset)}")
print(f"Size of Penn treebank testing dataset :- {len(penn_treebank_testing_dataset)}")

Size of Penn treebank training dataset :- 159
Size of Penn treebank testing dataset :- 40


In [6]:
## extract features from every word in the sentence
def extract_word_features(sentence, index):
  return {
      'word':sentence[index],
      'is_first':index==0,
      'is_last':index ==len(sentence)-1,
      'is_capitalized':sentence[index][0].upper() == sentence[index][0],
      'is_all_caps': sentence[index].upper() == sentence[index],
      'is_all_lower': sentence[index].lower() == sentence[index],
      'is_alphanumeric': int(bool((re.match('^(?=.*[0-9]$)(?=.*[a-zA-Z])',sentence[index])))),
      'prefix-1':sentence[index][0],
      'prefix-2':sentence[index][:2],
      'prefix-3':sentence[index][:3],
      'prefix-3':sentence[index][:4],
      'suffix-1':sentence[index][-1],
      'suffix-2':sentence[index][-2:],
      'suffix-3':sentence[index][-3:],
      'suffix-3':sentence[index][-4:],
      'prev_word':'' if index == 0 else sentence[index-1],
      'next_word':'' if index < len(sentence) else sentence[index+1],
      'has_hyphen': '-' in sentence[index],
      'is_numeric': sentence[index].isdigit(),
      'capitals_inside': sentence[index][1:].lower() != sentence[index][1:]
      }

In [7]:
## creating dataset from raw dataset with all word features
def transform_rawdata_dataset_format(tagged_sentences):
  X,y = [],[]
  for sentence, tags in tagged_sentences:
    sent_word_features, sent_tags = [], []
    for index in range(len(sentence)):
      sent_word_features.append(extract_word_features(sentence=sentence, index=index))
      sent_tags.append(tags[index])
    X.append(sent_word_features)
    y.append(sent_tags)
  return X,y

In [8]:
## create train, test datasets of features and target variables
X_penn_train, y_penn_train = transform_rawdata_dataset_format(penn_treebank_training_dataset)
X_penn_test, y_penn_test = transform_rawdata_dataset_format(penn_treebank_testing_dataset)

In [9]:
## initialize CRF model
penn_treebank_crf = CRF(algorithm='lbfgs',
                        c1=0.01,
                        c2=0.1,
                        max_iterations= 100,
                        all_possible_transitions = True)


In [10]:
## training a model
print("Starting CRF model traing on Penn Treebank dataset ")
penn_treebank_crf.fit(X_penn_train, y_penn_train)
print("Completed CRF model traning on Penn Treebank Dataset sucessfully")

Starting CRF model traing on Penn Treebank dataset 
Completed CRF model traning on Penn Treebank Dataset sucessfully


In [11]:
## evaluate CRF model on test dataset
y_penn_predict_test_results =  penn_treebank_crf.predict(X_penn_test)
print("F1-score on test dataset ")
f1_score_testdata = metrics.flat_f1_score(y_penn_test, y_penn_predict_test_results, average='weighted', labels=penn_treebank_crf.classes_)
print(f1_score_testdata)

## evaluate CRF model on training dataset
y_penn_predict_train_results =  penn_treebank_crf.predict(X_penn_train)
print("F1-score on train dataset ")
f1_score_testdata = metrics.flat_f1_score(y_penn_train, y_penn_predict_train_results, average='weighted', labels=penn_treebank_crf.classes_)
print(f1_score_testdata)

F1-score on test dataset 
0.9668646324625245
F1-score on train dataset 
0.9936643188628935


In [12]:
## classification report
print("Class wise score:")
print(metrics.flat_classification_report(
    y_penn_test, y_penn_predict_test_results, labels=penn_treebank_crf.classes_, digits=3
))

Class wise score:
              precision    recall  f1-score   support

         NNP      0.952     0.963     0.957      1213
           ,      1.000     1.000     1.000       592
          CD      1.000     0.999     0.999       683
         NNS      0.964     0.986     0.975       740
          JJ      0.879     0.912     0.895       731
          MD      0.993     1.000     0.996       135
          VB      0.980     0.946     0.963       313
          DT      0.992     0.993     0.992      1062
          NN      0.962     0.955     0.958      1899
          IN      0.981     0.980     0.981      1285
           .      1.000     1.000     1.000       509
         VBZ      0.958     0.936     0.947       219
         VBG      0.936     0.876     0.905       185
          CC      1.000     0.997     0.998       287
         VBD      0.965     0.945     0.955       492
         VBN      0.917     0.907     0.912       279
      -NONE-      0.998     1.000     0.999       871
         

Number Tag Description
1.	CC	Coordinating conjunction
2.	CD	Cardinal number
3.	DT	Determiner
4.	EX	Existential there
5.	FW	Foreign word
6.	IN	Preposition or subordinating conjunction
7.	JJ	Adjective
8.	JJR	Adjective, comparative
9.	JJS	Adjective, superlative
10.	LS	List item marker
11.	MD	Modal
12.	NN	Noun, singular or mass
13.	NNS	Noun, plural
14.	NNP	Proper noun, singular
15.	NNPS	Proper noun, plural
16.	PDT	Predeterminer
17.	POS	Possessive ending
18.	PRP	Personal pronoun
19.	PRP	Possessive pronounRB	Adverb
21.	RBR	Adverb, comparative
22.	RBS	Adverb, superlative
23.	RP	Particle
24.	SYM	Symbol
25.	TO	to
26.	UH	Interjection
27.	VB	Verb, base form
28.	VBD	Verb, past tense
29.	VBG	Verb, gerund or present participle
30.	VBN	Verb, past participle
31.	VBP	Verb, non-3rd person singular present
32.	VBZ	Verb, 3rd person singular present
33.	WDT	Wh-determiner
34.	WP	Wh-pronoun
35.	WP$	Possessive wh-pronoun
36.	WRB	Wh-adverb

In [13]:
## apply trained model on new text data
example_sentence = "The tagger produced good results"
word_features = [extract_word_features(example_sentence.split(), index) for index in range(len(example_sentence.split()))]
results = penn_treebank_crf.predict_single(word_features)
penn_tups = [(example_sentence.split()[index], results[index]) for index in range(len(example_sentence.split()))]
print(penn_tups)

[('The', 'DT'), ('tagger', 'NN'), ('produced', 'VBN'), ('good', 'JJ'), ('results', 'NNS')]
