## Formulating Conditional Random Fields (CRF)

In [1]:
!pip install sklearn-crfsuite

Collecting sklearn-crfsuite
  Obtaining dependency information for sklearn-crfsuite from https://files.pythonhosted.org/packages/25/74/5b7befa513482e6dee1f3dd68171a6c9dfc14c0eaa00f885ffeba54fe9b0/sklearn_crfsuite-0.3.6-py2.py3-none-any.whl.metadata
  Downloading sklearn_crfsuite-0.3.6-py2.py3-none-any.whl.metadata (3.8 kB)
Collecting python-crfsuite>=0.8.3 (from sklearn-crfsuite)
  Obtaining dependency information for python-crfsuite>=0.8.3 from https://files.pythonhosted.org/packages/78/cd/8fe34205c01101b0e8dc1a18c92b3b932ab7b246daf0399072ddb551e109/python_crfsuite-0.9.10-cp311-cp311-win_amd64.whl.metadata
  Downloading python_crfsuite-0.9.10-cp311-cp311-win_amd64.whl.metadata (4.3 kB)
Downloading sklearn_crfsuite-0.3.6-py2.py3-none-any.whl (12 kB)
Downloading python_crfsuite-0.9.10-cp311-cp311-win_amd64.whl (155 kB)
   ---------------------------------------- 0.0/155.8 kB ? eta -:--:--
   ------- ------------------------------- 30.7/155.8 kB 660.6 kB/s eta 0:00:01
   ----------------

In [3]:
import nltk
import sklearn_crfsuite
from sklearn_crfsuite import metrics

In [4]:
# Load the Penn Treebank corpus
nltk.download('treebank')
corpus = nltk.corpus.treebank.tagged_sents()
print(corpus)


[nltk_data] Downloading package treebank to
[nltk_data]     D:\Users\sfl22\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping corpora\treebank.zip.


[[('Pierre', 'NNP'), ('Vinken', 'NNP'), (',', ','), ('61', 'CD'), ('years', 'NNS'), ('old', 'JJ'), (',', ','), ('will', 'MD'), ('join', 'VB'), ('the', 'DT'), ('board', 'NN'), ('as', 'IN'), ('a', 'DT'), ('nonexecutive', 'JJ'), ('director', 'NN'), ('Nov.', 'NNP'), ('29', 'CD'), ('.', '.')], [('Mr.', 'NNP'), ('Vinken', 'NNP'), ('is', 'VBZ'), ('chairman', 'NN'), ('of', 'IN'), ('Elsevier', 'NNP'), ('N.V.', 'NNP'), (',', ','), ('the', 'DT'), ('Dutch', 'NNP'), ('publishing', 'VBG'), ('group', 'NN'), ('.', '.')], ...]


In [5]:
# Define a function to extract features for each word in a sentence
def word_features(sentence, i):
	word = sentence[i][0]
	features = {
		'word': word,
		'is_first': i == 0, #if the word is a first word
		'is_last': i == len(sentence) - 1, #if the word is a last word
		'is_capitalized': word[0].upper() == word[0],
		'is_all_caps': word.upper() == word,	 #word is in uppercase
		'is_all_lower': word.lower() == word,	 #word is in lowercase
		#prefix of the word
		'prefix-1': word[0],
		'prefix-2': word[:2],
		'prefix-3': word[:3],
		#suffix of the word
		'suffix-1': word[-1],
		'suffix-2': word[-2:],
		'suffix-3': word[-3:],
		#extracting previous word
		'prev_word': '' if i == 0 else sentence[i-1][0],
		#extracting next word
		'next_word': '' if i == len(sentence)-1 else sentence[i+1][0],
		'has_hyphen': '-' in word, #if word has hypen
		'is_numeric': word.isdigit(), #if word is in numeric
		'capitals_inside': word[1:].lower() != word[1:]
	}
	return features


In [6]:
# Extract features for each sentence in the corpus
X = []
y = []
for sentence in corpus:
	X_sentence = []
	y_sentence = []
	for i in range(len(sentence)):
		X_sentence.append(word_features(sentence, i))
		y_sentence.append(sentence[i][1])
	X.append(X_sentence)
	y.append(y_sentence)


# Split the data into training and testing sets
split = int(0.8 * len(X))
X_train = X[:split]
y_train = y[:split]
X_test = X[split:]
y_test = y[split:]


In [7]:
import pycrfsuite

# Train a CRF model suing pysrfsuite
trainer = pycrfsuite.Trainer(verbose=False)
for x, y in zip(X_train, y_train):
	trainer.append(x, y)
trainer.set_params({
	'c1': 1.0,
	'c2': 1e-3,
	'max_iterations': 50,
	'feature.possible_transitions': True
})
trainer.train('pos.crfsuite')




In [8]:
# Tag a new sentence
tagger = pycrfsuite.Tagger()
tagger.open('pos.crfsuite')
sentence = 'The presentation highlighted the key achievements of the project’s development.'.split()
features = [word_features(sentence, i) for i in range(len(sentence))]
tags = tagger.tag(features)
print(list(zip(sentence, tags)))

[('The', 'MD'), ('presentation', 'VB'), ('highlighted', 'JJ'), ('the', 'NN'), ('key', 'VBP'), ('achievements', 'DT'), ('of', 'NN'), ('the', 'NN'), ('project’s', 'NN'), ('development.', 'NN')]
