In [155]:
# Import packages
import scipy
import sklearn_crfsuite
from sklearn_crfsuite import CRF
from sklearn_crfsuite.metrics import flat_classification_report, flat_f1_score
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import make_scorer

In [156]:
# Preprocess the dataset
def load_sentences(filepath):
    final = []
    sentences = []
    with open(filepath, 'r') as f:
        for line in f.readlines():
            if (line == ('-DOCSTART- -X- -X- O\n') or line == '\n'):
                if len(sentences) > 0:
                    final.append(sentences)
                    sentences = []
            else:
                word, pos, chunk, ner = line.strip().split()
                sentences.append((word, pos, chunk, ner))
    return final

In [157]:
# Load the CoNLL2003 dataset
train_data = list(load_sentences('train.txt'))
test_data = list(load_sentences('test.txt'))

In [158]:
# Define the feature extraction function
def word2features(sentence, i):
    word = sentence[i][0]
    pos = sentence[i][1]
    chunk = sentence[i][2]
    
    features = {
        'bias': 1.0,
        'word.lower()': word.lower(),
        'word[-3:]': word[-3:],
        'word[-2:]': word[-2:],
        'word.isupper()': word.isupper(),
        'word.istitle()': word.istitle(),
        'word.isdigit()': word.isdigit(),
        'pos': pos,
        'chunk': chunk,
    }
    
    if i > 0:
        prev_word = sentence[i-1][0]
        prev_pos = sentence[i-1][1]
        prev_chunk = sentence[i-1][2]
        features.update({
            'prev_word.lower()': prev_word.lower(),
            'prev_word.istitle()': prev_word.istitle(),
            'prev_pos': prev_pos,
            'prev_chunk': prev_chunk,
        })
    else:
        features['BOS'] = True
    
    if i < len(sentence)-1:
        next_word = sentence[i+1][0]
        next_pos = sentence[i+1][1]
        next_chunk = sentence[i+1][2]
        features.update({
            'next_word.lower()': next_word.lower(),
            'next_word.istitle()': next_word.istitle(),
            'next_pos': next_pos,
            'next_chunk': next_chunk,
        })
    else:
        features['EOS'] = True
        
    return features

In [159]:
# Define the feature extraction function for the entire sentence
def sent2features(sentence):
    return [word2features(sentence, i) for i in range(len(sentence))]


In [160]:
# Define the label extraction function for the entire sentence
def sent2labels(sentence):
    return [label for word, pos, chunk, label in sentence]

In [161]:
# Extract features and labels for training and testing data
X_train = [sent2features(sentence) for sentence in train_data]
y_train = [sent2labels(sentence) for sentence in train_data]
X_test = [sent2features(sentence) for sentence in test_data]
y_test = [sent2labels(sentence) for sentence in test_data]

In [162]:
# Define the CRF model
crf = sklearn_crfsuite.CRF(algorithm='lbfgs',
                           c1=0.1,
                           c2=0.1,
                           max_iterations=100)

In [174]:
# # Train the CRF model
# crf.fit(X_train, y_train)
try:
    crf.fit(X_train, y_train)
except AttributeError:
    pass
# predictions = crf.predict(X_test)

In [175]:
# Make predictions on the test set
y_pred = crf.predict(X_test)

In [176]:
# Evaluate the performance of the model using the F1 score
f1_score = flat_classification_report(y_test, y_pred, digits=3)
print('F1 score:', f1_score)



F1 score:               precision    recall  f1-score   support

       B-LOC      0.854     0.816     0.835      1667
      B-MISC      0.806     0.752     0.778       701
       B-ORG      0.770     0.722     0.745      1660
       B-PER      0.825     0.858     0.841      1616
       I-LOC      0.776     0.646     0.705       257
      I-MISC      0.670     0.645     0.657       214
       I-ORG      0.681     0.736     0.707       834
       I-PER      0.866     0.958     0.909      1156
           O      0.988     0.989     0.988     38289

    accuracy                          0.956     46394
   macro avg      0.804     0.791     0.796     46394
weighted avg      0.956     0.956     0.956     46394



In [177]:
# Print the most likely transitions
print('Most likely transitions:')
print(crf.transition_features_)

Most likely transitions:
{('B-ORG', 'B-ORG'): -1.142655, ('B-ORG', 'O'): 0.067879, ('B-ORG', 'B-MISC'): -0.866235, ('B-ORG', 'B-PER'): -1.495477, ('B-ORG', 'B-LOC'): -2.981682, ('B-ORG', 'I-ORG'): 8.176497, ('O', 'B-ORG'): 2.441686, ('O', 'O'): 3.142003, ('O', 'B-MISC'): 2.582048, ('O', 'B-PER'): 3.649068, ('O', 'B-LOC'): 1.761824, ('B-MISC', 'B-ORG'): 0.752474, ('B-MISC', 'O'): 0.55252, ('B-MISC', 'B-MISC'): -0.27661, ('B-MISC', 'B-PER'): 1.001064, ('B-MISC', 'B-LOC'): -1.016111, ('B-MISC', 'I-MISC'): 9.478986, ('B-PER', 'O'): 0.916406, ('B-PER', 'B-MISC'): -0.763408, ('B-PER', 'I-PER'): 9.472398, ('B-PER', 'B-LOC'): -0.458683, ('I-PER', 'O'): -0.055104, ('I-PER', 'I-PER'): 6.353849, ('B-LOC', 'B-ORG'): -0.335119, ('B-LOC', 'O'): 0.617937, ('B-LOC', 'B-MISC'): 0.503209, ('B-LOC', 'B-PER'): -1.465511, ('B-LOC', 'B-LOC'): -0.826935, ('B-LOC', 'I-LOC'): 7.871241, ('I-ORG', 'B-ORG'): -3.096303, ('I-ORG', 'O'): -1.133143, ('I-ORG', 'B-MISC'): -1.142506, ('I-ORG', 'B-PER'): -1.983915, ('I-O

In [178]:
labels = list(crf.classes_)

In [179]:
labels.remove('O')

In [180]:
labels

['B-ORG', 'B-MISC', 'B-PER', 'I-PER', 'B-LOC', 'I-ORG', 'I-MISC', 'I-LOC']

In [181]:
# Define the hyperparameter grid for tuning
params_space = {
    'c1': [0.5, 1, 1.5, 2, 2.5],
    'c2': [0.05, 0.1, 0.15, 0.2]
}

In [182]:
# Evaluate the performance of the CRF model using the F1 score metric
f1_score = flat_f1_score(y_test, y_pred, average="weighted")
print("F1 score:", f1_score)

F1 score: 0.9556555320983782


In [183]:
# Perform grid search for hyperparameter tuning
f1_scorer = make_scorer(flat_f1_score, average='weighted', labels=labels)
grid = GridSearchCV(crf, params_space, cv=3, verbose=1, n_jobs=-1, scoring=f1_scorer)

In [184]:
grid.fit(X_train, y_train)

AttributeError: 'CRF' object has no attribute 'keep_tempfiles'

In [None]:
# Print the best hyperparameters
print('Best parameters:', grid.best_params_)
print('best CV score:', grid.best_score_)