# Conditional Random Fields


### Load the training data

In [25]:
TRAIN_PATH = 'data/train_cleaned.txt'


def read_tagged_sentences(path):
    """Read a one-token-per-line tagged corpus into (sentence, labels) tuples.

    Every non-empty line must contain a word followed by its label; the label
    is the last whitespace-delimited field, so the word part may itself
    contain whitespace. Blank lines separate sentences.

    Args:
        path: Path of the UTF-8 encoded corpus file.

    Returns:
        A list of ``(sentence, labels)`` tuples, where ``sentence`` is the
        sentence's words joined by single spaces and ``labels`` is the list
        of label strings in the same order.

    Raises:
        ValueError: If a non-empty line has no whitespace separating a word
            from its label. The message names the file and line number.
    """
    tagged_sentences = []
    words, tags = [], []

    with open(path, 'r', encoding='utf-8') as corpus:
        for line_no, raw_line in enumerate(corpus, start=1):
            line = raw_line.strip()

            # A blank line closes the sentence collected so far.
            if not line:
                if words:
                    tagged_sentences.append((" ".join(words), tags))
                    words, tags = [], []
                continue

            # Split on the last run of whitespace (space, tab, repeated
            # spaces) so the separator style does not matter and the word
            # never carries trailing whitespace.
            fields = line.rsplit(None, 1)
            if len(fields) != 2:
                raise ValueError(
                    "{}:{}: expected '<word> <label>', got {!r}".format(
                        path, line_no, line
                    )
                )
            word, tag = fields
            words.append(word)
            tags.append(tag)

    # Flush the last sentence when the file does not end with a blank line.
    if words:
        tagged_sentences.append((" ".join(words), tags))

    return tagged_sentences


data = read_tagged_sentences(TRAIN_PATH)  # list of (sentence, labels) tuples

### Train the CRF model

In [26]:
from backend import Collection
from sklearn_crfsuite import CRF

# One feature sequence (X) and one label sequence (y) per sentence in `data`.
# Each stored sentence string is split back into tokens on whitespace before
# feature extraction.
X = [Collection.extract_features(text.split()) for text, _ in data]
y = [tags for _, tags in data]

# Training configuration, gathered in one place so it is easy to tune.
crf_params = dict(
    algorithm='lbfgs',              # L-BFGS gradient-based training
    c1=0.1,                         # L1 regularization coefficient
    c2=0.1,                         # L2 regularization coefficient
    max_iterations=100,             # cap on optimizer iterations
    all_possible_transitions=True,  # also model transitions unseen in training
)
crf = CRF(**crf_params)

crf.fit(X, y)
print("CRF model was trained.")

CRF model was trained.


### Evaluate on the training data and save the model

In [27]:
from sklearn_crfsuite.metrics import flat_classification_report
from joblib import dump
import os

# Single source of truth for where the fitted model is written; the output
# directory is derived from it so the two cannot drift apart.
MODEL_PATH = 'model/crf_model.joblib'

# NOTE: X and y are the same sequences the model was fitted on, so this
# report measures fit to the training data, not performance on unseen text.
y_pred = crf.predict(X)
print(flat_classification_report(y, y_pred))

# Create the output directory if it doesn't exist
os.makedirs(os.path.dirname(MODEL_PATH), exist_ok=True)
# Save the trained CRF model to a file
dump(crf, MODEL_PATH)
# Report the full path actually written, not just the file name.
print("CRF model saved to {}".format(MODEL_PATH))


              precision    recall  f1-score   support

       B-EVE       0.99      0.98      0.98      1850
       B-LAN       1.00      0.86      0.92       139
       B-LOC       0.95      0.96      0.95      8130
       B-MON       0.99      0.90      0.94       187
       B-NUM       0.97      0.94      0.95      1426
       B-ORG       0.98      0.97      0.98     11150
       B-PER       0.99      0.95      0.97      8106
      B-TIME       0.99      0.98      0.98     11014
       I-EVE       0.98      0.98      0.98      4107
       I-LAN       1.00      1.00      1.00         4
       I-LOC       0.98      0.99      0.98      6121
       I-MON       0.99      1.00      1.00       314
       I-NUM       0.97      0.99      0.98       544
       I-ORG       0.98      0.99      0.98     16342
       I-PER       0.99      0.98      0.98      7635
      I-TIME       0.97      0.99      0.98     39600
           O       0.99      0.99      0.99    274329

    accuracy              