# Conditional Random Fields


### Import the data and prepare it for training

In [1]:
import sys
sys.path.append('../backend')
from backend import assemblage

# Training split; consumed further down as an iterable of (sentence, labels) pairs.
train_path = "../data/train_cleaned.txt"
data = assemblage.load_data(train_path)

### Train the CRF model

In [2]:
from sklearn_crfsuite import CRF

# Build the training set in one pass: a feature sequence (X) and a label
# sequence (y) for every sentence in `data`.
X, y = [], []
for sentence, labels in data:
    tokens = sentence.split()
    X.append(assemblage.extract_features_for_crf(tokens))
    y.append(labels)

# Hyper-parameters gathered in one place so they are easy to spot and tune.
crf_params = {
    'algorithm': 'lbfgs',
    'c1': 0.1,
    'c2': 0.1,
    'max_iterations': 100,
    'all_possible_transitions': True,
}
crf = CRF(**crf_params)

crf.fit(X, y)
print("CRF model was trained.")

CRF model was trained.


### Evaluate the model

In [6]:
from sklearn.metrics import precision_score, recall_score, f1_score, accuracy_score
from joblib import dump
import os

def calculate_metrics(true_labels, predicted_labels, average='weighted'):
    """Compute token-level precision, recall, F1 and accuracy for tagged sequences.

    Both arguments are iterables of label sequences (one inner sequence per
    sentence). They are flattened to one label per token before scoring, so
    the inner sequences must line up pairwise; this is checked up front
    because a per-sentence mismatch whose totals happen to agree would
    otherwise shift labels against each other and yield wrong metrics.

    Args:
        true_labels: Gold label sequences.
        predicted_labels: Predicted label sequences, parallel to ``true_labels``.
        average: Averaging mode forwarded to the precision, recall and F1
            scorers. Defaults to ``'weighted'``.

    Returns:
        Tuple ``(precision, recall, f1, accuracy)``.

    Raises:
        ValueError: If the number of sequences differs, or if any pair of
            sequences has different lengths.
    """
    # Materialise the outer iterables so they can be measured and traversed.
    true_seqs = list(true_labels)
    pred_seqs = list(predicted_labels)
    if len(true_seqs) != len(pred_seqs):
        raise ValueError(
            f"Number of sequences differs: {len(true_seqs)} true vs "
            f"{len(pred_seqs)} predicted"
        )

    # Flatten while verifying that every sentence is aligned token-for-token.
    true_flat, pred_flat = [], []
    for index, (true_seq, pred_seq) in enumerate(zip(true_seqs, pred_seqs)):
        true_seq = list(true_seq)
        pred_seq = list(pred_seq)
        if len(true_seq) != len(pred_seq):
            raise ValueError(
                f"Sequence {index} length differs: {len(true_seq)} true vs "
                f"{len(pred_seq)} predicted labels"
            )
        true_flat.extend(true_seq)
        pred_flat.extend(pred_seq)

    # zero_division=0 scores a class with no predicted/true samples as 0.
    precision = precision_score(true_flat, pred_flat, average=average, zero_division=0)
    recall = recall_score(true_flat, pred_flat, average=average, zero_division=0)
    f1 = f1_score(true_flat, pred_flat, average=average, zero_division=0)
    accuracy = accuracy_score(true_flat, pred_flat)

    return precision, recall, f1, accuracy

def evaluate_split(model, data_path, split_name):
    """Evaluate ``model`` on one labelled data file and print its metrics.

    Args:
        model: Fitted tagger; only its ``predict`` method is used.
        data_path: Path handed to ``assemblage.load_data``.
        split_name: Label used in the printed report (e.g. ``"Test"``).

    Returns:
        Tuple ``(split_data, X, y, y_pred, metrics)`` where ``metrics`` is the
        ``(precision, recall, f1, accuracy)`` tuple from ``calculate_metrics``.
    """
    # Load the split and prepare features and labels
    split_data = assemblage.load_data(data_path)
    X_split = [assemblage.extract_features_for_crf(sentence.split()) for sentence, _ in split_data]
    y_split = [labels for _, labels in split_data]

    # Predict using the given model and score against the gold labels
    y_pred = model.predict(X_split)
    metrics = calculate_metrics(y_split, y_pred)
    precision, recall, f1, accuracy = metrics

    print(f"\n{split_name} Data Metrics:")
    print(f"{split_name} Precision: {precision:.4f}, {split_name} Recall: {recall:.4f}, {split_name} F1-Score: {f1:.4f}, {split_name} Accuracy: {accuracy:.4f}")

    return split_data, X_split, y_split, y_pred, metrics


# Evaluate on the test split (the same global names as before stay available)
test_data, X_test, y_test, y_pred_test, test_metrics = evaluate_split(crf, "../data/test_cleaned.txt", "Test")
test_precision, test_recall, test_f1, test_accuracy = test_metrics

# Evaluate on the validation split
val_data, X_val, y_val, y_pred_val, val_metrics = evaluate_split(crf, "../data/val_cleaned.txt", "Validation")
val_precision, val_recall, val_f1, val_accuracy = val_metrics

# Derive the output directory from the model path so the two cannot drift apart
model_path = '../model/crf_model.joblib'
os.makedirs(os.path.dirname(model_path), exist_ok=True)

# Save the trained CRF model to a file
dump(crf, model_path)
print("\nCRF model saved to crf_model.joblib")



Test Data Metrics:
Test Precision: 0.9540, Test Recall: 0.9550, Test F1-Score: 0.9539, Test Accuracy: 0.9550

Validation Data Metrics:
Validation Precision: 0.9546, Validation Recall: 0.9558, Validation F1-Score: 0.9546, Validation Accuracy: 0.9558

CRF model saved to crf_model.joblib
