In [6]:
from sklearn.metrics import f1_score, precision_score, recall_score, confusion_matrix, classification_report

# Parse the results from CRF++
def _is_crfpp_header(parts):
    """Return True if *parts* is a CRF++ verbose header row, not a token row.

    Recognizes a lone ``#`` followed by one or two numeric fields, i.e. the
    ``# <prob>`` and ``# <rank> <prob>`` shapes.
    """
    if len(parts) not in (2, 3) or parts[0] != '#':
        return False
    try:
        for field in parts[1:]:
            float(field)
    except ValueError:
        return False
    return True


def parse_document(filepath, min_columns=6):
    """Extract gold and predicted labels from a CRF++ test output file.

    Each token row is split on whitespace; the second-to-last column is taken
    as the actual label and the last column as the predicted label, with any
    ``/confidence`` suffix removed.

    Args:
        filepath: Path of the CRF++ result file to read.
        min_columns: Minimum number of whitespace-separated columns a row
            needs in order to count as a token row. Rows with fewer columns
            (including blank lines) are skipped. Values below 2 are treated
            as 2, because both label columns must be present.

    Returns:
        A ``(actual_labels, predicted_labels)`` tuple of two equally long
        lists of strings, in file order.
    """
    # Both label columns are indexed from the end, so never accept fewer than 2.
    required = max(min_columns, 2)
    actual_labels = []
    predicted_labels = []

    with open(filepath, 'r') as file:
        # Stream the file instead of loading every line into memory first.
        for line in file:
            parts = line.split()
            if len(parts) < required:
                continue
            # Only genuine header rows are dropped; a token that merely starts
            # with '#' (e.g. a hashtag) is a real sample and must be scored.
            if _is_crfpp_header(parts):
                continue

            actual_labels.append(parts[-2])
            # Text after "/" is the confidence score.
            predicted_labels.append(parts[-1].split('/')[0])

    return actual_labels, predicted_labels

# Precision, recall, F1 scores, confusion matrix, and classification report
def calculate_metrics(actual_labels, predicted_labels):
    """Compute summary classification metrics for a pair of label sequences.

    Args:
        actual_labels: Gold labels, one per sample.
        predicted_labels: Predicted labels, aligned with ``actual_labels``.

    Returns:
        A 7-tuple ``(precision, recall, f1_macro, f1_micro, f1_weighted, cm,
        report)`` where precision and recall are macro-averaged, the three F1
        values use the macro, micro and weighted averages respectively, ``cm``
        is the result of ``confusion_matrix`` and ``report`` is the result of
        ``classification_report``.
    """
    y_true, y_pred = actual_labels, predicted_labels

    macro_precision = precision_score(y_true, y_pred, average='macro')
    macro_recall = recall_score(y_true, y_pred, average='macro')

    # One F1 score per averaging strategy, in the order callers unpack them.
    f1_by_average = [
        f1_score(y_true, y_pred, average=mode)
        for mode in ('macro', 'micro', 'weighted')
    ]

    confusion = confusion_matrix(y_true, y_pred)
    text_report = classification_report(y_true, y_pred)

    return (macro_precision, macro_recall, *f1_by_average, confusion, text_report)

# Path of the CRF++ test output to evaluate
filepath = r'C:\Users\X-pc\Downloads\crf\test_results.txt'
actual_labels, predicted_labels = parse_document(filepath)
(
    precision,
    recall,
    f1_macro,
    f1_micro,
    f1_weighted,
    cm,
    report,
) = calculate_metrics(actual_labels, predicted_labels)

# Distinct gold labels in sorted order
unique_labels = sorted(set(actual_labels))

# Headline scores, one per line, four decimal places each
print(
    f"Precision: {precision:.4f}",
    f"Recall: {recall:.4f}",
    f"Macro F1-Score: {f1_macro:.4f}",
    f"Micro F1-Score: {f1_micro:.4f}",
    f"Weighted F1-Score: {f1_weighted:.4f}",
    sep="\n",
)

# Each heading is followed by its value on the next line
print("\nConfusion Matrix:", cm, sep="\n")

print("\nClassification Report:", report, sep="\n")


Precision: 0.5469
Recall: 0.4415
Macro F1-Score: 0.4606
Micro F1-Score: 0.6802
Weighted F1-Score: 0.6386

Confusion Matrix:
[[  189     9   734]
 [   10  2079  6981]
 [  244  2052 19061]]

Classification Report:
              precision    recall  f1-score   support

           B       0.43      0.20      0.27       932
           I       0.50      0.23      0.31      9070
           O       0.71      0.89      0.79     21357

    accuracy                           0.68     31359
   macro avg       0.55      0.44      0.46     31359
weighted avg       0.64      0.68      0.64     31359

