# TextMining Assignment 2
Lucas de Wolff (s3672980) and Ruben Ahrens (s3677532)

November 2023


In [35]:
import datasets
from transformers import Trainer
from transformers import TrainingArguments
from transformers import AutoModelForTokenClassification
from huggingface_hub import interpreter_login
from transformers import DataCollatorForTokenClassification
import evaluate
import numpy as np
from sklearn.metrics import precision_recall_fscore_support

In [36]:
names=["O", "B-corporation", "I-corporation", "B-creative-work", "I-creative-work", "B-group", 
       "I-group", "B-location", "I-location", "B-person", "I-person", "B-product", "I-product"]

In [37]:
train, valid, test = 'wnut17train.conll', 'emerging.dev.conll', 'emerging.test.annotated'
path = 'W-NUT_data'

raw_datasets = {}
for file, name in zip([train, valid, test], ['train', 'validation', 'test']):
    id = 0
    raw_datasets[name] = {'id': [], 'tokens': [], 'ner_tags': []}
    with open(f'{path}/{file}', 'r') as f:
        tokens, ner_tags = [], []
        for line in f:
            try:
                token, ner_tag = line.split()
                tokens.append(token)
                ner_tags.append(names.index(ner_tag))
            except:
                raw_datasets[name]['id'].append([id for _ in range(len(tokens))])
                raw_datasets[name]['tokens'].append(tokens)
                raw_datasets[name]['ner_tags'].append(ner_tags)
                id += 1
                tokens, ner_tags = [], []
    raw_datasets[name] = datasets.Dataset.from_dict(raw_datasets[name])

raw_datasets = datasets.DatasetDict(raw_datasets)
display(raw_datasets)

DatasetDict({
    train: Dataset({
        features: ['id', 'tokens', 'ner_tags'],
        num_rows: 3394
    })
    validation: Dataset({
        features: ['id', 'tokens', 'ner_tags'],
        num_rows: 1009
    })
    test: Dataset({
        features: ['id', 'tokens', 'ner_tags'],
        num_rows: 1287
    })
})

# Processing the data

In [38]:
from transformers import AutoTokenizer

model_checkpoint = "bert-base-cased"
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)

In [39]:
tokenizer.is_fast

True

In [40]:
inputs = tokenizer(raw_datasets["train"][0]["tokens"], is_split_into_words=True)

In [41]:
def align_labels_with_tokens(labels, word_ids):
    new_labels = []
    current_word = None
    for word_id in word_ids:
        if word_id != current_word:
            # Start of a new word!
            current_word = word_id
            label = -100 if word_id is None else labels[word_id]
            new_labels.append(label)
        elif word_id is None:
            # Special token
            new_labels.append(-100)
        else:
            # Same word as previous token
            label = labels[word_id]
            # If the label is B-XXX we change it to I-XXX
            if label % 2 == 1:
                label += 1
            new_labels.append(label)

    return new_labels

In [42]:
labels = raw_datasets["train"][0]["ner_tags"]
word_ids = inputs.word_ids()
print(labels)
print(align_labels_with_tokens(labels, word_ids))

[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 7, 8, 8, 0, 7, 0, 0, 0, 0, 0, 0, 0, 0]
[-100, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 7, 8, 8, 0, 7, 8, 0, 0, 0, 0, 0, 0, 0, 0, -100]


In [43]:
def tokenize_and_align_labels(examples):
    tokenized_inputs = tokenizer(
        examples["tokens"], truncation=True, is_split_into_words=True
    )
    all_labels = examples["ner_tags"]
    new_labels = []
    for i, labels in enumerate(all_labels):
        word_ids = tokenized_inputs.word_ids(i)
        new_labels.append(align_labels_with_tokens(labels, word_ids))

    tokenized_inputs["labels"] = new_labels
    return tokenized_inputs

In [44]:
tokenized_datasets = raw_datasets.map(
    tokenize_and_align_labels,
    batched=True,
    remove_columns=raw_datasets["train"].column_names,
)

# compare total number of tokens before and after tokenization. which is not the number of rows, but the number of tokens
train_set = set([word  for sentence  in  raw_datasets['train']['tokens'] for word in sentence ])
eval_set = set([word  for  sentence  in raw_datasets['validation']['tokens'] for word in sentence ])
test_set = set([word  for  sentence  in raw_datasets['test']['tokens'] for word in sentence ])

print(len(train_set), len(eval_set), len(test_set))

# Print unique tokens of each dataset
train_set_unique = train_set - eval_set - test_set
eval_set_unique = eval_set - train_set - test_set
test_set_unique = test_set - train_set - eval_set

print(len(train_set_unique), len(eval_set_unique), len(test_set_unique))

Map:   0%|          | 0/3394 [00:00<?, ? examples/s]

Map:   0%|          | 0/1009 [00:00<?, ? examples/s]

Map:   0%|          | 0/1287 [00:00<?, ? examples/s]

14878 4102 6348
11810 1847 3776


# Fine-tuning the model with keras

## Data collation

In [45]:
data_collator = DataCollatorForTokenClassification(tokenizer=tokenizer)

## Metrics

In [46]:
metric = evaluate.load("seqeval")

In [47]:

def compute_metrics(eval_preds):
    logits, labels = eval_preds
    predictions = np.argmax(logits, axis=-1)

    # Remove ignored index (special tokens) and convert to labels
    true_labels = [[names[l] for l in label if l != -100] for label in labels]
    true_predictions = [
        [names[p] for (p, l) in zip(prediction, label) if l != -100]
        for prediction, label in zip(predictions, labels)
    ]
    all_metrics = metric.compute(predictions=true_predictions, references=true_labels)
    return {
        "precision": all_metrics["overall_precision"],
        "recall": all_metrics["overall_recall"],
        "f1": all_metrics["overall_f1"],
        "accuracy": all_metrics["overall_accuracy"],
    }

In [48]:
def compute_all_metrics(eval_preds):
    logits, labels = eval_preds
    predictions = np.argmax(logits, axis=-1)

    # Remove ignored index (special tokens) and convert to labels
    true_labels = [[names[l] for l in label if l != -100] for label in labels]
    true_predictions = [
        [names[p] for (p, l) in zip(prediction, label) if l != -100]
        for prediction, label in zip(predictions, labels)
    ]
    all_metrics = metric.compute(predictions=true_predictions, references=true_labels)
    return all_metrics

In [49]:
def write_tex_table(all_metrics, filename):
    with open(filename, 'w') as f:
        f.write("\\begin{table}[h]\n")
        f.write("\\centering\n")
        f.write("\\begin{tabular}{lrrrr}\n")
        f.write("\\toprule\n")
        f.write(" & \\textbf{Precision} & \\textbf{Recall} & \\textbf{F1} & \\textbf{Number} \\\\\n")
        f.write("\\midrule\n")
        for key, value in all_metrics.items():
            if "overall" not in key:
                f.write(f"{key.title()} & {value['precision']:.2f} & {value['recall']:.2f} & {value['f1']:.2f} & {value['number']} \\\\\n")
        try:
            f.write("\\midrule\n")
            f.write(" & \\textbf{Overall precision} & \\textbf{Overall recall} & \\textbf{Overall F1} & \\textbf{Overall accuracy} \\\\\n")
            f.write("\\midrule\n")
            f.write(f" & {all_metrics['overall_precision']:.2f} & {all_metrics['overall_recall']:.2f} & {all_metrics['overall_f1']:.2f} & {all_metrics['overall_accuracy']:.2f} \\\\\n")
        except:
            pass
        f.write("\\bottomrule\n")
        f.write("\\end{tabular}\n")
        f.write("\\end{table}\n")
    
def print_fancy_table(all_metrics):
    # display fancy table in ipython widget
    from IPython.display import display, HTML
    html = "<table>"
    html += "<tr><th></th><th>Precision</th><th>Recall</th><th>F1</th><th>Number</th></tr>"
    for key, value in all_metrics.items():
        if "overall" not in key:
            html += f"<tr><td>{key.title()}</td><td>{value['precision']:.2f}</td><td>{value['recall']:.2f}</td><td>{value['f1']:.2f}</td><td>{value['number']}</td></tr>"
    html += "<tr><th></th><th>Overall precision</th><th>Overall recall</th><th>Overall F1</th><th>Overall accuracy</th></tr>"
    html += f"<tr><td></td><td>{all_metrics['overall_precision']:.2f}</td><td>{all_metrics['overall_recall']:.2f}</td><td>{all_metrics['overall_f1']:.2f}</td><td>{all_metrics['overall_accuracy']:.2f}</td></tr>"
    html += "</table>"
    display(HTML(html))
    

## Defining the model

In [50]:
id2label = {i: label for i, label in enumerate(names)}
label2id = {v: k for k, v in id2label.items()}

In [51]:
model = AutoModelForTokenClassification.from_pretrained(
    model_checkpoint,
    id2label=id2label,
    label2id=label2id,
)

Some weights of BertForTokenClassification were not initialized from the model checkpoint at bert-base-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


##  Fine-tuning the model

In [52]:
interpreter_login()


    _|    _|  _|    _|    _|_|_|    _|_|_|  _|_|_|  _|      _|    _|_|_|      _|_|_|_|    _|_|      _|_|_|  _|_|_|_|
    _|    _|  _|    _|  _|        _|          _|    _|_|    _|  _|            _|        _|    _|  _|        _|
    _|_|_|_|  _|    _|  _|  _|_|  _|  _|_|    _|    _|  _|  _|  _|  _|_|      _|_|_|    _|_|_|_|  _|        _|_|_|
    _|    _|  _|    _|  _|    _|  _|    _|    _|    _|    _|_|  _|    _|      _|        _|    _|  _|        _|
    _|    _|    _|_|      _|_|_|    _|_|_|  _|_|_|  _|      _|    _|_|_|      _|        _|    _|    _|_|_|  _|_|_|_|
    
    A token is already saved on your machine. Run `huggingface-cli whoami` to get more information or `huggingface-cli logout` if you want to log out.
    Setting a new token will erase the existing one.
    To login, `huggingface_hub` requires a token generated from https://huggingface.co/settings/tokens .
Token can be pasted using 'Right-Click'.
Token is valid (permission: write).
Your token has been saved in your con

In [53]:
training_args = TrainingArguments(
    "bert-finetuned-ner",
    evaluation_strategy="epoch",
    logging_strategy="no",
    save_strategy="no",
    learning_rate=2e-5,
    num_train_epochs=3,
    weight_decay=0.01,
    push_to_hub=True,
)

In [54]:
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["validation"],
    data_collator=data_collator,
    compute_metrics=compute_metrics,
    tokenizer=tokenizer,
)
trainer.train()
trainer.push_to_hub(commit_message="Training complete")

  0%|          | 0/1275 [00:00<?, ?it/s]

You're using a BertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


  0%|          | 0/127 [00:00<?, ?it/s]

{'eval_loss': 0.34034600853919983, 'eval_precision': 0.5636672325976231, 'eval_recall': 0.39712918660287083, 'eval_f1': 0.4659649122807018, 'eval_accuracy': 0.9172063152261172, 'eval_runtime': 3.5193, 'eval_samples_per_second': 286.704, 'eval_steps_per_second': 36.087, 'epoch': 1.0}


  0%|          | 0/127 [00:00<?, ?it/s]

{'eval_loss': 0.3707972764968872, 'eval_precision': 0.6083916083916084, 'eval_recall': 0.41626794258373206, 'eval_f1': 0.4943181818181819, 'eval_accuracy': 0.9217018999197217, 'eval_runtime': 3.487, 'eval_samples_per_second': 289.361, 'eval_steps_per_second': 36.421, 'epoch': 2.0}


  0%|          | 0/127 [00:00<?, ?it/s]

{'eval_loss': 0.3559342920780182, 'eval_precision': 0.5850439882697948, 'eval_recall': 0.4772727272727273, 'eval_f1': 0.525691699604743, 'eval_accuracy': 0.9261439657479261, 'eval_runtime': 3.4459, 'eval_samples_per_second': 292.811, 'eval_steps_per_second': 36.855, 'epoch': 3.0}
{'train_runtime': 172.2891, 'train_samples_per_second': 59.098, 'train_steps_per_second': 7.4, 'train_loss': 0.1302520751953125, 'epoch': 3.0}


pytorch_model.bin:   0%|          | 0.00/431M [00:00<?, ?B/s]

Upload 2 LFS files:   0%|          | 0/2 [00:00<?, ?it/s]

training_args.bin:   0%|          | 0.00/4.47k [00:00<?, ?B/s]

'https://huggingface.co/rubinho/bert-finetuned-ner/tree/main/'

## Evaluating the model (baseline)

In [55]:
predictions = trainer.predict(tokenized_datasets["test"])
eval_preds = predictions.predictions, predictions.label_ids
all_metrics = compute_all_metrics(eval_preds)
write_tex_table(all_metrics, 'bert-finetuned-ner_baseline.txt')
print_fancy_table(all_metrics)

  0%|          | 0/161 [00:00<?, ?it/s]

Unnamed: 0,Precision,Recall,F1,Number
Corporation,0.19,0.18,0.19,66
Creative-Work,0.41,0.20,0.27,142
Group,0.35,0.13,0.19,165
Location,0.55,0.46,0.50,150
Person,0.75,0.45,0.57,429
Product,0.15,0.07,0.10,127
,Overall precision,Overall recall,Overall F1,Overall accuracy
,0.52,0.31,0.39,0.94


## Hyperparameter optimization

In [56]:
training_args = TrainingArguments(
    "bert-finetuned-ner-optuna",
    evaluation_strategy="epoch",
    logging_strategy="no",
    save_strategy="no",
    learning_rate=2e-5,
    num_train_epochs=3,
    weight_decay=0.01,
    push_to_hub=True,
)

def model_init(): 
    return AutoModelForTokenClassification.from_pretrained(
    model_checkpoint,
    id2label=id2label,
    label2id=label2id,
)    

def optuna_hp_space(trial):
    return {
        "learning_rate": trial.suggest_float("learning_rate", 1e-6, 1e-4, log=True),
        "per_device_train_batch_size": trial.suggest_categorical("per_device_train_batch_size", [8, 16, 32]),
        "adam_beta1": trial.suggest_float("beta_1", 0.9, 0.999),
        "adam_beta2": trial.suggest_float("beta_2", 0.9, 0.999),
        "weight_decay": trial.suggest_float("weight_decay", 0.0, 0.1),
        "adam_epsilon": trial.suggest_float("adam_epsilon", 1e-9, 1e-7, log=True),
    }

trainer = Trainer(
    model=None,
    args=training_args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["validation"],
    data_collator=data_collator,
    compute_metrics=compute_metrics,
    tokenizer=tokenizer,
    model_init=model_init,
)

best_run = trainer.hyperparameter_search(
    direction="minimize",
    backend="optuna",
    hp_space=optuna_hp_space,
    n_trials=2,
)

Some weights of BertForTokenClassification were not initialized from the model checkpoint at bert-base-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
[I 2023-11-07 23:44:37,240] A new study created in memory with name: no-name-16a160b2-5ef5-4a9f-b970-b6fc0bcda730
Some weights of BertForTokenClassification were not initialized from the model checkpoint at bert-base-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


  0%|          | 0/321 [00:00<?, ?it/s]

  0%|          | 0/127 [00:00<?, ?it/s]

{'eval_loss': 0.36082062125205994, 'eval_precision': 0.534923339011925, 'eval_recall': 0.37559808612440193, 'eval_f1': 0.4413211524947294, 'eval_accuracy': 0.914905004013915, 'eval_runtime': 3.6235, 'eval_samples_per_second': 278.459, 'eval_steps_per_second': 35.049, 'epoch': 1.0}


  0%|          | 0/127 [00:00<?, ?it/s]

{'eval_loss': 0.31303343176841736, 'eval_precision': 0.5113314447592068, 'eval_recall': 0.4318181818181818, 'eval_f1': 0.46822308690012965, 'eval_accuracy': 0.9232004281509232, 'eval_runtime': 3.6412, 'eval_samples_per_second': 277.104, 'eval_steps_per_second': 34.878, 'epoch': 2.0}


  0%|          | 0/127 [00:00<?, ?it/s]

[I 2023-11-07 23:46:31,036] Trial 0 finished with value: 2.4381972240284178 and parameters: {'learning_rate': 7.46444021494764e-05, 'per_device_train_batch_size': 32, 'beta_1': 0.9823212753255092, 'beta_2': 0.9561558831152476, 'weight_decay': 0.0572085027093759, 'adam_epsilon': 7.561691368560218e-09}. Best is trial 0 with value: 2.4381972240284178.


{'eval_loss': 0.3379002809524536, 'eval_precision': 0.5438108484005564, 'eval_recall': 0.4677033492822967, 'eval_f1': 0.5028938906752413, 'eval_accuracy': 0.9237891356703238, 'eval_runtime': 3.5239, 'eval_samples_per_second': 286.331, 'eval_steps_per_second': 36.04, 'epoch': 3.0}
{'train_runtime': 112.7447, 'train_samples_per_second': 90.31, 'train_steps_per_second': 2.847, 'train_loss': 0.14821997906931464, 'epoch': 3.0}


Some weights of BertForTokenClassification were not initialized from the model checkpoint at bert-base-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


  0%|          | 0/639 [00:00<?, ?it/s]

  0%|          | 0/127 [00:00<?, ?it/s]

  _warn_prf(average, modifier, msg_start, len(result))


{'eval_loss': 0.43544960021972656, 'eval_precision': 0.436241610738255, 'eval_recall': 0.07775119617224881, 'eval_f1': 0.1319796954314721, 'eval_accuracy': 0.8906074391222906, 'eval_runtime': 3.6181, 'eval_samples_per_second': 278.876, 'eval_steps_per_second': 35.101, 'epoch': 1.0}


  0%|          | 0/127 [00:00<?, ?it/s]

{'eval_loss': 0.3932664394378662, 'eval_precision': 0.589098532494759, 'eval_recall': 0.3361244019138756, 'eval_f1': 0.4280274181264281, 'eval_accuracy': 0.9105164570511105, 'eval_runtime': 3.5064, 'eval_samples_per_second': 287.759, 'eval_steps_per_second': 36.219, 'epoch': 2.0}


  0%|          | 0/127 [00:00<?, ?it/s]

[I 2023-11-07 23:48:47,536] Trial 1 finished with value: 2.2580753449113597 and parameters: {'learning_rate': 9.97282119646338e-06, 'per_device_train_batch_size': 16, 'beta_1': 0.920206368594119, 'beta_2': 0.9894853335051167, 'weight_decay': 0.05079995961253338, 'adam_epsilon': 2.4714995116229446e-08}. Best is trial 1 with value: 2.2580753449113597.


{'eval_loss': 0.37271803617477417, 'eval_precision': 0.5221674876847291, 'eval_recall': 0.3803827751196172, 'eval_f1': 0.4401384083044983, 'eval_accuracy': 0.9153866738025154, 'eval_runtime': 3.6117, 'eval_samples_per_second': 279.369, 'eval_steps_per_second': 35.163, 'epoch': 3.0}
{'train_runtime': 135.3304, 'train_samples_per_second': 75.238, 'train_steps_per_second': 4.722, 'train_loss': 0.20646982685501028, 'epoch': 3.0}


## Retrain model with optimized hyperparameters

In [57]:
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["validation"],
    data_collator=data_collator,
    compute_metrics=compute_metrics,
    tokenizer=tokenizer,
)

for n, v in best_run.hyperparameters.items():
    print(f"{n}: {v}")
    setattr(trainer.args, n, v)
    
trainer.train()
trainer.push_to_hub(commit_message="Training complete")

learning_rate: 9.97282119646338e-06
per_device_train_batch_size: 16
beta_1: 0.920206368594119
beta_2: 0.9894853335051167
weight_decay: 0.05079995961253338
adam_epsilon: 2.4714995116229446e-08


  0%|          | 0/639 [00:00<?, ?it/s]

  0%|          | 0/127 [00:00<?, ?it/s]

{'eval_loss': 0.3317992091178894, 'eval_precision': 0.5573549257759784, 'eval_recall': 0.49401913875598086, 'eval_f1': 0.5237793278376665, 'eval_accuracy': 0.9252876639015253, 'eval_runtime': 3.5636, 'eval_samples_per_second': 283.14, 'eval_steps_per_second': 35.638, 'epoch': 1.0}


  0%|          | 0/127 [00:00<?, ?it/s]

{'eval_loss': 0.38918423652648926, 'eval_precision': 0.5584958217270195, 'eval_recall': 0.4796650717703349, 'eval_f1': 0.5160875160875161, 'eval_accuracy': 0.9249130318437249, 'eval_runtime': 3.6303, 'eval_samples_per_second': 277.938, 'eval_steps_per_second': 34.983, 'epoch': 2.0}


  0%|          | 0/127 [00:00<?, ?it/s]

{'eval_loss': 0.3891257345676422, 'eval_precision': 0.5642076502732241, 'eval_recall': 0.49401913875598086, 'eval_f1': 0.5267857142857143, 'eval_accuracy': 0.9265185978057265, 'eval_runtime': 3.4944, 'eval_samples_per_second': 288.747, 'eval_steps_per_second': 36.344, 'epoch': 3.0}
{'train_runtime': 136.7229, 'train_samples_per_second': 74.472, 'train_steps_per_second': 4.674, 'train_loss': 0.03902594696188197, 'epoch': 3.0}


Upload 2 LFS files:   0%|          | 0/2 [00:00<?, ?it/s]

pytorch_model.bin:   0%|          | 0.00/431M [00:00<?, ?B/s]

training_args.bin:   0%|          | 0.00/4.54k [00:00<?, ?B/s]

'https://huggingface.co/rubinho/bert-finetuned-ner-optuna/tree/main/'

In [58]:
predictions = trainer.predict(tokenized_datasets["test"])
eval_preds = predictions.predictions, predictions.label_ids

  0%|          | 0/161 [00:00<?, ?it/s]

In [59]:
all_metrics = compute_all_metrics(eval_preds)
logits, labels = eval_preds
predictions = np.argmax(logits, axis=-1)

# Remove ignored index (special tokens) and convert to labels
true_labels = [[names[l] for l in label if l != -100] for label in labels]
true_predictions = [
    [names[p] for (p, l) in zip(prediction, label) if l != -100]
    for prediction, label in zip(predictions, labels)
]

from seqeval.metrics import classification_report
print(classification_report(true_labels, true_predictions))

               precision    recall  f1-score   support

  corporation       0.19      0.21      0.20        66
creative-work       0.44      0.22      0.29       142
        group       0.45      0.18      0.25       165
     location       0.54      0.41      0.47       150
       person       0.75      0.46      0.57       429
      product       0.16      0.09      0.12       127

    micro avg       0.52      0.32      0.40      1079
    macro avg       0.42      0.26      0.32      1079
 weighted avg       0.53      0.32      0.40      1079



In [60]:
# flatten the labels and predictions
true_labels_flat = [item for sublist in true_labels for item in sublist]
true_predictions_flat = [item for sublist in true_predictions for item in sublist]

# compute precision, recall, and f1 per entity type
precision, recall, f1, _ = precision_recall_fscore_support(true_labels_flat, true_predictions_flat, average=None, labels=names[1:])

# count number of entities per type
from collections import Counter
count = Counter(true_labels_flat)

# format into pandas table and round to 2 decimals
import pandas as pd
df = pd.DataFrame({'precision': precision, 'recall': recall, 'f1': f1, 'number': count}, index=names[1:])
df = df.round(2)
display(df)
write_tex_table(df.to_dict('index'), 'bert-finetuned-ner_optuna.txt')

Unnamed: 0,precision,recall,f1,number
B-corporation,0.27,0.21,0.24,66
I-corporation,0.31,0.21,0.25,133
B-creative-work,0.68,0.27,0.38,142
I-creative-work,0.75,0.2,0.31,442
B-group,0.58,0.19,0.29,165
I-group,0.45,0.14,0.21,242
B-location,0.66,0.47,0.55,150
I-location,0.74,0.35,0.47,237
B-person,0.82,0.49,0.61,429
I-person,0.83,0.32,0.47,918
