In [None]:
# adapted from https://simpletransformers.ai/docs/ner-minimal-start/

# load dataset

import logging
import pandas as pd
from simpletransformers.ner import NERModel, NERArgs

# Show INFO-level messages overall, but only warnings from the `transformers` logger.
logging.basicConfig(level=logging.INFO)
transformers_logger = logging.getLogger("transformers")
transformers_logger.setLevel(logging.WARNING)

# Read the token-level dataset and replace missing `words` cells with empty strings.
df = pd.read_csv("../../data/experimental/eng_dataset.csv")
df["words"] = df["words"].fillna("")

# Carve out the three splits according to the `dataset` column.
df_train, df_valid, df_test = (
    df[df["dataset"] == split_name] for split_name in ("train", "valid", "test")
)

Unnamed: 0,sentence_id,words,labels,dataset


In [41]:
# train model

# Fine-tuning configuration, supplied to NERArgs as keyword arguments.
# The label set is taken from every distinct value in the `labels` column.
model_args = NERArgs(
    train_batch_size=16,
    evaluate_during_training=True,
    overwrite_output_dir=True,
    labels_list=df.labels.unique().tolist(),
    num_train_epochs=3,
)

# Start from the `roberta-base` checkpoint; CUDA is explicitly disabled.
model = NERModel("roberta", "roberta-base", args=model_args, use_cuda=False)

# Fine-tune on the train split, evaluating on the validation split during training.
model.train_model(df_train, eval_data=df_valid)

Some weights of RobertaForTokenClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
INFO:simpletransformers.ner.ner_model: Converting to features started.
100%|██████████| 1/1 [00:05<00:00,  5.79s/it]
Epochs 1/3. Running Loss:    0.1526: 100%|██████████| 22/22 [01:17<00:00,  3.54s/it]
INFO:simpletransformers.ner.ner_model: Converting to features started.
100%|██████████| 1/1 [00:05<00:00,  5.66s/it]
Running Evaluation: 100%|██████████| 1/1 [00:01<00:00,  1.34s/it]
Epochs 2/3. Running Loss:    0.0300: 100%|██████████| 22/22 [33:26<00:00, 91.22s/it]
INFO:simpletransformers.ner.ner_model: Converting to features started.
100%|██████████| 1/1 [00:05<00:00,  5.58s/it]
Running Evaluation: 100%|██████████| 1/1 [00:02<00:00,  2.01s/it]
Epochs 3/3. Running Loss:    0.0171: 100%|██████████| 22/22

(66,
 defaultdict(list,
             {'global_step': [22, 44, 66],
              'train_loss': [0.15263675153255463,
               0.029998715966939926,
               0.017085831612348557],
              'eval_loss': [0.21413953602313995,
               0.08134108781814575,
               0.0945669636130333],
              'precision': [np.float64(0.660377358490566),
               np.float64(0.851063829787234),
               np.float64(0.8913043478260869)],
              'recall': [np.float64(0.7777777777777778),
               np.float64(0.8888888888888888),
               np.float64(0.9111111111111111)],
              'f1_score': [np.float64(0.7142857142857142),
               np.float64(0.8695652173913044),
               np.float64(0.9010989010989011)]}))

In [43]:
# Score the fine-tuned model on the rows marked as the test split.
result, model_outputs, preds_list = model.eval_model(df[df["dataset"] == "test"])

# Run inference on one hand-written sentence.
predictions, raw_outputs = model.predict(["Hermione was the best in her class"])

# Display the test-set metrics as the cell output.
result

INFO:simpletransformers.ner.ner_model: Converting to features started.
100%|██████████| 1/1 [00:06<00:00,  6.44s/it]
Running Evaluation: 100%|██████████| 1/1 [00:01<00:00,  1.90s/it]
INFO:simpletransformers.ner.ner_model:{'eval_loss': 0.061059463769197464, 'precision': np.float64(0.8166666666666667), 'recall': np.float64(0.9423076923076923), 'f1_score': np.float64(0.8749999999999999)}
INFO:simpletransformers.ner.ner_model: Converting to features started.
100%|██████████| 1/1 [00:04<00:00,  4.53s/it]
Running Prediction: 100%|██████████| 1/1 [00:00<00:00,  6.78it/s]


{'eval_loss': 0.061059463769197464,
 'precision': np.float64(0.8166666666666667),
 'recall': np.float64(0.9423076923076923),
 'f1_score': np.float64(0.8749999999999999)}

In [69]:
def predict_entities(model, sentence):
    """Extract entity spans from a single sentence.

    Calls ``model.predict([sentence])`` and groups consecutive non-``"O"``
    tokens into entities. A new entity starts when a ``B-`` label is seen or
    when the entity type changes; an ``"O"`` token closes the current entity.

    Args:
        model: Object with a ``predict(list_of_sentences)`` method whose first
            return element holds, per input sentence, a list of single-item
            ``{word: label}`` dicts.
        sentence: The raw sentence to tag.

    Returns:
        A list of ``{"span": str, "type": str}`` dicts in order of appearance.
        ``span`` is the entity's words joined by single spaces; ``type`` is the
        label with its two-character prefix (e.g. ``"B-"``) removed. Returns an
        empty list when nothing is tagged.
    """
    per_sentence = model.predict([sentence])[0]
    # Guard against a model that returns no predictions at all.
    tagged_words = per_sentence[0] if per_sentence else []

    entities = []
    tokens = []  # words of the entity currently being built
    current_type = None  # its type; None while outside any entity

    for word in tagged_words:
        for token, label in word.items():
            if label == "O":
                prefix, label_type = "O", None
            elif label[1:2] == "-":
                # Prefixed label such as "B-LOCATION" / "I-LOCATION".
                prefix, label_type = label[0], label[2:]
            else:
                # Unprefixed label: treat the whole label as the type.
                prefix, label_type = "", label

            starts_new = label_type is not None and (
                prefix == "B" or label_type != current_type
            )
            # Close the running entity before leaving it or starting another;
            # its type comes from its own tokens, not from the closing token.
            if tokens and (label_type is None or starts_new):
                entities.append({"span": " ".join(tokens), "type": current_type})
                tokens = []

            if label_type is not None:
                tokens.append(token)
            current_type = label_type

    # Flush an entity that runs to the end of the sentence.
    if tokens:
        entities.append({"span": " ".join(tokens), "type": current_type})

    return entities

# Manual check on one sentence; the cell displays the returned list of entities.
predict_entities(model, "My farm is in Musanze.")

INFO:simpletransformers.ner.ner_model: Converting to features started.
100%|██████████| 1/1 [00:04<00:00,  4.51s/it]
Running Prediction: 100%|██████████| 1/1 [00:00<00:00,  7.66it/s]


[{'span': 'Musanze.', 'type': 'LOCATION'}]