### Using Hugging Face Datasets and Transformers to train an NER model

In [None]:
!pip install ipywidgets

In [None]:
!pip install seqeval

In [29]:
import datasets
from datasets import Dataset, DatasetDict
from datasets import list_datasets, load_dataset
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split

In [6]:
from datasets import ClassLabel, Sequence
import random
from IPython.display import display, HTML

In [68]:
from transformers import AutoTokenizer
from transformers import AutoModelForTokenClassification, TrainingArguments, Trainer
from transformers import DataCollatorForTokenClassification
from torch.utils.data import DataLoader
from transformers import pipeline

In [None]:
# (Scratch cell: an unfinished `train_test_split(` call was left here. It was a
# SyntaxError that stopped "Restart & Run All", so it has been neutralised.)

Prepare the dataset to be in the expected format

In [108]:
def prepare_ner_training_set(input_ner_pd,test_fraction=0.1,valid_fraction=0.1):
    """Convert a token-level NER table into a Hugging Face ``DatasetDict``.

    Parameters
    ----------
    input_ner_pd : pandas.DataFrame
        One row per token with the columns ``sentence_id``, ``words`` and
        ``labels``. A label equal to the string ``"0"`` is rewritten to ``"O"``.
        The frame passed in is not modified.
    test_fraction : float
        Fraction of sentences held out as the ``"test"`` split. A value that is
        not greater than 0 skips the split.
    valid_fraction : float
        Fraction of the sentences that remain after the test split to hold out
        as the ``"validation"`` split. A value that is not greater than 0 skips
        the split.

    Returns
    -------
    tuple
        ``(ner_dataset, class_names_map)``: a ``DatasetDict`` that always has a
        ``"train"`` split plus ``"test"`` / ``"validation"`` when requested, and
        a dict mapping each label string to its integer class id. Ids follow
        the order in which labels first appear in ``input_ner_pd``.
    """
    # Work on a copy of the needed columns so the caller's frame stays untouched.
    tokens_df = input_ner_pd[["sentence_id", "words", "labels"]].copy()
    tokens_df["labels"] = tokens_df["labels"].apply(lambda x: "O" if x == "0" else x)

    # Build the label vocabulary from the argument (not from a notebook global).
    class_names = tokens_df["labels"].unique().tolist()
    class_names_map = {name: idx for idx, name in enumerate(class_names)}
    tokens_df["int_label"] = tokens_df["labels"].map(class_names_map).astype("int64")

    # Collapse the token rows into one row per sentence holding parallel lists.
    sentences = (
        tokens_df
        .groupby("sentence_id")
        .agg(tokens=("words", list), ner_tags=("int_label", list))
        .reset_index()
        .rename(columns={"sentence_id": "id"})
    )

    # Splitting the sentence frame directly keeps tokens and tags aligned.
    # random_state is fixed so the splits are reproducible.
    train_set = sentences
    test_set = None
    valid_set = None
    if test_fraction > 0:
        train_set, test_set = train_test_split(
            train_set, test_size=test_fraction, random_state=1
        )
    if valid_fraction > 0:
        train_set, valid_set = train_test_split(
            train_set, test_size=valid_fraction, random_state=1
        )

    # Get NER features schema
    ner_features = datasets.Features(
            {"id": datasets.Value("int64"),
             "tokens": datasets.Sequence(datasets.Value("string")),
             "ner_tags": datasets.Sequence(datasets.ClassLabel(names=class_names))}
            )

    # Make the dataset object from whichever splits were actually produced, so
    # any combination of test/validation fractions is handled.
    splits = {"train": train_set, "test": test_set, "validation": valid_set}
    ner_dataset = DatasetDict({
        split_name: Dataset.from_pandas(frame.reset_index(drop=True), features=ner_features)
        for split_name, frame in splits.items()
        if frame is not None
    })

    return ner_dataset, class_names_map

In [104]:
ner_data = pd.read_csv("data/NER_TRAINING_SET_2021_08_30.csv")

In [106]:
ner_data.head()

Unnamed: 0,sentence_id,words,labels
0,0,for,0
1,0,country,0
2,0,Ireland,B-REG
3,0,for,0
4,0,second,B-DATE


In [109]:
ner_dataset, class_names_map = prepare_ner_training_set(ner_data,test_fraction=0.1,valid_fraction=0.05)

In [111]:
ner_dataset["test"]

Dataset({
    features: ['id', 'tokens', 'ner_tags'],
    num_rows: 600
})

In [112]:
class_names_map

{'O': 0,
 'B-REG': 1,
 'B-DATE': 2,
 'I-DATE': 3,
 'B-PLAT': 4,
 'I-PLAT': 5,
 'B-CAT': 6,
 'I-CAT': 7,
 'B-MET': 8,
 'I-MET': 9,
 'B-APP': 10,
 'I-APP': 11,
 'I-REG': 12}

#### Show some random elements of the dataset

In [113]:
def show_random_elements(dataset, num_examples=10):
    """Display ``num_examples`` distinct, randomly chosen rows of ``dataset`` as an HTML table.

    Columns whose feature is a ``ClassLabel`` (or a ``Sequence`` of
    ``ClassLabel``) are shown with their label names instead of integer ids.

    Raises ``AssertionError`` when ``num_examples`` exceeds ``len(dataset)``.
    """
    assert num_examples <= len(dataset), "Can't pick more elements than there are in the dataset."
    # random.sample draws distinct indices directly, with no retry loop.
    picks = random.sample(range(len(dataset)), num_examples)

    df = pd.DataFrame(dataset[picks])
    for column, typ in dataset.features.items():
        if isinstance(typ, ClassLabel):
            df[column] = df[column].transform(lambda i: typ.names[i])
        elif isinstance(typ, Sequence) and isinstance(typ.feature, ClassLabel):
            df[column] = df[column].transform(lambda x: [typ.feature.names[i] for i in x])
    display(HTML(df.to_html()))

In [114]:
show_random_elements(ner_dataset["train"])

Unnamed: 0,id,tokens,ner_tags
0,2920,"[what, are, for, 주선, for, kakao, the, best, featuring, vs, dau, apple, ios, store, compared, to, apple, for, first, quarter, 2016]","[O, O, O, B-APP, I-APP, I-APP, O, O, B-MET, O, B-MET, B-PLAT, I-PLAT, I-PLAT, O, O, B-PLAT, O, B-DATE, I-DATE, I-DATE]"
1,1390,"[for, casino, the, best, installs, for, country, el, salvador, on, Apple, Play, compared, to, play, store, q2, 2013]","[O, B-CAT, O, O, B-MET, O, O, B-REG, I-REG, O, B-PLAT, I-PLAT, O, O, B-PLAT, I-PLAT, B-DATE, I-DATE]"
2,550,"[tell, me, about, for, category, action, platformer, game, 2009, last, quarter, for, Apple, Store, for, country, Pakistan, the, best, base, of, users, and, 7, day, retention, top, chart, ranks, in, canada, in, october, 2019]","[O, O, O, O, O, B-CAT, I-CAT, I-CAT, B-DATE, I-DATE, I-DATE, O, B-PLAT, I-PLAT, O, O, B-REG, O, O, B-MET, I-MET, I-MET, O, B-MET, I-MET, I-MET, B-MET, I-MET, I-MET, O, B-REG, O, B-DATE, I-DATE]"
3,1650,"[tell, me, about, in, El, Salvador, for, app, 银河足球队, and, 贵族：1896, hottest, by, dau, and, install, base]","[O, O, O, O, B-REG, I-REG, O, O, B-APP, O, B-APP, O, O, B-MET, O, B-MET, I-MET]"
4,3576,"[what, are, for, fish, hunter, games, applications, for, avg, time, per, user, for, country, Latvia, on, apple, store, and, google, play, in, since, 2010]","[O, O, O, B-CAT, I-CAT, I-CAT, O, O, B-MET, I-MET, I-MET, I-MET, O, O, B-REG, O, B-PLAT, I-PLAT, O, B-PLAT, I-PLAT, O, B-DATE, I-DATE]"
5,123,"[what, are, in, hidden, object, game, apps, best, by, highest, usage, vs, most, users, for, apple, ios, store, and, apple, |what, are, zoom's, ratings, on, android, this, year?]","[O, O, O, B-CAT, I-CAT, I-CAT, O, O, O, B-MET, I-MET, O, B-MET, I-MET, O, B-PLAT, I-PLAT, I-PLAT, O, B-PLAT, B-APP, O, B-APP, B-MET, O, B-PLAT, B-DATE, I-DATE]"
6,4539,"[for, android, in, 2011, since, highest, top, mobile, apps, safari专用adblock, plus, vs, kardia, turks, and, caicos]","[O, B-PLAT, O, B-DATE, I-DATE, O, B-MET, I-MET, I-MET, B-APP, I-APP, O, B-APP, B-REG, I-REG, I-REG]"
7,525,"[what, are, strategy, game, apps, the, best, paid, search, Grenada, for, Apple, Play, compared, to, android, store, in, q1, 2009, starbucks, ad, revenue, q1, 2019]","[O, O, B-CAT, I-CAT, O, O, O, B-MET, I-MET, B-REG, O, B-PLAT, I-PLAT, O, O, B-PLAT, I-PLAT, O, B-DATE, I-DATE, B-APP, B-MET, I-MET, B-DATE, I-DATE]"
8,2736,"[what, are, livraison, repas, and, i̇nternetsiz, okey, the, best, highest, wau, for, Ios, for, last, quarter]","[O, O, B-APP, I-APP, O, B-APP, I-APP, O, O, B-MET, I-MET, O, B-PLAT, O, B-DATE, I-DATE]"
9,2139,"[what, are, for, ゼロから始める異世界生活, リゼロス, Lost, In, Memories, hottest, by, top, grossing, have, in, italy, for, Android, Store, vs, apple, ios, store, q2, 2015]","[O, O, O, B-APP, I-APP, I-APP, I-APP, I-APP, O, O, B-MET, I-MET, O, O, B-REG, O, B-PLAT, I-PLAT, O, B-PLAT, I-PLAT, I-PLAT, B-DATE, I-DATE]"


In [19]:
def tokenize_and_align_labels(examples):
    """Tokenize a batch of pre-split sentences and align word labels to sub-tokens.

    Reads ``examples["tokens"]`` and ``examples[f"{task}_tags"]`` and uses the
    module-level ``tokenizer`` and ``task``. Returns the tokenizer output with
    an added ``"labels"`` entry holding one label list per sentence.
    """
    # When True, every sub-token of a word carries the word's label; when False
    # only the first sub-token does and the rest get -100.
    label_all_tokens = True

    encoded = tokenizer(examples["tokens"], truncation=True, is_split_into_words=True)

    aligned_labels = []
    for batch_idx, word_labels in enumerate(examples[f"{task}_tags"]):
        sentence_labels = []
        last_word = None
        for word in encoded.word_ids(batch_index=batch_idx):
            if word is None:
                # Special tokens have no word id; -100 makes the loss ignore them.
                sentence_labels.append(-100)
            elif word != last_word or label_all_tokens:
                # First sub-token of a word, or a continuation that should be labelled too.
                sentence_labels.append(word_labels[word])
            else:
                sentence_labels.append(-100)
            last_word = word
        aligned_labels.append(sentence_labels)

    encoded["labels"] = aligned_labels
    return encoded

#### Some NER models that we could experiment with

#### How to save and reload a model
https://huggingface.co/transformers/main_classes/model.html

This checkpoint has already been fine-tuned on CoNLL-2003, so it may be a better starting point than the base model:
https://huggingface.co/elastic/distilbert-base-cased-finetuned-conll03-english



In [126]:
# `task` selects the "<task>_tags" column that tokenize_and_align_labels reads
# and is part of the training output directory name.
task = "ner" # Should be one of "ner", "pos" or "chunk"
# Checkpoint used to load both the tokenizer and the token-classification model.
model_checkpoint = "distilbert-base-cased"
# Per-device batch size used for both training and evaluation in TrainingArguments.
batch_size = 32

In [115]:
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)

loading configuration file https://huggingface.co/distilbert-base-cased/resolve/main/config.json from cache at /Users/rmartinshort/.cache/huggingface/transformers/ebe1ea24d11aa664488b8de5b21e33989008ca78f207d4e30ec6350b693f073f.302bfd1b5e031cc1b17796e0b6e5b242ba2045d31d00f97589e12b458ebff27a
Model config DistilBertConfig {
  "activation": "gelu",
  "attention_dropout": 0.1,
  "dim": 768,
  "dropout": 0.1,
  "hidden_dim": 3072,
  "initializer_range": 0.02,
  "max_position_embeddings": 512,
  "model_type": "distilbert",
  "n_heads": 12,
  "n_layers": 6,
  "output_past": true,
  "pad_token_id": 0,
  "qa_dropout": 0.1,
  "seq_classif_dropout": 0.2,
  "sinusoidal_pos_embds": false,
  "tie_weights_": true,
  "transformers_version": "4.11.3",
  "vocab_size": 28996
}

loading file https://huggingface.co/distilbert-base-cased/resolve/main/vocab.txt from cache at /Users/rmartinshort/.cache/huggingface/transformers/ba377304984dc63e3ede0e23a938bbbf04d5c3835b66d5bb48343aecca188429.437aa611e89f6fc66

In [116]:
tokenized_ner_dataset = ner_dataset.map(tokenize_and_align_labels, batched=True)

  0%|          | 0/6 [00:00<?, ?ba/s]

  0%|          | 0/1 [00:00<?, ?ba/s]

  0%|          | 0/1 [00:00<?, ?ba/s]

In [117]:
tokenized_ner_dataset 

DatasetDict({
    train: Dataset({
        features: ['attention_mask', 'id', 'input_ids', 'labels', 'ner_tags', 'tokens'],
        num_rows: 5130
    })
    test: Dataset({
        features: ['attention_mask', 'id', 'input_ids', 'labels', 'ner_tags', 'tokens'],
        num_rows: 600
    })
    validation: Dataset({
        features: ['attention_mask', 'id', 'input_ids', 'labels', 'ner_tags', 'tokens'],
        num_rows: 270
    })
})

In [118]:
# Remove columns that we don't need in the training part
tokenized_ner_dataset = tokenized_ner_dataset.remove_columns(
    ["ner_tags", "tokens","id"]
)

In [120]:
class_names = list(class_names_map.keys())

In [121]:
model = AutoModelForTokenClassification.from_pretrained(model_checkpoint, num_labels=len(class_names))

loading configuration file https://huggingface.co/distilbert-base-cased/resolve/main/config.json from cache at /Users/rmartinshort/.cache/huggingface/transformers/ebe1ea24d11aa664488b8de5b21e33989008ca78f207d4e30ec6350b693f073f.302bfd1b5e031cc1b17796e0b6e5b242ba2045d31d00f97589e12b458ebff27a
Model config DistilBertConfig {
  "activation": "gelu",
  "attention_dropout": 0.1,
  "dim": 768,
  "dropout": 0.1,
  "hidden_dim": 3072,
  "id2label": {
    "0": "LABEL_0",
    "1": "LABEL_1",
    "2": "LABEL_2",
    "3": "LABEL_3",
    "4": "LABEL_4",
    "5": "LABEL_5",
    "6": "LABEL_6",
    "7": "LABEL_7",
    "8": "LABEL_8",
    "9": "LABEL_9",
    "10": "LABEL_10",
    "11": "LABEL_11",
    "12": "LABEL_12"
  },
  "initializer_range": 0.02,
  "label2id": {
    "LABEL_0": 0,
    "LABEL_1": 1,
    "LABEL_10": 10,
    "LABEL_11": 11,
    "LABEL_12": 12,
    "LABEL_2": 2,
    "LABEL_3": 3,
    "LABEL_4": 4,
    "LABEL_5": 5,
    "LABEL_6": 6,
    "LABEL_7": 7,
    "LABEL_8": 8,
    "LABEL_9": 9

In [122]:
# Only label2id is overridden here. id2label keeps its default "LABEL_<n>" names,
# as the config printed after this cell shows.
# NOTE(review): presumably the model therefore reports predictions as "LABEL_<n>";
# the entity-extraction cells at the end of the notebook decode those names.
# Setting id2label to the inverse mapping would need those cells updated in the
# same change - confirm before doing so.
model.config.label2id = class_names_map

In [124]:
model.config

DistilBertConfig {
  "_name_or_path": "distilbert-base-cased",
  "activation": "gelu",
  "attention_dropout": 0.1,
  "dim": 768,
  "dropout": 0.1,
  "hidden_dim": 3072,
  "id2label": {
    "0": "LABEL_0",
    "1": "LABEL_1",
    "2": "LABEL_2",
    "3": "LABEL_3",
    "4": "LABEL_4",
    "5": "LABEL_5",
    "6": "LABEL_6",
    "7": "LABEL_7",
    "8": "LABEL_8",
    "9": "LABEL_9",
    "10": "LABEL_10",
    "11": "LABEL_11",
    "12": "LABEL_12"
  },
  "initializer_range": 0.02,
  "label2id": {
    "B-APP": 10,
    "B-CAT": 6,
    "B-DATE": 2,
    "B-MET": 8,
    "B-PLAT": 4,
    "B-REG": 1,
    "I-APP": 11,
    "I-CAT": 7,
    "I-DATE": 3,
    "I-MET": 9,
    "I-PLAT": 5,
    "I-REG": 12,
    "O": 0
  },
  "max_position_embeddings": 512,
  "model_type": "distilbert",
  "n_heads": 12,
  "n_layers": 6,
  "output_past": true,
  "pad_token_id": 0,
  "qa_dropout": 0.1,
  "seq_classif_dropout": 0.2,
  "sinusoidal_pos_embds": false,
  "tie_weights_": true,
  "transformers_version": "4.11.3",

In [125]:
data_collator = DataCollatorForTokenClassification(tokenizer=tokenizer)

In [187]:
# Name the output directory after the checkpoint (without any "org/" prefix) and the task.
model_name = model_checkpoint.rsplit("/", 1)[-1]
args = TrainingArguments(
    output_dir=f"{model_name}-finetuned-v1-{task}",
    evaluation_strategy="epoch",  # evaluate once at the end of every epoch
    num_train_epochs=5,
    learning_rate=2e-5,
    weight_decay=0.01,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    push_to_hub=False,
)

PyTorch: setting up devices
The default value for the training argument `--report_to` will change in v5 (from all installed integrations to none). In v5, you will need to use `--report_to all` to get the same behavior as now. You should start updating your code and make this info disappear :-).


In [47]:
metric = datasets.load_metric("seqeval")

In [128]:
def compute_metrics(p):
    """Compute overall seqeval precision / recall / F1 / accuracy for one evaluation pass.

    ``p`` unpacks into ``(predictions, labels)``. Predictions are arg-maxed over
    axis 2 to get class ids. Positions whose label is -100 are dropped, and the
    remaining ids are turned into label strings via the module-level
    ``class_names`` before being scored with the module-level ``metric``.
    """
    scores, gold_ids = p
    predicted_ids = np.argmax(scores, axis=2)

    true_predictions = []
    true_labels = []
    for predicted_row, gold_row in zip(predicted_ids, gold_ids):
        # Keep only the positions that carry a real label (special tokens are -100).
        kept = [(pred, gold) for pred, gold in zip(predicted_row, gold_row) if gold != -100]
        true_predictions.append([class_names[pred] for pred, _ in kept])
        true_labels.append([class_names[gold] for _, gold in kept])

    results = metric.compute(predictions=true_predictions, references=true_labels)
    return {
        "precision": results["overall_precision"],
        "recall": results["overall_recall"],
        "f1": results["overall_f1"],
        "accuracy": results["overall_accuracy"],
    }

### Test that the model will work

In [131]:
# Small loaders used only for the smoke test below; the collator batches the
# variable-length examples together.
train_dataloader = DataLoader(
    tokenized_ner_dataset["train"],
    batch_size=8,
    shuffle=True,
    collate_fn=data_collator,
)
eval_dataloader = DataLoader(
    tokenized_ner_dataset["validation"],
    batch_size=8,
    collate_fn=data_collator,
)

In [132]:
# Pull a single collated batch and show the shape of each tensor in it.
batch = next(iter(train_dataloader))
{k: v.shape for k, v in batch.items()}

{'labels': torch.Size([8, 49]),
 'input_ids': torch.Size([8, 49]),
 'attention_mask': torch.Size([8, 49])}

In [133]:
outputs = model(**batch)
print(outputs.loss, outputs.logits.shape)

tensor(2.6243, grad_fn=<NllLossBackward0>) torch.Size([8, 49, 13])


### Set up trainer and train the model

In [188]:
# The "test" split is used for the per-epoch evaluation; the "validation" split
# is used later in the notebook for the per-entity report.
trainer = Trainer(
    model=model,
    args=args,
    data_collator=data_collator,
    tokenizer=tokenizer,
    train_dataset=tokenized_ner_dataset["train"],
    eval_dataset=tokenized_ner_dataset["test"],
    compute_metrics=compute_metrics,
)

In [189]:
trainer.train()

***** Running training *****
  Num examples = 5130
  Num Epochs = 5
  Instantaneous batch size per device = 32
  Total train batch size (w. parallel, distributed & accumulation) = 32
  Gradient Accumulation steps = 1
  Total optimization steps = 805


Epoch,Training Loss,Validation Loss,Precision,Recall,F1,Accuracy
1,No log,0.061897,0.961669,0.94618,0.953861,0.978567
2,No log,0.053096,0.969255,0.953644,0.961386,0.981588
3,No log,0.046277,0.969315,0.967983,0.968649,0.985328
4,0.045500,0.04776,0.972978,0.968965,0.970967,0.986263
5,0.045500,0.046551,0.967856,0.969947,0.9689,0.985256


***** Running Evaluation *****
  Num examples = 600
  Batch size = 32
***** Running Evaluation *****
  Num examples = 600
  Batch size = 32
***** Running Evaluation *****
  Num examples = 600
  Batch size = 32
Saving model checkpoint to distilbert-base-cased-finetuned-v1-ner/checkpoint-500
Configuration saved in distilbert-base-cased-finetuned-v1-ner/checkpoint-500/config.json
Model weights saved in distilbert-base-cased-finetuned-v1-ner/checkpoint-500/pytorch_model.bin
tokenizer config file saved in distilbert-base-cased-finetuned-v1-ner/checkpoint-500/tokenizer_config.json
Special tokens file saved in distilbert-base-cased-finetuned-v1-ner/checkpoint-500/special_tokens_map.json
***** Running Evaluation *****
  Num examples = 600
  Batch size = 32
***** Running Evaluation *****
  Num examples = 600
  Batch size = 32


Training completed. Do not forget to share your model on huggingface.co/models =)




TrainOutput(global_step=805, training_loss=0.03611059899655929, metrics={'train_runtime': 5427.8438, 'train_samples_per_second': 4.726, 'train_steps_per_second': 0.148, 'total_flos': 316602755800320.0, 'train_loss': 0.03611059899655929, 'epoch': 5.0})

In [190]:
trainer.evaluate()

***** Running Evaluation *****
  Num examples = 600
  Batch size = 32


{'eval_loss': 0.04655105248093605,
 'eval_precision': 0.9678557428459428,
 'eval_recall': 0.9699469652327637,
 'eval_f1': 0.9689002256450506,
 'eval_accuracy': 0.9852560414269275,
 'eval_runtime': 38.0201,
 'eval_samples_per_second': 15.781,
 'eval_steps_per_second': 0.5,
 'epoch': 5.0}

In [191]:
# Per-entity seqeval report on the "validation" split.
predictions, labels, _ = trainer.predict(tokenized_ner_dataset["validation"])
predictions = np.argmax(predictions, axis=2)

# Remove ignored index (special tokens) and map class ids to label strings.
# `class_names` is the id-ordered label list defined earlier in this notebook;
# the previously used `label_list` is never defined here and raised NameError
# on a fresh kernel.
true_predictions = [
    [class_names[p] for (p, l) in zip(prediction, label) if l != -100]
    for prediction, label in zip(predictions, labels)
]
true_labels = [
    [class_names[l] for (p, l) in zip(prediction, label) if l != -100]
    for prediction, label in zip(predictions, labels)
]

results = metric.compute(predictions=true_predictions, references=true_labels)
results

***** Running Prediction *****
  Num examples = 270
  Batch size = 32


{'APP': {'precision': 0.877431906614786,
  'recall': 0.898406374501992,
  'f1': 0.8877952755905512,
  'number': 502},
 'CAT': {'precision': 0.976,
  'recall': 0.991869918699187,
  'f1': 0.9838709677419355,
  'number': 123},
 'DATE': {'precision': 1.0, 'recall': 1.0, 'f1': 1.0, 'number': 209},
 'MET': {'precision': 0.9884105960264901,
  'recall': 0.9835255354200988,
  'f1': 0.9859620148637489,
  'number': 607},
 'PLAT': {'precision': 0.9969879518072289,
  'recall': 0.993993993993994,
  'f1': 0.9954887218045113,
  'number': 333},
 'REG': {'precision': 0.9770114942528736,
  'recall': 0.9883720930232558,
  'f1': 0.9826589595375722,
  'number': 344},
 'overall_precision': 0.9615384615384616,
 'overall_recall': 0.9678942398489141,
 'overall_f1': 0.9647058823529413,
 'overall_accuracy': 0.9846955936764211}

In [192]:
tokenizer.save_pretrained("distilbert-base-case-finetuned-app-ner-tok")

tokenizer config file saved in distilbert-base-case-finetuned-app-ner-tok/tokenizer_config.json
Special tokens file saved in distilbert-base-case-finetuned-app-ner-tok/special_tokens_map.json


('distilbert-base-case-finetuned-app-ner-tok/tokenizer_config.json',
 'distilbert-base-case-finetuned-app-ner-tok/special_tokens_map.json',
 'distilbert-base-case-finetuned-app-ner-tok/vocab.txt',
 'distilbert-base-case-finetuned-app-ner-tok/added_tokens.json',
 'distilbert-base-case-finetuned-app-ner-tok/tokenizer.json')

In [193]:
trainer.model.save_pretrained("distilbert-base-case-finetuned-app-ner")

Configuration saved in distilbert-base-case-finetuned-app-ner/config.json
Model weights saved in distilbert-base-case-finetuned-app-ner/pytorch_model.bin


In [194]:
loaded_model = AutoModelForTokenClassification.from_pretrained("distilbert-base-case-finetuned-app-ner")

loading configuration file distilbert-base-case-finetuned-app-ner/config.json
Model config DistilBertConfig {
  "_name_or_path": "distilbert-base-cased",
  "activation": "gelu",
  "architectures": [
    "DistilBertForTokenClassification"
  ],
  "attention_dropout": 0.1,
  "dim": 768,
  "dropout": 0.1,
  "hidden_dim": 3072,
  "id2label": {
    "0": "LABEL_0",
    "1": "LABEL_1",
    "2": "LABEL_2",
    "3": "LABEL_3",
    "4": "LABEL_4",
    "5": "LABEL_5",
    "6": "LABEL_6",
    "7": "LABEL_7",
    "8": "LABEL_8",
    "9": "LABEL_9",
    "10": "LABEL_10",
    "11": "LABEL_11",
    "12": "LABEL_12"
  },
  "initializer_range": 0.02,
  "label2id": {
    "B-APP": 10,
    "B-CAT": 6,
    "B-DATE": 2,
    "B-MET": 8,
    "B-PLAT": 4,
    "B-REG": 1,
    "I-APP": 11,
    "I-CAT": 7,
    "I-DATE": 3,
    "I-MET": 9,
    "I-PLAT": 5,
    "I-REG": 12,
    "O": 0
  },
  "max_position_embeddings": 512,
  "model_type": "distilbert",
  "n_heads": 12,
  "n_layers": 6,
  "output_past": true,
  "pad_t

In [195]:
id2label_dict = loaded_model.config.id2label
label2id_dict = loaded_model.config.label2id

In [196]:
loaded_tokenizer = AutoTokenizer.from_pretrained("distilbert-base-case-finetuned-app-ner-tok")

Didn't find file distilbert-base-case-finetuned-app-ner-tok/added_tokens.json. We won't load it.
loading file distilbert-base-case-finetuned-app-ner-tok/vocab.txt
loading file distilbert-base-case-finetuned-app-ner-tok/tokenizer.json
loading file None
loading file distilbert-base-case-finetuned-app-ner-tok/special_tokens_map.json
loading file distilbert-base-case-finetuned-app-ner-tok/tokenizer_config.json


In [224]:
# `grouped_entities=True` is deprecated; `aggregation_strategy="simple"` is its
# documented replacement and groups adjacent tokens the same way.
ner_pipe = pipeline("ner", aggregation_strategy="simple", model=loaded_model, tokenizer=loaded_tokenizer)

In [225]:
op = ner_pipe(["tell me about snapchat downloads in 2020"])



In [214]:
# Inverse of the loaded config's id2label: maps each id2label value (the label
# name) back to its class id.
model_label_2_id = {v:k for k,v in id2label_dict.items()}

In [215]:
# Inverse of the loaded config's label2id: maps each class id back to its
# label2id key (the BIO tag such as "B-APP").
model_id_2_label = {v:k for k,v in label2id_dict.items()}

In [223]:
def extract_entities(model_output, label_names=None):
    """Collect the words of recognised entities from NER pipeline output, grouped by entity type.

    Parameters
    ----------
    model_output : list
        Either a list whose first item is the list of entity dicts for one
        query (what the pipeline returns for a one-element list of queries), or
        that flat list of entity dicts itself. Each entity dict must provide
        ``"entity_group"`` and ``"word"``. Only the first query is read.
    label_names : dict, optional
        Maps an ``entity_group`` value to its BIO tag (for example
        ``{"LABEL_10": "B-APP"}``). Groups missing from the mapping are used
        as-is. When omitted, the module-level ``model_label_2_id`` and
        ``model_id_2_label`` dicts are chained to do the same lookup.

    Returns
    -------
    dict
        Entity type (the part of the tag after the first ``"-"``) mapped to the
        list of words, in pipeline order. The six known types are always
        present; tags without a ``"-"`` (such as ``"O"``) are skipped. Fragments
        are reported exactly as the pipeline returned them and are not merged.
    """
    ents_data = {"APP":[],"REG":[],"DATE":[],"PLAT":[],"MET":[],"CAT":[]}

    # Accept both the nested (list of queries) and the flat (single query) shape,
    # and tolerate an empty result.
    if not model_output:
        entities = []
    elif isinstance(model_output[0], dict):
        entities = model_output
    else:
        entities = model_output[0]

    for ent in entities:
        group = ent["entity_group"]
        if label_names is not None:
            tag = label_names.get(group, group)
        elif group in model_label_2_id:
            tag = model_id_2_label[model_label_2_id[group]]
        else:
            # Already a plain tag rather than a "LABEL_<n>" placeholder.
            tag = group

        if "-" in tag:
            ent_type = tag.split("-", 1)[1]
            # setdefault avoids a KeyError for entity types not listed above.
            ents_data.setdefault(ent_type, []).append(ent["word"])

    return ents_data
    

In [229]:
query = "clash of clans mobile users in japan on google last year"
op = ner_pipe([query])
extract_entities(op)



{'APP': ['clash', 'of clans'],
 'REG': ['japan'],
 'DATE': ['last', 'year'],
 'PLAT': ['google'],
 'MET': ['mobile users'],
 'CAT': []}