# Setup

In [1]:
!pip install transformers datasets
!pip install seqeval evaluate

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting transformers
  Downloading transformers-4.25.1-py3-none-any.whl (5.8 MB)
[K     |████████████████████████████████| 5.8 MB 21.3 MB/s 
[?25hCollecting datasets
  Downloading datasets-2.7.1-py3-none-any.whl (451 kB)
[K     |████████████████████████████████| 451 kB 52.5 MB/s 
Collecting huggingface-hub<1.0,>=0.10.0
  Downloading huggingface_hub-0.11.1-py3-none-any.whl (182 kB)
[K     |████████████████████████████████| 182 kB 67.4 MB/s 
Collecting tokenizers!=0.11.3,<0.14,>=0.11.1
  Downloading tokenizers-0.13.2-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (7.6 MB)
[K     |████████████████████████████████| 7.6 MB 63.4 MB/s 
Collecting responses<0.19
  Downloading responses-0.18.0-py3-none-any.whl (38 kB)
Collecting multiprocess
  Downloading multiprocess-0.70.14-py38-none-any.whl (132 kB)
[K     |████████████████████████████████| 132 kB 46.3 MB/s 
[?25hCollecting 

In [43]:
from transformers import DistilBertTokenizer, DistilBertForTokenClassification, AdamW, DataCollatorForTokenClassification, AutoTokenizer, AutoModelForTokenClassification, TrainingArguments, Trainer
import datasets
from datasets import load_dataset, load_metric, concatenate_datasets, Dataset, DatasetDict
import torch 
import numpy as np
import string
import random
from sklearn.preprocessing import normalize
import evaluate
import copy

# Dataset

In [3]:
# get CoNLL 2003 dataset
conll_dataset = load_dataset('conll2003')

Downloading builder script:   0%|          | 0.00/9.57k [00:00<?, ?B/s]

Downloading metadata:   0%|          | 0.00/3.73k [00:00<?, ?B/s]

Downloading readme:   0%|          | 0.00/12.2k [00:00<?, ?B/s]

Downloading and preparing dataset conll2003/conll2003 to /root/.cache/huggingface/datasets/conll2003/conll2003/1.0.0/9a4d16a94f8674ba3466315300359b0acd891b68b6c8743ddf60b9c702adce98...


Downloading data:   0%|          | 0.00/983k [00:00<?, ?B/s]

Generating train split:   0%|          | 0/14041 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/3250 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/3453 [00:00<?, ? examples/s]

Dataset conll2003 downloaded and prepared to /root/.cache/huggingface/datasets/conll2003/conll2003/1.0.0/9a4d16a94f8674ba3466315300359b0acd891b68b6c8743ddf60b9c702adce98. Subsequent calls will reuse this data.


  0%|          | 0/3 [00:00<?, ?it/s]

# Dataset Preprocessing

Adds noise to text dataset

In [4]:
qwertyKeyboardArray = [
    ['`','1','2','3','4','5','6','7','8','9','0','-','='],
    ['q','w','e','r','t','y','u','i','o','p','[',']','\\'],
    ['a','s','d','f','g','h','j','k','l',';','\''],
    ['z','x','c','v','b','n','m',',','.','/'],
    ['', '', ' ', ' ', ' ', ' ', ' ', '', '']
    ]

qwertyShiftedKeyboardArray = [
    ['~', '!', '@', '#', '$', '%', '^', '&', '*', '(', ')', '+'],
    ['Q', 'W', 'E', 'R', 'T', 'Y', 'U', 'I', 'O', 'P', '{', '}', '|'],
    ['A', 'S', 'D', 'F', 'G', 'H', 'J', 'K', 'L', ':', '"'],
    ['Z', 'X', 'C', 'V', 'B', 'N', 'M', '<', '>', '?'],
    ['', '', ' ', ' ', ' ', ' ', ' ', '', '']
    ]

In [5]:
def flatten(l):
  return sum(l, [])

In [6]:
def get_keyboard_dicts(keyboard):
  nonshift = None
  shift = None
  if keyboard == 'qwerty':
    nonshift = qwertyKeyboardArray
    shift = qwertyShiftedKeyboardArray
  else:
    raise ValueError('keyboard layout not available')

  chara2idx = dict() #{chara: (shift T / notshift F boolean, idx}
  idx2chara = dict()
  coords = np.zeros([len(flatten(nonshift))+len(flatten(shift)), 3]) #{coords[idx] = (x, y)}
  idx = 0
  for board in [shift, nonshift]:
    shift_case = 0
    for y in range(len(board)):
      for x in range(len(board[y])):
        idx2chara[idx] = board[y][x]
        chara2idx[board[y][x]] = (shift_case, idx)
        coords[idx] = np.array([x, y, shift_case])
        idx += 1
    shift_case = 3

  return coords, chara2idx, idx2chara

In [7]:
def keyboard_distance(keyboard_type):
  '''
  keyboard_type = str, accepts qwerty, qwertz, azerty
  '''
  coords, chara2idx, idx2chara = get_keyboard_dicts(keyboard_type)
  board_size = coords.shape[0]
  distances = np.zeros([board_size, board_size])
  for i in range(board_size):
    for j in range(board_size):
      distances[i][j] = 1+((coords[i][0] - coords[j][0])**4 + (coords[i][1] - coords[j][1])**4 + (coords[i][2] - coords[j][2])**4)**0.5
  
  norm_div_distances = normalize(1/distances, axis=1, norm='l1')
  return coords, chara2idx, idx2chara, norm_div_distances, distances

In [8]:
qwerty_coords, qwerty_chara2idx, qwerty_idx2chara, qwerty_norm_distances, qwerty_distances = keyboard_distance('qwerty')

In [9]:
def likely(c, distances, chara2idx):
  return np.random.choice(range(distances.shape[0]), 1, p=np.squeeze(distances[chara2idx[c][1]]))[0] 

In [10]:
def generate_typo(c):
  #substitute prob = 0.5, missing prob = 0.2, transpose prob = 0.25, insert prob = 0.05
  typo = np.random.rand(0,1)
  if typo < 0.5: #substitute
    likely_c = likely(c, qwerty_norm_distances, qwerty_chara2idx)
    return qwerty_idx2chara[likely_c]
  elif typo < 0.7: #missing
    return False
  elif typo < 0.95: #transpose
    return True
  else: #insert
    likely_c = likely(c, qwerty_norm_distances, qwerty_chara2idx)
    return str(likely_c) + c

In [11]:
length = max(len(conll_dataset['train']), len(conll_dataset['validation']), len(conll_dataset['test']))

In [None]:
# def noisify(data, k=0.02):
#   # new_tokens = []
#   # for seq in data['tokens']:
#   #   print('seq: ', seq)
#   #   noisy = generate_k_noisy(seq, k)
#   #   print(noisy)
#   #   new_tokens.append(noisy)
#   # performed on a row
#   keys = ['id', 'tokens', 'pos_tags', 'chunk_tags', 'ner_tags']
#   aug = {k: list() for k in keys}
#   # aug['tokens'] = np.zeros([len(data), len(data['tokens'][0])])
#   for idx in range(len(data['id'])):
#     aug['id'].append(str(length+int(data['id'][idx])))
#     noisy = [generate_k_noisy(seq, k) for seq in data['tokens'][idx]]
#     aug['tokens'].append(noisy)
#     aug['pos_tags'].append(data['pos_tags'][idx])
#     aug['chunk_tags'].append(data['chunk_tags'][idx])
#     aug['ner_tags'].append(data['ner_tags'][idx])
#   aug_data = Dataset.from_dict(aug)
#   return concatenate_datasets(data, aug_data)


  # keys = ['id', 'tokens', 'pos_tags', 'chunk_tags', 'ner_tags']
  # aug = {k: list() for k in keys}
  # # aug['tokens'] = np.zeros([len(data), len(data['tokens'][0])])
  # for idx in range(len(data['id'])):
  #   aug['id'].append(str(length+int(data['id'][idx])))
  #   noisy = [generate_k_noisy(seq, k) for seq in data['tokens'][idx]]
  #   aug['tokens'].append(noisy)
  #   aug['pos_tags'].append(data['pos_tags'][idx])
  #   aug['chunk_tags'].append(data['chunk_tags'][idx])
  #   aug['ner_tags'].append(data['ner_tags'][idx])
  # aug_data = Dataset.from_dict(aug)
  # return concatenate_datasets(data, aug_data)


In [12]:
conll_dataset['train']['tokens'][1]

['Peter', 'Blackburn']

In [13]:
# not map
def generate_k_noisy(seq, k, keyboard=True):
  ''' 
  seq: list of str
  k = desired percent noise
  keyboard = boolean, is text typed (TRUE) or OCR (FALSE)
  '''
  new_seq = []
  str_seq = ' '.join([w for w in seq])
  flip = False
  prev = None

  for c in str_seq:
    if flip:
      new_seq.append(c + prev)
    else:
      typo_prob = np.random.uniform(low=0, high=1)
      if typo_prob > k:
        new_seq.append(c)
      else:
        new_c = generate_typo(c)
        if not new_c.isascii():
          if not new_c:
            flip = False
          elif new_c:
            prev = c
            flip = True
        else:
          flip = False
          new_seq.append(new_c)
  return "".join(new_seq).split()

### Debugging

In [14]:
# map
def generate_k_noisy(seq, k, keyboard=True):
  ''' 
  seq: list of str
  k = desired percent noise
  keyboard = boolean, is text typed (TRUE) or OCR (FALSE)
  '''
  new_seq = []
  str_seq = ''.join(seq)
  flip = False
  prev = None

  for c in str_seq:
    if flip:
      new_seq.append(c + prev)
    else:
      typo_prob = np.random.uniform(low=0, high=1)
      if typo_prob > k:
        new_seq.append(c)
      else:
        new_c = generate_typo(c)
        if not new_c.isascii():
          if not new_c:
            flip = False
          elif new_c:
            prev = c
            flip = True
        else:
          flip = False
          new_seq.append(new_c)
  return "".join(new_seq)

In [15]:
MAX_LENGTH = max(len(conll_dataset['train']), len(conll_dataset['validation']), len(conll_dataset['test']))

In [74]:
def noisify(data, k=0.08):
  new_tokens = []
  for seq in data['tokens']:
    noisy = generate_k_noisy(seq, k)
    new_tokens.append(noisy)
  return {'tokens': new_tokens, 'id': str(int(data['id']) + MAX_LENGTH)}

In [45]:
def noisy_dataset(data):
  aug_data = dict()
  for split in data:
    aug_split = data[split].map(noisify)
    combo_split = concatenate_datasets([aug_split, data[split]])
    aug_data[split] = combo_split
  return DatasetDict(aug_data)
  #return datasets.DatasetDict.from_dict(aug_data)

In [73]:
aug_conll_dataset1 = noisy_dataset(conll_dataset) #k=.03



In [75]:
aug_conll_dataset2 = noisy_dataset(conll_dataset) #k=.08



# Model

In [19]:
def tokenize(model_type='distilbert-base-cased'):
  tokenizer = AutoTokenizer.from_pretrained(model_type)
  try:
    assert tokenizer.is_fast
    return tokenizer
  except AssertionError:
    print('Tokenizer is not fast')

In [20]:
def align_labels_with_tokens(labels, word_ids):
    new_labels = []
    current_word = None
    for word_id in word_ids:
        if word_id != current_word:
            # Start of a new word!
            current_word = word_id
            label = -100 if word_id is None else labels[word_id]
            new_labels.append(label)
        elif word_id is None:
            # Special token
            new_labels.append(-100)
        else:
            # Same word as previous token
            label = labels[word_id]
            # If the label is B-XXX we change it to I-XXX
            if label % 2 == 1:
                label += 1
            new_labels.append(label)

    return new_labels

In [21]:
def tokenize_and_align_labels(examples):
    tokenized_inputs = tokenizer(
        examples["tokens"], truncation=True, is_split_into_words=True
    )
    all_labels = examples["ner_tags"]
    new_labels = []
    for i, labels in enumerate(all_labels):
        word_ids = tokenized_inputs.word_ids(i)
        new_labels.append(align_labels_with_tokens(labels, word_ids))

    tokenized_inputs["labels"] = new_labels
    return tokenized_inputs

In [22]:
def run_model(data, tokenizer, model_type='distilbert-base-cased', task='ner', batch_size=16):
  task = task # needs to be one of ner, pos, or chunk
  batch_size = batch_size

  train = data['train']
  test = data['test']
  val = data['validation']

  tokenizer(train[0]['tokens'],is_split_into_words=True).tokens()
  tokenizer(train[0]['tokens'],is_split_into_words=True).word_ids()

  labels = train[0]["ner_tags"]
  word_ids = tokenizer(train[0]['tokens'],is_split_into_words=True).word_ids()

  tokenized_datasets = data.map(
    tokenize_and_align_labels,
    batched=True,
    remove_columns=data["train"].column_names,
  )

  data_collator = DataCollatorForTokenClassification(tokenizer=tokenizer)

  batch = data_collator([tokenized_datasets["train"][i] for i in range(2)])

  ner_feature = data["train"].features["ner_tags"]
  label_names = ner_feature.feature.names

  labels = data["train"][0]["ner_tags"]
  labels = [label_names[i] for i in labels]

  id2label = {i: label for i, label in enumerate(label_names)}
  label2id = {v: k for k, v in id2label.items()}

  model = AutoModelForTokenClassification.from_pretrained(
    model_type,
    id2label=id2label,
    label2id=label2id,
  )

  model.to('cuda')

  model.config.num_labels

  return model, labels, tokenized_datasets, data_collator, label_names

In [23]:
def compute_metrics(eval_preds):
    logits, labels = eval_preds
    predictions = np.argmax(logits, axis=-1)

    # Remove ignored index (special tokens) and convert to labels
    true_labels = [[label_names[l] for l in label if l != -100] for label in labels]
    true_predictions = [
        [label_names[p] for (p, l) in zip(prediction, label) if l != -100]
        for prediction, label in zip(predictions, labels)
    ]
    all_metrics = metric.compute(predictions=true_predictions, references=true_labels)
    return {
        "precision": all_metrics["overall_precision"],
        "recall": all_metrics["overall_recall"],
        "f1": all_metrics["overall_f1"],
        "accuracy": all_metrics["overall_accuracy"],
    }

In [84]:
def train(model, tokenized_datasets, data_collator, eval_strat='epoch', save_strat='epoch', lr=2e-5, epochs=3, decay=0.01):
  args = TrainingArguments(
    "bert-finetuned-ner",
    evaluation_strategy=eval_strat,
    save_strategy=save_strat,
    learning_rate=lr,
    num_train_epochs=epochs,
    weight_decay=decay,
  )

  trainer = Trainer(
    model=model,
    args=args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["validation"],
    data_collator=data_collator,
    compute_metrics=compute_metrics,
    tokenizer=tokenizer,
  )
  trainer.train()
  #run on test set
  test_preds=trainer.predict(tokenized_datasets['test'])
  print('Test Metrics')
  print(test_preds.metrics)
  
  



In [25]:
tokenizer = tokenize('distilbert-base-cased')

Downloading:   0%|          | 0.00/29.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/411 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/213k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/436k [00:00<?, ?B/s]

In [26]:
metric = evaluate.load('seqeval')

Downloading builder script:   0%|          | 0.00/6.34k [00:00<?, ?B/s]

In [76]:
aug_conll_dataset1

DatasetDict({
    train: Dataset({
        features: ['id', 'tokens', 'pos_tags', 'chunk_tags', 'ner_tags'],
        num_rows: 28082
    })
    validation: Dataset({
        features: ['id', 'tokens', 'pos_tags', 'chunk_tags', 'ner_tags'],
        num_rows: 6500
    })
    test: Dataset({
        features: ['id', 'tokens', 'pos_tags', 'chunk_tags', 'ner_tags'],
        num_rows: 6906
    })
})

In [77]:
aug_conll_dataset2

DatasetDict({
    train: Dataset({
        features: ['id', 'tokens', 'pos_tags', 'chunk_tags', 'ner_tags'],
        num_rows: 28082
    })
    validation: Dataset({
        features: ['id', 'tokens', 'pos_tags', 'chunk_tags', 'ner_tags'],
        num_rows: 6500
    })
    test: Dataset({
        features: ['id', 'tokens', 'pos_tags', 'chunk_tags', 'ner_tags'],
        num_rows: 6906
    })
})

In [30]:
conll_dataset

DatasetDict({
    train: Dataset({
        features: ['id', 'tokens', 'pos_tags', 'chunk_tags', 'ner_tags'],
        num_rows: 14041
    })
    validation: Dataset({
        features: ['id', 'tokens', 'pos_tags', 'chunk_tags', 'ner_tags'],
        num_rows: 3250
    })
    test: Dataset({
        features: ['id', 'tokens', 'pos_tags', 'chunk_tags', 'ner_tags'],
        num_rows: 3453
    })
})

In [78]:
baseline_dataset1=DatasetDict({'train':conll_dataset['train'],'validation':conll_dataset['validation'],'test':aug_conll_dataset1['test']})

In [79]:
baseline_dataset2=DatasetDict({'train':conll_dataset['train'],'validation':conll_dataset['validation'],'test':aug_conll_dataset2['test']})

In [93]:
model, labels, tokenized_datasets, data_collator, label_names = run_model(conll_dataset, tokenizer)
train(model, tokenized_datasets, data_collator)

loading configuration file config.json from cache at /root/.cache/huggingface/hub/models--distilbert-base-cased/snapshots/9d7568e4b20ed5db15ee30e99c7219bde9990762/config.json
Model config DistilBertConfig {
  "_name_or_path": "distilbert-base-cased",
  "activation": "gelu",
  "attention_dropout": 0.1,
  "dim": 768,
  "dropout": 0.1,
  "hidden_dim": 3072,
  "id2label": {
    "0": "O",
    "1": "B-PER",
    "2": "I-PER",
    "3": "B-ORG",
    "4": "I-ORG",
    "5": "B-LOC",
    "6": "I-LOC",
    "7": "B-MISC",
    "8": "I-MISC"
  },
  "initializer_range": 0.02,
  "label2id": {
    "B-LOC": 5,
    "B-MISC": 7,
    "B-ORG": 3,
    "B-PER": 1,
    "I-LOC": 6,
    "I-MISC": 8,
    "I-ORG": 4,
    "I-PER": 2,
    "O": 0
  },
  "max_position_embeddings": 512,
  "model_type": "distilbert",
  "n_heads": 12,
  "n_layers": 6,
  "output_past": true,
  "pad_token_id": 0,
  "qa_dropout": 0.1,
  "seq_classif_dropout": 0.2,
  "sinusoidal_pos_embds": false,
  "tie_weights_": true,
  "transformers_versio

Epoch,Training Loss,Validation Loss,Precision,Recall,F1,Accuracy
1,0.1007,0.084967,0.900796,0.913834,0.907268,0.97579
2,0.0451,0.074674,0.908823,0.932683,0.920598,0.982369
3,0.0258,0.07193,0.914478,0.937563,0.925877,0.98359


***** Running Evaluation *****
  Num examples = 3250
  Batch size = 8
Saving model checkpoint to bert-finetuned-ner/checkpoint-1756
Configuration saved in bert-finetuned-ner/checkpoint-1756/config.json
Model weights saved in bert-finetuned-ner/checkpoint-1756/pytorch_model.bin
tokenizer config file saved in bert-finetuned-ner/checkpoint-1756/tokenizer_config.json
Special tokens file saved in bert-finetuned-ner/checkpoint-1756/special_tokens_map.json
***** Running Evaluation *****
  Num examples = 3250
  Batch size = 8
Saving model checkpoint to bert-finetuned-ner/checkpoint-3512
Configuration saved in bert-finetuned-ner/checkpoint-3512/config.json
Model weights saved in bert-finetuned-ner/checkpoint-3512/pytorch_model.bin
tokenizer config file saved in bert-finetuned-ner/checkpoint-3512/tokenizer_config.json
Special tokens file saved in bert-finetuned-ner/checkpoint-3512/special_tokens_map.json
***** Running Evaluation *****
  Num examples = 3250
  Batch size = 8
Saving model checkpoin

Test Metrics
{'test_loss': 0.18359676003456116, 'test_precision': 0.8645303492709393, 'test_recall': 0.9027974504249292, 'test_f1': 0.8832496102546337, 'test_accuracy': 0.9683427616961598, 'test_runtime': 6.0033, 'test_samples_per_second': 575.186, 'test_steps_per_second': 71.961}


In [85]:
model, labels, tokenized_datasets, data_collator, label_names = run_model(baseline_dataset1, tokenizer)
train(model, tokenized_datasets, data_collator)

loading configuration file config.json from cache at /root/.cache/huggingface/hub/models--distilbert-base-cased/snapshots/9d7568e4b20ed5db15ee30e99c7219bde9990762/config.json
Model config DistilBertConfig {
  "_name_or_path": "distilbert-base-cased",
  "activation": "gelu",
  "attention_dropout": 0.1,
  "dim": 768,
  "dropout": 0.1,
  "hidden_dim": 3072,
  "id2label": {
    "0": "O",
    "1": "B-PER",
    "2": "I-PER",
    "3": "B-ORG",
    "4": "I-ORG",
    "5": "B-LOC",
    "6": "I-LOC",
    "7": "B-MISC",
    "8": "I-MISC"
  },
  "initializer_range": 0.02,
  "label2id": {
    "B-LOC": 5,
    "B-MISC": 7,
    "B-ORG": 3,
    "B-PER": 1,
    "I-LOC": 6,
    "I-MISC": 8,
    "I-ORG": 4,
    "I-PER": 2,
    "O": 0
  },
  "max_position_embeddings": 512,
  "model_type": "distilbert",
  "n_heads": 12,
  "n_layers": 6,
  "output_past": true,
  "pad_token_id": 0,
  "qa_dropout": 0.1,
  "seq_classif_dropout": 0.2,
  "sinusoidal_pos_embds": false,
  "tie_weights_": true,
  "transformers_versio

Epoch,Training Loss,Validation Loss,Precision,Recall,F1,Accuracy
1,0.1007,0.084967,0.900796,0.913834,0.907268,0.97579
2,0.0451,0.074674,0.908823,0.932683,0.920598,0.982369
3,0.0258,0.07193,0.914478,0.937563,0.925877,0.98359


***** Running Evaluation *****
  Num examples = 3250
  Batch size = 8
Saving model checkpoint to bert-finetuned-ner/checkpoint-1756
Configuration saved in bert-finetuned-ner/checkpoint-1756/config.json
Model weights saved in bert-finetuned-ner/checkpoint-1756/pytorch_model.bin
tokenizer config file saved in bert-finetuned-ner/checkpoint-1756/tokenizer_config.json
Special tokens file saved in bert-finetuned-ner/checkpoint-1756/special_tokens_map.json
***** Running Evaluation *****
  Num examples = 3250
  Batch size = 8
Saving model checkpoint to bert-finetuned-ner/checkpoint-3512
Configuration saved in bert-finetuned-ner/checkpoint-3512/config.json
Model weights saved in bert-finetuned-ner/checkpoint-3512/pytorch_model.bin
tokenizer config file saved in bert-finetuned-ner/checkpoint-3512/tokenizer_config.json
Special tokens file saved in bert-finetuned-ner/checkpoint-3512/special_tokens_map.json
***** Running Evaluation *****
  Num examples = 3250
  Batch size = 8
Saving model checkpoin

Test Metrics
{'test_loss': 0.28601646423339844, 'test_precision': 0.7715974415027124, 'test_recall': 0.8436614730878187, 'test_f1': 0.806021905527128, 'test_accuracy': 0.947463182029173, 'test_runtime': 13.3191, 'test_samples_per_second': 518.504, 'test_steps_per_second': 64.869}


In [92]:
model, labels, tokenized_datasets, data_collator, label_names = run_model(baseline_dataset2, tokenizer)
train(model, tokenized_datasets, data_collator)

loading configuration file config.json from cache at /root/.cache/huggingface/hub/models--distilbert-base-cased/snapshots/9d7568e4b20ed5db15ee30e99c7219bde9990762/config.json
Model config DistilBertConfig {
  "_name_or_path": "distilbert-base-cased",
  "activation": "gelu",
  "attention_dropout": 0.1,
  "dim": 768,
  "dropout": 0.1,
  "hidden_dim": 3072,
  "id2label": {
    "0": "O",
    "1": "B-PER",
    "2": "I-PER",
    "3": "B-ORG",
    "4": "I-ORG",
    "5": "B-LOC",
    "6": "I-LOC",
    "7": "B-MISC",
    "8": "I-MISC"
  },
  "initializer_range": 0.02,
  "label2id": {
    "B-LOC": 5,
    "B-MISC": 7,
    "B-ORG": 3,
    "B-PER": 1,
    "I-LOC": 6,
    "I-MISC": 8,
    "I-ORG": 4,
    "I-PER": 2,
    "O": 0
  },
  "max_position_embeddings": 512,
  "model_type": "distilbert",
  "n_heads": 12,
  "n_layers": 6,
  "output_past": true,
  "pad_token_id": 0,
  "qa_dropout": 0.1,
  "seq_classif_dropout": 0.2,
  "sinusoidal_pos_embds": false,
  "tie_weights_": true,
  "transformers_versio

Epoch,Training Loss,Validation Loss,Precision,Recall,F1,Accuracy
1,0.1007,0.084967,0.900796,0.913834,0.907268,0.97579
2,0.0451,0.074674,0.908823,0.932683,0.920598,0.982369
3,0.0258,0.07193,0.914478,0.937563,0.925877,0.98359


***** Running Evaluation *****
  Num examples = 3250
  Batch size = 8
Saving model checkpoint to bert-finetuned-ner/checkpoint-1756
Configuration saved in bert-finetuned-ner/checkpoint-1756/config.json
Model weights saved in bert-finetuned-ner/checkpoint-1756/pytorch_model.bin
tokenizer config file saved in bert-finetuned-ner/checkpoint-1756/tokenizer_config.json
Special tokens file saved in bert-finetuned-ner/checkpoint-1756/special_tokens_map.json
***** Running Evaluation *****
  Num examples = 3250
  Batch size = 8
Saving model checkpoint to bert-finetuned-ner/checkpoint-3512
Configuration saved in bert-finetuned-ner/checkpoint-3512/config.json
Model weights saved in bert-finetuned-ner/checkpoint-3512/pytorch_model.bin
tokenizer config file saved in bert-finetuned-ner/checkpoint-3512/tokenizer_config.json
Special tokens file saved in bert-finetuned-ner/checkpoint-3512/special_tokens_map.json
***** Running Evaluation *****
  Num examples = 3250
  Batch size = 8
Saving model checkpoin

Test Metrics
{'test_loss': 0.4307788014411926, 'test_precision': 0.6537984496124031, 'test_recall': 0.7466359773371105, 'test_f1': 0.6971400231443213, 'test_accuracy': 0.9168529199711608, 'test_runtime': 15.0148, 'test_samples_per_second': 459.946, 'test_steps_per_second': 57.543}


In [86]:
model, labels, tokenized_datasets, data_collator, label_names = run_model(aug_conll_dataset1, tokenizer)
train(model, tokenized_datasets, data_collator)



  0%|          | 0/7 [00:00<?, ?ba/s]

loading configuration file config.json from cache at /root/.cache/huggingface/hub/models--distilbert-base-cased/snapshots/9d7568e4b20ed5db15ee30e99c7219bde9990762/config.json
Model config DistilBertConfig {
  "_name_or_path": "distilbert-base-cased",
  "activation": "gelu",
  "attention_dropout": 0.1,
  "dim": 768,
  "dropout": 0.1,
  "hidden_dim": 3072,
  "id2label": {
    "0": "O",
    "1": "B-PER",
    "2": "I-PER",
    "3": "B-ORG",
    "4": "I-ORG",
    "5": "B-LOC",
    "6": "I-LOC",
    "7": "B-MISC",
    "8": "I-MISC"
  },
  "initializer_range": 0.02,
  "label2id": {
    "B-LOC": 5,
    "B-MISC": 7,
    "B-ORG": 3,
    "B-PER": 1,
    "I-LOC": 6,
    "I-MISC": 8,
    "I-ORG": 4,
    "I-PER": 2,
    "O": 0
  },
  "max_position_embeddings": 512,
  "model_type": "distilbert",
  "n_heads": 12,
  "n_layers": 6,
  "output_past": true,
  "pad_token_id": 0,
  "qa_dropout": 0.1,
  "seq_classif_dropout": 0.2,
  "sinusoidal_pos_embds": false,
  "tie_weights_": true,
  "transformers_versio

Epoch,Training Loss,Validation Loss,Precision,Recall,F1,Accuracy
1,0.0667,0.080526,0.889214,0.919892,0.904293,0.977918
2,0.0312,0.082216,0.906679,0.928728,0.917571,0.980526
3,0.0137,0.088775,0.906923,0.93142,0.919009,0.981095


***** Running Evaluation *****
  Num examples = 6500
  Batch size = 8
Saving model checkpoint to bert-finetuned-ner/checkpoint-3511
Configuration saved in bert-finetuned-ner/checkpoint-3511/config.json
Model weights saved in bert-finetuned-ner/checkpoint-3511/pytorch_model.bin
tokenizer config file saved in bert-finetuned-ner/checkpoint-3511/tokenizer_config.json
Special tokens file saved in bert-finetuned-ner/checkpoint-3511/special_tokens_map.json
***** Running Evaluation *****
  Num examples = 6500
  Batch size = 8
Saving model checkpoint to bert-finetuned-ner/checkpoint-7022
Configuration saved in bert-finetuned-ner/checkpoint-7022/config.json
Model weights saved in bert-finetuned-ner/checkpoint-7022/pytorch_model.bin
tokenizer config file saved in bert-finetuned-ner/checkpoint-7022/tokenizer_config.json
Special tokens file saved in bert-finetuned-ner/checkpoint-7022/special_tokens_map.json
***** Running Evaluation *****
  Num examples = 6500
  Batch size = 8
Saving model checkpoin

Test Metrics
{'test_loss': 0.2572144567966461, 'test_precision': 0.8525532634426783, 'test_recall': 0.8927053824362606, 'test_f1': 0.8721674450787061, 'test_accuracy': 0.9637868707186361, 'test_runtime': 13.3691, 'test_samples_per_second': 516.566, 'test_steps_per_second': 64.627}


In [87]:
model, labels, tokenized_datasets, data_collator, label_names = run_model(aug_conll_dataset2, tokenizer)
train(model, tokenized_datasets, data_collator)



  0%|          | 0/7 [00:00<?, ?ba/s]

loading configuration file config.json from cache at /root/.cache/huggingface/hub/models--distilbert-base-cased/snapshots/9d7568e4b20ed5db15ee30e99c7219bde9990762/config.json
Model config DistilBertConfig {
  "_name_or_path": "distilbert-base-cased",
  "activation": "gelu",
  "attention_dropout": 0.1,
  "dim": 768,
  "dropout": 0.1,
  "hidden_dim": 3072,
  "id2label": {
    "0": "O",
    "1": "B-PER",
    "2": "I-PER",
    "3": "B-ORG",
    "4": "I-ORG",
    "5": "B-LOC",
    "6": "I-LOC",
    "7": "B-MISC",
    "8": "I-MISC"
  },
  "initializer_range": 0.02,
  "label2id": {
    "B-LOC": 5,
    "B-MISC": 7,
    "B-ORG": 3,
    "B-PER": 1,
    "I-LOC": 6,
    "I-MISC": 8,
    "I-ORG": 4,
    "I-PER": 2,
    "O": 0
  },
  "max_position_embeddings": 512,
  "model_type": "distilbert",
  "n_heads": 12,
  "n_layers": 6,
  "output_past": true,
  "pad_token_id": 0,
  "qa_dropout": 0.1,
  "seq_classif_dropout": 0.2,
  "sinusoidal_pos_embds": false,
  "tie_weights_": true,
  "transformers_versio

Epoch,Training Loss,Validation Loss,Precision,Recall,F1,Accuracy
1,0.0848,0.096937,0.857006,0.900202,0.878073,0.973224
2,0.0482,0.087414,0.878539,0.919051,0.898339,0.977252
3,0.0191,0.095613,0.884981,0.923258,0.903715,0.978339


***** Running Evaluation *****
  Num examples = 6500
  Batch size = 8
Saving model checkpoint to bert-finetuned-ner/checkpoint-3511
Configuration saved in bert-finetuned-ner/checkpoint-3511/config.json
Model weights saved in bert-finetuned-ner/checkpoint-3511/pytorch_model.bin
tokenizer config file saved in bert-finetuned-ner/checkpoint-3511/tokenizer_config.json
Special tokens file saved in bert-finetuned-ner/checkpoint-3511/special_tokens_map.json
***** Running Evaluation *****
  Num examples = 6500
  Batch size = 8
Saving model checkpoint to bert-finetuned-ner/checkpoint-7022
Configuration saved in bert-finetuned-ner/checkpoint-7022/config.json
Model weights saved in bert-finetuned-ner/checkpoint-7022/pytorch_model.bin
tokenizer config file saved in bert-finetuned-ner/checkpoint-7022/tokenizer_config.json
Special tokens file saved in bert-finetuned-ner/checkpoint-7022/special_tokens_map.json
***** Running Evaluation *****
  Num examples = 6500
  Batch size = 8
Saving model checkpoin

Test Metrics
{'test_loss': 0.24655063450336456, 'test_precision': 0.8347731275685649, 'test_recall': 0.8811083569405099, 'test_f1': 0.8573151298505534, 'test_accuracy': 0.9605203076183609, 'test_runtime': 15.1776, 'test_samples_per_second': 455.013, 'test_steps_per_second': 56.926}


In [88]:
rev_baseline_dataset1=DatasetDict({'train':aug_conll_dataset1['train'],'validation':aug_conll_dataset1['validation'],'test':conll_dataset['test']})

In [90]:
model, labels, tokenized_datasets, data_collator, label_names = run_model(rev_baseline_dataset1, tokenizer)
train(model, tokenized_datasets, data_collator)



  0%|          | 0/4 [00:00<?, ?ba/s]

loading configuration file config.json from cache at /root/.cache/huggingface/hub/models--distilbert-base-cased/snapshots/9d7568e4b20ed5db15ee30e99c7219bde9990762/config.json
Model config DistilBertConfig {
  "_name_or_path": "distilbert-base-cased",
  "activation": "gelu",
  "attention_dropout": 0.1,
  "dim": 768,
  "dropout": 0.1,
  "hidden_dim": 3072,
  "id2label": {
    "0": "O",
    "1": "B-PER",
    "2": "I-PER",
    "3": "B-ORG",
    "4": "I-ORG",
    "5": "B-LOC",
    "6": "I-LOC",
    "7": "B-MISC",
    "8": "I-MISC"
  },
  "initializer_range": 0.02,
  "label2id": {
    "B-LOC": 5,
    "B-MISC": 7,
    "B-ORG": 3,
    "B-PER": 1,
    "I-LOC": 6,
    "I-MISC": 8,
    "I-ORG": 4,
    "I-PER": 2,
    "O": 0
  },
  "max_position_embeddings": 512,
  "model_type": "distilbert",
  "n_heads": 12,
  "n_layers": 6,
  "output_past": true,
  "pad_token_id": 0,
  "qa_dropout": 0.1,
  "seq_classif_dropout": 0.2,
  "sinusoidal_pos_embds": false,
  "tie_weights_": true,
  "transformers_versio

Epoch,Training Loss,Validation Loss,Precision,Recall,F1,Accuracy
1,0.0667,0.080526,0.889214,0.919892,0.904293,0.977918
2,0.0312,0.082216,0.906679,0.928728,0.917571,0.980526
3,0.0137,0.088775,0.906923,0.93142,0.919009,0.981095


***** Running Evaluation *****
  Num examples = 6500
  Batch size = 8
Saving model checkpoint to bert-finetuned-ner/checkpoint-3511
Configuration saved in bert-finetuned-ner/checkpoint-3511/config.json
Model weights saved in bert-finetuned-ner/checkpoint-3511/pytorch_model.bin
tokenizer config file saved in bert-finetuned-ner/checkpoint-3511/tokenizer_config.json
Special tokens file saved in bert-finetuned-ner/checkpoint-3511/special_tokens_map.json
***** Running Evaluation *****
  Num examples = 6500
  Batch size = 8
Saving model checkpoint to bert-finetuned-ner/checkpoint-7022
Configuration saved in bert-finetuned-ner/checkpoint-7022/config.json
Model weights saved in bert-finetuned-ner/checkpoint-7022/pytorch_model.bin
tokenizer config file saved in bert-finetuned-ner/checkpoint-7022/tokenizer_config.json
Special tokens file saved in bert-finetuned-ner/checkpoint-7022/special_tokens_map.json
***** Running Evaluation *****
  Num examples = 6500
  Batch size = 8
Saving model checkpoin

Test Metrics
{'test_loss': 0.23021858930587769, 'test_precision': 0.8699062233589088, 'test_recall': 0.9033286118980169, 'test_f1': 0.886302440719187, 'test_accuracy': 0.9674288145475174, 'test_runtime': 5.957, 'test_samples_per_second': 579.658, 'test_steps_per_second': 72.52}


In [89]:
rev_baseline_dataset2=DatasetDict({'train':aug_conll_dataset2['train'],'validation':aug_conll_dataset2['validation'],'test':conll_dataset['test']})

In [91]:
model, labels, tokenized_datasets, data_collator, label_names = run_model(rev_baseline_dataset2, tokenizer)
train(model, tokenized_datasets, data_collator)

loading configuration file config.json from cache at /root/.cache/huggingface/hub/models--distilbert-base-cased/snapshots/9d7568e4b20ed5db15ee30e99c7219bde9990762/config.json
Model config DistilBertConfig {
  "_name_or_path": "distilbert-base-cased",
  "activation": "gelu",
  "attention_dropout": 0.1,
  "dim": 768,
  "dropout": 0.1,
  "hidden_dim": 3072,
  "id2label": {
    "0": "O",
    "1": "B-PER",
    "2": "I-PER",
    "3": "B-ORG",
    "4": "I-ORG",
    "5": "B-LOC",
    "6": "I-LOC",
    "7": "B-MISC",
    "8": "I-MISC"
  },
  "initializer_range": 0.02,
  "label2id": {
    "B-LOC": 5,
    "B-MISC": 7,
    "B-ORG": 3,
    "B-PER": 1,
    "I-LOC": 6,
    "I-MISC": 8,
    "I-ORG": 4,
    "I-PER": 2,
    "O": 0
  },
  "max_position_embeddings": 512,
  "model_type": "distilbert",
  "n_heads": 12,
  "n_layers": 6,
  "output_past": true,
  "pad_token_id": 0,
  "qa_dropout": 0.1,
  "seq_classif_dropout": 0.2,
  "sinusoidal_pos_embds": false,
  "tie_weights_": true,
  "transformers_versio

Epoch,Training Loss,Validation Loss,Precision,Recall,F1,Accuracy
1,0.0848,0.096937,0.857006,0.900202,0.878073,0.973224
2,0.0482,0.087414,0.878539,0.919051,0.898339,0.977252
3,0.0191,0.095613,0.884981,0.923258,0.903715,0.978339


***** Running Evaluation *****
  Num examples = 6500
  Batch size = 8
Saving model checkpoint to bert-finetuned-ner/checkpoint-3511
Configuration saved in bert-finetuned-ner/checkpoint-3511/config.json
Model weights saved in bert-finetuned-ner/checkpoint-3511/pytorch_model.bin
tokenizer config file saved in bert-finetuned-ner/checkpoint-3511/tokenizer_config.json
Special tokens file saved in bert-finetuned-ner/checkpoint-3511/special_tokens_map.json
***** Running Evaluation *****
  Num examples = 6500
  Batch size = 8
Saving model checkpoint to bert-finetuned-ner/checkpoint-7022
Configuration saved in bert-finetuned-ner/checkpoint-7022/config.json
Model weights saved in bert-finetuned-ner/checkpoint-7022/pytorch_model.bin
tokenizer config file saved in bert-finetuned-ner/checkpoint-7022/tokenizer_config.json
Special tokens file saved in bert-finetuned-ner/checkpoint-7022/special_tokens_map.json
***** Running Evaluation *****
  Num examples = 6500
  Batch size = 8
Saving model checkpoin

Test Metrics
{'test_loss': 0.20859140157699585, 'test_precision': 0.8696023212152244, 'test_recall': 0.9020892351274787, 'test_f1': 0.8855479273485704, 'test_accuracy': 0.9680276075069728, 'test_runtime': 5.999, 'test_samples_per_second': 575.593, 'test_steps_per_second': 72.012}
