# Installetions, imports and global variables

In [1]:
! pip install datasets transformers seqeval

Collecting datasets
  Downloading datasets-1.18.4-py3-none-any.whl (312 kB)
[K     |████████████████████████████████| 312 kB 12.3 MB/s 
[?25hCollecting transformers
  Downloading transformers-4.17.0-py3-none-any.whl (3.8 MB)
[K     |████████████████████████████████| 3.8 MB 47.6 MB/s 
[?25hCollecting seqeval
  Downloading seqeval-1.2.2.tar.gz (43 kB)
[K     |████████████████████████████████| 43 kB 1.8 MB/s 
Collecting aiohttp
  Downloading aiohttp-3.8.1-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl (1.1 MB)
[K     |████████████████████████████████| 1.1 MB 24.5 MB/s 
Collecting xxhash
  Downloading xxhash-3.0.0-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (212 kB)
[K     |████████████████████████████████| 212 kB 36.7 MB/s 
Collecting fsspec[http]>=2021.05.0
  Downloading fsspec-2022.2.0-py3-none-any.whl (134 kB)
[K     |████████████████████████████████| 134 kB 40.9 MB/s 
[?25hCollecting huggingface-hub<1.0.0,>=0.1.0

In [2]:
! pip install adapter_transformers

Collecting adapter_transformers
  Downloading adapter_transformers-2.3.0-py3-none-any.whl (3.2 MB)
[K     |████████████████████████████████| 3.2 MB 8.2 MB/s 
Collecting tokenizers<0.11,>=0.10.1
  Downloading tokenizers-0.10.3-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl (3.3 MB)
[K     |████████████████████████████████| 3.3 MB 47.4 MB/s 
Installing collected packages: tokenizers, adapter-transformers
  Attempting uninstall: tokenizers
    Found existing installation: tokenizers 0.11.6
    Uninstalling tokenizers-0.11.6:
      Successfully uninstalled tokenizers-0.11.6
[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
transformers 4.17.0 requires tokenizers!=0.11.3,>=0.11.1, but you have tokenizers 0.10.3 which is incompatible.[0m
Successfully installed adapter-transformers-2.3.0 tokenizers-0.10.3


In [3]:
from re import template
from pathlib import Path
import random
import pandas as pd
import numpy as np
import torch
import transformers
from transformers import AutoTokenizer
from transformers import TrainingArguments, Trainer
from transformers import AutoTokenizer, AutoModelForTokenClassification
from transformers import DataCollatorForTokenClassification
from transformers import pipeline
from datasets import load_dataset, load_metric
from datasets import DatasetDict, Dataset
from sklearn.metrics import f1_score, accuracy_score


In [4]:
random.seed(42)
np.random.seed(42)
torch.manual_seed(42)
torch.cuda.manual_seed_all(42)
torch.backends.cudnn.deterministic = True
torch.backends.cudnn.benchmark = False

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
CUDA_LAUNCH_BLOCKING = "1"
OUT_PATH = Path("results")

TOKENIZER = "bert-base-uncased"
MODEL = "bert-base-uncased"

#Part 1 - Load data and preprocess 

## Load data

In [5]:
from re import template

with open('Alice_book') as f:
    alice_book = f.readlines()

alice_book = alice_book[40:-2]

for line in alice_book:
  if '*' in line or 'CHAPTER' in line:
    alice_book.remove(line)

tmp_paragraph = []
paragraphs_list = []
for i, line in enumerate(alice_book):
  if line != '\n':
    tmp_paragraph.append(line[:-1])
  else:
    if tmp_paragraph:
      tmp_paragraph = ' '.join(tmp_paragraph)
      paragraphs_list.append(tmp_paragraph)
    tmp_paragraph = []

## Tagging train set

In [6]:
label_list = [
    "O",       # Outside of a named entity
    "B-PER",   # Beginning of a person's name right after another person's name
    "I-PER",   # Person's name
    "B-ORG",   # Beginning of an organisation right after another organisation
    "I-ORG",   # Organisation
    "B-LOC",   # Beginning of a location right after another location
    "I-LOC",   # Location
    "B-MISC",  # Beginning of a miscellaneous entity right after another miscellaneous entity
    "I-MISC"   # Miscellaneous entity
]
ner_tags_map = {'O': 0, 'B-PER': 1, 'I-PER': 2, 'B-ORG': 3, 'I-ORG': 4, 'B-LOC': 5, 'I-LOC': 6, 'B-MISC': 7, 'I-MISC': 8}

In [7]:
tokens_list = []
for i in range(5):
  tokens_list.append(paragraphs_list[i].split())

In [8]:
ner_tags_0 = ['O']*len(tokens_list[0])
ner_tags_0[0] = 'B-PER'
ner_tags_0[52] = 'B-PER'
ner_tags_0[11] = 'B-PER'
ner_tags_0[31] = 'B-PER'
ner_tags_0[14] = 'B-LOC'
ner_tags_0[29] = 'B-MISC'
ner_tags_0[-7] = 'B-MISC'
ner_tags_0[38] = 'B-MISC'
ner_tags_0[54] = 'B-MISC'
ner_tags_0[40] = 'B-MISC'
ner_tags_0[-1] = 'B-MISC'

ner_nums_0 = []
for ner_tag in ner_tags_0:
  ner_nums_0.append(ner_tags_map[ner_tag])

In [9]:
ner_tags_1 = ['O']*len(tokens_list[1])
ner_tags_1[7] = 'B-MISC'
ner_tags_1[16] = 'B-MISC' 
ner_tags_1[30] = 'B-MISC'
ner_tags_1[42] = 'B-MISC'
ner_tags_1[47] = 'B-MISC'
ner_tags_1[50] = 'B-MISC'

ner_nums_1 = []
for ner_tag in ner_tags_1:
  ner_nums_1.append(ner_tags_map[ner_tag])

In [10]:
ner_tags_2 = ['O']*len(tokens_list[2])
ner_tags_2[10] = 'B-PER'
ner_tags_2[82] = 'B-PER'
ner_tags_2[23] = 'B-MISC' 
ner_tags_2[65] = 'B-MISC'
ner_tags_2[69] = 'B-MISC'
ner_tags_2[73] = 'B-MISC'
ner_tags_2[92] = 'B-MISC'
ner_tags_2[100] = 'B-MISC'
ner_tags_2[104] = 'B-MISC'
ner_tags_2[107] = 'B-MISC'
ner_tags_2[121] = 'B-LOC'
ner_tags_2[137] = 'B-LOC'
ner_tags_2[140] = 'B-MISC'

ner_nums_2 = []
for ner_tag in ner_tags_2:
  ner_nums_2.append(ner_tags_map[ner_tag])

In [11]:
ner_tags_3 = ['O']*len(tokens_list[3])
ner_tags_3[5] = 'B-PER'
ner_tags_3[14] = 'B-MISC'

ner_nums_3 = []
for ner_tag in ner_tags_3:
  ner_nums_3.append(ner_tags_map[ner_tag])

In [12]:
ner_tags_4 = ['O']*len(tokens_list[4])
ner_tags_4[1] = 'B-LOC'
ner_tags_4[7] = 'B-LOC'
ner_tags_4[19] = 'B-PER'

ner_nums_4 = []
for ner_tag in ner_tags_4:
  ner_nums_4.append(ner_tags_map[ner_tag])

In [13]:
ner_tags = [ner_tags_0, ner_tags_1, ner_tags_2, ner_tags_3, ner_tags_4]
ner_nums = [ner_nums_0, ner_nums_1, ner_nums_2, ner_nums_3, ner_nums_4]

## Create the dataset

In [14]:
paragraph_dict = {'train': {'id': [], 'tokens': [], 'ner_tags': []}, 
                  'test': {'id': [], 'tokens': [], 'ner_tags': []}}
for i, paragraph in enumerate(paragraphs_list):
  if i <= 4:
    paragraph_dict['train']['id'].append(str(i))
    paragraph_dict['train']['tokens'].append(paragraph.split())
    paragraph_dict['train']['ner_tags'].append(ner_nums[i])
  else:
    paragraph_dict['test']['id'].append(str(i))
    paragraph_dict['test']['tokens'].append(paragraph.split())
    paragraph_dict['test']['ner_tags'].append([]*len(paragraph))

In [15]:
from datasets import DatasetDict, Dataset

dataset = DatasetDict()
# using your `Dict` object
for k,v in paragraph_dict.items():
    dataset[k] = Dataset.from_dict(v)

# Part 2 - Training and predicting

## Fine tuning

#### We will fine tuned the bert-base model on conll2003 dataset for NER task

In [16]:
def tokenize_and_align_labels(examples):
    tokenized_inputs = tokenizer(examples["tokens"], truncation=True, is_split_into_words=True)

    labels = []
    for i, label in enumerate(examples[f"ner_tags"]):
        word_ids = tokenized_inputs.word_ids(batch_index=i)  # Map tokens to their respective word.
        previous_word_idx = None
        label_ids = []
        for word_idx in word_ids:  # Set the special tokens to -100.
            if word_idx is None:
                label_ids.append(-100)
            elif word_idx != previous_word_idx:  # Only label the first token of a given word.
                label_ids.append(label[word_idx])
            else:
                label_ids.append(label[word_idx] if label_all_tokens else -100)
            previous_word_idx = word_idx
        labels.append(label_ids)

    tokenized_inputs["labels"] = labels
    return tokenized_inputs

In [17]:
metric = load_metric("seqeval")

def compute_metrics(p):
    predictions, labels = p
    predictions = np.argmax(predictions, axis=2)

    # Remove ignored index (special tokens)
    true_predictions = [
        [label_list[p] for (p, l) in zip(prediction, label) if l != -100]
        for prediction, label in zip(predictions, labels)
    ]
    true_labels = [
        [label_list[l] for (p, l) in zip(prediction, label) if l != -100]
        for prediction, label in zip(predictions, labels)
    ]

    results = metric.compute(predictions=true_predictions, references=true_labels)
    return {
        "precision": results["overall_precision"],
        "recall": results["overall_recall"],
        "f1": results["overall_f1"],
        "accuracy": results["overall_accuracy"],
    }

Downloading:   0%|          | 0.00/2.47k [00:00<?, ?B/s]

In [18]:
conll = load_dataset("conll2003")
tokenizer = AutoTokenizer.from_pretrained(TOKENIZER)
assert isinstance(tokenizer, transformers.PreTrainedTokenizerFast)

label_all_tokens = True

label_list = conll["train"].features[f"ner_tags"].feature.names

tokenized_conll = conll.map(tokenize_and_align_labels, batched=True)
data_collator = DataCollatorForTokenClassification(tokenizer)

model = AutoModelForTokenClassification.from_pretrained(MODEL, num_labels=len(label_list))

training_args = TrainingArguments(
    output_dir="./results",
    evaluation_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=5,
    weight_decay=0.01,
    do_train=True
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_conll["train"],
    eval_dataset=tokenized_conll["test"],
    data_collator=data_collator,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
)

trainer.train()
model.save_pretrained('./Fine_tune_BERT/')

Downloading:   0%|          | 0.00/2.58k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.62k [00:00<?, ?B/s]

Downloading and preparing dataset conll2003/conll2003 (download: 959.94 KiB, generated: 9.78 MiB, post-processed: Unknown size, total: 10.72 MiB) to /root/.cache/huggingface/datasets/conll2003/conll2003/1.0.0/63f4ebd1bcb7148b1644497336fd74643d4ce70123334431a3c053b7ee4e96ee...


Downloading:   0%|          | 0.00/983k [00:00<?, ?B/s]

0 examples [00:00, ? examples/s]

0 examples [00:00, ? examples/s]

0 examples [00:00, ? examples/s]

Dataset conll2003 downloaded and prepared to /root/.cache/huggingface/datasets/conll2003/conll2003/1.0.0/63f4ebd1bcb7148b1644497336fd74643d4ce70123334431a3c053b7ee4e96ee. Subsequent calls will reuse this data.


  0%|          | 0/3 [00:00<?, ?it/s]

Downloading:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/570 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/226k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/455k [00:00<?, ?B/s]

  0%|          | 0/15 [00:00<?, ?ba/s]

  0%|          | 0/4 [00:00<?, ?ba/s]

  0%|          | 0/4 [00:00<?, ?ba/s]

Downloading:   0%|          | 0.00/420M [00:00<?, ?B/s]

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForTokenClassification: ['cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.weight', 'cls.seq_relationship.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.bias', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BertForTokenClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForTokenClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForTokenClassification were not initialized from the model checkpoint at bert-base-u

Epoch,Training Loss,Validation Loss,Precision,Recall,F1,Accuracy
1,0.2146,0.113388,0.885857,0.882388,0.884119,0.974294
2,0.0493,0.120573,0.891411,0.904106,0.897714,0.97583
3,0.0252,0.128727,0.890478,0.901258,0.895836,0.975744
4,0.0154,0.139971,0.894953,0.906955,0.900914,0.9764
5,0.0103,0.149178,0.89356,0.905649,0.899564,0.976366


Saving model checkpoint to ./results/checkpoint-500
Configuration saved in ./results/checkpoint-500/config.json
Model weights saved in ./results/checkpoint-500/pytorch_model.bin
tokenizer config file saved in ./results/checkpoint-500/tokenizer_config.json
Special tokens file saved in ./results/checkpoint-500/special_tokens_map.json
The following columns in the evaluation set  don't have a corresponding argument in `BertForTokenClassification.forward` and have been ignored: tokens, pos_tags, chunk_tags, id, ner_tags.
***** Running Evaluation *****
  Num examples = 3454
  Batch size = 16
Saving model checkpoint to ./results/checkpoint-1000
Configuration saved in ./results/checkpoint-1000/config.json
Model weights saved in ./results/checkpoint-1000/pytorch_model.bin
tokenizer config file saved in ./results/checkpoint-1000/tokenizer_config.json
Special tokens file saved in ./results/checkpoint-1000/special_tokens_map.json
Saving model checkpoint to ./results/checkpoint-1500
Configuration s

## Training

#### We will add another fine tuning specific for NER task on our data

In [20]:
# run this cell again because the previous cell overrides the dataset
from datasets import DatasetDict, Dataset

dataset = DatasetDict()
# using your `Dict` object
for k,v in paragraph_dict.items():
    dataset[k] = Dataset.from_dict(v)

In [21]:
tokenizer = AutoTokenizer.from_pretrained(TOKENIZER)
assert isinstance(tokenizer, transformers.PreTrainedTokenizerFast)

label_all_tokens = True

tokenized_dataset = dataset['train'].map(tokenize_and_align_labels, batched=True)
data_collator = DataCollatorForTokenClassification(tokenizer)

model = AutoModelForTokenClassification.from_pretrained('./Fine_tune_BERT/', num_labels=len(label_list))

training_args = TrainingArguments(
    output_dir="./results",
    evaluation_strategy="no",
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=50,
    weight_decay=0.01,
    do_train=True
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset,
    data_collator=data_collator,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
)

trainer.train()

model.save_pretrained('./Fine_tune_BERT/')
# tokenizer.save_pretrained('./Fine_tune_BERT/')

loading configuration file https://huggingface.co/bert-base-uncased/resolve/main/config.json from cache at /root/.cache/huggingface/transformers/3c61d016573b14f7f008c02c4e51a366c67ab274726fe2910691e2a761acf43e.37395cee442ab11005bcd270f3c34464dc1704b715b5d7d52b1a461abe3b9e4e
Model config BertConfig {
  "adapters": {
    "adapters": {},
    "config_map": {},
    "fusion_config_map": {},
    "fusions": {}
  },
  "architectures": [
    "BertForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "classifier_dropout": null,
  "gradient_checkpointing": false,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "model_type": "bert",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "pad_token_id": 0,
  "position_embedding_type": "absolute",
  "transformers_version": "4.12.5",
  "type_vocab_size": 2,
  "use_cache": true,
  "vocab_size": 305

  0%|          | 0/1 [00:00<?, ?ba/s]

loading configuration file ./Fine_tune_BERT/config.json
Model config BertConfig {
  "_name_or_path": "bert-base-uncased",
  "adapters": {
    "adapters": {},
    "config_map": {},
    "fusion_config_map": {},
    "fusions": {}
  },
  "architectures": [
    "BertForTokenClassification"
  ],
  "attention_probs_dropout_prob": 0.1,
  "classifier_dropout": null,
  "gradient_checkpointing": false,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "id2label": {
    "0": "LABEL_0",
    "1": "LABEL_1",
    "2": "LABEL_2",
    "3": "LABEL_3",
    "4": "LABEL_4",
    "5": "LABEL_5",
    "6": "LABEL_6",
    "7": "LABEL_7",
    "8": "LABEL_8"
  },
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "label2id": {
    "LABEL_0": 0,
    "LABEL_1": 1,
    "LABEL_2": 2,
    "LABEL_3": 3,
    "LABEL_4": 4,
    "LABEL_5": 5,
    "LABEL_6": 6,
    "LABEL_7": 7,
    "LABEL_8": 8
  },
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "model_type": "bert",
  "n

Step,Training Loss




Training completed. Do not forget to share your model on huggingface.co/models =)


Configuration saved in ./Fine_tune_BERT/config.json
Model weights saved in ./Fine_tune_BERT/pytorch_model.bin


## Predicting

In [22]:
model = AutoModelForTokenClassification.from_pretrained('./Fine_tune_BERT/', num_labels=len(label_list))
tokenizer = AutoTokenizer.from_pretrained(TOKENIZER)

results = pd.DataFrame(columns = ['tokens', 'ner_tags'])

for i, paragraph in enumerate(paragraphs_list):
  tokens = tokenizer(paragraph)

  torch.tensor(tokens['input_ids']).unsqueeze(0).size()

  preds = model.forward(input_ids=torch.tensor(tokens['input_ids']).unsqueeze(0), 
              attention_mask=torch.tensor(tokens['attention_mask']).unsqueeze(0))
  preds = torch.argmax(preds.logits.squeeze(), axis=1)
  words = tokenizer.batch_decode(tokens['input_ids'])
  value_preds = [label_list[i] for i in preds]

  tmp = pd.DataFrame({'tokens': words, 'ner_tags': value_preds})
  results = results.append(tmp, ignore_index=True)

results

loading configuration file ./Fine_tune_BERT/config.json
Model config BertConfig {
  "_name_or_path": "./Fine_tune_BERT/",
  "adapters": {
    "adapters": {},
    "config_map": {},
    "fusion_config_map": {},
    "fusions": {}
  },
  "architectures": [
    "BertForTokenClassification"
  ],
  "attention_probs_dropout_prob": 0.1,
  "classifier_dropout": null,
  "gradient_checkpointing": false,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "id2label": {
    "0": "LABEL_0",
    "1": "LABEL_1",
    "2": "LABEL_2",
    "3": "LABEL_3",
    "4": "LABEL_4",
    "5": "LABEL_5",
    "6": "LABEL_6",
    "7": "LABEL_7",
    "8": "LABEL_8"
  },
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "label2id": {
    "LABEL_0": 0,
    "LABEL_1": 1,
    "LABEL_2": 2,
    "LABEL_3": 3,
    "LABEL_4": 4,
    "LABEL_5": 5,
    "LABEL_6": 6,
    "LABEL_7": 7,
    "LABEL_8": 8
  },
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "model_type": "bert",
  "n

Unnamed: 0,tokens,ner_tags
0,[CLS],O
1,alice,B-PER
2,was,O
3,beginning,O
4,to,O
...,...,...
37990,s,O
37991,heavy,O
37992,sobs,O
37993,.,O


In [23]:
results.to_csv('Alice_results.csv')