<a href="https://colab.research.google.com/github/samira20494/bigscience-pii-farsi/blob/module3-model-training/src/module3/notebook/fine_tuning.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install datasets
!pip install transformers
!pip install seqeval

Collecting datasets
  Downloading datasets-1.17.0-py3-none-any.whl (306 kB)
[K     |████████████████████████████████| 306 kB 5.5 MB/s 
Collecting huggingface-hub<1.0.0,>=0.1.0
  Downloading huggingface_hub-0.2.1-py3-none-any.whl (61 kB)
[K     |████████████████████████████████| 61 kB 441 kB/s 
Collecting xxhash
  Downloading xxhash-2.0.2-cp37-cp37m-manylinux2010_x86_64.whl (243 kB)
[K     |████████████████████████████████| 243 kB 44.7 MB/s 
[?25hCollecting aiohttp
  Downloading aiohttp-3.8.1-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl (1.1 MB)
[K     |████████████████████████████████| 1.1 MB 46.8 MB/s 
Collecting fsspec[http]>=2021.05.0
  Downloading fsspec-2021.11.1-py3-none-any.whl (132 kB)
[K     |████████████████████████████████| 132 kB 45.4 MB/s 
Collecting asynctest==0.13.0
  Downloading asynctest-0.13.0-py3-none-any.whl (26 kB)
Collecting yarl<2.0,>=1.0
  Downloading yarl-1.7.2-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_

In [None]:
import itertools
import os

import pandas as pd
from datasets import Dataset, load_metric
from transformers import AutoTokenizer
from transformers import AutoModelForTokenClassification, TrainingArguments, Trainer
from transformers import DataCollatorForTokenClassification
import numpy as np

# Colab-only step: mount Google Drive at /content/gdrive. The dataset directory read during
# preprocessing and the directory the fine-tuned model is saved to both live under this mount.
from google.colab import drive
drive.mount('/content/gdrive')

Drive already mounted at /content/gdrive; to attempt to forcibly remount, call drive.mount("/content/gdrive", force_remount=True).


In [None]:
def get_all_tokens_and_ner_tags(directory):
    """Load every annotation file in ``directory`` into one DataFrame.

    Each regular file is parsed with ``get_tokens_and_ner_tags`` and the per-file
    frames are stacked into a single frame with a fresh 0..n-1 index.

    Args:
        directory: Path of the folder holding the annotation files.

    Returns:
        pandas.DataFrame with the columns ``tokens`` and ``ner_tags``
        (one row per sentence).

    Raises:
        ValueError: Propagated from ``pd.concat`` when the directory contains
            no files to load.
    """
    # os.listdir() returns entries in arbitrary, filesystem-dependent order. Sorting makes
    # the row order -- and therefore any seeded sampling done on the result -- reproducible.
    paths = [os.path.join(directory, name) for name in sorted(os.listdir(directory))]
    # Skip sub-directories (e.g. checkpoint folders created by the notebook UI);
    # passing them to open() would raise.
    frames = [get_tokens_and_ner_tags(path) for path in paths if os.path.isfile(path)]
    return pd.concat(frames).reset_index(drop=True)


def get_tokens_and_ner_tags(filename):
    """Parse one space-separated ``token tag`` file into per-sentence lists.

    The file is expected to hold one ``<token> <tag>`` pair per line, with
    sentences separated by blank lines.

    Args:
        filename: Path of the UTF-8 encoded annotation file.

    Returns:
        pandas.DataFrame with one row per sentence and two columns:
        ``tokens`` (list of str) and ``ner_tags`` (list of str).

    Raises:
        IndexError: If a non-blank line does not contain a space-separated tag.
    """
    with open(filename, 'r', encoding="utf8") as f:
        # Strip only the line terminator. Slicing off the last character instead would
        # truncate the final tag whenever the file does not end with a newline.
        lines = [line.rstrip('\r\n') for line in f]

    # Consecutive non-blank lines form one sentence; blank (or whitespace-only)
    # lines act purely as separators and are discarded.
    sentences = [list(group)
                 for is_blank, group in itertools.groupby(lines, lambda line: not line.strip())
                 if not is_blank]
    tokens = [[line.split(' ')[0] for line in sentence] for sentence in sentences]
    entities = [[line.split(' ')[1] for line in sentence] for sentence in sentences]
    return pd.DataFrame({'tokens': tokens, 'ner_tags': entities})


def get_token_dataset(directory, train_frac=0.8, random_state=25):
    """Load all annotated sentences and split them into train and test datasets.

    Args:
        directory: Folder containing the annotation files.
        train_frac: Fraction of sentences sampled into the training split;
            the remaining sentences form the test split. Defaults to 0.8.
        random_state: Seed passed to ``DataFrame.sample`` so the split is
            repeatable. Defaults to 25.

    Returns:
        Tuple ``(train_dataset, test_dataset)`` of ``datasets.Dataset`` objects.

    Raises:
        ValueError: If ``train_frac`` is not strictly between 0 and 1.
    """
    if not 0 < train_frac < 1:
        raise ValueError(f"train_frac must be in (0, 1), got {train_frac!r}")

    df = get_all_tokens_and_ner_tags(directory)
    train_df = df.sample(frac=train_frac, random_state=random_state)
    # Everything that was not sampled for training becomes the test split.
    test_df = df.drop(train_df.index)

    # Note: the sampled frames keep their original row index, which is carried into the
    # datasets as an extra `__index_level_0__` column (visible in the training logs).
    train_dataset = Dataset.from_pandas(train_df)
    test_dataset = Dataset.from_pandas(test_df)

    return train_dataset, test_dataset





In [None]:
def tokenize_and_align_labels(examples):
    """Tokenize a batch of pre-split sentences and build per-sub-token label ids.

    Reads the notebook globals ``tokenizer``, ``task`` and ``label_encoding_dict``.

    Args:
        examples: Batch mapping with a ``"tokens"`` entry (list of word lists)
            and a ``f"{task}_tags"`` entry (list of tag lists, parallel to the words).

    Returns:
        The tokenizer output for the batch with an added ``"labels"`` entry:
        one list of integer label ids per sentence. Positions without a source
        word get -100.
    """
    # When True, every sub-token of a word receives the word's label; when False,
    # only the first sub-token is labelled and the rest get -100.
    propagate_to_subwords = True
    encoded = tokenizer(list(examples["tokens"]), truncation=True, is_split_into_words=True)

    aligned = []
    for row, word_tags in enumerate(examples[f"{task}_tags"]):
        row_ids = []
        last_word = None
        for word in encoded.word_ids(batch_index=row):
            if word is None:
                # No source word for this position (word_ids() returned None).
                target = -100
            elif word_tags[word] == '0':
                # Tag spelled with the digit zero is mapped straight to class 0.
                target = 0
            elif word == last_word and not propagate_to_subwords:
                # Continuation sub-token with propagation disabled.
                target = -100
            else:
                target = label_encoding_dict[word_tags[word]]
            row_ids.append(target)
            last_word = word
        aligned.append(row_ids)

    encoded["labels"] = aligned
    return encoded


def compute_metrics(p):
    """Turn raw model outputs into overall precision / recall / F1 / accuracy.

    Reads the notebook globals ``label_list`` (id -> tag name) and ``metric``.

    Args:
        p: Pair ``(predictions, labels)``; ``predictions`` holds per-position class
            scores (argmax is taken over axis 2) and ``labels`` the gold label ids,
            where -100 marks positions to ignore.

    Returns:
        Dict with the keys ``precision``, ``recall``, ``f1`` and ``accuracy``.
        The same four values are also printed.
    """
    scores, gold_ids = p
    predicted_ids = np.argmax(scores, axis=2)

    true_predictions = []
    true_labels = []
    for predicted_row, gold_row in zip(predicted_ids, gold_ids):
        # Keep only the positions whose gold label is a real class (not the -100 marker).
        kept = [(pred, gold) for pred, gold in zip(predicted_row, gold_row) if gold != -100]
        true_predictions.append([label_list[pred] for pred, _ in kept])
        true_labels.append([label_list[gold] for _, gold in kept])

    results = metric.compute(predictions=true_predictions, references=true_labels)
    summary = {
        "precision": results["overall_precision"],
        "recall": results["overall_recall"],
        "f1": results["overall_f1"],
        "accuracy": results["overall_accuracy"],
    }
    print("precision: ", summary["precision"], "recall:", summary["recall"],
          "f1: ", summary["f1"], "accuracy: ", summary["accuracy"])
    return summary



In [None]:
# Tag name -> class id used when building training labels.
# NOTE(review): '' and 'I-' look like truncated/malformed tags from the raw data that are
# folded into the 'O' class -- confirm against the dataset.
label_encoding_dict = {'': 0, 'O': 0, 'I-': 0, 'B-event': 1, 'I-event': 2, 'B-pers': 3, 'I-pers': 4,
                       'B-org': 5, 'I-org': 6, 'B-loc': 7, 'I-loc': 8, 'B-pro': 9, 'I-pro': 10, 'B-fac': 11, 'I-fac': 12}

# Class id -> tag name used when decoding predictions and labels for the metrics.
# The position of each tag MUST equal its id in label_encoding_dict; the previous order
# decoded id 3 as 'B-loc' although the encoder assigns 3 to 'B-pers' (and similarly for
# the org/loc/pro/fac ids), so decoded entity types did not match the training labels.
label_list = ['O', 'B-event', 'I-event', 'B-pers', 'I-pers', 'B-org', 'I-org', 'B-loc', 'I-loc',
              'B-pro', 'I-pro', 'B-fac', 'I-fac']

# Guard against the two tables drifting apart again.
assert all(label_encoding_dict[label] == idx for idx, label in enumerate(label_list)), \
    "label_list order must match the ids in label_encoding_dict"


task = "ner"
model_checkpoint = "HooshvareLab/bert-fa-zwnj-base"
batch_size = 16

tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)




No. of training examples: 6145
No. of testing examples: 1536


  0%|          | 0/7 [00:00<?, ?ba/s]

  0%|          | 0/2 [00:00<?, ?ba/s]

Shape of training examples: (6145, 7)
Shape of testing examples: (1536, 7)


In [None]:
# preprocessing
# Read the annotated sentences from Drive and split them into train / test datasets.
train_dataset, test_dataset = get_token_dataset('/content/gdrive/My Drive/Colab Notebooks/bigscience-farsi/data')

print(f"No. of training examples: {train_dataset.num_rows}")
print(f"No. of testing examples: {test_dataset.num_rows}")

# Tokenize each split in batches and attach the aligned label ids.
train_tokenized_datasets = train_dataset.map(tokenize_and_align_labels, batched=True)
test_tokenized_datasets = test_dataset.map(tokenize_and_align_labels, batched=True)

print(f"Shape of training examples: {train_tokenized_datasets.shape}")
print(f"Shape of testing examples: {test_tokenized_datasets.shape}")

Mounted at /content/gdrive


In [None]:
# fine-tuning

# Load the pretrained checkpoint with a token-classification head sized to the label set.
model_checkpoint = "HooshvareLab/bert-fa-zwnj-base"
model = AutoModelForTokenClassification.from_pretrained(model_checkpoint, num_labels=len(label_list))

# Training configuration: evaluate once per epoch, three epochs in total.
args = TrainingArguments(
    output_dir=f"test-{task}",
    evaluation_strategy="epoch",
    learning_rate=1e-4,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    num_train_epochs=3,
    weight_decay=1e-5,
)

# `metric` is read as a global by compute_metrics, so it must exist before training starts.
metric = load_metric("seqeval")
data_collator = DataCollatorForTokenClassification(tokenizer=tokenizer)

trainer = Trainer(
    model=model,
    args=args,
    data_collator=data_collator,
    train_dataset=train_tokenized_datasets,
    eval_dataset=test_tokenized_datasets,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
)

# Train, run a final evaluation on the test split, then persist the model to Drive.
trainer.train()
trainer.evaluate()
trainer.save_model('/content/gdrive/My Drive/Colab Notebooks/bigscience-farsi/model/armanperso-ner.model')

Some weights of the model checkpoint at HooshvareLab/bert-fa-zwnj-base were not used when initializing BertForTokenClassification: ['cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.decoder.weight', 'cls.predictions.decoder.bias', 'cls.predictions.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.weight']
- This IS expected if you are initializing BertForTokenClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForTokenClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForTokenClassification were not initialized from the model checkpoint at HooshvareLab/bert-fa-zwnj-

Epoch,Training Loss,Validation Loss,Precision,Recall,F1,Accuracy
1,No log,0.128708,0.692683,0.688927,0.6908,0.962193
2,0.144400,0.108949,0.730435,0.791096,0.759556,0.968125
3,0.042500,0.118674,0.768132,0.797945,0.782755,0.970869


The following columns in the evaluation set  don't have a corresponding argument in `BertForTokenClassification.forward` and have been ignored: tokens, __index_level_0__, ner_tags.
***** Running Evaluation *****
  Num examples = 1536
  Batch size = 16


precision:  0.6926829268292682 recall: 0.6889269406392694 f1:  0.6907998283016169 accuracy:  0.9621926545728885


Saving model checkpoint to test-ner/checkpoint-500
Configuration saved in test-ner/checkpoint-500/config.json
Model weights saved in test-ner/checkpoint-500/pytorch_model.bin
tokenizer config file saved in test-ner/checkpoint-500/tokenizer_config.json
Special tokens file saved in test-ner/checkpoint-500/special_tokens_map.json
The following columns in the evaluation set  don't have a corresponding argument in `BertForTokenClassification.forward` and have been ignored: tokens, __index_level_0__, ner_tags.
***** Running Evaluation *****
  Num examples = 1536
  Batch size = 16


precision:  0.7304347826086957 recall: 0.791095890410959 f1:  0.7595561035758324 accuracy:  0.9681252357600906


Saving model checkpoint to test-ner/checkpoint-1000
Configuration saved in test-ner/checkpoint-1000/config.json
Model weights saved in test-ner/checkpoint-1000/pytorch_model.bin
tokenizer config file saved in test-ner/checkpoint-1000/tokenizer_config.json
Special tokens file saved in test-ner/checkpoint-1000/special_tokens_map.json
The following columns in the evaluation set  don't have a corresponding argument in `BertForTokenClassification.forward` and have been ignored: tokens, __index_level_0__, ner_tags.
***** Running Evaluation *****
  Num examples = 1536
  Batch size = 16


precision:  0.7681318681318682 recall: 0.797945205479452 f1:  0.7827547592385219 accuracy:  0.9708686259044614




Training completed. Do not forget to share your model on huggingface.co/models =)


The following columns in the evaluation set  don't have a corresponding argument in `BertForTokenClassification.forward` and have been ignored: tokens, __index_level_0__, ner_tags.
***** Running Evaluation *****
  Num examples = 1536
  Batch size = 16


Saving model checkpoint to model/armanperso-ner.model
Configuration saved in model/armanperso-ner.model/config.json


precision:  0.7681318681318682 recall: 0.797945205479452 f1:  0.7827547592385219 accuracy:  0.9708686259044614


Model weights saved in model/armanperso-ner.model/pytorch_model.bin
tokenizer config file saved in model/armanperso-ner.model/tokenizer_config.json
Special tokens file saved in model/armanperso-ner.model/special_tokens_map.json


Collecting seqeval
  Downloading seqeval-1.2.2.tar.gz (43 kB)
[?25l[K     |███████▌                        | 10 kB 16.4 MB/s eta 0:00:01[K     |███████████████                 | 20 kB 9.6 MB/s eta 0:00:01[K     |██████████████████████▌         | 30 kB 5.2 MB/s eta 0:00:01[K     |██████████████████████████████  | 40 kB 4.9 MB/s eta 0:00:01[K     |████████████████████████████████| 43 kB 1.3 MB/s 
Building wheels for collected packages: seqeval
  Building wheel for seqeval (setup.py) ... [?25l[?25hdone
  Created wheel for seqeval: filename=seqeval-1.2.2-py3-none-any.whl size=16181 sha256=b2a1b8f3c25b11f4b171c17588cf22f77da6ec786e58279bc524ff08f1940979
  Stored in directory: /root/.cache/pip/wheels/05/96/ee/7cac4e74f3b19e3158dce26a20a1c86b3533c43ec72a549fd7
Successfully built seqeval
Installing collected packages: seqeval
Successfully installed seqeval-1.2.2
