### Install Required Packages

In [1]:
# Install the notebook's third-party dependencies into the kernel's environment.
# Versions are not pinned here; the run recorded below resolved
# transformers 4.20.1, datasets 2.3.2, black 22.3.0 and evaluate 0.1.2.
%pip install transformers datasets black evaluate[evaluator]

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting transformers
  Downloading transformers-4.20.1-py3-none-any.whl (4.4 MB)
[K     |████████████████████████████████| 4.4 MB 8.5 MB/s 
[?25hCollecting datasets
  Downloading datasets-2.3.2-py3-none-any.whl (362 kB)
[K     |████████████████████████████████| 362 kB 70.4 MB/s 
[?25hCollecting black
  Downloading black-22.3.0-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.4 MB)
[K     |████████████████████████████████| 1.4 MB 48.3 MB/s 
[?25hCollecting evaluate[evaluator]
  Downloading evaluate-0.1.2-py3-none-any.whl (53 kB)
[K     |████████████████████████████████| 53 kB 2.3 MB/s 
Collecting pyyaml>=5.1
  Downloading PyYAML-6.0-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl (596 kB)
[K     |████████████████████████████████| 596 kB 65.1 MB/s 
Collecting tokenizers!=0.11.3,<0.13,>=0.11.1
  Downloading tokenizers-0.

### Import Required Packages and Initialize Variables


PERSIAN: Train the Persian model (the English model is only used for testing).

LOAD: Load a trained model instead of training

SAVE: Save model after training is finished

In [1]:
import json
import torch
import evaluate

from datasets import load_dataset
from transformers import (
    AutoTokenizer,
    AutoModelForSequenceClassification,
    TrainingArguments,
    Trainer,
    DataCollatorWithPadding,
    pipeline,
)

# --- Run switches -----------------------------------------------------------
PERSIAN = True  # True: Persian model and dataset; False: English test setup
LOAD = False  # True: load a previously saved model instead of training
SAVE = True  # True: save the model once training has finished
model_name = "my_model"  # directory the model is saved to / loaded from

# Sentiment classes, listed from most negative to most positive. The position
# of a class in this list is its numeric label.
LABELS = [
    "مستقیم منفی",
    "غیر مستقیم منفی",
    "خنثی",
    "غیر مستقیم مثبت",
    "مستقیم مثبت",
]

# Text label -> numeric label (its index in LABELS).
LABEL_to_NUMBER = {label: index for index, label in enumerate(LABELS)}

# Generic "LABEL_<i>" names (as they appear in the classifier output) -> i.
AUTO_LABEL_MAPPING = {f"LABEL_{index}": index for index in range(len(LABELS))}


In [2]:
# Use the first CUDA GPU when torch can see one; otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = "cuda:0"
else:
    device = "cpu"
device


'cuda:0'

### Initialize Tokenizer

In [3]:
# Load the tokenizer of the checkpoint selected by the PERSIAN switch.
tokenizer = AutoTokenizer.from_pretrained(
    "HooshvareLab/bert-fa-base-uncased-clf-digimag"
    if PERSIAN
    else "distilbert-base-uncased"
)


Downloading:   0%|          | 0.00/62.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.56k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.14M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/112 [00:00<?, ?B/s]

### Load Dataset

`numberize` converts text labels to numbers. The label-to-number mapping is the `LABEL_to_NUMBER` dictionary defined earlier.

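`weighted_label` takes a batch of annotation lists and, for each list, computes the average of its labels and rounds it to the nearest integer (Python's `round`, so an average exactly halfway between two integers goes to the even one). Note that labels are treated as numbers from 0 to 4.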


In [4]:
def weighted_label(annotations):
    """Collapse each example's numeric annotations into one integer label.

    Args:
        annotations: Iterable of non-empty lists of numeric labels, one inner
            list per example.

    Returns:
        A list holding, for every inner list, its arithmetic mean rounded with
        the built-in ``round``. Exact halves therefore go to the nearest even
        integer (e.g. a mean of 2.5 becomes 2, a mean of 1.5 becomes 2).

    Raises:
        ZeroDivisionError: If an inner list is empty.
    """
    labels = []
    for votes in annotations:
        mean_vote = sum(votes) / len(votes)
        labels.append(round(mean_vote))
    return labels


def numberize(annotation_list):
    """Translate a list of text labels into their numeric labels.

    Args:
        annotation_list: Iterable of label strings that are keys of
            ``LABEL_to_NUMBER``.

    Returns:
        A list with the ``LABEL_to_NUMBER`` value of every input label, in the
        same order.

    Raises:
        KeyError: If a label is not a key of ``LABEL_to_NUMBER``.
    """
    return [LABEL_to_NUMBER[label_text] for label_text in annotation_list]


Now we can load the Persian dataset by reading the data from the JSON file, creating a label from each list of annotations, and then tokenizing the text.

Note that we used average instead of majority voting, because of the way that the data was labeled and the sequential nature of labels.


English data is just for test.

In [5]:
def add_label(entry):
    """Attach a numeric ``label`` column to a batch of annotated examples.

    Args:
        entry: Batch mapping whose ``"annotations"`` value holds one list of
            text labels per example.

    Returns:
        The same ``entry`` object, with ``entry["label"]`` set to one integer
        per example (the rounded mean of that example's numeric annotations).
    """
    numeric_annotations = [numberize(label_texts) for label_texts in entry["annotations"]]
    entry["label"] = weighted_label(numeric_annotations)
    return entry


def tokenize_persian(entry):
    """Tokenize the ``text`` field of a batch of Persian examples.

    Sequences longer than 512 tokens are truncated. No padding is applied
    here: padding inside a batched ``map`` would pad every example to the
    longest sequence of its whole map batch, whereas the training code pads
    each mini-batch dynamically through ``DataCollatorWithPadding``. This also
    keeps the function consistent with ``tokenize_english``.

    Args:
        entry: Batch mapping with a ``"text"`` entry.

    Returns:
        The tokenizer output for ``entry["text"]``.
    """
    return tokenizer(entry["text"], truncation=True, max_length=512)


def get_dataset_persian(split, data_files="dataset_annotated_finance.json"):
    """Load one split of the annotated Persian dataset, labeled and tokenized.

    Args:
        split: Name of the top-level JSON field that holds the wanted records
            (the notebook uses ``"train"``, ``"test"`` and ``"eval"``).
        data_files: Path of the JSON file to read. Defaults to the annotated
            finance dataset used throughout this notebook.

    Returns:
        The loaded dataset after ``add_label`` and ``tokenize_persian`` have
        been mapped over it in batches.
    """
    # The records selected through `field` are read back under the "train" key
    # of the loader's result, whichever field was requested.
    raw_split = load_dataset("json", data_files=data_files, field=split)["train"]
    labeled_split = raw_split.map(add_label, batched=True)
    return labeled_split.map(tokenize_persian, batched=True)


def load_persian_data():
    """Load the three Persian splits.

    Returns:
        A ``(train, test, validation)`` tuple of datasets, read from the
        ``"train"``, ``"test"`` and ``"eval"`` fields respectively.
    """
    splits = {
        field: get_dataset_persian(field) for field in ("train", "test", "eval")
    }
    return splits["train"], splits["test"], splits["eval"]


In [7]:
def tokenize_english(entry):
    """Tokenize the ``text`` field of a batch of English examples.

    Truncation is enabled; no padding is applied at this stage.

    Args:
        entry: Batch mapping with a ``"text"`` entry.

    Returns:
        The tokenizer output for ``entry["text"]``.
    """
    texts = entry["text"]
    encoded = tokenizer(texts, truncation=True)
    return encoded


def get_dataset_english(split):
    """Load one split of the IMDB dataset and tokenize it.

    Args:
        split: Split specification forwarded to ``load_dataset``.

    Returns:
        The requested split with ``tokenize_english`` mapped over it in batches.
    """
    raw_split = load_dataset("imdb", split=split)
    return raw_split.map(tokenize_english, batched=True)


def load_english_data(val_fraction=0.1, seed=42):
    """Load the English (IMDB) data used to smoke-test the pipeline.

    IMDB does not provide a validation split, so requesting ``"val"`` fails.
    The validation set is instead held out from the training split with a
    seeded, shuffled split.

    Args:
        val_fraction: Share of the training split that is held out for
            validation.
        seed: Seed of the shuffle performed before splitting, so the split is
            reproducible.

    Returns:
        A ``(train, test, validation)`` tuple of tokenized datasets.
    """
    full_train_dataset = get_dataset_english("train")
    test_dataset = get_dataset_english("test")
    # Shuffle before splitting so the held-out part does not depend on the
    # order in which the training examples are stored.
    parts = full_train_dataset.train_test_split(test_size=val_fraction, seed=seed)
    return parts["train"], test_dataset, parts["test"]


In [8]:
# Load the (train, test, validation) splits for the configured language.
train_dataset, test_dataset, val_dataset = (
    load_persian_data() if PERSIAN else load_english_data()
)

# Show the first training example as a quick sanity check.
print(train_dataset[0])


Using custom data configuration default-8c630074b9eb2aae


Downloading and preparing dataset json/default to /root/.cache/huggingface/datasets/json/default-8c630074b9eb2aae/0.0.0/da492aad5680612e4028e7f6ddc04b1dfcec4b64db470ed7cc5f2bb265b9b6b5...


Downloading data files:   0%|          | 0/1 [00:00<?, ?it/s]

Extracting data files:   0%|          | 0/1 [00:00<?, ?it/s]

0 tables [00:00, ? tables/s]

Dataset json downloaded and prepared to /root/.cache/huggingface/datasets/json/default-8c630074b9eb2aae/0.0.0/da492aad5680612e4028e7f6ddc04b1dfcec4b64db470ed7cc5f2bb265b9b6b5. Subsequent calls will reuse this data.


  0%|          | 0/1 [00:00<?, ?it/s]



  0%|          | 0/2 [00:00<?, ?ba/s]

  0%|          | 0/2 [00:00<?, ?ba/s]

Using custom data configuration default-61f1150e68951fc2


Downloading and preparing dataset json/default to /root/.cache/huggingface/datasets/json/default-61f1150e68951fc2/0.0.0/da492aad5680612e4028e7f6ddc04b1dfcec4b64db470ed7cc5f2bb265b9b6b5...


Downloading data files:   0%|          | 0/1 [00:00<?, ?it/s]

Extracting data files:   0%|          | 0/1 [00:00<?, ?it/s]

0 tables [00:00, ? tables/s]

Dataset json downloaded and prepared to /root/.cache/huggingface/datasets/json/default-61f1150e68951fc2/0.0.0/da492aad5680612e4028e7f6ddc04b1dfcec4b64db470ed7cc5f2bb265b9b6b5. Subsequent calls will reuse this data.


  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?ba/s]

  0%|          | 0/1 [00:00<?, ?ba/s]

Using custom data configuration default-745e50ecc6f3348b


Downloading and preparing dataset json/default to /root/.cache/huggingface/datasets/json/default-745e50ecc6f3348b/0.0.0/da492aad5680612e4028e7f6ddc04b1dfcec4b64db470ed7cc5f2bb265b9b6b5...


Downloading data files:   0%|          | 0/1 [00:00<?, ?it/s]

Extracting data files:   0%|          | 0/1 [00:00<?, ?it/s]

0 tables [00:00, ? tables/s]

Dataset json downloaded and prepared to /root/.cache/huggingface/datasets/json/default-745e50ecc6f3348b/0.0.0/da492aad5680612e4028e7f6ddc04b1dfcec4b64db470ed7cc5f2bb265b9b6b5. Subsequent calls will reuse this data.


  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?ba/s]

  0%|          | 0/1 [00:00<?, ?ba/s]

{'text': 'آتش سوزی کارخانه جمیل نخ گسترده است / اعزام نیرو ادامه دارد\n____________\nمنصور شیشه فروش در گفت و گو با خبرنگار مهر با اشاره به حادثه آتش سوزی در کارخانه جمیل نخ اظهار داشت: این حادثه ساعت ۱۷ و ۳۰ دقیقه اتفاق افتاده و به نیروهای امدادی اعلام شده است. وی با بیان اینکه نیروهای امداد و نجات برای این حادثه اعزام شده است، تصریح کرد: دو دستگاه خودروی اطفا حریق از آتش نشانی شهرک صنعتی مورچه خورت و دو دستگاه خودرو به همراه نیروهای امدادی نیز از آتش نشانی شاهین شهر برای این حادثه اعزام شده است', 'annotations': ['غیر مستقیم منفی', 'خنثی', 'مستقیم منفی'], 'label': 1, 'input_ids': [2, 4662, 12596, 6307, 38526, 3234, 5223, 2806, 1013, 8207, 5077, 3251, 2924, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 9649, 8030, 3569, 2786, 3017, 1379, 3302, 2799, 6534, 4153, 2799, 3364, 2789, 5979, 4662, 12596, 2786, 6307, 38526, 3234, 3913, 2996, 1014, 2802, 5979, 3551, 4051, 1379, 3979, 4853, 3929, 6707, 1379, 2789, 4147, 18647, 3402, 2871, 2806, 1012, 2931, 2799, 3533, 3131, 4147, 9181, 1379, 6190, 2831, 2

### Train or Load the Model

If model was trained before, we can load the model. 

For training, we start from a pretrained BERT model for text classification, change the last layer (the classification layer), and then train the model on the provided data. The training parameters are set in the following code.

In [9]:
# Number of output classes: one per entry of LABELS for the Persian task, two
# for the English test setup.
num_labels = len(LABELS) if PERSIAN else 2

if LOAD:
    # Load the model saved under `model_name`. Size mismatches are deliberately
    # NOT ignored here: if the saved classification head does not have
    # `num_labels` outputs, loading fails loudly instead of silently replacing
    # the trained head with randomly initialised weights.
    model = AutoModelForSequenceClassification.from_pretrained(
        model_name, num_labels=num_labels
    )
else:
    # Pads every mini-batch to its own longest sequence.
    data_collator = DataCollatorWithPadding(tokenizer=tokenizer)
    if PERSIAN:
        # The checkpoint's classifier has a different number of outputs, so the
        # head is re-initialised with `num_labels` outputs and trained below.
        pretrained_model = AutoModelForSequenceClassification.from_pretrained(
            "HooshvareLab/bert-fa-base-uncased-clf-digimag",
            num_labels=num_labels,
            ignore_mismatched_sizes=True,
        ).to(device)
    else:
        pretrained_model = AutoModelForSequenceClassification.from_pretrained(
            "distilbert-base-uncased", num_labels=num_labels
        ).to(device)
    training_args = TrainingArguments(
        output_dir="./results",
        learning_rate=2e-5,
        per_device_train_batch_size=8,
        per_device_eval_batch_size=8,
        num_train_epochs=5,
        weight_decay=0.01,
    )

    trainer = Trainer(
        model=pretrained_model,
        args=training_args,
        train_dataset=train_dataset,
        eval_dataset=val_dataset,
        tokenizer=tokenizer,
        data_collator=data_collator,
    )

    trainer.train()

    model = trainer.model


Downloading:   0%|          | 0.00/621M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at HooshvareLab/bert-fa-base-uncased-clf-digimag and are newly initialized because the shapes did not match:
- classifier.weight: found shape torch.Size([7, 768]) in the checkpoint and torch.Size([5, 768]) in the model instantiated
- classifier.bias: found shape torch.Size([7]) in the checkpoint and torch.Size([5]) in the model instantiated
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
The following columns in the training set don't have a corresponding argument in `BertForSequenceClassification.forward` and have been ignored: annotations, text. If annotations, text are not expected by `BertForSequenceClassification.forward`,  you can safely ignore this message.
***** Running training *****
  Num examples = 1450
  Num Epochs = 5
  Instantaneous batch size per device = 8
  Total train batch size (w. parallel, distributed & accumula

Step,Training Loss
500,0.616


Saving model checkpoint to ./results/checkpoint-500
Configuration saved in ./results/checkpoint-500/config.json
Model weights saved in ./results/checkpoint-500/pytorch_model.bin
tokenizer config file saved in ./results/checkpoint-500/tokenizer_config.json
Special tokens file saved in ./results/checkpoint-500/special_tokens_map.json


Training completed. Do not forget to share your model on huggingface.co/models =)




## Save Model

In [10]:
# `trainer` only exists when the model was trained in this session (LOAD is
# False). A model that was just loaded from `model_name` is already on disk,
# so there is nothing to save in that case.
if SAVE and not LOAD:
    trainer.save_model(model_name)


Saving model checkpoint to my_model
Configuration saved in my_model/config.json
Model weights saved in my_model/pytorch_model.bin
tokenizer config file saved in my_model/tokenizer_config.json
Special tokens file saved in my_model/special_tokens_map.json


In [11]:
# Wrap the model and tokenizer in a text-classification pipeline. The model is
# moved to the CPU first, so inference below runs on the CPU.
classifier = pipeline(
    task="text-classification",
    model=model.to("cpu"),
    tokenizer=tokenizer,
)


In [21]:
# Classify two hand-written sample sentences and print the raw pipeline output.
for text in (
    "ثبت قرار داد تجاری ایران و چین در جهت افزایش روابط اقتصادی برای اقتصاد ایران بسیار خوب است",
    "آمریکا و انگلیس قرار داد جدیدی برای تحریم ایران ثبت کردند",
):
    print(classifier(text))


[{'label': 'LABEL_4', 'score': 0.4737503230571747}]
[{'label': 'LABEL_1', 'score': 0.3711070120334625}]


## Evaluate model

We evaluate model by calculating predictions, and then calculating metrics using predictions and the true label.

In [14]:
# Classify every test text. Inputs are truncated to 512 tokens, the same limit
# used when tokenizing the training data, so texts longer than the model's
# maximum input length do not make inference fail.
predictions = classifier(test_dataset["text"], truncation=True, max_length=512)
# Convert the pipeline's "LABEL_<i>" names into integer class ids.
predictions = [AUTO_LABEL_MAPPING[prediction["label"]] for prediction in predictions]


Disabling tokenizer parallelism, we're using DataLoader multithreading already


We used f1 and accuracy as our metrics, but you can use any other metric that you wish.

In [15]:
# Load the two metrics used below; other metrics can be loaded the same way.
f1_metric, accuracy_metric = (
    evaluate.load(metric_name) for metric_name in ("f1", "accuracy")
)


Downloading builder script:   0%|          | 0.00/2.33k [00:00<?, ?B/s]

Downloading builder script:   0%|          | 0.00/1.66k [00:00<?, ?B/s]

In [16]:
# Ground-truth integer labels of the test split.
references = test_dataset["label"]

# Weighted F1: the `average="weighted"` option is passed to the F1 metric.
f1_score = f1_metric.compute(
    predictions=predictions,
    references=references,
    average="weighted",
)

# Plain accuracy over the same predictions.
accuracy_score = accuracy_metric.compute(
    predictions=predictions,
    references=references,
)


In [17]:
# Report accuracy first, then F1, one result per line.
print(accuracy_score, f1_score, sep="\n")


{'accuracy': 0.8}
{'f1': 0.7531228070175437}
