In [1]:
import os
import json
import math
from functools import partial

from accelerate import Accelerator
from datasets import load_dataset, load_metric
import torch
from torch.utils.data.dataloader import DataLoader
from transformers import (
    CONFIG_MAPPING,
    MODEL_MAPPING,
    AdamW,
    AutoConfig,
    AutoModelForSequenceClassification,
    TrainingArguments,
    Trainer,
    AutoTokenizer,
    DataCollatorWithPadding,
    get_scheduler,
    set_seed,
)

2022-11-08 21:23:04.959750: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
2022-11-08 21:23:07.243339: E tensorflow/stream_executor/cuda/cuda_blas.cc:2981] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2022-11-08 21:23:12.901527: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory
2022-11-08 21:23:12.902351: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or 

# Organize dataset
- Convert each split into the one-JSON-object-per-line format that `load_dataset` can read.

In [5]:
# Rewrite every split from a single JSON array into one JSON object per line,
# producing `<split>_organized.json` next to the original file.
# NOTE(review): this cell works under './intent/data' while the loading cell reads
# 'data/intent/*_organized.json' — confirm which directory is the intended one.
intent_data_path = './intent/data'
datas = ['train', 'eval', 'test']
for split in datas:
    in_file = os.path.join(intent_data_path, split + ".json")
    out_file = os.path.join(intent_data_path, split + "_organized.json")
    # Read the input first: if it is missing or malformed, we fail before
    # creating (and truncating) the output file.
    with open(in_file, 'r', encoding='utf-8') as rf:
        records = json.load(rf)
    with open(out_file, 'w', encoding='utf-8') as wf:
        for record in records:
            print(json.dumps(record), file=wf)

In [2]:
# Load the organized train/eval files as the "train" and "valid" splits.
datasets = load_dataset(
    "json",
    data_files=dict(
        train='data/intent/train_organized.json',
        valid='data/intent/eval_organized.json',
    ),
)
# Accuracy metric; batches are added to it during validation.
metrics = load_metric("accuracy")

Using custom data configuration default-b80b426ef07fa8bc
Found cached dataset json (/home/guest/r11922a05/.cache/huggingface/datasets/json/default-b80b426ef07fa8bc/0.0.0/e6070c77f18f01a5ad4551a8b7edfba20b8438b7cad4d94e6ad9378022ce4aab)


  0%|          | 0/2 [00:00<?, ?it/s]

  metrics = load_metric("accuracy")


In [3]:
# Column names of the raw training split (passed as `remove_columns` when tokenizing).
cols = datasets["train"].column_names
# Names of the fields holding the utterance text and its intent label.
text_col = "text"
intent_col = "intent"

In [4]:
train_examples, valid_examples = datasets["train"], datasets["valid"]
# Ids follow the sorted order of the distinct intents found in the training split,
# so the mapping is the same on every run.
intent_names = sorted(set(train_examples[intent_col]))
intent2id = {name: idx for idx, name in enumerate(intent_names)}
id2intent = dict(enumerate(intent_names))

# Data preprocessing

In [4]:
# Checkpoint to fine-tune and the per-device batch size used by the dataloaders.
model_name = "distilbert-base-uncased"
batch_size = 16
tokenizer = AutoTokenizer.from_pretrained(model_name, use_fast=True)
# Pass both directions of the label mapping so the config (and any checkpoint
# saved from it) stays self-consistent: id -> intent and intent -> id.
config = AutoConfig.from_pretrained(model_name, id2label=id2intent, label2id=intent2id)
model = AutoModelForSequenceClassification.from_pretrained(model_name, config=config)

Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertForSequenceClassification: ['vocab_projector.weight', 'vocab_projector.bias', 'vocab_transform.weight', 'vocab_layer_norm.bias', 'vocab_layer_norm.weight', 'vocab_transform.bias']
- This IS expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'pre_classifier.bias', 'classifier.w

In [6]:
def prepare_features(examples, tokenizer, intent2id):
    """Tokenize a batch of examples and attach integer labels when intents exist.

    Args:
        examples: Batch mapping of column name -> list of values. Must contain
            the `text_col` column; may contain the `intent_col` column.
        tokenizer: Callable tokenizer applied to the list of texts.
        intent2id: Mapping from intent string to integer class id.

    Returns:
        The tokenizer output, with a "labels" list added when the batch carries
        a non-empty intent column.

    Raises:
        KeyError: If an intent in the batch is not a key of `intent2id`.
    """
    # Truncate so an unusually long text cannot produce a sequence longer than
    # the tokenizer's configured maximum length.
    tokenized_examples = tokenizer(examples[text_col], truncation=True)
    intents = examples.get(intent_col)
    if intents:
        tokenized_examples["labels"] = [intent2id[intent] for intent in intents]
    return tokenized_examples

In [7]:
# Bind the tokenizer and label mapping so `map` only has to supply the batch.
prepare_features = partial(prepare_features, tokenizer=tokenizer, intent2id=intent2id)

# Tokenize both splits with identical settings, dropping the raw columns so that
# only the tokenizer outputs (and labels) remain.
train_dataset, valid_dataset = (
    split.map(prepare_features, batched=True, num_proc=4, remove_columns=cols)
    for split in (train_examples, valid_examples)
)

     

#0:   0%|          | 0/4 [00:00<?, ?ba/s]

  

#1:   0%|          | 0/4 [00:00<?, ?ba/s]

 

#2:   0%|          | 0/4 [00:00<?, ?ba/s]

#3:   0%|          | 0/4 [00:00<?, ?ba/s]

     

#0:   0%|          | 0/1 [00:00<?, ?ba/s]

  

#1:   0%|          | 0/1 [00:00<?, ?ba/s]

 

#2:   0%|          | 0/1 [00:00<?, ?ba/s]

#3:   0%|          | 0/1 [00:00<?, ?ba/s]

# Modeling
- see `example/text_classification.ipynb`

In [13]:
# Collator that pads tokenized examples when assembling a batch.
data_collator = DataCollatorWithPadding(tokenizer)

# Settings shared by both loaders; only the training loader shuffles.
loader_kwargs = dict(collate_fn=data_collator, batch_size=batch_size, num_workers=4)
train_dataloader = DataLoader(train_dataset, shuffle=True, **loader_kwargs)
valid_dataloader = DataLoader(valid_dataset, **loader_kwargs)

In [14]:
# Parameters whose name contains one of these substrings are exempt from weight
# decay. "LayerNorm.weight" alone is not enough for this checkpoint: its norm
# layers also use snake_case names (e.g. `vocab_layer_norm` in the load log
# above), so "layer_norm.weight" is listed as well.
no_decay = ["bias", "LayerNorm.weight", "layer_norm.weight"]
weight_decay = 1e-2
lr = 3e-5

# Split the parameters in a single pass over the model.
decay_params, no_decay_params = [], []
for param_name, param in model.named_parameters():
    if any(nd in param_name for nd in no_decay):
        no_decay_params.append(param)
    else:
        decay_params.append(param)

optimizer_gparams = [
    {"params": decay_params, "weight_decay": weight_decay},
    {"params": no_decay_params, "weight_decay": 0.0},
]
optimizer = AdamW(optimizer_gparams, lr=lr)



In [16]:
# Hand the model, optimizer and both dataloaders to Accelerate; the objects it
# returns replace the originals for the rest of the notebook.
accelerator = Accelerator()
prepared = accelerator.prepare(model, optimizer, train_dataloader, valid_dataloader)
model, optimizer, train_dataloader, valid_dataloader = prepared

In [18]:
# Training-schedule hyper-parameters.
n_epoch = 5
grad_accum_steps = 5
scheduler_type = 'linear'
warm_up_ratio = 0.1
log_steps = 50

# One optimizer update is made per `grad_accum_steps` batches, rounded up so the
# final shorter group of an epoch still counts as an update.
update_steps_per_epoch = math.ceil(len(train_dataloader) / grad_accum_steps)
max_update_steps = update_steps_per_epoch * n_epoch
warmup_steps = int(warm_up_ratio * max_update_steps)

lr_scheduler = get_scheduler(
    name=scheduler_type,
    optimizer=optimizer,
    num_warmup_steps=warmup_steps,
    num_training_steps=max_update_steps,
)

In [None]:
# Evaluate on the validation split every `eval_steps` batches (and at epoch end).
eval_steps = 50
# NOTE(review): the original cell read `args.saved_dir`, `args.valid_file` and
# `logger`, none of which are defined in this notebook. The directory below is a
# placeholder for where the best model is written — adjust as needed.
saved_dir = "./ckpt/intent"
max_valid_acc = 0
checkpoint_saved = False
n_train_steps = len(train_dataloader)


def _loss_divisor(step, n_steps, accum_steps):
    """Return how many batches share the optimizer update that `step` feeds.

    Steps are 1-based. Every accumulation group holds `accum_steps` batches,
    except a final shorter group of `n_steps % accum_steps` batches when the
    division is not exact.
    """
    leftover = n_steps % accum_steps
    # Only the last `leftover` steps belong to the short group; comparing against
    # `accum_steps` here would also rescale batches of the preceding full group.
    if leftover != 0 and n_steps - step < leftover:
        return leftover
    return accum_steps


def _save_model():
    """Save the unwrapped model and its config to `saved_dir`."""
    accelerator.wait_for_everyone()
    unwrapped_model = accelerator.unwrap_model(model)
    unwrapped_model.save_pretrained(saved_dir, save_function=accelerator.save)
    print("Saving config and model to {}...".format(saved_dir))


for epoch in range(n_epoch):
    total_loss = 0
    for step, batch in enumerate(train_dataloader, 1):
        # Re-enter training mode every step because evaluation below switches it off.
        model.train()
        outputs = model(**batch)
        loss = outputs.loss
        total_loss += loss.item()
        # Scale so the accumulated gradient is the mean over the group.
        loss = loss / _loss_divisor(step, n_train_steps, grad_accum_steps)
        accelerator.backward(loss)

        is_last_step = step == n_train_steps

        # Update model parameters
        if step % grad_accum_steps == 0 or is_last_step:
            optimizer.step()
            lr_scheduler.step()
            optimizer.zero_grad()

        # Log train loss
        if step % log_steps == 0 or is_last_step:
            print("Train | Loss: {:.5f}".format(total_loss / step))

        # Evaluate!
        if step % eval_steps == 0 or is_last_step:
            model.eval()
            # Distinct names keep the outer `step` / `batch` intact.
            for valid_batch in valid_dataloader:
                with torch.no_grad():
                    valid_outputs = model(**valid_batch)
                predictions = valid_outputs.logits.argmax(dim=-1)
                metrics.add_batch(
                    predictions=accelerator.gather(predictions),
                    references=accelerator.gather(valid_batch["labels"]),
                )
            valid_acc = metrics.compute()["accuracy"]
            print("Valid | Acc: {:.5f}".format(valid_acc))
            # Keep the checkpoint with the best (or tied-best) validation accuracy.
            if valid_acc >= max_valid_acc:
                max_valid_acc = valid_acc
                _save_model()
                checkpoint_saved = True

# Fallback replacing the original `if not args.valid_file` branch: make sure a
# model is written even if no evaluation ever triggered a save.
if not checkpoint_saved:
    _save_model()

# Inference
- see https://github.com/huggingface/transformers/blob/main/examples/pytorch/text-classification/run_xnli.py#L415