<a href="https://colab.research.google.com/github/skywalkerzhang/notes/blob/master/Untitled2.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!pip install transformers datasets scipy scikit-learn torch torchvision torchaudio



In [2]:
# load_data
import dataclasses
import json
import logging
from dataclasses import dataclass
from typing import List, Optional, Union

import torch
from torch.utils.data import DataLoader
from transformers import PreTrainedTokenizer

# Module-level logger; convert_examples_to_features uses it to log the first
# converted example.
logger = logging.getLogger(__name__)


@dataclass
class STSInputExample:
    """One raw sentence pair for semantic textual similarity, before tokenization.

    Attributes:
        guid: Identifier of the example.
        text_a: Untokenized first sentence.
        text_b: Untokenized second sentence.
        label: Similarity score attached to the pair.
    """

    guid: str
    text_a: str
    text_b: str
    label: float

    def to_dict(self):
        """Return the fields of this example as a plain dictionary."""
        as_mapping = dataclasses.asdict(self)
        return as_mapping

    def to_json_string(self):
        """Return this example as indented JSON terminated by a newline."""
        payload = json.dumps(self.to_dict(), indent=2)
        return f"{payload}\n"


@dataclass(frozen=True)
class STSInputFeatures:
    """Immutable model-ready encoding of one example.

    Field names mirror the keyword arguments the model is called with.

    Attributes:
        input_ids: Vocabulary indices of the encoded sequence.
        attention_mask: Optional mask with ``1`` for real tokens and ``0`` for
            padding positions.
        token_type_ids: Optional segment ids separating the first and second
            sentence; not every tokenizer produces them.
        label: Optional target value (int for classification, float for
            regression).
    """

    input_ids: List[int]
    attention_mask: Optional[List[int]] = None
    token_type_ids: Optional[List[int]] = None
    label: Optional[Union[int, float]] = None

    def to_json_string(self):
        """Return the features as single-line JSON terminated by a newline."""
        serialized = json.dumps(dataclasses.asdict(self))
        return f"{serialized}\n"


class STSDataset:
    """Map-style dataset of tokenized STS sentence pairs.

    All records are converted to ``STSInputFeatures`` eagerly in the
    constructor; ``__getitem__`` only turns the stored lists into tensors.

    Args:
        data: Iterable of records exposing the keys ``"idx"``, ``"sentence1"``,
            ``"sentence2"`` and ``"label"``.
        tokenizer: Tokenizer used to encode each sentence pair.
        max_seq_length: Length every encoded pair is padded/truncated to.
    """

    def __init__(self, data: list, tokenizer: PreTrainedTokenizer, max_seq_length: int):
        self.data = data
        self.tokenizer = tokenizer
        self.max_seq_length = max_seq_length
        self.features = self._convert_features(self._create_examples(self.data))

    def __len__(self):
        # Report the number of indexable items, i.e. what __getitem__ serves.
        return len(self.features)

    def __getitem__(self, idx):
        """Return ``(input_ids, attention_mask, token_type_ids, label)`` tensors."""
        feature = self.features[idx]
        input_ids = torch.tensor(feature.input_ids, dtype=torch.long)
        attn_mask = torch.tensor(feature.attention_mask, dtype=torch.long)
        if feature.token_type_ids is None:
            # Tokenizer produced no segment ids: use all-zero ids shaped like
            # input_ids so batches collate to (batch, seq) instead of (batch,).
            token_type_ids = torch.zeros_like(input_ids)
        else:
            token_type_ids = torch.tensor(feature.token_type_ids, dtype=torch.long)
        labels = torch.tensor(feature.label, dtype=torch.float)
        return (input_ids, attn_mask, token_type_ids, labels)

    def _create_examples(self, data):
        """Wrap each raw record of ``data`` in an ``STSInputExample``."""
        return [
            STSInputExample(
                guid=d["idx"],
                text_a=d["sentence1"],
                text_b=d["sentence2"],
                label=d["label"],
            )
            for d in data
        ]

    def _convert_features(
        self, examples: List[STSInputExample]
    ) -> List[STSInputFeatures]:
        """Tokenize ``examples`` with this dataset's tokenizer and max length."""
        return convert_examples_to_features(
            examples,
            self.tokenizer,
            max_length=self.max_seq_length,
        )


def convert_examples_to_features(
    examples: List[STSInputExample],
    tokenizer: PreTrainedTokenizer,
    max_length: Optional[int] = None,
):
    """Tokenize sentence pairs in one batch and pair each encoding with its label.

    Args:
        examples: Sentence pairs to encode.
        tokenizer: Tokenizer called once on the full list of
            ``(text_a, text_b)`` tuples, padding to ``max_length`` and
            truncating longer inputs.
        max_length: Target sequence length; when ``None`` the tokenizer's
            ``model_max_length`` is used.

    Returns:
        A list of ``STSInputFeatures`` in the same order as ``examples``.
    """
    if max_length is None:
        max_length = tokenizer.model_max_length

    targets = [float(ex.label) for ex in examples]
    sentence_pairs = [(ex.text_a, ex.text_b) for ex in examples]

    batch_encoding = tokenizer(
        sentence_pairs,
        max_length=max_length,
        padding="max_length",
        truncation=True,
    )

    # Slice the column-oriented batch encoding into one feature object per row.
    features = [
        STSInputFeatures(
            **{key: batch_encoding[key][row] for key in batch_encoding},
            label=target,
        )
        for row, target in enumerate(targets)
    ]

    # Log only the first example as a sanity check of the encoding.
    if examples:
        first = examples[0]
        logger.info("*** Example ***")
        logger.info("guid: %s" % (first.guid))
        logger.info("features: %s" % features[0])

    return features


class STSDataLoader(object):
    """Factory that builds ``DataLoader`` objects over ``STSDataset``.

    Args:
        tokenizer: Tokenizer handed to every dataset this factory creates.
        max_length: Sequence length used for padding/truncation; when falsy the
            tokenizer's ``model_max_length`` is used instead.
    """

    def __init__(self, tokenizer: PreTrainedTokenizer, max_length: Optional[int] = None):
        self.tokenizer = tokenizer
        self.max_length = max_length if max_length else self.tokenizer.model_max_length

    def get_dataloader(self, data, batch_size, **kwargs):
        """Tokenize ``data`` and wrap it in a ``DataLoader``.

        Args:
            data: Raw records accepted by ``STSDataset``.
            batch_size: Number of examples per batch.
            **kwargs: Extra ``DataLoader`` options. ``shuffle`` defaults to
                ``False`` but may be overridden here (e.g. ``shuffle=True`` for
                training data).

        Returns:
            A ``DataLoader`` over the tokenized examples.
        """
        # Keep the historical default while letting callers opt in to
        # shuffling; previously passing ``shuffle`` raised a duplicate-keyword
        # TypeError.
        kwargs.setdefault("shuffle", False)
        dataset = STSDataset(data, self.tokenizer, self.max_length)
        return DataLoader(dataset, batch_size=batch_size, **kwargs)


In [3]:
# trainer
from datasets import Metric
import torch
from torch.optim import Optimizer
from torch.optim.lr_scheduler import LambdaLR
from torch.utils.data import DataLoader
from transformers import PreTrainedModel, PreTrainedTokenizer


class Trainer(object):
    """Minimal train/eval loop for a sequence-regression model.

    Args:
        model: Model called with ``input_ids``, ``token_type_ids``,
            ``attention_mask`` and ``labels``; its output must expose
            ``'loss'`` and ``'logits'``. May be wrapped in
            ``torch.nn.DataParallel``.
        tokenizer: Tokenizer saved next to the model in checkpoints.
        train_loader: Loader yielding ``(input_ids, attention_mask,
            token_type_ids, labels)`` batches for training.
        valid_loader: Loader yielding the same tuple layout for validation.
        optimizer: Optimizer stepped once per training batch.
        scheduler: Learning-rate scheduler stepped once per training batch.
        metric: Metric whose ``compute()`` result contains ``'spearmanr'``.
        device: Device every batch tensor is moved to.
        logger: Logger receiving progress messages.
    """

    def __init__(self,
                 model: PreTrainedModel,
                 tokenizer: PreTrainedTokenizer,
                 train_loader: DataLoader,
                 valid_loader: DataLoader,
                 optimizer: Optimizer,
                 scheduler: LambdaLR,
                 metric: Metric,
                 device: str,
                 logger):

        self.model = model
        self.tokenizer = tokenizer
        self.train_loader = train_loader
        self.valid_loader = valid_loader

        self.optimizer = optimizer
        self.scheduler = scheduler
        self.metric = metric

        self.device = device
        self.logger = logger

    def save_checkpoint(self, output_dir: str):
        """Save the model and tokenizer to ``output_dir``."""
        self.logger.info("saving model..")
        # DataParallel does not expose save_pretrained; the wrapped model
        # lives on its ``module`` attribute.
        model_to_save = self.model
        if isinstance(model_to_save, torch.nn.DataParallel):
            model_to_save = model_to_save.module
        model_to_save.save_pretrained(output_dir)
        self.tokenizer.save_pretrained(output_dir)

    def optimize(self, loss: torch.Tensor):
        """Backpropagate ``loss``, clip gradients to norm 1.0 and step."""
        loss.backward()
        torch.nn.utils.clip_grad_norm_(self.model.parameters(), 1.0)
        self.optimizer.step()
        self.scheduler.step()
        self.optimizer.zero_grad()

    def train(self, epoch: int, log_interval: int):
        """Run one training epoch.

        Args:
            epoch: Zero-based epoch index (logged as ``epoch + 1``).
            log_interval: Number of steps between log lines. The running
                average loss is reset after every log line; the last step of
                the epoch is always logged.
        """
        self.model.train()

        train_loss = 0.0
        n_data = 0
        for i, batch in enumerate(self.train_loader, 1):
            input_ids, attention_mask, token_type_ids, labels = [x.to(self.device) for x in batch]
            output = self.model(
                input_ids, token_type_ids=token_type_ids, attention_mask=attention_mask, labels=labels
            )
            # Under DataParallel the loss comes back with one entry per
            # replica; mean() reduces it and is a no-op for a scalar loss.
            loss = output['loss'].mean()
            self.optimize(loss)

            # Accumulate a plain Python float so the running sum does not keep
            # autograd history alive across steps.
            batch_size = input_ids.size(0)
            train_loss += loss.item() * batch_size
            n_data += batch_size

            # Steps are 1-based, so the final step equals len(train_loader).
            if i % log_interval == 0 or i == len(self.train_loader):
                self.logger.info(f"[Epoch {epoch+1} / Step {i}] " + "loss={:.4f}".format(train_loss / n_data))
                train_loss = 0.0
                n_data = 0

    @torch.no_grad()
    def eval(self, best_score: float, output_dir: str):
        """Evaluate on the validation loader and checkpoint on improvement.

        Args:
            best_score: Best Spearman correlation seen so far.
            output_dir: Directory the checkpoint is written to when the
                current score is strictly greater than ``best_score``.

        Returns:
            The updated best score.
        """
        self.model.eval()

        valid_loss = 0.0
        for batch in self.valid_loader:
            input_ids, attention_mask, token_type_ids, labels = [x.to(self.device) for x in batch]
            output = self.model(
                input_ids, token_type_ids=token_type_ids, attention_mask=attention_mask, labels=labels
            )
            # mean() handles the per-replica loss vector of DataParallel.
            valid_loss += output['loss'].mean().item() * input_ids.size(0)
            logits = output['logits']

            self.metric.add_batch(predictions=logits, references=labels)

        spearmanr = self.metric.compute()['spearmanr']
        valid_loss /= len(self.valid_loader.dataset)

        self.logger.info("[Valid] " + "loss={:.4f}  spearmanr={:.4f};".format(valid_loss, spearmanr))
        if spearmanr > best_score:
            best_score = spearmanr
            self.logger.info("Hit the best score")
            self.save_checkpoint(output_dir)
        return best_score


In [4]:
# utils
from datetime import datetime
import json
import os

from pytz import timezone


def read_json(file_path):
    """Load and return the JSON document stored at ``file_path``.

    The file is decoded as UTF-8 explicitly so that non-ASCII text is read the
    same way regardless of the platform's default locale encoding.

    Args:
        file_path: Path of the JSON file to read.

    Returns:
        The deserialized Python object.
    """
    with open(file_path, encoding="utf-8") as f:
        return json.load(f)


def kst(sec, what):
    """Return the current time in the Asia/Seoul timezone as a time tuple.

    Installed elsewhere in this file as ``logging.Formatter.converter`` so log
    timestamps are rendered in Korean Standard Time. Both arguments are
    ignored; presumably they receive the formatter instance and the record's
    creation time when logging calls the converter.
    """
    seoul_now = datetime.now(timezone('Asia/Seoul'))
    return seoul_now.timetuple()


def make_dirs(directory):
    """Create ``directory`` and any missing parents; do nothing if it exists."""
    if os.path.isdir(directory):
        return
    os.makedirs(directory, exist_ok=True)


In [5]:
# class MyArgs():
#   def __init__(self):
#     self.epochs = 10
#     self.log_interval = 100
#     self.lr = 5e-05
#     self.model_dir = './model'
#     self.max_length = 512
#     self.batch_size = 8
#     self.model_name_or_path = 'bert-base-uncased'
#     self.seed = 1
#     self.output_dir = './output'
#     self.data_dir = '/data'
#     self.train_batch_size = 8
#     self.valid_batch_size = 8
#     self.warmup_proportion = 0.1
class MyArgs():
    """Hard-coded hyperparameters standing in for command-line arguments.

    ``train`` reads ``seed``, ``model_dir``, ``model_name_or_path``,
    ``train_batch_size``, ``valid_batch_size``, ``epochs``,
    ``warmup_proportion``, ``lr`` and ``log_interval``; ``inference`` reads
    ``model_dir``, ``max_length``, ``batch_size`` and ``output_dir``.
    """

    def __init__(self):
        self.epochs = 10
        self.log_interval = 100
        self.lr = 5e-05
        self.model_dir = './model'
        self.max_length = 512
        self.batch_size = 1
        self.model_name_or_path = 'microsoft/deberta-base'
        self.seed = 1
        self.output_dir = './output'
        self.data_dir = '/data'
        self.train_batch_size = 1
        self.valid_batch_size = 1
        self.warmup_proportion = 0.1

    def __repr__(self):
        # Logging the args object should record the actual hyperparameters,
        # not the default "<MyArgs object at 0x...>" representation.
        fields = ", ".join(f"{name}={value!r}" for name, value in vars(self).items())
        return f"{type(self).__name__}({fields})"

In [None]:
import argparse
import logging
import os

from datasets import load_dataset, load_metric
from transformers import (
    AdamW,
    AutoConfig,
    # AutoModelForSequenceClassification,
    # AutoTokenizer,
    DebertaTokenizer, 
    DebertaForSequenceClassification,
    get_linear_schedule_with_warmup,
    set_seed,
)
import torch

# Restrict the CUDA devices visible to this process to device index 0.
os.environ["CUDA_VISIBLE_DEVICES"] = "0"


def build_adamw_optimizer(lr, model, num_train_steps, num_warmup_steps, global_step=0):
    """Create an AdamW optimizer and a linear warmup/decay schedule for ``model``.

    Parameters whose name contains ``bias`` or ``LayerNorm.weight`` get no
    weight decay; every other parameter uses a weight decay of 0.01.

    Args:
        lr: Peak learning rate.
        model: Module whose ``named_parameters()`` are optimized.
        num_train_steps: Total number of scheduler steps.
        num_warmup_steps: Number of steps spent warming up.
        global_step: Step to resume the schedule from; ``0`` starts fresh.

    Returns:
        An ``(optimizer, scheduler)`` tuple.
    """
    no_decay = ['bias', 'LayerNorm.weight']

    # Split the parameters in a single pass, preserving their original order.
    decayed, undecayed = [], []
    for name, param in model.named_parameters():
        if any(marker in name for marker in no_decay):
            undecayed.append(param)
        else:
            decayed.append(param)

    param_groups = [
        {'params': decayed, 'weight_decay': 0.01},
        {'params': undecayed, 'weight_decay': 0.0},
    ]
    optimizer = AdamW(param_groups, lr=lr)

    # Record the starting learning rate on every group up front; presumably
    # needed so the scheduler can be built with last_epoch != -1 when resuming.
    for group in optimizer.param_groups:
        group['initial_lr'] = lr

    last_epoch = global_step if global_step != 0 else -1
    scheduler = get_linear_schedule_with_warmup(
        optimizer,
        num_warmup_steps=num_warmup_steps,
        num_training_steps=num_train_steps,
        last_epoch=last_epoch,
    )
    return optimizer, scheduler


def train(args):
    """Fine-tune a DeBERTa regression model on GLUE STS-B.

    Sets up file logging under ``args.model_dir``, builds the tokenizer, data
    loaders, model, optimizer and metric, then alternates one training epoch
    with one validation pass for ``args.epochs`` epochs. The checkpoint with
    the best validation Spearman correlation is written to ``args.model_dir``.

    Args:
        args: Object exposing ``seed``, ``model_dir``, ``model_name_or_path``,
            ``train_batch_size``, ``valid_batch_size``, ``epochs``,
            ``warmup_proportion``, ``lr`` and ``log_interval``. An optional
            ``max_length`` attribute sets the tokenized sequence length.
    """
    # Set logger & seed
    set_seed(args.seed)
    make_dirs(args.model_dir)
    logging.Formatter.converter = kst
    logging.basicConfig(filename=os.path.join(args.model_dir, 'logs_train.txt'),
                        filemode='w', format='%(asctime)s -  %(message)s',
                        datefmt='%m/%d/%Y %H:%M:%S', level=logging.INFO)
    logger = logging.getLogger(__name__)
    logger.info(args)

    # Set device
    num_gpus = torch.cuda.device_count()
    use_cuda = num_gpus > 0
    device = torch.device("cuda" if use_cuda else "cpu")

    logger.info(f"***** using {device} *****")
    logger.info(f"***** num GPU: {num_gpus} *****")

    # Build PLM config & tokenizer
    config = AutoConfig.from_pretrained(
        args.model_name_or_path,
        num_labels=1,  # single output => regression head
        finetuning_task="stsb"
    )
    tokenizer = DebertaTokenizer.from_pretrained(
        args.model_name_or_path,
        use_fast=True,
    )

    # Build data loader. Honour args.max_length when present so training uses
    # the same sequence length as inference; STSDataLoader falls back to the
    # tokenizer's model_max_length when it is None.
    datasets = load_dataset("glue", "stsb")
    sts_dataloader = STSDataLoader(tokenizer, getattr(args, "max_length", None))
    train_loader = sts_dataloader.get_dataloader(
        data=list(datasets['train']),
        batch_size=args.train_batch_size,
    )
    valid_loader = sts_dataloader.get_dataloader(
        data=list(datasets['validation']),
        batch_size=args.valid_batch_size,
    )

    # Build model
    model = DebertaForSequenceClassification.from_pretrained(
        args.model_name_or_path,
        config=config,
    )
    if num_gpus > 1:
        model = torch.nn.DataParallel(model)
    model = model.to(device)

    # Build optimizer
    total_train_steps = len(train_loader) * args.epochs
    num_warmup_steps = int(args.warmup_proportion * total_train_steps)
    optimizer, scheduler = build_adamw_optimizer(args.lr,
                                                 model,
                                                 total_train_steps,
                                                 num_warmup_steps)

    # Set metric
    metric = load_metric("glue", "stsb")

    # Build trainer
    trainer = Trainer(model=model,
                      tokenizer=tokenizer,
                      train_loader=train_loader,
                      valid_loader=valid_loader,
                      optimizer=optimizer,
                      scheduler=scheduler,
                      metric=metric,
                      device=device,
                      logger=logger)

    logger.info("***** training start *****")
    logger.info("Learning rate: " + f"{args.lr}")
    logger.info(f"Batch_size : {args.train_batch_size * max(num_gpus, 1)}")

    # Spearman correlation can be zero or negative; starting from -inf
    # guarantees the first finite score is checkpointed so a model always
    # exists for inference.
    best_score = float("-inf")
    for epoch in range(args.epochs):
        trainer.train(epoch, args.log_interval)
        best_score = trainer.eval(best_score, args.model_dir)


# parser = argparse.ArgumentParser()
# parser.add_argument(
#     "--model_name_or_path", type=str, default='bert-base-uncased',
#     help='Path to pretrained model or model name from huggingface'
# )
# parser.add_argument(
#     '--train_batch_size', type=int, default=1,
#     help='input batch size for training'
# )
# parser.add_argument(
#     '--valid_batch_size', type=int, default=1,
#     help='input batch size for validing'
# )
# parser.add_argument(
#     '--epochs', type=int, default=10,
#     help='number of epochs to train'
# )
# parser.add_argument(
#     '--lr', type=float, default=5e-5,
#     help='learning rate'
# )
# parser.add_argument(
#     '--warmup_proportion', type=float, default=0.1,
#     help="Proportion of lr increasing steps"
# )
# parser.add_argument(
#     '--seed', type=int, default=1,
#     help='random seed'
# )
# parser.add_argument(
#     '--log_interval', type=int, default=100,
#     help='how many batches to wait before logging training status'
# )

# # Container environment
# parser.add_argument(
#     "--model_dir", type=str, default=os.environ.get('SM_MODEL_DIR', './model'),
#     help='path to save output'
# )
# args = parser.parse_args()


# Start fine-tuning with the hard-coded notebook hyperparameters.
train(MyArgs())


  0%|          | 0/3 [00:00<?, ?it/s]

Some weights of the model checkpoint at microsoft/deberta-base were not used when initializing DebertaForSequenceClassification: ['lm_predictions.lm_head.dense.weight', 'lm_predictions.lm_head.bias', 'lm_predictions.lm_head.LayerNorm.weight', 'lm_predictions.lm_head.LayerNorm.bias', 'lm_predictions.lm_head.dense.bias']
- This IS expected if you are initializing DebertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DebertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of DebertaForSequenceClassification were not initialized from the model checkpoint at microsoft/deberta-base and are newly initialized: ['classifier.bias', 'pooler.den

In [None]:
import argparse
import os

from transformers import DebertaTokenizer, DebertaForSequenceClassification
import torch


@torch.no_grad()
def inference(args):
    """Score the GLUE STS-B test split with a saved model.

    Loads the model and tokenizer from ``args.model_dir``, runs every test
    pair through the model and writes one predicted score per line to
    ``output.csv`` inside ``args.output_dir``.

    Args:
        args: Object exposing ``model_dir``, ``max_length``, ``batch_size``
            and ``output_dir``.
    """
    # Set device
    num_gpus = torch.cuda.device_count()
    use_cuda = num_gpus > 0
    device = torch.device("cuda" if use_cuda else "cpu")

    # Load model
    model = DebertaForSequenceClassification.from_pretrained(args.model_dir)
    model.to(device)
    model.eval()

    # Load tokenizer
    tokenizer = DebertaTokenizer.from_pretrained(args.model_dir)

    # Build data loader
    sts_dataloader = STSDataLoader(tokenizer, args.max_length)
    datasets = load_dataset("glue", "stsb")
    test_data = datasets['test']
    sts_test_loader = sts_dataloader.get_dataloader(
        data=test_data,
        batch_size=args.batch_size,
    )

    # Infer
    make_dirs(args.output_dir)
    output_path = os.path.join(args.output_dir, "output.csv")
    # The context manager closes the file even if a batch fails mid-loop.
    with open(output_path, "w") as output_file:
        for out in sts_test_loader:
            # Labels are part of each batch but are not needed for prediction.
            input_ids, attention_mask, token_type_ids, _ = [o.to(device) for o in out]
            output = model(input_ids, token_type_ids=token_type_ids, attention_mask=attention_mask)[0]

            preds = output.detach().cpu().numpy()

            for p in preds:
                # Each row holds a single regression output.
                score = p[0]
                output_file.write(f"{score}\n")


def main():
    """Run inference using the hard-coded ``MyArgs`` settings.

    The argparse options this entry point used to expose (batch_size,
    data_dir, model_dir, output_dir, max_length) are supplied by ``MyArgs``
    instead.
    """
    args = MyArgs()
    inference(args)


# Run inference as soon as this cell is executed.
main()
