In [1]:
import os
from pathlib import Path
import numpy as np
import pandas as pd
import torch
from tqdm import tqdm
import scml
from scml import pandasx as pdx
from lalaes2 import Aes2Model

In [2]:
# Notebook-wide setup: start the wall-clock timer, configure display options,
# seed the random number generators and choose the torch device.
tim = scml.Timer()
tim.start()
# Turn off the tokenizers library's internal parallelism.
os.environ["TOKENIZERS_PARALLELISM"] = "false"
percentiles=[.01, .05, .1, .2, .3, .4, .5, .6, .7, .8, .9, .95, .99]
# Effectively remove pandas' truncation limits when displaying frames.
pd.set_option("max_info_columns", 9999)
pd.set_option("display.max_columns", 9999)
pd.set_option("display.max_rows", 9999)
pd.set_option('max_colwidth', 9999)
tqdm.pandas()
scml.seed_everything()
info = np.iinfo(np.int16)
print(f"int16, min={info.min}, max={info.max}")
# Preferred GPU for this notebook; clamped below so that a host with fewer
# GPUs does not end up with a device index that does not exist.
PREFERRED_GPU_INDEX = 1
device = torch.device("cpu")
if torch.cuda.is_available():
    gpu_count = torch.cuda.device_count()
    device = torch.device(f"cuda:{min(PREFERRED_GPU_INDEX, gpu_count - 1)}")
    # Report name and current memory usage of every visible GPU.
    for i in range(gpu_count):
        print(f"device={i}, {torch.cuda.get_device_name(i)}")
        print('Mem Allocated:', round(torch.cuda.memory_allocated(i)/1024**3,1), 'GB')
        print('Mem Cached:   ', round(torch.cuda.memory_reserved(i)/1024**3,1), 'GB')
else:
    print("cpu")

int16, min=-32768, max=32767
device=0, NVIDIA GeForce RTX 4070 Ti SUPER
Mem Allocated: 0.0 GB
Mem Cached:    0.0 GB
device=1, NVIDIA GeForce RTX 4070 Ti SUPER
Mem Allocated: 0.0 GB
Mem Cached:    0.0 GB


In [3]:
# Training run directory and the Lightning checkpoint saved inside it.
dst_path = Path("models/aes2/deberta_v3_base/20240615_063400")
ckpt_path = dst_path / "lightning_logs/version_0/checkpoints/epoch=3-step=2004-val_loss=0.31256.ckpt"
# Load the raw checkpoint dict for inspection only.
# - map_location="cpu": tensors are placed on the CPU, so inspection does not
#   need (or occupy) the GPU the checkpoint was saved from.
# - weights_only=False: the checkpoint stores pickled non-tensor objects
#   (e.g. the entries under "hyper_parameters"), which a weights-only load
#   rejects. This unpickles arbitrary objects, so only use it on checkpoints
#   you trust.
checkpoint = torch.load(ckpt_path, map_location="cpu", weights_only=False)
print(checkpoint.keys())

dict_keys(['epoch', 'global_step', 'pytorch-lightning_version', 'state_dict', 'loops', 'callbacks', 'optimizer_states', 'lr_schedulers', 'hparams_name', 'hyper_parameters'])


In [4]:
# Show the hyper-parameters stored alongside the weights in the checkpoint.
hyper_parameters = checkpoint["hyper_parameters"]
print(hyper_parameters)

{'pretrained_dir': '/mnt/c/huggingface/microsoft/deberta-v3-base', 'lr': 1e-05, 'scheduler_conf': [<Section: reduce_lr_on_plateau>], 'model_class': 'auto', 'swa_start_epoch': -1, 'gradient_checkpointing': False, 'hidden_dropout_prob': 0.0, 'attention_probs_dropout_prob': 0.0, 'max_position_embeddings': 512}


In [5]:
# List the parameter names held in the checkpoint's state_dict.
state_dict_keys = checkpoint["state_dict"].keys()
print(state_dict_keys)

odict_keys(['model.deberta.embeddings.word_embeddings.weight', 'model.deberta.embeddings.LayerNorm.weight', 'model.deberta.embeddings.LayerNorm.bias', 'model.deberta.encoder.layer.0.attention.self.query_proj.weight', 'model.deberta.encoder.layer.0.attention.self.query_proj.bias', 'model.deberta.encoder.layer.0.attention.self.key_proj.weight', 'model.deberta.encoder.layer.0.attention.self.key_proj.bias', 'model.deberta.encoder.layer.0.attention.self.value_proj.weight', 'model.deberta.encoder.layer.0.attention.self.value_proj.bias', 'model.deberta.encoder.layer.0.attention.output.dense.weight', 'model.deberta.encoder.layer.0.attention.output.dense.bias', 'model.deberta.encoder.layer.0.attention.output.LayerNorm.weight', 'model.deberta.encoder.layer.0.attention.output.LayerNorm.bias', 'model.deberta.encoder.layer.0.intermediate.dense.weight', 'model.deberta.encoder.layer.0.intermediate.dense.bias', 'model.deberta.encoder.layer.0.output.dense.weight', 'model.deberta.encoder.layer.0.output.

In [6]:
%%time
# Rebuild the Aes2Model from the checkpoint file at ckpt_path.
# NOTE(review): the "newly initialized" warning in this cell's output presumably
# comes from constructing the base model before the checkpoint weights are
# applied — confirm against Aes2Model.
model = Aes2Model.load_from_checkpoint(ckpt_path)

[INFO|aes2.py:207] 2024-06-15 08:43:00,162 >> config.to_diff_dict={
  "id2label": {
    "0": "LABEL_0"
  },
  "label2id": {
    "LABEL_0": 0
  },
  "pad_token_id": 0,
  "problem_type": "regression",
  "_name_or_path": "/mnt/c/huggingface/microsoft/deberta-v3-base",
  "transformers_version": "4.41.2",
  "model_type": "deberta-v2",
  "position_buckets": 256,
  "norm_rel_ebd": "layer_norm",
  "share_att_key": true,
  "hidden_size": 768,
  "num_hidden_layers": 12,
  "num_attention_heads": 12,
  "intermediate_size": 3072,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.0,
  "attention_probs_dropout_prob": 0.0,
  "max_position_embeddings": 512,
  "type_vocab_size": 0,
  "initializer_range": 0.02,
  "relative_attention": true,
  "max_relative_positions": -1,
  "position_biased_input": false,
  "pos_att_type": [
    "p2c",
    "c2p"
  ],
  "vocab_size": 128100,
  "layer_norm_eps": 1e-07,
  "pooler_hidden_size": 768,
  "pooler_dropout": 0,
  "pooler_hidden_act": "gelu",
  "gradient_checkpoin

Some weights of DebertaV2ForSequenceClassification were not initialized from the model checkpoint at /mnt/c/huggingface/microsoft/deberta-v3-base and are newly initialized: ['classifier.bias', 'classifier.weight', 'pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


CPU times: user 9.46 s, sys: 1.38 s, total: 10.8 s
Wall time: 13.9 s


In [7]:
%%time
# Export the wrapped model into the run directory via its save_pretrained method.
export_dir = str(dst_path)
model.model.save_pretrained(export_dir)

CPU times: user 920 ms, sys: 177 ms, total: 1.1 s
Wall time: 3.31 s


In [8]:
# Print the module tree of the wrapped model.
wrapped_model = model.model
print(wrapped_model)

DebertaV2ForSequenceClassification(
  (deberta): DebertaV2Model(
    (embeddings): DebertaV2Embeddings(
      (word_embeddings): Embedding(128100, 768, padding_idx=0)
      (LayerNorm): LayerNorm((768,), eps=1e-07, elementwise_affine=True)
      (dropout): StableDropout()
    )
    (encoder): DebertaV2Encoder(
      (layer): ModuleList(
        (0-11): 12 x DebertaV2Layer(
          (attention): DebertaV2Attention(
            (self): DisentangledSelfAttention(
              (query_proj): Linear(in_features=768, out_features=768, bias=True)
              (key_proj): Linear(in_features=768, out_features=768, bias=True)
              (value_proj): Linear(in_features=768, out_features=768, bias=True)
              (pos_dropout): StableDropout()
              (dropout): StableDropout()
            )
            (output): DebertaV2SelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-07, elementwise_affine

In [9]:
# Stop the notebook timer and report the elapsed time.
tim.stop()
elapsed_text = str(tim.elapsed)
print("Total time taken " + elapsed_text)

Total time taken 0:00:25.847133
