In [1]:
!pwd

%load_ext autoreload
%autoreload 2

/app


In [2]:
sys.path.append('/app/src')
import torch
import os
import sys 
import wandb
import numpy as np
from loader.data import _load_data, DataCollatorForNumAdditions, GenerateLabels
from dataset.tokernizer import set_tokenizer, set_vocab_for_num_additions
from torch.utils.data import DataLoader
from loader.model import load_model_for_num_additions
os.environ["CUDA_VISIBLE_DEVICES"] = '0'

from transformers import TrainingArguments
from trainer.trainer import CustomTrainer as Trainer
from trainer.trainer import CustomTrainingArguments as TrainingArguments

# これをしないと，sagemathが勝手にintやfloatをinteger型やRealNuber型に変換してしまうしてしまう
preparser(False)

In [3]:
num_variables = 3
field = 'GF7'
max_coefficient = 200
max_degree=20
continous_ids = [2]

params = {'encoding_method': 'standard',
          'd_model': 512,
          'nhead': 8,
          'num_encoder_layers': 6,
          'num_decoder_layers': 6,
          'dim_feedforward': 2048,
          'dropout': 0.1,
          'max_sequence_length': 10000,
          'positional_encoding': 'embedding',
          'regression_weight': 1.0,}

import argparse

params = argparse.Namespace(**params)

trainset = _load_data(f'./data/pred_nadds/pred_nadds_n={num_variables}_field=GF7/F_matrix_nadds/train.F_matrix_nadds')
testset = _load_data(f'./data/pred_nadds/pred_nadds_n={num_variables}_field=GF7/F_matrix_nadds/test.F_matrix_nadds')


vocab = set_vocab_for_num_additions(num_variables, 
                  field=field, 
                  max_coeff=max_coefficient, 
                  max_degree=max_degree,
                  weight_mx_entry_bound=1000, 
                  continuous_coefficient=False, 
                continuous_exponent=False)

tokenizer = set_tokenizer(vocab)
# bins = np.array([0, 100, 150, 200, 400, 1000, 2000, 3000, 4000, 5000])
bins = np.array([0, 500])

num_classes = len(bins)
generate_labels = GenerateLabels(bins, True)
dc = DataCollatorForNumAdditions(tokenizer, generate_labels)
label_names = ['labels']
model = load_model_for_num_additions(params, num_classes=num_classes, vocab=vocab, tokenizer=tokenizer)


In [22]:
trainset.input_mask

In [4]:
## Set up trainer
trainer_config = TrainingArguments(
    output_dir                  = './output_tameshi',
    num_train_epochs            = 8,
    # max_steps_per_epoch         = params.max_steps_per_epoch,
    logging_steps               = 50,
    save_total_limit            = 1,
    dataloader_pin_memory       = True,
    bf16                        = True,
    # save_steps                  = 100,
    eval_steps                  = 100,
    label_names                 = label_names, 
    remove_unused_columns       = False,
    # per_device_train_batch_size = params.batch_size // count_cuda_devices(),
    eval_strategy               = 'steps',
    # torch_compile               = True,
    report_to                   = 'none',
    disable_tqdm                = True,
)

trainer = Trainer(
        args                            = trainer_config,
        model                           = model,
        train_dataset                   = trainset,
        eval_dataset                    = testset,
        data_collator                   = dc,
        # compute_metrics                 = _compute_metrics,
        # preprocess_logits_for_metrics   = preprocess_logits_for_metrics,
        # callbacks                       = [limit_steps_callback]
        )

In [8]:
# trainer.train()


In [5]:
loader = DataLoader(trainset, collate_fn=dc, batch_size=16, shuffle=False)

batch = next(iter(loader))

for k, v in batch.items():
    print(k, v.shape)

encoder_input torch.Size([16, 791])
encoder_padding_mask torch.Size([16, 791])
labels torch.Size([16, 2])


In [28]:
batch['labels']
# batch['encoder_padding_mask']


tensor([[0.7311, 0.2689],
        [0.2689, 0.7311],
        [0.7311, 0.2689],
        [0.7311, 0.2689],
        [0.7311, 0.2689],
        [0.7311, 0.2689],
        [0.2689, 0.7311],
        [0.7311, 0.2689],
        [0.2689, 0.7311],
        [0.2689, 0.7311],
        [0.7311, 0.2689],
        [0.7311, 0.2689],
        [0.7311, 0.2689],
        [0.2689, 0.7311],
        [0.7311, 0.2689],
        [0.7311, 0.2689]])

In [9]:
tokenizer.vocab['[PAD]']

2039

In [None]:
tokenizer.all_special_tokens

['<s>', '</s>', '[UNK]', '[SEP]', '[PAD]', '[CLS]']

In [11]:
batch['labels']

tensor([[0.7311, 0.2689],
        [0.2689, 0.7311],
        [0.7311, 0.2689],
        [0.7311, 0.2689],
        [0.7311, 0.2689],
        [0.7311, 0.2689],
        [0.2689, 0.7311],
        [0.7311, 0.2689],
        [0.2689, 0.7311],
        [0.2689, 0.7311],
        [0.7311, 0.2689],
        [0.7311, 0.2689],
        [0.7311, 0.2689],
        [0.2689, 0.7311],
        [0.7311, 0.2689],
        [0.7311, 0.2689]])

In [48]:
labels = np.array([1, 2, 3], dtype=np.int64)
num_classes = 4

# one_hot_labels = np.eye(num_classes)[labels]
# print(one_hot_labels)

# # soft_A = np.exp(A) / np.sum(np.exp(A))
# # soft_A
# correct_class = 3
# B = np.arange(num_classes)
# print(B)
# C = np.ones(num_classes) * correct_class
# print(C)
# D = C - B
# print(D)
# D = -np.abs(D)
# E = np.exp((D)) / np.sum(np.exp(D))
# print(E)

def get_soft_label(correct_class_id, num_classes):
    assert 0 <= correct_class_id and correct_class_id < num_classes, 'correct_class_id は0以上かつnum_classes未満の整数である必要がある'
    
    diffs = np.ones(num_classes) * correct_class_id - np.arange(num_classes)
    dist = np.abs(diffs)
    get_soft_label = np.exp(-dist) / np.sum(np.exp(-dist))
    print(get_soft_label.dtype)
    return get_soft_label



soft_labels = np.zeros((0, num_classes))
for label in labels:
    soft_labels = np.vstack((soft_labels, get_soft_label(label, num_classes)))

print(soft_labels)



float64
float64
float64
[[0.19661193 0.53444665 0.19661193 0.07232949]
 [0.07232949 0.19661193 0.53444665 0.19661193]
 [0.0320586  0.08714432 0.23688282 0.64391426]]


torch.float32

In [40]:
np.random.random_integers(0, 3, 10)

  np.random.random_integers(0, 3, 10)


array([2, 3, 0, 2, 2, 3, 0, 0, 2, 1])

In [18]:
2 - 2

0