In [1]:
import sys
sys.path.append('/home/shuirh/project/research/invariant_learning')

In [2]:
import importlib
import os
import numpy as np

In [3]:
if 'trainer' in globals():
    importlib.reload(trainer)
else:
    import modules.trainer as trainer

In [4]:
import torch
from torch import nn
from torch.utils.data import DataLoader, Dataset
from torchvision import datasets
from torchvision.transforms import ToTensor

In [5]:
training_data = datasets.MNIST(
    root="data",
    train=True,
    download=True,
    transform=ToTensor(),
)
test_data = datasets.MNIST(
    root="data",
    train=False,
    download=True,
    transform=ToTensor(),
)
class MNIST_DS(Dataset):
    def __init__(self, vd):
        self.vd = vd
    def __len__(self):
        return len(self.vd)
    def __getitem__(self, idx):
        x,y = self.vd[idx]
        return {
            'x': x,
            'labels': y
        }
tr_ds = MNIST_DS(training_data)
te_ds = MNIST_DS(test_data)

In [6]:
# Model
class NeuralNetwork(nn.Module):
    def __init__(self):
        super(NeuralNetwork, self).__init__()
        self.flatten = nn.Flatten()
        self.linear_relu_stack = nn.Sequential(
            nn.Linear(28*28, 512),
            nn.ReLU(),
            nn.Linear(512, 512),
            nn.ReLU(),
            nn.Linear(512, 10)
        )

    def forward(self, x, labels = None):
        x = self.flatten(x)
        logits = self.linear_relu_stack(x)
        outputs = {'logits': logits}
        if labels is not None:
            loss = nn.CrossEntropyLoss()(logits, labels)
            outputs['loss'] = loss
        return outputs

model = NeuralNetwork()

In [7]:
def compute_accuracy(all_logits, all_labels):
    """
    Metric should start with `eval_`
    """
    all_preds = np.argmax(all_logits, axis = -1)
    num_samples = len(all_preds)
    acc = (all_preds == all_labels).sum() / num_samples
    metrics = {
        'eval_acc': acc
    }
    return metrics
args1 = trainer.SimpleTrainingArguments(
    output_dir = './trainer_output',
    eval_epochs = 1,
    num_train_epochs = 3,
    learning_rate = 1e-2,
    per_device_train_batch_size = 64
)
trainer1 = trainer.BasicTrainer(
    model,
    args = args1,
    train_dataset = tr_ds,
    eval_dataset = te_ds,
    compute_metrics = compute_accuracy
)

In [9]:
trainer1.train()

***** Running training *****
  Num Epochs = 3
  Instantaneous batch size per device = 64
  Total train batch size (w. parallel, distributed & accumulation) = 128
  Total optimization steps = 2814
  3%|█▍                                       | 97/2814 [00:02<00:43, 62.45it/s]{'loss': 0.6206926579773426, 'learning_rate': 0.0009644633972992182, 'step': 100, 'epoch': 0.11}
  7%|██▊                                     | 200/2814 [00:04<00:40, 65.07it/s]{'loss': 0.3264257598668337, 'learning_rate': 0.0009289267945984365, 'step': 200, 'epoch': 0.21}
 10%|████▏                                   | 295/2814 [00:05<00:33, 74.89it/s]{'loss': 0.23941356915980577, 'learning_rate': 0.0008933901918976545, 'step': 300, 'epoch': 0.32}
 14%|█████▌                                  | 395/2814 [00:06<00:33, 71.44it/s]{'loss': 0.19263759993016719, 'learning_rate': 0.0008578535891968728, 'step': 400, 'epoch': 0.43}
 18%|███████                                 | 495/2814 [00:08<00:32, 70.87it/s]{'loss': 0.179

{'eval_acc': 0.9621}


 35%|██████████████▏                         | 997/2814 [00:20<00:58, 30.93it/s]{'loss': 0.09905611813766882, 'learning_rate': 0.000644633972992182, 'step': 1000, 'epoch': 1.07}
 39%|███████████████▏                       | 1099/2814 [00:21<00:23, 73.07it/s]{'loss': 0.10436855559237301, 'learning_rate': 0.0006090973702914001, 'step': 1100, 'epoch': 1.17}
 42%|████████████████▌                      | 1193/2814 [00:23<00:27, 59.83it/s]{'loss': 0.08180068640038371, 'learning_rate': 0.0005735607675906183, 'step': 1200, 'epoch': 1.28}
 46%|██████████████████                     | 1299/2814 [00:25<00:25, 60.26it/s]{'loss': 0.080203139600344, 'learning_rate': 0.0005380241648898365, 'step': 1300, 'epoch': 1.39}
 50%|███████████████████▎                   | 1396/2814 [00:26<00:21, 65.93it/s]{'loss': 0.07547680045012385, 'learning_rate': 0.0005024875621890548, 'step': 1400, 'epoch': 1.49}
 53%|████████████████████▋                  | 1494/2814 [00:27<00:19, 66.34it/s]{'loss': 0.0712768396968022,

{'eval_acc': 0.9737}


 67%|██████████████████████████▎            | 1895/2814 [00:39<01:53,  8.07it/s]{'loss': 0.054000143457669764, 'learning_rate': 0.0003248045486851457, 'step': 1900, 'epoch': 2.03}
 71%|███████████████████████████▋           | 1998/2814 [00:40<00:09, 88.63it/s]{'loss': 0.04194109082454815, 'learning_rate': 0.0002892679459843639, 'step': 2000, 'epoch': 2.13}
 75%|█████████████████████████████          | 2099/2814 [00:41<00:11, 64.16it/s]{'loss': 0.04782524004112929, 'learning_rate': 0.0002537313432835821, 'step': 2100, 'epoch': 2.24}
 78%|██████████████████████████████▍        | 2194/2814 [00:43<00:09, 64.00it/s]{'loss': 0.03669497192371637, 'learning_rate': 0.00021819474058280028, 'step': 2200, 'epoch': 2.35}
 82%|███████████████████████████████▊       | 2298/2814 [00:44<00:06, 77.17it/s]{'loss': 0.0373785044712713, 'learning_rate': 0.00018265813788201848, 'step': 2300, 'epoch': 2.45}
 85%|█████████████████████████████████▏     | 2395/2814 [00:45<00:05, 73.22it/s]{'loss': 0.034438124380

{'eval_acc': 0.9832}





In [8]:
args2 = trainer.SimpleTrainingArguments(
    output_dir = './trainer_output',
    eval_steps = 1000,
    max_steps = 2000,
    learning_rate = 1e-2,
    per_device_train_batch_size = 64
)
trainer2 = trainer.BasicTrainer(
    model,
    args = args2,
    train_dataset = tr_ds,
    eval_dataset = te_ds,
    compute_metrics = compute_accuracy
)

In [9]:
trainer2.train()

***** Running training *****
  Num Epochs = 3
  Instantaneous batch size per device = 64
  Total train batch size (w. parallel, distributed & accumulation) = 128
  Total optimization steps = 2000
  5%|█▉                                       | 95/2000 [00:02<00:22, 84.80it/s]{'loss': 0.6270670919120311, 'learning_rate': 0.00095, 'step': 100, 'epoch': 0.11}
 10%|███▉                                    | 198/2000 [00:03<00:20, 86.48it/s]{'loss': 0.32939486429095266, 'learning_rate': 0.0009000000000000001, 'step': 200, 'epoch': 0.21}
 15%|█████▉                                  | 294/2000 [00:04<00:21, 80.11it/s]{'loss': 0.24090489789843558, 'learning_rate': 0.00085, 'step': 300, 'epoch': 0.32}
 20%|███████▉                                | 398/2000 [00:06<00:24, 65.05it/s]{'loss': 0.19022969640791415, 'learning_rate': 0.0008, 'step': 400, 'epoch': 0.43}
 25%|█████████▉                              | 498/2000 [00:08<00:23, 64.12it/s]{'loss': 0.180901157297194, 'learning_rate': 0.00075, 's

In [10]:
args3 = trainer.SimpleTrainingArguments(
    eval_steps = 1000,
    max_steps = 2000,
    learning_rate = 1e-2,
    per_device_train_batch_size = 64
)
trainer3 = trainer.BasicTrainer(
    model,
    args = args3,
    train_dataset = tr_ds,
    eval_dataset = te_ds,
    compute_metrics = compute_accuracy
)

In [11]:
trainer3.train()

***** Running training *****
  Num Epochs = 3
  Instantaneous batch size per device = 64
  Total train batch size (w. parallel, distributed & accumulation) = 128
  Total optimization steps = 2000
  5%|█▉                                       | 94/2000 [00:01<00:30, 62.00it/s]{'loss': 0.07370475668925792, 'learning_rate': 0.00095, 'step': 100, 'epoch': 0.11}
 10%|███▉                                    | 198/2000 [00:02<00:20, 88.78it/s]{'loss': 0.08779692580457776, 'learning_rate': 0.0009000000000000001, 'step': 200, 'epoch': 0.21}
 14%|█████▊                                  | 290/2000 [00:03<00:21, 78.30it/s]{'loss': 0.07774433095939458, 'learning_rate': 0.00085, 'step': 300, 'epoch': 0.32}
 20%|███████▉                                | 394/2000 [00:05<00:20, 78.42it/s]{'loss': 0.07097152324859053, 'learning_rate': 0.0008, 'step': 400, 'epoch': 0.43}
 25%|█████████▉                              | 495/2000 [00:06<00:21, 70.44it/s]{'loss': 0.07783148474991322, 'learning_rate': 0.00075,

In [12]:
from transformers import BertForSequenceClassification

In [15]:
model = BertForSequenceClassification.from_pretrained('bert-base-chinese', num_class = 21)

TypeError: __init__() got an unexpected keyword argument 'num_class'

In [44]:
trainer3.lr_scheduler.get_last_lr()

[0.0]

## Collator

In [24]:
import modules
from transformers.data.data_collator import default_data_collator
from torch.utils.data import DataLoader

In [18]:
import pickle

In [31]:
with open('/home/shuirh/project/research/invariant_learning/rnp/cache/cail_transformer_ds.pkl', 'rb') as f:
    train,dev,test = pickle.load(f)

In [32]:
train.set_label_name('labels_charge')

In [25]:
train_dl = DataLoader(train, batch_size = 8, collate_fn = default_data_collator)

In [33]:
for k,v in train[2].items():
    print(f"{k}: {len(v) if isinstance(v, list) else 0}")

input_ids: 518
attention_mask: 518
token_type_ids: 518
labels: 0


In [34]:
type(train[0]['input_ids'])

list

In [42]:
(torch.LongTensor([1,2,3]) == torch.LongTensor([1,2,0])).float().mean()

tensor(0.6667)