In [1]:
!nvidia-smi

Sun May  1 13:34:41 2022       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 460.32.03    Driver Version: 460.32.03    CUDA Version: 11.2     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  Tesla K80           Off  | 00000000:00:04.0 Off |                    0 |
| N/A   35C    P8    26W / 149W |      0MiB / 11441MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

In [2]:
from google.colab import drive
import sys
drive.mount('/content/drive')
#设置路径
sys.path.append('/content/drive/MyDrive/Colab Notebooks')

Mounted at /content/drive


In [3]:
! pip install transformers==4.0.1

Collecting transformers==4.0.1
  Downloading transformers-4.0.1-py3-none-any.whl (1.4 MB)
[K     |████████████████████████████████| 1.4 MB 13.2 MB/s 
Collecting sacremoses
  Downloading sacremoses-0.0.49-py3-none-any.whl (895 kB)
[K     |████████████████████████████████| 895 kB 38.2 MB/s 
[?25hCollecting tokenizers==0.9.4
  Downloading tokenizers-0.9.4-cp37-cp37m-manylinux2010_x86_64.whl (2.9 MB)
[K     |████████████████████████████████| 2.9 MB 46.0 MB/s 
Installing collected packages: tokenizers, sacremoses, transformers
Successfully installed sacremoses-0.0.49 tokenizers-0.9.4 transformers-4.0.1


In [4]:
# torch版本为1.6
! pip install torch==1.6.0

Collecting torch==1.6.0
  Downloading torch-1.6.0-cp37-cp37m-manylinux1_x86_64.whl (748.8 MB)
[K     |████████████████████████████████| 748.8 MB 18 kB/s 
Installing collected packages: torch
  Attempting uninstall: torch
    Found existing installation: torch 1.11.0+cu113
    Uninstalling torch-1.11.0+cu113:
      Successfully uninstalled torch-1.11.0+cu113
[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
torchvision 0.12.0+cu113 requires torch==1.11.0, but you have torch 1.6.0 which is incompatible.
torchtext 0.12.0 requires torch==1.11.0, but you have torch 1.6.0 which is incompatible.
torchaudio 0.11.0+cu113 requires torch==1.11.0, but you have torch 1.6.0 which is incompatible.[0m
Successfully installed torch-1.6.0


In [5]:
# torchvision 是PyTorch中专门用来处理图像的库
! pip install torchvision==0.7.0

Collecting torchvision==0.7.0
  Downloading torchvision-0.7.0-cp37-cp37m-manylinux1_x86_64.whl (5.9 MB)
[K     |████████████████████████████████| 5.9 MB 9.2 MB/s 
Installing collected packages: torchvision
  Attempting uninstall: torchvision
    Found existing installation: torchvision 0.12.0+cu113
    Uninstalling torchvision-0.12.0+cu113:
      Successfully uninstalled torchvision-0.12.0+cu113
Successfully installed torchvision-0.7.0


In [6]:
import torch
import random
import os
import numpy as np
import json
import pandas as pd
from tqdm import tqdm


config = {
    'train_file_path':'/content/drive/MyDrive/Colab Notebooks/dataset/ESIM/train.json',
    'dev_file_path':'/content/drive/MyDrive/Colab Notebooks/dataset/ESIM/dev.json',
    'test_file_path':'/content/drive/MyDrive/Colab Notebooks/dataset/ESIM/test.json',
    'model_path':'/content/drive/MyDrive/Colab Notebooks/dataset/BERT_model',
    'output_path': '.',
    'train_val_ratio':0.1,
    'vocab_size':30000,
    'batch_size':64,
    'max_seq_len':64,
    'num_epochs':1,
    'learning_rate':2e-5,
    'eps': 0.1,
    'alpha': 0.3,
    'adv': 'fgm',
    'warmup_ratio': 0.05,
    'weight_decay': 0.01,
    'use_bucket': True,
    'bucket_multiplier': 200,
    'n_gpus': 0,
    'use_amp': True, # 只针对有 tensor core 的gpu有效
    'ema_start_step': 500,
    'ema_start': False,
    'logging_step':100,
    'device': 'cuda',
    'seed':2022
}

if not torch.cuda.is_available():
  config['device'] = 'cpu'
else:
  config['n_gpus'] = torch.cuda.device_count()
  config['batch_size'] *= config['n_gpus']

if not os.path.exists(config['output_path']):
    os.makedirs((config['output_path']))

    
def seed_everything(seed):
  random.seed(seed)
  np.random.seed(seed)
  torch.manual_seed(seed)
  torch.cuda.manual_seed_all(seed)
  return seed

seed_everything(config['seed'])

2022

In [7]:
def parse_data(path, data_type='train'):
  sentence_a = []
  sentence_b = []
  labels = []

  with open(path, 'r', encoding = 'utf8') as f:
    for line in tqdm(f.readlines(), desc=f'Reading {data_type} data'):
      line = json.loads(line)
      sentence_a.append(line['sentence1'])
      sentence_b.append(line['sentence2'])
      if data_type != 'test':
        labels.append(int(line['label']))
      else:
        labels.append(0)

  df = pd.DataFrame(zip(sentence_a, sentence_b, labels), columns = ['text_a', 'text_b', 'labels'])
  return df

## encode和encode_plus的区别
1. encode仅返回input_ids
2. encode_plus返回所有的编码信息，具体如下：
’input_ids:是单词在词典中的编码; 
‘token_type_ids’:区分两个句子的编码（上句全为0，下句全为1）; 
‘attention_mask’:指定对哪些词进行self-Attention操作

```
model_name = 'bert-base-uncased'

# a.通过词典导入分词器
tokenizer = BertTokenizer.from_pretrained(model_name)
sentence = "Hello, my son is laughing."

print(tokenizer.encode(sentence))
print(tokenizer.encode_plus(sentence))


运行结果：

[101, 7592, 1010, 2026, 2365, 2003, 5870, 1012, 102]
{'input_ids': [101, 7592, 1010, 2026, 2365, 2003, 5870, 1012, 102], 'token_type_ids': [0, 0, 0, 0, 0, 0, 0, 0, 0], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1]}
```

In [8]:
# inputs: defaultdict(list)
def build_bert_inputs(inputs, label, sentence_a, sentence_b, tokenizer):
  # add_special_tokens [CLS] [SEP]
  # return_token_type_ids 该词属于sentence_a(返回0) or sentence_b(返回1). 
  # return_attention_mask pad=0, 不是pad的部分标为1， 是pad标为0.
  inputs_dict = tokenizer.encode_plus(sentence_a, sentence_b, add_special_tokens = True,
                     return_token_type_ids = True,
                     return_attention_mask = True)
  inputs['input_ids'].append(inputs_dict['input_ids'])
  inputs['token_type_ids'].append(inputs_dict['token_type_ids'])
  inputs['attention_mask'].append(inputs_dict['attention_mask'])
  inputs['labels'].append(label)

## defaultdict(list)
```
from collections import defaultdict
result = defaultdict(list)
data = [("p", 1), ("p", 2), ("p", 3),
     ("h", 1), ("h", 2), ("h", 3)]
 
for (key, value) in data:
    result[key].append(value)
print(result)#defaultdict(<class 'list'>, {'p': [1, 2, 3], 'h': [1, 2, 3]})

```

In [9]:
from collections import defaultdict
def read_data(config, tokenizer):
  train_df = parse_data(config['train_file_path'], data_type = 'train')
  dev_df = parse_data(config['dev_file_path'], data_type = 'dev')
  test_df = parse_data(config['test_file_path'], data_type = 'test')

  # 把这些 df 打包成字典
  data_df = {'train': train_df, 'dev': dev_df, 'test': test_df}
  #保存 BERT 的输入
  processed_data = {}
  # 遍历字典(data_df)
  for data_type, df in data_df.items():
    inputs = defaultdict(list)
    #遍历每一行
    for i, row in tqdm(df.iterrows(), desc= f'Preprocessing {data_type} data', total = len(df)):
      label = row[2]
      sentence_a, sentence_b = row[0], row[1]
      build_bert_inputs(inputs, label, sentence_a, sentence_b, tokenizer)

    processed_data[data_type] = inputs
  return processed_data

In [10]:
from transformers import BertTokenizer
tokenizer = BertTokenizer.from_pretrained(config['model_path'])
dt = read_data(config, tokenizer)

Reading train data: 100%|██████████| 34334/34334 [00:00<00:00, 83943.54it/s]
Reading dev data: 100%|██████████| 4316/4316 [00:00<00:00, 36928.44it/s]
Reading test data: 100%|██████████| 3861/3861 [00:00<00:00, 121829.66it/s]
Preprocessing train data: 100%|██████████| 34334/34334 [00:30<00:00, 1126.07it/s]
Preprocessing dev data: 100%|██████████| 4316/4316 [00:02<00:00, 1448.29it/s]
Preprocessing test data: 100%|██████████| 3861/3861 [00:02<00:00, 1347.08it/s]


In [11]:
print('train_df中 input_ids的第一条数据',dt['train']['input_ids'][0])
print('dev_df中 token_type_ids的第一条数据',dt['dev']['token_type_ids'][0])

train_df中 input_ids的第一条数据 [101, 6010, 6009, 955, 1446, 5023, 7583, 6820, 3621, 1377, 809, 2940, 2768, 1044, 2622, 1400, 3315, 1408, 102, 955, 1446, 3300, 1044, 2622, 1168, 3309, 6820, 3315, 1408, 102]
dev_df中 token_type_ids的第一条数据 [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1]


In [12]:
from torch.utils.data import Dataset
class AFQMCDataset(Dataset):
  def __init__(self, data_dict):
    super(AFQMCDataset, self).__init__()
    self.data_dict = data_dict
  
  # 返回一个example
  def __getitem__(self, idx):
    data = (self.data_dict['input_ids'][idx],
         self.data_dict['token_type_ids'][idx],
         self.data_dict['attention_mask'][idx],
         self.data_dict['labels'][idx])
    return data
  
  def __len__(self):
    return len(self.data_dict['input_ids'])

In [13]:
class Collator:
  def __init__(self, max_seq_len, tokenizer):
    self.max_seq_len = max_seq_len
    self.tokenizer = tokenizer
  
  def pad_and_truncate(self, input_ids_list, token_type_ids_list, attention_mask_list, labels_list, max_seq_len):
    input_ids = torch.zeros((len(input_ids_list), max_seq_len),dtype=torch.long)
    token_type_ids = torch.zeros_like(input_ids)
    attention_mask = torch.zeros_like(input_ids)
    
    for i in range(len(input_ids_list)):
      seq_len = len(input_ids_list[i])
      if seq_len <= max_seq_len:
        input_ids[i, :seq_len] = torch.tensor(input_ids_list[i], dtype = torch.long)
        token_type_ids[i, :seq_len] = torch.tensor(token_type_ids_list[i], dtype = torch.long)
        attention_mask[i, :seq_len] = torch.tensor(attention_mask_list[i], dtype = torch.long)
      else:
        # input_ids 最后一位放上一个特殊的token
        input_ids[i] = torch.tensor(input_ids_list[i][:max_seq_len-1] + [self.tokenizer.sep_token_id], dtype = torch.long)
        # token_type_ids 和 attention_mask 不需要加上特殊token
        token_type_ids[i] = torch.tensor(token_type_ids_list[i][:max_seq_len], dtype = torch.long)
        attention_mask[i] = torch.tensor(attention_mask_list[i][:max_seq_len], dtype = torch.long)
    labels = torch.tensor(labels_list, dtype = torch.long)
    return input_ids, token_type_ids, attention_mask, labels

  def __call__(self, examples):
    input_ids_list, token_type_ids_list, attention_mask_list, labels_list = list(zip(*examples))
    cur_max_seq_len = max(len(input_id) for input_id in input_ids_list)
    max_seq_len = min(cur_max_seq_len, self.max_seq_len)
    
    input_ids, token_type_ids, attention_mask, labels = self.pad_and_truncate(input_ids_list, token_type_ids_list, attention_mask_list, labels_list, max_seq_len)                     
    
    data_dict = {
        'input_ids': input_ids,
        'token_type_ids': token_type_ids,
        'attention_mask': attention_mask,
        'labels': labels
    }
    return data_dict

In [14]:
collate_fn = Collator(config['max_seq_len'], tokenizer)

## 采样（Dataloader）

![Dataloader](https://img-blog.csdnimg.cn/b80cee8a1c7d49b79e7b80cc81150d66.png?x-oss-process=image/watermark,type_ZmFuZ3poZW5naGVpdGk,shadow_10,text_aHR0cHM6Ly9ibG9nLmNzZG4ubmV0L3dlaXhpbl80MTI4NzA2MA==,size_16,color_FFFFFF,t_70)

## Sampler 
所有采样器都继承自Sampler这个类

每个Sampler子类都要实现iter方法【迭代数据集example索引的方法】，以及返回迭代器长度的len方法

![sampler](https://img-blog.csdnimg.cn/1c40aedade9f40a493b4df97d0c1def0.png?x-oss-process=image/watermark,type_ZmFuZ3poZW5naGVpdGk,shadow_10,text_aHR0cHM6Ly9ibG9nLmNzZG4ubmV0L3dlaXhpbl80MTI4NzA2MA==,size_16,color_FFFFFF,t_70)

### 顺序采样
![sequentialsampler](https://img-blog.csdnimg.cn/9e8ee018cea84729ac6b5742395d8ea2.png?x-oss-process=image/watermark,type_ZmFuZ3poZW5naGVpdGk,shadow_10,text_aHR0cHM6Ly9ibG9nLmNzZG4ubmV0L3dlaXhpbl80MTI4NzA2MA==,size_16,color_FFFFFF,t_70)

***在初始化时拿到数据集data_source， 按顺序对元素进行采样，每次只返回一个索引值 ***

In [15]:
# 顺序采样举例
# randperm 把 0-23 数据打乱 形成3维tensor
# (2,3,4) batch_size:2 seq_len=3, embedding_dim=4，每个 batch 有2条数据，每个句子包含3个词， 每个词的维度是4
a = torch.randperm(24).reshape((2,3,4))
print('a:',a)
b = torch.utils.data.SequentialSampler(a)
print('b:',b)
# i 是索引
for i in b:
    print(i)

a: tensor([[[ 5, 14,  9,  2],
         [13, 12, 20,  1],
         [16, 15,  7,  4]],

        [[17,  0,  3, 19],
         [10, 22,  6, 18],
         [ 8, 23, 11, 21]]])
b: <torch.utils.data.sampler.SequentialSampler object at 0x7f0ce6e01810>
0
1


### 随机采样
replacement : True 表示可以重复采样

num_samples: 指定采样的数量

PS:当使用replacement=False，不应制定num_samples
![randomsampler](https://img-blog.csdnimg.cn/9d2e2afdbe4d4df4aee3102e46054650.png?x-oss-process=image/watermark,type_ZmFuZ3poZW5naGVpdGk,shadow_10,text_aHR0cHM6Ly9ibG9nLmNzZG4ubmV0L3dlaXhpbl80MTI4NzA2MA==,size_16,color_FFFFFF,t_70)

In [16]:
# 随机采样举例
a = torch.randperm(60).reshape((5,3,4))
print('a:',a)
# 随机采样3条数据
b = torch.utils.data.RandomSampler(a, replacement=True, num_samples=3)
print('b:',b)
for i in b:
    print(i)

a: tensor([[[47, 36, 58, 12],
         [49,  7, 24,  9],
         [27, 13, 45,  0]],

        [[ 3, 28, 23, 39],
         [37, 29, 10, 59],
         [ 4, 35, 56, 53]],

        [[54, 32, 18, 42],
         [41, 46, 30, 14],
         [38, 22, 11,  5]],

        [[48, 33, 57, 26],
         [15, 19, 55, 16],
         [20, 40, 31,  6]],

        [[51, 17,  1, 25],
         [34,  2, 43, 21],
         [52, 50,  8, 44]]])
b: <torch.utils.data.sampler.RandomSampler object at 0x7f0c5c29b250>
3
3
2


### Subset随机采样
SubsetRandomSampler： 从给定的索引列表中随机采样元素，不放回采样 

indices(sequence): 索引序列
![sunsetRandomSampler](https://img-blog.csdnimg.cn/e80f6a1bafe042f28da652dc5a2388ab.png?x-oss-process=image/watermark,type_ZmFuZ3poZW5naGVpdGk,shadow_10,text_aHR0cHM6Ly9ibG9nLmNzZG4ubmV0L3dlaXhpbl80MTI4NzA2MA==,size_16,color_FFFFFF,t_70)

In [17]:
# Subset采样举例
a = torch.randperm(60).reshape((5,3,4))
print('a:',a)
# 从索引2以后的样本中随机采样
b = torch.utils.data.SubsetRandomSampler(indices=a[2:])
for i in b:
    print(i)

a: tensor([[[ 8, 18, 55, 16],
         [49, 54, 14,  7],
         [33, 37, 39,  2]],

        [[45,  6, 24, 29],
         [58, 57,  3, 47],
         [46, 56, 26, 21]],

        [[12, 25, 52, 40],
         [ 9, 53, 10, 50],
         [48, 59, 27, 22]],

        [[ 0, 20, 34, 13],
         [41, 32, 35, 51],
         [15,  4, 36, 38]],

        [[11, 19,  5, 43],
         [23, 31, 44, 30],
         [28,  1, 42, 17]]])
tensor([[ 0, 20, 34, 13],
        [41, 32, 35, 51],
        [15,  4, 36, 38]])
tensor([[12, 25, 52, 40],
        [ 9, 53, 10, 50],
        [48, 59, 27, 22]])
tensor([[11, 19,  5, 43],
        [23, 31, 44, 30],
        [28,  1, 42, 17]])


### 分批采样
sampler: 基采样器 

batch_size: size of mini-batch

drop_last=True, 如果一个batch的长度小于batch_size则丢弃
![BatchSampler](https://img-blog.csdnimg.cn/8a1b2f5ae320453c9fae8ae8e0ef2080.png?x-oss-process=image/watermark,type_ZmFuZ3poZW5naGVpdGk,shadow_10,text_aHR0cHM6Ly9ibG9nLmNzZG4ubmV0L3dlaXhpbl80MTI4NzA2MA==,size_16,color_FFFFFF,t_70)



In [18]:
# 分批采样举例
a = torch.randperm(60).reshape((5,3,4))
print('a:',a)
# 要传一个基采样器torch.utils.data.RandomSampler(a)
b = torch.utils.data.BatchSampler(torch.utils.data.RandomSampler(a), 2, drop_last=True)
# 上面的i都是一个数；现在是batch_size的列表
for i in b:
    print(i)

a: tensor([[[34, 50, 28, 24],
         [46,  0, 35, 21],
         [51, 52, 33, 59]],

        [[31,  5, 26, 42],
         [11, 49,  8, 29],
         [ 9, 17, 53, 36]],

        [[ 1, 37, 22, 40],
         [18, 20, 45,  7],
         [10, 47, 19, 32]],

        [[38, 14, 58,  3],
         [13, 25, 27, 48],
         [ 6, 44, 55, 30]],

        [[56,  2, 57, 12],
         [ 4, 23, 16, 15],
         [43, 54, 41, 39]]])
[3, 4]
[1, 0]


### 桶采样
sort_key: 按XXX排序

bucket_sampler: batch_size * bucket_size_multiplier 相当于 n * batch_size
；len(sampler)最大为数据集的长度
![BucketSampler](https://img-blog.csdnimg.cn/6413cea5dfbf4494a6b2b64504f74a97.png?x-oss-process=image/watermark,type_ZmFuZ3poZW5naGVpdGk,shadow_10,text_aHR0cHM6Ly9ibG9nLmNzZG4ubmV0L3dlaXhpbl80MTI4NzA2MA==,size_16,color_FFFFFF,t_70)

![SortedSampler](https://img-blog.csdnimg.cn/64f60217df474cf1b0d7aa1c3558cc1f.png?x-oss-process=image/watermark,type_ZmFuZ3poZW5naGVpdGk,shadow_10,text_aHR0cHM6Ly9ibG9nLmNzZG4ubmV0L3dlaXhpbl80MTI4NzA2MA==,size_16,color_FFFFFF,t_70)

![BucketSampler](https://img-blog.csdnimg.cn/d7e03938f2824f9cb8a6c3a895f5a78a.png?x-oss-process=image/watermark,type_ZmFuZ3poZW5naGVpdGk,shadow_10,text_aHR0cHM6Ly9ibG9nLmNzZG4ubmV0L3dlaXhpbl80MTI4NzA2MA==,size_16,color_FFFFFF,t_70)

In [19]:
# 桶采样举例
# Dataset -> 得到‘大桶'的排序索引

# 真实train中数据，前6条
mini_dataset = {k: v[:6] for k, v in dt['train'].items()}
mini_data = AFQMCDataset(mini_dataset)
print(mini_data)
# mini_data 的前6条数据的长度
for i, d in enumerate(mini_data):
    print(d[0]) # input_ids
    print(len(d[0]))

<__main__.AFQMCDataset object at 0x7f0c5c686310>
[101, 6010, 6009, 955, 1446, 5023, 7583, 6820, 3621, 1377, 809, 2940, 2768, 1044, 2622, 1400, 3315, 1408, 102, 955, 1446, 3300, 1044, 2622, 1168, 3309, 6820, 3315, 1408, 102]
30
[101, 6010, 6009, 5709, 1446, 6432, 2769, 6824, 5276, 671, 3613, 102, 6010, 6009, 5709, 1446, 6824, 5276, 6121, 711, 3221, 784, 720, 102]
24
[101, 2376, 2769, 4692, 671, 678, 3315, 3299, 5709, 1446, 6572, 1296, 3300, 3766, 3300, 5310, 3926, 102, 678, 3299, 5709, 1446, 6572, 1296, 102]
25
[101, 6010, 6009, 955, 1446, 1914, 7270, 3198, 7313, 5341, 1394, 6397, 844, 671, 3613, 102, 955, 1446, 2533, 6397, 844, 1914, 719, 102]
24
[101, 2769, 4638, 5709, 1446, 6572, 1296, 3221, 115, 115, 115, 8024, 6820, 3621, 2582, 720, 3221, 115, 115, 115, 102, 2769, 4638, 5709, 1446, 8024, 3299, 5310, 1139, 3341, 6432, 6375, 2769, 6820, 115, 115, 115, 1039, 8024, 2769, 5632, 2346, 5050, 749, 671, 678, 6422, 5301, 1399, 1296, 2769, 2418, 6421, 6820, 115, 115, 115, 1039, 102]
59
[101, 

In [20]:
from bucket_sampler import SortedSampler
random_sampler = torch.utils.data.RandomSampler(mini_data, replacement=False)
# print(list(random_sampler))
# 关于dataset的随机索引 [3, 5, 4, 1, 0, 2]

batch_sampler = torch.utils.data.BatchSampler(random_sampler, 4, drop_last=True)
# [0, 5, 2, 4] 【还有[1, 3] 但是丢弃了】

for samp in batch_sampler:
    print('samp:',samp)
    sorted_sampler = SortedSampler(samp, sort_key=lambda x:len(mini_data[x][0]))
    print('list_sorted_sampler:',list(sorted_sampler))

samp: [0, 3, 1, 4]
list_sorted_sampler: [1, 2, 0, 3]


```
[0, 5, 2, 4]分别对应mini_data中的长度[30, 28, 25, 59]

[2, 1, 0, 3] 

2（位置2的数据len最小） -> 2 -> 25 

1 -> 5 -> 28 

0 -> 0 -> 30 

3（位置3的数据len最大） -> 4 -> 59
```

In [21]:
# 得到‘大桶'的排序索引 -> 返回‘小桶'在‘大桶'中的位置
c = list(torch.utils.data.BatchSampler(sorted_sampler, 2, drop_last=True))
print(c)
# c 把大桶 分成 batch_size大小的小桶

[[1, 2], [0, 3]]


```
[[2, 1], [0, 3]]
```

In [22]:
for batch in torch.utils.data.SubsetRandomSampler(c):
    print('从给定的索引列表中随机采样元素')
    print(batch)
    print('所对应的原序列是什么：')
    print([samp[i] for i in batch])
    # 参考上面 from bucket_sampler import SortedSampler 单元格对应法则

从给定的索引列表中随机采样元素
[1, 2]
所对应的原序列是什么：
[3, 1]
从给定的索引列表中随机采样元素
[0, 3]
所对应的原序列是什么：
[0, 4]


```
从给定的索引列表中随机采样元素
[2, 1]
所对应的原序列是什么：
[2, 5]
从给定的索引列表中随机采样元素
[0, 3]
所对应的原序列是什么：
[0, 4]
```

In [23]:
# 采样在dataloader中使用
from torch.utils.data import DataLoader
from torch.utils.data import RandomSampler
from bucket_sampler import BucketBatchSampler

def build_dataloader(config, data, collate_fn):
  train_dataset = AFQMCDataset(data['train'])
  dev_dataset = AFQMCDataset(data['dev'])
  test_dataset = AFQMCDataset(data['test'])

  if config['use_bucket']:
    # 先放一个基采样器
    train_sampler = RandomSampler(train_dataset)
    # sort_key 以input_ids 的len排序
    bucket_sampler = BucketBatchSampler(train_sampler, 
                       batch_size = config['batch_size'],
                       drop_last = False,
                       sort_key = lambda x:len(train_dataset[x][0]),
                       bucket_size_multiplier = config['bucket_multiplier'])
    train_dataloader = DataLoader(dataset = train_dataset, batch_sampler = bucket_sampler,
                    num_workers = 4, collate_fn = collate_fn)
    
  else:
    train_dataloader = DataLoader(train_dataset, batch_size = config['batch_size'],
                    shuffle = True, num_workers = 4, collate_fn = collate_fn)
  dev_dataloader = DataLoader(dev_dataset, batch_size = config['batch_size'],
                  shuffle = False, num_workers = 4, collate_fn = collate_fn)
  test_dataloader = DataLoader(test_dataset, batch_size = config['batch_size'],
                  shuffle = False, num_workers = 4, collate_fn = collate_fn)
  return train_dataloader, dev_dataloader, test_dataloader  


In [24]:
train_dataloader, dev_dataloader, test_dataloader = build_dataloader(config, dt, collate_fn)

In [25]:
for i in train_dataloader:
    print('train_dataloader一个batch:',i)
    break

train_dataloader一个batch: {'input_ids': tensor([[ 101, 2769,  671,  ...,  955, 1446,  102],
        [ 101,  671, 2476,  ..., 1126,  702,  102],
        [ 101, 2769, 4500,  ..., 1168, 6572,  102],
        ...,
        [ 101, 2769, 4638,  ..., 1921, 6820,  102],
        [ 101, 2769, 4638,  ..., 5709, 1446,  102],
        [ 101, 2769,  955,  ...,  955, 1446,  102]]), 'token_type_ids': tensor([[0, 0, 0,  ..., 1, 1, 1],
        [0, 0, 0,  ..., 1, 1, 1],
        [0, 0, 0,  ..., 1, 1, 1],
        ...,
        [0, 0, 0,  ..., 1, 1, 1],
        [0, 0, 0,  ..., 1, 1, 1],
        [0, 0, 0,  ..., 1, 1, 1]]), 'attention_mask': tensor([[1, 1, 1,  ..., 1, 1, 1],
        [1, 1, 1,  ..., 1, 1, 1],
        [1, 1, 1,  ..., 1, 1, 1],
        ...,
        [1, 1, 1,  ..., 1, 1, 1],
        [1, 1, 1,  ..., 1, 1, 1],
        [1, 1, 1,  ..., 1, 1, 1]]), 'labels': tensor([0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0,
        0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 1, 1, 1, 1, 0, 1, 0,

## 自动混合精度（混合精度训练）

![amp](https://img-blog.csdnimg.cn/e4226734b82f462e983aa905de50891a.png?x-oss-process=image/watermark,type_ZmFuZ3poZW5naGVpdGk,shadow_10,text_aHR0cHM6Ly9ibG9nLmNzZG4ubmV0L3dlaXhpbl80MTI4NzA2MA==,size_16,color_FFFFFF,t_70)


### 混合精度训练
作用：训练时，尽量不降低性能，并提升速度 

Float16优点:

* 减少内存的使用
* 加快训练和推断的计算，能带来多一倍速的体验

Float16缺点:
* 溢出错误
* 舍入误差

In [26]:
# torch.FloatTensor 32位
a = torch.zeros(2,3)
print(a.type())

torch.FloatTensor


混合精度将 ***autocast*** 和 ***GradScaler*** 一起使用

***当进入autocast()时， 系统自动切换为float16, autocast上下文只包含前向传播，建议不用反向传播***

![amp2](https://img-blog.csdnimg.cn/72b642b508024cc2a6207c308347c7e7.png?x-oss-process=image/watermark,type_ZmFuZ3poZW5naGVpdGk,shadow_10,text_aHR0cHM6Ly9ibG9nLmNzZG4ubmV0L3dlaXhpbl80MTI4NzA2MA==,size_16,color_FFFFFF,t_70)

#### Gradient Scaling
* scaler.scale(loss) 将给定的损失乘以缩放器的当前比例因子，进行反向传播
* scaler.step(optimizer) 取消缩放梯度并调用optimizer.step()
* scaler.update() 更新缩放器的比例因子

![scaling](https://img-blog.csdnimg.cn/bbae5cdd360748ecb59cee8dc6f728f2.png?x-oss-process=image/watermark,type_ZmFuZ3poZW5naGVpdGk,shadow_10,text_aHR0cHM6Ly9ibG9nLmNzZG4ubmV0L3dlaXhpbl80MTI4NzA2MA==,size_16,color_FFFFFF,t_70)

![GradScaler](https://img-blog.csdnimg.cn/2c0fdf08602748ea8a7655f2d5bb1829.png?x-oss-process=image/watermark,type_ZmFuZ3poZW5naGVpdGk,shadow_10,text_aHR0cHM6Ly9ibG9nLmNzZG4ubmV0L3dlaXhpbl80MTI4NzA2MA==,size_16,color_FFFFFF,t_70)

![example](https://img-blog.csdnimg.cn/dfeebde4d34b496096062bb7dbbee7b6.png?x-oss-process=image/watermark,type_ZmFuZ3poZW5naGVpdGk,shadow_10,text_aHR0cHM6Ly9ibG9nLmNzZG4ubmV0L3dlaXhpbl80MTI4NzA2MA==,size_16,color_FFFFFF,t_70)

In [27]:
from sklearn.metrics import f1_score, accuracy_score
def evaluation(config, model, val_dataloader):
  model.eval()
  preds = []
  labels = []
  val_loss = 0.
  val_iterator = tqdm(val_dataloader, desc = 'Evaluation', total = len(val_dataloader))

  with torch.no_grad():
    for batch in val_iterator:
      labels.append(batch['labels'])
      batch_cuda = {item: value.to(config['device']) for item, value in list(batch.items())}
      loss, logits = model(**batch_cuda)[:2]

      if config['n_gpus'] > 1:
        loss = loss.mean()

      val_loss += loss.item()
      preds.append(logits.argmax(dim = -1).detach().cpu())

  avg_val_loss = val_loss / len(val_dataloader)
  labels = torch.cat(labels, dim = 0).numpy()
  preds = torch.cat(preds, dim = 0).numpy()
  f1 = f1_score(labels, preds)
  acc = accuracy_score(labels, preds)
  return avg_val_loss, f1, acc

In [28]:
class EMA:
  def __init__(self, model, decay):
    self.model = model
    self.decay = decay
    self.shadow = {}
    self.backup = {}
    self.register()

  def register(self):
    for name, param in self.model.named_parameters():
      if param.requires_grad:
        self.shadow[name] = param.data.clone()

  def update(self):
    for name, param in self.model.named_parameters():
      if param.requires_grad:
        # 如果 name in self.shadow 则运行下面两行代码， 否则报错
        assert name in self.shadow
        new_average = (1.0 - self.decay) * param.data + self.decay * self.shadow[name]
        self.shadow[name] = new_average.clone()

  def apply_shadow(self):
    for name, param in self.model.named_parameters():
      if param.requires_grad:
        assert name in self.shadow
        self.backup[name] = param.data
        param.data = self.shadow[name]
 
  def restore(self):
    for name, param in self.model.named_parameters():
      if param.requires_grad:
        assert name in self.backup
        param.data = self.backup[name]
    self.backup = {}
  

In [29]:
from transformers import BertForSequenceClassification
from torch.cuda import amp
from transformers import AdamW
from extra_pgd import *
from extra_loss import *
from extra_fgm import *
from extra_optim import *
from tqdm import trange
def train(config, train_dataloader, dev_dataloader):
  # 封装好 BertForSequenceClassification
  model = BertForSequenceClassification.from_pretrained(config['model_path'])

  # param_optimizer = model.named_parameters()
  param_optimizer = list(model.named_parameters())

  # 实例化scaler对象 enabled=True 可以使用梯度缩放
  scaler = amp.GradScaler(enabled = config['use_amp'])

  # 权重缩减
  no_decay = ['bias', 'LayerNorm.weight']

  # 名称包含 ['bias', 'LayerNorm.weight']的权重， 其权重衰减因子为0
  # 名称不包含 ['bias', 'LayerNorm.weight']的权重， 其权重衰减因子为 0.01
  # any() 理解成any True的意思，是否存在True，只要有一个是True，结果就是True
  optimizer_grouped_parameters = [
    {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)],
     'weight_decay': config['weight_decay']},
    {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)],
     'weight_decay': 0.0}
  ]

  optimizer = AdamW(optimizer_grouped_parameters, lr = config['learning_rate'],
            eps = 1e-8)
  
  # lookahead 预先查看由 AdamW 生成的快速权重 来选择搜索方向
  optimizer = Lookahead(optimizer, 5, 1)
  total_steps = config['num_epochs'] * len(train_dataloader)

  # 使用Warmup来调整学习率，每调用warmup_steps次，对应的学习率就会调整一次。
  lr_scheduler = WarmupLinearSchedule(optimizer, warmup_steps = int(config['warmup_ratio'] * total_steps),
                     t_total = total_steps)
  

                                  
  model.to(config['device'])

  if config['adv'] == 'fgm':
    fgm = FGM(model)
  else:
    pgd = PGD(model)
    K = 3

  epoch_iterator = trange(config['num_epochs'])
  global_steps = 0
  train_loss = 0.
  logging_loss = 0.
  best_acc = 0.
  best_model_path = ''

  # 多卡情况
  if config['n_gpus'] > 1:
    model = nn.DataParallel(model)
  for _ in epoch_iterator:
    train_iterator = tqdm(train_dataloader, desc = 'Trainging', total = len(train_dataloader))
    model.train()
    for batch in train_iterator:
      batch_cuda = {item: value.to(config['device']) for item, value in list(batch.items())}

      # 前向过程（前向传播 + loss）
      with amp.autocast(enabled = config['use_amp']):
        loss = model(**batch_cuda)[0]
        # 多卡 取平均
        if config['n_gpus'] > 1:
          loss = loss.mean()
        
        scaler.scale(loss).backward()

      if config['adv'] == 'fgm':
          # 在embedding上加扰动
        fgm.attack(epsilon = config['eps'])

          # autocast
        with amp.autocast(enabled = config['use_amp']):
          loss_adv = model(**batch_cuda)[0]

          if config['n_gpus'] > 1:
            loss_adv =loss_adv.mean()

        scaler.scale(loss_adv).backward()
        # 恢复embedding参数
        fgm.restore()
      else:
        pgd.backup_grad()
        for t in range(K):
          pgd.attack(epsilon = config['eps'], alpha = config['alpha'], is_first_attack= ( t == 0))
          if t != K - 1:
            model.zero_grad()
          else:
            pgd.restore_grad()
          with amp.autocast(enabled = config['use_amp']):
            loss_adv = model(**batch_cuda)[0]
            if config['n_gpus'] > 1:
              loss_adv = loss_adv.mean()

          scaler.scale(loss_adv).backward()
        pgd.restore()
        
      scaler.step(optimizer)
      scaler.update()

      lr_scheduler.step()
      optimizer.zero_grad()

      if config['ema_start']:
        ema.update()
        
      train_loss += loss.item()
      global_steps += 1

      train_iterator.set_postfix_str(f'running train loss: {loss.item():.5f}')

      if global_steps % config['logging_step'] == 0:
        if global_steps >= config['ema_start_step'] and not config['ema_start']:
          print('\n>>> EMA starting .....')
          config['ema_start'] = True

          ema = EMA(model.module if hasattr(model, 'module') else model, decay = 0.99)

        print_train_loss = (train_loss - logging_loss) / config['logging_step']
        logging_loss = train_loss

        if config['ema_start']:
          ema.apply_shadow()
        val_loss, f1, acc = evaluation(config, model, dev_dataloader)

        print_log = f'\n>>> training loss: {print_train_loss:.6f}, valid loss: {val_loss:.6f},' 

        if acc > best_acc:
          model_save_path = os.path.join(config['output_path'],
                          f'checkpoint- {global_steps} - {acc:.6f}')
          model_to_save = model.module if hasattr(model, 'module') else model
          model_to_save.save_pretrained(model_save_path)
          best_acc = acc
          best_model_path = model_save_path
        print_log += f'valid f1: {f1:.6f}, valid acc:{acc:.6f}'

        print(print_log)
        model.train()

        if config['ema_start']:
          ema.restore()


  return model, best_model_path        

      

In [30]:
train(config, train_dataloader, dev_dataloader)

Some weights of the model checkpoint at /content/drive/MyDrive/Colab Notebooks/dataset/BERT_model were not used when initializing BertForSequenceClassification: ['cls.predictions.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.decoder.weight', 'cls.seq_relationship.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not 


>>> training loss: 0.608127, valid loss: 0.581218,valid f1: 0.000000, valid acc:0.689991



Trainging:  19%|█▊        | 100/537 [07:06<1:33:45, 12.87s/it, running train loss: 0.67692][A
Trainging:  19%|█▉        | 101/537 [07:06<1:15:10, 10.35s/it, running train loss: 0.67692][A
Trainging:  19%|█▉        | 101/537 [07:10<1:15:10, 10.35s/it, running train loss: 0.57110][A
Trainging:  19%|█▉        | 102/537 [07:10<1:01:56,  8.54s/it, running train loss: 0.57110][A
Trainging:  19%|█▉        | 102/537 [07:14<1:01:56,  8.54s/it, running train loss: 0.53864][A
Trainging:  19%|█▉        | 103/537 [07:14<50:34,  6.99s/it, running train loss: 0.53864]  [A
Trainging:  19%|█▉        | 103/537 [07:19<50:34,  6.99s/it, running train loss: 0.58431][A
Trainging:  19%|█▉        | 104/537 [07:19<46:18,  6.42s/it, running train loss: 0.58431][A
Trainging:  19%|█▉        | 104/537 [07:23<46:18,  6.42s/it, running train loss: 0.54549][A
Trainging:  20%|█▉        | 105/537 [07:23<41:52,  5.82s/it, running train loss: 0.54549][A
Trainging:  20%|█▉        | 105/537 [07:27<41:52,  5.82s/


>>> training loss: 0.563966, valid loss: 0.538982,valid f1: 0.000000, valid acc:0.689991



Trainging:  37%|███▋      | 200/537 [14:13<1:09:40, 12.40s/it, running train loss: 0.46188][A
Trainging:  37%|███▋      | 201/537 [14:13<55:11,  9.86s/it, running train loss: 0.46188]  [A
Trainging:  37%|███▋      | 201/537 [14:17<55:11,  9.86s/it, running train loss: 0.42793][A
Trainging:  38%|███▊      | 202/537 [14:17<45:11,  8.09s/it, running train loss: 0.42793][A
Trainging:  38%|███▊      | 202/537 [14:21<45:11,  8.09s/it, running train loss: 0.55234][A
Trainging:  38%|███▊      | 203/537 [14:21<38:09,  6.86s/it, running train loss: 0.55234][A
Trainging:  38%|███▊      | 203/537 [14:25<38:09,  6.86s/it, running train loss: 0.59766][A
Trainging:  38%|███▊      | 204/537 [14:25<32:31,  5.86s/it, running train loss: 0.59766][A
Trainging:  38%|███▊      | 204/537 [14:29<32:31,  5.86s/it, running train loss: 0.49247][A
Trainging:  38%|███▊      | 205/537 [14:29<30:14,  5.46s/it, running train loss: 0.49247][A
Trainging:  38%|███▊      | 205/537 [14:33<30:14,  5.46s/it, runn


>>> training loss: 0.544851, valid loss: 0.526998,valid f1: 0.477660, valid acc:0.702039



Trainging:  56%|█████▌    | 300/537 [21:15<51:37, 13.07s/it, running train loss: 0.48187][A
Trainging:  56%|█████▌    | 301/537 [21:15<40:22, 10.27s/it, running train loss: 0.48187][A
Trainging:  56%|█████▌    | 301/537 [21:18<40:22, 10.27s/it, running train loss: 0.60991][A
Trainging:  56%|█████▌    | 302/537 [21:18<32:13,  8.23s/it, running train loss: 0.60991][A
Trainging:  56%|█████▌    | 302/537 [21:23<32:13,  8.23s/it, running train loss: 0.50224][A
Trainging:  56%|█████▋    | 303/537 [21:23<28:43,  7.37s/it, running train loss: 0.50224][A
Trainging:  56%|█████▋    | 303/537 [21:27<28:43,  7.37s/it, running train loss: 0.47104][A
Trainging:  57%|█████▋    | 304/537 [21:27<24:00,  6.18s/it, running train loss: 0.47104][A
Trainging:  57%|█████▋    | 304/537 [21:30<24:00,  6.18s/it, running train loss: 0.46893][A
Trainging:  57%|█████▋    | 305/537 [21:30<20:35,  5.33s/it, running train loss: 0.46893][A
Trainging:  57%|█████▋    | 305/537 [21:34<20:35,  5.33s/it, running 


>>> training loss: 0.533753, valid loss: 0.513328,valid f1: 0.458169, valid acc:0.713392



Trainging:  74%|███████▍  | 400/537 [28:26<29:34, 12.95s/it, running train loss: 0.53525][A
Trainging:  75%|███████▍  | 401/537 [28:26<22:57, 10.13s/it, running train loss: 0.53525][A
Trainging:  75%|███████▍  | 401/537 [28:29<22:57, 10.13s/it, running train loss: 0.52939][A
Trainging:  75%|███████▍  | 402/537 [28:29<18:16,  8.12s/it, running train loss: 0.52939][A
Trainging:  75%|███████▍  | 402/537 [28:33<18:16,  8.12s/it, running train loss: 0.53405][A
Trainging:  75%|███████▌  | 403/537 [28:33<15:19,  6.86s/it, running train loss: 0.53405][A
Trainging:  75%|███████▌  | 403/537 [28:36<15:19,  6.86s/it, running train loss: 0.46558][A
Trainging:  75%|███████▌  | 404/537 [28:36<12:58,  5.86s/it, running train loss: 0.46558][A
Trainging:  75%|███████▌  | 404/537 [28:40<12:58,  5.86s/it, running train loss: 0.52522][A
Trainging:  75%|███████▌  | 405/537 [28:40<11:13,  5.10s/it, running train loss: 0.52522][A
Trainging:  75%|███████▌  | 405/537 [28:44<11:13,  5.10s/it, running 


>>> EMA starting .....




Evaluation:   0%|          | 0/68 [00:00<?, ?it/s][A[A

Evaluation:   1%|▏         | 1/68 [00:00<00:41,  1.62it/s][A[A

Evaluation:   3%|▎         | 2/68 [00:01<00:33,  1.94it/s][A[A

Evaluation:   4%|▍         | 3/68 [00:01<00:28,  2.31it/s][A[A

Evaluation:   6%|▌         | 4/68 [00:01<00:27,  2.33it/s][A[A

Evaluation:   7%|▋         | 5/68 [00:02<00:27,  2.32it/s][A[A

Evaluation:   9%|▉         | 6/68 [00:02<00:24,  2.57it/s][A[A

Evaluation:  10%|█         | 7/68 [00:02<00:24,  2.51it/s][A[A

Evaluation:  12%|█▏        | 8/68 [00:03<00:24,  2.43it/s][A[A

Evaluation:  13%|█▎        | 9/68 [00:03<00:24,  2.42it/s][A[A

Evaluation:  15%|█▍        | 10/68 [00:04<00:23,  2.44it/s][A[A

Evaluation:  16%|█▌        | 11/68 [00:04<00:23,  2.42it/s][A[A

Evaluation:  18%|█▊        | 12/68 [00:05<00:23,  2.39it/s][A[A

Evaluation:  19%|█▉        | 13/68 [00:05<00:23,  2.37it/s][A[A

Evaluation:  21%|██        | 14/68 [00:05<00:22,  2.36it/s][A[A

Evaluation:


>>> training loss: 0.528752, valid loss: 0.508450,valid f1: 0.525030, valid acc:0.720806



Trainging:  93%|█████████▎| 500/537 [35:28<07:55, 12.85s/it, running train loss: 0.59890][A
Trainging:  93%|█████████▎| 501/537 [35:28<06:04, 10.12s/it, running train loss: 0.59890][A
Trainging:  93%|█████████▎| 501/537 [35:31<06:04, 10.12s/it, running train loss: 0.48773][A
Trainging:  93%|█████████▎| 502/537 [35:31<04:44,  8.12s/it, running train loss: 0.48773][A
Trainging:  93%|█████████▎| 502/537 [35:37<04:44,  8.12s/it, running train loss: 0.51034][A
Trainging:  94%|█████████▎| 503/537 [35:37<04:13,  7.46s/it, running train loss: 0.51034][A
Trainging:  94%|█████████▎| 503/537 [35:41<04:13,  7.46s/it, running train loss: 0.55732][A
Trainging:  94%|█████████▍| 504/537 [35:41<03:28,  6.33s/it, running train loss: 0.55732][A
Trainging:  94%|█████████▍| 504/537 [35:46<03:28,  6.33s/it, running train loss: 0.56393][A
Trainging:  94%|█████████▍| 505/537 [35:46<03:10,  5.96s/it, running train loss: 0.56393][A
Trainging:  94%|█████████▍| 505/537 [35:50<03:10,  5.96s/it, running 

(BertForSequenceClassification(
   (bert): BertModel(
     (embeddings): BertEmbeddings(
       (word_embeddings): Embedding(21128, 768, padding_idx=1)
       (position_embeddings): Embedding(512, 768)
       (token_type_embeddings): Embedding(2, 768)
       (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
       (dropout): Dropout(p=0.1, inplace=False)
     )
     (encoder): BertEncoder(
       (layer): ModuleList(
         (0): BertLayer(
           (attention): BertAttention(
             (self): BertSelfAttention(
               (query): Linear(in_features=768, out_features=768, bias=True)
               (key): Linear(in_features=768, out_features=768, bias=True)
               (value): Linear(in_features=768, out_features=768, bias=True)
               (dropout): Dropout(p=0.1, inplace=False)
             )
             (output): BertSelfOutput(
               (dense): Linear(in_features=768, out_features=768, bias=True)
               (LayerNorm): LayerNorm((768