In [19]:
!nvidia-smi

Wed Apr 27 11:32:57 2022       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 460.32.03    Driver Version: 460.32.03    CUDA Version: 11.2     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  Tesla K80           Off  | 00000000:00:04.0 Off |                    0 |
| N/A   33C    P8    29W / 149W |      3MiB / 11441MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

In [20]:
from google.colab import drive
import sys
drive.mount('/content/drive')
#设置路径
sys.path.append('/content/drive/MyDrive/Colab Notebooks')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [21]:
! pip install transformers==4.0.1



In [22]:
# torch版本为1.6
! pip install torch==1.6.0



In [23]:
# torchvision 是PyTorch中专门用来处理图像的库
! pip install torchvision==0.7.0



In [24]:
import torch
import random
import os
import numpy as np
import json
import pandas as pd
from tqdm import tqdm


config = {
    'train_file_path':'/content/drive/MyDrive/Colab Notebooks/dataset/ESIM/train.json',
    'dev_file_path':'/content/drive/MyDrive/Colab Notebooks/dataset/ESIM/dev.json',
    'test_file_path':'/content/drive/MyDrive/Colab Notebooks/dataset/ESIM/test.json',
    'model_path':'/content/drive/MyDrive/Colab Notebooks/dataset/BERT_model',
    'output_path': '.',
    'train_val_ratio':0.1,
    'vocab_size':30000,
    'batch_size':64,
    'max_seq_len':64,
    'num_epochs':1,
    'learning_rate':2e-5,
    'eps': 0.1,
    'alpha': 0.3,
    'adv': 'fgm',
    'warmup_ratio': 0.05,
    'weight_decay': 0.01,
    'use_bucket': True,
    'bucket_multiplier': 200,
    'n_gpus': 0,
    'use_amp': True, # 只针对有 tensor core 的gpu有效
    'ema_start_step': 500,
    'ema_start': False,
    'logging_step':100,
    'seed':2022
}

if not torch.cuda.is_available():
  config['device'] = 'cpu'
else:
  config['n_gpus'] = torch.cuda.device_count()
  config['batch_size'] *= config['n_gpus']

if not os.path.exists(config['output_path']):
    os.makedirs((config['output_path']))

    
def seed_everything(seed):
  random.seed(seed)
  np.random.seed(seed)
  torch.manual_seed(seed)
  torch.cuda.manual_seed_all(seed)
  return seed

seed_everything(config['seed'])

2022

In [25]:
def parse_data(path, data_type='train'):
  sentence_a = []
  sentence_b = []
  labels = []

  with open(path, 'r', encoding = 'utf8') as f:
    for line in tqdm(f.readlines(), desc=f'Reading {data_type} data'):
      line = json.loads(line)
      sentence_a.append(line['sentence1'])
      sentence_b.append(line['sentence2'])
      if data_type != 'test':
        labels.append(int(line['label']))
      else:
        labels.append(0)

  df = pd.DataFrame(zip(sentence_a, sentence_b, labels), columns = ['text_a', 'text_b', 'labels'])
  return df

## encode和encode_plus的区别
1. encode仅返回input_ids
2. encode_plus返回所有的编码信息，具体如下：
’input_ids:是单词在词典中的编码; 
‘token_type_ids’:区分两个句子的编码（上句全为0，下句全为1）; 
‘attention_mask’:指定对哪些词进行self-Attention操作

```
model_name = 'bert-base-uncased'

# a.通过词典导入分词器
tokenizer = BertTokenizer.from_pretrained(model_name)
sentence = "Hello, my son is laughing."

print(tokenizer.encode(sentence))
print(tokenizer.encode_plus(sentence))


运行结果：

[101, 7592, 1010, 2026, 2365, 2003, 5870, 1012, 102]
{'input_ids': [101, 7592, 1010, 2026, 2365, 2003, 5870, 1012, 102], 'token_type_ids': [0, 0, 0, 0, 0, 0, 0, 0, 0], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1]}
```

In [26]:
# inputs: defaultdict(list)
def build_bert_inputs(inputs, label, sentence_a, sentence_b, tokenizer):
  # add_special_tokens [CLS] [SEP]
  # return_token_type_ids 该词属于sentence_a(返回0) or sentence_b(返回1). 
  # return_attention_mask pad=0, 不是pad的部分标为1， 是pad标为0.
  inputs_dict = tokenizer.encode_plus(sentence_a, sentence_b, add_special_tokens = True,
                     return_token_type_ids = True,
                     return_attention_mask = True)
  inputs['input_ids'].append(inputs_dict['input_ids'])
  inputs['token_type_ids'].append(inputs_dict['token_type_ids'])
  inputs['attention_mask'].append(inputs_dict['attention_mask'])
  inputs['labels'].append(label)

## defaultdict(list)
```
from collections import defaultdict
result = defaultdict(list)
data = [("p", 1), ("p", 2), ("p", 3),
     ("h", 1), ("h", 2), ("h", 3)]
 
for (key, value) in data:
    result[key].append(value)
print(result)#defaultdict(<class 'list'>, {'p': [1, 2, 3], 'h': [1, 2, 3]})

```

In [27]:
from collections import defaultdict
def read_data(config, tokenizer):
  train_df = parse_data(config['train_file_path'], data_type = 'train')
  dev_df = parse_data(config['dev_file_path'], data_type = 'dev')
  test_df = parse_data(config['test_file_path'], data_type = 'test')

  # 把这些 df 打包成字典
  data_df = {'train': train_df, 'dev': dev_df, 'test': test_df}
  #保存 BERT 的输入
  processed_data = {}
  # 遍历字典(data_df)
  for data_type, df in data_df.items():
    inputs = defaultdict(list)
    #遍历每一行
    for i, row in tqdm(df.iterrows(), desc= f'Preprocessing {data_type} data', total = len(df)):
      label = row[2]
      sentence_a, sentence_b = row[0], row[1]
      build_bert_inputs(inputs, label, sentence_a, sentence_b, tokenizer)

    processed_data[data_type] = inputs
  return processed_data

In [28]:
from transformers import BertTokenizer
tokenizer = BertTokenizer.from_pretrained(config['model_path'])
dt = read_data(config, tokenizer)

Reading train data: 100%|██████████| 34334/34334 [00:00<00:00, 144188.33it/s]
Reading dev data: 100%|██████████| 4316/4316 [00:00<00:00, 113366.67it/s]
Reading test data: 100%|██████████| 3861/3861 [00:00<00:00, 117546.09it/s]
Preprocessing train data: 100%|██████████| 34334/34334 [00:36<00:00, 943.31it/s] 
Preprocessing dev data: 100%|██████████| 4316/4316 [00:03<00:00, 1412.76it/s]
Preprocessing test data: 100%|██████████| 3861/3861 [00:03<00:00, 1238.07it/s]


In [29]:
print('train_df中 input_ids的第一条数据',dt['train']['input_ids'][0])
print('dev_df中 token_type_ids的第一条数据',dt['dev']['token_type_ids'][0])

train_df中 input_ids的第一条数据 [101, 6010, 6009, 955, 1446, 5023, 7583, 6820, 3621, 1377, 809, 2940, 2768, 1044, 2622, 1400, 3315, 1408, 102, 955, 1446, 3300, 1044, 2622, 1168, 3309, 6820, 3315, 1408, 102]
dev_df中 token_type_ids的第一条数据 [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1]


In [30]:
from torch.utils.data import Dataset
class AFQMCDataset(Dataset):
  def __init__(self, data_dict):
    super(AFQMCDataset, self).__init__()
    self.data_dict = data_dict
  
  # 返回一个example
  def __getitem__(self, idx):
    data = (self.data_dict['input_ids'][idx],
         self.data_dict['token_type_ids'][idx],
         self.data_dict['attention_mask'][idx],
         self.data_dict['labels'][idx])
    return data
  
  def __len__(self):
    return len(self.data_dict['input_ids'])

In [31]:
class Collator:
  def __init__(self, max_seq_len, tokenizer):
    self.max_seq_len = max_seq_len
    self.tokenizer = tokenizer
  
  def pad_and_truncate(self, input_ids_list, token_type_ids_list, attention_mask_list, labels_list, max_seq_len):
    input_ids = torch.zeros((len(input_ids_list), max_seq_len),dtype=torch.long)
    token_type_ids = torch.zeros_like(input_ids)
    attention_mask = torch.zeros_like(input_ids)
    
    for i in range(len(input_ids_list)):
      seq_len = len(input_ids_list[i])
      if seq_len <= max_seq_len:
        input_ids[i, :seq_len] = torch.tensor(input_ids_list[i], dtype = torch.long)
        token_type_ids[i, :seq_len] = torch.tensor(token_type_ids_list[i], dtype = torch.long)
        attention_mask[i, :seq_len] = torch.tensor(attention_mask_list[i], dtype = torch.long)
      else:
        # input_ids 最后一位放上一个特殊的token
        input_ids[i] = torch.tensor(input_ids_list[i][:max_seq_len-1] + [self.tokenizer.sep_token_id], dtype = torch.long)
        # token_type_ids 和 attention_mask 不需要加上特殊token
        token_type_ids[i] = torch.tensor(token_type_ids_list[i][:max_seq_len], dtype = torch.long)
        attention_mask[i] = torch.tensor(attention_mask_list[i][:max_seq_len], dtype = torch.long)
    labels = torch.tensor(labels_list, dtype = torch.long)
    return input_ids, token_type_ids, attention_mask, labels

  def __call__(self, examples):
    input_ids_list, token_type_ids_list, attention_mask_list, labels_list = list(zip(*examples))
    cur_max_seq_len = max(len(input_id) for input_id in input_ids_list)
    max_seq_len = min(cur_max_seq_len, self.max_seq_len)
    
    input_ids, token_type_ids, attention_mask, labels = self.pad_and_truncate(input_ids_list, token_type_ids_list, attention_mask_list, labels_list, max_seq_len)                     
    
    data_dict = {
        'input_ids': input_ids,
        'token_type_ids': token_type_ids,
        'attention_mask': attention_mask,
        'labels': labels
    }
    return data_dict

In [32]:
collate_fn = Collator(config['max_seq_len'], tokenizer)

## 采样（Dataloader）

![Dataloader](https://img-blog.csdnimg.cn/b80cee8a1c7d49b79e7b80cc81150d66.png?x-oss-process=image/watermark,type_ZmFuZ3poZW5naGVpdGk,shadow_10,text_aHR0cHM6Ly9ibG9nLmNzZG4ubmV0L3dlaXhpbl80MTI4NzA2MA==,size_16,color_FFFFFF,t_70)

## Sampler 
所有采样器都继承自Sampler这个类

每个Sampler子类都要实现iter方法【迭代数据集example索引的方法】，以及返回迭代器长度的len方法

![sampler](https://img-blog.csdnimg.cn/1c40aedade9f40a493b4df97d0c1def0.png?x-oss-process=image/watermark,type_ZmFuZ3poZW5naGVpdGk,shadow_10,text_aHR0cHM6Ly9ibG9nLmNzZG4ubmV0L3dlaXhpbl80MTI4NzA2MA==,size_16,color_FFFFFF,t_70)

### 顺序采样
![sequentialsampler](https://img-blog.csdnimg.cn/9e8ee018cea84729ac6b5742395d8ea2.png?x-oss-process=image/watermark,type_ZmFuZ3poZW5naGVpdGk,shadow_10,text_aHR0cHM6Ly9ibG9nLmNzZG4ubmV0L3dlaXhpbl80MTI4NzA2MA==,size_16,color_FFFFFF,t_70)

***在初始化时拿到数据集data_source， 按顺序对元素进行采样，每次只返回一个索引值 ***

In [33]:
# 顺序采样举例
# randperm 把 0-23 数据打乱 形成3维tensor
# (2,3,4) batch_size:2 seq_len=3, embedding_dim=4，每个 batch 有2条数据，每个句子包含3个词， 每个词的维度是4
a = torch.randperm(24).reshape((2,3,4))
print('a:',a)
b = torch.utils.data.SequentialSampler(a)
print('b:',b)
# i 是索引
for i in b:
    print(i)

a: tensor([[[ 5, 14,  9,  2],
         [13, 12, 20,  1],
         [16, 15,  7,  4]],

        [[17,  0,  3, 19],
         [10, 22,  6, 18],
         [ 8, 23, 11, 21]]])
b: <torch.utils.data.sampler.SequentialSampler object at 0x7fa2131abc50>
0
1


### 随机采样
replacement : True 表示可以重复采样

num_samples: 指定采样的数量

PS:当使用replacement=False，不应制定num_samples
![randomsampler](https://img-blog.csdnimg.cn/9d2e2afdbe4d4df4aee3102e46054650.png?x-oss-process=image/watermark,type_ZmFuZ3poZW5naGVpdGk,shadow_10,text_aHR0cHM6Ly9ibG9nLmNzZG4ubmV0L3dlaXhpbl80MTI4NzA2MA==,size_16,color_FFFFFF,t_70)

In [34]:
# 随机采样举例
a = torch.randperm(60).reshape((5,3,4))
print('a:',a)
# 随机采样3条数据
b = torch.utils.data.RandomSampler(a, replacement=True, num_samples=3)
print('b:',b)
for i in b:
    print(i)

a: tensor([[[47, 36, 58, 12],
         [49,  7, 24,  9],
         [27, 13, 45,  0]],

        [[ 3, 28, 23, 39],
         [37, 29, 10, 59],
         [ 4, 35, 56, 53]],

        [[54, 32, 18, 42],
         [41, 46, 30, 14],
         [38, 22, 11,  5]],

        [[48, 33, 57, 26],
         [15, 19, 55, 16],
         [20, 40, 31,  6]],

        [[51, 17,  1, 25],
         [34,  2, 43, 21],
         [52, 50,  8, 44]]])
b: <torch.utils.data.sampler.RandomSampler object at 0x7fa1855a5e90>
3
3
2


### Subset随机采样
SubsetRandomSampler： 从给定的索引列表中随机采样元素，不放回采样 

indices(sequence): 索引序列
![sunsetRandomSampler](https://img-blog.csdnimg.cn/e80f6a1bafe042f28da652dc5a2388ab.png?x-oss-process=image/watermark,type_ZmFuZ3poZW5naGVpdGk,shadow_10,text_aHR0cHM6Ly9ibG9nLmNzZG4ubmV0L3dlaXhpbl80MTI4NzA2MA==,size_16,color_FFFFFF,t_70)

In [35]:
# Subset采样举例
a = torch.randperm(60).reshape((5,3,4))
print('a:',a)
# 从索引2以后的样本中随机采样
b = torch.utils.data.SubsetRandomSampler(indices=a[2:])
for i in b:
    print(i)

a: tensor([[[ 8, 18, 55, 16],
         [49, 54, 14,  7],
         [33, 37, 39,  2]],

        [[45,  6, 24, 29],
         [58, 57,  3, 47],
         [46, 56, 26, 21]],

        [[12, 25, 52, 40],
         [ 9, 53, 10, 50],
         [48, 59, 27, 22]],

        [[ 0, 20, 34, 13],
         [41, 32, 35, 51],
         [15,  4, 36, 38]],

        [[11, 19,  5, 43],
         [23, 31, 44, 30],
         [28,  1, 42, 17]]])
tensor([[ 0, 20, 34, 13],
        [41, 32, 35, 51],
        [15,  4, 36, 38]])
tensor([[12, 25, 52, 40],
        [ 9, 53, 10, 50],
        [48, 59, 27, 22]])
tensor([[11, 19,  5, 43],
        [23, 31, 44, 30],
        [28,  1, 42, 17]])


### 分批采样
sampler: 基采样器 

batch_size: size of mini-batch

drop_last=True, 如果一个batch的长度小于batch_size则丢弃
![BatchSampler](https://img-blog.csdnimg.cn/8a1b2f5ae320453c9fae8ae8e0ef2080.png?x-oss-process=image/watermark,type_ZmFuZ3poZW5naGVpdGk,shadow_10,text_aHR0cHM6Ly9ibG9nLmNzZG4ubmV0L3dlaXhpbl80MTI4NzA2MA==,size_16,color_FFFFFF,t_70)



In [36]:
# 分批采样举例
a = torch.randperm(60).reshape((5,3,4))
print('a:',a)
# 要传一个基采样器torch.utils.data.RandomSampler(a)
b = torch.utils.data.BatchSampler(torch.utils.data.RandomSampler(a), 2, drop_last=True)
# 上面的i都是一个数；现在是batch_size的列表
for i in b:
    print(i)

a: tensor([[[34, 50, 28, 24],
         [46,  0, 35, 21],
         [51, 52, 33, 59]],

        [[31,  5, 26, 42],
         [11, 49,  8, 29],
         [ 9, 17, 53, 36]],

        [[ 1, 37, 22, 40],
         [18, 20, 45,  7],
         [10, 47, 19, 32]],

        [[38, 14, 58,  3],
         [13, 25, 27, 48],
         [ 6, 44, 55, 30]],

        [[56,  2, 57, 12],
         [ 4, 23, 16, 15],
         [43, 54, 41, 39]]])
[3, 4]
[1, 0]


### 桶采样
sort_key: 按XXX排序

bucket_sampler: batch_size * bucket_size_multiplier 相当于 n * batch_size
；len(sampler)最大为数据集的长度
![BucketSampler](https://img-blog.csdnimg.cn/6413cea5dfbf4494a6b2b64504f74a97.png?x-oss-process=image/watermark,type_ZmFuZ3poZW5naGVpdGk,shadow_10,text_aHR0cHM6Ly9ibG9nLmNzZG4ubmV0L3dlaXhpbl80MTI4NzA2MA==,size_16,color_FFFFFF,t_70)

![SortedSampler](https://img-blog.csdnimg.cn/64f60217df474cf1b0d7aa1c3558cc1f.png?x-oss-process=image/watermark,type_ZmFuZ3poZW5naGVpdGk,shadow_10,text_aHR0cHM6Ly9ibG9nLmNzZG4ubmV0L3dlaXhpbl80MTI4NzA2MA==,size_16,color_FFFFFF,t_70)

![BucketSampler](https://img-blog.csdnimg.cn/d7e03938f2824f9cb8a6c3a895f5a78a.png?x-oss-process=image/watermark,type_ZmFuZ3poZW5naGVpdGk,shadow_10,text_aHR0cHM6Ly9ibG9nLmNzZG4ubmV0L3dlaXhpbl80MTI4NzA2MA==,size_16,color_FFFFFF,t_70)

In [45]:
# 桶采样举例
# Dataset -> 得到‘大桶'的排序索引

# 真实train中数据，前6条
mini_dataset = {k: v[:6] for k, v in dt['train'].items()}
mini_data = AFQMCDataset(mini_dataset)
print(mini_data)
# mini_data 的前6条数据的长度
for i, d in enumerate(mini_data):
    print(d[0]) # input_ids
    print(len(d[0]))

<__main__.AFQMCDataset object at 0x7fa2131b1090>
[101, 6010, 6009, 955, 1446, 5023, 7583, 6820, 3621, 1377, 809, 2940, 2768, 1044, 2622, 1400, 3315, 1408, 102, 955, 1446, 3300, 1044, 2622, 1168, 3309, 6820, 3315, 1408, 102]
30
[101, 6010, 6009, 5709, 1446, 6432, 2769, 6824, 5276, 671, 3613, 102, 6010, 6009, 5709, 1446, 6824, 5276, 6121, 711, 3221, 784, 720, 102]
24
[101, 2376, 2769, 4692, 671, 678, 3315, 3299, 5709, 1446, 6572, 1296, 3300, 3766, 3300, 5310, 3926, 102, 678, 3299, 5709, 1446, 6572, 1296, 102]
25
[101, 6010, 6009, 955, 1446, 1914, 7270, 3198, 7313, 5341, 1394, 6397, 844, 671, 3613, 102, 955, 1446, 2533, 6397, 844, 1914, 719, 102]
24
[101, 2769, 4638, 5709, 1446, 6572, 1296, 3221, 115, 115, 115, 8024, 6820, 3621, 2582, 720, 3221, 115, 115, 115, 102, 2769, 4638, 5709, 1446, 8024, 3299, 5310, 1139, 3341, 6432, 6375, 2769, 6820, 115, 115, 115, 1039, 8024, 2769, 5632, 2346, 5050, 749, 671, 678, 6422, 5301, 1399, 1296, 2769, 2418, 6421, 6820, 115, 115, 115, 1039, 102]
59
[101, 

In [44]:
from bucket_sampler import SortedSampler
random_sampler = torch.utils.data.RandomSampler(mini_data, replacement=False)
# print(list(random_sampler))
# 关于dataset的随机索引 [3, 5, 4, 1, 0, 2]

batch_sampler = torch.utils.data.BatchSampler(random_sampler, 4, drop_last=True)
# [0, 5, 2, 4] 【还有[1, 3] 但是丢弃了】

for samp in batch_sampler:
    print('samp:',samp)
    sorted_sampler = SortedSampler(samp, sort_key=lambda x:len(mini_data[x][0]))
    print('list_sorted_sampler:',list(sorted_sampler))

samp: [0, 4, 2, 3]
list_sorted_sampler: [3, 2, 0, 1]


```
[0, 5, 2, 4]分别对应mini_data中的长度[30, 28, 25, 59]

[2, 1, 0, 3] 

2（位置2的数据len最小） -> 2 -> 25 

1 -> 5 -> 28 

0 -> 0 -> 30 

3（位置3的数据len最大） -> 4 -> 59
```

In [42]:
# 得到‘大桶'的排序索引 -> 返回‘小桶'在‘大桶'中的位置
c = list(torch.utils.data.BatchSampler(sorted_sampler, 2, drop_last=True))
print(c)
# c 把大桶 分成 batch_size大小的小桶

[[1, 2], [0, 3]]


```
[[2, 1], [0, 3]]
```

In [46]:
for batch in torch.utils.data.SubsetRandomSampler(c):
    print('从给定的索引列表中随机采样元素')
    print(batch)
    print('所对应的原序列是什么：')
    print([samp[i] for i in batch])
    # 参考上面 from bucket_sampler import SortedSampler 单元格对应法则

从给定的索引列表中随机采样元素
[1, 2]
所对应的原序列是什么：
[4, 2]
从给定的索引列表中随机采样元素
[0, 3]
所对应的原序列是什么：
[0, 3]


```
从给定的索引列表中随机采样元素
[2, 1]
所对应的原序列是什么：
[2, 5]
从给定的索引列表中随机采样元素
[0, 3]
所对应的原序列是什么：
[0, 4]
```