In [21]:
# Show the GPU attached to this runtime (model, driver/CUDA versions, memory usage).
!nvidia-smi

Tue Apr 26 09:12:12 2022       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 460.32.03    Driver Version: 460.32.03    CUDA Version: 11.2     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  Tesla K80           Off  | 00000000:00:04.0 Off |                    0 |
| N/A   63C    P8    30W / 149W |      3MiB / 11441MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

In [22]:
from google.colab import drive
import sys
drive.mount('/content/drive')
# Make modules stored in the Drive notebook folder importable. The membership
# check keeps sys.path free of duplicate entries when this cell is re-run.
colab_notebooks_dir = '/content/drive/MyDrive/Colab Notebooks'
if colab_notebooks_dir not in sys.path:
  sys.path.append(colab_notebooks_dir)

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [23]:
! pip install transformers==4.0.1



In [24]:
# torch版本为1.6
! pip install torch==1.6.0



In [25]:
# torchvision 是PyTorch中专门用来处理图像的库
! pip install torchvision==0.7.0



In [26]:
import torch
import random
import os
import numpy as np
import json
import pandas as pd
from tqdm import tqdm


# Central run configuration: data/model paths plus training hyper-parameters.
config = {
    'train_file_path':'/content/drive/MyDrive/Colab Notebooks/dataset/ESIM/train.json',
    'dev_file_path':'/content/drive/MyDrive/Colab Notebooks/dataset/ESIM/dev.json',
    'test_file_path':'/content/drive/MyDrive/Colab Notebooks/dataset/ESIM/test.json',
    'model_path':'/content/drive/MyDrive/Colab Notebooks/dataset/BERT_model',
    'output_path': '.',
    'train_val_ratio':0.1,
    'vocab_size':30000,
    'batch_size':64,
    'max_seq_len':64,
    'num_epochs':1,
    'learning_rate':2e-5,
    'eps': 0.1,
    'alpha': 0.3,
    'adv': 'fgm',
    'warmup_ratio': 0.05,
    'weight_decay': 0.01,
    'use_bucket': True,
    'bucket_multiplier': 200,
    'n_gpus': 0,  # replaced below by the detected GPU count when CUDA is available
    'use_amp': True,  # only effective on GPUs that have tensor cores
    'ema_start_step': 500,
    'ema_start': False,
    'logging_step':100,
    'seed':2022
}

# Always record the compute device. Previously the 'device' key was only set on
# the CPU path, so it was missing whenever CUDA was available.
if torch.cuda.is_available():
  config['device'] = 'cuda'
  config['n_gpus'] = torch.cuda.device_count()
  # Multiply the configured batch size by the number of visible GPUs.
  config['batch_size'] *= config['n_gpus']
else:
  config['device'] = 'cpu'

# exist_ok avoids the separate existence check (and its race) before creating.
os.makedirs(config['output_path'], exist_ok=True)

    
def seed_everything(seed):
  """Seed every random number generator this notebook relies on.

  Seeds Python's ``random`` module, NumPy's global generator, PyTorch's CPU
  generator and the generators of all CUDA devices with the same value.

  Args:
    seed: Integer seed passed unchanged to each library.

  Returns:
    The ``seed`` argument, unchanged.
  """
  seeders = (
      random.seed,
      np.random.seed,
      torch.manual_seed,
      torch.cuda.manual_seed_all,
  )
  for apply_seed in seeders:
    apply_seed(seed)
  return seed

# Seed all RNGs with the configured seed before any data or model work; as the
# last expression of the cell, the returned seed is displayed.
seed_everything(config['seed'])

2022

In [27]:
def parse_data(path, data_type='train'):
  """Load a JSON-lines sentence-pair file into a DataFrame.

  Every non-blank line must be a JSON object with ``sentence1`` and
  ``sentence2`` keys. Unless ``data_type`` is ``'test'`` it must also carry a
  ``label`` value that ``int()`` accepts. Blank lines are skipped.

  Args:
    path: Path of the UTF-8 encoded file to read.
    data_type: Name of the split. It is shown in the progress-bar
      description, and the value ``'test'`` makes every label the
      placeholder ``0``.

  Returns:
    A ``pandas.DataFrame`` with the columns ``text_a``, ``text_b`` and
    ``labels``, one row per parsed line.

  Raises:
    json.JSONDecodeError: If a non-blank line is not valid JSON.
    KeyError: If a required key is missing from a line.
  """
  with open(path, 'r', encoding='utf8') as f:
    # json.loads rejects empty strings, so drop blank lines (for example a
    # stray empty line at the end of the file) instead of crashing on them.
    raw_lines = [raw for raw in f if raw.strip()]

  is_test = data_type == 'test'
  records = []
  for raw in tqdm(raw_lines, desc=f'Reading {data_type} data'):
    example = json.loads(raw)
    text_a, text_b = example['sentence1'], example['sentence2']
    # Test data is given a placeholder label so all splits share one schema.
    label = 0 if is_test else int(example['label'])
    records.append((text_a, text_b, label))

  return pd.DataFrame(records, columns=['text_a', 'text_b', 'labels'])

## encode和encode_plus的区别
1. encode仅返回input_ids
2. encode_plus返回所有的编码信息，具体如下：
'input_ids'：单词在词典中的编码；
'token_type_ids'：区分两个句子的编码（上句全为 0，下句全为 1）；
'attention_mask'：指定对哪些词进行 self-attention 操作。

```
from transformers import BertTokenizer

model_name = 'bert-base-uncased'

# a.通过词典导入分词器
tokenizer = BertTokenizer.from_pretrained(model_name)
sentence = "Hello, my son is laughing."

print(tokenizer.encode(sentence))
print(tokenizer.encode_plus(sentence))


运行结果：

[101, 7592, 1010, 2026, 2365, 2003, 5870, 1012, 102]
{'input_ids': [101, 7592, 1010, 2026, 2365, 2003, 5870, 1012, 102], 'token_type_ids': [0, 0, 0, 0, 0, 0, 0, 0, 0], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1]}
```

In [28]:
def build_bert_inputs(inputs, label, sentence_a, sentence_b, tokenizer):
  """Encode one sentence pair and append its features to ``inputs``.

  The pair is encoded with ``tokenizer.encode_plus`` using:
    * ``add_special_tokens`` - insert the [CLS] / [SEP] markers;
    * ``return_token_type_ids`` - ids telling which sentence a token belongs
      to (0 for ``sentence_a``, 1 for ``sentence_b``);
    * ``return_attention_mask`` - mask with 1 for real tokens and 0 for
      padding.

  Args:
    inputs: Accumulator mutated in place; expected to be a
      ``defaultdict(list)`` so that missing keys start as empty lists.
    label: Label stored alongside the encoded pair.
    sentence_a: First sentence of the pair.
    sentence_b: Second sentence of the pair.
    tokenizer: Object exposing ``encode_plus``.

  Returns:
    None. Results are appended to ``inputs`` under ``input_ids``,
    ``token_type_ids``, ``attention_mask`` and ``labels``.
  """
  encoded = tokenizer.encode_plus(
      sentence_a,
      sentence_b,
      add_special_tokens=True,
      return_token_type_ids=True,
      return_attention_mask=True,
  )
  for feature in ('input_ids', 'token_type_ids', 'attention_mask'):
    inputs[feature].append(encoded[feature])
  inputs['labels'].append(label)

## defaultdict(list)
```
from collections import defaultdict
result = defaultdict(list)
data = [("p", 1), ("p", 2), ("p", 3),
     ("h", 1), ("h", 2), ("h", 3)]
 
for (key, value) in data:
    result[key].append(value)
print(result)#defaultdict(<class 'list'>, {'p': [1, 2, 3], 'h': [1, 2, 3]})

```

In [29]:
from collections import defaultdict
def read_data(config, tokenizer):
  """Parse the train/dev/test files and encode every sentence pair for BERT.

  Args:
    config: Mapping that provides ``train_file_path``, ``dev_file_path`` and
      ``test_file_path``.
    tokenizer: Tokenizer forwarded to ``build_bert_inputs``.

  Returns:
    A dict keyed by ``'train'``, ``'dev'`` and ``'test'``. Each value is the
    ``defaultdict(list)`` that ``build_bert_inputs`` filled for that split.
  """
  # One DataFrame per split, keyed by split name.
  data_df = {
      'train': parse_data(config['train_file_path'], data_type='train'),
      'dev': parse_data(config['dev_file_path'], data_type='dev'),
      'test': parse_data(config['test_file_path'], data_type='test'),
  }

  # Holds the BERT inputs of every split.
  processed_data = {}
  for data_type, df in data_df.items():
    inputs = defaultdict(list)
    # Select columns by name and iterate plain tuples. Positional lookups such
    # as row[0] on a string-labelled Series are deprecated in pandas, and
    # itertuples avoids building a Series per row as iterrows does.
    rows = df[['text_a', 'text_b', 'labels']].itertuples(index=False, name=None)
    for sentence_a, sentence_b, label in tqdm(
        rows, desc=f'Preprocessing {data_type} data', total=len(df)):
      build_bert_inputs(inputs, label, sentence_a, sentence_b, tokenizer)

    processed_data[data_type] = inputs
  return processed_data

In [30]:
from transformers import BertTokenizer
# Load the BERT tokenizer from the location given by config['model_path'].
tokenizer = BertTokenizer.from_pretrained(config['model_path'])
# Parse and tokenize the train/dev/test files; `dt` maps each split name to
# its encoded features (input_ids, token_type_ids, attention_mask, labels).
dt = read_data(config, tokenizer)

Reading train data: 100%|██████████| 34334/34334 [00:00<00:00, 229081.44it/s]
Reading dev data: 100%|██████████| 4316/4316 [00:00<00:00, 238474.72it/s]
Reading test data: 100%|██████████| 3861/3861 [00:00<00:00, 214663.41it/s]
Preprocessing train data: 100%|██████████| 34334/34334 [00:29<00:00, 1149.34it/s]
Preprocessing dev data: 100%|██████████| 4316/4316 [00:02<00:00, 1439.76it/s]
Preprocessing test data: 100%|██████████| 3861/3861 [00:02<00:00, 1326.49it/s]


In [34]:
# Sanity check: print the token ids of the first encoded training example and
# the token_type_ids (sentence A = 0, sentence B = 1) of the first dev example.
print('train_df中 input_ids的第一条数据',dt['train']['input_ids'][0])
print('dev_df中 token_type_ids的第一条数据',dt['dev']['token_type_ids'][0])

train_df中 input_ids的第一条数据 [101, 6010, 6009, 955, 1446, 5023, 7583, 6820, 3621, 1377, 809, 2940, 2768, 1044, 2622, 1400, 3315, 1408, 102, 955, 1446, 3300, 1044, 2622, 1168, 3309, 6820, 3315, 1408, 102]
dev_df中 token_type_ids的第一条数据 [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1]
