In [None]:
!nvidia-smi

Tue Mar 22 15:05:04 2022       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 460.32.03    Driver Version: 460.32.03    CUDA Version: 11.2     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  Tesla T4            Off  | 00000000:00:04.0 Off |                    0 |
| N/A   40C    P8    12W /  70W |      3MiB / 15109MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

In [None]:
!pip3 install transformers



In [None]:
!pip3 install torch==1.10.2+cu113 torchvision==0.11.3+cu113 torchaudio===0.10.2+cu113 -f https://download.pytorch.org/whl/cu113/torch_stable.html

Looking in links: https://download.pytorch.org/whl/cu113/torch_stable.html


In [None]:
import torch

In [None]:
torch.cuda.is_available()

True

In [None]:
import torch
import random
import numpy as np
# Central run configuration shared by every later cell.
config = {
    # CSV inputs read by read_data() (mode='train' / mode='test').
    'train_file_path': '/content/drive/MyDrive/Colab Notebooks/dataset/train.csv',
    'test_file_path': '/content/drive/MyDrive/Colab Notebooks/dataset/test.csv',
    # Fraction of the training CSV rows held out as the validation split.
    'train_val_ratio': 0.1,
    # Directory handed to every from_pretrained() call (tokenizer, config, weights).
    'model_path': '/content/drive/MyDrive/Colab Notebooks/dataset/BERT_model',
    'batch_size': 16,
    'num_epochs': 1,
    'learning_rate': 2e-5,
    # Number of optimizer steps between two validation/logging passes.
    'logging_step': 500,
    'seed': 2022,
    # Run on the GPU when one is visible, otherwise fall back to the CPU.
    'device': 'cuda' if torch.cuda.is_available() else 'cpu',
}

def seed_everything(seed):
    """Seed Python's, NumPy's and PyTorch's (CPU and CUDA) random generators.

    Args:
        seed: value passed unchanged to each library's seeding function.

    Returns:
        The same ``seed`` that was passed in.
    """
    seeders = (
        random.seed,
        np.random.seed,
        torch.manual_seed,
        torch.cuda.manual_seed_all,
    )
    for apply_seed in seeders:
        apply_seed(seed)
    return seed

# Seed every RNG with config['seed']; the call returns the seed it was given.
seed_everything(config['seed'])

2022

In [None]:
import pandas as pd
from tqdm import tqdm
from collections import defaultdict
def read_data(config, tokenizer, mode = 'train'):
    """Read ``config[f'{mode}_file_path']`` and tokenize every sentence in it.

    The CSV is accessed by position: column 1 is the label (only read when
    ``mode == 'train'``) and the last column is the sentence.

    Args:
        config: dict providing ``'{mode}_file_path'`` and, for training,
            ``'train_val_ratio'`` (fraction of rows used for validation).
        tokenizer: object exposing ``encode_plus`` that returns a mapping with
            ``'input_ids'``, ``'token_type_ids'`` and ``'attention_mask'``.
        mode: ``'train'`` to get a train/validation split, anything else
            (e.g. ``'test'``) to get a single set with dummy labels of 0.

    Returns:
        ``mode == 'train'``:
            ``(X_train, y_train, X_val, y_val, label2id, id2label)``.
        otherwise:
            ``(X_test, y_test)``.
        Every ``X_*`` is a ``defaultdict(list)`` with the keys ``'inputs_ids'``
        (sic - TNEWSData and collate_fn read that exact spelling),
        ``'token_type_ids'`` and ``'attention_mask'``; every ``y_*`` is a
        ``torch.long`` tensor.
    """
    data_df = pd.read_csv(config[f'{mode}_file_path'], sep=',')
    is_train = mode == 'train'
    if is_train:
        # Training file: the first ``num_val`` rows (file order, no shuffling)
        # become the validation split, the remainder is the training split.
        X_train, y_train = defaultdict(list), []
        X_val, y_val = defaultdict(list), []
        num_val = int(len(data_df) * config['train_val_ratio'])
    else:
        # Any other file only produces one set.
        X_test, y_test = defaultdict(list), []

    # Plain tuples give true positional access (row[1], row[-1]) without the
    # label-vs-position ambiguity of indexing a Series, and enumerate() yields
    # the row position regardless of what the DataFrame index contains.
    rows = data_df.itertuples(index=False, name=None)
    for pos, row in tqdm(enumerate(rows), desc=f'preprocess {mode} data', total =len(data_df)):
        # Non-training files get a placeholder label of 0.
        label = row[1] if is_train else 0
        sentence = row[-1]
        # add_special_tokens: let the tokenizer add its special tokens (e.g. CLS / SEP)
        # return_token_type_ids / return_attention_mask: also return those two lists
        inputs = tokenizer.encode_plus(sentence, add_special_tokens = True, return_token_type_ids =True, return_attention_mask = True )

        # Choose the destination once instead of repeating the appends per split.
        if not is_train:
            features, targets = X_test, y_test
        elif pos < num_val:
            features, targets = X_val, y_val
        else:
            features, targets = X_train, y_train

        features['inputs_ids'].append(inputs['input_ids'])
        targets.append(label)
        features['token_type_ids'].append(inputs['token_type_ids'])
        features['attention_mask'].append(inputs['attention_mask'])

    if is_train:
        # Build the vocabulary from BOTH splits: a class that only occurs in the
        # validation rows must still get an id, otherwise encoding y_val fails.
        label2id ={label: i for i, label in enumerate(np.unique(y_train + y_val))}
        id2label ={i: label for label, i in label2id.items()}
        y_train = torch.tensor([label2id[i] for i in y_train],dtype = torch.long)
        y_val = torch.tensor([label2id[i] for i in y_val],dtype = torch.long)
        return X_train, y_train, X_val, y_val, label2id, id2label

    y_test = torch.tensor(y_test, dtype = torch.long)
    return X_test,y_test

In [None]:
from torch.utils.data import Dataset
class TNEWSData(Dataset):
    """Map-style dataset pairing pre-tokenized features with their targets.

    ``X`` is indexed by the keys ``'inputs_ids'``, ``'token_type_ids'`` and
    ``'attention_mask'``, each holding one entry per example; ``y`` holds one
    target per example and must support ``.size(0)``.
    """

    def __init__(self, X, y):
        self.x, self.y = X, y

    def __len__(self):
        # The number of examples is the length of the target's first dimension.
        return self.y.size(0)

    def __getitem__(self, idx):
        """Return the ``idx``-th example as a dict of its four fields."""
        features = self.x
        sample = {
            'inputs_ids': features['inputs_ids'][idx],
            'label': self.y[idx],
        }
        for field in ('token_type_ids', 'attention_mask'):
            sample[field] = features[field][idx]
        return sample

In [None]:
def collate_fn(example):
    """Merge a list of examples into one zero-padded batch of long tensors.

    Args:
        example: list of dicts with the keys ``'inputs_ids'``, ``'label'``,
            ``'token_type_ids'`` and ``'attention_mask'``.

    Returns:
        dict with ``'input_ids'``, ``'token_type_ids'`` and ``'attention_mask'``
        of shape ``(len(example), longest input_ids in the batch)`` and
        ``'label'`` of shape ``(len(example),)``; all are ``torch.long``.
        Positions past an example's own length stay 0.
    """
    batch_size = len(example)
    longest = max(len(ex['inputs_ids']) for ex in example)

    # Start from all-zero matrices so that the padding needs no extra work.
    ids_batch = torch.zeros((batch_size, longest), dtype=torch.long)
    segment_batch = torch.zeros_like(ids_batch)
    mask_batch = torch.zeros_like(ids_batch)

    targets = []
    for row, ex in enumerate(example):
        token_ids = ex['inputs_ids']
        targets.append(ex['label'])
        # Every field is written over the first len(token_ids) columns of its row.
        width = len(token_ids)
        ids_batch[row, :width] = torch.tensor(token_ids, dtype=torch.long)
        segment_batch[row, :width] = torch.tensor(ex['token_type_ids'], dtype=torch.long)
        mask_batch[row, :width] = torch.tensor(ex['attention_mask'], dtype=torch.long)

    return {
        'input_ids': ids_batch,
        'label': torch.tensor(targets, dtype=torch.long),
        'token_type_ids': segment_batch,
        'attention_mask': mask_batch,
    }

In [None]:
from transformers import BertTokenizer
from torch.utils.data import DataLoader
def build_dataloader(config):
    """Tokenize the train and test CSV files and wrap them in DataLoaders.

    Args:
        config: dict providing ``'model_path'`` (tokenizer location),
            ``'batch_size'``, everything ``read_data`` needs, and optionally
            ``'num_workers'`` (DataLoader worker processes, default 4).

    Returns:
        ``(train_dataloader, val_dataloader, test_dataloader, id2label)``.
        Only the training loader shuffles.
    """
    # Load the vocabulary / tokenizer.
    tokenizer = BertTokenizer.from_pretrained(config['model_path'])
    X_train, y_train, X_val, y_val, label2id, id2label = read_data(config, tokenizer, mode='train')
    X_test, y_test = read_data(config, tokenizer, mode='test')

    # Settings shared by all three loaders. The worker count is configurable so
    # it can be lowered on machines with few CPU cores; it defaults to 4.
    loader_kwargs = {
        'batch_size': config['batch_size'],
        'num_workers': config.get('num_workers', 4),
        'collate_fn': collate_fn,
    }

    train_dataloader = DataLoader(TNEWSData(X_train, y_train), shuffle=True, **loader_kwargs)
    val_dataloader = DataLoader(TNEWSData(X_val, y_val), shuffle=False, **loader_kwargs)
    test_dataloader = DataLoader(TNEWSData(X_test, y_test), shuffle=False, **loader_kwargs)

    return train_dataloader, val_dataloader, test_dataloader, id2label

In [None]:
from google.colab import drive
# Mount Google Drive; every path in config lives under /content/drive/MyDrive.
drive.mount('/content/drive/')

Drive already mounted at /content/drive/; to attempt to forcibly remount, call drive.mount("/content/drive/", force_remount=True).


In [None]:
# Tokenize both CSV files and build the train / validation / test loaders plus the id -> label map.
train_dataloader, val_dataloader, test_dataloader, id2label = build_dataloader(config)

preprocess train data: 100%|██████████| 53360/53360 [00:23<00:00, 2297.39it/s]
preprocess test data: 100%|██████████| 10000/10000 [00:04<00:00, 2360.47it/s]
  cpuset_checked))


In [None]:
# Smoke test: pull a single batch from the training loader, print it and stop.
for batch in tqdm(iter(train_dataloader)):
    print(batch)
    break

  cpuset_checked))
  0%|          | 0/3002 [00:00<?, ?it/s]

{'input_ids': tensor([[  101,  6206,  2682,  4381,  6760,  5994,  2877,  5500,  4873,  8024,
          7444,  6206,  2110,   739,  1290,   711,  4638,  6411,   928,  1501,
          6574,   102,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0],
        [  101,  3299,  1057,  8283,  1914,  5543,  1762,  7028,  2412,  2356,
          2902,  2999,   743,  2791,  1408,  8043,   102,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0],
        [  101,  3797,  1977,   677,  4028,   749,   671,  1767,  2661,  2552,
          1220,  7790,  4638,  4260,  6121,  6381,  8024,  3797,  1977,  1079,
          2552,  8038,  3315,  3797,  2218,  3221,  6821,   720,  7390,  2692,
           102,     0,     0],
        [  101,  1506,  4633,  8038,   924,  5384,  1213,   924,  2769,   812,
          8024,   800,  2970,  5052,   749,  3683,  6612,  8024,  2802,  2533,
          2523,  3472,  




In [None]:
from sklearn.metrics import f1_score
def evaluation(config, model, val_dataloader):
    """Run ``model`` over ``val_dataloader`` and report loss and macro F1.

    Args:
        config: dict providing ``'device'``.
        model: callable with an ``eval()`` method; called as ``model(**batch)``
            and expected to return ``(loss, logits, ...)``.
        val_dataloader: sized iterable of batches (dicts of tensors). The
            targets may be stored under ``'labels'`` or ``'label'``.

    Returns:
        ``(avg_val_loss, f1)``: the mean of the per-batch losses and the
        macro-averaged F1 score over all examples.
    """
    model.eval()
    preds = []
    labels = []
    val_loss = 0.
    val_iterator = tqdm(val_dataloader, desc='Evaluation', total=len(val_dataloader))

    with torch.no_grad():
        for batch in val_iterator:
            # The model is called with keyword arguments and takes its targets
            # as ``labels``; accept batches that store them under 'label' too.
            batch = {
                ('labels' if name == 'label' else name): value
                for name, value in batch.items()
            }
            # Keep a CPU copy of the targets for the metric computation below.
            labels.append(batch['labels'].detach().cpu())
            batch = {name: value.to(config['device']) for name, value in batch.items()}
            loss, logits = model(**batch)[:2]

            val_loss += loss.item()
            preds.append(logits.argmax(dim = -1).detach().cpu())

    avg_val_loss = val_loss / len(val_dataloader)
    labels = torch.cat(labels, dim = 0).numpy()
    preds = torch.cat(preds, dim=0).numpy()
    f1 = f1_score(labels, preds, average='macro')
    return avg_val_loss, f1

In [None]:
# train BERT
from transformers import BertConfig, BertForSequenceClassification
from transformers import AdamW
from tqdm import trange
def train(config, id2label, train_dataloader, val_dataloader):
    """Fine-tune ``BertForSequenceClassification`` and return the model.

    Args:
        config: dict providing ``'model_path'``, ``'learning_rate'``,
            ``'device'``, ``'num_epochs'`` and ``'logging_step'``.
        id2label: mapping whose length sets the number of output classes.
        train_dataloader: sized iterable of batches (dicts of tensors). The
            targets may be stored under ``'labels'`` or ``'label'``.
        val_dataloader: handed to ``evaluation`` every ``logging_step`` steps.

    Returns:
        The fine-tuned model.
    """
    # Model configuration: reuse the pretrained config, resize the classifier head.
    bert_config = BertConfig.from_pretrained(config['model_path'])
    bert_config.num_labels = len(id2label)
    model = BertForSequenceClassification.from_pretrained(config['model_path'], config = bert_config)

    # Optimizer.
    # NOTE(review): transformers' own AdamW has been deprecated in favour of
    # torch.optim.AdamW - confirm which one the installed version provides.
    optimizer = AdamW(model.parameters(), lr = config['learning_rate'])

    # Move the model to the target device.
    model.to(config['device'])
    epoch_iterator = trange(config['num_epochs'])
    global_steps = 0
    train_loss = 0.
    logging_loss = 0.

    for _ in epoch_iterator:
        train_iterator = tqdm(train_dataloader, desc='Train', total=len(train_dataloader))
        model.train()
        # Iterate over the tqdm wrapper (not the raw loader) so the bar advances.
        for batch in train_iterator:
            # Move every tensor to the device. The model is called with keyword
            # arguments and takes its targets as ``labels``, so a 'label' key
            # is renamed on the way.
            batch = {
                ('labels' if name == 'label' else name): value.to(config['device'])
                for name, value in batch.items()
            }

            # Forward pass; the first output is the loss.
            loss = model(**batch)[0]

            # Clear stale gradients, backpropagate, update the parameters.
            model.zero_grad()
            loss.backward()
            optimizer.step()

            # Running total of the training loss.
            train_loss += loss.item()
            global_steps += 1

            if global_steps % config['logging_step'] == 0:
                # Mean training loss since the previous log line.
                print_train_loss = (train_loss - logging_loss) / config['logging_step']
                logging_loss = train_loss

                avg_val_loss, f1 = evaluation(config, model, val_dataloader)

                print_log = f'>>> training loss: {print_train_loss: .5f}, val loss: {avg_val_loss: .5f}, valid f1 score: {f1: .5f}'
                print(print_log)
                # Return to training mode after the evaluation pass.
                model.train()
    return model
            

In [None]:
# Fine-tune the classifier on the training loader, validating every config['logging_step'] steps.
# NOTE(review): the recorded run of this cell ended in OSError - presumably from_pretrained
# could not find the expected files under config['model_path']; confirm the directory contents.
model = train(config, id2label, train_dataloader, val_dataloader)

OSError: ignored