In [1]:
import torch
import random
import numpy as np
config = {
    'train_file_path':'dataset/train.csv',
    'test_file_path':'dataset/test.csv',
    'train_val_ratio':0.1,
    'model_path':'dataset/BERT_model',
    'batch_size':16,
    'num_epochs':1,
    'learning_rate':2e-5,
    'logging_step':500,
    'seed':2022
}
config['device'] = 'cuda' if torch.cuda.is_available() else 'cpu'

def seed_everything(seed):
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)
    return seed

seed_everything(config['seed'])

2022

In [2]:
import pandas as pd
from tqdm import tqdm
from collections import defaultdict
def read_data(config, tokenizer, mode = 'train'):
    data_df = pd.read_csv(config[f'{mode}_file_path'], sep=',')
    if mode == 'train':
        # 如果是训练文件 则划分训练集、测试集
        X_train, y_train = defaultdict(list),[]
        X_val, y_val = defaultdict(list),[]
        num_val = int(len(data_df) * config['train_val_ratio'])
    else:
        #只生成测试集
        X_test, y_test = defaultdict(list),[]
        
        
    for i, row in tqdm(data_df.iterrows(), desc=f'preprocess {mode} data', total =len(data_df)):
        #得到每个句子的标签
        label = row[1] if mode == 'train' else 0
        #得到每个句子
        sentence = row[-1]
        # add_special_tokens 添加一些特殊的toke： CLS、 SEP
        # return_token_type_ids 返回每个词所对应的id
        # return_attention_mask 批量时， padding部分无需注意
        inputs = tokenizer.encode_plus(sentence, add_special_tokens = True, return_token_type_ids =True, return_attention_mask = True )
        # return：
        # input_ids；
        # token_type_ids
        # attention_mask
        if mode == 'train':
            if i < num_val:
                X_val['inputs_ids'].append(inputs['input_ids'])
                y_val.append(label)
                X_val['token_type_ids'].append(inputs['token_type_ids'])
                X_val['attention_mask'].append(inputs['attention_mask'])
            else:
                X_train['inputs_ids'].append(inputs['input_ids'])
                y_train.append(label)
                X_train['token_type_ids'].append(inputs['token_type_ids'])
                X_train['attention_mask'].append(inputs['attention_mask'])
                
        else:
                X_test['inputs_ids'].append(inputs['input_ids'])
                y_test.append(label) 
                X_test['token_type_ids'].append(inputs['token_type_ids'])
                X_test['attention_mask'].append(inputs['attention_mask'])
                
                
    if mode == 'train':
        label2id ={label: i for i, label in enumerate(np.unique(y_train))}
        id2label ={i: label for label, i in label2id.items()}
        y_train = torch.tensor([label2id[i] for i in y_train],dtype = torch.long)
        y_val = torch.tensor([label2id[i] for i in y_val],dtype = torch.long)
        return X_train, y_train, X_val, y_val, label2id, id2label
    
    else:
        y_test = torch.tensor(y_test, dtype = torch.long)
        return X_test,y_test        

In [3]:
from torch.utils.data import Dataset
class TNEWSData(Dataset):
    def __init__(self, X, y):
        self.x = X
        self.y = y 
    
    def __getitem__(self, idx):
        return {
            'inputs_ids': self.x['inputs_ids'][idx],
            'label': self.y[idx],
            'token_type_ids': self.x['token_type_ids'][idx],
            'attention_mask': self.x['attention_mask'][idx]
        }
    def __len__(self):
        return self.y.size(0)

从TNEWSData返回的一个example 
     
        {
            'inputs_ids' : self.x['inputs_ids'][idx],
            'label' : self.y[idx],
            # ---------------------- part 1 ---------------------- #
            'token_type_ids': self.x['token_type_ids'][idx],
            'attention_mask': self.x['attention_mask'][idx]
            # ---------------------- part 1 ---------------------- #
        }
        
        
取多次 examples，使用collate_fn整合到一起，变成一个tensor, 以便并行加载

In [4]:
def collate_fn(example):
    #从 TNEWData 返回的{}中，多了两个key：token_type_ids、attention_mask
    input_ids_list = []
    labels = []
    token_type_ids_list = []
    attention_mask_list = []
    
    for ex in example:
        input_ids_list.append(ex['inputs_ids'])
        labels.append(ex['label'])
        token_type_ids_list.append(ex['token_type_ids'])
        attention_mask_list.append(ex['attention_mask'])
    
    #对齐放入tensor
    max_len = max(len(input_ids) for input_ids in input_ids_list)
    # shape (len(labels), max_length)
    input_ids_tensor = torch.zeros((len(labels), max_len), dtype=torch.long)
    token_type_ids_tensor = torch.zeros_like(input_ids_tensor)
    attention_mask_tensor = torch.zeros_like(input_ids_tensor)
    
    
    # 把列表中的数据放入tensor里
    for i, input_ids in enumerate(input_ids_list):
        input_ids_tensor[i, :len(input_ids)] = torch.tensor(input_ids, dtype = torch.long)
        token_type_ids_tensor[i, :len(input_ids)] = torch.tensor(token_type_ids_list[i], dtype= torch.long)
        attention_mask_tensor[i, :len(input_ids)] = torch.tensor(attention_mask_list[i], dtype= torch.long)
        
        
    return {
        'input_ids': input_ids_tensor,
        'label': torch.tensor(labels, dtype = torch.long),
        'token_type_ids': token_type_ids_tensor,
        'attention_mask':  attention_mask_tensor
    }

In [5]:
from transformers import BertTokenizer
from torch.utils.data import DataLoader
def build_dataloader(config):
    #加载词表
    tokenizer = BertTokenizer.from_pretrained(config['model_path'])
    X_train, y_train, X_val, y_val, label2id, id2label = read_data(config, tokenizer, mode='train')
    X_test, y_test = read_data(config, tokenizer, mode='test')
    
    train_dataset = TNEWSData(X_train, y_train)
    val_dataset=TNEWSData(X_val, y_val)
    test_dataset = TNEWSData(X_test, y_test)
    
    train_dataloader = DataLoader(train_dataset, batch_size=config['batch_size'], num_workers=4, shuffle=True, collate_fn=collate_fn)
    val_dataloader = DataLoader(val_dataset, batch_size=config['batch_size'], num_workers=4, shuffle=False, collate_fn=collate_fn)
    test_dataloader = DataLoader(test_dataset, batch_size=config['batch_size'], num_workers=4, shuffle=False, collate_fn=collate_fn)
    
    
    return train_dataloader, val_dataloader, test_dataloader, id2label

In [6]:
train_dataloader, val_dataloader, test_dataloader, id2label = build_dataloader(config)

preprocess train data: 100%|███████████████████████████████████████████████████| 53360/53360 [00:15<00:00, 3543.85it/s]
preprocess test data: 100%|████████████████████████████████████████████████████| 10000/10000 [00:02<00:00, 3580.50it/s]


In [None]:
for batch in tqdm(iter(train_dataloader)):
    print(batch)
    break

In [None]:
for _id, batch in tqdm(train_dataloader):
    print(batch)
    break

  0%|                                                                                         | 0/3002 [00:00<?, ?it/s]