In [1]:
!nvidia-smi

NVIDIA-SMI has failed because it couldn't communicate with the NVIDIA driver. Make sure that the latest NVIDIA driver is installed and running.



In [2]:
# Pin the Transformers release used by this notebook. %pip (unlike !pip) installs
# into the environment of the running kernel rather than whatever the shell resolves.
%pip install transformers==4.0.1



In [3]:
# Pin the PyTorch release used by this notebook. %pip (unlike !pip) installs
# into the environment of the running kernel rather than whatever the shell resolves.
%pip install torch==1.4.0



In [4]:
import torch
import random
import numpy as np
# Experiment configuration shared by every cell below.
config = dict(
    # CSV inputs; the training file is later split into train/validation.
    train_file_path='/content/drive/MyDrive/Colab Notebooks/dataset/train.csv',
    test_file_path='/content/drive/MyDrive/Colab Notebooks/dataset/test.csv',
    # Fraction of the training file held out for validation.
    train_val_ratio=0.1,
    # NOTE(review): 'head' is not read by any cell visible in this notebook.
    head='cnn',
    # Directory holding the pretrained BERT vocabulary, config and weights
    # (an earlier version of this notebook pointed at 'data/data94445').
    model_path='/content/drive/MyDrive/Colab Notebooks/dataset/BERT_model',
    batch_size=16,
    num_epochs=1,
    learning_rate=2e-5,
    # Run validation and print losses every this many optimizer steps.
    logging_step=500,
    seed=2021,
    # Use the GPU when PyTorch can see one, otherwise fall back to the CPU.
    device='cuda' if torch.cuda.is_available() else 'cpu',
)

def seed_everything(seed):
    """Seed the Python, NumPy and PyTorch (CPU and CUDA) random generators.

    Args:
        seed: Value handed unchanged to each generator's seeding function.

    Returns:
        The same ``seed`` that was passed in.
    """
    seeders = (
        random.seed,
        np.random.seed,
        torch.manual_seed,
        torch.cuda.manual_seed_all,
    )
    for apply_seed in seeders:
        apply_seed(seed)
    return seed

# Seed every RNG with the configured seed; the returned seed is echoed as this cell's output.
seed_everything(config['seed'])

2021

In [5]:
!nvidia-smi

NVIDIA-SMI has failed because it couldn't communicate with the NVIDIA driver. Make sure that the latest NVIDIA driver is installed and running.



In [6]:
# Show whether PyTorch can see a CUDA device (the same check that selected config['device']).
torch.cuda.is_available()

False

In [7]:
import pandas as pd
from tqdm import tqdm
from collections import defaultdict
def read_data(config, tokenizer, mode='train'):
    """Load one CSV split and tokenize every sentence with ``tokenizer``.

    Rows are read positionally: the second column is the label (used in
    training mode only) and the last column is the sentence.

    Args:
        config: Mapping that provides ``'{mode}_file_path'``. Training mode
            also reads ``'train_val_ratio'``. An optional ``'max_length'``
            entry (default 512) caps the tokenized sequence length.
        tokenizer: Object exposing ``encode_plus`` that returns a mapping with
            ``'input_ids'``, ``'token_type_ids'`` and ``'attention_mask'``.
        mode: ``'train'`` splits the file into training and validation parts;
            any other value is treated as an unlabeled test split.

    Returns:
        In training mode ``(X_train, y_train, X_val, y_val, label2id,
        id2label)``; otherwise ``(X_test, y_test)``. Each ``X_*`` is a
        ``defaultdict(list)`` keyed by ``'inputs_ids'``, ``'token_type_ids'``
        and ``'attention_mask'``; each ``y_*`` is a ``torch.long`` tensor.
        Test labels are all-zero placeholders.
    """
    is_train = mode == 'train'
    data_df = pd.read_csv(config[f'{mode}_file_path'], sep=',')
    # In training mode the first ``num_val`` rows become the validation set.
    num_val = int(len(data_df) * config['train_val_ratio']) if is_train else 0
    # NOTE(review): 512 is assumed to match the checkpoint's position limit -
    # confirm against the model config or set config['max_length'].
    max_length = config.get('max_length', 512)

    train_split = (defaultdict(list), [])
    val_split = (defaultdict(list), [])
    test_split = (defaultdict(list), [])

    # Plain tuples make row[1] / row[-1] unambiguous positional lookups;
    # integer keys on a label-indexed Series are deprecated in newer pandas.
    rows = data_df.itertuples(index=False, name=None)
    progress = tqdm(rows, desc=f'preprocess {mode} data', total=len(data_df))
    for position, row in enumerate(progress):
        label = row[1] if is_train else 0
        sentence = row[-1]

        # add_special_tokens inserts [CLS]/[SEP]; truncation keeps over-long
        # sentences within ``max_length`` instead of failing later in the model.
        inputs = tokenizer.encode_plus(
            sentence,
            add_special_tokens=True,
            truncation=True,
            max_length=max_length,
            return_token_type_ids=True,
            return_attention_mask=True,
        )

        if not is_train:
            features, targets = test_split
        elif position < num_val:
            features, targets = val_split
        else:
            features, targets = train_split

        # The 'inputs_ids' spelling is what TNEWSData and collate_fn look up.
        features['inputs_ids'].append(inputs['input_ids'])
        features['token_type_ids'].append(inputs['token_type_ids'])
        features['attention_mask'].append(inputs['attention_mask'])
        targets.append(label)

    if not is_train:
        X_test, y_test = test_split
        return X_test, torch.tensor(y_test, dtype=torch.long)

    X_train, y_train = train_split
    X_val, y_val = val_split
    # Build the mapping from both splits so a label that only occurs in the
    # validation rows cannot raise KeyError below.
    label2id = {label: i for i, label in enumerate(np.unique(y_train + y_val))}
    id2label = {i: label for label, i in label2id.items()}
    y_train = torch.tensor([label2id[label] for label in y_train], dtype=torch.long)
    y_val = torch.tensor([label2id[label] for label in y_val], dtype=torch.long)
    return X_train, y_train, X_val, y_val, label2id, id2label

In [8]:
from torch.utils.data import Dataset
class TNEWSData(Dataset):
    """Dataset pairing tokenized feature lists with a label tensor."""

    def __init__(self, X, y):
        # X: mapping with 'inputs_ids', 'token_type_ids' and 'attention_mask'
        # sequences; y: label tensor whose first dimension is the sample count.
        self.x, self.y = X, y

    def __len__(self):
        """Number of samples, taken from the first dimension of the labels."""
        return self.y.size(0)

    def __getitem__(self, idx):
        """Return the features and label of sample ``idx`` as a dict."""
        features = self.x
        sample = {'inputs_ids': features['inputs_ids'][idx], 'label': self.y[idx]}
        sample['token_type_ids'] = features['token_type_ids'][idx]
        sample['attention_mask'] = features['attention_mask'][idx]
        return sample

In [9]:
def collate_fn(examples):
    """Pad a list of samples to a common length and stack them into tensors.

    Args:
        examples: Sequence of dicts with keys ``'inputs_ids'``, ``'label'``,
            ``'token_type_ids'`` and ``'attention_mask'``.

    Returns:
        Dict with ``'input_ids'``, ``'labels'``, ``'token_type_ids'`` and
        ``'attention_mask'`` as ``torch.long`` tensors. The three sequence
        tensors have shape (batch, longest sequence) and are zero-padded on
        the right.
    """
    # Split the per-sample dicts into one list per field.
    ids_per_sample = [sample['inputs_ids'] for sample in examples]
    label_per_sample = [sample['label'] for sample in examples]
    type_ids_per_sample = [sample['token_type_ids'] for sample in examples]
    mask_per_sample = [sample['attention_mask'] for sample in examples]

    # Allocate zero-filled (batch, longest) buffers; zeros double as padding.
    longest = max(map(len, ids_per_sample))
    padded_ids = torch.zeros((len(label_per_sample), longest), dtype=torch.long)
    padded_type_ids = torch.zeros_like(padded_ids)
    padded_mask = torch.zeros_like(padded_ids)

    # Copy every sequence into the left part of its row.
    fields = zip(ids_per_sample, type_ids_per_sample, mask_per_sample)
    for row, (ids, type_ids, mask) in enumerate(fields):
        width = len(ids)
        padded_ids[row, :width] = torch.tensor(ids, dtype=torch.long)
        padded_type_ids[row, :width] = torch.tensor(type_ids, dtype=torch.long)
        padded_mask[row, :width] = torch.tensor(mask, dtype=torch.long)

    batch = {'input_ids': padded_ids}
    batch['labels'] = torch.tensor(label_per_sample, dtype=torch.long)
    batch['token_type_ids'] = padded_type_ids
    batch['attention_mask'] = padded_mask
    return batch

In [10]:
from transformers import BertTokenizer
from torch.utils.data import DataLoader
def build_dataloader(config):
    """Tokenize the train/test CSVs and wrap each split in a DataLoader.

    Args:
        config: Mapping providing ``'model_path'`` (tokenizer location),
            ``'batch_size'`` and the keys ``read_data`` reads. An optional
            ``'num_workers'`` entry (default 4, the previously hard-coded
            value) sets the number of loader worker processes.

    Returns:
        ``(train_dataloader, val_dataloader, test_dataloader, id2label)``.
        Only the training loader shuffles.
    """
    # Load the vocabulary that belongs to the pretrained checkpoint.
    tokenizer = BertTokenizer.from_pretrained(config['model_path'])

    X_train, y_train, X_val, y_val, label2id, id2label = read_data(config, tokenizer, mode='train')
    X_test, y_test = read_data(config, tokenizer, mode='test')

    # Arguments shared by all three loaders; only ``shuffle`` differs per split.
    loader_kwargs = dict(
        batch_size=config['batch_size'],
        num_workers=config.get('num_workers', 4),
        collate_fn=collate_fn,
    )

    train_dataloader = DataLoader(TNEWSData(X_train, y_train), shuffle=True, **loader_kwargs)
    val_dataloader = DataLoader(TNEWSData(X_val, y_val), shuffle=False, **loader_kwargs)
    test_dataloader = DataLoader(TNEWSData(X_test, y_test), shuffle=False, **loader_kwargs)

    return train_dataloader, val_dataloader, test_dataloader, id2label

In [11]:
# Tokenize both CSV files and build the train/validation/test loaders plus the id -> label mapping.
train_dataloader, val_dataloader, test_dataloader, id2label = build_dataloader(config)

preprocess train data: 100%|██████████| 53360/53360 [00:44<00:00, 1189.78it/s]
preprocess test data: 100%|██████████| 10000/10000 [00:08<00:00, 1225.94it/s]


In [12]:
# Sanity check: print only the first collated training batch, then stop iterating.
for batch in train_dataloader:
    print(batch)
    break

{'input_ids': tensor([[  101,  6205,  2128,  4415,  2339,  1920,  2110,  2247,   754,   784,
           720,  2231,  3613,  4638,  1920,  2110,  8043,   102,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0],
        [  101,  1963,   862,  6397,   817,  4906,  3683,  2456,  6379,  6205,
          5885,  3172,  3121,  1359,  2832,  5074,  2013,  1232,  4638,   752,
          2658,  8043,   102,     0,     0,     0,     0,     0,     0,     0,
             0],
        [  101,  8391,  4413,  3215,  2853,  4170,  6443,  3297,  7464,  3698,
          8043,  2001,  3209,  6804,  2802,  3952,  2767,  6804,  2853,  4170,
          8024, 12695,  1402,  4170,  1008,  1613,  4125,   102,     0,     0,
             0],
        [  101,  1395,  1164,  9101,  2797,  5632,   671,   860,  4638,  2797,
          1220,  2913,  3221,  2582,   720,  4500,  4638,  8043,   102,     0,
             0,     0,     0,     0,     0,     0,     0,     0,  

In [13]:
from sklearn.metrics import f1_score
def evaluation(config, model, val_dataloader):
    """Run ``model`` over the validation loader and score its predictions.

    Args:
        config: Mapping providing ``'device'``, where each batch is moved.
        model: Callable accepting the batch as keyword arguments and returning
            a sequence whose first two items are the loss and the logits.
            It is switched to eval mode and left there.
        val_dataloader: Sized iterable of batches (dicts of tensors) that
            include a ``'labels'`` entry.

    Returns:
        ``(avg_val_loss, f1)``: the loss averaged over batches and the
        macro-averaged F1 score.
    """
    gold_chunks, pred_chunks = [], []
    loss_sum = 0.

    model.eval()
    progress = tqdm(val_dataloader, desc='Evaluation', total=len(val_dataloader))
    with torch.no_grad():
        for cpu_batch in progress:
            # Keep the gold labels as yielded by the loader for scoring later.
            gold_chunks.append(cpu_batch['labels'])
            device_batch = {name: tensor.to(config['device']) for name, tensor in cpu_batch.items()}
            outputs = model(**device_batch)
            loss, logits = outputs[:2]

            loss_sum += loss.item()
            pred_chunks.append(logits.argmax(dim=-1).detach().cpu())

    mean_loss = loss_sum / len(val_dataloader)
    gold = torch.cat(gold_chunks, dim=0).numpy()
    predicted = torch.cat(pred_chunks, dim=0).numpy()
    return mean_loss, f1_score(gold, predicted, average='macro')

In [14]:
# Bert_model train
from transformers import BertConfig, BertForSequenceClassification
from transformers import AdamW
from tqdm import trange
def train(config, id2label, train_dataloader, val_dataloader):
    """Fine-tune a BERT sequence classifier and return the trained model.

    Args:
        config: Mapping providing ``'model_path'``, ``'learning_rate'``,
            ``'device'``, ``'num_epochs'`` and ``'logging_step'``.
        id2label: Mapping whose length sets the number of output classes.
        train_dataloader: Loader yielding dicts of tensors that are passed to
            the model as keyword arguments.
        val_dataloader: Loader handed to ``evaluation`` at every logging step.

    Returns:
        The fine-tuned model, left in training mode.
    """
    # Load the pretrained configuration and resize the classification head.
    checkpoint_dir = config['model_path']
    bert_config = BertConfig.from_pretrained(checkpoint_dir)
    bert_config.num_labels = len(id2label)
    model = BertForSequenceClassification.from_pretrained(checkpoint_dir, config=bert_config)

    optimizer = AdamW(model.parameters(), lr=config['learning_rate'])

    device = config['device']
    model.to(device)

    step_count = 0
    cumulative_loss = 0.
    loss_at_last_log = 0.

    for _ in trange(config['num_epochs']):
        progress = tqdm(train_dataloader, desc='Training', total=len(train_dataloader))
        model.train()

        for cpu_batch in progress:
            device_batch = {name: tensor.to(device) for name, tensor in cpu_batch.items()}

            # Forward pass; the first output is the loss.
            loss = model(**device_batch)[0]

            # Clear stale gradients, backpropagate, then update the weights.
            model.zero_grad()
            loss.backward()
            optimizer.step()

            cumulative_loss += loss.item()
            step_count += 1

            if step_count % config['logging_step'] != 0:
                continue

            # Mean training loss over the steps since the previous report.
            window_loss = (cumulative_loss - loss_at_last_log) / config['logging_step']
            loss_at_last_log = cumulative_loss

            val_loss, val_f1 = evaluation(config, model, val_dataloader)
            print(f'>>> training loss: {window_loss:.4f}, valid loss: {val_loss:.4f}, '
                  f'valid f1 score: {val_f1:.4f}')

            # evaluation() switched the model to eval mode; switch back.
            model.train()

    return model

In [15]:
# Fine-tune the classifier; the returned model is what predict() uses afterwards.
model = train(config, id2label, train_dataloader, val_dataloader)

Some weights of the model checkpoint at /content/drive/MyDrive/Colab Notebooks/dataset/BERT_model were not used when initializing BertForSequenceClassification: ['cls.predictions.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.decoder.weight', 'cls.seq_relationship.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not 

>>> training loss: 1.6373, valid loss: 1.4193, valid f1 score: 0.4969



Training:  17%|█▋        | 501/3002 [1:18:19<110:41:27, 159.33s/it][A
Training:  17%|█▋        | 502/3002 [1:18:27<79:11:09, 114.03s/it] [A
Training:  17%|█▋        | 503/3002 [1:18:35<57:00:41, 82.13s/it] [A
Training:  17%|█▋        | 504/3002 [1:18:42<41:25:48, 59.71s/it][A
Training:  17%|█▋        | 505/3002 [1:18:49<30:27:24, 43.91s/it][A
Training:  17%|█▋        | 506/3002 [1:18:58<23:05:48, 33.31s/it][A
Training:  17%|█▋        | 507/3002 [1:19:05<17:35:52, 25.39s/it][A
Training:  17%|█▋        | 508/3002 [1:19:13<14:07:08, 20.38s/it][A
Training:  17%|█▋        | 509/3002 [1:19:21<11:22:10, 16.42s/it][A
Training:  17%|█▋        | 510/3002 [1:19:28<9:35:14, 13.85s/it] [A
Training:  17%|█▋        | 511/3002 [1:19:36<8:12:39, 11.87s/it][A
Training:  17%|█▋        | 512/3002 [1:19:43<7:13:25, 10.44s/it][A
Training:  17%|█▋        | 513/3002 [1:19:50<6:32:41,  9.47s/it][A
Training:  17%|█▋        | 514/3002 [1:19:57<6:07:18,  8.86s/it][A
Training:  17%|█▋        | 515/3

>>> training loss: 1.4121, valid loss: 1.3300, valid f1 score: 0.5082



Training:  33%|███▎      | 1001/3002 [2:32:51<90:59:10, 163.69s/it] [A
Training:  33%|███▎      | 1002/3002 [2:32:59<64:54:22, 116.83s/it][A
Training:  33%|███▎      | 1003/3002 [2:33:07<46:47:19, 84.26s/it] [A
Training:  33%|███▎      | 1004/3002 [2:33:14<33:54:25, 61.09s/it][A
Training:  33%|███▎      | 1005/3002 [2:33:21<24:55:33, 44.93s/it][A
Training:  34%|███▎      | 1006/3002 [2:33:28<18:35:53, 33.54s/it][A
Training:  34%|███▎      | 1007/3002 [2:33:36<14:17:23, 25.79s/it][A
Training:  34%|███▎      | 1008/3002 [2:33:45<11:32:01, 20.82s/it][A
Training:  34%|███▎      | 1009/3002 [2:33:52<9:14:42, 16.70s/it] [A
Training:  34%|███▎      | 1010/3002 [2:33:59<7:36:54, 13.76s/it][A
Training:  34%|███▎      | 1011/3002 [2:34:07<6:37:01, 11.96s/it][A
Training:  34%|███▎      | 1012/3002 [2:34:16<6:10:37, 11.17s/it][A
Training:  34%|███▎      | 1013/3002 [2:34:23<5:29:30,  9.94s/it][A
Training:  34%|███▍      | 1014/3002 [2:34:30<5:00:12,  9.06s/it][A
Training:  34%|███▍ 

>>> training loss: 1.3530, valid loss: 1.3165, valid f1 score: 0.5050



Training:  50%|█████     | 1501/3002 [3:48:37<69:08:43, 165.84s/it][A
Training:  50%|█████     | 1502/3002 [3:48:44<49:15:11, 118.21s/it][A
Training:  50%|█████     | 1503/3002 [3:48:51<35:22:06, 84.94s/it] [A
Training:  50%|█████     | 1504/3002 [3:48:58<25:37:10, 61.57s/it][A
Training:  50%|█████     | 1505/3002 [3:49:06<18:51:48, 45.36s/it][A
Training:  50%|█████     | 1506/3002 [3:49:13<14:05:49, 33.92s/it][A
Training:  50%|█████     | 1507/3002 [3:49:20<10:44:15, 25.86s/it][A
Training:  50%|█████     | 1508/3002 [3:49:28<8:34:14, 20.65s/it] [A
Training:  50%|█████     | 1509/3002 [3:49:35<6:51:51, 16.55s/it][A
Training:  50%|█████     | 1510/3002 [3:49:44<5:48:38, 14.02s/it][A
Training:  50%|█████     | 1511/3002 [3:49:55<5:32:26, 13.38s/it][A
Training:  50%|█████     | 1512/3002 [3:50:03<4:47:45, 11.59s/it][A
Training:  50%|█████     | 1513/3002 [3:50:10<4:17:24, 10.37s/it][A
Training:  50%|█████     | 1514/3002 [3:50:18<3:53:24,  9.41s/it][A
Training:  50%|█████  

>>> training loss: 1.3088, valid loss: 1.2774, valid f1 score: 0.5347



Training:  67%|██████▋   | 2001/3002 [5:06:35<46:55:23, 168.76s/it][A
Training:  67%|██████▋   | 2002/3002 [5:06:42<33:24:33, 120.27s/it][A
Training:  67%|██████▋   | 2003/3002 [5:06:49<23:57:21, 86.33s/it] [A
Training:  67%|██████▋   | 2004/3002 [5:06:57<17:26:24, 62.91s/it][A
Training:  67%|██████▋   | 2005/3002 [5:07:05<12:49:01, 46.28s/it][A
Training:  67%|██████▋   | 2006/3002 [5:07:12<9:34:42, 34.62s/it] [A
Training:  67%|██████▋   | 2007/3002 [5:07:21<7:27:22, 26.98s/it][A
Training:  67%|██████▋   | 2008/3002 [5:07:29<5:48:21, 21.03s/it][A
Training:  67%|██████▋   | 2009/3002 [5:07:36<4:39:28, 16.89s/it][A
Training:  67%|██████▋   | 2010/3002 [5:07:43<3:50:59, 13.97s/it][A
Training:  67%|██████▋   | 2011/3002 [5:07:50<3:17:26, 11.95s/it][A
Training:  67%|██████▋   | 2012/3002 [5:07:57<2:51:37, 10.40s/it][A
Training:  67%|██████▋   | 2013/3002 [5:08:04<2:35:58,  9.46s/it][A
Training:  67%|██████▋   | 2014/3002 [5:08:13<2:32:18,  9.25s/it][A
Training:  67%|██████▋  

>>> training loss: 1.2811, valid loss: 1.2828, valid f1 score: 0.4986



Training:  83%|████████▎ | 2501/3002 [6:16:49<19:34:13, 140.63s/it][A
Training:  83%|████████▎ | 2502/3002 [6:16:57<14:01:34, 100.99s/it][A
Training:  83%|████████▎ | 2503/3002 [6:17:03<10:02:51, 72.49s/it] [A
Training:  83%|████████▎ | 2504/3002 [6:17:12<7:22:15, 53.28s/it] [A
Training:  83%|████████▎ | 2505/3002 [6:17:18<5:25:09, 39.26s/it][A
Training:  83%|████████▎ | 2506/3002 [6:17:24<4:02:28, 29.33s/it][A
Training:  84%|████████▎ | 2507/3002 [6:17:31<3:04:52, 22.41s/it][A
Training:  84%|████████▎ | 2508/3002 [6:17:37<2:25:33, 17.68s/it][A
Training:  84%|████████▎ | 2509/3002 [6:17:43<1:56:41, 14.20s/it][A
Training:  84%|████████▎ | 2510/3002 [6:17:49<1:36:07, 11.72s/it][A
Training:  84%|████████▎ | 2511/3002 [6:17:59<1:31:08, 11.14s/it][A
Training:  84%|████████▎ | 2512/3002 [6:18:05<1:19:05,  9.68s/it][A
Training:  84%|████████▎ | 2513/3002 [6:18:12<1:10:11,  8.61s/it][A
Training:  84%|████████▎ | 2514/3002 [6:18:18<1:03:59,  7.87s/it][A
Training:  84%|████████▍ |

>>> training loss: 1.2837, valid loss: 1.2428, valid f1 score: 0.5326



Training: 100%|█████████▉| 3001/3002 [7:24:19<02:20, 140.66s/it][A
Training: 100%|██████████| 3002/3002 [7:24:23<00:00,  8.88s/it]
100%|██████████| 1/1 [7:24:23<00:00, 26663.35s/it]


In [16]:
def predict(config, id2label, model, test_dataloader):
    """Predict a label for every test sample and write ``submission.csv``.

    Args:
        config: Mapping providing ``'device'`` and ``'test_file_path'``.
        id2label: Mapping from predicted class index to the original label.
        model: Callable accepting the batch as keyword arguments and returning
            a sequence whose second item is the logits.
        test_dataloader: Sized iterable of batches (dicts of tensors).

    Returns:
        None. The test CSV is re-read, a ``label`` column is inserted at
        position 1, the ``sentence`` column is dropped and the result is
        written to ``submission.csv`` in the working directory.
    """
    progress = tqdm(test_dataloader, desc='Predicting', total=len(test_dataloader))
    model.eval()

    pred_chunks = []
    with torch.no_grad():
        for cpu_batch in progress:
            device_batch = {name: tensor.to(config['device']) for name, tensor in cpu_batch.items()}
            outputs = model(**device_batch)
            pred_chunks.append(outputs[1].argmax(dim=-1).detach().cpu())

    # Map class indices back to the labels used in the training file.
    class_ids = torch.cat(pred_chunks, dim=0).numpy()
    predicted_labels = [id2label[class_id] for class_id in class_ids]

    submission = pd.read_csv(config['test_file_path'], sep=',')
    submission.insert(1, column='label', value=predicted_labels)
    submission = submission.drop(columns=['sentence'])
    submission.to_csv('submission.csv', index=False, encoding='utf8')

In [17]:
# Label the test set with the fine-tuned model and write the results to submission.csv.
predict(config, id2label, model, test_dataloader)

Predicting: 100%|██████████| 625/625 [18:49<00:00,  1.81s/it]
