In [1]:
!wandb login

[34m[1mwandb[0m: W&B API key is configured (use `wandb login --relogin` to force relogin)


In [2]:
import os
os.environ['CUDA_VISIBLE_DEVICES']='6'

In [3]:
# Inside my model training code
import wandb

wandb.init(project="meld-emo-cls",entity='qftie',group='bert')

config = wandb.config          # Initialize config
config.batch_size = 8          # input batch size for training (default: 64)
config.test_batch_size = 64    # input batch size for testing (default: 1000)
config.epochs = 6             # number of epochs to train (default: 10)
config.lr = 2e-5               # learning rate (default: 0.01)
config.momentum = 0.1          # SGD momentum (default: 0.5) 
config.no_cuda = False         # disables CUDA training
config.bert_path = 'roberta-base'
config.exam_name = 'roberta-forwardUt'
config.max_seq_len = 400
config.max_turn = 4
config.seed = 2022


Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.
[34m[1mwandb[0m: W&B API key is configured (use `wandb login --relogin` to force relogin)
2022-04-22 16:59:03.991600: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory
2022-04-22 16:59:03.991640: I tensorflow/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine.


In [4]:
import pandas as pd 
import numpy as np 
import json, time
from tqdm import tqdm 
from sklearn.metrics import accuracy_score, classification_report, f1_score
import torch
from torch.utils.data import Dataset, DataLoader, TensorDataset 
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler
from transformers import BertModel, BertConfig, AutoTokenizer, AdamW, get_cosine_schedule_with_warmup, AutoModel, AutoConfig
import warnings
warnings.filterwarnings('ignore')

bert_path = config.bert_path
tokenizer = AutoTokenizer.from_pretrained(bert_path, truncation_side="left")   # 初始化分词器

2022-04-22 16:59:10.540148: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory
2022-04-22 16:59:10.540181: I tensorflow/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine.


# seed everything

In [5]:
import numpy as np
import random

def set_seed(seed):
    np.random.seed(seed)
    random.seed(seed)
    torch.manual_seed(seed)
    if torch.cuda.is_available():
        torch.cuda.manual_seed(seed)
        torch.cuda.manual_seed_all(seed)
    torch.backends.cudnn.benchmark = False
    torch.backends.cudnn.deterministic = True

set_seed(config.seed)

# 预处理数据集

In [6]:
emo2id = {'neutral': 0, 'surprise': 1, 'fear': 2, 'sadness': 3, 'joy': 4, 'disgust': 5, 'anger': 6}

## 准备forward utterance作为sentence

In [7]:
# back_stride = int(data_text_valid['Utterance_ID'][50][-3:])
# ' '.join(data_text_valid['Utterance'][ 50- back_stride: 50].tolist())

In [8]:
# print(x_valid_forward_utt)

## tokenize

In [9]:
def load_meld_and_tokenize(file_path, train=False, context_turn = config.max_turn):
    data_text = pd.read_csv(file_path)
    data_text['Speaker_Utterance'] = data_text['Speaker'] + ':' + data_text['Utterance']
    x= (data_text['Speaker_Utterance']).tolist()
    label = [emo2id[x] for x in data_text['Emotion']]
    context = []
    for i in range(len(data_text)):
        if data_text['Utterance_ID'][i]<context_turn:
            back_stride = data_text['Utterance_ID'][i]
        else:
            back_stride = context_turn
        utt_forward = ' '.join(data_text['Speaker_Utterance'][ i- back_stride: i].tolist())
        context.append(utt_forward)
    for i in range(len(x)):
        x[i] = context[i] + '[SEP]' + x[i]
    return tokenizer(x, truncation=True, padding=True, max_length=512),label
    
train_encoding, train_label = load_meld_and_tokenize(file_path='/148Dataset/data-tie.qianfeng/MELD/data/MELD/train_sent_emo.csv')
valid_encoding, valid_label = load_meld_and_tokenize(file_path='/148Dataset/data-tie.qianfeng/MELD/data/MELD/dev_sent_emo.csv')
test_encoding, test_label = load_meld_and_tokenize(file_path='/148Dataset/data-tie.qianfeng/MELD/data/MELD/test_sent_emo.csv')

In [10]:
train_encoding['input_ids'][0]

[0,
 10975,
 3388,
 510,
 742,
 4771,
 463,
 1371,
 35,
 19726,
 38,
 21,
 5,
 477,
 621,
 15,
 127,
 138,
 17,
 27,
 29,
 3868,
 31,
 5,
 26544,
 12,
 245,
 7,
 8837,
 12,
 401,
 467,
 4,
 2,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1]

# 准备训练集，验证集，测试集

In [11]:
# 数据集读取 转成dict形式
class NewsDataset(Dataset):
    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels
    
    # 读取单个样本
    def __getitem__(self, idx):
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        item['labels'] = torch.tensor(int(self.labels[idx]))
        return item
    
    def __len__(self):
        return len(self.labels)

train_dataset = NewsDataset(train_encoding, train_label)
valid_dataset = NewsDataset(valid_encoding, valid_label)
test_dataset = NewsDataset(test_encoding, test_label)


# 加载到torch的dataloader

In [12]:
# 单个读取到批量读取
train_loader = DataLoader(train_dataset, batch_size=config.batch_size, shuffle=True)

# train_loader = DataLoader(train_dataset, batch_size=config.batch_size, sampler=sampler)
valid_loader = DataLoader(valid_dataset, batch_size=config.batch_size, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=config.test_batch_size, shuffle=True)



# 定义bert模型

In [13]:

# 定义model
class Bert_Model(nn.Module):
    def __init__(self, bert_path, classes=7):
        super(Bert_Model, self).__init__()
        self.config = AutoConfig.from_pretrained(bert_path)  # 导入模型超参数
        self.bert = AutoModel.from_pretrained(bert_path)     # 加载预训练模型权重
        self.fc = nn.Linear(self.config.hidden_size, classes)  # 直接分类
        self.dense = nn.Linear(self.config.hidden_size, self.config.hidden_size)
        self.activation = nn.Tanh()
        self.pred = nn.Sequential(
            nn.Linear(self.config.hidden_size, 64),
            nn.Dropout(0.3),
            nn.Linear(64, classes)
        )


        
        
    def forward(self, input_ids, attention_mask=None):
        outputs = self.bert(input_ids, attention_mask, output_hidden_states= True, return_dict=True)
        # out_pool = outputs[1]   # 池化后的输出 [bs, config.hidden_size]
        out_pool = torch.mean(outputs.last_hidden_state, 1)
        # out_pool = self.dense(out_pool)
        # out_pool = self.activation(out_pool)
        logit = self.pred(out_pool)   #  [bs, classes]
        return logit

# 实例化bert模型

In [14]:
def get_parameter_number(model):
    #  打印模型参数量
    total_num = sum(p.numel() for p in model.parameters())
    trainable_num = sum(p.numel() for p in model.parameters() if p.requires_grad)
    return 'Total parameters: {}, Trainable parameters: {}'.format(total_num, trainable_num)

DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")
EPOCHS = config.epochs
model = Bert_Model(bert_path)
model = nn.DataParallel(model)
model = model.cuda()
print(get_parameter_number(model))


Some weights of the model checkpoint at roberta-base were not used when initializing RobertaModel: ['lm_head.decoder.weight', 'lm_head.dense.bias', 'lm_head.bias', 'lm_head.layer_norm.bias', 'lm_head.dense.weight', 'lm_head.layer_norm.weight']
- This IS expected if you are initializing RobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Total parameters: 125291278, Trainable parameters: 125291278


# 优化器定义

In [15]:
optimizer = AdamW(model.parameters(), lr=config.lr, weight_decay=1e-4) #AdamW优化器
scheduler = get_cosine_schedule_with_warmup(optimizer, num_warmup_steps=0.3*len(train_loader),
                                            num_training_steps=EPOCHS*len(train_loader))
# 学习率先线性warmup一个epoch，然后cosine式下降。
# 这里给个小提示，一定要加warmup（学习率从0慢慢升上去），如果把warmup去掉，可能收敛不了。

In [16]:
# # 准备放入多卡环境
# model, optimizer, train_loader = accelerator.prepare(model, optimizer, train_loader)

# 定义训练函数和验证测试函数

In [17]:
# 评估模型性能，在验证集上
def evaluate(model, data_loader, device):
    model.eval()
    criterion = nn.CrossEntropyLoss()
    val_true, val_pred = [], []
    valid_loss_sum = 0.0
    with torch.no_grad():
        for idx, batch in enumerate(data_loader):
            y_pred = model(batch['input_ids'].to(device), batch['attention_mask'].to(device))
            loss = criterion(y_pred, batch['labels'].to(device))
            y_pred = torch.argmax(y_pred, dim=1).detach().cpu().numpy().tolist()
            val_pred.extend(y_pred)
            val_true.extend(batch['labels'].cpu().numpy().tolist())
            valid_loss_sum += loss.item()
            
    print(classification_report(val_true, val_pred, digits=4))
    return accuracy_score(val_true, val_pred), valid_loss_sum/len(data_loader), f1_score(val_true, val_pred, average='macro')  #返回accuracy, loss, f1-macro


# 测试集没有标签，需要预测提交
def predict(model, data_loader, device):
    model.eval()
    val_pred = []
    with torch.no_grad():
        for batch in data_loader:
            y_pred = model(batch['input_ids'].to(device), batch['attention_mask'].to(device))
            y_pred = torch.argmax(y_pred, dim=1).detach().cpu().numpy().tolist()
            val_pred.extend(y_pred)
    return val_pred


def train_and_eval(model, train_loader, valid_loader, 
                   optimizer, scheduler, device, epoch):
    best_acc = 0.0
    patience = 0
    best_loss = 100
    best_macro_f1 = 0
    b = 0.6
    criterion = nn.CrossEntropyLoss()
    for i in range(epoch):
        """训练模型"""
        start = time.time()
        model.train()
        print("***** Running training epoch {} *****".format(i+1))
        train_loss_sum = 0.0
        for idx, batch in enumerate(train_loader):
            ids = batch['input_ids'].to(device)
            att = batch['attention_mask'].to(device)
            y = batch['labels'].to(device)  
            y_pred = model(ids, att)
            loss = criterion(y_pred, y)
            loss = (loss - b).abs() + b # This is it!
            step_lr = np.array([param_group["lr"] for param_group in optimizer.param_groups]).mean()
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
            scheduler.step()   # 学习率变化
            
            train_loss_sum += loss.item()
            if (idx + 1) % (len(train_loader)//20) == 0:    # 只打印五次结果
                wandb.log({
                            'Epoch': i+1, 
                            'train_loss': loss,
                            'lr': step_lr
                            })
                print("Epoch {:04d} | Step {:04d}/{:04d} | Loss {:.4f} | Time {:.4f}".format(
                          i+1, idx+1, len(train_loader), train_loss_sum/(idx+1), time.time() - start))
                # print("Learning rate = {}".format(optimizer.state_dict()['param_groups'][0]['lr']))

        """验证模型"""
        model.eval()
        acc, valid_loss, valid_macro_f1 = evaluate(model, valid_loader, device)  # 验证模型的性能
        wandb.log({'valid_acc': acc, 'valid_loss': valid_loss})
        # 保存最优模型
        if valid_loss < best_loss:
            best_loss = valid_loss
            

        if valid_macro_f1 > best_macro_f1:
            best_macro_f1 = valid_macro_f1
            torch.save(model.state_dict(), 'pytorch_model.bin') 
            
        
        print("current macro_f1 is {:.4f}, best macro_f1 is {:.4f}".format(valid_macro_f1, best_macro_f1))
        print("time costed = {}s \n".format(round(time.time() - start, 5)))

# 训练和验证模型

In [18]:
# 训练和验证评估
train_and_eval(model, train_loader, valid_loader, optimizer, scheduler, DEVICE, EPOCHS)


***** Running training epoch 1 *****
Epoch 0001 | Step 0062/1249 | Loss 1.8102 | Time 7.5361
Epoch 0001 | Step 0124/1249 | Loss 1.6821 | Time 14.2847
Epoch 0001 | Step 0186/1249 | Loss 1.6213 | Time 21.0937
Epoch 0001 | Step 0248/1249 | Loss 1.5718 | Time 27.9596
Epoch 0001 | Step 0310/1249 | Loss 1.5353 | Time 34.9044
Epoch 0001 | Step 0372/1249 | Loss 1.5057 | Time 42.2575
Epoch 0001 | Step 0434/1249 | Loss 1.4843 | Time 49.5967
Epoch 0001 | Step 0496/1249 | Loss 1.4614 | Time 56.9776
Epoch 0001 | Step 0558/1249 | Loss 1.4457 | Time 64.0088
Epoch 0001 | Step 0620/1249 | Loss 1.4246 | Time 70.9422
Epoch 0001 | Step 0682/1249 | Loss 1.4057 | Time 77.8246
Epoch 0001 | Step 0744/1249 | Loss 1.3946 | Time 84.7977
Epoch 0001 | Step 0806/1249 | Loss 1.3891 | Time 91.7821
Epoch 0001 | Step 0868/1249 | Loss 1.3774 | Time 98.6883
Epoch 0001 | Step 0930/1249 | Loss 1.3671 | Time 105.5869
Epoch 0001 | Step 0992/1249 | Loss 1.3609 | Time 112.5378
Epoch 0001 | Step 1054/1249 | Loss 1.3455 | Time 1

# 加载最优模型测试

In [19]:
# 加载最优权重对测试集测试
model.load_state_dict(torch.load("pytorch_model.bin"))
pred_test = evaluate(model, test_loader, DEVICE)
# print("\n Test Accuracy = {} \n".format(accuracy_score(test_label, pred_test)))
# print(classification_report(test_label, pred_test, digits=4))
wandb.finish()

              precision    recall  f1-score   support

           0     0.7559    0.8185    0.7859      1256
           1     0.5151    0.5480    0.5310       281
           2     0.2800    0.2800    0.2800        50
           3     0.4400    0.3173    0.3687       208
           4     0.6053    0.6219    0.6135       402
           5     0.2917    0.2059    0.2414        68
           6     0.5310    0.4464    0.4850       345

    accuracy                         0.6437      2610
   macro avg     0.4884    0.4626    0.4722      2610
weighted avg     0.6307    0.6437    0.6350      2610




VBox(children=(Label(value='0.001 MB of 0.001 MB uploaded (0.000 MB deduped)\r'), FloatProgress(value=1.0, max…

0,1
Epoch,▁▁▁▁▁▁▁▂▂▂▂▂▂▂▄▄▄▄▄▄▅▅▅▅▅▅▅▇▇▇▇▇▇███████
lr,▂▆███████▇▇▇▇▇▆▆▆▆▅▅▅▄▄▄▃▃▃▃▂▂▂▂▂▁▁▁▁▁▁▁
train_loss,▇▃▇▁▂▃▄▂▂▅▅█▆▃▂▁▂▅▁▂▃▂▆▂▂▃▂▁▂▂▂▁▁▃▂▁▃▂▁▂
valid_acc,▂▁█▅█▆
valid_loss,█▆▃▃▁▁

0,1
Epoch,6.0
lr,0.0
train_loss,0.84557
valid_acc,0.61497
valid_loss,1.11149


In [20]:
# output_dir = 'output/ch-roberta-dorwardUt'
# os.makedirs(output_dir, exist_ok=True)
# torch.save(model.state_dict(), output_dir+"/pytorch_model.bin")
# # torch.save(model, "pytorch_model_whole.bin")

In [21]:
torch.cuda.empty_cache()
torch.cuda.empty_cache()
torch.cuda.empty_cache()