In [1]:
import os
import sys
import time
import math

import tensorboard_logger as tb_logger
import torch
from torchvision import transforms, datasets

from util import AverageMeter
# from util import adjust_learning_rate, warmup_learning_rate
# from util import set_optimizer, save_model
# from networks.resnet_big import SupConResNet

In [None]:
# build data loader

from transformers import AutoTokenizer, AutoModel
# added_token=['##char##']
# tokenizer = AutoTokenizer.from_pretrained("bert-base-chinese",additional_special_tokens=added_token)
tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")
def text2token(text,tokenizer,max_length=100):
    text2id = tokenizer(
        text, max_length=max_length, padding='max_length', truncation=True, return_tensors="pt"
    )
    input_ids=text2id["input_ids"].tolist()
    attention_mask=text2id["attention_mask"].tolist()
    return input_ids,attention_mask
def data2token(data_,tokenizer):
    text=[i for i in data_['title'].values]
    input_ids,attention_mask=text2token(text,tokenizer)
    data_['input_ids']=input_ids
    data_['attention_mask']=attention_mask
    return data_
from torch.utils.data import Dataset
class SentimentDataset(Dataset):
    def __init__(self,df):
        self.dataset = df
    def __len__(self):
        return len(self.dataset)
    def __getitem__(self, idx):
        text = self.dataset.loc[idx, "title"]
        label = self.dataset.loc[idx, "label"]
        input_ids = self.dataset.loc[idx, "input_ids"]
        attention_mask = self.dataset.loc[idx, "attention_mask"]
        sample = {"text": text, "label": label,"input_ids":input_ids,"attention_mask":attention_mask}
        # print(sample)
        return sample
    
import numpy as np
import pandas as pd
data_train=pd.read_csv("../data/data_train.csv")
data_val=pd.read_csv("../data/data_val.csv")

data_train=data2token(data_train,tokenizer)
data_val=data2token(data_val,tokenizer)

#按batch_size分
from torch.utils.data import DataLoader,TensorDataset
import numpy as np
import torch



In [4]:
batch_size=3
train_loader = DataLoader(
    SentimentDataset(data_train), 
    batch_size=batch_size, 
    shuffle=True, 
    num_workers=0
)
test_loader = DataLoader(
    SentimentDataset(data_val), 
    batch_size=batch_size, 
    shuffle=False, 
    num_workers=0
)

In [6]:
import torch
import torch.nn as nn
import math

class SupConLoss(nn.Module):
    """Supervised Contrastive Learning: https://arxiv.org/pdf/2004.11362.pdf.
    It also supports the unsupervised contrastive loss in SimCLR"""
    def __init__(self, temperature=0.07, contrast_mode='all',
                 base_temperature=0.07):
        super(SupConLoss, self).__init__()
        self.temperature = temperature
        self.contrast_mode = contrast_mode
        self.base_temperature = base_temperature

    def forward(self, features,labels=None, mask=None):
        """Compute loss for model. If both `labels` and `mask` are None,
        it degenerates to SimCLR unsupervised loss:
        https://arxiv.org/pdf/2002.05709.pdf

        Args:
            features: hidden vector of shape [bsz, n_views, ...].
            labels: ground truth of shape [bsz].
            mask: contrastive mask of shape [bsz, bsz], mask_{i,j}=1 if sample j
                has the same class as sample i. Can be asymmetric.
        Returns:
            A loss scalar.
        """
        device=device0

        batch_size = features.shape[0]
        
        labels = labels.contiguous().view(-1, 1)
        if labels.shape[0] != batch_size:
            raise ValueError('Num of labels does not match num of features')
        mask = torch.eq(labels, labels.T).float().to(device)
        
        
        mask_contrast=1-mask
#         print('mask_contrast\n',mask_contrast)
        
        diag = torch.diag(mask)
        a_diag = torch.diag_embed(diag)
        mask_same= mask - a_diag
#         print('mask_same\n',mask_same)
        
        # compute logits
#         print(features.shape)
        logits = torch.div(torch.matmul(features, features.T),self.temperature)
#         print('logits\n',logits)
        
#         logits_max, _ =torch.max(logits, dim=1, keepdim=True)
#         logits = logits - logits_max.detach()
#         print('logits\n',logits)
        diagl = torch.diag(logits)
        logits=logits-diagl
#         print('logits\n',logits)

        
        exp_logits=torch.exp(logits)
#         print('exp_logits\n',exp_logits)
        
        b = torch.zeros(batch_size, batch_size).to(device)
        exp_logits=torch.where(exp_logits !=1, exp_logits, b)
#         print('exp_logits\n',exp_logits)

        
        # compute log_prob
        logits_contrast = exp_logits * mask_contrast
#         print('logits_contrast\n',logits_contrast)
        logits_same=exp_logits * mask_same
#         print('logits_same\n',logits_same)
        
        logits_contrast_sum=logits_contrast.sum(1, keepdim=True)
#         print('logits_contrast_sum\n',logits_contrast_sum)
        logits_same_sum=logits_same.sum(1, keepdim=True)
#         print('logits_same_sum\n',logits_same_sum)
        
#         print('torch.log(logits_same_sum)-torch.log(logits_contrast_sum)\n',torch.log(logits_same_sum)-torch.log(logits_contrast_sum))
        mask_same_sum=mask_same.sum(1,keepdim=True)
#         print('mask_same.sum(1,keepdim=True)\n',mask_same_sum)
        mean_log_prob_pos=(torch.log(logits_same_sum)-torch.log(logits_contrast_sum))/mask_same_sum
#         print('mean_log_prob_pos',mean_log_prob_pos)
        
        
        loss = - (self.temperature / self.base_temperature) * mean_log_prob_pos
        sum_=torch.tensor([0.0]).to(device)
        for i in loss:
            if not math.isinf(i) and not math.isnan(i):
                sum_+=i
#         print('sum_\n',sum_)
        if sum_==0:
            return sum_
        else:
            return torch.exp(sum_)
# criterion= SupConLoss(temperature=opt.temp)
# criterion.to(device0)

In [7]:
# # X = torch.tensor([[[0.2,0.2,0.6],[0.2,0.2,0.6]],[[0.1,0.1,0.8],[0.1,0.1,0.8]]])
# X = torch.tensor([[0.1,0.1,0.8],[0.2,0.2,0.6],[0.3,0.3,0.4],[0.4,0.4,0.2]])
# # X = torch.tensor([[0.1,0.1,0.8],[0.1,0.1,0.8],[0.1,0.1,0.8],[0.1,0.1,0.8]])
# y = torch.tensor([2,2,1,2])
# print(X.shape,X,y)

# criterion(X.to(device0),y.to(device0))

In [15]:
# build model and criterion
from transformers import AutoTokenizer, AutoModel
import torch.nn as nn
import torch.nn.functional as F
device0 = torch.device('cuda:7' if torch.cuda.is_available() else "cpu")#训练集gpu

class fn_cls(nn.Module):
    def __init__(self,device):
        super(fn_cls, self).__init__()
        self.model = AutoModel.from_pretrained("bert-base-uncased")
        self.model.resize_token_embeddings(len(tokenizer))##############
        self.model.to(device)
#         self.dropout = nn.Dropout(0.5)
        self.l1 = nn.Linear(768, 4)

    def forward(self, x, attention_mask=None):
        outputs = self.model(x, attention_mask=attention_mask)
#         print(outputs)
#         print(outputs[0])torch.Size([8, 100, 768])
#         print(outputs[1])torch.Size([8, 768])
#         print(outputs[0][:,0,:])torch.Size([8, 768])
        x = outputs[1]
#         x = self.dropout(x)
        x = self.l1(x)
        return x
    

model = fn_cls(device0)
criterion = SupConLoss()
model.to(device0)
criterion.to(device0)

# build optimizer
from torch import optim
optimizer = optim.Adam(model.parameters(), lr=1e-6)

# tensorboard
# logger = tb_logger.Logger(logdir=opt.tb_folder, flush_secs=2)

softmax = nn.Softmax(dim=1)
criterioncc = nn.CrossEntropyLoss().to(device0)

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.predictions.transform.dense.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [16]:
def train(train_loader, model, criterion, optimizer, epoch):
    """one epoch training"""
    model.train()
    label_all=[]
    batch_time = AverageMeter()
    data_time = AverageMeter()
    end = time.time()
    losses = AverageMeter()
    
    for idx, batch in enumerate(train_loader):
        print('___________________________',str(idx),'___________________________')
        data_time.update(time.time() - end)
        
        labels=batch['label'].to(device0)#batch size * 1
        label_all.append(labels.view(-1,1))
        input_ids=torch.stack(batch['input_ids']).t().to(device0)#batch size * 100
        attention_mask=torch.stack(batch['attention_mask']).t().to(device0)#batch size * 100

        bsz = labels.shape[0]

        # warm-up learning rate
#         warmup_learning_rate(opt, epoch, idx, len(train_loader), optimizer)

        # compute loss
        features = model(input_ids, attention_mask=attention_mask)
        print(features)
        losscc=criterioncc(features,labels)
        
        features=softmax(features)
        print(features)

        loss = criterion(features,labels)
        
#         loss_a=losscc+loss
        loss_a=losscc+loss/1000
        # update metric
        

        # SGD
        if loss_a!=0:
            print('losscc\n',losscc)
            print('loss\n',loss)
            losses.update(loss_a.item(), bsz)
            optimizer.zero_grad()
            loss_a.backward()
            optimizer.step()

        # measure elapsed time
        batch_time.update(time.time() - end)
        end = time.time()

        # print info
        if (idx + 1) % 100 == 0:
            print('Train: [{0}][{1}/{2}]\t'
                  'batch_time {batch_time.val:.3f} ({batch_time.avg:.3f})\t'
                  'data_time {data_time.val:.3f} ({data_time.avg:.3f})\t'
                  'loss {loss.val:.3f} ({loss.avg:.3f})'.format(
                   epoch, idx + 1, len(train_loader), batch_time=batch_time,
                   data_time=data_time, loss=losses))
            sys.stdout.flush()

    return losses.avg

In [17]:
# training routine
for epoch in range(10):
#     adjust_learning_rate(opt, optimizer, epoch)

    # train for one epoch
    time1 = time.time()
    loss = train(train_loader, model, criterion, optimizer, epoch)
    time2 = time.time()
    print('epoch {}, total time {:.2f}'.format(epoch, time2 - time1))
    
    

___________________________ 0 ___________________________
tensor([[-0.3796, -0.7810, -0.5871,  0.2546],
        [-0.2860, -0.7125, -0.5380,  0.3207],
        [ 0.1331, -0.3649,  0.1013, -0.5504]], device='cuda:7',
       grad_fn=<AddmmBackward0>)
tensor([[0.2290, 0.1533, 0.1861, 0.4317],
        [0.2345, 0.1531, 0.1823, 0.4302],
        [0.3245, 0.1972, 0.3144, 0.1638]], device='cuda:7',
       grad_fn=<SoftmaxBackward0>)
losscc
 tensor(1.1385, device='cuda:7', grad_fn=<NllLossBackward0>)
loss
 tensor([1.6525], device='cuda:7', grad_fn=<ExpBackward0>)
___________________________ 1 ___________________________
tensor([[nan, nan, nan, nan],
        [nan, nan, nan, nan],
        [nan, nan, nan, nan]], device='cuda:7', grad_fn=<AddmmBackward0>)
tensor([[nan, nan, nan, nan],
        [nan, nan, nan, nan],
        [nan, nan, nan, nan]], device='cuda:7', grad_fn=<SoftmaxBackward0>)
losscc
 tensor(nan, device='cuda:7', grad_fn=<NllLossBackward0>)
loss
 tensor([0.], device='cuda:7')
_____________

___________________________ 20 ___________________________
tensor([[nan, nan, nan, nan],
        [nan, nan, nan, nan],
        [nan, nan, nan, nan]], device='cuda:7', grad_fn=<AddmmBackward0>)
tensor([[nan, nan, nan, nan],
        [nan, nan, nan, nan],
        [nan, nan, nan, nan]], device='cuda:7', grad_fn=<SoftmaxBackward0>)
losscc
 tensor(nan, device='cuda:7', grad_fn=<NllLossBackward0>)
loss
 tensor([0.], device='cuda:7')
___________________________ 21 ___________________________
tensor([[nan, nan, nan, nan],
        [nan, nan, nan, nan],
        [nan, nan, nan, nan]], device='cuda:7', grad_fn=<AddmmBackward0>)
tensor([[nan, nan, nan, nan],
        [nan, nan, nan, nan],
        [nan, nan, nan, nan]], device='cuda:7', grad_fn=<SoftmaxBackward0>)
losscc
 tensor(nan, device='cuda:7', grad_fn=<NllLossBackward0>)
loss
 tensor([0.], device='cuda:7')
___________________________ 22 ___________________________
tensor([[nan, nan, nan, nan],
        [nan, nan, nan, nan],
        [nan, nan, na

tensor([[nan, nan, nan, nan],
        [nan, nan, nan, nan],
        [nan, nan, nan, nan]], device='cuda:7', grad_fn=<AddmmBackward0>)
tensor([[nan, nan, nan, nan],
        [nan, nan, nan, nan],
        [nan, nan, nan, nan]], device='cuda:7', grad_fn=<SoftmaxBackward0>)
losscc
 tensor(nan, device='cuda:7', grad_fn=<NllLossBackward0>)
loss
 tensor([0.], device='cuda:7')
___________________________ 42 ___________________________
tensor([[nan, nan, nan, nan],
        [nan, nan, nan, nan],
        [nan, nan, nan, nan]], device='cuda:7', grad_fn=<AddmmBackward0>)
tensor([[nan, nan, nan, nan],
        [nan, nan, nan, nan],
        [nan, nan, nan, nan]], device='cuda:7', grad_fn=<SoftmaxBackward0>)
losscc
 tensor(nan, device='cuda:7', grad_fn=<NllLossBackward0>)
loss
 tensor([0.], device='cuda:7')
___________________________ 43 ___________________________
tensor([[nan, nan, nan, nan],
        [nan, nan, nan, nan],
        [nan, nan, nan, nan]], device='cuda:7', grad_fn=<AddmmBackward0>)
tensor

KeyboardInterrupt: 

In [None]:
# save the last model
save_file = os.path.join(
    opt.save_folder, 'last.pth')
save_model(model, optimizer, opt, opt.epochs, save_file)