In [1]:
import pandas as pd
bad_case=pd.read_csv("../data/data_test1_bad.csv")

In [2]:
from transformers import AutoTokenizer, AutoModel
# added_token=['##char##']
# tokenizer = AutoTokenizer.from_pretrained("bert-base-chinese",additional_special_tokens=added_token)
tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")
def text2token(text,tokenizer,max_length=100):
    text2id = tokenizer(
        text, max_length=max_length, padding='max_length', truncation=True, return_tensors="pt"
    )
    input_ids=text2id["input_ids"].tolist()
    attention_mask=text2id["attention_mask"].tolist()
    return input_ids,attention_mask
def data2token(data_,tokenizer):
    text=[i for i in data_['title'].values]
    input_ids,attention_mask=text2token(text,tokenizer)
    data_['input_ids']=input_ids
    data_['attention_mask']=attention_mask
    return data_

In [3]:
from torch.utils.data import Dataset
class SentimentDataset(Dataset):
    def __init__(self,df):
        self.dataset = df
    def __len__(self):
        return len(self.dataset)
    def __getitem__(self, idx):
        text = self.dataset.loc[idx, "title"]
        label = self.dataset.loc[idx, "label"]
        input_ids = self.dataset.loc[idx, "input_ids"]
        attention_mask = self.dataset.loc[idx, "attention_mask"]
        sample = {"text": text, "label": label,"input_ids":input_ids,"attention_mask":attention_mask}
        # print(sample)
        return sample
    
#按batch_size分
from torch.utils.data import DataLoader,TensorDataset
import numpy as np
import torch

batch_size=16

In [4]:
bad_case=data2token(bad_case,tokenizer)
bad_loader = DataLoader(
    SentimentDataset(bad_case), 
    batch_size=batch_size, 
    shuffle=True, 
    num_workers=0
)

In [5]:
data_test1=pd.read_csv("../data/data_test1.csv")
data_test1=data2token(data_test1,tokenizer)
test1_loader = DataLoader(
    SentimentDataset(data_test1), 
    batch_size=batch_size, 
    shuffle=False, 
    num_workers=0
)
data_test2=pd.read_csv("../data/data_test2.csv")
data_test2=data2token(data_test2,tokenizer)
test2_loader = DataLoader(
    SentimentDataset(data_test2), 
    batch_size=batch_size, 
    shuffle=False, 
    num_workers=0
)

In [6]:
from tqdm import tqdm
from sklearn import metrics
def predict_loader(device,test_loader,cls):
    with torch.no_grad():
        cls.to(device)
        cls.eval()
        output_all=[]
        label_all=[]
        for batch_idx,batch in enumerate(test_loader):
            print(str(batch_idx)+'/'+str(len(test_loader)),end='\r')
            label=batch['label'].to(device)#batch size * 1
            label_all.append(label.view(-1,1))
            input_ids=torch.stack(batch['input_ids']).t().to(device)#batch size * 100
            attention_mask=torch.stack(batch['attention_mask']).t().to(device)#batch size * 100
            
            #计算输出
            output = cls(input_ids, attention_mask=attention_mask)#batch size * 1
            
            #四舍五入
            softmax = nn.Softmax(dim=1)
            output=softmax(output)
            output=output.argmax(dim=1)
            output_all.append(output)
        output_all=torch.cat(output_all,0)
        label_all=torch.cat(label_all,0)

        output_all=np.array(output_all.cpu())
        label_all=np.array(label_all.cpu())
        acc_score=metrics.accuracy_score(label_all,output_all)
        print("准确率:"+str(acc_score))
        print(metrics.classification_report(label_all,output_all))
        return label_all,output_all
        

In [7]:
from transformers import AutoTokenizer, AutoModel
import torch.nn as nn
import torch.nn.functional as F

class fn_cls(nn.Module):
    def __init__(self,device):
        super(fn_cls, self).__init__()
        self.model = AutoModel.from_pretrained("bert-base-uncased")
        self.model.resize_token_embeddings(len(tokenizer))##############
        self.model.to(device)
#         self.dropout = nn.Dropout(0.5)
        self.l1 = nn.Linear(768, leishu)

    def forward(self, x, attention_mask=None):
        outputs = self.model(x, attention_mask=attention_mask)
#         print(outputs[0])torch.Size([8, 100, 768])
#         print(outputs[1])torch.Size([8, 768])
#         print(outputs[0][:,0,:])torch.Size([8, 768])
        x = outputs[1]
#         x = self.dropout(x)
        x = self.l1(x)
        return x


import torch
device0 = torch.device('cuda:6' if torch.cuda.is_available() else "cpu")#训练集gpu
cls=torch.load("../data/cls_6_0.88785_266.58456.model",map_location=device0)


In [8]:
cls2=torch.load("../data/cls_6_0.88785_266.58456.model",map_location=device0)

In [9]:
import numpy as np
import torch
def asymmetricKL(P,Q):
#     print(P,Q)
    return sum(P * torch.log(P / Q)) #calculate the kl divergence between P and Q
 
def symmetricalKL(P,Q):
    return (asymmetricKL(P,Q)+asymmetricKL(Q,P))/2.00

print(symmetricalKL(torch.tensor([0.8,0.1,0.1]),torch.tensor([0.8,0.1,0.1])))
print(symmetricalKL(torch.tensor([0.8,0.1,0.1]),torch.tensor([0.7,0.15,0.15])))
print(symmetricalKL(torch.tensor([0.8,0.1,0.1]),torch.tensor([0.5,0.25,0.25])))

tensor(0.)
tensor(0.0269)
tensor(0.2079)


In [70]:
# model1对每个badcase预测，然后修正model2
def train_one_epoch(device_train,epoch_num):
    softmax = nn.Softmax(dim=1)
    cls.to(device_train)
    cls2.to(device_train)
    
    epoch_loss=0
    total=0
    correct=0
    output_all=[]
    label_all=[]
    for batch_idx,batch in enumerate(bad_loader):
#         print('___________batch'+str(batch_idx)+'___________')
        with torch.no_grad():
            label=batch['label'].to(device_train)#batch size * 1
            input_ids=torch.stack(batch['input_ids']).t().to(device_train)#batch size * 100
            attention_mask=torch.stack(batch['attention_mask']).t().to(device_train)#batch size * 100
            output = cls(input_ids, attention_mask=attention_mask)#batch size * 1
            output=softmax(output)
            for i in range(len(output)):
                pre_=output[i].argmax(dim=0)
                output[i][pre_]=output[i][pre_]*0.3
            output=softmax(output)
                
                
        output2=cls2(input_ids, attention_mask=attention_mask)
        output2=softmax(output2)

        
        loss=0
        for i in range(len(output2)):
#             print(output[i].tolist(),'\n',output2[i].tolist(),'\n','\n')
            loss = loss + symmetricalKL(output2[i],output[i])
            
        print(str(batch_idx)+'/'+str(len(bad_loader))+' batch_loss:'+str(loss.item()),end='\r')
        with torch.no_grad():
            epoch_loss+=loss.item()

        optimizer2.zero_grad() # 将所有参数的梯度都置零
        loss.backward()    # 误差反向传播计算参数梯度
        optimizer2.step()    # 通过梯度做一步参数更新
    return epoch_loss
        

In [74]:
# from torch import optim
# cls2.load_state_dict(cls.state_dict())
# optimizer2 = optim.Adam(cls2.parameters(), lr=1e-6)

epoch_loss=train_one_epoch(device0,0)
epoch_loss

178/179 batch_loss:0.11105266213417053

68.44869008660316

In [75]:
label_all,output_all=predict_loader(device0,test1_loader,cls2)

准确率:0.871512539184953
              precision    recall  f1-score   support

           0       0.90      0.88      0.89      6380
           1       0.88      0.95      0.92      6380
           2       0.83      0.83      0.83      6380
           3       0.87      0.82      0.85      6380

    accuracy                           0.87     25520
   macro avg       0.87      0.87      0.87     25520
weighted avg       0.87      0.87      0.87     25520



In [76]:
label_all,output_all=predict_loader(device0,test2_loader,cls2)

准确率:0.8690047021943573
              precision    recall  f1-score   support

           0       0.90      0.87      0.88      6380
           1       0.88      0.95      0.91      6380
           2       0.84      0.82      0.83      6380
           3       0.86      0.83      0.85      6380

    accuracy                           0.87     25520
   macro avg       0.87      0.87      0.87     25520
weighted avg       0.87      0.87      0.87     25520



In [22]:
label_all,output_all=predict_loader(device0,test1_loader,cls)

准确率:0.8880485893416928
              precision    recall  f1-score   support

           0       0.91      0.88      0.90      6380
           1       0.92      0.95      0.93      6380
           2       0.85      0.85      0.85      6380
           3       0.87      0.87      0.87      6380

    accuracy                           0.89     25520
   macro avg       0.89      0.89      0.89     25520
weighted avg       0.89      0.89      0.89     25520



In [24]:
label_all,output_all=predict_loader(device0,test2_loader,cls)

准确率:0.8887931034482759
              precision    recall  f1-score   support

           0       0.92      0.88      0.90      6380
           1       0.92      0.95      0.93      6380
           2       0.86      0.84      0.85      6380
           3       0.86      0.88      0.87      6380

    accuracy                           0.89     25520
   macro avg       0.89      0.89      0.89     25520
weighted avg       0.89      0.89      0.89     25520



In [11]:
from torch import optim
cls2=torch.load("../data/cls_6_0.88785_266.58456.model",map_location=device0)
optimizer2 = optim.Adam(cls2.parameters(), lr=1e-4)

for i in range(5):
    print("____________________epoch:"+str(i)+"____________________")
    epoch_loss=train_one_epoch(device0,0)
    print(epoch_loss)
    label_all,output_all=predict_loader(device0,test1_loader,cls2)


____________________epoch:0____________________
nan/179 batch_loss:nan
准确率:0.255


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


              precision    recall  f1-score   support

           0       0.25      1.00      0.40      6380
           1       0.00      0.00      0.00      6380
           2       0.00      0.00      0.00      6380
           3       0.00      0.00      0.00      6380

    accuracy                           0.25     25520
   macro avg       0.06      0.25      0.10     25520
weighted avg       0.06      0.25      0.10     25520

____________________epoch:1____________________
37/179 batch_loss:nan

KeyboardInterrupt: 

In [9]:
def predict(device,s_l,cls):
    with torch.no_grad():
        cls.to(device)
        cls.eval()
        text2id = tokenizer(
            s_l, max_length=100, padding='max_length', truncation=True, return_tensors="pt"
        )
        input_ids=text2id["input_ids"].to(device)
        mask=text2id["attention_mask"].to(device)
        output = cls(input_ids, attention_mask=mask)
        softmax = nn.Softmax(dim=1)
        output1=softmax(output)
        output2=output.argmax(dim=1)
        return output1,output2

In [10]:
s=['Echoes Repeats Success','"Stocks Finish Lower, Retail Sector Weighs"','Report indicates Wannstedt out','Conference Members Back Iraqi Efforts']
print(predict(device0,s,cls))
print(predict(device0,s,cls2))

(tensor([[1.6244e-02, 9.7378e-01, 5.3007e-03, 4.6791e-03],
        [6.1406e-02, 1.4047e-03, 9.3121e-01, 5.9830e-03],
        [7.6313e-03, 9.8954e-01, 2.2822e-03, 5.4985e-04],
        [9.9728e-01, 8.1377e-04, 1.0767e-03, 8.2985e-04]], device='cuda:6'), tensor([1, 2, 1, 0], device='cuda:6'))
(tensor([[1.6244e-02, 9.7378e-01, 5.3007e-03, 4.6791e-03],
        [6.1406e-02, 1.4047e-03, 9.3121e-01, 5.9830e-03],
        [7.6313e-03, 9.8954e-01, 2.2822e-03, 5.4985e-04],
        [9.9728e-01, 8.1377e-04, 1.0767e-03, 8.2985e-04]], device='cuda:6'), tensor([1, 2, 1, 0], device='cuda:6'))


In [38]:
import time
end=time.time()
torch.save(cls2,"../data/cls2_bad_"+str(end)+".model")

In [None]:
# 0.02694982503663435
# 0.20794415416798367
