In [None]:
!pip install transformers

!nvidia-smi

In [None]:
import numpy as np
import torch
from tqdm import tqdm
from torch.utils.data import DataLoader,Dataset
import torch.nn as nn

import pandas as pd
from transformers import  BertTokenizer,BertModel,RobertaTokenizer
from sklearn.metrics import accuracy_score, roc_auc_score,f1_score
from  sklearn.model_selection import train_test_split
import os
import random
import glob
import torch
import re
# choose the attention mode 'n2', 'tvm' or 'sliding_chunks'
# 'n2': for regular n2 attention
# 'tvm': a custom CUDA kernel implementation of our sliding window attention
# 'sliding_chunks': a PyTorch implementation of our sliding window attention

In [None]:
def qingli(s):
    """Remove digits, whitespace, punctuation and asterisks from every entry of a Series.

    ("qingli" is pinyin for "clean up".)

    Args:
        s: pandas Series. Every element is passed through ``str()`` before cleaning,
            so non-string values are accepted (a NaN therefore becomes ``"nan"``).

    Returns:
        A new Series with the same index as ``s`` holding the cleaned strings.
    """
    # Full-width / CJK punctuation to strip; interpolated into a regex character class below.
    punctuation = """，！？｡＂#＄％＆＇（）＊＋－／：；＜＝＞＠［＼］＾＿｀｛｜｝～｟｠｢｣､、〃》「」『』【】〔〕〖〗〘〙〚〛〜〝〞〟〰〾〿–—‘'‛“”„‟…‧﹏"""
    # A single pass replaces the previous four (digits -> '*', whitespace -> '*',
    # punctuation -> '*', then drop every '*'); the result is the same because each
    # of those character classes ends up deleted. Raw strings avoid the
    # invalid-escape warnings that '\s' and '\*' trigger in plain literals.
    junk = re.compile(r"[0-9*]|\s|[{}]".format(punctuation))
    return s.apply(lambda x: junk.sub('', str(x)))

In [None]:
class LabelSmoothingCrossEntropy(nn.Module):
    """Label-smoothing loss whose hard-target term is a binary focal loss.

    The returned value is
    ``eps / K * (-sum_k log_softmax(output)_k) + (1 - eps) * focal(output, target)``
    where ``K`` is the size of the last dimension of ``output``.

    Args:
        eps: weight of the uniform (smoothing) term.
        reduction: ``'mean'`` or ``'sum'`` reduce the smoothing term to a scalar; any
            other value leaves it per-sample. The focal term is always averaged.
        gamma: focusing exponent of the focal term.
        alpha: weight of the positive class (column 1) in the focal term; the
            negative class gets ``1 - alpha``.
        size_average: stored but not used by this class.
    """

    def __init__(self, eps=0.4, reduction='mean',gamma=2,alpha=0.7,size_average=True):
        super(LabelSmoothingCrossEntropy, self).__init__()
        self.eps = eps
        self.reduction = reduction
        self.gamma = gamma
        self.alpha = alpha
        self.size_average = size_average

    def forward(self, output, target):
        """Return the combined smoothing + focal loss for ``output`` logits and ``target``."""
        num_classes = output.size()[-1]
        # torch.log_softmax is used instead of F.log_softmax so this cell does not
        # depend on an import that only happens in a later notebook cell.
        log_preds = torch.log_softmax(output, dim=-1)
        if self.reduction == 'sum':
            smooth = -log_preds.sum()
        else:
            smooth = -log_preds.sum(dim=-1)
            if self.reduction == 'mean':
                smooth = smooth.mean()
        return smooth * self.eps / num_classes + (1 - self.eps) * self.Focalloss(output, target)

    def Focalloss(self, input, target):
        """Binary focal loss on column 1 of ``input``, averaged over the batch.

        Args:
            input: logits indexed as ``input[:, 1]`` for the positive class
                (the original comment describes the shape as M x 2).
            target: per-sample 0/1 labels (described as shape M).
        """
        # Work in log-space: log(softmax(x)) computed as torch.log(p) underflows to
        # -inf for saturated logits, and 0 * -inf then poisons the batch with NaN.
        log_probs = torch.log_softmax(input, dim=1)
        log_p = log_probs[:, 1]
        # log(1 - p) == log of the total probability of every other column.
        other_cols = torch.cat((log_probs[:, :1], log_probs[:, 2:]), dim=1)
        log_not_p = torch.logsumexp(other_cols, dim=1)
        p = log_p.exp()
        target = target.to(log_p.dtype)
        loss = -self.alpha * (1 - p) ** self.gamma * (target * log_p) - \
               (1 - self.alpha) * p ** self.gamma * ((1 - target) * log_not_p)
        return loss.mean()


In [None]:
import math
import torch
from torch.optim.optimizer import Optimizer, required

# from tensorboardX import SummaryWriter
# writer = SummaryWriter(logdir='/cps/gadam/n_cifa/')
# iter_idx = 0

# from ipdb import set_trace
import torch.optim

class RAdam(Optimizer):
    """Rectified Adam optimizer.

    Keeps Adam-style first/second moment estimates. While the approximated
    variance-rectification length ``N_sma`` is below 5 the update falls back to a
    bias-corrected momentum step; afterwards the adaptive step is scaled by the
    rectification ratio. Weight decay is applied directly to the parameters
    (``p -= lr * weight_decay * p``) rather than added to the gradient.

    Args:
        params: iterable of parameters or parameter-group dicts.
        lr: learning rate.
        betas: decay rates for the first and second moment estimates.
        eps: term added to the denominator for numerical stability.
        weight_decay: decoupled weight-decay coefficient.
    """

    def __init__(self, params, lr=1e-3, betas=(0.9, 0.999), eps=1e-8,
                 weight_decay=0):
        defaults = dict(lr=lr, betas=betas, eps=eps,
                        weight_decay=weight_decay)

        super(RAdam, self).__init__(params, defaults)

    def __setstate__(self, state):
        super(RAdam, self).__setstate__(state)

    def step(self, closure=None):
        """Perform one optimization step.

        Args:
            closure: optional callable that re-evaluates the model and returns the loss.

        Returns:
            The value returned by ``closure``, or ``None`` when no closure is given.

        Raises:
            RuntimeError: if a parameter has a sparse gradient.
        """
        loss = None
        if closure is not None:
            loss = closure()

        for group in self.param_groups:
            beta1, beta2 = group['betas']
            N_sma_max = 2 / (1 - beta2) - 1

            for p in group['params']:
                if p.grad is None:
                    continue
                grad = p.grad.data.float()
                if grad.is_sparse:
                    raise RuntimeError('Adam does not support sparse gradients, please consider SparseAdam instead')

                # All arithmetic is done on an fp32 view/copy and written back at the end.
                p_data_fp32 = p.data.float()

                state = self.state[p]

                if len(state) == 0:
                    state['step'] = 0
                    state['exp_avg'] = torch.zeros_like(p_data_fp32)
                    state['exp_avg_sq'] = torch.zeros_like(p_data_fp32)
                else:
                    state['exp_avg'] = state['exp_avg'].type_as(p_data_fp32)
                    state['exp_avg_sq'] = state['exp_avg_sq'].type_as(p_data_fp32)

                exp_avg, exp_avg_sq = state['exp_avg'], state['exp_avg_sq']

                # Keyword forms replace the deprecated add_(scalar, tensor) overloads.
                exp_avg_sq.mul_(beta2).addcmul_(grad, grad, value=1 - beta2)
                exp_avg.mul_(beta1).add_(grad, alpha=1 - beta1)

                state['step'] += 1
                step = state['step']

                # Rectification terms are derived from THIS parameter's step count and
                # THIS group's betas. Previously they were computed once from the first
                # parameter visited and reused for everything else, which is wrong as
                # soon as a parameter skipped a step (grad was None) or groups use
                # different betas.
                beta2_t = beta2 ** step
                N_sma = N_sma_max - 2 * step * beta2_t / (1 - beta2_t)
                beta1_t = 1 - beta1 ** step

                if group['weight_decay'] != 0:
                    p_data_fp32.add_(p_data_fp32, alpha=-group['weight_decay'] * group['lr'])

                # more conservative since it's an approximated value
                if N_sma >= 5:
                    ratio = math.sqrt((1 - beta2_t) * (N_sma - 4) / (N_sma_max - 4) * (N_sma - 2) / N_sma * N_sma_max / (N_sma_max - 2)) / beta1_t
                    step_size = group['lr'] * ratio
                    denom = exp_avg_sq.sqrt().add_(group['eps'])
                    p_data_fp32.addcdiv_(exp_avg, denom, value=-step_size)
                else:
                    step_size = group['lr'] / beta1_t
                    p_data_fp32.add_(exp_avg, alpha=-step_size)

                p.data.copy_(p_data_fp32)

        return loss

In [None]:
from collections import defaultdict
from torch.optim import Optimizer
import torch
 
class Lookahead(Optimizer):
    """Lookahead wrapper around another optimizer.

    The wrapped ("fast") optimizer updates the parameters on every step. On the
    first step and then every ``k`` steps, a separate copy of "slow" weights is
    moved a fraction ``alpha`` toward the fast weights and the fast weights are
    reset to that slow copy.

    Args:
        optimizer: the inner optimizer whose ``param_groups`` and state are shared.
        k: number of fast steps between slow-weight synchronisations.
        alpha: interpolation factor for the slow weights.
    """

    def __init__(self, optimizer, k=5, alpha=0.5):
        # NOTE: Optimizer.__init__ is intentionally not called (the param groups are
        # shared with the inner optimizer), so every attribute the base class would
        # normally create has to be provided here.
        self.optimizer = optimizer

        self.k = k
        self.alpha = alpha
        self.param_groups = self.optimizer.param_groups
        # Base-class helpers such as zero_grad() read `defaults` on recent PyTorch.
        self.defaults = self.optimizer.defaults
        self.state = defaultdict(dict)
        self.fast_state = self.optimizer.state
        for group in self.param_groups:
            group["counter"] = 0

    def update(self, group):
        """Pull the slow weights of ``group`` toward the fast ones, then reset the fast ones."""
        for fast in group["params"]:
            param_state = self.state[fast]
            if "slow_param" not in param_state:
                # First synchronisation: start the slow copy at the current fast weights.
                param_state["slow_param"] = torch.zeros_like(fast.data)
                param_state["slow_param"].copy_(fast.data)
            slow = param_state["slow_param"]
            slow += (fast.data - slow) * self.alpha
            fast.data.copy_(slow)

    def update_lookahead(self):
        """Synchronise slow and fast weights for every parameter group right now."""
        for group in self.param_groups:
            self.update(group)

    def step(self, closure=None):
        """Run one inner-optimizer step and synchronise when a group's counter is 0.

        Returns:
            Whatever the inner optimizer's ``step`` returns.
        """
        loss = self.optimizer.step(closure)
        for group in self.param_groups:
            if group["counter"] == 0:
                self.update(group)
            group["counter"] += 1
            if group["counter"] >= self.k:
                group["counter"] = 0
        return loss

    def zero_grad(self, *args, **kwargs):
        """Clear gradients via the inner optimizer.

        The inherited implementation relies on attributes created by
        ``Optimizer.__init__``, which this wrapper never runs; delegating keeps
        ``zero_grad`` working regardless of the PyTorch version.
        """
        return self.optimizer.zero_grad(*args, **kwargs)

    def state_dict(self):
        """Return a dict with the inner ("fast") state, the slow state and the param groups."""
        fast_state_dict = self.optimizer.state_dict()
        # Slow state is keyed by id() of the parameter tensor.
        slow_state = {
            (id(k) if isinstance(k, torch.Tensor) else k): v
            for k, v in self.state.items()
        }
        fast_state = fast_state_dict["state"]
        param_groups = fast_state_dict["param_groups"]
        return {
            "fast_state": fast_state,
            "slow_state": slow_state,
            "param_groups": param_groups,
        }

    def load_state_dict(self, state_dict):
        """Restore slow and fast state from a dict produced by ``state_dict``."""
        # NOTE(review): the slow state goes through Optimizer.load_state_dict while the
        # keys written by state_dict() are id() values - confirm a save/load round-trip
        # actually restores the slow weights before relying on it.
        slow_state_dict = {
            "state": state_dict["slow_state"],
            "param_groups": state_dict["param_groups"],
        }
        fast_state_dict = {
            "state": state_dict["fast_state"],
            "param_groups": state_dict["param_groups"],
        }
        super(Lookahead, self).load_state_dict(slow_state_dict)
        self.optimizer.load_state_dict(fast_state_dict)
        self.fast_state = self.optimizer.state

    def add_param_group(self, param_group):
        """Register a new parameter group with the inner optimizer, counter starting at 0."""
        param_group["counter"] = 0
        self.optimizer.add_param_group(param_group)

In [None]:
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.autograd import Variable

class FocalLoss(nn.Module):
    """Binary focal loss computed on column 1 of the logits.

    Args:
        gamma: focusing exponent; larger values down-weight well-classified samples.
        alpha: weight of the positive class; the negative class gets ``1 - alpha``.
    """

    def __init__(self, gamma=2, alpha=0.7):
        super(FocalLoss, self).__init__()
        self.gamma = gamma
        self.alpha = alpha

    def forward(self, input, target):
        """Return the batch-mean focal loss.

        Args:
            input: logits indexed as ``input[:, 1]`` for the positive class
                (the original comment describes the shape as M x 2).
            target: per-sample 0/1 labels (described as shape M).
        """
        # Stay in log-space: torch.log(softmax(x)) hits log(0) = -inf for saturated
        # logits, and the masked product 0 * -inf then evaluates to NaN.
        log_probs = torch.log_softmax(input, dim=1)
        log_p = log_probs[:, 1]
        # log(1 - p) == log of the total probability of every other column.
        other_cols = torch.cat((log_probs[:, :1], log_probs[:, 2:]), dim=1)
        log_not_p = torch.logsumexp(other_cols, dim=1)
        p = log_p.exp()
        target = target.to(log_p.dtype)
        loss = -self.alpha * (1 - p) ** self.gamma * (target * log_p) - \
               (1 - self.alpha) * p ** self.gamma * ((1 - target) * log_not_p)
        return loss.mean()

In [None]:
class FGM():
    """Fast Gradient Method helper for adversarial training on embedding weights.

    Intended call order within one training step: run ``backward()`` on the clean
    loss, call ``attack()``, run a second forward/backward on the perturbed model,
    then call ``restore()`` before the optimizer step.

    Args:
        model: module whose ``named_parameters()`` are scanned for embedding weights.
    """

    def __init__(self, model):
        self.model = model
        # Parameter name -> copy of its data taken just before the perturbation.
        self.backup = {}

    def attack(self, epsilon=0.3, emb_name='embedding.'):
        """Shift matching parameters by ``epsilon * grad / ||grad||``.

        Args:
            epsilon: L2 norm of the perturbation added to each matching parameter.
            emb_name: substring that selects parameters by name; must be set to the
                embedding parameter name used by the actual model.
        """
        for name, param in self.model.named_parameters():
            if param.requires_grad and emb_name in name:
                self.backup[name] = param.data.clone()
                # No gradient yet (attack called before backward) - nothing to follow.
                if param.grad is None:
                    continue
                norm = torch.norm(param.grad)
                # Skip zero or NaN norms so the weights are never filled with NaN/inf.
                if norm != 0 and not torch.isnan(norm):
                    r_at = epsilon * param.grad / norm
                    param.data.add_(r_at)

    def restore(self, emb_name='embedding.'):
        """Put back the values saved by ``attack`` and clear the backup.

        Args:
            emb_name: same name filter that was passed to ``attack``.
        """
        for name, param in self.model.named_parameters():
            if param.requires_grad and emb_name in name:
                assert name in self.backup
                param.data = self.backup[name]
        self.backup = {}

In [None]:
!pip install jieba
import jieba
from transformers import BasicTokenizer, BertTokenizer


class CustomBasicTokenizer(BasicTokenizer):
    """BasicTokenizer variant that keeps whole words found in the vocabulary.

    Args:
        vocab: container queried with ``word in vocab`` to decide whether a
            jieba-segmented word is kept intact.
        do_lower_case, never_split, tokenize_chinese_chars, strip_accents:
            forwarded unchanged to ``BasicTokenizer``.
    """

    def __init__(self,
                 vocab,
                 do_lower_case=True,
                 never_split=None,
                 tokenize_chinese_chars=True,
                 strip_accents=None):
        base_options = dict(do_lower_case=do_lower_case,
                            never_split=never_split,
                            tokenize_chinese_chars=tokenize_chinese_chars,
                            strip_accents=strip_accents)
        super().__init__(**base_options)

        self.vocab = vocab

    def _tokenize_chinese_chars(self, text):
        """Surround in-vocabulary words and individual Chinese characters with spaces.

        1. Pre-segment ``text`` with jieba into words w1, w2, ..., wl.
        2. A word that is in the vocabulary is kept whole; any other word is
           handled character by character the way BERT's own tokenizer does
           (Chinese characters get padded with spaces, everything else is
           copied through).
        3. The pieces are concatenated in order to form the result.
        """
        pieces = []
        for word in jieba.cut(text, HMM=False):
            if word in self.vocab:
                pieces.extend((" ", word, " "))
                continue
            for ch in word:
                if self._is_chinese_char(ord(ch)):
                    pieces.extend((" ", ch, " "))
                else:
                    pieces.append(ch)
        return "".join(pieces)


class WoBertTokenizer(BertTokenizer):
    """BertTokenizer whose basic tokenizer is replaced by ``CustomBasicTokenizer``.

    All constructor arguments are forwarded to ``BertTokenizer``. When basic
    tokenization is enabled, the basic tokenizer is then swapped for a
    ``CustomBasicTokenizer`` built with this tokenizer's vocabulary.
    """

    def __init__(self,
                 vocab_file,
                 do_lower_case=True,
                 do_basic_tokenize=True,
                 never_split=None,
                 unk_token="[UNK]",
                 sep_token="[SEP]",
                 pad_token="[PAD]",
                 cls_token="[CLS]",
                 mask_token="[MASK]",
                 tokenize_chinese_chars=True,
                 strip_accents=None,
                 **kwargs):
        # Options shared by the parent constructor and the custom basic tokenizer.
        basic_options = dict(do_lower_case=do_lower_case,
                             never_split=never_split,
                             tokenize_chinese_chars=tokenize_chinese_chars,
                             strip_accents=strip_accents)
        special_tokens = dict(unk_token=unk_token,
                              sep_token=sep_token,
                              pad_token=pad_token,
                              cls_token=cls_token,
                              mask_token=mask_token)
        super().__init__(vocab_file,
                         do_basic_tokenize=do_basic_tokenize,
                         **basic_options,
                         **special_tokens,
                         **kwargs)
        if not self.do_basic_tokenize:
            return
        self.basic_tokenizer = CustomBasicTokenizer(vocab=self.vocab, **basic_options)

In [None]:

# ————————————————确定种子——————————————————
def seed_torch(seed):
    """Seed Python, NumPy and PyTorch RNGs and switch cuDNN to a reproducible setup.

    Args:
        seed: integer seed applied to every random number generator touched here.
    """
    os.environ['PYTHONHASHSEED'] = str(seed)

    # Host-side generators.
    random.seed(seed)
    np.random.seed(seed)

    # PyTorch generators: CPU, current GPU, and every GPU (multi-GPU setups).
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)

    # cuDNN: no autotuning, deterministic kernels, and finally disabled altogether.
    cudnn = torch.backends.cudnn
    cudnn.benchmark = False
    cudnn.deterministic = True
    cudnn.enabled = False


seed_torch(42)

# ——————————————————1. Load the model——————————————————————
# Model name or path handed to BertModel.from_pretrained and BertTokenizer.from_pretrained below.
pretrain_model_path ="hfl/chinese-bert-wwm"#hfl/chinese-bert-wwm
# tokenizer = BertTokenizer.from_pretrained('bert-base-chinese')


class Model(nn.Module):
    """BERT encoder followed by a small MLP head producing 2-class logits.

    Args:
        pretrain_model_path: name or path passed to ``BertModel.from_pretrained``.
        hidden_size: width of the encoder's hidden states; sizes the head layers.

    Note:
        ``avgpool``, ``linear1``, ``embed`` and ``cls`` are created but not used by
        ``forward``. They are kept, in the same construction order, so the module's
        parameter set and random initialisation stay as they were.
    """

    def __init__(self, pretrain_model_path, hidden_size:int=768):
        super(Model, self).__init__()
        self.pretrain_model_path = pretrain_model_path
        self.bert = BertModel.from_pretrained(pretrain_model_path)
        self.avgpool = nn.AvgPool1d(512)
        # Every BERT parameter is trainable (nothing is frozen).
        for param in self.bert.parameters():
            param.requires_grad = True
        self.embed_size  = hidden_size
        self.linear1 = nn.Linear(2 * hidden_size, hidden_size)
        # Layer widths follow hidden_size instead of a hard-coded 768.
        self.embed = nn.Embedding(3, hidden_size)
        self.cls = nn.Linear(self.embed_size, 2)
        self.linear_relu_stack = nn.Sequential(
            nn.Linear(hidden_size, 512),
            nn.ReLU(),
            nn.Linear(512, 2),
        )

    def forward(self, input_ids, token_type_ids, attention_mask,cond):
        """Return logits computed from the first-token hidden state.

        Args:
            input_ids, token_type_ids, attention_mask: tensors forwarded to BERT.
            cond: accepted for interface compatibility; it does not influence the
                returned logits.

        Returns:
            Output of the MLP head applied to ``last_hidden_state[:, 0]``.
        """
        # Only last_hidden_state is consumed, so per-layer hidden states and
        # attention maps are no longer requested - they were materialised for every
        # batch and then discarded.
        output = self.bert(input_ids=input_ids,
                           token_type_ids=token_type_ids,
                           attention_mask=attention_mask)
        sequence_out = output['last_hidden_state']
        logits = self.linear_relu_stack(sequence_out[:,0])
        return  logits



# 2. 建立Dataset
class SohuDataset(Dataset):
    """Dataset that tokenizes a pair of text columns from a DataFrame row.

    Columns are read by position: 0 and 1 are the two texts, 2 is the label
    (read only when ``is_test`` is false) and 3 is the ``cond`` value.

    Args:
        corpus: DataFrame with one sample per row.
        pretrain_model_path: name or path passed to ``BertTokenizer.from_pretrained``.
        max_length: length every encoded pair is truncated/padded to.
        is_test: when true, samples carry no ``'label'`` entry.
    """

    _ENCODED_KEYS = ('input_ids', 'token_type_ids', 'attention_mask')

    def __init__(self,corpus,pretrain_model_path,max_length,is_test:bool):
        self.is_test = is_test
        self.max_length = max_length
        self.corpus = corpus
        self.tokenizer = BertTokenizer.from_pretrained(pretrain_model_path)

    def __len__(self):
        return len(self.corpus)

    def __getitem__(self, idx):
        """Return a dict of NumPy arrays for row ``idx``."""
        row = self.corpus.iloc[idx].values
        encoded = self.tokenizer.encode_plus(row[0], row[1],
                                             truncation=True,
                                             max_length=self.max_length,
                                             padding='max_length')
        sample = {key: np.array(encoded[key]) for key in self._ENCODED_KEYS}
        if not self.is_test:
            sample['label'] = np.array(row[2])
        sample['cond'] = np.array([row[3]])
        return sample




# def load_data(data):
#     df = pd.DataFrame()
#     for i in glob.glob('../input/sohudataset/*_{}'.format(data)):#sorted(glob.glob('*_{}'.format(data)),key=os.path.getsize)---->按照文件大小排序
#         print(i)
#         df = pd.concat([df,pd.read_csv(i)],axis=0,ignore_index=True)
#     return df
# Training data: clean both text columns, then hold out 10% for validation.
data = pd.read_csv('../input/sohudata-b/train_b.csv')
for _col in ('text_a', 'text_b'):
    data[_col] = qingli(data[_col])
train, valid = train_test_split(data, shuffle=True, random_state=42, test_size=0.1)

# Test data gets the same cleaning.
test = pd.read_csv('../input/sohudata-b/test_b.csv')
for _col in ('text_a', 'text_b'):
    test[_col] = qingli(test[_col])

# Every text pair is truncated/padded to this many tokens.
seq_len = 512
train_data = SohuDataset(train, pretrain_model_path, seq_len, is_test=False)
valid_data = SohuDataset(valid, pretrain_model_path, seq_len, is_test=False)
test_data = SohuDataset(test, pretrain_model_path, seq_len, is_test=True)

# Only the training loader shuffles.
BATCH_SIZE = 8
train_loadar = DataLoader(dataset=train_data, batch_size=BATCH_SIZE, shuffle=True)
valid_loader = DataLoader(valid_data, batch_size=BATCH_SIZE)
test_loader = DataLoader(test_data, batch_size=BATCH_SIZE)
threshold = 0.15
# 3. 训练板块
#——————————————————3.1 评估函数(验证集)————————————————————————————
def evaluate(valid_loader):
    """Score the global ``model`` on a labelled loader.

    Args:
        valid_loader: iterable of batches with ``'input_ids'``, ``'token_type_ids'``,
            ``'attention_mask'``, ``'label'`` and ``'cond'`` tensors.

    Returns:
        ``(f1, auc)``: F1 of the argmax predictions, and ROC-AUC of the class-1
        probabilities.

    Side effects:
        Leaves the global ``model`` in eval mode.
    """
    model.eval()
    labels_list = []
    pred = []
    positive_prob = []
    # Inference only: without no_grad() every batch keeps its autograd graph alive.
    with torch.no_grad():
        for batch in tqdm(valid_loader):
            input_ids = batch['input_ids'].to(device).long()
            token_type_ids = batch['token_type_ids'].to(device).long()
            attention_mask = batch['attention_mask'].to(device).long()
            label = batch['label'].to(device).long()
            cond = batch['cond'].to(device).long()
            logits = model(input_ids, token_type_ids, attention_mask,cond)
            probs = nn.Softmax(dim=-1)(logits).cpu().numpy()
            labels_list.append(label.cpu().numpy())
            pred.append(probs.argmax(1))
            positive_prob.append(probs[:, 1])
    pred =np.concatenate(pred)
    positive_prob = np.concatenate(positive_prob)
    labels_list = np.concatenate(labels_list)
    # ROC-AUC is a ranking metric, so it is fed the positive-class probability;
    # scoring hard 0/1 predictions collapses the curve to a single operating point.
    return f1_score(labels_list,pred),roc_auc_score(labels_list, positive_prob)
# 3.2 ————————————预测函数(测试集)——————————————————————————
def predict(test_loader):
    """Load the saved checkpoint, label the test set and write ``sub_b.csv``.

    Args:
        test_loader: iterable of batches with ``'input_ids'``, ``'token_type_ids'``,
            ``'attention_mask'`` and ``'cond'`` tensors, in the same row order as
            the global ``test`` DataFrame.

    Side effects:
        Reads ``./best_acc.pkl`` and writes ``sub_b.csv``. The global ``test``
        frame is left untouched so the function can be called again.
    """
    # NOTE: torch.load unpickles arbitrary objects - only load checkpoints you created.
    # map_location lets a checkpoint saved on GPU be loaded on a CPU-only machine.
    model = torch.load('./best_acc.pkl', map_location=device)
    model.eval()
    pred = []
    # Inference only: skip autograd bookkeeping.
    with torch.no_grad():
        for batch in tqdm(test_loader):
            input_ids = batch['input_ids'].to(device).long()
            token_type_ids = batch['token_type_ids'].to(device).long()
            attention_mask = batch['attention_mask'].to(device).long()
            cond = batch['cond'].to(device).long()
            logits = model(input_ids, token_type_ids, attention_mask,cond)
            logits = nn.Softmax(dim=-1)(logits)
            pred.append(logits.cpu().numpy().argmax(1))
    pred =np.concatenate(pred)
    # Build the submission from a copy; dropping columns from `test` in place made
    # a second call fail because the columns were already gone.
    submission = test.drop(['text_a','text_b','cond'],axis=1)
    submission['label'] = pred
    submission.to_csv('sub_b.csv',index=False)


# ——————————————————3.3训练部分————————————————————————————————————
EPOCH =1
LR = 2e-5
device = 'cuda' if torch.cuda.is_available() else 'cpu'
model = Model(pretrain_model_path).to(device)
# model = torch.load('../input/sohubert/best_acc.pkl')
optim = RAdam(model.parameters(),lr=LR,weight_decay=0.1,betas=(0.9, 0.999))
optim = Lookahead(optim, k=5, alpha=0.5)
# The loss actually used for training, built once instead of on every step.
criterion = LabelSmoothingCrossEntropy()
# A checkpoint is only written when validation beats BOTH of these.
best_score = 0.5
f1_best=0.2
fgm = FGM(model)
# FGM selects parameters by name substring. Its default 'embedding.' matches no
# parameter of this model (BERT's token embeddings are named
# '...embeddings.word_embeddings.weight'), so the attack was silently a no-op.
FGM_EMB_NAME = 'word_embeddings'
checkpoint_saved = False
for epoch in range(EPOCH):
    # evaluate() leaves the model in eval mode; switch back so dropout is active again.
    model.train()
    pbar = tqdm(train_loadar)
    losses = []
    labels_list = []
    pred = []
    for batch in pbar:
        input_ids =  batch['input_ids'].to(device).long()
        token_type_ids =  batch['token_type_ids'].to(device).long()
        attention_mask =  batch['attention_mask'].to(device).long()
        label =  batch['label'].to(device).long()
        cond = batch['cond'].to(device).long()

        # 1) Clean pass: produces the gradients FGM needs for its perturbation.
        logits = model(input_ids, token_type_ids, attention_mask,cond)
        loss = criterion(logits.view(-1,2), label.view(-1))
        loss.backward()

        # 2) Adversarial pass: perturb the embeddings along the gradient, accumulate
        #    the gradient of the perturbed loss, then restore the original weights.
        fgm.attack(emb_name=FGM_EMB_NAME)
        adv_logits = model(input_ids, token_type_ids, attention_mask,cond)
        adv_loss = criterion(adv_logits.view(-1,2), label.view(-1))
        adv_loss.backward()
        fgm.restore(emb_name=FGM_EMB_NAME)

        optim.step()
        # Clearing through the model covers the same parameters and does not depend
        # on the Lookahead wrapper implementing zero_grad().
        model.zero_grad()

        losses.append(loss.cpu().detach().numpy())
        labels_list.append(label.cpu().detach().numpy())
        pred.append(logits.cpu().detach().numpy().argmax(1))
        seen_labels = np.concatenate(labels_list)
        seen_pred = np.concatenate(pred)
        # roc_auc_score raises ValueError while only one class has been seen, which
        # easily happens in the first few batches of 8 samples.
        if np.unique(seen_labels).size > 1:
            auc_score = roc_auc_score(seen_labels, seen_pred)
        else:
            auc_score = float('nan')
        f1 = f1_score(seen_labels, seen_pred)
        # The value shown under "Accuracy" is the running ROC-AUC.
        pbar.set_description(f'Epoch:{epoch+1}   Loss:{np.mean(losses):.4f}   Accuracy:{auc_score:.4f}  F1_Score:{f1:.4f}')
    print('*='*50)
    f1,acc_score = evaluate(valid_loader)
    if best_score<acc_score and f1_best<f1:
        best_score = acc_score
        f1_best=f1
        torch.save(model, 'best_acc.pkl')
        checkpoint_saved = True
    print('最好验证集准确率为:{} 这次的准确率为{}  F1为{} 最好F1为{}'.format(best_score,acc_score,f1,f1_best))
    print('*=' * 50)

# predict() loads best_acc.pkl; make sure this run produced one even when no epoch
# cleared both thresholds (otherwise it would crash or pick up a stale file).
if not checkpoint_saved:
    print('No epoch beat the validation thresholds; saving the final model as best_acc.pkl')
    torch.save(model, 'best_acc.pkl')

predict(test_loader)
# 4. 保存模型