In [3]:
import os
# Restrict this process to GPUs 0-3. The variable is set before torch is
# imported so that it is already in place when CUDA gets initialised.
os.environ["CUDA_VISIBLE_DEVICES"] = '0,1,2,3'
import torch
# Sanity check: report how many CUDA devices torch can see.
print(torch.cuda.device_count())
import pandas as pd
import pickle
import numpy as np

from transformers import AdamW
from operator import itemgetter
from sklearn.model_selection import StratifiedKFold

from torch.utils.data import Dataset,DataLoader
import torch
import torch.nn as nn
import torch.nn.functional as F

# Device that receives the input batches and hosts the model, and the GPU
# ids later handed to nn.DataParallel.
device = 'cuda:0'
gpus = [0,1,2,3]

print(torch.cuda.device_count())

# Label vocabulary: `label2id` is indexed by label string in str2id_lst and
# `len(id2label)` gives the width of the multi-hot label matrix in collate_fn.
# NOTE: unpickling can execute arbitrary code - only load trusted files.
# The files are opened with context managers so the handles are closed.
with open('../temp_results/mini_label2id_dict.pkl', 'rb') as _fh:
    label2id = pickle.load(_fh)
with open('../temp_results/mini_id2label_lst.pkl', 'rb') as _fh:
    id2label = pickle.load(_fh)

train_data = pd.read_csv('../data/mini_train_data.csv')
test_data = pd.read_csv('../data/mini_test_data.csv')


from transformers import AutoModelForMaskedLM,AutoTokenizer,BertConfig
from tqdm import tqdm


# Pretrained checkpoint shared by the tokenizer, the config and the model.
model_path = "anferico/bert-for-patents"
tokenizer = AutoTokenizer.from_pretrained(model_path)
# Both dropout probabilities are supplied as overrides while the config is
# loaded. collate_fn feeds every sentence twice, so presumably dropout is what
# makes the two encodings of a sentence differ - TODO confirm.
Config = BertConfig.from_pretrained(
    model_path,
    attention_probs_dropout_prob=0.1,
    hidden_dropout_prob=0.1,
)
# Embedding strategy read by NeuralNetwork.forward: 'cls' takes position 0 of
# the last hidden layer, 'pooler' averages the last hidden layer over dim 1.
output_way = 'pooler'

class NeuralNetwork(nn.Module):
    """Sentence encoder on top of a pretrained masked-LM BERT checkpoint.

    The checkpoint is loaded with ``AutoModelForMaskedLM`` using the
    module-level ``Config``. Only the hidden states are used; the
    masked-LM logits are ignored.

    Args:
        model_path: checkpoint name or path passed to ``from_pretrained``.
        output_way: ``'cls'`` to return position 0 of the last hidden layer,
            ``'pooler'`` to return the mean of the last hidden layer over
            dim 1.
    """

    def __init__(self,model_path,output_way):
        super().__init__()
        self.bert = AutoModelForMaskedLM.from_pretrained(model_path,config=Config)
        self.output_way = output_way

    def forward(self, input_ids, attention_mask, token_type_ids):
        """Encode a batch and return one embedding per input row.

        Raises:
            ValueError: if ``self.output_way`` is neither ``'cls'`` nor
                ``'pooler'`` (previously this surfaced as an
                ``UnboundLocalError``).
        """
        encoded = self.bert(
            input_ids=input_ids,
            attention_mask=attention_mask,
            token_type_ids=token_type_ids,
            output_hidden_states=True,
        )
        # presumably (batch, seq_len, hidden) - TODO confirm
        last_hidden = encoded.hidden_states[-1]
        if self.output_way == 'cls':
            return last_hidden[:, 0]
        if self.output_way == 'pooler':
            # NOTE(review): this is a plain mean over every position, so
            # padded positions contribute as well; attention_mask is not
            # used for the pooling. Left unchanged to keep embeddings
            # identical to what existing checkpoints were trained with.
            return last_hidden.mean(dim=1)
        raise ValueError(
            f"unknown output_way {self.output_way!r}; expected 'cls' or 'pooler'"
        )
    
# Build the encoder, place it on the primary device, then replicate it over
# the GPUs listed in `gpus`; outputs are gathered on the first of them.
_encoder = NeuralNetwork(model_path, output_way)
_encoder = _encoder.to(device)
model = nn.DataParallel(_encoder, device_ids=gpus, output_device=gpus[0])
# AdamW over all parameters of the wrapped model.
optimizer = torch.optim.AdamW(params=model.parameters(), lr=1e-5)

def compute_loss(y_pred,lamda=0.05):
    """Contrastive cross-entropy loss over a batch of paired embeddings.

    Rows ``2k`` and ``2k + 1`` of ``y_pred`` are treated as a positive pair;
    every other row in the batch acts as a negative.

    Args:
        y_pred: 2-D tensor of embeddings with an even number of rows.
        lamda: temperature the cosine similarities are divided by.

    Returns:
        Scalar tensor with the mean cross-entropy over all rows.

    Raises:
        ValueError: if the number of rows is odd, since the last row would
            then have no partner.
    """
    batch_size = y_pred.shape[0]
    if batch_size % 2:
        raise ValueError(
            f"compute_loss expects an even number of rows, got {batch_size}"
        )
    # Build helper tensors on the same device as the embeddings instead of a
    # hard-coded GPU, so the loss also works on CPU or another device.
    device = y_pred.device
    idxs = torch.arange(batch_size, device=device)
    # Target of an even row is the next row, of an odd row the previous one.
    y_true = idxs + 1 - idxs % 2 * 2
    # Pairwise cosine-similarity matrix via broadcasting.
    similarities = F.cosine_similarity(y_pred.unsqueeze(1), y_pred.unsqueeze(0), dim=2)
    # Mask the diagonal so a row can never be matched with itself. -inf is
    # used rather than a huge finite constant so it is safe in half precision.
    self_mask = torch.eye(batch_size, dtype=torch.bool, device=device)
    similarities = similarities.masked_fill(self_mask, float('-inf'))
    # Divide by the temperature hyper-parameter (0.05 in the paper).
    similarities = similarities / lamda
    # cross_entropy already averages over the rows.
    return F.cross_entropy(similarities, y_true)

def str2id_lst(str_label):
    """Map a comma-separated label string to a list of label ids.

    Each comma-separated piece is looked up in the module-level ``label2id``;
    a piece that is missing from it raises ``KeyError``.
    """
    return [label2id[piece] for piece in str_label.split(',')]

class PatentDataset(Dataset):
    """Dataset over a DataFrame with a ``text`` column.

    Args:
        df: DataFrame holding a ``text`` column and, when ``labeled`` is
            true, a ``cpc_ids`` column of comma-separated label strings.
        labeled: whether items carry label ids; when false the label slot
            of every item is ``None`` and ``cpc_ids`` is never read.
    """

    def __init__(self,df,labeled = True):
        self.df = df
        self.labeled = labeled

    def __len__(self):
        return len(self.df)

    def __getitem__(self,idx):
        """Return ``(text, label_ids)`` for row ``idx`` (``label_ids`` may be ``None``)."""
        row = self.df.iloc[idx]
        # The first three characters of every text are dropped - presumably
        # a fixed prefix in the data; TODO confirm.
        text = row['text'][3:]
        if not self.labeled:
            # Unlabeled data: do not touch 'cpc_ids', which may be absent or
            # contain labels unknown to the label vocabulary.
            return text, None
        return text, str2id_lst(row['cpc_ids'])
        
def collate_fn(data):
    """Collate ``(text, label_ids)`` items into a batch with duplicated rows.

    Every text (and its label) is emitted twice in a row, so rows ``2k`` and
    ``2k + 1`` of the batch hold the same sentence.

    Args:
        data: sequence of items whose element 0 is the text and element 1 is
            a list of label ids or ``None``.

    Returns:
        Tuple ``(input_ids, attention_mask, token_type_ids, batch_label)``.
        The first three come from the module-level ``tokenizer``, padded and
        truncated to 128 tokens. ``batch_label`` is a float32 multi-hot
        tensor with ``len(id2label)`` columns; rows whose label is ``None``
        stay all-zero.
    """
    sents = []
    labels = []
    for item in data:
        sents.extend((item[0], item[0]))
        labels.extend((item[1], item[1]))

    encoded = tokenizer.batch_encode_plus(batch_text_or_text_pairs=sents,
                                          truncation=True,
                                          padding='max_length',
                                          max_length=128,
                                          return_tensors='pt',
                                          return_length=True)
    input_ids = encoded['input_ids']
    attention_mask = encoded['attention_mask']
    token_type_ids = encoded['token_type_ids']

    batch_label = np.zeros((len(labels), len(id2label)))
    for row, label_ids in enumerate(labels):
        if label_ids is None:
            # Indexing with None would act as numpy's newaxis and set the
            # whole row to 1; unlabeled items keep an all-zero row instead.
            continue
        batch_label[row, label_ids] = 1

    batch_label = torch.tensor(batch_label,dtype=torch.float32)

    return input_ids, attention_mask, token_type_ids, batch_label

# Labeled training set, served in shuffled batches of 64 items; collate_fn
# emits every item twice, so each batch tensor has twice as many rows.
train_dataset = PatentDataset(train_data)
train_dataloader = DataLoader(
    train_dataset,
    batch_size=64,
    shuffle=True,
    collate_fn=collate_fn,
)

epochs = 1
save_path = './patent_bert_simcse/simcsepatent_bs64.pth'
# Create the checkpoint directory up front so torch.save cannot fail on a
# missing directory after a whole epoch of training.
save_dir = os.path.dirname(save_path)
if save_dir:
    os.makedirs(save_dir, exist_ok=True)

for epoch in range(epochs):
    model.train()
    # batch_label is produced by collate_fn but not used by the loss.
    for input_ids, attention_mask, token_type_ids, batch_label in tqdm(train_dataloader):
        input_ids = input_ids.to(device)
        attention_mask = attention_mask.to(device)
        token_type_ids = token_type_ids.to(device)
        pred = model(input_ids,attention_mask,token_type_ids)
        loss = compute_loss(pred)
        # Backpropagation
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
    # The checkpoint is overwritten after every epoch. NOTE: `model` is the
    # nn.DataParallel wrapper, so the saved keys carry a "module." prefix.
    torch.save(model.state_dict(),save_path)

4
4


Some weights of the model checkpoint at anferico/bert-for-patents were not used when initializing BertForMaskedLM: ['cls.seq_relationship.weight', 'cls.seq_relationship.bias']
- This IS expected if you are initializing BertForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForMaskedLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
100%|██████████| 219/219 [03:25<00:00,  1.07it/s]
