In [3]:
import pandas as pd
import pickle
import numpy as np
from torch.utils.data import Dataset,DataLoader
import torch
import torch.nn as nn
import torch.nn.functional as F

from transformers import AutoModelForMaskedLM,AutoTokenizer

from transformers import AdamW
from operator import itemgetter
from sklearn.model_selection import StratifiedKFold

import os
# Restrict this process to GPU index 0 via the CUDA_VISIBLE_DEVICES environment variable.
# NOTE(review): this runs after `import torch`; it presumably still takes effect because
# CUDA has not been initialised yet at this point — confirm.
os.environ['CUDA_VISIBLE_DEVICES'] = '0'
# Make device 0 the current CUDA device for the later `.cuda()` calls in this notebook.
torch.cuda.set_device(0)

In [4]:
# Label/id lookup tables: `label2id` is indexed by label string in `str2id_lst`, and
# `len(id2label)` sets the width of the label vectors built in `collate_fn`.
# SECURITY NOTE: pickle.load can execute arbitrary code; only load files you produced yourself.
# `with` closes each file handle deterministically instead of leaving it to the GC.
with open('temp_results/label2id_dict.pkl','rb') as fh:
    label2id = pickle.load(fh)
with open('temp_results/id2label_lst.pkl','rb') as fh:
    id2label = pickle.load(fh)

train_data = pd.read_csv('data/Patent14K/train.csv')
test_data = pd.read_csv('data/Patent14K/test.csv')

# Pretrained checkpoint and its matching tokenizer (same hub id for both).
model = AutoModelForMaskedLM.from_pretrained("anferico/bert-for-patents")
tokenizer = AutoTokenizer.from_pretrained("anferico/bert-for-patents")

Some weights of the model checkpoint at anferico/bert-for-patents were not used when initializing BertForMaskedLM: ['cls.seq_relationship.weight', 'cls.seq_relationship.bias']
- This IS expected if you are initializing BertForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForMaskedLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [5]:
def str2id_lst(str):
    """Convert a comma-separated label string into a list of label ids.

    Every comma-delimited piece is looked up verbatim (no whitespace stripping) in the
    module-level ``label2id`` mapping, preserving order and duplicates. A piece that is
    missing from the mapping propagates the lookup error.
    """
    # NOTE: the parameter name shadows the builtin ``str``; it is kept for caller compatibility.
    return [label2id[piece] for piece in str.split(',')]

class PatentDataset(Dataset):
    """Map-style dataset over a patent DataFrame.

    Args:
        df: frame with a ``text`` column and, when ``labeled`` is true, a ``cpc_ids``
            column holding comma-separated label strings.
        labeled: whether items carry label ids; when false the label slot is ``None``
            and the ``cpc_ids`` column is never read.
    """

    def __init__(self,df,labeled = True):
        self.df = df
        self.labeled = labeled

    def __len__(self):
        """Return the number of rows in the wrapped frame."""
        return self.df.shape[0]

    def __getitem__(self,idx):
        """Return ``(text, label_ids)`` for row ``idx``; ``label_ids`` is ``None`` if unlabeled."""
        row = self.df.iloc[idx]
        # The first three characters of every text are dropped
        # (presumably a fixed prefix — TODO confirm against the data).
        text = row['text'][3:]

        if not self.labeled:
            # Do not touch ``cpc_ids`` here: an unlabeled frame may not have that column,
            # and parsing labels that are then discarded is wasted work.
            return text,None
        return text,str2id_lst(row['cpc_ids'])
        

In [6]:
def collate_fn(data, max_length=500):
    """Tokenize a batch of ``(text, label_ids)`` items and build multi-hot label vectors.

    Args:
        data: list of ``(text, label_ids)`` pairs; ``label_ids`` is a list of class
            indices, or ``None`` for an unlabeled item.
        max_length: length every text is truncated/padded to. Defaults to 500, the
            value that used to be hard-coded.

    Returns:
        ``(input_ids, attention_mask, token_type_ids, batch_label)``, where
        ``batch_label`` is a float32 tensor of shape ``(len(data), len(id2label))``
        with ones at each item's label indices (all zeros for an unlabeled item).
    """
    sents = [item[0] for item in data]
    labels = [item[1] for item in data]

    # Keep the tokenizer output under its own name instead of rebinding the `data` argument.
    encoded = tokenizer.batch_encode_plus(batch_text_or_text_pairs=sents,
                                          truncation=True,
                                          padding='max_length',
                                          max_length=max_length,
                                          return_tensors='pt')
    input_ids = encoded['input_ids']
    attention_mask = encoded['attention_mask']
    token_type_ids = encoded['token_type_ids']

    batch_label = np.zeros((len(labels),len(id2label)))
    for i,_label in enumerate(labels):
        if _label is None:
            # Indexing with None is np.newaxis, so `batch_label[i, None] = 1` would
            # mark every class positive; leave unlabeled rows as zeros instead.
            continue
        batch_label[i,_label]=1

    batch_label = torch.tensor(batch_label,dtype=torch.float32)

    return input_ids, attention_mask, token_type_ids, batch_label
    

In [7]:
# Wrap both frames in datasets (both use the default labeled=True).
train_dataset, test_dataset = (
    PatentDataset(frame) for frame in (train_data, test_data)
)

# One sample per batch, in frame order (no shuffling), tokenized by collate_fn.
train_dataloader, test_dataloader = (
    DataLoader(dataset=split, batch_size=1, collate_fn=collate_fn)
    for split in (train_dataset, test_dataset)
)

# Accumulator for the per-sample embeddings of the training split.
train_bert_cls_outputs = []

In [None]:
%%time
from tqdm import tqdm

# Start from an empty list so re-running this cell does not append duplicate embeddings.
train_bert_cls_outputs = []

model.cuda()
model.eval()
with torch.no_grad():
    # The labels yielded by the loader are not needed for feature extraction,
    # so they are left on the CPU instead of being copied to the GPU.
    for input_ids, attention_mask, token_type_ids, _batch_label in tqdm(train_dataloader):
        last_hidden = model(input_ids = input_ids.cuda(),
                            attention_mask = attention_mask.cuda(),
                            token_type_ids = token_type_ids.cuda(),
                            output_hidden_states=True).hidden_states[-1]

        # Select the first-token vector of every sequence while still on the GPU, so only
        # one vector per sample is copied to the host rather than the whole token matrix.
        # Indexing with [:, 0] (instead of squeeze + [0]) stays correct for any batch size.
        cls_vectors = last_hidden[:, 0].cpu().numpy()
        train_bert_cls_outputs.extend(cls_vectors)

  1%|          | 14164/2059636 [05:52<20:07:39, 28.23it/s]

In [None]:
# Create the target directory if needed so a long extraction run is not lost to a
# missing folder, and use `with` so the file is flushed and closed deterministically.
os.makedirs('outputs/bert_embedding', exist_ok=True)
with open('outputs/bert_embedding/train_bert_cls_outputs.pkl','wb') as fh:
    pickle.dump(train_bert_cls_outputs, fh)

In [None]:
test_bert_cls_outputs = []
model.eval()
with torch.no_grad():
    # The labels yielded by the loader are not needed for feature extraction,
    # so they are left on the CPU instead of being copied to the GPU.
    for input_ids, attention_mask, token_type_ids, _batch_label in tqdm(test_dataloader):
        last_hidden = model(input_ids = input_ids.cuda(),
                            attention_mask = attention_mask.cuda(),
                            token_type_ids = token_type_ids.cuda(),
                            output_hidden_states=True).hidden_states[-1]

        # Keep only the first-token vector per sample, matching what the training loop
        # stores; previously the whole per-token matrix was appended here even though
        # the variable and output file are named "*_cls_outputs".
        cls_vectors = last_hidden[:, 0].cpu().numpy()
        test_bert_cls_outputs.extend(cls_vectors)

# Create the target directory if needed and close the file deterministically.
os.makedirs('outputs/bert_embedding', exist_ok=True)
with open('outputs/bert_embedding/test_bert_cls_outputs.pkl','wb') as fh:
    pickle.dump(test_bert_cls_outputs, fh)