In [1]:
import numpy as np 
import pandas as pd 
import os 
from tqdm.auto import tqdm 
from transformers import (
    AdamW, 
    AutoConfig, 
    AutoModel, 
    AutoTokenizer, 
    get_linear_schedule_with_warmup,
    get_cosine_schedule_with_warmup
)
import torch 
import torch.nn.functional as F
import torch.nn as nn 
from torch.utils.data import Dataset, TensorDataset, DataLoader, RandomSampler, SequentialSampler, IterableDataset 
import math 
import time 
import datetime
import re

In [2]:
# Directory holding the triplet files; only the number of entries is inspected here.
TRIPLET_DIR = "../storage/FGH_spec_ind_claim_triplet_v1.4.1s"
files = os.listdir(TRIPLET_DIR)
len(files)

2590216

In [3]:
# Load the labelled sentence pairs and keep only the columns used downstream.
df = pd.read_excel("0919_라벨링세트_9주차_병합.xlsx")

kept_columns = ["쿼리 번호", "IPC 분류", "쿼리 문장", "후보 문장", "쿼리 문서 번호", "Positive 문서 번호", "라벨링"]
df = df.loc[df["라벨링"].notnull(), kept_columns]
df = df.dropna()

labels = df["라벨링"].values
accepted_labels = [0, 0.5, 0.8, 1.0]


def _fix_label(value):
    """Return the cleaned label for one row, or None when the row must be dropped.

    A raw label of 0.1 is rewritten to 1.0; any other value outside
    ``accepted_labels`` becomes None so that the following ``dropna`` removes the row.
    """
    if value == 0.1:
        return 1.0
    if value not in accepted_labels:
        return None
    return value


labels_fixed = [_fix_label(value) for value in labels]

df["라벨링"] = labels_fixed
# Rows whose label was mapped to None are discarded here.
df = df.dropna()
query_numbers = df["쿼리 번호"].values
# np.unique returns the distinct query numbers in sorted order.
unique_queries = np.unique(query_numbers)

In [4]:
# Split at the query level (80% / 10% / remainder) so that all candidates of one
# query end up in the same partition.
n_unique_queries = len(unique_queries)
train_size = int(n_unique_queries * 0.8)
val_size = int(n_unique_queries * 0.1)
val_end = train_size + val_size

train_unique_queries = unique_queries[:train_size]
val_unique_queries = unique_queries[train_size:val_end]
test_unique_queries = unique_queries[val_end:]

In [5]:
# Show the first two rows of the cleaned frame as a quick sanity check.
df.head(2)

Unnamed: 0,쿼리 번호,IPC 분류,쿼리 문장,후보 문장,쿼리 문서 번호,Positive 문서 번호,라벨링
0,166,F21V504,an adjustable lens positioned so as to alter ...,Several mechanisms for altering the beam prod...,20080259600,6474837,0.8
1,166,F21V504,an adjustable lens positioned so as to alter ...,When a plurality of aperture plates are incor...,20080259600,6474837,0.0


In [6]:
# List the columns kept by the loading cell.
df.columns

Index(['쿼리 번호', 'IPC 분류', '쿼리 문장', '후보 문장', '쿼리 문서 번호', 'Positive 문서 번호',
       '라벨링'],
      dtype='object')

In [7]:
# Route every labelled (query, candidate, label) row into the train / validation /
# test lists according to the partition its query number belongs to.
train_queries, train_candidates, train_labels = [], [], []
valid_queries, valid_candidates, valid_labels = [], [], []
test_queries, test_candidates, test_labels = [], [], []

# For test rows the query / positive *document ids* are kept as well
# (note: despite the names, these lists hold document ids, not query numbers).
test_query_nums, test_candidate_nums = [], []

query_nums = df["쿼리 번호"].values
queries = df["쿼리 문장"].values
candidates = df["후보 문장"].values
labels = df["라벨링"].values
query_document_ids = df["쿼리 문서 번호"].values
candidate_document_ids = df["Positive 문서 번호"].values

# `x in ndarray` scans the whole array on every call; hash sets give constant-time
# membership checks instead, which matters when this runs once per row.
train_query_set = set(train_unique_queries.tolist())
val_query_set = set(val_unique_queries.tolist())
test_query_set = set(test_unique_queries.tolist())

for i in tqdm(range(len(queries)), position=0, leave=True):
    query_num = query_nums[i]
    if query_num in train_query_set:
        train_queries.append(queries[i])
        train_candidates.append(candidates[i])
        train_labels.append(labels[i])
    elif query_num in val_query_set:
        valid_queries.append(queries[i])
        valid_candidates.append(candidates[i])
        valid_labels.append(labels[i])
    elif query_num in test_query_set:
        test_queries.append(queries[i])
        test_candidates.append(candidates[i])
        test_labels.append(labels[i])
        test_query_nums.append(query_document_ids[i])
        test_candidate_nums.append(candidate_document_ids[i])

  0%|          | 0/33077 [00:00<?, ?it/s]

In [8]:
# Notebook-level tokenizer used by the tokenisation loops further down.
# NOTE(review): SentenceRanker loads its own tokenizer from the same checkpoint name and
# adds extra special tokens to that copy only; this instance is not extended.
tokenizer = AutoTokenizer.from_pretrained("tanapatentlm/patentdeberta_large_spec_128_pwi") 

In [9]:
class WeightedLayerPooling(nn.Module):
    """Learned weighted average over the hidden states of the upper encoder layers.

    Args:
        num_hidden_layers: Number of encoder layers. The pooled stack is expected to
            contain ``num_hidden_layers + 1`` entries (one extra entry in front of the
            encoder layers).
        layer_start: Index of the first entry of the stack that takes part in the
            average; entries before it are discarded.
        layer_weights: Optional initial weights, one per pooled entry
            (``num_hidden_layers + 1 - layer_start`` values). When ``None`` every
            entry starts with weight 1. The weights are trainable in both cases.

    Raises:
        ValueError: If ``layer_weights`` is given with the wrong number of values.
    """

    def __init__(self, num_hidden_layers, layer_start, layer_weights=None):
        super(WeightedLayerPooling, self).__init__()
        self.layer_start = layer_start
        self.num_hidden_layers = num_hidden_layers
        num_pooled = num_hidden_layers + 1 - layer_start
        if layer_weights is None:
            initial = torch.tensor([1] * num_pooled, dtype=torch.float)
        else:
            # Honour caller-supplied weights (previously this argument was ignored).
            if isinstance(layer_weights, torch.Tensor):
                initial = layer_weights.detach().clone().to(torch.float)
            else:
                initial = torch.tensor(layer_weights, dtype=torch.float)
            if initial.dim() != 1 or initial.numel() != num_pooled:
                raise ValueError(
                    f"layer_weights must contain {num_pooled} values, got shape {tuple(initial.shape)}"
                )
        self.layer_weights = nn.Parameter(initial)

    def forward(self, all_hidden_states):
        """Return the weighted average of ``all_hidden_states[layer_start:]``.

        ``all_hidden_states`` is an iterable of equally shaped 3-D tensors; the result
        has the shape of a single entry.
        """
        all_layer_embedding = torch.stack(list(all_hidden_states), dim=0)
        all_layer_embedding = all_layer_embedding[self.layer_start:, :, :, :]
        # One scalar per pooled entry, broadcast over the three remaining dimensions.
        weight_factor = self.layer_weights.view(-1, 1, 1, 1)
        weighted_average = (weight_factor * all_layer_embedding).sum(dim=0) / self.layer_weights.sum()
        return weighted_average

class SentenceRanker(nn.Module):
    """Cross-encoder that maps a tokenised sentence pair to a single relevance score.

    The pretrained encoder named by ``plm`` is loaded with its dropout settings zeroed,
    its embedding table is resized to match a tokenizer extended with four extra
    special tokens, and a linear head turns the pooled position-0 vector into one value.
    """

    def __init__(self, plm="tanapatentlm/patentdeberta_large_spec_128_pwi"):
        super().__init__()
        config = AutoConfig.from_pretrained(plm)
        # Zero every dropout knob on the config before the encoder is built from it.
        for dropout_attr in (
            "hidden_dropout",
            "hidden_dropout_prob",
            "attention_dropout",
            "attention_probs_dropout_prob",
        ):
            setattr(config, dropout_attr, 0)
        self.config = config
        self.net = AutoModel.from_pretrained(plm, config=config)
        self.tokenizer = AutoTokenizer.from_pretrained(plm)
        extra_tokens = ["[IPC]", "[TTL]", "[CLMS]", "[ABST]"]
        self.tokenizer.add_special_tokens({"additional_special_tokens": extra_tokens})
        # Keep the embedding matrix in sync with the enlarged vocabulary.
        self.net.resize_token_embeddings(len(self.tokenizer))
        # Average the hidden states from index 6 of the stack upwards.
        self.weighted_layer_pooling = WeightedLayerPooling(config.num_hidden_layers, 6, None)
        self.fc = nn.Linear(config.hidden_size, 1)
        self._init_weights(self.fc)

    def _init_weights(self, module):
        """Re-initialise ``module`` in place; other module types are left untouched.

        Linear and Embedding weights are drawn from N(0, ``config.initializer_range``),
        Linear biases and the Embedding padding row are zeroed, and LayerNorm is reset
        to weight 1 / bias 0.
        """
        if isinstance(module, nn.Linear):
            module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
            if module.bias is not None:
                module.bias.data.zero_()
            return
        if isinstance(module, nn.Embedding):
            module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
            if module.padding_idx is not None:
                module.weight.data[module.padding_idx].zero_()
            return
        if isinstance(module, nn.LayerNorm):
            module.bias.data.zero_()
            module.weight.data.fill_(1.0)

    def forward(self, input_ids, attn_masks):
        """Score a batch of token-id sequences.

        Both arguments are passed positionally to the encoder. The result is the linear
        head applied to the layer-pooled vector at sequence position 0
        (presumably the [CLS] token — verify against the tokenizer).
        """
        encoder_out = self.net(input_ids, attn_masks, output_hidden_states=True)
        pooled_layers = self.weighted_layer_pooling(encoder_out.hidden_states)
        first_position = pooled_layers[:, 0]
        return self.fc(first_position)

In [10]:
# Build the cross-encoder and move it to the GPU (Module.cuda() returns the module itself).
model = SentenceRanker()
model = model.cuda()
# Bare print: emits an empty line as the cell's only explicit output.
print()

Some weights of the model checkpoint at tanapatentlm/patentdeberta_large_spec_128_pwi were not used when initializing DebertaModel: ['cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.decoder.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.decoder.weight']
- This IS expected if you are initializing DebertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DebertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).





In [11]:
# transfer learning: overlay the tensors stored in an earlier checkpoint onto the
# freshly built model's state dict, then load the merged dict back into the model.
# NOTE(review): torch.load unpickles the file — only use checkpoints from a trusted source.
ckpt = "../storage/epoch_end_checkpoints-epoch=00-val_loss=0.20442404.ckpt"
checkpoint = torch.load(ckpt)
old_weights = list(checkpoint["state_dict"].items())
new_weights = model.state_dict()
for weight_name, weight_tensor in old_weights:
    # Checkpoint entries replace same-named entries (and add any new names).
    new_weights[weight_name] = weight_tensor
# load_state_dict reports missing / unexpected keys; print its result as a check.
print(model.load_state_dict(new_weights))

<All keys matched successfully>


In [13]:
def _encode_pairs(queries, candidates, max_length):
    """Tokenise aligned (query, candidate) pairs into fixed-length tensors.

    Each pair is encoded as one two-segment input, truncated / padded to
    ``max_length``. Returns ``(input_ids, attention_masks)`` as integer tensors with
    one row per pair. Uses the notebook-level ``tokenizer``.
    """
    input_ids, attn_masks = [], []
    pairs = zip(queries, candidates)
    for query, candidate in tqdm(pairs, total=len(queries), position=0, leave=True):
        encoded_input = tokenizer(query, candidate, max_length=max_length, truncation=True, padding="max_length")
        input_ids.append(encoded_input["input_ids"])
        attn_masks.append(encoded_input["attention_mask"])
    return torch.tensor(input_ids, dtype=int), torch.tensor(attn_masks, dtype=int)


max_len = 256 # 512

# ---- tokenisation -----------------------------------------------------------
train_input_ids, train_attn_masks = _encode_pairs(train_queries, train_candidates, max_len)
valid_input_ids, valid_attn_masks = _encode_pairs(valid_queries, valid_candidates, max_len)

# Labels become float column vectors so they line up with the model's (N, 1) output.
train_labels = torch.tensor(train_labels).float()
train_labels = torch.reshape(train_labels, (-1,1))

valid_labels = torch.tensor(valid_labels).float()
valid_labels = torch.reshape(valid_labels, (-1,1))

print(train_input_ids.shape, train_attn_masks.shape, train_labels.shape, valid_input_ids.shape, valid_attn_masks.shape, valid_labels.shape)

# ---- data loaders -----------------------------------------------------------
batch_size = 32 #24

train_data = TensorDataset(train_input_ids, train_attn_masks, train_labels)
train_sampler = RandomSampler(train_data)  # reshuffles the training rows every epoch
train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=batch_size)

val_data = TensorDataset(valid_input_ids, valid_attn_masks, valid_labels)
val_sampler = SequentialSampler(val_data)
val_dataloader = DataLoader(val_data, sampler=val_sampler, batch_size=batch_size)

# ---- optimisation set-up ----------------------------------------------------
val_losses = []  # one average validation loss per finished epoch

loss_func = nn.SmoothL1Loss()
model.cuda()
optimizer = AdamW(model.parameters(), lr=2e-5)
epochs = 10
# The scheduler is stepped once per batch, so its horizon is batches * epochs.
total_steps = len(train_dataloader) * epochs

scheduler = get_cosine_schedule_with_warmup(optimizer,
                                            num_warmup_steps=0,
                                            num_training_steps=total_steps,
                                            num_cycles=0.5)

# ---- training / validation loop ---------------------------------------------
device = torch.device("cuda")
model.zero_grad()
for epoch_i in tqdm(range(0, epochs), desc="Epochs", position=0, leave=True, total=epochs):
    train_loss = 0
    model.train()
    with tqdm(train_dataloader, unit="batch") as tepoch:
        for step, batch in enumerate(tepoch):
            b_input_ids, b_input_masks, b_labels = (t.to(device) for t in batch)
            outputs = model(b_input_ids, b_input_masks)
            loss = loss_func(outputs, b_labels)
            train_loss += loss.item()
            loss.backward()
            # Clip the global gradient norm to 1.0 before the parameter update.
            torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
            optimizer.step()
            scheduler.step()
            model.zero_grad()
            # Running mean of the loss over the batches seen so far in this epoch.
            tepoch.set_postfix(loss=train_loss / (step+1))
    avg_train_loss = train_loss / len(train_dataloader)
    print(f"average train loss : {avg_train_loss}")

    val_loss = 0
    model.eval()
    for step, batch in tqdm(enumerate(val_dataloader), desc="Validating", position=0, leave=True, total=len(val_dataloader)):
        b_input_ids, b_input_masks, b_labels = (t.to(device) for t in batch)
        with torch.no_grad():
            outputs = model(b_input_ids, b_input_masks)
            loss = loss_func(outputs, b_labels)
        val_loss += loss.item()
    avg_val_loss = val_loss / len(val_dataloader)
    print(f"average validation loss : {avg_val_loss}")
    val_losses.append(avg_val_loss)

    # Save a checkpoint only when this epoch is the best (lowest validation loss) so far.
    if np.min(val_losses) == val_losses[-1]:
        torch.save(model.state_dict(), f"M4_DeBERTa_Cross_Encoder_{avg_val_loss}.pt")

  0%|          | 0/26947 [00:00<?, ?it/s]

  0%|          | 0/3060 [00:00<?, ?it/s]

torch.Size([26947, 256]) torch.Size([26947, 256]) torch.Size([26947, 1]) torch.Size([3060, 256]) torch.Size([3060, 256]) torch.Size([3060, 1])


  train_labels = torch.tensor(train_labels).float()
  valid_labels = torch.tensor(valid_labels).float()


Epochs:   0%|          | 0/10 [00:00<?, ?it/s]

  0%|          | 0/843 [00:00<?, ?batch/s]

average train loss : 0.01052044426637325


Validating:   0%|          | 0/96 [00:00<?, ?it/s]

average validation loss : 0.03465540017108045


  0%|          | 0/843 [00:00<?, ?batch/s]

average train loss : 0.009046979622765138


Validating:   0%|          | 0/96 [00:00<?, ?it/s]

average validation loss : 0.03713473094103392


  0%|          | 0/843 [00:00<?, ?batch/s]

average train loss : 0.0061002446640689425


Validating:   0%|          | 0/96 [00:00<?, ?it/s]

average validation loss : 0.03787937156175758


  0%|          | 0/843 [00:00<?, ?batch/s]

average train loss : 0.004337847244971093


Validating:   0%|          | 0/96 [00:00<?, ?it/s]

average validation loss : 0.04044487747402551


  0%|          | 0/843 [00:00<?, ?batch/s]

average train loss : 0.0030240793149003565


Validating:   0%|          | 0/96 [00:00<?, ?it/s]

average validation loss : 0.03772842030836424


  0%|          | 0/843 [00:00<?, ?batch/s]

average train loss : 0.0021756847254242964


Validating:   0%|          | 0/96 [00:00<?, ?it/s]

average validation loss : 0.03614244644995779


  0%|          | 0/843 [00:00<?, ?batch/s]

average train loss : 0.0016182212966194178


Validating:   0%|          | 0/96 [00:00<?, ?it/s]

average validation loss : 0.037259216374271396


  0%|          | 0/843 [00:00<?, ?batch/s]

average train loss : 0.0012974387577984912


Validating:   0%|          | 0/96 [00:00<?, ?it/s]

average validation loss : 0.037801968050189316


  0%|          | 0/843 [00:00<?, ?batch/s]

average train loss : 0.0011131044573443063


Validating:   0%|          | 0/96 [00:00<?, ?it/s]

average validation loss : 0.03819194790654971


  0%|          | 0/843 [00:00<?, ?batch/s]

average train loss : 0.0010246235364814581


Validating:   0%|          | 0/96 [00:00<?, ?it/s]

average validation loss : 0.0382451373928537
