In [1]:
import numpy as np
import pandas as pd
import torch
import torch.nn.functional as F
import torch.nn as nn
from transformers import AutoTokenizer, AutoModel, AdamW, get_linear_schedule_with_warmup
from tqdm import tqdm
from torch import Tensor
from torch.utils.data import Dataset, TensorDataset, DataLoader, RandomSampler, SequentialSampler, IterableDataset
import math
import time
import datetime
import os
import re
from sklearn.model_selection import train_test_split, StratifiedKFold
import logging
import addict
from pathlib import Path
import pickle 


In [4]:
# List the entry names (not full paths) found in the v0.3.1 triplet directory.
files = os.listdir("../storage/FGH_spec_ind_claims_triplet_v0.3.1")

In [19]:
def clean_text(t):
    """Strip claim numbering and line breaks from a claim string.

    Args:
        t: Raw claim text, e.g. "1. A system comprising ...".

    Returns:
        The text with every "<digits>." marker removed, each newline replaced
        by a single space, and leading/trailing whitespace stripped.
    """
    # Raw string with an escaped dot: the earlier unescaped "." matched ANY
    # character, so ordinary numbers such as "3 parts" were deleted as well.
    x = re.sub(r"\d+\.", "", t)
    x = x.replace("\n", " ")
    return x.strip()

In [97]:
class TripletData(Dataset):
    """Map-style dataset over the ``*.txt`` files found in one directory.

    Items are ``pathlib.Path`` objects, not file contents; whoever consumes
    the dataset (e.g. a collate function) is responsible for reading them.
    """

    def __init__(self, path):
        """Collect the ``*.txt`` files directly under ``path``.

        Args:
            path: Directory (str or Path) to scan; sub-directories are not
                searched.
        """
        super().__init__()
        # glob() yields entries in arbitrary filesystem order; sorting keeps
        # the index -> file mapping stable across runs and machines.
        self.data = sorted(Path(path).glob('*.txt'))

    def __getitem__(self, index):
        """Return the file path stored at ``index``."""
        return self.data[index]

    def __len__(self):
        """Return the number of ``*.txt`` files that were found."""
        return len(self.data)

class custom_collate_metric_learning(object):
    """Collate triplet files into one flat, labelled batch for metric learning.

    Every batch item is a path to a UTF-8 text file holding three documents
    (query, positive, negative) separated by three consecutive newlines. Each
    document carries ``<TTL>``, ``<IPC>``, ``<ABST>``, ``<CLMS>`` and ``<DESC>``
    tags. All three documents of every file are tokenized and returned as
    rows of a single tensor, together with an integer label per row.
    """

    def __init__(self):
        """Load the tokenizer and fix the padded sequence length."""
        self.tokenizer = AutoTokenizer.from_pretrained("tanapatentlm/patentdeberta_large_spec_128_pwi")
        self.chunk_size = 512

    def clean_text(self, t):
        """Remove "<digits>." markers and newlines from ``t`` and strip it."""
        # Raw string with an escaped dot: an unescaped "." would match any
        # character and delete ordinary numbers such as "3 parts".
        x = re.sub(r"\d+\.", "", t)
        x = x.replace("\n", " ")
        return x.strip()

    @staticmethod
    def _section(doc, start, end):
        """Return the (non-greedy) text between the ``<start>`` and ``<end>`` tags."""
        return re.search("<" + start + r">([\s\S]*?)<" + end + ">", doc).group(1)

    def _build_text(self, doc):
        """Assemble the model input string for one tagged document.

        Layout: first three IPC characters, a space, the lower-cased title,
        then the abstract and the selected claim, each preceded by the
        tokenizer's separator token.

        Raises:
            AttributeError: if one of the expected tags is missing
                (``re.search`` returns None).
        """
        ttl = self._section(doc, "TTL", "IPC").lower()
        ipc = self._section(doc, "IPC", "ABST")[:3]
        abst = self._section(doc, "ABST", "CLMS")
        claims = self._section(doc, "CLMS", "DESC").split('\n\n')
        # First claim not marked "(canceled)"; if all are, keep the first one.
        selected = next((c for c in claims if '(canceled)' not in c), claims[0])
        # Use the instance's tokenizer: relying on a module-level `tokenizer`
        # raises NameError whenever that global has not been defined yet.
        sep = self.tokenizer.sep_token
        return ipc + " " + ttl + sep + abst + sep + self.clean_text(selected)

    def _encode(self, doc):
        """Tokenize one document, padded/truncated to ``chunk_size`` tokens."""
        return self.tokenizer(self._build_text(doc), return_tensors='pt', max_length=self.chunk_size, padding='max_length', truncation=True)

    def __call__(self, batch):
        """Turn a list of triplet file paths into ``(input_ids, attn_masks, labels)``.

        Args:
            batch: Sequence of ``pathlib.Path`` objects, one triplet file each.

        Returns:
            Tuple ``(input_ids, attn_masks, labels)``. Rows are ordered
            query, positive, negative for each file in turn (three rows per
            file). For the file at position ``i`` the query and positive get
            label ``2*i`` and the negative gets ``2*i + 1``, so labels from
            different triplets never collide.

        Raises:
            ValueError: if a file does not split into exactly three documents.
            AttributeError: if a document lacks one of the expected tags.
        """
        input_ids, attn_masks, labels = [], [], []
        for idx, txt_file in enumerate(batch):
            with txt_file.open("r", encoding="utf8") as f:
                data = f.read()
            q, p, n = data.split('\n\n\n')
            # Labels are derived from the triplet index. The previous counter
            # was never incremented, which gave every triplet the labels 0/0/1.
            for doc, label in ((q, idx * 2), (p, idx * 2), (n, idx * 2 + 1)):
                encoded = self._encode(doc)
                input_ids.append(encoded['input_ids'])
                attn_masks.append(encoded['attention_mask'])
                labels.append(label)
        # Each encoding carries a leading singleton dimension; stack the rows
        # and squeeze that dimension away.
        input_ids = torch.stack(input_ids, dim=0).squeeze(dim=1)
        attn_masks = torch.stack(attn_masks, dim=0).squeeze(dim=1)
        labels = torch.tensor(labels, dtype=torch.long)
        return input_ids, attn_masks, labels
            
            
class custom_collate(object):
    """Collate triplet files into separate query / positive / negative batches.

    Every batch item is a path to a UTF-8 text file holding three documents
    (query, positive, negative) separated by three consecutive newlines. Each
    document carries ``<TTL>``, ``<IPC>``, ``<ABST>``, ``<CLMS>`` and ``<DESC>``
    tags.
    """

    def __init__(self):
        """Load the tokenizer and fix the padded sequence length."""
        self.tokenizer = AutoTokenizer.from_pretrained("tanapatentlm/patentdeberta_large_spec_128_pwi")
        self.chunk_size = 512

    def clean_text(self, t):
        """Remove "<digits>." markers and newlines from ``t`` and strip it."""
        # Raw string with an escaped dot: an unescaped "." would match any
        # character and delete ordinary numbers such as "3 parts".
        x = re.sub(r"\d+\.", "", t)
        x = x.replace("\n", " ")
        return x.strip()

    @staticmethod
    def _section(doc, start, end):
        """Return the (non-greedy) text between the ``<start>`` and ``<end>`` tags."""
        return re.search("<" + start + r">([\s\S]*?)<" + end + ">", doc).group(1)

    def _build_text(self, doc):
        """Assemble the model input string for one tagged document.

        Layout: first three IPC characters, a space, the lower-cased title,
        then the abstract and the selected claim, each preceded by the
        tokenizer's separator token.

        Raises:
            AttributeError: if one of the expected tags is missing
                (``re.search`` returns None).
        """
        ttl = self._section(doc, "TTL", "IPC").lower()
        ipc = self._section(doc, "IPC", "ABST")[:3]
        abst = self._section(doc, "ABST", "CLMS")
        claims = self._section(doc, "CLMS", "DESC").split('\n\n')
        # First claim not marked "(canceled)"; if all are, keep the first one.
        selected = next((c for c in claims if '(canceled)' not in c), claims[0])
        # Use the instance's tokenizer: relying on a module-level `tokenizer`
        # raises NameError whenever that global has not been defined yet.
        sep = self.tokenizer.sep_token
        return ipc + " " + ttl + sep + abst + sep + self.clean_text(selected)

    def _encode(self, doc):
        """Tokenize one document, padded/truncated to ``chunk_size`` tokens."""
        return self.tokenizer(self._build_text(doc), return_tensors='pt', max_length=self.chunk_size, padding='max_length', truncation=True)

    def __call__(self, batch):
        """Turn a list of triplet file paths into six aligned tensors.

        Args:
            batch: Sequence of ``pathlib.Path`` objects, one triplet file each.

        Returns:
            Tuple ``(q_input_ids, q_attn_masks, p_input_ids, p_attn_masks,
            n_input_ids, n_attn_masks)``; each is an integer tensor of shape
            ``(len(batch), chunk_size)`` whose row ``i`` belongs to ``batch[i]``.

        Raises:
            ValueError: if a file does not split into exactly three documents.
            AttributeError: if a document lacks one of the expected tags.
        """
        b = len(batch)
        # Buffers laid out in return order: (ids, mask) for q, then p, then n.
        buffers = [torch.zeros((b, self.chunk_size), dtype=torch.long) for _ in range(6)]
        for idx, txt_file in enumerate(batch):
            with txt_file.open("r", encoding="utf8") as f:
                data = f.read()
            q, p, n = data.split('\n\n\n')
            for slot, doc in enumerate((q, p, n)):
                encoded = self._encode(doc)
                buffers[2 * slot][idx] = encoded['input_ids']
                buffers[2 * slot + 1][idx] = encoded['attention_mask']
        return tuple(buffers)
          

In [98]:
# Dataset whose items are the *.txt file paths found under ../storage/train_spec.
train_set = TripletData("../storage/train_spec")
# Collate callable that reads each file and tokenizes its three documents.
collate = custom_collate()
# Batches of 8 files drawn in shuffled order; custom_collate turns them into tensors.
train_dataloader = DataLoader(train_set, batch_size=8, collate_fn=collate, shuffle=True)


In [99]:
# Module-level tokenizer, loaded from the same checkpoint name the collate
# classes use; later cells call tokenizer.decode on the batch tensors.
tokenizer = AutoTokenizer.from_pretrained("tanapatentlm/patentdeberta_large_spec_128_pwi") 

# Number of batches unpacked so far.
cnt = 0 

# Smoke test: pull a few batches; the tensors of the last unpacked batch stay
# in scope for inspection in the following cells.
for step, batch in tqdm(enumerate(train_dataloader)):
    # Each batch is the 6-tuple returned by custom_collate.__call__.
    q_input_id, q_attn_mask, p_input_id, p_attn_mask, n_input_id, n_attn_mask = batch 
    cnt += 1 
    # Stop right after the fifth batch has been unpacked.
    if cnt == 5:
        break 
    

4it [00:00,  7.57it/s]


In [107]:
# Decode row 4 of the query input ids (set by the batch loop cell) back to text.
tokenizer.decode(q_input_id[4])

'[CLS]G06 link system[SEP]In a link system in which a plurality of MFPs and a server apparatus is connected so as to enable data communication over a network, a remote connection driver of the MFP starts an application on the server apparatus by remote control and at the time, on the basis of an operation start signal transmitted from the MFP, a device setting manager of the server apparatus identifies as a remote operation host apparatus the MFP having transmitted the operation start signal, and automatically sets the MFP as the MFP by which the application is used.[SEP]A link system in which a plurality of image forming apparatuses and a server apparatus are connected so as to enable data communication over a network, the image forming apparatus having a remote operating section adapted to make an application on the server apparatus available by remote control, and the server apparatus having: a remote operation host apparatus identifying section for identifying a remote operation ho

A system, method and program product for providing a voice response unit (VRU) proxy. A system is provided that includes: a graphical user interface (GUI) for dynamically displaying information from a VRU and for receiving data from a user; a system for initiating a call with the VRU; and a VRU interface system for transmitting data from the user to the VRU, and for using speech recognition to capture broadcasts from the VRU for display within the GUI.

A voice response unit (VRU) proxy system, comprising: a graphical user interface (GUI) for dynamically displaying information from a VRU and for receiving data from a user; a system for initiating a call with the VRU; and a VRU interface system for transmitting data from the user to the VRU, and for using speech recognition to capture broadcasts from the VRU for display within the GUI.

In [112]:
# Decode row 4 of the positive input ids (set by the batch loop cell) back to text.
tokenizer.decode(p_input_id[4])

'[CLS]H04 facsimile communication system and image processing apparatus[SEP]Leakage of communication information concerning facsimile communication is prevented. A facsimile server manages communication information concerning facsimile communication. An image processing apparatus connects itself to the facsimile server to acquire communication information. Only a part of communication information is displayed, and a user selects a transmission destination. The image processing apparatus sends the designated transmission destination and image data to the facsimile server and requests facsimile communication. The facsimile server sends image data to the designated transmission destination.[SEP]A facsimile communication system, wherein an image processing apparatus and a server having a facsimile communication function are communicatively connected, the server has a management section managing communication information concerning facsimile communication, and the image processing apparatus

In [111]:
# Decode row 4 of the negative input ids (set by the batch loop cell) back to text.
tokenizer.decode(n_input_id[4])

'[CLS]G03 image forming system, an apparatus, and method for controlling the same[SEP]When a printing process on a first apparatus is interrupted due to an error and the printing process is resumed by a second apparatus, it is determined whether a predetermined function (for example, stapling) is selected for the printing process according to the printing mode. If the predetermined function is selected for the printing process, the second printing apparatus prints the number of copies including the copy interrupted in the first machine according to the print setting. If the predetermined function is not selected for the printing process, the second printing apparatus prints the number of copies excluding the interrupted copy and the unprinted pages in the interrupted copy according to the print setting.[SEP]An image forming system including a first apparatus and a second apparatus, the first apparatus reads an image of an original document, the read image data is transmitted to the sec

There is provided a display device of high display quality having a circuit substrate, on which electronic parts are correctly mounted in a manner to afford visually recognizing presence and absence of misalignment of a solder resist opening with ease.\nThe display device comprises a display panel and a circuit substrate connected to the display panel, the circuit substrate comprises an insulating substrate, a conductive layer, an insulating layer to cover a part of the conductive layer, a plating layer applied to that portion of the conductive layer, which is exposed from the insulating layer, and a misalignment detection pattern for detection of misalignment between the conductive layer and the insulating layer, and the misalignment detection pattern comprises a pattern covered by the insulating layer in a manner to prevent adherence of a plating material to the conductive layer.

A display device comprising a display panel and a circuit substrate connected to the display panel, wherein the display panel includes an array substrate, the circuit substrate is separately formed and positioned different from the array substrate and comprises an insulating substrate, a conductive layer, and an insulating layer to cover a part of the conductive layer, the conductive layer of the circuit substrate includes a wiring pattern, wiring terminals, and a meshy pattern, the wiring terminals are exposed from the insulating layer and are applied with a plating layer to a surface exposed from the insulating layer, a misalignment detection pattern for detecting the misalignment between the conductive layer and the insulating layer is formed on the circuit substrate, the misalignment detection pattern includes a first layer formed of the conductive layer, a second layer formed of the insulating layer having an opening, the first layer is connected with the meshy pattern and is supplied with a predetermined potential, if the misalignment between the conductive layer and the insulating layer is smaller than a predetermined misalignment tolerance, the first layer is wholly covered by the second layer in a manner to prevent adherence of a plating material to the first layer, and if the misalignment between the conductive layer and the insulating layer is larger than the predetermined misalignment tolerance, a part of the first layer is exposed from the second layer at the opening in a manner to be applied with plating.