In [1]:
import torch
from transformers import (
    AdamW,
    AutoModel,
    get_linear_schedule_with_warmup,
    AutoTokenizer,
    AutoConfig
)

In [2]:
import sys
module_path = '/home/x1112436/git/sent-semantic-repo'
sys.path.append(module_path)

In [3]:
from src.utils import set_seed
from src.trainer import SimcseTrainer
from src.dataset import DATASET_MAPPING_DICT
from src.utils import PreprocessorFactory 
from src.utils import get_model_argparse
from src.model import MODEL_MAPPING_DICT
from src.model import CONFIG_MAPPING_DICT
from src.logger import Experi_Logger
from config.nli_config import nli_parser_model_args

In [4]:
args = nli_parser_model_args()

In [5]:
#args.pretrained_model = '/home/x1112436/model_file/faq_sent_roberta/sent_roberta'
#args.pretrained_model = f'/home/x1112436/result/faq/modelfile/{args.pretrained_model}'
args.device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
args.n_gpu = torch.cuda.device_count()
args.model_max_len = 100
args.is_preprocessed = True
args.valid_first = False
args.data_type='triple'
args.loss= 'TripletLoss'
args.margin = 1.0

In [6]:
#args.pretrained_model_final = '/home/x1112436/result/faq/modelfile/home/x1112436/model_file/sent_roberta'
#args.pretrained_model = '/home/x1112436/result/faq/modelfile2/klue/roberta-large'
args.pretrained_model = '/home/x1112436/final_result/faq/klue-faq-large-sent_robert/train_Triple_Triple'

In [7]:
model = MODEL_MAPPING_DICT['sent_roberta'].from_pretrained(
    args.pretrained_model, **vars(args), 
)
tokenizer = AutoTokenizer.from_pretrained(args.pretrained_model)

# LOAD DATA

In [8]:
from skt.gcp import load_bigquery_ipython_magic, \
                    bq_to_pandas, \
                    get_bigquery_client

In [9]:
dataset = 'x1112436'
log_table = 'faq_table'
query = f"""

SELECT  TRIM(query) as query,
        answer,
        REPLACE(TRIM(intent_nm), "'", "") as intent_nm,
        answer,
        domain,
        status
FROM `skt-datahub.{dataset}.{log_table}`
WHERE intent_nm !='' and intent_nm is not null
"""

In [10]:
faq_table = bq_to_pandas(query)

unsupported operand type(s) for /: 'NoneType' and 'int'
Downloading: 100%|[32m██████████[0m|


In [11]:
faq_table.head(2)

Unnamed: 0,query,answer,intent_nm,answer_1,domain,status
0,T RING,'T Ring(티 링)'에 대해 안내해 드릴게요.\nT Ring은 SK텔레콤의 상징...,T Ring,'T Ring(티 링)'에 대해 안내해 드릴게요.\nT Ring은 SK텔레콤의 상징...,sms_customer_center,INSERT
1,슬림,'슬림 요금제'에 대해 안내해 드릴게요.\n슬림 요금제는 5G 스마트폰을 사용하는 ...,슬림,'슬림 요금제'에 대해 안내해 드릴게요.\n슬림 요금제는 5G 스마트폰을 사용하는 ...,sms_customer_center,INSERT


In [21]:
# intent_nm_ans = faq_table[['intent_nm', 'answer']].drop_duplicates(keep='first')
# intent_nm_ans.to_csv('./result/intent_nm.csv', encoding='utf-8-sig', index=False)

In [12]:
idx2query = list(faq_table['query'].unique())
idx2intent_nm = list(faq_table.intent_nm.unique())

In [13]:
faq_table = faq_table[['query', 'intent_nm']]
query_to_intent = faq_table.set_index('query').to_dict()['intent_nm']

# INPUT for single embedding

In [14]:
from torch.utils.data import (
    DataLoader, Dataset
)

In [15]:
from dataclasses import dataclass
from typing import List, Any, Union, Dict

In [16]:
@dataclass
class SingleSentenceInput:
    sentence_a: str = None
    a_input_ids: List[int] = None
    a_attention_mask: List[int] = None

In [17]:
class EmbeddingDataset(Dataset):
    def __init__(
            self,
            args,
            features:List[SingleSentenceInput],
            max_length,
            tokenizer,
            **kwargs
    ):
        super(EmbeddingDataset, self).__init__()
        self.args = args
        self.features = features
        self.max_length = max_length
        self.pad_token_id = tokenizer.pad_token_id
        self.sep_token_id = tokenizer.sep_token_id if tokenizer.sep_token_id else tokenizer.eos_token_id

    def __getitem__(self, index) -> Dict[str, Any]:
        feature = self.features[index]
        return {
            'a_sentence': feature.sentence_a,
            'a_input_ids': torch.tensor(feature.a_input_ids, dtype=torch.long),
            'a_attention_mask': torch.tensor(feature.a_attention_mask, dtype=torch.long)
        }
    def __len__(self):
        return len(self.features)
    
    def loader(self, shuffle:bool=True, batch_size:int=64):
        return DataLoader(self, shuffle=shuffle, batch_size=batch_size, collate_fn=self.collater)

    def collater(self, batch: List[Dict[str, Any]]) -> Dict[str, Any]:

        a_sentence = [data['a_sentence'] for data in batch]
        a_input_ids = [data['a_input_ids'] for data in batch]
        a_attention_mask = [data['a_attention_mask'] for data in batch]
        ##  token level encoding
        batch_size = len(batch)
        sizes = [len(s) for s in a_input_ids]
        target_size = min(max(sizes), self.max_length)
        """ torch.full -> creates a tensor of a given shape and fills it with a scalar value self.pad_token_id here"""
        a_collated_ids = torch.full((batch_size, target_size), self.pad_token_id, dtype=torch.long)
        a_collated_attention_masks = torch.zeros((batch_size, target_size), dtype=torch.long)

        """ cut data if size > target_size else: fill by self.pad_token_id """
        for i, (input_id, attention_m, size) in enumerate(
                zip(a_input_ids, a_attention_mask, sizes)):
            diff = target_size - size
            if diff < 0:
                a_collated_ids[i, :target_size] = input_id[:target_size]
                a_collated_ids[i, -1] = self.sep_token_id
                a_collated_attention_masks[i, :target_size] = attention_m[:target_size]

            else:
                a_collated_ids[i, :size] = input_id
                a_collated_attention_masks[i, :size] = attention_m

        return {
            'a_sentence': a_sentence,
            'a_input_ids': a_collated_ids,
            'a_attention_mask': a_collated_attention_masks
        }

In [18]:
from src.utils.abs_preprocess import AbsPreprocessor

class Testprocessor(AbsPreprocessor):

    @classmethod
    def preprocess(cls, tokenizer,  input_list:List) -> None:
        """ try read tsv file using pandas first if [memory or parse] error catched use other reading method  """
    
        feature_list = list()
        skipped_line = 0

        for i, line in enumerate(input_list):
            try:
                a_encoded_sentence = cls.tokenizing(input=line, tokenizer=tokenizer, tokenizer_input=None)
                feature_list.append(
                    SingleSentenceInput(
                        sentence_a = line,
                        a_input_ids = a_encoded_sentence.input_ids,
                        a_attention_mask=a_encoded_sentence.attention_mask,
                    )
                )
            except Exception as e:
                print(f'Error occurs in {i} lines in preprocessing')
                print(line)
                print(e)
                break

        return feature_list

In [19]:
query_input = Testprocessor.preprocess(tokenizer = tokenizer ,input_list = idx2query)

In [20]:
intent_input = Testprocessor.preprocess(tokenizer = tokenizer ,input_list = idx2intent_nm)

In [21]:
queryDataset = EmbeddingDataset(args=args, features=query_input, max_length=args.model_max_len, tokenizer=tokenizer)

In [22]:
intentDataset = EmbeddingDataset(args=args, features=intent_input, max_length=args.model_max_len, tokenizer=tokenizer)

# DataLoader

In [23]:
queryDataloader = queryDataset.loader(
            shuffle=False, batch_size=400 )

In [24]:
intentDataloader = intentDataset.loader(
            shuffle=False, batch_size=400 )

# embed query and intent

In [25]:
from tqdm.notebook import tqdm

In [26]:
model = model.to(args.device)

In [27]:
def embedding(model, dataloader):
    input_list = []
    embedding_result = []
    model.eval()
    with torch.no_grad():   
        for batch_idx, batch in enumerate(tqdm(dataloader)): 
            batch = {key: (item.to(args.device) if type(item) == torch.Tensor else item) for key, item in batch.items()}
            a_embedding = model(batch['a_input_ids'], batch['a_attention_mask'])
            a_sentence = batch['a_sentence']
            input_list.extend(a_sentence)
            embedding_result.append(a_embedding)
    embedding_result = torch.cat(embedding_result, 0) 
    return embedding_result, input_list


In [28]:
query_embedding, query_list = embedding(model=model, dataloader=queryDataloader)

  0%|          | 0/613 [00:00<?, ?it/s]

In [29]:
intent_embedding, intent_list = embedding(model=model, dataloader=intentDataloader)

  0%|          | 0/5 [00:00<?, ?it/s]

In [30]:
import torch.nn.functional as F
k = 3
_, indices = torch.topk(F.normalize(query_embedding, dim =1) @ F.normalize(intent_embedding, dim=1).T, k)

# TOP K HIT RATIO

In [41]:
correct_cnt = 0
for i in range(indices.shape[0]):
    query = idx2query[i]
    answer = query_to_intent[query]
    for j in range(1):
        if idx2intent_nm[indices[i][j]] == answer:
            correct_cnt += 1
    