In [2]:
import torch
from transformers import (
    AdamW,
    AutoModel,
    get_linear_schedule_with_warmup,
    AutoTokenizer,
    AutoConfig
)

In [3]:
from src.utils import set_seed
from src.trainer import SimcseTrainera
from src.dataset import DATASET_MAPPING_DICT
from src.utils import PreprocessorFactory 
from src.utils import get_model_argparse
from src.model import MODEL_MAPPING_DICT
from src.model import CONFIG_MAPPING_DICT
from src.logger import Experi_Logger
from config.nli_config import nli_parser_model_args

In [4]:
args = nli_parser_model_args()

In [5]:
args.pretrained_model = '/home/x1112436/model_file/faq_sent_roberta/sent_roberta'
#args.pretrained_model = f'/home/x1112436/result/faq/modelfile/{args.pretrained_model}'
args.device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
args.n_gpu = torch.cuda.device_count()
args.output_dir = f'/home/x1112436/result/faq/modelfile/{args.pretrained_model}'
args.log_dir = f'/home/x1112436/result/faq/log/{args.pretrained_model}'
args.experiments_path = f'/home/x1112436/result/faq/experiment/{args.pretrained_model}/experiment.csv'
args.model_max_len = 100
args.is_preprocessed = True
args.valid_first = False
args.data_type='triple'
args.loss= 'TripletLoss'
args.margin = 1.0

In [6]:
args.pretrained_model

'/home/x1112436/model_file/faq_sent_roberta/sent_roberta'

In [7]:
model = MODEL_MAPPING_DICT['sent_roberta'].from_pretrained(
    args.pretrained_model, **vars(args), 
)
tokenizer = AutoTokenizer.from_pretrained(args.pretrained_model)

# LOAD DATA

In [8]:
from skt.gcp import load_bigquery_ipython_magic, \
                    bq_to_pandas, \
                    get_bigquery_client

In [172]:
dataset = 'x1112436'
log_table = 'faq_table'
query = f"""
        SELECT  qry_txt_cont,
                ans_cont,
                intent_nm,
                type
        FROM `skt-datahub.{dataset}.{log_table}`
        WHERE type = 'faq'
    """
#COALESCE(REGEXP_EXTRACT(ans_cont, r"'([^']*)'"), intent_nm) AS intent_nm,

In [173]:
faq_table = bq_to_pandas(query)

query: 
        SELECT  qry_txt_cont,
                ans_cont,
                intent_nm,
                type
        FROM `skt-datahub.x1112436.faq_table`
        WHERE type = 'faq'
    
destination: skt-datahub._775c5ccab1096b3cccd7ac34a5db11c0a354fb07.anon72064c4d9aea8277a8e33f4640e22f213697ace283466db5904aa8d444d33a3d
total_rows: 245417
slot_secs: 1.407

Downloading: 100%|[32m██████████[0m|


In [228]:
intent_nm_ans = faq_table[['intent_nm', 'ans_cont']].drop_duplicates(keep='first')
intent_nm_ans.to_csv('./result/intent_nm.csv', encoding='utf-8-sig', index=False)

In [174]:
idx2query = list(faq_table.qry_txt_cont.unique())
idx2intent_nm = list(faq_table.intent_nm.unique())

In [222]:
faq_table[['qry_txt_cont', 'intent_nm']].to_csv('./result/faq_labeled_data.csv', encoding='utf-8-sig')

In [176]:
faq_table_q_a = faq_table[['qry_txt_cont', 'intent_nm']]
query_to_answer = faq_table_q_a.set_index('qry_txt_cont').to_dict()['intent_nm']

# read real 인입 쿼리

In [177]:
service_query_file = '/home/x1112436/git/sent-semantic-repo/data/service_query/query.csv'

In [178]:
service_queries = []
with open(service_query_file, 'r') as f:
    for data in f:
        line = data.split('|')
        for i in range(5- len(line)):
            line.append('')
        service_queries.append({'query': line[0], 'count': line[1], 'p_rank1': line[2], 'p_rank2': line[3], 'p_rank3': line[4]})
    


In [179]:
import pandas as pd
service_queries_pd = pd.DataFrame(service_queries)

In [180]:
service_queries_pd.head(3)

Unnamed: 0,query,count,p_rank1,p_rank2,p_rank3
0,미납요금 납부 가능일 문의,49194,SKT미납센터연락처(0.4809759),미납이용정지(0.457217725),미납직권해지(0.43127817)
1,미납 문의할게,27234,SKT해지미납센터연락처(0.4995),SKT미납센터연락처(0.4940547),미납직권해지(0.4642235)
2,미납문의할게,1616,SKT미납센터연락처(0.49495625),미납직권해지(0.463400065),미납이용정지(0.4573068)


In [181]:
service_queries = list(service_queries_pd['query'].unique())

In [182]:
len(service_queries)

6756

# INPUT for single embedding

In [183]:
from torch.utils.data import (
    DataLoader, Dataset
)

In [184]:
from dataclasses import dataclass
from typing import List, Any, Union, Dict

In [185]:
@dataclass
class SingleSentenceInput:
    sentence_a: str = None
    a_input_ids: List[int] = None
    a_attention_mask: List[int] = None

In [186]:
class EmbeddingDataset(Dataset):
    def __init__(
            self,
            args,
            features:List[SingleSentenceInput],
            max_length,
            tokenizer,
            **kwargs
    ):
        super(EmbeddingDataset, self).__init__()
        self.args = args
        self.features = features
        self.max_length = max_length
        self.pad_token_id = tokenizer.pad_token_id
        self.sep_token_id = tokenizer.sep_token_id if tokenizer.sep_token_id else tokenizer.eos_token_id

    def __getitem__(self, index) -> Dict[str, Any]:
        feature = self.features[index]
        return {
            'a_sentence': feature.sentence_a,
            'a_input_ids': torch.tensor(feature.a_input_ids, dtype=torch.long),
            'a_attention_mask': torch.tensor(feature.a_attention_mask, dtype=torch.long)
        }
    def __len__(self):
        return len(self.features)
    
    def loader(self, shuffle:bool=True, batch_size:int=64):
        return DataLoader(self, shuffle=shuffle, batch_size=batch_size, collate_fn=self.collater)

    def collater(self, batch: List[Dict[str, Any]]) -> Dict[str, Any]:

        a_sentence = [data['a_sentence'] for data in batch]
        a_input_ids = [data['a_input_ids'] for data in batch]
        a_attention_mask = [data['a_attention_mask'] for data in batch]
        ##  token level encoding
        batch_size = len(batch)
        sizes = [len(s) for s in a_input_ids]
        target_size = min(max(sizes), self.max_length)
        """ torch.full -> creates a tensor of a given shape and fills it with a scalar value self.pad_token_id here"""
        a_collated_ids = torch.full((batch_size, target_size), self.pad_token_id, dtype=torch.long)
        a_collated_attention_masks = torch.zeros((batch_size, target_size), dtype=torch.long)

        """ cut data if size > target_size else: fill by self.pad_token_id """
        for i, (input_id, attention_m, size) in enumerate(
                zip(a_input_ids, a_attention_mask, sizes)):
            diff = target_size - size
            if diff < 0:
                a_collated_ids[i, :target_size] = input_id[:target_size]
                a_collated_ids[i, -1] = self.sep_token_id
                a_collated_attention_masks[i, :target_size] = attention_m[:target_size]

            else:
                a_collated_ids[i, :size] = input_id
                a_collated_attention_masks[i, :size] = attention_m

        return {
            'a_sentence': a_sentence,
            'a_input_ids': a_collated_ids,
            'a_attention_mask': a_collated_attention_masks
        }

In [187]:
from src.utils.abs_preprocess import AbsPreprocessor

class Testprocessor(AbsPreprocessor):

    @classmethod
    def preprocess(cls, tokenizer,  input_list:List) -> None:
        """ try read tsv file using pandas first if [memory or parse] error catched use other reading method  """
    
        feature_list = list()
        skipped_line = 0

        for i, line in enumerate(input_list):
            try:
                a_encoded_sentence = cls.tokenizing(input=line, tokenizer=tokenizer, tokenizer_input=None)
                feature_list.append(
                    SingleSentenceInput(
                        sentence_a = line,
                        a_input_ids = a_encoded_sentence.input_ids,
                        a_attention_mask=a_encoded_sentence.attention_mask,
                    )
                )
            except Exception as e:
                print(f'Error occurs in {i} lines in preprocessing')
                print(line)
                print(e)
                break

        return feature_list

In [188]:
query_test_input = Testprocessor.preprocess(tokenizer = tokenizer ,input_list = idx2query)


KeyboardInterrupt



In [189]:
intent_test_input = Testprocessor.preprocess(tokenizer = tokenizer ,input_list = idx2intent_nm)

In [190]:
service_query_input = Testprocessor.preprocess(tokenizer = tokenizer ,input_list = service_queries)

In [191]:
query_test_dataset = EmbeddingDataset(args=args, features=query_test_input, max_length=args.model_max_len, tokenizer=tokenizer)

NameError: name 'query_test_input' is not defined

In [193]:
intent_test_dataset = EmbeddingDataset(args=args, features=intent_test_input, max_length=args.model_max_len, tokenizer=tokenizer)

In [194]:
service_query_dataset = EmbeddingDataset(args=args, features=service_query_input, max_length=args.model_max_len, tokenizer=tokenizer)

# DataLoader

In [None]:
query_dataloader = query_test_dataset.loader(
            shuffle=False, batch_size=400 )

In [195]:
intent_dataloader = intent_test_dataset.loader(
            shuffle=False, batch_size=400 )

In [196]:
service_query_dataloader = service_query_dataset.loader(shuffle=False, batch_size=400 )

# embed query and intent

In [197]:
from tqdm.notebook import tqdm

In [198]:
model = model.to(args.device)

In [None]:
#next(iter(query_dataloader))

In [None]:
model.eval()
query_list = []
embedding_query = []
with torch.no_grad():   
    for batch_idx, batch in enumerate(tqdm(query_dataloader)): 
        batch = {key: (item.to(args.device) if type(item) == torch.Tensor else item) for key, item in batch.items()}
        a_embedding = model(batch['a_input_ids'], batch['a_attention_mask'])
        a_sentence = batch['a_sentence']
        query_list.extend(a_sentence)
        embedding_query.append(a_embedding)
    embedding_query = torch.cat(embedding_query, 0) 

In [199]:
model.eval()
intent_list = []
embedding_intent = []
with torch.no_grad():   
    for batch_idx, batch in enumerate(tqdm(intent_dataloader)): 
        batch = {key: (item.to(args.device) if type(item) == torch.Tensor else item) for key, item in batch.items()}
        a_embedding = model(batch['a_input_ids'], batch['a_attention_mask'])
        a_sentence = batch['a_sentence']
        intent_list.extend(a_sentence)
        embedding_intent.append(a_embedding)
    embedding_intent = torch.cat(embedding_intent, 0) 


  0%|          | 0/5 [00:00<?, ?it/s]

In [200]:
model.eval()
service_query_list = []
embedding_service_query = []
with torch.no_grad():   
    for batch_idx, batch in enumerate(tqdm(service_query_dataloader)): 
        batch = {key: (item.to(args.device) if type(item) == torch.Tensor else item) for key, item in batch.items()}
        a_embedding = model(batch['a_input_ids'], batch['a_attention_mask'])
        a_sentence = batch['a_sentence']
        service_query_list.extend(a_sentence)
        embedding_service_query.append(a_embedding)
    embedding_service_query = torch.cat(embedding_service_query, 0) 


  0%|          | 0/17 [00:00<?, ?it/s]

In [201]:
import torch.nn.functional as F
values, indices = torch.topk(F.normalize(embedding_service_query, dim =1) @ F.normalize(embedding_intent.T, dim=1), 3)

In [202]:
threshold = 0.45

In [203]:
values[values < threshold] = -1
indices[values < threshold] = -1

In [204]:
indices = indices.cpu().numpy()

In [205]:
predict_dict = dict()
for i in range(embedding_service_query.size()[0]):        
    predict_dict[service_queries[i]] = [(intent_list[indices[i, j]], round(values[i,j].cpu().item(), 3)) for j in range(3) if indices[i,j] != -1]

predict_list = []
for key, value in predict_dict.items():
    query = key
    for i in range(3 - len(value)):
        value.append('')
        
    predict_list.append({'query': query, 'n_rank1': value[0], 'n_rank2': value[1], 'n_rank3': value[2]})
            

In [206]:
predict_pd = pd.DataFrame(predict_list)

In [207]:
join_data = pd.merge(predict_pd, service_queries_pd, on = 'query')

In [210]:
join_data.head(1)

Unnamed: 0,query,n_rank1,n_rank2,n_rank3,count,p_rank1,p_rank2,p_rank3
0,미납요금 납부 가능일 문의,"(요금납부일확인방법, 0.493)","(SKT미납센터연락처, 0.452)","(미납이용정지, 0.451)",49194,SKT미납센터연락처(0.4809759),미납이용정지(0.457217725),미납직권해지(0.43127817)


In [211]:
join_data.to_csv('./result/result.csv', encoding='utf-8-sig')

In [217]:
intent_pd = pd.DataFrame(idx2intent_nm, columns=['intent'])

In [218]:
intent_pd.to_csv('./result/intent.csv', encoding='utf-8-sig')