# add module path

In [31]:
import sys
# Append the repository checkout to the import search path; presumably this is
# what lets the later `src.*` and `config.*` imports resolve (verify the path
# on the machine running this notebook).
module_path = '/home/x1112436/git/sent-semantic-repo'
sys.path.append(module_path)

# Read real data

In [32]:
import pandas as pd
import re

In [33]:
# Service query log, given relative to the notebook's working directory.
# It is read line by line and split on '|' further down.
root_data = '../data/service_query/query.csv'

In [34]:
def parse(input):
    """Return the text that precedes a trailing ``(...)`` group.

    The string must end with a parenthesised group that contains no nested
    parentheses, e.g. ``'label(0.48)'`` yields ``'label'``. When the string
    does not end with such a group, an empty string is returned.
    """
    matched = re.search(r'^(.*)(?=\([^()]*\)$)', input)
    return '' if matched is None else matched.group(1)

In [35]:
# Parse the pipe-delimited query log into one record per line:
# field 0 is the query text, field 1 its count, fields 2-4 the ranked
# predictions with their trailing "(score)" suffix removed by `parse`.
query_list = []
with open(root_data, 'r') as f:
    for line in f:
        data = line.strip().split('|')
        # Pad short rows so the five positional fields below always exist.
        # A negative repeat count yields an empty list, so long rows are untouched.
        data.extend([''] * (5 - len(data)))

        query_list.append({'query': data[0], 'query_cnt': data[1], 'p_rank1': parse(data[2]), 'p_rank2': parse(data[3]), 'p_rank3': parse(data[4])})

In [36]:
# One row per parsed log line; columns follow the record keys
# (query, query_cnt, p_rank1, p_rank2, p_rank3).
query_pd = pd.DataFrame(query_list)

In [37]:
# Distinct query strings (pandas `unique` keeps first-appearance order).
queries = list(query_pd['query'].unique())

# LOAD MODEL

In [38]:
import torch
from transformers import (
    AdamW,
    AutoModel,
    get_linear_schedule_with_warmup,
    AutoTokenizer,
    AutoConfig
)

In [39]:
from src.utils import set_seed
from src.trainer import SimcseTrainer
from src.dataset import DATASET_MAPPING_DICT
from src.utils import PreprocessorFactory 
from src.utils import get_model_argparse
from src.model import MODEL_MAPPING_DICT
from src.model import CONFIG_MAPPING_DICT
from src.logger import Experi_Logger
from config.nli_config import nli_parser_model_args

In [40]:
# Argument object from the project's NLI config helper; it is extended with
# `device` / `pretrained_model` below and unpacked with `vars(args)` when the
# model is loaded. NOTE(review): the default values come from
# `config.nli_config`, which is not visible here.
args = nli_parser_model_args()

In [162]:
# Use the first CUDA device when one is available, otherwise fall back to CPU.
args.device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
# Checkpoint directory that both the model and the tokenizer are loaded from.
args.pretrained_model = '/home/x1112436/final_result/faq/klue-faq-large-sent_robert/train_Triple_Triple'

In [164]:
# Load the project's 'sent_roberta' model class from the checkpoint, forwarding
# every attribute of `args` as a keyword argument.
model = MODEL_MAPPING_DICT['sent_roberta'].from_pretrained(
    args.pretrained_model, **vars(args), 
)
# The tokenizer comes from the same checkpoint directory as the model.
tokenizer = AutoTokenizer.from_pretrained(args.pretrained_model)

# Load FAQ data from BigQuery

In [10]:
from skt.gcp import load_bigquery_ipython_magic, \
                    bq_to_pandas, \
                    get_bigquery_client

In [43]:
dataset = 'x1112436'
log_table = 'faq_table'
# FAQ rows that carry a non-empty intent name. Each column is selected exactly
# once: listing `answer` twice produced a duplicate column name in the result.
query = f"""

SELECT  TRIM(query) as query,
        answer,
        REPLACE(TRIM(intent_nm), "'", "") as intent_nm,
        domain,
        status
FROM `skt-datahub.{dataset}.{log_table}`
WHERE intent_nm !='' and intent_nm is not null
"""

In [44]:
# Run the query through the `skt.gcp` helper. The result is used below via
# column attribute access, so it is presumably a pandas DataFrame — TODO confirm.
faq_table = bq_to_pandas(query)

unsupported operand type(s) for /: 'NoneType' and 'int'
Downloading: 100%|██████████|


In [45]:
# Distinct intent names in first-appearance order; these are the candidate
# sentences that get embedded and ranked against each service query.
idx2intent_nm = list(faq_table.intent_nm.unique())

# Load service query

In [46]:
# Absolute path to the service query log.
# NOTE(review): presumably the same file as the relative `root_data` path — confirm.
service_query_file = '/home/x1112436/git/sent-semantic-repo/data/service_query/query.csv'

In [47]:
# Load the raw service-query log: one pipe-delimited record per line, with the
# ranked predictions kept verbatim (including their "(score)" suffix).
service_queries = []
with open(service_query_file, 'r') as f:
    for data in f:
        # Drop the line terminator before splitting; otherwise it ends up
        # inside the last field of every record.
        line = data.rstrip('\r\n').split('|')
        # Pad short rows so the five positional fields below always exist.
        line.extend([''] * (5 - len(line)))
        service_queries.append({'query': line[0], 'count': line[1], 'p_rank1': line[2], 'p_rank2': line[3], 'p_rank3': line[4]})

In [48]:
import pandas as pd
# One row per log line; columns: query, count, p_rank1, p_rank2, p_rank3.
service_queries_pd = pd.DataFrame(service_queries)

In [49]:
# Preview the first record to sanity-check the parsed columns.
service_queries_pd.head(1)

Unnamed: 0,query,count,p_rank1,p_rank2,p_rank3
0,미납요금 납부 가능일 문의,49194,SKT미납센터연락처(0.4809759),미납이용정지(0.457217725),미납직권해지(0.43127817)


In [50]:
# Rebinds `service_queries` from the list of record dicts to the list of
# distinct query strings (first-appearance order) that will be embedded.
service_queries = list(service_queries_pd['query'].unique())

# Inference Setting

In [51]:
from torch.utils.data import (
    DataLoader, Dataset
)

In [52]:
from dataclasses import dataclass
from typing import List, Any, Union, Dict

In [53]:
@dataclass
class SingleSentenceInput:
    """Container for one sentence together with its tokenized representation."""
    # Raw sentence text.
    sentence_a: str = None
    # Token ids of the sentence as produced by the tokenizer.
    a_input_ids: List[int] = None
    # Attention mask aligned with `a_input_ids`.
    a_attention_mask: List[int] = None

In [54]:
class EmbeddingDataset(Dataset):
    """Dataset of pre-tokenized single sentences with a padding/truncating collate step.

    Each item exposes the raw sentence plus its token ids and attention mask
    as ``torch.long`` tensors; ``collater`` merges a list of items into one
    right-padded batch.
    """

    def __init__(
            self,
            args,
            features: "List[SingleSentenceInput]",
            max_length,
            tokenizer,
            **kwargs
    ):
        """
        Args:
            args: run configuration; stored but not otherwise read by this class.
            features: items exposing ``sentence_a``, ``a_input_ids`` and
                ``a_attention_mask``.
            max_length: upper bound on the sequence length of a collated batch.
            tokenizer: object providing ``pad_token_id``, ``sep_token_id`` and,
                as a fallback, ``eos_token_id``.
            **kwargs: accepted for call-site compatibility and ignored.
        """
        super().__init__()
        self.args = args
        self.features = features
        self.max_length = max_length
        self.pad_token_id = tokenizer.pad_token_id
        # Compare against None instead of relying on truthiness: 0 is a valid
        # token id and must not trigger the EOS fallback.
        self.sep_token_id = (
            tokenizer.sep_token_id
            if tokenizer.sep_token_id is not None
            else tokenizer.eos_token_id
        )

    def __getitem__(self, index) -> Dict[str, Any]:
        """Return the sentence at ``index`` with ids and mask as long tensors."""
        feature = self.features[index]
        return {
            'a_sentence': feature.sentence_a,
            'a_input_ids': torch.tensor(feature.a_input_ids, dtype=torch.long),
            'a_attention_mask': torch.tensor(feature.a_attention_mask, dtype=torch.long)
        }

    def __len__(self):
        """Return the number of stored features."""
        return len(self.features)

    def loader(self, shuffle: bool = True, batch_size: int = 64):
        """Build a ``DataLoader`` over this dataset that batches with ``collater``."""
        return DataLoader(self, shuffle=shuffle, batch_size=batch_size, collate_fn=self.collater)

    def collater(self, batch: List[Dict[str, Any]]) -> Dict[str, Any]:
        """Pad or truncate the items of ``batch`` to one common length.

        The common length is the longest sequence in the batch, capped at
        ``self.max_length``. Shorter sequences are right-padded with
        ``pad_token_id`` (mask 0); longer ones are cut and their last kept
        position is overwritten with ``sep_token_id``.

        Returns:
            Dict with the list of raw sentences (``a_sentence``) and two
            ``(batch, length)`` long tensors (``a_input_ids``,
            ``a_attention_mask``).
        """
        a_sentence = [item['a_sentence'] for item in batch]
        a_input_ids = [item['a_input_ids'] for item in batch]
        a_attention_mask = [item['a_attention_mask'] for item in batch]

        batch_size = len(batch)
        sizes = [len(ids) for ids in a_input_ids]
        target_size = min(max(sizes), self.max_length)

        # Start from an all-padding batch and copy each sequence into its row.
        a_collated_ids = torch.full((batch_size, target_size), self.pad_token_id, dtype=torch.long)
        a_collated_attention_masks = torch.zeros((batch_size, target_size), dtype=torch.long)

        for row, (ids, mask, size) in enumerate(zip(a_input_ids, a_attention_mask, sizes)):
            if size > target_size:
                # Too long: keep the first `target_size` tokens and close the
                # sequence with the separator token.
                a_collated_ids[row, :target_size] = ids[:target_size]
                a_collated_ids[row, -1] = self.sep_token_id
                a_collated_attention_masks[row, :target_size] = mask[:target_size]
            else:
                # Fits: the remainder of the row keeps the padding id / mask 0.
                a_collated_ids[row, :size] = ids
                a_collated_attention_masks[row, :size] = mask

        return {
            'a_sentence': a_sentence,
            'a_input_ids': a_collated_ids,
            'a_attention_mask': a_collated_attention_masks
        }

In [55]:
from src.utils.abs_preprocess import AbsPreprocessor

class Testprocessor(AbsPreprocessor):
    """Preprocessor that tokenizes plain sentences into ``SingleSentenceInput`` features."""

    @classmethod
    def preprocess(cls, tokenizer,  input_list:List) -> List["SingleSentenceInput"]:
        """Tokenize every sentence of ``input_list``.

        Args:
            tokenizer: tokenizer handed through to ``cls.tokenizing``.
            input_list: sentences to encode, in the order they should be returned.

        Returns:
            One ``SingleSentenceInput`` per successfully processed sentence.
            Processing stops at the first sentence that raises: the error is
            printed and only the features built so far are returned, so the
            result can be shorter than ``input_list``.
        """
        feature_list = list()

        for i, line in enumerate(input_list):
            try:
                a_encoded_sentence = cls.tokenizing(input=line, tokenizer=tokenizer, tokenizer_input=None)
                feature_list.append(
                    SingleSentenceInput(
                        sentence_a=line,
                        a_input_ids=a_encoded_sentence.input_ids,
                        a_attention_mask=a_encoded_sentence.attention_mask,
                    )
                )
            except Exception as e:
                # Best-effort behaviour kept from the original: report the
                # failing line and stop instead of raising.
                print(f'Error occurs in {i} lines in preprocessing')
                print(line)
                print(e)
                break

        return feature_list


In [67]:
def embedding(model, dataloader):
    """Embed every batch of ``dataloader`` with ``model`` in inference mode.

    Relies on two notebook-level names: ``args.device`` (target device for the
    batch tensors) and ``tqdm`` (progress bar).

    Args:
        model: called as ``model(a_input_ids, a_attention_mask)``; its outputs
            are concatenated along dimension 0, so it presumably returns one
            ``(batch, dim)`` tensor per call — TODO confirm.
        dataloader: iterable of dicts with the keys ``a_input_ids``,
            ``a_attention_mask`` and ``a_sentence``.

    Returns:
        Tuple ``(embeddings, sentences)``: the concatenated model outputs and
        the sentences in the order they were processed.

    Raises:
        RuntimeError: from ``torch.cat`` when ``dataloader`` yields no batch.
    """
    input_list = []
    embedding_result = []
    model.eval()
    with torch.no_grad():
        for batch in tqdm(dataloader):
            # Only tensors are moved to the device; the list of raw sentences
            # is passed through unchanged.
            batch = {
                key: (item.to(args.device) if isinstance(item, torch.Tensor) else item)
                for key, item in batch.items()
            }
            a_embedding = model(batch['a_input_ids'], batch['a_attention_mask'])
            input_list.extend(batch['a_sentence'])
            embedding_result.append(a_embedding)
    embedding_result = torch.cat(embedding_result, 0)
    return embedding_result, input_list


In [165]:
# Move the model onto the device selected in `args.device` before inference.
model = model.to(args.device)

# Set Dataloader and Dataset

In [89]:
# Tokenize the distinct intent names into per-sentence features.
intent_input = Testprocessor.preprocess(tokenizer = tokenizer , input_list = idx2intent_nm)

In [90]:
# Tokenize the distinct service queries into per-sentence features.
sq_input = Testprocessor.preprocess(tokenizer = tokenizer ,input_list = service_queries)

In [91]:
# Dataset over the service-query features; batches are capped at `args.model_max_len` tokens.
sqDataset = EmbeddingDataset(args=args, features=sq_input, max_length=args.model_max_len, tokenizer=tokenizer)

In [92]:
# Dataset over the intent-name features; batches are capped at `args.model_max_len` tokens.
intentDataset = EmbeddingDataset(args=args, features=intent_input, max_length=args.model_max_len, tokenizer=tokenizer)

In [93]:
# shuffle=False so the embeddings come out in the same order as the intent list.
intentDataloader = intentDataset.loader(
            shuffle=False, batch_size=400 )

In [94]:
# shuffle=False so the embeddings come out in the same order as the query list.
sqDataloader = sqDataset.loader(shuffle=False, batch_size=400 )

# Inference

In [138]:
from tqdm.notebook import tqdm

In [166]:
# Embed every service query; `s_query_list` holds the sentences in embedding row order.
s_query_embedding, s_query_list = embedding(model=model, dataloader=sqDataloader)

  0%|          | 0/17 [00:00<?, ?it/s]

In [167]:
# Embed every intent name; `intent_list` holds the names in embedding row order.
intent_embedding, intent_list = embedding(model=model, dataloader=intentDataloader)

  0%|          | 0/5 [00:00<?, ?it/s]

# TOP K

In [168]:
import torch.nn.functional as F

# Cosine similarity between every service query and every intent name: rows are
# scaled to unit length first, so the matrix product yields the cosines.
# For each query, keep the three highest-scoring intents.
_sq_unit = F.normalize(s_query_embedding, dim=1)
_intent_unit = F.normalize(intent_embedding, dim=1)
values, indices = torch.topk(torch.matmul(_sq_unit, _intent_unit.T), 3)

In [169]:
# Minimum similarity score a candidate intent needs in order to be kept.
threshold = 0.4

In [170]:
# Build the mask once, before `values` is modified, so scores and indices are
# invalidated at exactly the same positions. Recomputing it after the first
# assignment only matched because the sentinel -1 lies below the threshold.
below_threshold = values < threshold
values[below_threshold] = -1
indices[below_threshold] = -1

In [171]:
# Copy the indices to host memory as a NumPy array for plain Python indexing below.
indices = indices.cpu().numpy()

In [172]:
# Copy the scores to host memory once instead of issuing one `.cpu().item()`
# call per cell.
scores = values.cpu().tolist()

# query sentence -> [(intent name, rounded score), ...] for every candidate
# that survived the threshold (index -1 marks a rejected candidate).
predict_dict = dict()
for sentence, intent_ids, intent_scores in zip(s_query_list, indices, scores):
    predict_dict[sentence] = [
        (intent_list[intent_id], round(score, 3))
        for intent_id, score in zip(intent_ids, intent_scores)
        if intent_id != -1
    ]

# Flatten into one record per query with exactly three rank columns.
# The loop variable is deliberately not called `query`, so the notebook-level
# SQL string of that name is not overwritten.
predict_list = []
for sentence, ranked in predict_dict.items():
    # Pad in place with '' so the three rank slots always exist.
    ranked.extend([''] * (3 - len(ranked)))
    predict_list.append({'query': sentence, 'n_rank1': ranked[0], 'n_rank2': ranked[1], 'n_rank3': ranked[2]})
            

In [173]:
# One row per query; columns: query, n_rank1, n_rank2, n_rank3.
predict_pd = pd.DataFrame(predict_list)

In [174]:
# Preview the first 50 predictions.
predict_pd.head(50)

Unnamed: 0,query,n_rank1,n_rank2,n_rank3
0,미납요금 납부 가능일 문의,"(요금 납부일 확인 방법, 0.779)","(요금 납부일 변경 방법, 0.606)","(요금 수납 내역 확인 방법, 0.602)"
1,미납 문의할게,"(SKT 미납센터 연락처, 0.776)","(가산금, 0.701)","(과납, 0.673)"
2,미납문의할게,"(SKT 미납센터 연락처, 0.788)","(가산금, 0.696)","(과납, 0.653)"
3,데이터 자동선물 신청할래,"(가족 간 데이터 자동 선물하기, 0.701)","(가족 간 데이터 자동 선물 발송 시점, 0.547)","(T끼리 데이터 선물 조르기 방법, 0.487)"
4,로밍 요금제 한눈에 보기,"(로밍 요금 확인 시점, 0.562)","(T로밍 Biz 요금제, 0.473)","(B tv 요금제, 0.41)"
5,음성 잔여량 알려줘,"(일자별 잔여통화안내, 0.614)","(일자별 잔여통화안내 신청 방법, 0.507)","(baro 요금제 잔여량 확인, 0.5)"
6,요금 조회해줘,"(실시간 요금 상세 내역, 0.721)","(요금제문의, 0.694)","(자녀 요금 조회 방법, 0.495)"
7,콘텐츠 이용료 알려줘,"(콘텐츠 이용료 가능 대상, 0.551)","(콘텐츠 이용료 구매내역 확인 방법, 0.543)","(애플 콘텐츠 이용료, 0.451)"
8,미납요금,"(SKT 미납센터 연락처, 0.758)","(미납 이용정지, 0.697)","(가산금, 0.688)"
9,OnePass500 기간형,"(원패스 VIP 기간형, 0.752)","(원패스 기준 시간, 0.403)",


In [175]:
# Export without the index column; the 'utf-8-sig' codec writes a leading BOM
# (presumably so spreadsheet tools detect the encoding of the Korean text).
predict_pd.to_csv('../data/result_triple_triple-large.csv', index=False, encoding='utf-8-sig')

# join and sampling

In [None]:
# `predict_pd` has no "count" column, so the query frequencies are joined in
# from the service-query log first. Counts were read from the file as text and
# a query may occur on several lines, so they are converted to numbers and
# summed per query before being used as sampling weights.
query_counts = (
    service_queries_pd
    .assign(count=pd.to_numeric(service_queries_pd['count'], errors='coerce').fillna(0))
    .groupby('query', as_index=False)['count']
    .sum()
)
joined_pd = predict_pd.merge(query_counts, on='query', how='left')
weighted_sample = joined_pd.sample(n=1100, weights="count")