# add module path

In [1]:
import sys
module_path = '/home/x1112436/git/sent-semantic-repo'
sys.path.append(module_path)

# Read real data

In [10]:
import pandas as pd
import re

In [11]:
root_data = '../data/service_query/query.csv'

In [99]:
def parse(input):
    res = re.search(r'^(.*)(?=\([^()]*\)$)', input)
    if res is None:
        return ''
    return res.group(1)

In [109]:
query_list = []
with open(root_data, 'r') as f:
    for line in f:
        data = line.strip().split('|')
        if (5 - len(data)) > 0:
            for i in range(5- len(data)):
                data.append('')

        if len(data) <5:
            print(data)
            break

        query_list.append({'query': data[0], 'query_cnt': data[1], 'p_rank1': parse(data[2]), 'p_rank2': parse(data[3]), 'p_rank3': parse(data[4])})

In [110]:
query_pd = pd.DataFrame(query_list)

In [14]:
queries = list(query_pd['query'].unique())

# LOAD MODEL

In [2]:
import torch
from transformers import (
    AdamW,
    AutoModel,
    get_linear_schedule_with_warmup,
    AutoTokenizer,
    AutoConfig
)

In [3]:
from src.utils import set_seed
from src.trainer import SimcseTrainer
from src.dataset import DATASET_MAPPING_DICT
from src.utils import PreprocessorFactory 
from src.utils import get_model_argparse
from src.model import MODEL_MAPPING_DICT
from src.model import CONFIG_MAPPING_DICT
from src.logger import Experi_Logger
from config.nli_config import nli_parser_model_args

In [4]:
args = nli_parser_model_args()

In [5]:
args.pretrained_model = '/home/x1112436/model_file/faq_sent_roberta/sent_roberta'
args.device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
args.n_gpu = torch.cuda.device_count()
args.output_dir = f'/home/x1112436/result/faq/modelfile/{args.pretrained_model}'
args.log_dir = f'/home/x1112436/result/faq/log/{args.pretrained_model}'
args.experiments_path = f'/home/x1112436/result/faq/experiment/{args.pretrained_model}/experiment.csv'
args.model_max_len = 100
args.is_preprocessed = True
args.valid_first = False
args.data_type='triple'
args.loss= 'TripletLoss'
args.margin = 1.0

In [6]:
model = MODEL_MAPPING_DICT['sent_roberta'].from_pretrained(
    args.pretrained_model, **vars(args), 
)
tokenizer = AutoTokenizer.from_pretrained(args.pretrained_model)

# load data

In [7]:
from torch.utils.data import (
    DataLoader, Dataset
)

In [8]:
dataprocessor = PreprocessorFactory('test')

In [15]:
query_test_input = dataprocessor.preprocess(tokenizer = tokenizer, input_list = queries)

In [16]:
InferenceDataset = DATASET_MAPPING_DICT['InferenceDataset']

In [17]:
squery_dataset = InferenceDataset(args=args, features=query_test_input, max_length=args.model_max_len, tokenizer=tokenizer)

In [18]:
squery_dataloader = squery_dataset.loader(shuffle=False, batch_size=400 )

In [20]:
#next(iter(squery_dataloader))

In [21]:
from skt.gcp import load_bigquery_ipython_magic, \
                    bq_to_pandas, \
                    get_bigquery_client

In [22]:
dataset = 'x1112436'
log_table = 'faq_table'
query = f"""
        SELECT  qry_txt_cont,
                ans_cont,
                intent_nm,
                type
        FROM `skt-datahub.{dataset}.{log_table}`
        WHERE type = 'faq'
    """

In [23]:
faq_table = bq_to_pandas(query)

query: 
        SELECT  qry_txt_cont,
                ans_cont,
                intent_nm,
                type
        FROM `skt-datahub.x1112436.faq_table`
        WHERE type = 'faq'
    
destination: skt-datahub._775c5ccab1096b3cccd7ac34a5db11c0a354fb07.anon72064c4d9aea8277a8e33f4640e22f213697ace283466db5904aa8d444d33a3d
total_rows: 245417
slot_secs: 1.169

Downloading: 100%|[32m██████████[0m|


In [65]:
faq_table[['intent_nm', 'ans_cont']].drop_duplicates(keep='first').to_csv('../data/label_data/intent_table.csv', encoding='utf-8-sig')

In [26]:
idx2intent_nm = list(faq_table.intent_nm.unique())

In [27]:
intent_nm_input = dataprocessor.preprocess(tokenizer = tokenizer, input_list = idx2intent_nm)

In [28]:
intent_dataset = InferenceDataset(args=args, features=intent_nm_input, max_length=args.model_max_len, tokenizer=tokenizer)

In [29]:
intent_dataloader = intent_dataset.loader(shuffle=False, batch_size=400 )

# Inference

In [30]:
model = model.to(args.device)

In [31]:
from tqdm.notebook import tqdm

In [33]:
model.eval()
squery_list = []
squery_embedding = []
with torch.no_grad():   
    for batch_idx, batch in enumerate(tqdm(squery_dataloader)): 
        batch = {key: (item.to(args.device) if type(item) == torch.Tensor else item) for key, item in batch.items()}
        a_embedding = model(batch['a_input_ids'], batch['a_attention_mask'])
        a_sentence = batch['a_sentence']
        squery_list.extend(a_sentence)
        squery_embedding.append(a_embedding)
    squery_embedding = torch.cat(squery_embedding, 0) 

  0%|          | 0/17 [00:00<?, ?it/s]

In [35]:
model.eval()
intent_list = []
intent_embedding = []
with torch.no_grad():   
    for batch_idx, batch in enumerate(tqdm(intent_dataloader)): 
        batch = {key: (item.to(args.device) if type(item) == torch.Tensor else item) for key, item in batch.items()}
        a_embedding = model(batch['a_input_ids'], batch['a_attention_mask'])
        a_sentence = batch['a_sentence']
        intent_list.extend(a_sentence)
        intent_embedding.append(a_embedding)
    intent_embedding = torch.cat(intent_embedding, 0) 

  0%|          | 0/5 [00:00<?, ?it/s]

In [52]:
import torch.nn.functional as F
values, indices = torch.topk(F.normalize(squery_embedding, dim =1) @ F.normalize(intent_embedding.T, dim=1), 3)

In [53]:
values = values.cpu().numpy()
indices = indices.cpu().numpy()

In [54]:
thres_value = 0.45

In [55]:
indices[values< thres_value] = -1

In [57]:
indices.shape

(6756, 3)

In [79]:
predict_result = []
for i in range(indices.shape[0]):
    query = squery_list[i]
    answer_list = []
    for j in range(3):
        if indices[i, j] != -1:
            answer_list.append(idx2intent_nm[indices[i,j]])
        else:
            answer_list.append('')
    predict_result.append({'query': query, 'n_rank_1':answer_list[0], 'n_rank2':answer_list[1], 'n_rank3':answer_list[2]})
                    
    
    

In [81]:
predict_result_pd = pd.DataFrame(predict_result)

In [111]:
data = pd.merge(query_pd, predict_result_pd, on ='query')

In [112]:
data.head(2)

Unnamed: 0,query,query_cnt,p_rank1,p_rank2,p_rank3,n_rank_1,n_rank2,n_rank3
0,미납요금 납부 가능일 문의,49194,SKT미납센터연락처,미납이용정지,미납직권해지,요금납부일확인방법,SKT미납센터연락처,미납이용정지
1,미납 문의할게,27234,SKT해지미납센터연락처,SKT미납센터연락처,미납직권해지,SKT미납센터연락처,미납이용정지,과납


In [113]:
data['query_cnt'] = data['query_cnt'].astype('int')
data['query_len'] = data['query'].apply(lambda x: len(x))

In [128]:
sampling_df3 = data.loc[(data.p_rank1 != data.n_rank_1) & (data.query_cnt > 2)]

In [124]:
sampling_df = data.loc[(data.p_rank1 != data.n_rank_1) & (data.query_cnt <= 2) & (data.n_rank_1 !='')].sample(n=950)

In [126]:
sampling_df2 = data.loc[(data.p_rank1 != data.n_rank_1) & (data.query_cnt <= 2) & (data.n_rank_1 =='')].sample(n=50)

In [132]:
result = pd.concat([sampling_df3, sampling_df, sampling_df2]).reset_index()

In [136]:
result.to_csv('../result/result_tbl.csv', encoding='utf-8-sig')