In [3]:
#!pip install sentence-transformers

In [2]:
from sentence_transformers import SentenceTransformer
import torch
from transformers import (
    AdamW,
    AutoModel,
    get_linear_schedule_with_warmup,
    AutoTokenizer,
    AutoConfig
)

In [3]:
from skt.vault_utils import get_secrets
proxies = get_secrets('proxies')

In [4]:
import os 
os.environ['http_proxy'] = proxies['http']
os.environ['https_proxy'] = proxies['https']

In [5]:
model = SentenceTransformer('jhlee3421/faq-semantic-klue-roberta-large', device='cuda')

# data load

In [7]:
from skt.gcp import load_bigquery_ipython_magic, \
                    bq_to_pandas, \
                    get_bigquery_client

In [8]:
dataset = 'x1112436'
log_table = 'faq_table'
query = f"""
        SELECT  qry_txt_cont,
                intent_nm,
                ans_cont,
                type
        FROM `skt-datahub.{dataset}.{log_table}`  AS a
        WHERE type = 'faq'
    """

In [9]:
faq_table = bq_to_pandas(query)

unsupported operand type(s) for /: 'NoneType' and 'int'
Downloading: 100%|[32m██████████[0m|


In [10]:
faq_table.iloc[0].intent_nm

'자동안심T로밍음성'

# preprocessing

In [10]:
#faq_table.loc[(faq_table.in !='') & (faq_table.ans_cont !='.')].reset_index(drop=True)

# Build index vocab

In [11]:
idx2intent_nm = list(faq_table.intent_nm.unique())
intent_nm2idx = {intent_nm : i for i, intent_nm in enumerate(idx2intent_nm)}

In [12]:
faq_table['intent_nm_idx'] = faq_table['intent_nm'].apply(lambda x: intent_nm2idx[x])

In [13]:
idx2query = list(faq_table.qry_txt_cont.unique())
query2idx = {query : i for i, query in enumerate(idx2query)}

In [14]:
faq_table['query_idx'] = faq_table['qry_txt_cont'].apply(lambda x: query2idx[x])

# intent to embedding

In [6]:
# chunking inetent
def chunking(input_list, chunk_size=200):
    result = []
    chunk_length = int(len(input_list) / chunk_size)
    chunk_rest = len(input_list) % chunk_size

    start = 0
    for i in range(0, chunk_length):
        result.append(input_list[start: start + chunk_size])
        start = start + chunk_size
                      
    if chunk_rest > 0 :
        result.append(input_list[start:])
    return result
        

In [16]:
chunk_result = chunking(idx2intent_nm, chunk_size= 200)

In [17]:
import numpy as np

In [22]:
from tqdm.notebook import tqdm

In [24]:
intent_nm_result = []
for i in tqdm(range(len(chunk_result))):
    if i ==0:
        embedding_result = model.encode(chunk_result[i])
    else:
        embedding = model.encode(chunk_result[i])
        embedding_result = np.concatenate((embedding_result, embedding), axis=0)
    intent_nm_result.extend(chunk_result[i])

  0%|          | 0/9 [00:00<?, ?it/s]

In [25]:
len(intent_nm_result)

1609

In [33]:
import pickle
with open('./pickle/intent.pkl', 'wb') as f:
    pickle.dump({
    'idx2intent': idx2intent_nm,
    'intent2idx' : intent_nm2idx
    }, f)

In [34]:
torch.save(torch.from_numpy(embedding_result), './embedding_result/intent.pt')

# query to label

In [20]:
query_answer_table = faq_table[['qry_txt_cont', 'intent_nm']]

In [21]:
query_answer_table.head(2)

Unnamed: 0,qry_txt_cont,intent_nm
0,t로밍 3분,자동안심T로밍음성
1,로밍 현지 통화 요금,로밍통화요금


In [22]:
query_to_answer = query_answer_table.set_index('qry_txt_cont').to_dict()['intent_nm']

# cal metric

In [23]:
query_chunk_list = chunking(idx2query)
query_result = []
query_index_result = []

for chunks in query_chunk_list:
    temp_result = []
    temp_index_result = []
    for chunk in chunks:
        temp_result.append(query_to_answer[chunk])
        temp_index_result.append(intent_nm2idx[query_to_answer[chunk]])
        
    query_result.append(temp_result)
    query_index_result.append(temp_index_result)

In [26]:
len(query_chunk_list)

1228

In [36]:
from torchmetrics.functional.pairwise import pairwise_cosine_similarity

In [234]:
predict_list = []
wrong_count = 0
for i in tqdm(range(len(query_chunk_list))):
    query = query_chunk_list[i]
    query_embed = torch.from_numpy(model.encode(query))
    cos_mat = pairwise_cosine_similarity(query_embed, embedding_result)
    values, indices = torch.topk(cos_mat, k = 1)
    predict_list = indices[:, 0]
    label = torch.tensor(query_index_result[i])
    wrong_count += label[label != predict_list].size()[0]

  0%|          | 0/1228 [00:00<?, ?it/s]

In [27]:
(24.197068403908794 * 1228) / 

29714.0

# Blacklist

In [26]:
embedding_result.shape

(1609, 1024)

In [27]:
black_list_path = '/home/x1112436/faq/blacklist/blacklist.txt'

In [28]:
black_list_sent = set()
with open(black_list_path) as f:
    for data in f:
        line = data.strip()
        black_list_sent.add(line)

In [29]:
black_list_sent = list(black_list_sent)

In [3]:
import torch

In [4]:
torch.__version__

'1.8.1+cu102'

In [1]:
#import torchmetrics

In [2]:
#from torchmetrics.functional.pairwise import pairwise_cosine_similarity

In [31]:
black_list_chunk_result = chunking(black_list_sent, chunk_size= 200)

In [46]:
from tqdm.notebook import tqdm

In [49]:
embedding_result = torch.from_numpy(embedding_result)

In [None]:
predict_list = []
value_list = []
wrong_count = 0
for i in tqdm(range(len(black_list_chunk_result))):
    query = black_list_chunk_result[i]
    query_embed = torch.from_numpy(model.encode(query))
    cos_mat = pairwise_cosine_similarity(query_embed, embedding_result)
    values, indices = torch.topk(cos_mat, k = 1)
    value_list.extend(values[:, 0].tolist())

  0%|          | 0/129 [00:00<?, ?it/s]

# real_query

In [8]:
root_path = '/home/x1112436/faq/service_query'

In [100]:
#file_path = '/home/x1112436/faq/service_query/service_query.csv'
file_path = '/home/x1112436/faq/service_query/recent_q.csv'

In [101]:
data = []
with open(file_path) as f:
    for line in f:
        datum = line.strip().split('|')
        query = datum[0]
        query_cnt = datum[1]
        model_result = datum[2:]
        data.append({'query': query, 'query_cnt':query_cnt, 'model_result': model_result})

In [102]:
import pandas as pd
df = pd.DataFrame(data)

In [112]:
query_list = list(df['query'].unique())

In [113]:
real_query_chunk = chunking(query_list)

In [114]:
from tqdm.notebook import tqdm
import numpy as np

In [115]:
result = []
for i in tqdm(range(len(real_query_chunk))):
    if i ==0:
        embedding_result = model.encode(real_query_chunk[i])
    else:
        embedding = model.encode(real_query_chunk[i])
        embedding_result = np.concatenate((embedding_result, embedding), axis=0)
    result.extend(real_query_chunk[i])

  0%|          | 0/34 [00:00<?, ?it/s]

In [116]:
import pickle
import torch

In [117]:
with open('./pickle/intent.pkl', 'rb') as f:
    intent_resource = pickle.load(f)

In [118]:
idx2intent = intent_resource['idx2intent']

In [119]:
intent_embedding = torch.from_numpy(torch.load('./embedding_result/intent.pkl'))

In [120]:
import torch.nn.functional as F

In [121]:
values, indices = torch.topk(F.normalize(torch.from_numpy(embedding_result), dim=1) @ F.normalize(intent_embedding.T, dim=1), 5)

In [122]:
result = []
for i in range(values.shape[0]):
    query = query_list[i]
    intent_nm_list = []
    value_list = []
    for j in range(values.shape[1]):
        value = np.round(values[i][j].item(),3)
        intent_nm = idx2intent[indices[i][j]]
        intent_nm_list.append(intent_nm+ '(' + str(value) + ')')
    result.append({'query': query, 'a_model_result': intent_nm_list})
    

In [123]:
new_model = pd.DataFrame(result)

In [124]:
df.head(3)

Unnamed: 0,query,query_cnt,model_result
0,미납요금 납부 가능일 문의,49194,"[SKT미납센터연락처(0.4809759), 미납이용정지(0.457217725), 미..."
1,미납 문의할게,27234,"[SKT해지미납센터연락처(0.4995), SKT미납센터연락처(0.4940547), ..."
2,미납문의할게,1616,"[SKT미납센터연락처(0.49495625), 미납직권해지(0.463400065), ..."


In [125]:
final_df = pd.merge(df, new_model, on = 'query')

In [129]:
final_df.tail(200)

Unnamed: 0,query,query_cnt,model_result,a_model_result
6556,감사합니다 연락주시면 좋겠습니다,1,"[휴대폰결제대행사연락처(0.48269505), SK텔레콤고객센터운영시간(0.4657...","[유선인터넷모뎀공유기(0.465), 퀵보이스차단서비스(0.465), 퀵보이스이용방법..."
6557,10일까지 납부가능 합니다,1,"[부가사용금액(0.463944525), 과납(0.45516785), 요금납부일확인방...","[요금납부일확인방법(0.432), 단말기할부금중도납부방법(0.405), 각종업무구비..."
6558,영업장 전번 부탁드립니다,1,"[영업장위치확인(0.4868497), 개통대리점조회방법(0.43437485), 행복...","[영업장위치확인(0.637), 제조사AS센터연락처(0.534), 개통대리점조회방법(..."
6559,타통신사에서는 신호가 먹통이 되면 유심 갈아끼우라고 했는데요 skt와의 절차는 어떻...,1,"[USIM기기변경방법(0.4823117), USIM카드교품방법(0.4812251),...","[USIM기변방법(0.552), USIM카드교품방법(0.524), 로밍중USIM변경..."
6560,네연락해보겠습니다,1,"[휴대폰결제대행사연락처(0.44381103), SK텔레콤고객센터운영시간(0.4045...","[전화상담사연결(0.533), 퀵보이스신청방법(0.425), 퀵보이스해지방법(0.4..."
...,...,...,...,...
6751,부가서비스 기타 4000원도 있네요,1,"[기타요금(0.4781918), 요금부분납부방법(0.44400195), 상품권수납(...","[기타요금(0.709), 음악관련(0.382), 소리관련(0.264), baro4G..."
6752,법인폰입니다,1,"[법인폰요금안내서신청_변경방법(0.48128655), 법인폰청구요금조회방법(0.45...","[법인고객센터(0.63), T끼리법인요금제(0.553), T끼리법인요금제회선_사용자..."
6753,노미지 060630,1,"[060정보료한도제(0.472423525), 060정보료한도제신청방법(0.47041...","[다이렉트5G62(0.328), T우주패스mini(0.326), PASS신용지키미(..."
6754,11월30일에 납부 가능할까요,1,[요금납부일확인방법(0.8642154)],"[단말기할부금중도납부방법(0.454), 요금납부일확인방법(0.419), OK캐쉬백납..."


In [88]:
final_df.to_csv('./result/result.csv', encoding='utf-8-sig')

In [97]:
for intent in idx2intent:
    if '리필쿠폰유효기간' in intent:
        print(intent)

In [99]:
'척척할인' in idx2intent

False

In [None]:
척척할인