# add module path

In [5]:
import sys
module_path = '/home/x1112436/git/sent-semantic-repo'
sys.path.append(module_path)

# Read real data

In [10]:
import pandas as pd
import re

In [11]:
root_data = '../data/service_query/query.csv'

In [99]:
def parse(input):
    """Return the text that precedes a trailing ``(...)`` group.

    The string must end with a parenthesised group that itself contains no
    parentheses (e.g. ``'name(0.48)'`` -> ``'name'``). When the string does not
    end that way, an empty string is returned.
    """
    matched = re.search(r'^(.*)(?=\([^()]*\)$)', input)
    return matched.group(1) if matched is not None else ''

In [109]:
# Read the pipe-delimited service-query dump into one record per line.
# Fields are read positionally: query | count | rank1 | rank2 | rank3, and the
# rank fields go through parse() before being stored.
query_list = []
# Explicit encoding so the Korean text decodes the same way regardless of the
# machine's locale (assumes the dump is UTF-8 — TODO confirm).
with open(root_data, 'r', encoding='utf-8') as f:
    for line in f:
        data = line.strip().split('|')
        # Pad short rows with empty strings so the positional lookups below
        # cannot raise IndexError (a negative multiplier yields an empty list).
        data.extend([''] * (5 - len(data)))

        query_list.append({
            'query': data[0],
            'query_cnt': data[1],
            'p_rank1': parse(data[2]),
            'p_rank2': parse(data[3]),
            'p_rank3': parse(data[4]),
        })

In [110]:
query_pd = pd.DataFrame(query_list)

In [14]:
queries = list(query_pd['query'].unique())

# LOAD MODEL

In [2]:
import torch
from transformers import (
    AdamW,
    AutoModel,
    get_linear_schedule_with_warmup,
    AutoTokenizer,
    AutoConfig
)

In [6]:
from src.utils import set_seed
from src.trainer import SimcseTrainer
from src.dataset import DATASET_MAPPING_DICT
from src.utils import PreprocessorFactory 
from src.utils import get_model_argparse
from src.model import MODEL_MAPPING_DICT
from src.model import CONFIG_MAPPING_DICT
from src.logger import Experi_Logger
from config.nli_config import nli_parser_model_args

In [7]:
args = nli_parser_model_args()

In [8]:
args.device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
args.pretrained_model = '/home/x1112436/result/faq/modelfile/home/x1112436/model_file/sent_roberta'

In [9]:
model = MODEL_MAPPING_DICT['sent_roberta'].from_pretrained(
    args.pretrained_model, **vars(args), 
)
tokenizer = AutoTokenizer.from_pretrained(args.pretrained_model)

# Load FAQ data from BigQuery

In [10]:
from skt.gcp import load_bigquery_ipython_magic, \
                    bq_to_pandas, \
                    get_bigquery_client

In [11]:
dataset = 'x1112436'
log_table = 'faq_table'
# FAQ rows that carry a non-empty intent name. Each column is selected exactly
# once: a repeated column name would give the resulting DataFrame two columns
# with the same label, making `faq_table.answer` ambiguous.
query = f"""

SELECT  query,
        answer,
        intent_nm,
        domain,
        status
FROM `skt-datahub.{dataset}.{log_table}`
WHERE intent_nm !='' and intent_nm is not null
"""

In [12]:
faq_table = bq_to_pandas(query)

unsupported operand type(s) for /: 'NoneType' and 'int'
Downloading: 100%|[32m██████████[0m|


In [13]:
idx2intent_nm = list(faq_table.intent_nm.unique())

# Load service query

In [26]:
service_query_file = '/home/x1112436/git/sent-semantic-repo/data/service_query/query.csv'

In [27]:
# Read the pipe-delimited service-query dump: query | count | rank1 | rank2 | rank3.
service_queries = []
# Explicit encoding so the Korean text decodes the same way regardless of the
# machine's locale (assumes the dump is UTF-8 — TODO confirm).
with open(service_query_file, 'r', encoding='utf-8') as f:
    for data in f:
        # Strip before splitting, as the other loader of this file does;
        # otherwise the last field keeps the line's trailing newline.
        line = data.strip().split('|')
        # Pad short rows so the positional lookups below cannot raise IndexError.
        line.extend([''] * (5 - len(line)))
        service_queries.append({
            'query': line[0],
            'count': line[1],
            'p_rank1': line[2],
            'p_rank2': line[3],
            'p_rank3': line[4],
        })

In [28]:
import pandas as pd
service_queries_pd = pd.DataFrame(service_queries)

In [30]:
service_queries_pd.head(1)

Unnamed: 0,query,count,p_rank1,p_rank2,p_rank3
0,미납요금 납부 가능일 문의,49194,SKT미납센터연락처(0.4809759),미납이용정지(0.457217725),미납직권해지(0.43127817)


In [None]:
service_queries = list(service_queries_pd['query'].unique())

# Inference Setting

In [14]:
from torch.utils.data import (
    DataLoader, Dataset
)

In [15]:
from dataclasses import dataclass
from typing import List, Any, Union, Dict

In [16]:
@dataclass
class SingleSentenceInput:
    """One tokenized sentence: the raw text plus its token ids and attention mask.

    Every field defaults to ``None``, so an instance can be created empty and
    filled in later.
    """
    # Raw sentence text.
    sentence_a: Union[str, None] = None
    # Token ids of ``sentence_a``.
    a_input_ids: Union[List[int], None] = None
    # Attention mask paired with ``a_input_ids``.
    a_attention_mask: Union[List[int], None] = None

In [17]:
class EmbeddingDataset(Dataset):
    """Dataset of single tokenized sentences with its own padding collate function.

    Each feature must expose ``sentence_a``, ``a_input_ids`` and
    ``a_attention_mask``. Batches produced by :meth:`loader` are padded to the
    longest sequence in the batch, capped at ``max_length``.
    """

    def __init__(
            self,
            args,
            features: List["SingleSentenceInput"],
            max_length,
            tokenizer,
            **kwargs
    ):
        """Store the features and the special token ids needed for collation.

        Args:
            args: stored on the instance as-is; not read by this class.
            features: tokenized sentences to serve.
            max_length: maximum number of tokens per sequence in a collated batch.
            tokenizer: object exposing ``pad_token_id``, ``sep_token_id`` and
                ``eos_token_id``.
            **kwargs: accepted for call compatibility and ignored.
        """
        super(EmbeddingDataset, self).__init__()
        self.args = args
        self.features = features
        self.max_length = max_length
        self.pad_token_id = tokenizer.pad_token_id
        # Fall back to EOS only when the tokenizer has no SEP token at all.
        # A truthiness test would wrongly discard a legitimate SEP id of 0.
        sep_id = tokenizer.sep_token_id
        self.sep_token_id = sep_id if sep_id is not None else tokenizer.eos_token_id

    def __getitem__(self, index) -> Dict[str, Any]:
        """Return the feature at ``index`` with ids and mask as long tensors."""
        feature = self.features[index]
        return {
            'a_sentence': feature.sentence_a,
            'a_input_ids': torch.tensor(feature.a_input_ids, dtype=torch.long),
            'a_attention_mask': torch.tensor(feature.a_attention_mask, dtype=torch.long)
        }

    def __len__(self):
        """Number of features in the dataset."""
        return len(self.features)

    def loader(self, shuffle:bool=True, batch_size:int=64):
        """Build a DataLoader over this dataset that collates with :meth:`collater`.

        ``shuffle`` defaults to True; pass ``shuffle=False`` when the output
        order has to match the input order (e.g. when embedding for inference).
        """
        return DataLoader(self, shuffle=shuffle, batch_size=batch_size, collate_fn=self.collater)

    def collater(self, batch: List[Dict[str, Any]]) -> Dict[str, Any]:
        """Pad or truncate a list of items into rectangular batch tensors.

        Returns a dict with the raw sentences (``a_sentence``) and two long
        tensors of shape ``(len(batch), target_size)``, where ``target_size`` is
        the longest sequence in the batch capped at ``self.max_length``.
        """
        a_sentence = [data['a_sentence'] for data in batch]
        a_input_ids = [data['a_input_ids'] for data in batch]
        a_attention_mask = [data['a_attention_mask'] for data in batch]

        batch_size = len(batch)
        sizes = [len(s) for s in a_input_ids]
        target_size = min(max(sizes), self.max_length)

        # Start from an all-padding id matrix and an all-zero mask, then copy
        # each sequence into its row.
        a_collated_ids = torch.full((batch_size, target_size), self.pad_token_id, dtype=torch.long)
        a_collated_attention_masks = torch.zeros((batch_size, target_size), dtype=torch.long)

        for i, (input_id, attention_m, size) in enumerate(
                zip(a_input_ids, a_attention_mask, sizes)):
            if size > target_size:
                # Too long: keep the first target_size tokens and overwrite the
                # last kept position with the separator id.
                a_collated_ids[i, :target_size] = input_id[:target_size]
                a_collated_ids[i, -1] = self.sep_token_id
                a_collated_attention_masks[i, :target_size] = attention_m[:target_size]
            else:
                # Fits: copy as-is; the remainder of the row stays padding / zero.
                a_collated_ids[i, :size] = input_id
                a_collated_attention_masks[i, :size] = attention_m

        return {
            'a_sentence': a_sentence,
            'a_input_ids': a_collated_ids,
            'a_attention_mask': a_collated_attention_masks
        }

In [18]:
from src.utils.abs_preprocess import AbsPreprocessor

class Testprocessor(AbsPreprocessor):
    """Preprocessor that turns a list of raw sentences into ``SingleSentenceInput`` features."""

    @classmethod
    def preprocess(cls, tokenizer,  input_list:List) -> List["SingleSentenceInput"]:
        """Tokenize every sentence in ``input_list``.

        Args:
            tokenizer: tokenizer handed to ``cls.tokenizing``.
            input_list: sentences to tokenize, in order.

        Returns:
            One ``SingleSentenceInput`` per successfully tokenized sentence, in
            input order. Processing stops at the first sentence that raises, so
            on error the result is shorter than ``input_list``; the failing
            index, sentence and exception are printed.
        """
        feature_list = list()

        for i, line in enumerate(input_list):
            try:
                # `tokenizing` is not defined in this class (presumably inherited
                # from AbsPreprocessor); its result is read via `.input_ids` and
                # `.attention_mask`.
                a_encoded_sentence = cls.tokenizing(input=line, tokenizer=tokenizer, tokenizer_input=None)
                feature_list.append(
                    SingleSentenceInput(
                        sentence_a = line,
                        a_input_ids = a_encoded_sentence.input_ids,
                        a_attention_mask=a_encoded_sentence.attention_mask,
                    )
                )
            except Exception as e:
                print(f'Error occurs in {i} lines in preprocessing')
                print(line)
                print(e)
                break

        return feature_list


In [19]:
def encode(chunk_list, batch_size: int = 400):
    """Embed every sentence of every chunk with the notebook-level ``model``.

    Relies on the globals ``model``, ``tokenizer`` and ``args``
    (``args.model_max_len`` and ``args.device`` are read).

    Args:
        chunk_list: iterable of chunks; each chunk is a list of sentences passed
            to ``Testprocessor.preprocess``.
        batch_size: DataLoader batch size. Defaults to 400, the value that was
            previously hard-coded.

    Returns:
        ``(embeddings, query_list)``: the model outputs of all batches
        concatenated along dim 0, and the sentences in the same order.

    Raises:
        Whatever ``torch.cat`` raises for an empty list when ``chunk_list``
        yields no batches.
    """
    embedding_list = []
    query_list = []

    # Switching to eval mode does not depend on the chunk, so do it once.
    model.eval()
    for chunk in chunk_list:
        chunk_process = Testprocessor.preprocess(tokenizer=tokenizer, input_list=chunk)
        chunk_dataset = EmbeddingDataset(args=args, features=chunk_process, max_length=args.model_max_len, tokenizer=tokenizer)
        # shuffle=False keeps embeddings aligned with the sentence order.
        chunk_dataloader = chunk_dataset.loader(shuffle=False, batch_size=batch_size)
        with torch.no_grad():
            for batch in chunk_dataloader:
                # Only tensors are moved to the device; the sentence list stays as-is.
                batch = {key: (item.to(args.device) if isinstance(item, torch.Tensor) else item) for key, item in batch.items()}
                a_embedding = model(batch['a_input_ids'], batch['a_attention_mask'])
                query_list.extend(batch['a_sentence'])
                embedding_list.append(a_embedding)

    embeddings = torch.cat(embedding_list, 0)
    return embeddings, query_list


In [None]:
# Move the model to the device the batches are sent to (`args.device`).
# A bare `device` name is never defined in this notebook and raises NameError.
model = model.to(args.device)

# Set Dataloader and Dataset

In [None]:
intent_input = Testprocessor.preprocess(tokenizer = tokenizer , input_list = idx2intent_nm)

In [None]:
sq_input = Testprocessor.preprocess(tokenizer = tokenizer ,input_list = service_queries)

In [None]:
sqDataset = EmbeddingDataset(args=args, features=sq_input, max_length=args.model_max_len, tokenizer=tokenizer)

In [None]:
intentDataset = EmbeddingDataset(args=args, features=intent_input, max_length=args.model_max_len, tokenizer=tokenizer)

In [None]:
intentDataloader = intentDataset.loader(
            shuffle=False, batch_size=400 )

In [None]:
sqDataloader = sqDataset.loader(shuffle=False, batch_size=400 )

# Inference

In [None]:
# tqdm drives the progress bar below but is not imported anywhere above.
try:
    from tqdm.auto import tqdm
except ImportError:  # the progress bar is cosmetic; fall back to plain iteration
    def tqdm(iterable, *_args, **_kwargs):
        return iterable

# Embed every service query; `service_query_list` records the sentences in the
# same order as the rows of `embedding_service_query`.
model.eval()
service_query_list = []
sq_embedding_batches = []  # per-batch outputs, concatenated after the loop
with torch.no_grad():
    for batch in tqdm(sqDataloader):
        # Only tensors are moved to the device; the sentence list stays as-is.
        batch = {key: (item.to(args.device) if isinstance(item, torch.Tensor) else item) for key, item in batch.items()}
        a_embedding = model(batch['a_input_ids'], batch['a_attention_mask'])
        service_query_list.extend(batch['a_sentence'])
        sq_embedding_batches.append(a_embedding)
    embedding_service_query = torch.cat(sq_embedding_batches, 0)


In [None]:
# tqdm drives the progress bar below but is not imported anywhere above.
try:
    from tqdm.auto import tqdm
except ImportError:  # the progress bar is cosmetic; fall back to plain iteration
    def tqdm(iterable, *_args, **_kwargs):
        return iterable

# Embed every intent name; `intent_list` records the sentences in the same
# order as the rows of `embedding_intent`.
model.eval()
intent_list = []
intent_embedding_batches = []  # per-batch outputs, concatenated after the loop
with torch.no_grad():
    for batch in tqdm(intentDataloader):
        # Only tensors are moved to the device; the sentence list stays as-is.
        batch = {key: (item.to(args.device) if isinstance(item, torch.Tensor) else item) for key, item in batch.items()}
        a_embedding = model(batch['a_input_ids'], batch['a_attention_mask'])
        intent_list.extend(batch['a_sentence'])
        intent_embedding_batches.append(a_embedding)
    embedding_intent = torch.cat(intent_embedding_batches, 0)


# TOP K

In [None]:
import torch.nn.functional as F
# Both embedding matrices are normalized along dim 1 before the matrix product,
# so each entry is the cosine similarity between one service query (row) and one
# intent (column). topk keeps the 3 largest similarities per service query and
# their intent column indices.
values, indices = torch.topk(F.normalize(embedding_service_query, dim =1) @ F.normalize(embedding_intent, dim=1).T, 3)

In [None]:
threshold = 0.4

In [None]:
# Compute the mask once, before either tensor is modified. Recomputing it from
# the already-overwritten `values` only works while the sentinel (-1) is itself
# below `threshold`.
below_threshold = values < threshold
values[below_threshold] = -1
indices[below_threshold] = -1

In [None]:
indices = indices.cpu().numpy()

In [None]:
# Map each service query to its surviving (intent, score) pairs.
# Queries are taken from `service_query_list`, which was collected batch by
# batch together with the embeddings, so row i of `indices` / `values` always
# describes service_query_list[i].
scores = values.cpu().tolist()  # one device-to-host copy instead of one per element
predict_dict = dict()
for i, service_query in enumerate(service_query_list):
    predict_dict[service_query] = [
        (intent_list[indices[i, j]], round(scores[i][j], 3))
        for j in range(3)
        if indices[i, j] != -1  # -1 marks candidates dropped by the threshold
    ]

# Flatten to one row per query, padding missing ranks with '' so that every row
# has n_rank1..n_rank3. The loop variable is not called `query`, which would
# overwrite the SQL string defined earlier in the notebook.
predict_list = []
for service_query, ranked in predict_dict.items():
    ranked.extend([''] * (3 - len(ranked)))
    predict_list.append({'query': service_query, 'n_rank1': ranked[0], 'n_rank2': ranked[1], 'n_rank3': ranked[2]})
            

In [None]:
predict_pd = pd.DataFrame(predict_list)

# join and sampling

In [20]:
import pandas as pd

In [22]:
result1 = pd.read_csv('../result/result_v1.csv')
result2 = pd.read_csv('../result/result_v2.csv')

In [23]:
result1

Unnamed: 0,query,n_rank1,n_rank2,n_rank3,count,p_rank1,p_rank2,p_rank3
0,미납요금 납부 가능일 문의,"('SKT 미납센터 연락처', 0.884)","('미납 이용정지', 0.874)","('과납', 0.858)",49194,SKT미납센터연락처(0.4809759),미납이용정지(0.457217725),미납직권해지(0.43127817)
1,미납 문의할게,"('SKT 미납센터 연락처', 0.97)","('미납 이용정지', 0.935)","('과납', 0.881)",27234,SKT해지미납센터연락처(0.4995),SKT미납센터연락처(0.4940547),미납직권해지(0.4642235)
2,미납문의할게,"('SKT 미납센터 연락처', 0.956)","('미납 이용정지', 0.903)","('과납', 0.9)",1616,SKT미납센터연락처(0.49495625),미납직권해지(0.463400065),미납이용정지(0.4573068)


In [25]:
query = '미납 문의할게'
query.replace(' ', '')

'미납문의할게'