In [5]:
# pip install sentence_transformers
# pip install -U scikit-learn
# pip install rank_bm25

In [222]:
import torch
from transformers import (
    AdamW,
    AutoModel,
    get_linear_schedule_with_warmup,
    AutoTokenizer,
    AutoConfig
)

In [223]:
import sys
module_path = '/home/x1112436/git/sent-semantic-repo'
sys.path.append(module_path)

In [224]:
from skt.vault_utils import get_secrets
proxies = get_secrets('proxies')

In [225]:
import os 
import numpy as np
import torch

In [226]:
os.environ['http_proxy'] = proxies['http']
os.environ['https_proxy'] = proxies['https']

In [227]:
if torch.cuda.is_available():
    device = 'cuda'
else:
    device = 'cpu'

In [228]:
from src.utils import set_seed
from src.trainer import SimcseTrainer
from src.dataset import DATASET_MAPPING_DICT
from src.utils import PreprocessorFactory 
from src.utils import get_model_argparse
from src.model import MODEL_MAPPING_DICT
from src.model import CONFIG_MAPPING_DICT
from src.logger import Experi_Logger
from config.nli_config import nli_parser_model_args

In [229]:
#base_model_url = "jhlee3421/faq-semantic-klue-roberta-large"
#base_model_url= 'klue/roberta-large'

In [230]:
args = nli_parser_model_args()

In [231]:
args.device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
args.n_gpu = torch.cuda.device_count()
args.output_dir = f'/home/x1112436/result/faq/modelfile/{args.pretrained_model}'
args.log_dir = f'/home/x1112436/result/faq/log/{args.pretrained_model}'
args.experiments_path = f'/home/x1112436/result/faq/experiment/{args.pretrained_model}/experiment.csv'
args.is_preprocessed = True
args.valid_first = False
args.data_type='triple'
args.loss= 'TripletLoss'
args.margin = 0.7
args.num_train_epochs = 20
args.model_max_len = 50
args.valid_first = True

# Load model

In [232]:
#args.pretrained_model = '/home/x1112436/model_file/faq_sent_roberta/sent_roberta'
args.pretrained_model = '/home/x1112436/model_file/sent_roberta'

In [233]:
model = MODEL_MAPPING_DICT['sent_roberta'].from_pretrained(
    args.pretrained_model, **vars(args), 
)
tokenizer = AutoTokenizer.from_pretrained(args.pretrained_model)

# define function

In [234]:
# chunking inetent
def chunking(input_list, chunk_size=200):
    if len(input_list) < 200:
        return [input_list]
    
    result = []
    chunk_length = int(len(input_list) / chunk_size)
    chunk_rest = len(input_list) % chunk_size

    start = 0
    for i in range(0, chunk_length):
        result.append(input_list[start: start + chunk_size])
        start = start + chunk_size
                      
    if chunk_rest > 0 :
        result.append(input_list[start:])
    return result

In [235]:
print(device)

cuda


# Load DATA

In [236]:
from skt.gcp import load_bigquery_ipython_magic, \
                    bq_to_pandas, \
                    get_bigquery_client

In [237]:
dataset = 'x1112436'
log_table = 'faq_table'
query = f"""

SELECT  query,
        answer,
        intent_nm,
        answer,
        domain,
        status
FROM `skt-datahub.{dataset}.{log_table}`
WHERE intent_nm !='' and intent_nm is not null
"""

In [238]:
faq_table = bq_to_pandas(query)

unsupported operand type(s) for /: 'NoneType' and 'int'
Downloading: 100%|[32m██████████[0m|


In [240]:
faq_table.loc[faq_table.answer==''].head(2)             

Unnamed: 0,query,answer,intent_nm,answer_1,domain,status
3,baro 3GB 문의,,baro 3GB,,sms_customer_center,INSERT
4,baro 3GB,,baro 3GB,,sms_customer_center,INSERT


# make corpus

In [241]:
idx2query = list(faq_table['query'].unique())
idx2intent_nm = list(faq_table.intent_nm.unique())

In [242]:
# with open('intent_nm.csv', 'a') as f:
#     for intent in idx2intent_nm):
#         f.write(intent + '\t' + origin + '\n')

In [243]:
faq_table_q_a = faq_table[['query', 'intent_nm']]
query_to_answer = faq_table_q_a.set_index('query').to_dict()['intent_nm']

# Token Count

In [244]:
from collections import Counter, OrderedDict

In [245]:
intent_list = list(faq_table.intent_nm.unique())
query_list = list(faq_table['query'].unique())

In [246]:
query_token_list = []
for query in query_list:
    query_token_list.extend(query.split())

In [247]:
intent_token_list = []
for intent in intent_list:
    intent_token_list.extend(intent.split())

In [248]:
intent_token_list = OrderedDict(Counter(intent_token_list).most_common())

In [249]:
query_token_list = OrderedDict(Counter(query_token_list).most_common())

# SAVE DATA FOR EACH LABEL (INTENT_NM)

In [None]:
for i, intent in enumerate(intent_list):
    file_name = f'../data/label_data/{i}.csv'
    queries = list(faq_table.loc[faq_table.intent_nm == intent]['query'].unique())
    intent = intent.replace("'", "")
    with open(file_name, 'a+') as f:
        f.write(intent + '\n')
        for query in queries:
            f.write(query + '\n')

# one sentence embedding setting

In [250]:
from torch.utils.data import (
    DataLoader, Dataset
)

In [251]:
from dataclasses import dataclass
from typing import List, Any, Union, Dict

In [252]:
@dataclass
class SingleSentenceInput:
    sentence_a: str = None
    a_input_ids: List[int] = None
    a_attention_mask: List[int] = None

In [253]:
class EmbeddingDataset(Dataset):
    def __init__(
            self,
            args,
            features:List[SingleSentenceInput],
            max_length,
            tokenizer,
            **kwargs
    ):
        super(EmbeddingDataset, self).__init__()
        self.args = args
        self.features = features
        self.max_length = max_length
        self.pad_token_id = tokenizer.pad_token_id
        self.sep_token_id = tokenizer.sep_token_id if tokenizer.sep_token_id else tokenizer.eos_token_id

    def __getitem__(self, index) -> Dict[str, Any]:
        feature = self.features[index]
        return {
            'a_sentence': feature.sentence_a,
            'a_input_ids': torch.tensor(feature.a_input_ids, dtype=torch.long),
            'a_attention_mask': torch.tensor(feature.a_attention_mask, dtype=torch.long)
        }
    def __len__(self):
        return len(self.features)
    
    def loader(self, shuffle:bool=True, batch_size:int=64):
        return DataLoader(self, shuffle=shuffle, batch_size=batch_size, collate_fn=self.collater)

    def collater(self, batch: List[Dict[str, Any]]) -> Dict[str, Any]:

        a_sentence = [data['a_sentence'] for data in batch]
        a_input_ids = [data['a_input_ids'] for data in batch]
        a_attention_mask = [data['a_attention_mask'] for data in batch]
        ##  token level encoding
        batch_size = len(batch)
        sizes = [len(s) for s in a_input_ids]
        target_size = min(max(sizes), self.max_length)
        """ torch.full -> creates a tensor of a given shape and fills it with a scalar value self.pad_token_id here"""
        a_collated_ids = torch.full((batch_size, target_size), self.pad_token_id, dtype=torch.long)
        a_collated_attention_masks = torch.zeros((batch_size, target_size), dtype=torch.long)

        """ cut data if size > target_size else: fill by self.pad_token_id """
        for i, (input_id, attention_m, size) in enumerate(
                zip(a_input_ids, a_attention_mask, sizes)):
            diff = target_size - size
            if diff < 0:
                a_collated_ids[i, :target_size] = input_id[:target_size]
                a_collated_ids[i, -1] = self.sep_token_id
                a_collated_attention_masks[i, :target_size] = attention_m[:target_size]

            else:
                a_collated_ids[i, :size] = input_id
                a_collated_attention_masks[i, :size] = attention_m

        return {
            'a_sentence': a_sentence,
            'a_input_ids': a_collated_ids,
            'a_attention_mask': a_collated_attention_masks
        }

In [254]:
from src.utils.abs_preprocess import AbsPreprocessor

class Testprocessor(AbsPreprocessor):

    @classmethod
    def preprocess(cls, tokenizer,  input_list:List) -> None:
        """ try read tsv file using pandas first if [memory or parse] error catched use other reading method  """
    
        feature_list = list()
        skipped_line = 0

        for i, line in enumerate(input_list):
            try:
                a_encoded_sentence = cls.tokenizing(input=line, tokenizer=tokenizer, tokenizer_input=None)
                feature_list.append(
                    SingleSentenceInput(
                        sentence_a = line,
                        a_input_ids = a_encoded_sentence.input_ids,
                        a_attention_mask=a_encoded_sentence.attention_mask,
                    )
                )
            except Exception as e:
                print(f'Error occurs in {i} lines in preprocessing')
                print(line)
                print(e)
                break

        return feature_list


# Kmeans sampling for Representative query for each intent_nm

In [255]:
model = model.to(device)

In [256]:
def encode(chunk_list):
    emedding_dict = dict()
    embedding_list = []
    query_list = []
    for chunk in chunk_list:
        chunk_process = Testprocessor.preprocess(tokenizer = tokenizer, input_list = chunk)
        chunk_dataset = EmbeddingDataset(args=args, features=chunk_process, max_length=args.model_max_len, tokenizer=tokenizer)
        chunk_dataloader = chunk_dataset.loader(shuffle=False, batch_size=400)
        model.eval()
        with torch.no_grad():   
            for batch_idx, batch in enumerate(chunk_dataloader): 
                batch = {key: (item.to(args.device) if type(item) == torch.Tensor else item) for key, item in batch.items()}
                a_embedding = model(batch['a_input_ids'], batch['a_attention_mask'])
                a_sentence = batch['a_sentence']
                query_list.extend(a_sentence)
                embedding_list.append(a_embedding)
            
    embeddings = torch.cat(embedding_list, 0) 
    return embeddings, query_list


In [257]:
embedding_dict = {}
query_dict = {}
for i, intent in enumerate(idx2intent_nm):
    if i % 400 ==0:
        print(i)

    unique_query = faq_table_q_a[faq_table_q_a.intent_nm == intent]
    candidate_query = list(unique_query['query'].unique())
    model_input_list = chunking(candidate_query, chunk_size=300)
    embeddings, queries = encode(model_input_list)    
    intent = intent.strip()
    embedding_dict[intent] = embeddings.cpu().numpy()
    query_dict[intent] = queries

0
400
800
1200
1600


In [258]:
import pickle
embedding_file = '../embedding_result/intent_query_embedding_klue_roberta_large.pkl'
if os.path.exists(embedding_file):
    os.remove(embedding_file)
    
with open(embedding_file, 'wb') as f:
    pickle.dump({
        'query_dict': query_dict,
        'embedding_dict': embedding_dict
    }, f)

# after embedding

In [40]:
import pickle
with open('../embedding_result/intent_query_embedding_klue_roberta_large.pkl', 'rb') as f:
    test = pickle.load(f)
 

In [41]:
query_dict = test['query_dict']
embedding_dict = test['embedding_dict']

In [260]:
import sklearn
from sklearn.cluster import KMeans
import numpy as np

In [259]:
def get_cluster_size(data_size):
    if data_size >= 100:
        cluster_size = 5
    elif (data_size < 100) and (data_size > 30) :
        cluster_size = 3
    else:
        cluster_size = 1
    return cluster_size  

In [None]:
# pip install -U scikit-learn

In [261]:
from sklearn.cluster import KMeans

In [262]:
from tqdm.notebook import tqdm

In [263]:
from collections import defaultdict
sample_prop = 0.4
train_rep_queries = defaultdict(list)
test_rep_queries = defaultdict(list)

for key, value in tqdm(list(embedding_dict.items())):
    intent_nm = key
    query = query_dict[intent_nm]
    #query = [j for sub in query for j in sub]
    data_size = value.shape[0]
    cluster_size = get_cluster_size(data_size)
    sampled_q = []
    neg_sampled_q = []
    if cluster_size > 1:
        kmeans_label = KMeans(n_clusters=cluster_size, random_state=0, n_init="auto", max_iter=1000, init='k-means++').fit_predict(value)
        for i in range(cluster_size):
            c_i = np.where(kmeans_label == i)[0].tolist() 
            n_i = len(c_i)
            sample_i = np.random.choice(c_i, max(int(sample_prop * n_i), 1), replace=False)
            train_sample = sample_i[: int(0.8 * len(sample_i))]
            val_sample = sample_i[int(0.8 * len(sample_i)):]
            sampled_q.extend([query[ind] for ind in train_sample])
            neg_sampled_q.extend([query[ind] for ind in val_sample])
        train_rep_queries[intent_nm] = sampled_q
        test_rep_queries[intent_nm] = neg_sampled_q
    else:
        if len(query) > 10:
            ind = int(np.ceil(1/2 * len(query)))
            train_rep_queries[intent_nm] = query[:ind]
            test_rep_queries[intent_nm] = query[ind:]
        else:
            train_rep_queries[intent_nm] = query
            test_rep_queries[intent_nm] = []

  0%|          | 0/1606 [00:00<?, ?it/s]

# SAVE KMEANS 

In [264]:
import pickle
kmeans_file = '../embedding_result/kmeans_train_test_dataset.pkl'
if os.path.exists(kmeans_file):
    os.remove(kmeans_file)
    
with open(kmeans_file, 'wb') as f:
    pickle.dump({
        'train': train_rep_queries,
        'test': test_rep_queries
    }, f)

# Hard negative sampling based on BM25

In [265]:
train_query_list = []
for key, val in rep_queries.items():
    train_query_list.append(val)

In [266]:
#train_query_list[:5]

In [267]:
#!pip install rank_bm25

In [268]:
from rank_bm25 import BM25Okapi

In [269]:
black_list_words = ['관련', '방법', '서비스']

In [270]:
intent_list = []
for key in query_dict.keys():
    for blacklist in black_list_words:
        key = key.replace(blacklist, '')
    intent_list.append(key)

In [271]:
def tokenizer_white(sent):
  sent = sent.replace("'", "")
  return sent.split(" ")

In [272]:
tokenized_corpus = [tokenizer_white(intent) for intent in intent_list]

In [273]:
bm25 = BM25Okapi(tokenized_corpus)

In [274]:
len(train_rep_queries)

1606

In [275]:
negative_dict = dict()
negative_count = 5
bm25_thres = 6

for intent in train_rep_queries.keys():
    intent = intent.strip()
    neg_list = []
    try:
        neg_candidate = bm25.get_top_n(tokenizer_white(intent), intent_list, n = negative_count * 20 )
        #neg_candidate = [neg for neg in neg_candidate if intent not in neg]
        ## get result_top_k_threshold
        scores = bm25.get_scores(tokenizer_white(intent))
        result_topk = len(scores[np.nonzero(scores > bm25_thres)])
        #topk_neg = neg_candidate[1: 1 + result_topk]
        negatives = neg_candidate[result_topk: result_topk + negative_count]
        if not negatives:
            print(result_topk)
            
    except Exception as e:
        print(intent)
        print(e)
        break
        
    for neg in negatives:
        if (neg in intent) or (intent in neg):
            pass
        else:
            neg_list.append(neg)
            
    negative_dict[intent] = neg_list

# SAVE negative dict and intent_list

In [276]:
import pickle
neg_file = '../data/negative_dict_bm25.pkl'
if os.path.exists(neg_file):
    os.remove(neg_file)
    
with open(neg_file, 'wb') as f:
    pickle.dump(negative_dict, f)

In [277]:
import pickle
intent_file = '../data/intent_list.pkl'
if os.path.exists(intent_file):
    os.remove(intent_file)
    
with open(intent_file, 'wb') as f:
    pickle.dump(intent_list, f)

# Add vocab based on frequency count

In [22]:
candidate_add_intent_token_list = []
candidate_add_query_token_list = []
add_token_count = 20

In [97]:
for key, val in intent_token_list.items():
    if key not in tokenizer.vocab and len(key) > 1:
        #candidate_add_intent_token_list.append("["+ key + "]")
        candidate_add_intent_token_list.append(key)
    if len(candidate_add_intent_token_list) == add_token_count:
        break   

In [50]:
save_path = './data/add_tokens.csv'
with open(save_path, 'a') as f:
    for data in candidate_add_intent_token_list:
        f.write(data + '\n')

In [50]:
word_embedding_model = models.Transformer(base_model_url)

In [51]:
word_embedding_model.tokenizer.add_tokens(candidate_add_intent_token_list, special_tokens=True)

20

In [56]:
word_embedding_model.tokenizer.model_max_length=128

In [57]:
word_embedding_model.auto_model.resize_token_embeddings(len(word_embedding_model.tokenizer))
pooling_model = models.Pooling(word_embedding_model.get_word_embedding_dimension())

In [58]:
model = SentenceTransformer(modules=[word_embedding_model, pooling_model])

In [59]:
model.max_seq_length = 128

# DATASET FOR TRAIN

In [278]:
# from sentence_transformers import SentenceTransformer, SentencesDataset, losses
# from sentence_transformers.readers import InputExample

In [280]:
train_dataset = []
for intent, queries in train_rep_queries.items():
    for query in queries:
        for negative in negative_dict[intent]:
            train_dataset.append([query, intent, negative])
            
print(len(train_dataset))

357847


In [281]:
save_path = '../data/train.csv'
if os.path.exists(save_path):
    os.remove(save_path)

with open(save_path, 'a') as f:
    for data in train_dataset:
        line = '\t'.join(data)
        f.write(line + '\n')


In [282]:
len(test_rep_queries.keys())

1606

In [283]:
val_dataset = []
for intent, queries in test_rep_queries.items():
    if queries:
        for query in queries:
            for negative in negative_dict[intent]:
                val_dataset.append([query, intent, negative])

print(len(val_dataset))

save_path = '../data/val.csv'
if os.path.exists(save_path):
    os.remove(save_path)
            
with open(save_path, 'a') as f:
    for data in val_dataset:
        line = '\t'.join(data)
        f.write(line + '\n')

108063
