In [1]:
import pandas as pd
import numpy as np
import os
import glob
from time import perf_counter
import tqdm 

### data load

In [2]:
# Project root under the current user's home directory.
# os.path.expanduser('~') replaces os.getenv('HOME'): getenv returns None when
# HOME is unset, which would fail here with a TypeError from `None + str`.
directory = os.path.expanduser('~') + '/aiffel/aiffelthon'

In [3]:
os.listdir(directory +'/final')

['raw_data_sampling(0223).csv',
 'Train_set_data.csv',
 'Test_set_data_sampling(0223).csv',
 'Train_set_long_sequence.csv']

In [4]:
df = pd.read_csv(directory + '/final/Test_set_data_sampling(0223).csv')

In [5]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 500 entries, 0 to 499
Data columns (total 4 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   text    500 non-null    object
 1   dial    500 non-null    object
 2   reg     500 non-null    object
 3   eng     500 non-null    object
dtypes: object(4)
memory usage: 15.8+ KB


In [6]:
df.reg.value_counts()

jj    100
cc    100
kw    100
jd    100
gs    100
Name: reg, dtype: int64

In [7]:
df.sample(frac=1).head()

Unnamed: 0,text,dial,reg,eng
198,그~ 그~ 그건가 아닌가 모르겠는데 그런 종류 이렇게 패스츄리야 일종이,그~ 그~ 긴가 아닌가 모르겠는데 그런 종류 이케 패스츄리야 일종이,cc,I dont know if its that but its a kind of past...
36,그때는 자연 그대로 걸 봤었지.,그때는 자연 그대로 걸 봤었지게.,jj,I saw the natural one back then
15,이제 십이월 이니까 일월 이월쯤 되면 나올건가?,이제 십이월 이난 일월 이월쯤 되면 나올건가?,jj,Its twelve months now so will it come out by t...
443,그러니까 그렇게는 다 하는 거다.,그니까 그렇게는 다 하는 거다.,gs,That's why we do it all the time.
156,그 마음을 쪼끔 확인할 수 있는 하나의 수단이 되는 것 같아서 좀 기분이 좋더라고 그래서,그 마음을 쪼끔 확인할 수 있는 하나의 수단이 되는 것 같아서 쫌 기분이 좋더라고 그래서,cc,I felt a little bit because it seemed to be a ...


## 가장 적합한 모델 선택

허깅페이스에서 테스트해 볼 만한 7가지 모델을 추출하였다.

In [2]:
# Hugging Face checkpoint names of the seven candidate models; each one is
# passed to top_sts below so their retrieval results and latency can be compared.
model_ckpt = 'snunlp/KR-SBERT-V40K-klueNLI-augSTS'
model_ckpt2= 'Huffon/klue-roberta-base-nli'
model_ckpt3 ='ddobokki/klue-roberta-small-nli-sts'
model_ckpt4 ='beomi/KcELECTRA-base-v2022'
model_ckpt5 = 'lighthouse/mdeberta-v3-base-kor-further'
model_ckpt6 ='klue/bert-base'
model_ckpt7 = 'klue/roberta-small'

In [9]:
# ! pip install sentence_transformers

In [10]:
from sentence_transformers import SentenceTransformer, util
model = SentenceTransformer(model_ckpt)



In [11]:
# Candidate corpus: the `text` column of the loaded CSV as a NumPy array.
sentences = df.text.values
# Query sentence used for every model comparison below.
text = '점심먹으로 가야겠어요'
# Number of nearest sentences to retrieve per query.
k=3

In [12]:
def top_sts(text,sentences,model_ckpt,k):
    """Return the k sentences most similar to `text` and print the search latency.

    The model is loaded before the timer starts, so the printed latency covers
    encoding the corpus, encoding the query and the similarity search only.

    Args:
        text: query sentence.
        sentences: NumPy array of candidate sentences (it is indexed with an
            integer array to build the first return value).
        model_ckpt: checkpoint name or path passed to SentenceTransformer.
        k: number of neighbours to return; clamped to [0, number of sentences].

    Returns:
        (sentences[top_k], top_k, top_res): the selected sentences, their
        positions in `sentences`, and the cosine similarity of each selected
        position (top_res[i] belongs to top_k[i]). The k entries are the
        highest-scoring ones but are not sorted by score.
    """
    model = SentenceTransformer(model_ckpt)
    start_time = perf_counter()
    embeddings = model.encode(sentences,convert_to_tensor=True)
    src_embeddings = model.encode(text,convert_to_tensor=True)
    # Compute the similarity row once and reuse it for both indices and scores.
    scores = util.pytorch_cos_sim(src_embeddings, embeddings).to('cpu').numpy()[0]
    # argpartition raises when k exceeds the array length, and `-0` would
    # select the whole array, so clamp k and special-case zero.
    k = max(0, min(k, len(scores)))
    if k > 0:
        top_k = np.argpartition(scores, -k)[-k:]
    else:
        top_k = np.empty(0, dtype=np.intp)
    # Index the scores with top_k instead of calling np.partition separately:
    # the order inside a partition is undefined, so two independent calls are
    # not guaranteed to line up element by element.
    top_res = scores[top_k]
    latency = perf_counter() - start_time
    print(f'{latency * 1000:.3f} ms')
    return sentences[top_k], top_k, top_res

In [13]:
# model 
top_sts(text,sentences,model_ckpt,k)

5514.486 ms


(array(['그래서 아침 열 시 까지 기다려야 했는데', '이따 따뜻한 거 먹어.',
        '밥을 먹고있었는데 밥 먹고 있었는데 갑자기 무 문별의'], dtype=object),
 array([111, 299,  38]),
 array([0.42118955, 0.47750413, 0.48916396], dtype=float32))

In [14]:
# model2 
top_sts(text,sentences,model_ckpt2,k)

No sentence-transformers model found with name /aiffel/.cache/torch/sentence_transformers/Huffon_klue-roberta-base-nli. Creating a new one with MEAN pooling.
Some weights of the model checkpoint at /aiffel/.cache/torch/sentence_transformers/Huffon_klue-roberta-base-nli were not used when initializing RobertaModel: ['classifier.out_proj.bias', 'classifier.dense.weight', 'classifier.out_proj.weight', 'classifier.dense.bias']
- This IS expected if you are initializing RobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaModel were not initialized from the model checkpoint at /aiffel/.cache/torch/sentence_trans

784.541 ms


(array(['입이 강하니까 그 밥그릇이 딱 맞는 거야 그래서 나 그 밥그릇 좀 사야 되겠어.',
        '상추로 겉절이 해먹어도 맛있더라.', '조금 표준을 세워야 될 거 같다.'], dtype=object),
 array([165, 216, 412]),
 array([0.8589159 , 0.86442435, 0.86646533], dtype=float32))

In [15]:
# model3
top_sts(text,sentences,model_ckpt3,k)

388.078 ms


(array(['이따 따뜻한 거 먹어.', '밥을 먹고있었는데 밥 먹고 있었는데 갑자기 무 문별의',
        '자기는 인제 동료가 그냥 그~ 이제 뭐~ 밥 한 그릇 먹으라 그러니까'], dtype=object),
 array([299,  38, 413]),
 array([0.29013464, 0.40215534, 0.3343038 ], dtype=float32))

In [16]:
# model4 
top_sts(text,sentences,model_ckpt4,k)

No sentence-transformers model found with name /aiffel/.cache/torch/sentence_transformers/beomi_KcELECTRA-base-v2022. Creating a new one with MEAN pooling.
Some weights of the model checkpoint at /aiffel/.cache/torch/sentence_transformers/beomi_KcELECTRA-base-v2022 were not used when initializing ElectraModel: ['discriminator_predictions.dense.bias', 'discriminator_predictions.dense.weight', 'discriminator_predictions.dense_prediction.weight', 'discriminator_predictions.dense_prediction.bias']
- This IS expected if you are initializing ElectraModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing ElectraModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


742.064 ms


(array(['저기 라인이 조금 있는 거 같더라고', '아마 다음 주에 완공식 할 수 있을 것 같은데',
        '볼이 좁아가지고 저랑 잘 안 맞던데'], dtype=object),
 array([324, 262, 429]),
 array([0.87109256, 0.8728123 , 0.8739823 ], dtype=float32))

In [17]:
# model5
top_sts(text,sentences,model_ckpt5,k)

No sentence-transformers model found with name /aiffel/.cache/torch/sentence_transformers/lighthouse_mdeberta-v3-base-kor-further. Creating a new one with MEAN pooling.
Some weights of the model checkpoint at /aiffel/.cache/torch/sentence_transformers/lighthouse_mdeberta-v3-base-kor-further were not used when initializing DebertaV2Model: ['lm_predictions.lm_head.dense.bias', 'lm_predictions.lm_head.LayerNorm.weight', 'lm_predictions.lm_head.LayerNorm.bias', 'lm_predictions.lm_head.bias', 'lm_predictions.lm_head.dense.weight']
- This IS expected if you are initializing DebertaV2Model from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DebertaV2Model from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Speci

1966.247 ms


(array(['입이 강하니까 그 밥그릇이 딱 맞는 거야 그래서 나 그 밥그릇 좀 사야 되겠어.',
        '왜 소린질러 하기 싫음 말지 라면 삶아먹으면 되잖아.', '외양간에 가면.'], dtype=object),
 array([165, 265, 359]),
 array([0.87373066, 0.878375  , 0.88045275], dtype=float32))

In [18]:
# model6
top_sts(text,sentences,model_ckpt6,k)

No sentence-transformers model found with name /aiffel/.cache/torch/sentence_transformers/klue_bert-base. Creating a new one with MEAN pooling.
Some weights of the model checkpoint at /aiffel/.cache/torch/sentence_transformers/klue_bert-base were not used when initializing BertModel: ['cls.predictions.transform.LayerNorm.bias', 'cls.predictions.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.dense.bias', 'cls.seq_relationship.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.decoder.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClass

808.523 ms


(array(['조금 뛰고 싶은데', '이따 따뜻한 거 먹어.', '이따가 한번 끝나서요'], dtype=object),
 array([382, 299,  60]),
 array([0.5977379 , 0.63222516, 0.6699983 ], dtype=float32))

In [19]:
# model7
top_sts(text,sentences,model_ckpt7,k)

No sentence-transformers model found with name /aiffel/.cache/torch/sentence_transformers/klue_roberta-small. Creating a new one with MEAN pooling.
Some weights of the model checkpoint at /aiffel/.cache/torch/sentence_transformers/klue_roberta-small were not used when initializing RobertaModel: ['lm_head.dense.weight', 'lm_head.layer_norm.weight', 'lm_head.bias', 'lm_head.layer_norm.bias', 'lm_head.decoder.weight', 'lm_head.dense.bias', 'lm_head.decoder.bias']
- This IS expected if you are initializing RobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaModel were not initialized from the model checkpoint 

451.526 ms


(array(['나왔었어 #이 근데', '이따가 한번 끝나서요', '이따 따뜻한 거 먹어.'], dtype=object),
 array([ 31,  60, 299]),
 array([0.8369011, 0.8412701, 0.8419575], dtype=float32))

model 3이 NLI 성능도 좋고 추론 시간도 적당하다. 이 모델로 문장 생성을 해야겠다. 물론 추론 시간은 훨씬 더 단축해야 할 것 같다.

## create long sequences

In [20]:
sentences = df.text.values
sentences_eng = df.eng.values
sentences_dial = df.dial.values

### 선아님 참고

In [5]:
import numpy

In [9]:
numpy.mean([0.5,0.6,.5])

0.5333333333333333

In [13]:
(numpy.sum([1,0.5,0.6,.5])-1) / 3

0.5333333333333333

In [None]:
#### Sketch (cell was never executed): pairwise similarity of the sentences within one topic

model = SentenceTransformer(model_ckpt)
sentences = # placeholder: the standard-language sentences belonging to the topic
embeddings = model.encode(sentences,convert_to_tensor=True)
cos_similarity = util.pytorch_cos_sim(embeddings, embeddings).to('cpu').numpy() # e.g. 4 sentences -> 4x4 matrix

# Per topic: mean and standard deviation of the sentence-to-sentence similarity scores

In [21]:
# Return the indices of the sentences whose cosine similarity exceeds the threshold

def long_seq(text,sentences,model_ckpt,k, threshold):
    """Return positions of up to k sentences whose similarity to `text` exceeds `threshold`.

    Args:
        text: query sentence.
        sentences: candidate sentences to search.
        model_ckpt: checkpoint name or path passed to SentenceTransformer.
        k: maximum number of candidates to consider; clamped to
            [0, number of sentences].
        threshold: a candidate is kept only if its cosine similarity is
            strictly greater than this value.

    Returns:
        NumPy integer array of positions in `sentences`. The positions are the
        top-k by similarity (not sorted by score) filtered by the threshold.

    NOTE(review): the model is loaded and the whole corpus re-encoded on every
    call, which dominates the runtime when this is called once per sentence.
    """
    model = SentenceTransformer(model_ckpt)
    embeddings = model.encode(sentences,convert_to_tensor=True)
    src_embeddings = model.encode(text,convert_to_tensor=True)
    # One similarity computation serves both the selection and the filtering.
    scores = util.pytorch_cos_sim(src_embeddings, embeddings).to('cpu').numpy()[0]
    # argpartition raises when k exceeds the array length, and `-0` would
    # select everything, so clamp k and return early for zero.
    k = max(0, min(k, len(scores)))
    if k == 0:
        return np.empty(0, dtype=np.intp)
    top_k = np.argpartition(scores, -k)[-k:]
    # Build the mask from the scores of the selected indices themselves; a
    # separate np.partition call is not guaranteed to return values in the
    # same order as argpartition returns indices.
    return top_k[scores[top_k] > threshold]

In [22]:
# Join the selected sentences with the '<sep>' separator
def create_long_text(text,sentences, sentences_eng, sentences_dial, model_ckpt, k,threshold):
    """Concatenate the sentences selected for `text` in all three parallel corpora.

    long_seq picks the positions; the same positions are then taken from the
    original, English and dialect arrays, reversed, and joined with '<sep>'.

    Returns:
        (joined original text, joined English text, joined dialect text)
    """
    picked = long_seq(text, sentences, model_ckpt, k, threshold)

    def _join_reversed(corpus):
        # Same selection for every corpus so the three outputs stay parallel.
        return '<sep>'.join(corpus[picked][::-1])

    return (
        _join_reversed(sentences),
        _join_reversed(sentences_eng),
        _join_reversed(sentences_dial),
    )

In [23]:
# Try the pipeline on the first 10 rows only before running it on more data
test_df = df.copy()[:10]

In [24]:
eng = test_df.eng.values
txt = test_df.text.values
dial = test_df.dial.values

In [25]:
# Build concatenated sequences for the 10-row sample: every sample sentence is
# used as a query against the same 10 sentences.
eng_long, txt_long, dial_long =[], [], []
k =3 
threshold = 0.2
for i in txt :
    # create_long_text returns (original text, English, dialect), in that order
    t, e,d = create_long_text(i,txt, eng, dial, model_ckpt3, k,threshold)
    eng_long.append(e)
    dial_long.append(d)
    txt_long.append(t)
# Show the first generated sequence in each of the three variants
print(eng_long[0])
print(txt_long[0])
print(dial_long[0])

So youre not taking that picture there are you just columnar joints<sep>Thats right Ill do that
그니까 거기 그 사진 찍는아니야 주상절리 마냥<sep>그렇지 그렇게하겠지
그니까 거기 그 사진 찍는아니 주상절리 마냥<sep>그렇지 겅하겠주게


In [30]:
def generate_long_text(data, model_ckpt, k,threshold, max_queries=5) :
    '''
    Build '<sep>'-joined sequences of similar sentences for the rows of `data`.

    data : dataframe with `text`, `eng`, `dial` and `reg` columns
    model_ckpt : model checkpoint from huggingface, forwarded to create_long_text
    k : max number of sentences to compare after embedding
    threshold : cosine-similarity threshold a sentence must exceed to be concatenated
    max_queries : number of leading sentences used as queries (default 5, the
        value that was previously hard-coded); pass None to use every sentence

    returns (eng_long, txt_long, dial_long, reg) where the first three are lists
    of joined strings, one per query, and reg holds the region of each query
    '''
    # Frames with more than 5000 rows are shuffled (fixed seed) and cut to 3000.
    if len(data) > 5000 :
        data = data.sample(frac=1,random_state=1)
        data = data[ : 3000]
    eng = data.eng.values
    txt = data.text.values
    dial = data.dial.values
    reg = data.reg.values

    eng_long, txt_long, dial_long =[], [], []

    queries = txt if max_queries is None else txt[:max_queries]
    for sentence in tqdm.tqdm(queries) :
        # Use the checkpoint passed by the caller; the previous code ignored
        # the parameter and always read the notebook-level model_ckpt3.
        t, e,d = create_long_text(sentence,txt, eng, dial, model_ckpt, k,threshold)
        eng_long.append(e)
        dial_long.append(d)
        txt_long.append(t)

    # Queries are the leading rows, so their regions are the leading entries of reg.
    length = len(txt_long)
    return eng_long, txt_long, dial_long, reg[:length]

In [27]:
os.listdir(directory + '/final')

['raw_data_sampling(0223).csv',
 'Train_set_data.csv',
 'Test_set_data_sampling(0223).csv',
 'Train_set_long_sequence.csv']

In [28]:
df = pd.read_csv(directory + '/final/Train_set_data.csv')

In [61]:
# Split the data by region
# NOTE(review): the shuffle has no random_state and rebinds `df`, so every
# re-run of this cell produces a different row order.
df = df.sample(frac=1)
df_jj = df.loc[df['reg'] == 'jj']
df_cc = df.loc[df['reg'] == 'cc']
df_jd = df.loc[df['reg'] == 'jd']
df_gs = df.loc[df['reg'] == 'gs']
df_kw = df.loc[df['reg'] == 'kw']
# One dataframe per region, iterated by the generation loop below
df_all = [df_jj,df_cc, df_jd,df_gs,df_kw]

In [31]:
# There is too much data, so only 3000 sentences per region are meant to be generated.
# NOTE: as called here, generate_long_text only uses the first 5 sentences of
# each region as queries (see the 5/5 progress bars in the output).
threshold = 0.1
k = 3

eng_long_final, txt_long_final, dial_long_final = [], [], []

# Collect one frame per region and concatenate once at the end. Growing the
# frame with an all-column outer merge inside the loop is slower and makes the
# row order depend on how the installed pandas version orders join keys.
region_frames = []

for i in df_all :
    eng_long, txt_long, dial_long,regions = generate_long_text(i, model_ckpt3, k,threshold)
    temp_df = pd.DataFrame({'original': txt_long,'dial':dial_long, 'eng' : eng_long, 'reg' : regions})
    region_frames.append(temp_df)

# ignore_index gives the 0..N-1 index that later cells rely on (e.g. long_df['eng'][8]).
if region_frames :
    long_df = pd.concat(region_frames, ignore_index=True)
else :
    long_df = pd.DataFrame( columns = ['original','dial', 'eng', 'reg'])

100%|██████████| 5/5 [00:11<00:00,  2.33s/it]
100%|██████████| 5/5 [00:15<00:00,  3.10s/it]
100%|██████████| 5/5 [00:13<00:00,  2.71s/it]
100%|██████████| 5/5 [00:13<00:00,  2.79s/it]
100%|██████████| 5/5 [00:11<00:00,  2.28s/it]


In [32]:
# long_df.to_csv(directory + '/final/Train_set_long_sequence_final.csv')

In [33]:
long_df['eng'][8]

"Iveried a lot and thank you so much<sep>That's why I appreciate it.<sep>I think I've had a little bit of warm tears."

In [34]:
len(long_df['eng'][8].split())

21

## ONNX to accelerate

In [3]:
from transformers.convert_graph_to_onnx import convert
from transformers import AutoTokenizer



In [4]:
from pathlib import Path
# Load the tokenizer that belongs to the selected checkpoint (model_ckpt3 must
# already be defined by the checkpoint cell).
tokenizer = AutoTokenizer.from_pretrained(model_ckpt3)
# Target file for the exported graph, relative to the working directory.
onnx_model_path =Path('onnx/model.onnx')
# Export the PyTorch checkpoint as a feature-extraction pipeline with ONNX opset 12.
convert(framework='pt',model=model_ckpt3, tokenizer=tokenizer, output=onnx_model_path, opset=12, pipeline_name="feature-extraction")

Downloading (…)okenizer_config.json:   0%|          | 0.00/530 [00:00<?, ?B/s]

Downloading (…)solve/main/vocab.txt:   0%|          | 0.00/248k [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/495k [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/156 [00:00<?, ?B/s]



ONNX opset version set to: 12
Loading pipeline (model: ddobokki/klue-roberta-small-nli-sts, tokenizer: BertTokenizerFast(name_or_path='ddobokki/klue-roberta-small-nli-sts', vocab_size=32000, model_max_length=512, is_fast=True, padding_side='right', truncation_side='right', special_tokens={'bos_token': '[CLS]', 'eos_token': '[SEP]', 'unk_token': '[UNK]', 'sep_token': '[SEP]', 'pad_token': '[PAD]', 'cls_token': '[CLS]', 'mask_token': '[MASK]'}))


Downloading (…)lve/main/config.json:   0%|          | 0.00/778 [00:00<?, ?B/s]

Downloading (…)"pytorch_model.bin";:   0%|          | 0.00/272M [00:00<?, ?B/s]

Using framework PyTorch: 1.9.1+cu111
Found input input_ids with shape: {0: 'batch', 1: 'sequence'}
Found input token_type_ids with shape: {0: 'batch', 1: 'sequence'}
Found input attention_mask with shape: {0: 'batch', 1: 'sequence'}
Found output output_0 with shape: {0: 'batch', 1: 'sequence'}
Found output output_1 with shape: {0: 'batch'}
Ensuring inputs are in correct order
position_ids is not present in the generated input list.
Generated inputs order: ['input_ids', 'attention_mask', 'token_type_ids']


In [38]:
from onnxruntime import (GraphOptimizationLevel, InferenceSession, 
                         SessionOptions)

def create_model_for_provider(model_path, provider="CPUExecutionProvider"): 
    """Open an ONNX Runtime inference session for the model stored at `model_path`.

    The session enables all graph optimizations, uses a single intra-op
    thread, runs on the one requested execution provider, and has provider
    fallback disabled.

    Args:
        model_path: path to the .onnx file (converted with str()).
        provider: ONNX Runtime execution provider name.

    Returns:
        The configured InferenceSession.
    """
    session_opts = SessionOptions()
    session_opts.graph_optimization_level = GraphOptimizationLevel.ORT_ENABLE_ALL
    session_opts.intra_op_num_threads = 1

    ort_session = InferenceSession(str(model_path), session_opts, providers=[provider])
    ort_session.disable_fallback()
    return ort_session

In [39]:
import datasets

In [40]:
def tokenize_batch(batch):
    """Tokenize the `text` field of a batch with the notebook-level `tokenizer`.

    Written for use with `Dataset.map(..., batched=True)`; returns whatever the
    tokenizer returns for the batch of texts.
    """
    texts = batch['text']
    return tokenizer(texts)

In [42]:
# Tokenize every row and drop the original string columns so that only the
# tokenizer outputs remain.
# NOTE(review): `dataset` is not created in any cell visible in this notebook;
# presumably it is a datasets.Dataset built from the training CSV. Define it
# before this cell so that Restart & Run All works.
dataset_enc = dataset.map(tokenize_batch,batched=True, remove_columns=['text','dial','eng','reg'])

Map:   0%|          | 0/1065918 [00:00<?, ? examples/s]

In [44]:
dataset_enc

Dataset({
    features: ['input_ids', 'token_type_ids', 'attention_mask'],
    num_rows: 1065918
})

In [45]:
onnx_model = create_model_for_provider(onnx_model_path)

In [46]:
inputs = dataset_enc[:1]

In [47]:
logits_onnx = onnx_model.run(None,inputs)[0]
logits_onnx.shape

(1, 9, 768)

In [48]:
dataset[:1]

{'text': ['생각이 쪼금씩 바뀌더라고'],
 'dial': ['생각이 쪼금씩 바뀌드라고'],
 'reg': ['jd'],
 'eng': ["I've changed my mind a little bit."]}

In [49]:
class OnnxPipeline :
    """Callable that maps one query to a mean-pooled embedding via an ONNX session.

    `model` must provide `run(output_names, feed_dict)` and `tokenizer` must
    return a mapping of PyTorch tensors when called with return_tensors='pt'.
    """

    def __init__(self, model, tokenizer) :
        self.tokenizer = tokenizer
        self.model = model

    def __call__(self,query) :
        """Tokenize `query`, run the session and average the first output over axis 0.

        The first output of the session is taken for the first batch item
        (presumably a (tokens, hidden) matrix) and averaged over its first
        axis; the attention mask is not used for the pooling.
        """
        encoded = self.tokenizer(query, return_tensors='pt')

        # ONNX Runtime is fed NumPy arrays, so convert every tensor.
        feed = {}
        for name, tensor in encoded.items():
            feed[name] = tensor.cpu().detach().numpy()

        outputs = self.model.run(None, feed)
        token_states = outputs[0][0]
        return token_states.mean(axis=0)

In [50]:
from onnxruntime.quantization import quantize_dynamic, QuantType

# Source graph (the file written by the export cell) and destination of the quantized copy.
model_input = "onnx/model.onnx"
model_output = "onnx/model.quant.onnx"
# Dynamic quantization with signed 8-bit integer weights.
quantize_dynamic(model_input, model_output, weight_type=QuantType.QInt8)

Ignore MatMul due to non constant B: /[MatMul_74]
Ignore MatMul due to non constant B: /[MatMul_79]
Ignore MatMul due to non constant B: /[MatMul_168]
Ignore MatMul due to non constant B: /[MatMul_173]
Ignore MatMul due to non constant B: /[MatMul_262]
Ignore MatMul due to non constant B: /[MatMul_267]
Ignore MatMul due to non constant B: /[MatMul_356]
Ignore MatMul due to non constant B: /[MatMul_361]
Ignore MatMul due to non constant B: /[MatMul_450]
Ignore MatMul due to non constant B: /[MatMul_455]
Ignore MatMul due to non constant B: /[MatMul_544]
Ignore MatMul due to non constant B: /[MatMul_549]


In [51]:
onnx_quantized_model = create_model_for_provider(model_output)

In [53]:
onnx_quantized_model.run(None, inputs)[0][0][0].shape

(768,)

In [54]:
pipe = OnnxPipeline(onnx_quantized_model,tokenizer)

## region 별 데이터 구축

In [80]:
def extract_hidden_states(dataset) :
    """Embed the `text` field of every row with the notebook-level `pipe`.

    Rows are processed one at a time under a tqdm progress bar.

    Returns:
        {'hidden_state': list with one embedding per row, in dataset order}
    """
    pooled = [pipe(row['text']) for row in tqdm.tqdm(dataset)]
    return {'hidden_state': pooled}

In [65]:
# Time how long it takes to embed one region's dataset with the quantized model.
# NOTE(review): dataset_jj is only defined in a later cell of this notebook,
# so this cell fails on a fresh top-to-bottom run.
start_time = perf_counter()
hidden = extract_hidden_states(dataset_jj)
print(f'{ (perf_counter() - start_time) : .2f} s')

 143.91 s


In [66]:
sentences = np.array(dataset_jj['text'])

In [68]:
def top_sts_onnx(text,sentences,model_ckpt,k):
    """Return the k sentences most similar to `text`, using precomputed ONNX embeddings.

    The query is embedded with the notebook-level `pipe` and compared against
    the notebook-level `hidden['hidden_state']`, which must hold one embedding
    per entry of `sentences`, in the same order.

    Args:
        text: query sentence.
        sentences: NumPy array of candidate sentences.
        model_ckpt: unused; kept so the signature matches top_sts.
        k: number of neighbours to return; clamped to [0, number of embeddings].

    Returns:
        (sentences[top_k], top_k, top_res): the selected sentences, their
        positions, and the cosine similarity of each selected position
        (top_res[i] belongs to top_k[i]). The results are not sorted by score.
    """
    start_time = perf_counter()
    src_embeddings = pipe(text)
    # Stack the per-sentence vectors into one array up front; handing the raw
    # Python list to the similarity helper forces a slow element-wise conversion.
    corpus_embeddings = np.asarray(hidden['hidden_state'])
    # Compute the similarity row once and reuse it for both indices and scores.
    scores = util.pytorch_cos_sim(src_embeddings, corpus_embeddings).to('cpu').numpy()[0]
    # argpartition raises when k exceeds the array length, and `-0` would
    # select the whole array, so clamp k and special-case zero.
    k = max(0, min(k, len(scores)))
    if k > 0:
        top_k = np.argpartition(scores, -k)[-k:]
    else:
        top_k = np.empty(0, dtype=np.intp)
    # Scores are looked up through top_k so they are guaranteed to line up with
    # the indices; a separate np.partition call gives no such guarantee.
    top_res = scores[top_k]
    latency = perf_counter() - start_time
    print(f'{latency * 1000:.3f} ms')
    return sentences[top_k], top_k, top_res

In [69]:
top_sts_onnx(text,sentences,model_ckpt,k) #

1123.173 ms


(array(['이마트에서 만나서 거기서 점심 먹고 해결 해서', '그리고 -저- 점심 먹고',
        '밥 하세요 하니까 점심은 어떻게 할것이냐 하니까'], dtype='<U101'),
 array([1899, 3078, 8135]),
 array([0.582674  , 0.72822845, 0.715572  ], dtype=float32))

In [70]:
# generate sentences with the ONNX quantized model

In [111]:
def long_seq_onnx(text,k,threshold):
    """Return positions of up to k precomputed embeddings whose similarity to `text` exceeds `threshold`.

    The query is embedded with the notebook-level `pipe` and compared against
    the notebook-level `hidden['hidden_state']`.

    Args:
        text: query sentence.
        k: maximum number of candidates to consider; clamped to
            [0, number of embeddings].
        threshold: a candidate is kept only if its cosine similarity is
            strictly greater than this value.

    Returns:
        NumPy integer array of positions into the embedded corpus. The
        positions are the top-k by similarity (not sorted by score), filtered
        by the threshold.
    """
    src_embeddings = pipe(text)
    # Stack the per-sentence vectors into one array up front; handing the raw
    # Python list to the similarity helper forces a slow element-wise conversion.
    corpus_embeddings = np.asarray(hidden['hidden_state'])
    # One similarity computation serves both the selection and the filtering.
    scores = util.pytorch_cos_sim(src_embeddings, corpus_embeddings).to('cpu').numpy()[0]
    # argpartition raises when k exceeds the array length, and `-0` would
    # select everything, so clamp k and return early for zero.
    k = max(0, min(k, len(scores)))
    if k == 0:
        return np.empty(0, dtype=np.intp)
    top_k = np.argpartition(scores, -k)[-k:]
    # Mask with the scores of the selected indices themselves; a separate
    # np.partition call is not guaranteed to match argpartition's order.
    return top_k[scores[top_k] > threshold]

In [133]:
def create_long_text_onnx(text,k,threshold):
    """Concatenate the sentences selected for `text` from the notebook-level corpora.

    long_seq_onnx picks the positions; the same positions are taken from the
    notebook-level `sentences`, `sentences_eng` and `sentences_dial` arrays,
    reversed, and joined with '<sep>'.

    Returns:
        (joined original text, joined English text, joined dialect text)
    """
    picked = long_seq_onnx(text, k, threshold)
    corpora = (sentences, sentences_eng, sentences_dial)
    return tuple('<sep>'.join(corpus[picked][::-1]) for corpus in corpora)

In [151]:
def generate_long_text_onnx(data,k,threshold) :
    '''
    Build '<sep>'-joined sequences of similar sentences for every row of `data`,
    using the quantized ONNX pipeline.

    data : dataset supporting column access by name; `text` and `reg` are read
    k : max number of sentences to compare after embedding
    threshold : cosine-similarity threshold a sentence must exceed to be concatenated

    The candidate sentences are not taken from `data`: create_long_text_onnx
    reads the notebook-level `hidden`, `sentences`, `sentences_eng` and
    `sentences_dial`, so the caller must set those for the same region first.

    returns (eng_long, txt_long, dial_long, reg) where the first three are lists
    of joined strings, one per row, and reg holds the region of each row
    '''
    txt = data['text']
    reg = data['reg']

    eng_long, txt_long, dial_long =[], [], []

    for sentence in tqdm.tqdm(txt) :
        # create_long_text_onnx returns (original text, English, dialect)
        t, e,d = create_long_text_onnx(sentence,k,threshold)
        eng_long.append(e)
        dial_long.append(d)
        txt_long.append(t)

    length = len(txt_long)
    return eng_long, txt_long, dial_long, reg[:length]

In [135]:
# Split the training frame by region code (same split as the earlier region cell).
df_jj = df.loc[df['reg'] == 'jj']
df_cc = df.loc[df['reg'] == 'cc']
df_jd = df.loc[df['reg'] == 'jd']
df_gs = df.loc[df['reg'] == 'gs']
df_kw = df.loc[df['reg'] == 'kw']
df_all = [df_jj,df_cc, df_jd,df_gs,df_kw]

In [152]:
# Convert the first 3000 rows of each region to a datasets.Dataset.
# reset_index() is called without drop=True, so the previous index is kept as a column.
dataset_jj = datasets.Dataset.from_pandas(df_jj.iloc[:3000].reset_index())
dataset_cc = datasets.Dataset.from_pandas(df_cc.iloc[:3000].reset_index())
dataset_kw = datasets.Dataset.from_pandas(df_kw.iloc[:3000].reset_index())
dataset_gs = datasets.Dataset.from_pandas(df_gs.iloc[:3000].reset_index())
dataset_jd = datasets.Dataset.from_pandas(df_jd.iloc[:3000].reset_index())
# Rebinds df_all: from here on it holds Dataset objects instead of dataframes.
df_all = [dataset_jj,dataset_cc, dataset_jd,dataset_gs,dataset_kw]

In [153]:
pipe = OnnxPipeline(onnx_quantized_model,tokenizer)

In [154]:
# There is too much data, so only 3000 sentences are generated per region
threshold = 0.2
k = 5

eng_long_final, txt_long_final, dial_long_final = [], [], []

# Collect one frame per region and concatenate once at the end. Growing the
# frame with an all-column outer merge inside the loop is slower and makes the
# row order depend on how the installed pandas version orders join keys.
region_frames = []

for i in df_all :
    # The *_onnx helpers read these notebook-level names, so they are rebound
    # for every region before generating its sequences.
    hidden = extract_hidden_states(i)
    sentences = np.array(i['text'])
    sentences_eng = np.array(i['eng'])
    sentences_dial = np.array(i['dial'])
    eng_long, txt_long, dial_long,regions = generate_long_text_onnx(i,k,threshold)
    temp_df = pd.DataFrame({'original': txt_long,'dial':dial_long, 'eng' : eng_long, 'reg' : regions})
    region_frames.append(temp_df)

# ignore_index gives the 0..N-1 index that later cells rely on (e.g. long_df['eng'][3]).
if region_frames :
    long_df = pd.concat(region_frames, ignore_index=True)
else :
    long_df = pd.DataFrame( columns = ['original','dial', 'eng', 'reg'])

100%|██████████| 3000/3000 [00:44<00:00, 67.16it/s]
100%|██████████| 3000/3000 [14:09<00:00,  3.53it/s]
100%|██████████| 3000/3000 [01:02<00:00, 48.33it/s]
100%|██████████| 3000/3000 [14:25<00:00,  3.47it/s]
100%|██████████| 3000/3000 [00:54<00:00, 55.15it/s]
100%|██████████| 3000/3000 [14:17<00:00,  3.50it/s]
100%|██████████| 3000/3000 [00:58<00:00, 51.68it/s]
100%|██████████| 3000/3000 [14:16<00:00,  3.50it/s]
100%|██████████| 3000/3000 [00:43<00:00, 68.38it/s]
100%|██████████| 3000/3000 [14:15<00:00,  3.51it/s]


In [155]:
long_df.to_csv(directory + '/final/Train_set_long_sequence_finalv1.csv')

In [157]:
long_df.head()

Unnamed: 0,original,dial,eng,reg
0,근데 결국은 요 이노무 개가 이 발 속 위에 있는 거를 어떻게 어쨌는지 물어서 집에...,근데 결국은 요 이노무 개가 이 발 속 우에 이신 거를 어떵사 어떵해산지 물언 집에...,"But after all, the Inomu dog came home asking ...",jj
1,나였으니까 해서 들이지고핸 가서<sep>나 물렸어 그때.<sep>야 나 있었잖아<s...,나쑤니깐 하난 들이지고핸 강<sep>나 물렸네 그때.<sep>야 나 있네<sep>행...,Since it was me I went in and hit it<sep>I was...,jj
2,이것도 &company-name에서 거기서 샀어.<sep> 그렇게 하네<sep>이렇...,이것도 &company-name에서 거기서 산.<sep> 경 햄시네게<sep>이렇게...,I bought this from company name too<sep>That's...,jj
3,못 가져가게 하는거 하면 숨겨서 해서 살며시 그 속에서 그거<sep>조용히 붙어...,못 가져가게 하는거 하면 곱졍 해영 솔짝 그 속에서 그거<sep>속솜해그내 붙어...,If you dont let me take it Ill hide it and liv...,jj
4,강아지한테 한번 물려버리니까 와우<sep>&name약간 너 애착 인형 아니야?<se...,강생이한테 한번 물려부난 와우<sep>&name약간 너 애착 인형 아니?<sep>아...,I got bitten by a dog so wow<sep>Aren't you a ...,jj


In [158]:
long_df.reg.value_counts()

jj    3000
cc    3000
jd    3000
gs    3000
kw    3000
Name: reg, dtype: int64

In [159]:
long_df['eng'][3]

"If you dont let me take it Ill hide it and live in it<sep>You have to stay quietly put it on<sep>He's quiet. He can't say anything.<sep>Yes so eat only the front and stay still in the back<sep>I hid it."