In [1]:
import pandas as pd
import sentencepiece as spm
import numpy as np
import os
from konlpy.tag import Mecab

In [2]:
# Data directory under the user's home folder (corpora, CSV splits, pickles).
dir_path = os.environ.get("HOME") + '/aiffel/DATA'

In [3]:
# List the contents of the data directory.
os.listdir(dir_path)

['corpus_dec_8009.txt',
 'corpus_dec_custom_mecab_4009.txt',
 'data_train_spm_4000_0317.pkl',
 'corpus_enc_8009.txt',
 'data_test_custom_msp_4000_0321.pkl',
 'test_results_spm4000.csv',
 'test_v_final_0313.csv',
 'train_v_final_0317.csv',
 'corpus_dec_r0.txt',
 '.ipynb_checkpoints',
 'data_test_msp_4000_0320.pkl',
 'data_test_spm_4000_0317.pkl',
 'data_test_spm_4000.pkl',
 'data_train_spm_4000.pkl',
 'corpus_enc_r0.txt',
 'data_train_msp_4000_0320.pkl',
 'train_v_final_0313.csv',
 'test_results_msp4000.csv',
 'test_v_final_0317.csv',
 'data_train_custom_msp_4000_0321.pkl',
 'corpus_dec_mecab_4009.txt']

In [4]:
# Load the 0317 train / test CSV splits from the data directory.
# NOTE(review): the frame preview shows an 'Unnamed: 0' column, which suggests
# the CSVs were written with their index — consider index_col=0; confirm first.
df_train = pd.read_csv(f'{dir_path}/train_v_final_0317.csv')
df_test = pd.read_csv(f'{dir_path}/test_v_final_0317.csv')

In [9]:
# import tqdm
# m_dial = []
# dial_text = df_train.dial.values
# for i in tqdm.tqdm(dial_text) :
#     temp_ = mecab.morphs(i)
#     m_dial.append(' '.join(temp_))

100%|██████████| 1340262/1340262 [04:05<00:00, 5450.38it/s]


In [13]:
def generate_tokenizer(corpus, vocab_size, lang="en", pad_id=0, bos_id=1, eos_id=2, unk_id=3):
    """Train a BPE SentencePiece model on `corpus` and return a loaded processor.

    The corpus is first written, one item per line, to
    ``$HOME/aiffel/DATA/corpus_{lang}.txt``. The trainer is then run with the
    model prefix ``spm_{lang}`` (a relative path), and the resulting
    ``spm_{lang}.model`` is loaded into a new processor.

    Args:
        corpus: Iterable of sentences; each item is converted with ``str()``.
        vocab_size: Value passed to the trainer's ``--vocab_size`` flag.
        lang: Suffix used for both the corpus file name and the model prefix.
        pad_id, bos_id, eos_id, unk_id: Ids passed to the trainer for the
            padding, begin-of-sentence, end-of-sentence and unknown tokens.

    Returns:
        A ``spm.SentencePieceProcessor`` with ``spm_{lang}.model`` loaded.
    """
    # Dump the corpus to a text file that the trainer can read.
    temp_file = os.getenv('HOME') + f'/aiffel/DATA/corpus_{lang}.txt'

    # Write explicitly as UTF-8 so non-ASCII (e.g. Korean) text does not
    # depend on the platform's default locale encoding.
    with open(temp_file, 'w', encoding='utf-8') as f:
        for row in corpus:
            f.write(str(row) + '\n')

    # Train the SentencePiece model; <jj>,<jd>,<gs>,<cc>,<kw> are registered
    # as user-defined symbols.
    spm.SentencePieceTrainer.Train(
        f'--input={temp_file} --pad_id={pad_id} --bos_id={bos_id} --eos_id={eos_id} \
        --unk_id={unk_id} --model_prefix=spm_{lang} --vocab_size={vocab_size} --model_type=bpe \
        --user_defined_symbols=<jj>,<jd>,<gs>,<cc>,<kw>'   # model_r1
    )
    tokenizer = spm.SentencePieceProcessor()
    tokenizer.Load(f'spm_{lang}.model') # model_r1

    return tokenizer

In [None]:
# Train one BPE SentencePiece model per side with a vocabulary of 8009:
# the `eng` column feeds the encoder model, the `dial` column the decoder model.
enc_tokenizer = generate_tokenizer(corpus=df_train['eng'].values, vocab_size=8009, lang='enc_8009')
dec_tokenizer = generate_tokenizer(corpus=df_train['dial'].values, vocab_size=8009, lang='dec_8009')

In [7]:
# dir_path = os.getenv("HOME") + '/aiffel/saturi/notebook/Model/'

# enc_tokenizer = spm.SentencePieceProcessor()
# enc_tokenizer.Load('spm_enc_8009.model')  # NOTE(review): was 'spm_dec_8009.model' — looks like a copy-paste slip; confirm

# dec_tokenizer = spm.SentencePieceProcessor()
# dec_tokenizer.Load('spm_dec_8009.model')

# # dec_tokenizer_msp = spm.SentencePieceProcessor()
# # dec_tokenizer_msp.Load('spm_dec_mecab_4009.model')

# dec_tokenizer.set_encode_extra_options("bos:eos")

True

In [8]:
# Prepend each row's `reg` value as a "<reg> " tag to its English sentence.
# Caution: `eng` is overwritten in place, so running this cell a second time
# stacks another tag in front of the first one.
for frame in (df_train, df_test):
    frame['eng'] = '<' + frame['reg'] + '> ' + frame['eng']

In [9]:
# Preview the first rows of the training frame.
df_train.head()

Unnamed: 0,reg,topic,eng,dial,tok_len,tok_cat
0,jd,역사,<jd> I've changed my mind a little bit.,생각이 쪼금씩 바뀌드라고,10,1
1,jd,가족,<jd> You know what? Foreigners still get used ...,웜마 알겄는가 애국인들이 그또 달 적응하고,17,1
2,jd,건강,"<jd> All of a sudden, I think I've ripped off ...",아으 갑자기 에전에 맹장 뜯은 게잉 생각난디야.,22,1
3,jd,스타일,<jd> I have a lot of ideas about getting older...,보먼은 좀 품위 있게 나이 들어가야 되겠단 싱각을 참 마이 해요잉.,18,1
4,jd,먹거리,"<jd> It's a little bit of a tastey, but still...",약깐 좀 약깐 맛이가 쪼까 머시기 카긴 한디 그또,18,1


 0.   toks_en     
 1.   toks_dec    
 2.   source_txt  
 3.   target_txt  
 4.   topic       
 5.   reg         

In [10]:
# Peek at the first raw `dial` sentence of the training split.
df_train.dial.values[0]

'생각이 쪼금씩 바뀌드라고 '

In [12]:
# SentencePiece (spm) tokenizer: show the pieces for the first `dial` sentence.
dec_tokenizer.encode_as_pieces(df_train.dial.values[0])

['<s>', '▁', '생각이', '▁', '쪼금씩', '▁', '바뀌드라고', '</s>']

In [21]:
# # msp
# dec_tokenizer_msp.encode_as_pieces(df_train.dial.values[0])

['▁생각', '이', '▁쪼금', '씩', '▁바뀌', '드', '라고']

In [16]:
# # custom msp
# dec_tokenizer.encode_as_pieces(df_train.dial.values[0])

['<s>', '▁생각', '이', '▁쪼금', '씩', '▁바뀌', '드라고', '</s>']

In [22]:
import tqdm
def tokenize_data(df):
    """Encode the `eng` and `dial` columns of `df` and store the ids on `df`.

    Each `eng` value is encoded with the notebook-level `enc_tokenizer` and
    each `dial` value with `dec_tokenizer`. The results are written back to
    `df` in place as the columns `toks_en` and `toks_dec`; nothing is returned.
    A tqdm progress bar tracks the row-by-row encoding.
    """
    sources = df['eng'].values
    targets = df['dial'].values

    # Encode source and target of each row together so one bar covers both.
    encoded_pairs = [
        (enc_tokenizer.encode(src), dec_tokenizer.encode(tgt))
        for src, tgt in tqdm.tqdm(zip(sources, targets), total=len(df))
    ]

    df['toks_en'] = [pair[0] for pair in encoded_pairs]
    df['toks_dec'] = [pair[1] for pair in encoded_pairs]

In [23]:
# Tokenize both splits in place (test first, then train); each call adds the
# toks_en / toks_dec columns to the frame it is given.
tokenize_data(df_test)
tokenize_data(df_train)

100%|██████████| 1500/1500 [00:00<00:00, 3449.42it/s]
100%|██████████| 1340262/1340262 [02:42<00:00, 8251.75it/s]


In [24]:
# Persist the tokenized frames as gzip-compressed pickles in the data directory.
dir_path = os.getenv("HOME") +'/aiffel/DATA'
# `compression` is passed by keyword: pandas has deprecated positional
# arguments after `path` for to_pickle. The '.pkl' suffix does not reveal the
# gzip compression, so pass compression='gzip' to pd.read_pickle when loading.
# NOTE(review): the file names say "custom_msp_4000" while the visible training
# cell builds *_8009 models — confirm which tokenizer produced these frames.
df_train.to_pickle(dir_path + '/data_train_custom_msp_4000_0321.pkl', compression='gzip')
df_test.to_pickle(dir_path + '/data_test_custom_msp_4000_0321.pkl', compression='gzip')