In [3]:
pip install soynlp

Note: you may need to restart the kernel to use updated packages.


In [28]:
pip install sacrebleu

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
Collecting sacrebleu
  Downloading sacrebleu-2.4.3-py3-none-any.whl (103 kB)
     |████████████████████████████████| 103 kB 5.3 MB/s            
Installing collected packages: sacrebleu
Successfully installed sacrebleu-2.4.3
Note: you may need to restart the kernel to use updated packages.


In [29]:
import os
import re
import json
import random
import sacrebleu
import numpy as np
import pandas as pd
import urllib.request
import tensorflow as tf
from bs4 import BeautifulSoup
import matplotlib.pyplot as plt
from soynlp.word import WordExtractor
from soynlp.tokenizer import LTokenizer
from soynlp.tokenizer import MaxScoreTokenizer
from soynlp import DoublespaceLineCorpus
from tensorflow.keras.optimizers import Adam
from transformers import AutoTokenizer, TFAutoModel
import warnings

In [14]:
# Single source of truth for the checkpoint so the model and its tokenizer
# can never be loaded from two different names by accident.
MODEL_NAME = 'sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2'

# output_hidden_states=True makes the model also return every layer's hidden states.
bert_model = TFAutoModel.from_pretrained(MODEL_NAME, output_hidden_states=True)
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)

Downloading:   0%|          | 0.00/645 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/449M [00:00<?, ?B/s]

All model checkpoint layers were used when initializing TFBertModel.

All the layers of TFBertModel were initialized from the model checkpoint at sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFBertModel for predictions without further training.


Downloading:   0%|          | 0.00/480 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/8.66M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/239 [00:00<?, ?B/s]

In [3]:
CORPUS_URL = "https://raw.githubusercontent.com/lovit/soynlp/master/tutorials/2016-10-20.txt"
CORPUS_PATH = "2016-10-20.txt"

# Download only when the corpus is not already on disk, so re-running the
# notebook does not fetch the file again. The download goes to a temporary
# ".part" name first and is renamed on success, so an interrupted transfer is
# never mistaken for the complete corpus on the next run.
if not os.path.exists(CORPUS_PATH):
    partial_path = CORPUS_PATH + ".part"
    urllib.request.urlretrieve(CORPUS_URL, filename=partial_path)
    os.replace(partial_path, CORPUS_PATH)

corpus = DoublespaceLineCorpus(CORPUS_PATH)

In [4]:
# Train soynlp's WordExtractor on the downloaded corpus and collect the
# per-word score table it extracts (consumed by the tokenizer cell below).
word_extractor = WordExtractor()
word_extractor.train(corpus)
word_score_table = word_extractor.extract()

training was done. used memory 2.049 Gb
all cohesion probabilities was computed. # words = 223348
all branching entropies was computed # words = 361598
all accessor variety was computed # words = 361598


In [7]:
# Reduce each word's score record to its forward cohesion value and hand the
# resulting word -> score mapping to the MaxScoreTokenizer.
scores = {word:score.cohesion_forward for word, score in word_score_table.items()}
maxscore_tokenizer = MaxScoreTokenizer(scores=scores)

In [8]:
def data_import(data_name):
    """Load a CSV file from the project data directory into a DataFrame.

    The file is looked up under ``<home>/aiffel/project_data/dlthon/``.

    Parameters
    ----------
    data_name : str
        File name (relative to the data directory), e.g. ``'test.csv'``.

    Returns
    -------
    pandas.DataFrame
        The parsed CSV contents.

    Raises
    ------
    FileNotFoundError
        If the file does not exist at the resolved location.
    """
    # os.getenv('HOME') is None when HOME is not set (e.g. on Windows), which
    # used to fail with a TypeError on string concatenation. HOME still wins
    # when present; otherwise fall back to the platform's home directory.
    home_dir = os.getenv('HOME') or os.path.expanduser('~')
    data_path = home_dir + '/aiffel/project_data/dlthon/' + data_name
    return pd.read_csv(data_path)

In [9]:
def bert_encode(datas, sent_max_length):
    """Tokenize each sentence to fixed-length id and attention-mask rows.

    Every sentence is encoded with the notebook-level ``tokenizer``, with
    special tokens added, padded to ``sent_max_length`` and truncated when
    longer.

    Parameters
    ----------
    datas : iterable of str
        Sentences to encode.
    sent_max_length : int
        Length every encoded sentence is padded / truncated to.

    Returns
    -------
    tuple of numpy.ndarray
        ``(input_ids, attention_masks)``, one row per sentence.
    """
    encodings = [
        tokenizer.encode_plus(
            sentence,
            add_special_tokens=True,
            max_length=sent_max_length,
            padding='max_length',
            truncation=True,
            return_attention_mask=True,
        )
        for sentence in datas
    ]

    id_rows = [enc['input_ids'] for enc in encodings]
    mask_rows = [enc['attention_mask'] for enc in encodings]
    return np.array(id_rows), np.array(mask_rows)

In [10]:
def create_model(bert_model, max_len):
    """Build and compile a per-token 3-class classifier on top of ``bert_model``.

    The encoder's ``last_hidden_state`` is passed through Dense(64, relu),
    Dropout(0.2) and Dense(3, softmax); the dense layers act on the last axis,
    so the model emits three class probabilities for every token position.

    Parameters
    ----------
    bert_model : callable Keras-compatible encoder
        Called with ``[input_ids, attention_mask]``; its result must expose
        ``last_hidden_state``.
    max_len : int
        Number of token ids per input sequence.

    Returns
    -------
    tf.keras.Model
        Model compiled with Adam (learning rate 0.0001), sparse categorical
        cross-entropy loss and an accuracy metric.
    """
    # Layers are created in the same order as before so Keras assigns the
    # same automatic layer names.
    ids_in = tf.keras.Input(shape=(max_len,), dtype=tf.int32)
    mask_in = tf.keras.Input(shape=(max_len,), dtype=tf.int32)

    token_states = bert_model([ids_in, mask_in]).last_hidden_state
    hidden = tf.keras.layers.Dense(64, activation='relu')(token_states)
    hidden = tf.keras.layers.Dropout(0.2)(hidden)
    class_probs = tf.keras.layers.Dense(3, activation='softmax')(hidden)

    classifier = tf.keras.Model(inputs=[ids_in, mask_in], outputs=class_probs)
    classifier.compile(
        optimizer=Adam(learning_rate=0.0001),
        loss='sparse_categorical_crossentropy',
        metrics=['accuracy'],
    )
    return classifier

In [11]:
def decode_input_ids(input_ids):
    """Decode a sequence of token ids to text with the notebook-level tokenizer.

    Special tokens are omitted from the decoded string.
    """
    decoded_text = tokenizer.decode(input_ids, skip_special_tokens=True)
    return decoded_text

def convert_to_dataframe_with_newline(all_input_ids, all_masks):
    """Rebuild each conversation as text, breaking lines where the label changes.

    Each token id is decoded on its own and the pieces are concatenated with
    no separator. Tokens whose label equals 2 are dropped. Whenever the label
    of a kept token differs from the label of the previously kept token, a
    newline is inserted between them.

    Parameters
    ----------
    all_input_ids : iterable of sequences of int
        Token ids, one sequence per conversation.
    all_masks : iterable of indexable sequences
        Per-token labels aligned with ``all_input_ids`` (indexed by token
        position). NOTE(review): label 2 is presumably the padding / ignore
        class - confirm against the training labels.

    Returns
    -------
    pandas.DataFrame
        Columns ``Conversation`` (1-based position in the input) and ``Text``.
    """
    records = []

    for conv_no, (token_ids, labels) in enumerate(zip(all_input_ids, all_masks), start=1):
        segments = []      # completed runs of consecutive same-label tokens
        pieces = []        # decoded tokens of the run currently being built
        prev_label = None  # label of the most recently kept token

        for pos, token_id in enumerate(token_ids):
            piece = decode_input_ids([token_id])
            label = labels[pos]

            if label == 2:
                continue

            # A label change closes the current run; runs are joined by "\n" below.
            if prev_label is not None and label != prev_label:
                segments.append(''.join(pieces))
                pieces = []

            pieces.append(piece)
            prev_label = label

        if pieces:
            segments.append(''.join(pieces))

        records.append({
            "Conversation": conv_no,
            "Text": "\n".join(segments),
        })

    return pd.DataFrame(records)

In [36]:
def get_spacing(test_data_div, tokenizer):
    """Re-insert spaces into every line of every conversation.

    Each conversation in the ``Text`` column is split on newlines; every line
    is tokenized with ``tokenizer.tokenize`` and its tokens are joined with a
    single space. The lines are then joined back with newlines.

    Parameters
    ----------
    test_data_div : pandas.DataFrame
        Must provide ``Conversation`` and ``Text`` columns.
    tokenizer : object
        Anything with a ``tokenize(str)`` method returning a list of strings.

    Returns
    -------
    pandas.DataFrame
        Columns ``Conversation`` (copied from the input) and ``text``
        (lower-case column name) holding the re-spaced conversations.
    """
    def _respace(conversation):
        # Tokenize line by line so the existing line breaks are kept.
        return '\n'.join(
            ' '.join(tokenizer.tokenize(line))
            for line in conversation.split('\n')
        )

    respaced = [_respace(conversation) for conversation in test_data_div['Text']]

    return pd.DataFrame({
        "Conversation": list(test_data_div["Conversation"]),
        "text": respaced,
    })

In [75]:
def eval_bule(back_translate_data, col_name):
    """Mean of per-row BLEU scores between two aligned text columns.

    Each hypothesis/reference pair is scored on its own with
    ``sacrebleu.corpus_bleu`` and the scores are averaged.

    Parameters
    ----------
    back_translate_data : sequence of two table-like objects
        ``back_translate_data[0][col_name]`` supplies the references and
        ``back_translate_data[1][col_name]`` supplies the hypotheses.
    col_name : hashable
        Column to read from both tables.

    Returns
    -------
    float
        Average BLEU score over all pairs; ``nan`` when there are no pairs.

    Raises
    ------
    ValueError
        If the two columns do not contain the same number of rows.
    """
    hypotheses = list(back_translate_data[1][col_name])
    references = list(back_translate_data[0][col_name])

    # zip() would silently drop the unmatched tail and report a score over a
    # subset of the data, so refuse misaligned inputs outright.
    if len(hypotheses) != len(references):
        raise ValueError(
            "hypothesis and reference columns differ in length: "
            f"{len(hypotheses)} != {len(references)}"
        )

    # Same value np.mean gives for an empty array, without the RuntimeWarning.
    if not hypotheses:
        return float('nan')

    pair_scores = [
        sacrebleu.corpus_bleu([hyp], [[ref]]).score
        for hyp, ref in zip(hypotheses, references)
    ]
    return np.mean(pair_scores)

In [15]:
# Build the per-token classifier for inputs of 240 token ids and print its
# layer summary. 240 is also the length passed to bert_encode further down.
use_bert_model = create_model(bert_model, 240)
use_bert_model.summary()

Model: "model"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_1 (InputLayer)            [(None, 240)]        0                                            
__________________________________________________________________________________________________
input_2 (InputLayer)            [(None, 240)]        0                                            
__________________________________________________________________________________________________
tf_bert_model (TFBertModel)     TFBaseModelOutputWit 117653760   input_1[0][0]                    
                                                                 input_2[0][0]                    
__________________________________________________________________________________________________
dense (Dense)                   (None, 240, 64)      24640       tf_bert_model[0][13]         

In [16]:
# Restore weights into the freshly built model from 'dlthon2.keras'
# (path is relative to the working directory).
use_bert_model.load_weights('dlthon2.keras')

In [32]:
# Read test.csv from the project data directory into a DataFrame.
test_data = data_import('test.csv')

In [18]:
# Keep only the raw texts as a NumPy array.
# NOTE(review): this rebinds `test_data` from a DataFrame to an array, so the
# cell cannot be re-run without reloading the CSV, and any later cell that
# indexes test_data['text'] needs the DataFrame to be loaded again first.
test_data = np.array(test_data['text'])

In [19]:
# Encode every text to exactly 240 token ids plus attention mask, the same
# length the model was built with.
test_input_ids, test_attention_masks = bert_encode(test_data, 240)

In [20]:
# Run the classifier on the encoded test set; the model ends in a 3-unit
# softmax applied to the per-token hidden states.
test_labels = use_bert_model.predict([test_input_ids, test_attention_masks])

In [21]:
# Replace the scores along the last axis by the index of the largest one.
# NOTE(review): overwrites test_labels in place, so running this cell twice
# applies argmax to the already-reduced array.
test_labels = test_labels.argmax(axis=-1)

In [22]:
# The predicted labels are passed as the `all_masks` argument: a change of
# label between kept tokens becomes a line break and tokens labelled 2 are dropped.
test_data_div = convert_to_dataframe_with_newline(test_input_ids, test_labels)

In [37]:
# Tokenize each line with the soynlp MaxScoreTokenizer and join the tokens
# with single spaces; the result has columns "Conversation" and "text".
final_test_data = get_spacing(test_data_div, maxscore_tokenizer)

In [39]:
final_test_data

Unnamed: 0,Conversation,text
0,1,아가씨 담배 한갑주소네\n4 500 원 입니다 어네\n지갑어디갔지에이 버스 에서 잃...
1,2,우리 팀 에서 다른 팀 으로 갈 사람 없나?그럼영지 씨가 가는건어때?네\n?제가요?...
2,3,너 오늘 그게뭐야네\n제가뭘 잘못 했나요.? 제대로 좀\n하지 네 똑바로 좀 하지 ...
3,4,이거 들어 바 와이\n노래 진짜 좋다 그치\n요즘 이것만\n들어 진짜 너무 좋다 내...
4,5,아무튼앞 으로 니가내 와이 파이 야.\n.응 와이 파이 온\n.켰어.반말?주인 님이...
...,...,...
495,496,미나씨 휴가 결제 올리기 전에저랑상의하 라고 말한거기억해요?네\n합니다 . 보고서를...
496,497,교수 님제 논문에 제이름이없나요?아\n무슨논문말이야? 지난 번\n냈던 논문이 요.그...
497,498,야너네\n저요?그래\n너왜요돈\n좀\n줘봐돈 없어요 돈이\n왜 없어 지갑\n은폼이니...
498,499,야너 빨리 안 뛰어 와?너이 환자 제대로 봤어안봤어 어제저녁 부터 계속 보다가 지금...


In [33]:
test_data

Unnamed: 0,idx,text
0,t_000,아가씨 담배한갑주소 네 4500원입니다 어 네 지갑어디갔지 에이 버스에서 잃어버렸나...
1,t_001,우리팀에서 다른팀으로 갈 사람 없나? 그럼 영지씨가 가는건 어때? 네? 제가요? ...
2,t_002,너 오늘 그게 뭐야 네 제가 뭘 잘못했나요.? 제대로 좀 하지 네 똑바로 좀 하지 ...
3,t_003,이거 들어바 와 이 노래 진짜 좋다 그치 요즘 이 것만 들어 진짜 너무 좋다 내가 ...
4,t_004,아무튼 앞으로 니가 내 와이파이야. .응 와이파이 온. 켰어. 반말? 주인님이라고도...
...,...,...
495,t_495,미나씨 휴가 결제 올리기 전에 저랑 상의하라고 말한거 기억해요? 네 합니다. 보고서...
496,t_496,교수님 제 논문에 제 이름이 없나요? 아 무슨 논문말이야? 지난 번 냈던 논문이...
497,t_497,야 너 네 저요? 그래 너 왜요 돈좀 줘봐 돈 없어요 돈이 왜 없어 지갑은 폼이...
498,t_498,야 너 빨리 안 뛰어와? 너 이 환자 제대로 봤어 안 봤어 어제 저녁부터 계속 보다...


In [40]:
# Reload the raw test CSV rather than reusing `test_data`: an earlier cell
# rebinds `test_data` to a NumPy array of texts, which cannot be indexed with
# ['text'], so on a fresh top-to-bottom run the BLEU evaluation would fail.
# eval_bule reads index 0 as the references and index 1 as the hypotheses.
# NOTE(review): this makes the model output the reference and the original
# text the hypothesis - confirm that this is the intended direction.
data_for_blue = [final_test_data, data_import('test.csv')]

In [70]:
# Mean per-row BLEU over the 'text' column of the two frames in data_for_blue.
score = eval_bule(data_for_blue, 'text')

In [71]:
score

16.16005857663051

In [26]:
# Save the re-spaced conversations to divided_test_data.csv in the working
# directory, without writing the DataFrame index as a column.
final_test_data.to_csv('divided_test_data.csv', index=False)