In [115]:
# !pip install sentencepiece
# !pip install tensorflow_addons

Looking in indexes: http://ftp.daumkakao.com/pypi/simple
Collecting tensorflow_addons
[?25l  Downloading http://mirror.kakao.com/pypi/packages/b3/f8/d6fca180c123f2851035c4493690662ebdad0849a9059d56035434bff5c9/tensorflow_addons-0.11.2-cp36-cp36m-manylinux2010_x86_64.whl (1.1MB)
[K     |████████████████████████████████| 1.1MB 1.3MB/s eta 0:00:01
[?25hCollecting typeguard>=2.7
  Downloading http://mirror.kakao.com/pypi/packages/f3/28/cc6df4c26d14c338c9744dc510a8c7f1a9115f8233e7602cca140a61430c/typeguard-2.10.0-py3-none-any.whl
Installing collected packages: typeguard, tensorflow-addons
Successfully installed tensorflow-addons-0.11.2 typeguard-2.10.0


In [117]:
import tensorflow as tf
import pandas as pd
import numpy as np
import sentencepiece as spm

import re
import os

In [3]:
train = pd.read_csv('./data/train.csv')
test = pd.read_csv('./data/test.csv')
sample_submission = pd.read_csv('./data/sample_submission.csv')

In [4]:
def alpha_num(text):
    return re.sub(r'[^a-zA-z0-9\s]', '', text)

In [5]:
text_list = train['question_text'].str.lower().apply(alpha_num)
test_text_list = test['question_text'].str.lower().apply(alpha_num)

In [6]:
valid_percent = 0.2

data_len = len(train)
test_index = list(range(len(test)))

valid_index = np.random.choice(range(data_len), int(data_len*valid_percent), replace=False)
train_index = list(set(range(data_len)) - set(valid_index))

train_text_list = [text_list[i] for i in train_index]
print("loop 1 finish")
valid_text_list = [text_list[i] for i in valid_index]
print("loop 2 finish")
test_text_list = [test_text_list[i] for i in test_index]
print("loop 3 finish")

train_label_list = [train['target'].tolist()[i] for i in train_index]
print("loop 4 finish")
valid_label_list = [train['target'].tolist()[i] for i in valid_index]

loop 1 finish
loop 2 finish
loop 3 finish
loop 4 finish


In [7]:
save_path = './save'

if not os.path.exists(save_path):
    os.mkdir(save_path)

In [185]:
train_label_tensor = tf.convert_to_tensor(train_label_list)
valid_label_tensor = tf.convert_to_tensor(valid_label_list)

In [19]:
train_label_tensor[:20]

<tf.Tensor: shape=(20,), dtype=int32, numpy=
array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1],
      dtype=int32)>

In [8]:
vocab_size = 45000
pad_idx = 0
bos_idx = 1
eos_idx = 2
unk_idx = 3

if not os.path.isfile(f'{save_path}/m_text.vocab'):
    # 1) Make Korean text to train vocab
    with open(f'{save_path}/text.txt', 'w') as f:
        for text in train_text_list:
            f.write(f'{text}\n')


    # 2) SentencePiece model training
    spm.SentencePieceProcessor()
    spm.SentencePieceTrainer.Train(
        f'--input={save_path}/text.txt --model_prefix={save_path}/m_text'
        f'--vocab_size={vocab_size} --character_coverage=0.9995 '
        f'--model_type=bpe --split_by_whitespace=true '
        f'--pad_id={pad_idx} --unk_id={unk_idx} '
        f'--bos_id={bos_idx} --eos_id={eos_idx}'
    )

    vocab_list = list()
    with open(f'{save_path}/m_text_{vocab_size}.vocab') as f:
        for line in f:
            vocab_list.append(line[:-1].split('\t')[0])
    word2id_spm = {w: i for i, w in enumerate(vocab_list)}

In [9]:
# SentencePiece model load
spm_ = spm.SentencePieceProcessor()
spm_.Load(f"{save_path}/m_text.model")

# Tokenizing
train_encoded_list = [[bos_idx] + spm_.EncodeAsIds(text) + [eos_idx] for text in train_text_list]
valid_encoded_list = [[bos_idx] + spm_.EncodeAsIds(text) + [eos_idx] for text in valid_text_list]

In [12]:
train_tensor = tf.keras.preprocessing.sequence.pad_sequences(train_encoded_list, padding='post', maxlen=500)
valid_tensor = tf.keras.preprocessing.sequence.pad_sequences(valid_encoded_list, padding='post', maxlen=500)

In [200]:
valid_tensor = tf.convert_to_tensor(valid_tensor)

In [212]:
BUFFER_SIZE = 500
BATCH_SIZE = 512
embedding_dim = 256
units = 1024
steps_per_epoch = 500 // BATCH_SIZE

dataset = tf.data.Dataset.from_tensor_slices((train_tensor, train_label_tensor)).shuffle(BUFFER_SIZE)
dataset = dataset.batch(BATCH_SIZE, drop_remainder=True)
val_dataset = tf.data.Dataset.from_tensor_slices((valid_tensor, valid_label_tensor))
val_dataset = val_dataset.batch(BATCH_SIZE, drop_remainder=True)

In [213]:
example_input_batch, example_label_batch = next(iter(dataset))
example_input_batch.shape, example_label_batch.shape

(TensorShape([512, 500]), TensorShape([512]))

In [214]:
class My_Model_1(tf.keras.Model):
    def __init__(self, vocab_size, embedding_dim, units, batch_sz):
        super(My_Model_1, self).__init__()
        self.batch_sz = batch_sz
        self.units = units
        self.embedding = tf.keras.layers.Embedding(vocab_size, embedding_dim)
        self.gru = tf.keras.layers.GRU(self.units, 
                                      return_sequences=True,
                                      return_state = True)
        self.W1 = tf.keras.layers.Dense(units)
        self.W2 = tf.keras.layers.Dense(units)
        self.V = tf.keras.layers.Dense(1)
        
        self.last_one = tf.keras.layers.Dense(2, activation='softmax')
        
    def call(self, x):
        x = self.embedding(x)
        output, state = self.gru(x, initial_state = self.initialize_hidden_state())
        
        query = tf.expand_dims(state, 1)
        score = self.V(tf.nn.tanh(self.W1(query) + self.W2(output)))
        attention_weights = tf.nn.softmax(score, axis=1)
        context_vector = attention_weights * output
        context_vector = tf.reduce_sum(context_vector, axis=1)

        
        concat = tf.concat([context_vector, state], axis=-1)
        
        result = self.last_one(concat)
        
#         score = self.W1(state)
#         repeat_score = tf.repeat(tf.expand_dims(score, axis=1), 1024, axis=1)
#         print(output.shape, repeat_score.shape)
#         score_1 = tf.tensordot(repeat_score, output, axes=[[1], [0]])
        
        return result

    def initialize_hidden_state(self):
        return tf.zeros((self.batch_sz, self.units))

In [215]:
optimizer = tf.keras.optimizers.Adam()
loss_object = tf.keras.losses.SparseCategoricalCrossentropy()

In [216]:
model_sample = My_Model_1(vocab_size, embedding_dim, units, BATCH_SIZE)

In [217]:
model_sample.compile(loss=tf.keras.losses.sparse_categorical_crossentropy, optimizer='adam',
                    metrics=[tf.keras.metrics.sparse_categorical_accuracy])

In [218]:
model_train = model_sample.fit(dataset, epochs=2, validation_data=val_dataset)

Train for 2040 steps, validate for 510 steps
Epoch 1/2
Epoch 2/2


In [219]:
spm_ = spm.SentencePieceProcessor()
spm_.Load(f"{save_path}/m_text.model")

# Tokenizing
test_encoded_list = [[bos_idx] + spm_.EncodeAsIds(text) + [eos_idx] for text in test_text_list]

In [220]:
test_tensor = tf.keras.preprocessing.sequence.pad_sequences(test_encoded_list, padding='post', maxlen=500)

In [225]:
test_tensor = tf.convert_to_tensor(test_tensor)
test_dataset = tf.data.Dataset.from_tensor_slices(test_tensor)
test_dataset = test_dataset.batch(BATCH_SIZE, drop_remainder=True)

In [235]:
output = np.argmax(model_sample.predict(test_dataset), axis=1)

In [237]:
model_sample.summary()

Model: "my__model_1_70"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_70 (Embedding)     multiple                  11520000  
_________________________________________________________________
gru_70 (GRU)                 multiple                  3938304   
_________________________________________________________________
dense_128 (Dense)            multiple                  1049600   
_________________________________________________________________
dense_129 (Dense)            multiple                  1049600   
_________________________________________________________________
dense_130 (Dense)            multiple                  1025      
_________________________________________________________________
dense_131 (Dense)            multiple                  4098      
Total params: 17,562,627
Trainable params: 17,562,627
Non-trainable params: 0
________________________________________

In [238]:
output

array([1, 0, 0, ..., 0, 0, 0])