In [0]:
import pandas as pd
import numpy as np
import re
import json
import gensim
from tensorflow.python.keras.preprocessing.sequence import pad_sequences

In [0]:
from google.colab import auth
auth.authenticate_user()

from google.colab import drive
drive.mount('/content/gdrive')

Drive already mounted at /content/gdrive; to attempt to forcibly remount, call drive.mount("/content/gdrive", force_remount=True).


In [0]:
DATA_IN_PATH = './data_in/'
DATA_OUT_PATH = './data_out/'

In [0]:
FILTERS = "([~.,!?\"':;)(])"
MAX_SEQUENCE_LENGTH = 31

In [0]:
train_data = pd.read_csv(DATA_IN_PATH + 'train.csv', encoding='utf-8')

In [0]:
train_pos_data = train_data.loc[train_data['is_duplicate'] == 1]
train_neg_data = train_data.loc[train_data['is_duplicate'] == 0]

class_difference = len(train_neg_data) - len(train_pos_data)
sample_frac = 1 - (class_difference / len(train_neg_data))
train_neg_data = train_neg_data.sample(frac = sample_frac)

In [0]:
train_data = pd.concat([train_neg_data, train_pos_data])

In [0]:
change_filter = re.compile(FILTERS)

questions1 = [str(s) for s in train_data['question1']]
questions2 = [str(s) for s in train_data['question2']]

filtered_questions1 = list()
filtered_questions2 = list()

for q in questions1:
     filtered_questions1.append(re.sub(change_filter, "", q).lower())
        
for q in questions2:
     filtered_questions2.append(re.sub(change_filter, "", q).lower())

In [0]:
def extract_questions(df):
    for data in df:
       yield gensim.utils.simple_preprocess(data)

In [0]:
documents_1 = list(extract_questions(filtered_questions1))
documents_2 = list(extract_questions(filtered_questions2))

In [0]:
len(documents_1)

298526

In [0]:
len(documents_2)

298526

In [0]:
documents  = documents_1 + documents_2
len(documents)

597052

In [0]:
model = gensim.models.Word2Vec(documents, size=128)

In [0]:
word_vectors = model.wv

In [0]:
print("Number of word vectors: {}".format(len(word_vectors.vocab)))

Number of word vectors: 24259


In [0]:
MAX_NB_WORDS = len(word_vectors.vocab)

In [0]:
model.train(documents, total_examples=len(documents), epochs=10)

W0622 04:35:33.249613 140049650235264 base_any2vec.py:1182] Effective 'alpha' higher than previous training cycles


(43165332, 60749550)

In [0]:
model.save(DATA_IN_PATH+"Quora-Question-Pairs.w2v")

  'See the migration notes for details: %s' % _MIGRATION_NOTES_URL


In [0]:
vocab = list(model.wv.vocab)
word_list = model[vocab]

  


In [0]:
word_list[0].shape

(128,)

In [0]:
word_list.shape

(24349, 128)

In [0]:
!ls -al ./data_in/

total 570416
drwxr-xr-x 2 root root      4096 Jun 21 16:14 .
drwxr-xr-x 1 root root      4096 Jun 21 16:11 ..
-rw-r--r-- 1 root root  38993494 Jun 21 16:14 Quora-Question-Pairs.w2v
-rw-r--r-- 1 root root  22346871 Jun 21 16:11 sample_submission.csv
-rw------- 1 root root   5190322 Jun 21 16:11 sample_submission.csv.zip
-rw-r--r-- 1 root root 314015126 Jun 21 16:11 test.csv
-rw------- 1 root root 117931806 Jun 21 16:11 test.csv.zip
-rw-r--r-- 1 root root  63399110 Jun 21 16:11 train.csv
-rw------- 1 root root  22202741 Jun 21 16:11 train.csv.zip


In [0]:
len(model.wv.vocab)

24349

In [0]:
word2index = {token: token_index for token_index, token in enumerate(model.wv.index2word)}

In [0]:
def sequences(documents):
  sequence_output_index = []
  for sequence in documents:
    #print(sequence)
    sequence_index = []
    for word in sequence:
      if word not in model.wv.vocab:
        continue
      sequence_index.append(word2index[word])

    if len(sequence_index) > MAX_SEQUENCE_LENGTH:
      sequence_index = sequence_index[:MAX_SEQUENCE_LENGTH]
      
    #sequence_index += (MAX_SEQUENCE_LENGTH - len(sequence_index)) * [PAD]
    #print(sequence_index)
    sequence_output_index.append(sequence_index)    
  return np.asarray(sequence_output_index)
                     

In [0]:
q1_data = sequences(documents_1)
q2_data = sequences(documents_2)

NameError: ignored

In [0]:
q1_data

In [0]:
q1_data = pad_sequences(q1_data, maxlen=MAX_SEQUENCE_LENGTH, padding='post')
q2_data = pad_sequences(q2_data, maxlen=MAX_SEQUENCE_LENGTH, padding='post')

In [0]:
labels = np.array(train_data['is_duplicate'], dtype=int)

In [0]:
data_configs = {}
data_configs['vocab'] = word2index
data_configs['vocab_size'] = len(word2index)

In [0]:
TRAIN_Q1_DATA = 'train_q1.npy'
TRAIN_Q2_DATA = 'train_q2.npy'
TRAIN_LABEL_DATA = 'train_label.npy'
DATA_CONFIGS = 'data_configs.json'

np.save(open(DATA_IN_PATH + TRAIN_Q1_DATA, 'wb'), q1_data)
np.save(open(DATA_IN_PATH + TRAIN_Q2_DATA , 'wb'), q2_data)
np.save(open(DATA_IN_PATH + TRAIN_LABEL_DATA , 'wb'), labels)

json.dump(data_configs, open(DATA_IN_PATH + DATA_CONFIGS, 'w'))

In [0]:
import sys
import tensorflow as tf
import numpy as np
import os
import pandas as pd

from sklearn.model_selection import train_test_split

import json

In [0]:
DATA_IN_PATH = './data_in/'
DATA_OUT_PATH = './data_out/'

In [0]:
!ls -al

total 40
drwxr-xr-x 1 root root 4096 Jun 21 16:11 .
drwxr-xr-x 1 root root 4096 Jun 21 16:01 ..
-rw-r--r-- 1 root root 2487 Jun 21 16:11 adc.json
drwxr-xr-x 1 root root 4096 Jun 21 16:11 .config
drwxr-xr-x 2 root root 4096 Jun 21 16:15 data_in
drwxr-xr-x 3 root root 4096 Jun 21 16:18 data_out
drwx------ 3 root root 4096 Jun 21 16:11 gdrive
drwxr-xr-x 1 root root 4096 Jun 18 16:14 sample_data


In [0]:
TRAIN_Q1_DATA_FILE = 'train_q1.npy'
TRAIN_Q2_DATA_FILE = 'train_q2.npy'
TRAIN_LABEL_DATA_FILE = 'train_label.npy'
NB_WORDS_DATA_FILE = 'data_configs.json'

## 학습에 필요한 파라메터들에 대해서 지정하는 부분이다.

BATCH_SIZE = 4096
EPOCH = 2
HIDDEN = 64
BUFFER_SIZE = 10000

NUM_LAYERS = 3
DROPOUT_RATIO = 0.3

TEST_SPLIT = 0.1
RNG_SEED = 13371447
EMBEDDING_DIM = 128
MAX_SEQ_LEN = 31

In [0]:
## 데이터를 불러오는 부분이다. 효과적인 데이터 불러오기를 위해, 미리 넘파이 형태로 저장시킨 데이터를 로드한다.

q1_data = np.load(open(DATA_IN_PATH + TRAIN_Q1_DATA_FILE, 'rb'))
q2_data = np.load(open(DATA_IN_PATH + TRAIN_Q2_DATA_FILE, 'rb'))
labels = np.load(open(DATA_IN_PATH + TRAIN_LABEL_DATA_FILE, 'rb'))
prepro_configs = None

with open(DATA_IN_PATH + NB_WORDS_DATA_FILE, 'r') as f:
    prepro_configs = json.load(f)

In [0]:
VOCAB_SIZE = prepro_configs['vocab_size']
VOCAB_SIZE

24349

In [0]:
q1_data_len = np.array([min(len(x), MAX_SEQ_LEN) for x in q1_data])
q2_data_len = np.array([min(len(x), MAX_SEQ_LEN) for x in q2_data])

In [0]:
## 데이터를 나누어 저장하자. sklearn의 train_test_split을 사용하면 유용하다. 하지만, 쿼라 데이터의 경우는
## 입력이 1개가 아니라 2개이다. 따라서, np.stack을 사용하여 두개를 하나로 쌓은다음 활용하여 분류한다.

X = np.stack((q1_data, q2_data), axis=1)
y = labels
train_X, test_X, train_y, test_y = train_test_split(X, y, test_size=TEST_SPLIT, random_state=RNG_SEED)

train_Q1 = train_X[:,0]
train_Q2 = train_X[:,1]
test_Q1 = test_X[:,0]
test_Q2 = test_X[:,1]

In [0]:
train_Q1

array([[   3,    2,   53, ...,    0,    0,    0],
       [   2,  132, 1636, ...,    0,    0,    0],
       [   1,   56, 2486, ...,    0,    0,    0],
       ...,
       [  13,  715,   15, ...,    0,    0,    0],
       [   3,    6,   24, ...,    0,    0,    0],
       [   1,    6,   12, ...,    0,    0,    0]], dtype=int32)

In [0]:

def rearrange(base, hypothesis, labels):
    features = {"base": base, "hypothesis": hypothesis}
    return features, labels

def train_input_fn():
    dataset = tf.data.Dataset.from_tensor_slices((train_Q1, train_Q2, train_y))
    dataset = dataset.shuffle(buffer_size=len(train_Q1))
    dataset = dataset.batch(BATCH_SIZE) #4096
    dataset = dataset.map(rearrange)
    dataset = dataset.repeat(EPOCH) # 2
    iterator = dataset.make_one_shot_iterator()
    
    return iterator.get_next()

def eval_input_fn():
    dataset = tf.data.Dataset.from_tensor_slices((test_Q1, test_Q2, test_y))
    dataset = dataset.batch(BATCH_SIZE) #4096
    dataset = dataset.map(rearrange)
    iterator = dataset.make_one_shot_iterator()
    
    return iterator.get_next()

In [0]:

from tensorflow.python.keras.layers import Embedding

def Malstm(features, labels, mode):
        
    TRAIN = mode == tf.estimator.ModeKeys.TRAIN
    EVAL = mode == tf.estimator.ModeKeys.EVAL
    PREDICT = mode == tf.estimator.ModeKeys.PREDICT
    # VOCAB_SIZE : 24349   EMBEDDING_DIM : 128   
    embedding = Embedding(VOCAB_SIZE,
                     EMBEDDING_DIM,
                     mask_zero=False,
                     weights=[word_list],
                     input_length=MAX_SEQUENCE_LENGTH,
                     trainable=False)
    
    base_embedded_matrix = embedding(features['base']) 
    
    hypothesis_embedded_matrix = embedding(features['hypothesis']) 
    
    q_lstm_cell = tf.nn.rnn_cell.BasicLSTMCell(num_units = HIDDEN, activation = tf.nn.tanh , state_is_tuple=False) 

    q, q_output_states = tf.nn.dynamic_rnn(cell = q_lstm_cell, 
                                           inputs = base_embedded_matrix, 
                                           dtype = tf.float32,
                                           scope='query')       

    s_lstm_cell = tf.nn.rnn_cell.BasicLSTMCell(num_units = HIDDEN, activation = tf.nn.tanh, state_is_tuple=False) 
    s, s_output_states = tf.nn.dynamic_rnn(cell = s_lstm_cell, 
                                           inputs = hypothesis_embedded_matrix, 
                                           dtype = tf.float32,
                                           scope='sim_query')     

    with tf.variable_scope('output_layer'):
        logit_layer = tf.exp(-tf.reduce_sum(tf.abs(q_output_states - s_output_states), axis=1, keepdims=True)) 
    
        logit_layer = tf.squeeze(logit_layer, axis=-1) 
       
    if PREDICT:
        return tf.estimator.EstimatorSpec(
                  mode=mode,
                  predictions={
                      'is_duplicate':logit_layer
                  })
    
    #prediction 진행 시, None
    if labels is not None:
        labels = tf.to_float(labels)
    
    loss = tf.losses.mean_squared_error(labels=labels, predictions=logit_layer)

    if EVAL:
        accuracy = tf.metrics.accuracy(labels, tf.round(logit_layer))
        eval_metric_ops = {'acc': accuracy}
        return tf.estimator.EstimatorSpec(
                  mode=mode,
                  eval_metric_ops= eval_metric_ops,
                  loss=loss)

    elif TRAIN:

        global_step = tf.train.get_global_step()
        train_op = tf.train.AdamOptimizer(1e-3).minimize(loss, global_step)

        return tf.estimator.EstimatorSpec(
                  mode=mode,
                  train_op=train_op,
                  loss=loss)

In [0]:
!rm -rf /checkpoint/malstm/

In [0]:
model_dir = os.path.join(os.getcwd(), DATA_OUT_PATH + "/checkpoint/malstm/")
os.makedirs(model_dir, exist_ok=True)

config_tf = tf.estimator.RunConfig()

lstm_est = tf.estimator.Estimator(Malstm, model_dir=model_dir)

In [0]:
lstm_est.train(train_input_fn)

embedding:  <tensorflow.python.keras.layers.embeddings.Embedding object at 0x7f33a7df9f60>


W0621 16:47:52.906411 139861576632192 rnn_cell_impl.py:697] <tensorflow.python.ops.rnn_cell_impl.BasicLSTMCell object at 0x7f33a7dc9048>: Using a concatenated state is slower and will soon be deprecated.  Use state_is_tuple=True.
W0621 16:47:53.026030 139861576632192 rnn_cell_impl.py:697] <tensorflow.python.ops.rnn_cell_impl.BasicLSTMCell object at 0x7f33a7d9bf28>: Using a concatenated state is slower and will soon be deprecated.  Use state_is_tuple=True.


base_embedded_matrix:  Tensor("embedding/embedding_lookup/Identity_1:0", shape=(?, 31, 128), dtype=float32)
hypothesis_embedded_matrix:  Tensor("embedding_1/embedding_lookup/Identity_1:0", shape=(?, 31, 128), dtype=float32)
q_lstm_cell:  <tensorflow.python.ops.rnn_cell_impl.BasicLSTMCell object at 0x7f33a7dc9048>
q:  Tensor("query/transpose_1:0", shape=(?, 31, 64), dtype=float32)
q_output_states:  Tensor("query/while/Exit_3:0", shape=(?, 128), dtype=float32)
s_lstm_cell:  <tensorflow.python.ops.rnn_cell_impl.BasicLSTMCell object at 0x7f33a7d9bf28>
s:  Tensor("sim_query/transpose_1:0", shape=(?, 31, 64), dtype=float32)
s_output_states:  Tensor("sim_query/while/Exit_3:0", shape=(?, 128), dtype=float32)
logit_layer:  Tensor("output_layer/Exp:0", shape=(?, 1), dtype=float32)
logit_layer:  Tensor("output_layer/Squeeze:0", shape=(?,), dtype=float32)


W0621 16:47:54.611497 139861576632192 deprecation.py:323] From /usr/local/lib/python3.6/dist-packages/tensorflow/python/training/saver.py:1276: checkpoint_exists (from tensorflow.python.training.checkpoint_management) is deprecated and will be removed in a future version.
Instructions for updating:
Use standard file APIs to check for files with this prefix.
W0621 16:47:54.884833 139861576632192 deprecation.py:323] From /usr/local/lib/python3.6/dist-packages/tensorflow/python/training/saver.py:1066: get_checkpoint_mtimes (from tensorflow.python.training.checkpoint_management) is deprecated and will be removed in a future version.
Instructions for updating:
Use standard file utilities to get mtimes.


<tensorflow_estimator.python.estimator.estimator.Estimator at 0x7f33a7a4aeb8>

In [0]:
lstm_est.evaluate(eval_input_fn)

W0621 16:54:06.948600 139861576632192 rnn_cell_impl.py:697] <tensorflow.python.ops.rnn_cell_impl.BasicLSTMCell object at 0x7f33a36b9780>: Using a concatenated state is slower and will soon be deprecated.  Use state_is_tuple=True.


embedding:  <tensorflow.python.keras.layers.embeddings.Embedding object at 0x7f33a79c5ef0>
base_embedded_matrix:  Tensor("embedding/embedding_lookup/Identity_1:0", shape=(?, 31, 128), dtype=float32)
hypothesis_embedded_matrix:  Tensor("embedding_1/embedding_lookup/Identity_1:0", shape=(?, 31, 128), dtype=float32)
q_lstm_cell:  <tensorflow.python.ops.rnn_cell_impl.BasicLSTMCell object at 0x7f33a36b9780>


W0621 16:54:07.066392 139861576632192 rnn_cell_impl.py:697] <tensorflow.python.ops.rnn_cell_impl.BasicLSTMCell object at 0x7f33a497be10>: Using a concatenated state is slower and will soon be deprecated.  Use state_is_tuple=True.


q:  Tensor("query/transpose_1:0", shape=(?, 31, 64), dtype=float32)
q_output_states:  Tensor("query/while/Exit_3:0", shape=(?, 128), dtype=float32)
s_lstm_cell:  <tensorflow.python.ops.rnn_cell_impl.BasicLSTMCell object at 0x7f33a497be10>
s:  Tensor("sim_query/transpose_1:0", shape=(?, 31, 64), dtype=float32)
s_output_states:  Tensor("sim_query/while/Exit_3:0", shape=(?, 128), dtype=float32)
logit_layer:  Tensor("output_layer/Exp:0", shape=(?, 1), dtype=float32)
logit_layer:  Tensor("output_layer/Squeeze:0", shape=(?,), dtype=float32)


{'acc': 0.70883995, 'global_step': 132, 'loss': 0.19398028}