# Ma LSTM

In [1]:
import sys
import tensorflow as tf
import numpy as np
import os
import pandas as pd

from sklearn.model_selection import train_test_split

import json

In [0]:
from google.colab import auth
auth.authenticate_user()

from google.colab import drive
drive.mount('/content/gdrive')

Drive already mounted at /content/gdrive; to attempt to forcibly remount, call drive.mount("/content/gdrive", force_remount=True).


In [2]:
base_path = './gdrive/My Drive/Colab Notebooks'

In [4]:
data_in_path = base_path + '/data_in/'
data_out_path = base_path + '/data_out/'

In [46]:
TRAIN_Q1_DATA_FILE = 'train_q1.npy'
TRAIN_Q2_DATA_FILE = 'train_q2.npy'
TRAIN_LABEL_DATA_FILE = 'train_label.npy'
DATA_CONFIGS = 'data_configs.json'

## 학습에 필요한 파라메터들에 대해서 지정하는 부분이다.

BATCH_SIZE = 256
EPOCH = 2
HIDDEN_DIM = 64
EMBEDDING_DIM = 128
DROPOUT_RATIO = 0.3

TEST_SPLIT = 0.1
RNG_SEED = 13

In [7]:
q1_data = np.load(open(data_in_path + TRAIN_Q1_DATA_FILE, 'rb'))
q2_data = np.load(open(data_in_path + TRAIN_Q2_DATA_FILE, 'rb'))
labels = np.load(open(data_in_path + TRAIN_LABEL_DATA_FILE, 'rb'))
prepro_configs = None

with open(data_in_path + DATA_CONFIGS, 'r') as f:
    prepro_configs = json.load(f)

In [8]:
VOCAB_SIZE = prepro_configs['vocab_size']
VOCAB_SIZE

76609

### Split train and test dataset

In [10]:
np.stack((q1_data, q2_data), axis=1).shape

(298526, 2, 31)

In [11]:
x = np.stack((q1_data, q2_data), axis=1)
y = labels
train_x, test_x, train_y, test_y = train_test_split(x, y, test_size=TEST_SPLIT, random_state=RNG_SEED)

train_q1 = train_x[:,0]
train_q2 = train_x[:,1]
test_q1 = test_x[:,0]
test_q2 = test_x[:,1]

In [15]:
print(train_q1.shape)
print(train_q2.shape)
print(test_q1.shape)
print(test_q2.shape)
print(train_y.shape)
print(test_y.shape)

(268673, 31)
(268673, 31)
(29853, 31)
(29853, 31)
(268673,)
(29853,)


In [43]:
def mapping_fn(base, hypothesis, labels):
    features = {"base": base, "hypothesis": hypothesis}
    return features, labels

def train_input_fn():
    dataset = tf.data.Dataset.from_tensor_slices((train_q1, train_q2, train_y))
    dataset = dataset.shuffle(len(train_q1))
    dataset = dataset.batch(BATCH_SIZE) #4096
    dataset = dataset.map(mapping_fn)
    dataset = dataset.repeat(EPOCH) # 2
    
    return dataset

def eval_input_fn():
    dataset = tf.data.Dataset.from_tensor_slices((test_q1, test_q2, test_y))
    dataset = dataset.batch(BATCH_SIZE) #4096
    dataset = dataset.map(mapping_fn)
    
    return dataset

### Model Setup

In [54]:
def ma_lstm(features, labels, mode):

    TRAIN = mode == tf.estimator.ModeKeys.TRAIN
    EVAL = mode == tf.estimator.ModeKeys.EVAL
    PREDICT = mode == tf.estimator.ModeKeys.PREDICT
    
    embedding = tf.keras.layers.Embedding(VOCAB_SIZE, EMBEDDING_DIM)
    lstm = tf.keras.layers.LSTM(HIDDEN_DIM)
    
    base_embedded = embedding(features['base']) 
    hypothesis_embedded = embedding(features['hypothesis'])
    
    base_lstm_output = lstm(base_embedded)
    hypothesis_lstm_output = lstm(hypothesis_embedded)

    with tf.variable_scope('output_layer'):
        logit_layer = tf.exp(-tf.reduce_sum(tf.abs(base_lstm_output - hypothesis_lstm_output), axis=1))
      
    if PREDICT:
        return tf.estimator.EstimatorSpec(
                  mode=mode,
                  predictions={
                      'is_duplicate':logit_layer
                  })
    
    #prediction 진행 시, None
    if labels is not None:
        labels = tf.to_float(labels)
    
    loss = tf.losses.mean_squared_error(labels=labels, predictions=logit_layer)

    if EVAL:
        accuracy = tf.metrics.accuracy(labels, tf.round(logit_layer))
        eval_metric_ops = {'acc': accuracy}
        return tf.estimator.EstimatorSpec(
                  mode=mode,
                  eval_metric_ops= eval_metric_ops,
                  loss=loss)

    elif TRAIN:

        global_step = tf.train.get_global_step()
        train_op = tf.train.AdamOptimizer(1e-3).minimize(loss, global_step)

        return tf.estimator.EstimatorSpec(
                  mode=mode,
                  train_op=train_op,
                  loss=loss)

In [55]:
estimator = tf.estimator.Estimator(ma_lstm, model_dir=data_out_path + "/checkpoint/malstm/")

I0919 23:05:48.650579 4588836288 estimator.py:1790] Using default config.
I0919 23:05:48.651763 4588836288 estimator.py:209] Using config: {'_model_dir': './/data_out//checkpoint/malstm/', '_tf_random_seed': None, '_save_summary_steps': 100, '_save_checkpoints_steps': None, '_save_checkpoints_secs': 600, '_session_config': allow_soft_placement: true
graph_options {
  rewrite_options {
    meta_optimizer_iterations: ONE
  }
}
, '_keep_checkpoint_max': 5, '_keep_checkpoint_every_n_hours': 10000, '_log_step_count_steps': 100, '_train_distribute': None, '_device_fn': None, '_protocol': None, '_eval_distribute': None, '_experimental_distribute': None, '_experimental_max_worker_delay_secs': None, '_service': None, '_cluster_spec': <tensorflow.python.training.server_lib.ClusterSpec object at 0x7fee31765cc0>, '_task_type': 'worker', '_task_id': 0, '_global_id_in_cluster': 0, '_master': '', '_evaluation_master': '', '_is_chief': True, '_num_ps_replicas': 0, '_num_worker_replicas': 1}


In [56]:
tf.logging.set_verbosity(tf.logging.INFO)

In [57]:
estimator.train(train_input_fn)

I0919 23:05:50.410641 4588836288 estimator.py:1145] Calling model_fn.
I0919 23:05:51.611579 4588836288 estimator.py:1147] Done calling model_fn.
I0919 23:05:51.614263 4588836288 basic_session_run_hooks.py:541] Create CheckpointSaverHook.
I0919 23:05:51.679338 4588836288 monitored_session.py:240] Graph was finalized.
W0919 23:05:51.683121 4588836288 deprecation.py:323] From /Users/chojunghyun/anaconda3/envs/tensorflow/lib/python3.6/site-packages/tensorflow/python/training/saver.py:1276: checkpoint_exists (from tensorflow.python.training.checkpoint_management) is deprecated and will be removed in a future version.
Instructions for updating:
Use standard file APIs to check for files with this prefix.
I0919 23:05:51.684780 4588836288 saver.py:1280] Restoring parameters from .//data_out//checkpoint/malstm/model.ckpt-0
W0919 23:05:51.883599 4588836288 deprecation.py:323] From /Users/chojunghyun/anaconda3/envs/tensorflow/lib/python3.6/site-packages/tensorflow/python/training/saver.py:1066: ge

KeyboardInterrupt: 

In [0]:
lstm_est.evaluate(eval_input_fn)

W0622 04:33:19.597750 140227660601216 rnn_cell_impl.py:697] <tensorflow.python.ops.rnn_cell_impl.BasicLSTMCell object at 0x7f8902396208>: Using a concatenated state is slower and will soon be deprecated.  Use state_is_tuple=True.
W0622 04:33:19.720473 140227660601216 rnn_cell_impl.py:697] <tensorflow.python.ops.rnn_cell_impl.BasicLSTMCell object at 0x7f8909a3bf60>: Using a concatenated state is slower and will soon be deprecated.  Use state_is_tuple=True.
W0622 04:33:19.999620 140227660601216 deprecation.py:323] From /usr/local/lib/python3.6/dist-packages/tensorflow/python/training/saver.py:1276: checkpoint_exists (from tensorflow.python.training.checkpoint_management) is deprecated and will be removed in a future version.
Instructions for updating:
Use standard file APIs to check for files with this prefix.


{'acc': 0.71852076, 'global_step': 132, 'loss': 0.18716064}

In [0]:
TEST_Q1_DATA_FILE = 'test_q1.npy'
TEST_Q2_DATA_FILE = 'test_q2.npy'
TEST_ID_DATA_FILE = 'test_id.npy'

test_q1_data = np.load(open(DATA_IN_PATH + TEST_Q1_DATA_FILE, 'rb'))
test_q2_data = np.load(open(DATA_IN_PATH + TEST_Q2_DATA_FILE, 'rb'))
test_id_data = np.load(open(DATA_IN_PATH + TEST_ID_DATA_FILE, 'rb'))

In [0]:
predict_input_fn = tf.estimator.inputs.numpy_input_fn(x={"base":test_q1_data, 
                                                         "hypothesis":test_q2_data}, 
                                                      shuffle=False)

predictions = np.array([p['is_duplicate'] for p in lstm_est.predict(input_fn=predict_input_fn)])

W0622 04:33:51.712189 140227660601216 deprecation.py:323] From /usr/local/lib/python3.6/dist-packages/tensorflow_estimator/python/estimator/inputs/queues/feeding_queue_runner.py:62: QueueRunner.__init__ (from tensorflow.python.training.queue_runner_impl) is deprecated and will be removed in a future version.
Instructions for updating:
To construct input pipelines, use the `tf.data` module.
W0622 04:33:51.738292 140227660601216 deprecation.py:323] From /usr/local/lib/python3.6/dist-packages/tensorflow_estimator/python/estimator/inputs/queues/feeding_functions.py:500: add_queue_runner (from tensorflow.python.training.queue_runner_impl) is deprecated and will be removed in a future version.
Instructions for updating:
To construct input pipelines, use the `tf.data` module.
W0622 04:33:51.793607 140227660601216 rnn_cell_impl.py:697] <tensorflow.python.ops.rnn_cell_impl.BasicLSTMCell object at 0x7f89077446a0>: Using a concatenated state is slower and will soon be deprecated.  Use state_is_tu

In [0]:
print(len(predictions)) #2345796

output = pd.DataFrame( data={"test_id":test_id_data, "is_duplicate": list(predictions)} )
output.to_csv( "/content/gdrive/My Drive/Colab Notebooks/Data/rnn_predict.csv", index=False, quoting=3 )

2345796
