# MaLSTM (Manhattan LSTM)

In [None]:
!pip install -q tensorflow-gpu==2.0.0-rc1

In [1]:
import sys
import tensorflow as tf
import numpy as np
import os
import pandas as pd

from sklearn.model_selection import train_test_split

import json

In [0]:
# Authenticate the Colab user, then mount Google Drive at /content/gdrive.
from google.colab import auth, drive

auth.authenticate_user()
drive.mount('/content/gdrive')

Drive already mounted at /content/gdrive; to attempt to forcibly remount, call drive.mount("/content/gdrive", force_remount=True).


In [2]:
# Root folder on the mounted Drive; the data_in/ and data_out/ paths are built from it.
base_path = '/content/gdrive/My Drive/Colab Notebooks'

In [3]:
# Input and output folders under base_path. The trailing slash is kept so that
# file names can be appended by plain string concatenation.
data_in_path = f'{base_path}/data_in/'
data_out_path = f'{base_path}/data_out/'

In [32]:
# File names (relative to data_in_path) of the training arrays and of the JSON
# configuration that is loaded alongside them.
TRAIN_Q1_DATA_FILE = 'quora_train_q1.npy'        # first question of each pair
TRAIN_Q2_DATA_FILE = 'quora_train_q2.npy'        # second question of each pair
TRAIN_LABEL_DATA_FILE = 'quora_train_label.npy'  # labels
DATA_CONFIGS = 'quora_data_configs.json'

## This section sets the parameters needed for training.

model_name = 'maLSTM'  # Keras model name and checkpoint sub-folder

batch_size = 256       # batch size used by every tf.data pipeline
num_epochs = 2         # passed to model.fit as `epochs`

test_split = 0.1       # fraction held out by train_test_split
seed = 13              # random_state for train_test_split

In [5]:
# Load the two question arrays and the labels. The path is handed straight to
# np.load so NumPy opens and closes the file itself; wrapping it in a bare
# open(...) would leave the handle unclosed.
q1_data = np.load(data_in_path + TRAIN_Q1_DATA_FILE)
q2_data = np.load(data_in_path + TRAIN_Q2_DATA_FILE)
labels = np.load(data_in_path + TRAIN_LABEL_DATA_FILE)

# JSON configuration; its 'vocab_size' entry is read when the model
# hyperparameters are assembled.
with open(data_in_path + DATA_CONFIGS, 'r') as f:
    prepro_configs = json.load(f)

### Split train and validation datasets

In [6]:
# Sanity check: shape of the two question arrays once stacked along a new axis 1.
np.stack((q1_data, q2_data), axis=1).shape

(298526, 2, 31)

In [7]:
# Stack the two question arrays along axis 1 so that a single train_test_split
# call keeps both questions of a pair and their label in the same split.
x = np.stack([q1_data, q2_data], axis=1)
y = labels
train_x, valid_x, train_y, valid_y = train_test_split(
    x, y, test_size=test_split, random_state=seed
)

# Separate the pair axis again: index 0 is q1, index 1 is q2.
train_q1, train_q2 = train_x[:, 0], train_x[:, 1]
valid_q1, valid_q2 = valid_x[:, 0], valid_x[:, 1]

In [8]:
# Print the shape of each split: train q1/q2, validation q1/q2, then both label arrays.
for arr in (train_q1, train_q2, valid_q1, valid_q2, train_y, valid_y):
    print(arr.shape)

(268673, 31)
(268673, 31)
(29853, 31)
(29853, 31)
(268673,)
(29853,)


In [16]:
def mapping_fn(base, hypothesis, labels=None):
    """Pack a question pair into the feature dict that the model reads.

    Args:
        base: first element of the pair; stored under the "base" key.
        hypothesis: second element of the pair; stored under the "hypothesis" key.
        labels: optional targets that accompany the pair.

    Returns:
        The feature dict alone when ``labels`` is None, otherwise the tuple
        ``(features, labels)``.
    """
    features = {"base": base, "hypothesis": hypothesis}
    if labels is None:
        return features
    return features, labels

# Training pipeline: shuffle with a buffer as large as the training set, batch,
# then let mapping_fn wrap each batch as (feature dict, labels).
dataset = (
    tf.data.Dataset.from_tensor_slices((train_q1, train_q2, train_y))
    .shuffle(len(train_q1))
    .batch(batch_size)
    .map(mapping_fn)
)

# Validation pipeline: same batching and mapping, but no shuffling.
validation_dataset = (
    tf.data.Dataset.from_tensor_slices((valid_q1, valid_q2, valid_y))
    .batch(batch_size)
    .map(mapping_fn)
)

### Model Setup

In [10]:
from tensorflow.keras import layers

In [33]:
class Model(tf.keras.Model):
    """Siamese LSTM that scores a question pair by exp(-L1 distance).

    Both inputs go through the same embedding layer and the same LSTM, so the
    two branches share all their weights.
    """

    def __init__(self, **kargs):
        """Create the shared layers.

        Expected keyword arguments:
            vocab_size: ``input_dim`` of the embedding layer.
            embedding_size: ``output_dim`` of the embedding layer.
            lstm_dimension: number of LSTM units.

        The Keras model name is taken from the module-level ``model_name``.
        """
        super().__init__(name=model_name)
        self.embedding = layers.Embedding(
            input_dim=kargs['vocab_size'],
            output_dim=kargs['embedding_size'],
        )
        self.lstm = layers.LSTM(units=kargs['lstm_dimension'])

    def call(self, x):
        """Return the similarity of ``x['base']`` and ``x['hypothesis']``.

        The result is ``exp(-sum(|h_base - h_hypothesis|))`` with the sum taken
        over axis 1 of the LSTM outputs, so every value lies in (0, 1].
        """
        base_embedded = self.embedding(x['base'])
        hypothesis_embedded = self.embedding(x['hypothesis'])
        base_encoded = self.lstm(base_embedded)
        hypothesis_encoded = self.lstm(hypothesis_embedded)

        # L1 distance between the two encodings.
        l1_distance = tf.reduce_sum(
            tf.abs(base_encoded - hypothesis_encoded), axis=1)
        return tf.exp(-l1_distance)

In [34]:
# Hyperparameters forwarded to Model(**kargs): the vocabulary size comes from
# the loaded configuration, the embedding and LSTM widths are both set to 300.
kargs = dict(
    vocab_size=prepro_configs['vocab_size'],
    embedding_size=300,
    lstm_dimension=300,
)

In [40]:
# Instantiate the model and compile it with Adam; mean squared error serves as
# both the training loss and the reported metric.
model = Model(**kargs)
model.compile(
    loss='mean_squared_error',
    optimizer='adam',
    metrics=['mean_squared_error'],
)

In [41]:
# Checkpoints are written under <data_out_path>/<model_name>/; the {epoch} and
# {val_loss} placeholders in the file name are filled in by ModelCheckpoint.
checkpoint_path = data_out_path + model_name + '/weights.{epoch:02d}-{val_loss:.2f}'
checkpoint_dir = os.path.dirname(checkpoint_path)
# Make sure the target folder exists before training starts.
os.makedirs(checkpoint_dir, exist_ok=True)

cp_callback = tf.keras.callbacks.ModelCheckpoint(
    checkpoint_path, verbose=1, save_weights_only=True)

# The callback must be handed to fit(); without it no checkpoint is ever saved.
# Validation data is required because the file name refers to val_loss.
model.fit(dataset, epochs=num_epochs,
          validation_data=validation_dataset,
          callbacks=[cp_callback])

Epoch 1/2
     13/Unknown - 10s 751ms/step - loss: 0.3001 - mean_squared_error: 0.3001

KeyboardInterrupt: 

In [37]:
# File names (relative to data_in_path) of the test arrays.
TEST_Q1_DATA_FILE = 'quora_test_q1.npy'
TEST_Q2_DATA_FILE = 'quora_test_q2.npy'
TEST_ID_DATA_FILE = 'quora_test_id.npy'

# Hand the path straight to np.load so NumPy opens and closes each file itself;
# wrapping it in a bare open(...) would leave the handle unclosed.
test_q1_data = np.load(data_in_path + TEST_Q1_DATA_FILE)
test_q2_data = np.load(data_in_path + TEST_Q2_DATA_FILE)
test_id_data = np.load(data_in_path + TEST_ID_DATA_FILE)

In [38]:
# Test pipeline: no labels, so mapping_fn yields only the feature dict.
test_dataset = (
    tf.data.Dataset.from_tensor_slices((test_q1_data, test_q2_data))
    .batch(batch_size)
    .map(mapping_fn)
)

In [39]:
# Run the model over the batched test dataset and collect its outputs.
predictions = model.predict(test_dataset)

KeyboardInterrupt: 

In [0]:
print(len(predictions)) #2345796

# Pair every test id with its predicted score.
output = pd.DataFrame(data={"test_id":test_id_data, "is_duplicate": list(predictions)})

# Create the output folder if it is missing, so to_csv cannot fail on a missing directory.
os.makedirs(data_out_path, exist_ok=True)
# quoting=3 is csv.QUOTE_NONE: fields are written without any quoting.
output.to_csv(f"{data_out_path}rnn_predict.csv", index=False, quoting=3)

2345796
