In [None]:
# !unzip /content/drive/Shareddrives/DATASET/ndsc-product-matching.zip -d /content/data/

In [None]:
# Copy the raw dataset folder from Google Drive into the local Colab filesystem
# (ends up as /content/raw). Assumes Drive is already mounted at /content/drive.
!cp -r '/content/drive/MyDrive/Colab Projects/product-pair-matching/data/raw' '/content/'

In [None]:
import os
import re
import numpy as np
import pandas as pd
import tensorflow as tf
import tensorflow_addons as tfa
import tensorflow.keras.backend as K
from tensorflow.keras import layers, Model
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score, accuracy_score

# Sentinel passed to tf.data calls (map / prefetch) so TensorFlow tunes the
# parallelism and buffer sizes itself.
AUTOTUNE = tf.data.AUTOTUNE

In [None]:
class cfg:
    """Notebook-wide settings: dataset locations and training hyper-parameters."""

    # Directory that holds the raw CSV files.
    DATA_PATH = '/content/raw'
    CSV_TRAIN_PATH = os.path.join(DATA_PATH, 'new_training_set.csv')
    CSV_TEST_PATH = os.path.join(DATA_PATH, 'new_test_set.csv')

    # Training hyper-parameters.
    BATCH_SIZE = 32
    EPOCHS = 25

    # Output root directory (not referenced by the visible cells).
    BASE_OUTPUT = '/content/outputs'

In [None]:
# Load the labelled pairs; the first CSV column is used as the index.
train_df = pd.read_csv(cfg.CSV_TRAIN_PATH, index_col=0)

# 70 % train / 30 % hold-out, then the hold-out is split 70/30 again into
# validation and test. Fixed seeds keep the splits reproducible.
train, hold = train_test_split(train_df, test_size=0.3, random_state=42)
valid, test = train_test_split(hold, test_size=0.3, random_state=42)

# DEBUG runs a tiny slice of every split for a quick end-to-end smoke test.
DEBUG = False
if DEBUG:
    cfg.EPOCHS = 2
    train_data = train.head(100).to_numpy()
    valid_data = valid.head(20).to_numpy()
    # test_data must exist in DEBUG mode too: later cells preprocess it and
    # predict on it, and would otherwise fail with a NameError.
    test_data = test.head(20).to_numpy()
else:
    train_data = train.to_numpy()
    valid_data = valid.to_numpy()
    test_data = test.to_numpy()

## Preprocessing

In [None]:
# Marketing / filler words that carry no product information and are dropped
# from titles before tokenisation.
default_stop_words = [
    'atau', 'dan', 'and', 'murah', 'grosir',
    'untuk', 'termurah', 'cod', 'terlaris', 'bisacod', 'terpopuler',
    'bisa', 'terbaru', 'tempat', 'populer', 'di', 'sale', 'bayar', 'flash',
    'promo', 'seler', 'in', 'salee', 'diskon', 'gila', 'starseller', 'seller'
]

def preprocess_text(text):
    """Normalise a product title into a lower-case, alphanumeric string.

    Steps, in order:
      1. cast to ``str`` and lower-case;
      2. drop whitespace-separated words listed in ``default_stop_words``;
      3. spell out ``&`` as ``and`` and the first ``/`` as ``atau``;
      4. replace every run of non-alphanumeric characters with one space;
      5. re-attach a stray ``s`` (e.g. left over from ``mom's``) to the
         preceding word;
      6. surround every digit run with spaces so numbers become separate
         tokens (this can leave doubled spaces inside the result).

    Args:
        text: Raw title; any object is accepted and converted with ``str``.

    Returns:
        The cleaned title, stripped of leading/trailing whitespace.
    """
    # Set membership is O(1) per word; built per call so later edits to the
    # module-level list are still honoured.
    stop_words = frozenset(default_stop_words)

    s = str(text).lower()
    s = ' '.join(word for word in s.split() if word not in stop_words)
    s = re.sub('&', ' and ', s)
    # Pad the replacement with spaces (as done for '&') so "a/b" becomes
    # "a atau b" instead of gluing both words into "aataub". Only the first
    # slash is spelled out; any others turn into plain spaces below.
    s = re.sub('/', ' atau ', s, count=1)
    s = re.sub(r"[^a-zA-Z0-9]+", ' ', s)
    s = re.sub(' s ', 's ', s)
    # NOTE(review): dots were already removed by the substitution above, so
    # the optional decimal part of this pattern can never match here.
    s = re.sub(r"([0-9]+(\.[0-9]+)?)", r" \1 ", s).strip()
    return s

def preprocess_data(data, text_columns=(0, 2)):
    """Return a copy of ``data`` whose title columns have been cleaned.

    The input array is left untouched, so re-running a cell that calls this
    function never re-processes already cleaned text by accident.

    Args:
        data: 2-D NumPy array with one row per example.
        text_columns: Indices of the columns holding raw titles. Defaults to
            ``(0, 2)``, the two title columns used throughout this notebook.

    Returns:
        A new array of the same shape with ``preprocess_text`` applied to
        every value of the selected columns; other columns are copied as-is.
    """
    cleaned = data.copy()
    for col in text_columns:
        cleaned[:, col] = np.array(
            [preprocess_text(title) for title in data[:, col]], dtype=object)
    return cleaned

In [None]:
%%time
# Clean the two title columns of every split and rebind each name to the
# array returned by preprocess_data.
train_data = preprocess_data(train_data)
valid_data = preprocess_data(valid_data)
test_data = preprocess_data(test_data)

CPU times: user 296 ms, sys: 3.03 ms, total: 299 ms
Wall time: 300 ms


In [None]:
def map_func(title_1, title_2, label):
    """Pack one example into the ``(features, label)`` pair fed to Keras.

    The feature dict is keyed by ``'title_1'`` / ``'title_2'``, the names of
    the model's two input layers.
    """
    features = {'title_1': title_1, 'title_2': title_2}
    return features, label

def create_dataset(data, batch_size=cfg.BATCH_SIZE, shuffle=True, repeat=True,
                   shuffle_buffer=1024):
    """Build a batched ``tf.data`` pipeline from an array of title pairs.

    Args:
        data: 2-D array; column 0 and column 2 hold the two titles and
            column 4 holds the label (cast to int).
        batch_size: Number of examples per batch.
        shuffle: Shuffle examples with a buffer of ``shuffle_buffer``
            elements. Disable for deterministic evaluation / prediction.
        repeat: Repeat the data indefinitely. When enabled, callers must pass
            explicit step counts to ``fit`` / ``evaluate``. Disable to get a
            finite, single-pass dataset.
        shuffle_buffer: Size of the shuffle buffer; only used when
            ``shuffle`` is true.

    Returns:
        A ``tf.data.Dataset`` yielding
        ``({'title_1': ..., 'title_2': ...}, label)`` batches. With the
        default arguments the pipeline is shuffled and infinite.
    """
    title_1 = data[:, 0]
    title_2 = data[:, 2]
    label = np.array(data[:, 4], dtype='int')

    ds = tf.data.Dataset.from_tensor_slices((title_1, title_2, label))
    ds = ds.map(map_func, num_parallel_calls=AUTOTUNE)
    if shuffle:
        ds = ds.shuffle(buffer_size=shuffle_buffer)
    if repeat:
        ds = ds.repeat()
    ds = ds.batch(batch_size)
    # Prefetch so the next batch is prepared while the current one is in use.
    return ds.prefetch(buffer_size=AUTOTUNE)

In [None]:
%%time
# Build the input pipelines for training and validation. With the default
# arguments both are shuffled and repeat indefinitely, which is why explicit
# step counts are passed to model.fit later on.
train_ds = create_dataset(train_data)
valid_ds = create_dataset(valid_data)

CPU times: user 485 ms, sys: 318 ms, total: 803 ms
Wall time: 819 ms


In [None]:
# Sanity check: un-batch the validation pipeline and print a single example
# (its label followed by both titles, shown as raw bytes).
for _data, _label in valid_ds.unbatch().take(1):
    print('Label {}'.format(_label.numpy()))
    print(_data['title_1'].numpy())
    print(_data['title_2'].numpy())

Label 1
b'minyak ikan ecer  10  caps repack atau vitamin kucing anjing vitamin bulu kucing anjing tung hai'
b'minyak ikan ecer  100  caps repack atau vitamin anjing kucing'


## Model

In [None]:
def create_text_extractor(vectorize_layer):
    """Build the title encoder shared by both branches of the siamese model.

    Pipeline: raw string -> token ids (``vectorize_layer``) -> 64-d
    embeddings (index 0 is masked as padding) -> GRU with 64 units ->
    48-unit ReLU dense layer.

    Args:
        vectorize_layer: An already adapted text-vectorization layer; its
            vocabulary size determines the embedding table size.

    Returns:
        A Keras ``Model`` named ``'text_extractor'`` mapping a string input
        of shape ``(1,)`` to a 48-dimensional feature vector.
    """
    text_in = layers.Input(shape=(1,), dtype='string', name='input_text')
    token_ids = vectorize_layer(text_in)

    vocab_len = len(vectorize_layer.get_vocabulary())
    embedded = layers.Embedding(vocab_len, output_dim=64, mask_zero=True)(token_ids)
    encoded = layers.GRU(64)(embedded)
    features = layers.Dense(48, activation='relu', name='output_text')(encoded)

    return Model(text_in, features, name='text_extractor')

In [None]:
def euclidean_distance(vectors):
    """Row-wise Euclidean distance between two batches of feature vectors.

    Args:
        vectors: Pair ``(feats_a, feats_b)`` of equally shaped 2-D tensors.

    Returns:
        Tensor of shape ``(batch, 1)`` holding the distance for each row.
    """
    feats_a, feats_b = vectors
    difference = feats_a - feats_b
    squared_sum = K.sum(K.square(difference), axis=1, keepdims=True)
    # Clamp to epsilon before the square root so identical inputs do not
    # produce sqrt(0), whose gradient is undefined.
    clamped = K.maximum(squared_sum, K.epsilon())
    return K.sqrt(clamped)

In [None]:
def create_vectorize_layer(vocab, vocab_size=10000, sequence_length=128):
    """Create a text-vectorization layer and fit its vocabulary.

    Args:
        vocab: Corpus of raw texts handed to ``adapt`` to build the
            vocabulary (a collection of strings, not a word list).
        vocab_size: Upper bound on the vocabulary size (``max_tokens``).
        sequence_length: Fixed length of every output token-id sequence.

    Returns:
        The adapted ``TextVectorization`` layer producing integer token ids.
    """
    text_vectorizer = layers.experimental.preprocessing.TextVectorization(
        max_tokens=vocab_size,
        output_mode='int',
        output_sequence_length=sequence_length,
    )
    text_vectorizer.adapt(vocab)
    return text_vectorizer

In [None]:
# Start from a clean Keras graph so re-running this cell does not pile up
# layers / name suffixes from earlier runs.
tf.keras.backend.clear_session()

# One string input per title of the pair.
title_1 = layers.Input(shape=(1,), dtype='string', name='title_1')
title_2 = layers.Input(shape=(1,), dtype='string', name='title_2')

# Fit the vocabulary on the TRAINING titles only. Including validation titles
# leaks held-out data into the model and makes validation scores optimistic:
# words that appear only in validation must map to the out-of-vocabulary
# token, exactly as unseen test words will.
vectorize_layer = create_vectorize_layer(
    vocab=np.concatenate([train_data[:, 0], train_data[:, 2]]))

# The same encoder (shared weights) embeds both titles.
text_extractor = create_text_extractor(vectorize_layer)
feats_text_1 = text_extractor(title_1)
feats_text_2 = text_extractor(title_2)

# The distance between the two embeddings feeds a single sigmoid unit that
# outputs the match probability.
distance = layers.Lambda(euclidean_distance)([feats_text_1, feats_text_2])
outputs = layers.Dense(1, activation="sigmoid", dtype='float32', name='final_dense')(distance)
model = Model(inputs=[title_1, title_2], outputs=outputs, name='siamese_networks')

In [None]:
# F1 for the single sigmoid output, with predictions thresholded at 0.5.
metrics_f1 = tfa.metrics.F1Score(num_classes=1,threshold=0.5, name='f1')
# Binary cross-entropy matches the 0/1 labels and the sigmoid output layer;
# accuracy and F1 are tracked alongside the loss.
model.compile(optimizer='adam',
              loss='binary_crossentropy',
              metrics=['accuracy', metrics_f1])

## Training

In [None]:
# Both pipelines repeat indefinitely, so Keras needs explicit step counts to
# know where an epoch / validation pass ends. Clamp to at least one step: a
# split smaller than one batch (e.g. the 20-row DEBUG validation set with
# BATCH_SIZE = 32) would otherwise yield 0 steps.
steps_per_epoch = max(1, len(train_data) // cfg.BATCH_SIZE)
validation_steps = max(1, len(valid_data) // cfg.BATCH_SIZE)

# Keep the History object so the training curves can be inspected afterwards.
history = model.fit(train_ds,
                    validation_data=valid_ds,
                    epochs=cfg.EPOCHS,
                    steps_per_epoch=steps_per_epoch,
                    validation_steps=validation_steps)

Epoch 1/25
Epoch 2/25
Epoch 3/25
Epoch 4/25
Epoch 5/25
Epoch 6/25
Epoch 7/25
Epoch 8/25
Epoch 9/25
Epoch 10/25
Epoch 11/25
Epoch 12/25
Epoch 13/25
Epoch 14/25
Epoch 15/25
Epoch 16/25
Epoch 17/25
Epoch 18/25
Epoch 19/25
Epoch 20/25
Epoch 21/25
Epoch 22/25
Epoch 23/25
Epoch 24/25
Epoch 25/25


<tensorflow.python.keras.callbacks.History at 0x7f5d903a89e8>

## Prediction

In [None]:
# Score the held-out test split with the trained model.
pred_data = test_data.copy()
pred = model.predict({
    'title_1': pred_data[:, 0],
    'title_2': pred_data[:, 2]})

# Ground-truth labels (column 4) and hard 0/1 predictions, computed once and
# shared by both metrics.
y_true = pred_data[:, 4].astype(int)
y_pred = np.hstack(np.round(pred)).astype(int)

print('Accuracy: {}'.format(accuracy_score(y_true, y_pred)))
print('F1 Score: {}'.format(f1_score(y_true, y_pred)))

Accuracy: 0.7142857142857143
F1 Score: 0.7713787085514834
