# 1. Loading

In [None]:
import tensorflow as tf
# Amount of GPU memory (in GB, per the messages printed below) TensorFlow may use.
LIMIT = 1

# Cap TensorFlow's share of the first GPU; the printed messages indicate the
# remainder is meant for RAPIDS.
gpus = tf.config.experimental.list_physical_devices('GPU')
if not gpus:
    print("GPU is not running")
else:
    try:
        memory_cap = tf.config.experimental.VirtualDeviceConfiguration(memory_limit = 1024 * LIMIT)
        tf.config.experimental.set_virtual_device_configuration(gpus[0], [memory_cap])
        logical_gpus = tf.config.experimental.list_logical_devices('GPU')
        print('We will restrict TensorFlow to max %iGB GPU RAM'%LIMIT)
        # NOTE(review): the 16 is presumably the card's total memory in GB -- confirm for the target GPU.
        print('then RAPIDS can use %iGB GPU RAM'%(16 - LIMIT))
    except RuntimeError as e:
        # The configuration call failed; report the error and carry on.
        print(e)

In [None]:
import numpy as np
import pandas as pd
# Read the competition training table and preview it.
TRAIN_CSV = "../input/shopee-product-matching/train.csv"
df = pd.read_csv(TRAIN_CSV)
print(df.shape)
df.head()

In [None]:
# Set DEBUG = True to work on a small random subset for fast iteration.
DEBUG = False
if DEBUG:
    # Fixed seed so the debugging subset is identical on every run; the sample size
    # is capped at the frame length so a small frame does not raise.
    df = df.sample(n = min(2000, len(df)), random_state = 42).reset_index(drop = True)
print(df.shape)

In [None]:
from sklearn.model_selection import GroupKFold
# Group-wise split on label_group so a group never appears in both train and valid.
groups = df["label_group"].values
gkf = GroupKFold(n_splits = 5)

# Only the last of the five folds is kept as the train/valid split.
train_idx, valid_idx = list(gkf.split(df, groups, groups))[-1]
train = df.iloc[train_idx, :].copy()
valid = df.iloc[valid_idx, :].copy()
print(train.shape, valid.shape)

In [None]:
# Display the training fold produced by the group-wise split.
train

# 2. Images

# 3. Autoencoder

In [None]:
import matplotlib.pyplot as plt
# The "seaborn-white" style was renamed to "seaborn-v0_8-white" in matplotlib 3.6 and
# the old alias was later removed, so use whichever name this matplotlib provides.
for _style_name in ("seaborn-v0_8-white", "seaborn-white"):
    if _style_name in plt.style.available:
        plt.style.use(_style_name)
        break
import tensorflow as tf

def train_preprocess(path, _):
    """Load one training image and return it as both input and target.

    Args:
        path: image file name relative to the train_images directory.
        _: second dataset element; ignored.

    Returns:
        (image, image): the decoded 3-channel image resized to 256x256,
        cast to float32 and divided by 255, repeated so the autoencoder
        reconstructs its own input.
    """
    image_dir = "../input/shopee-product-matching/train_images"
    raw_bytes = tf.io.read_file(image_dir + "/" + path)
    pixels = tf.image.decode_jpeg(raw_bytes, channels = 3)
    pixels = tf.image.resize(pixels, [256, 256])
    pixels = tf.cast(pixels, tf.float32) / 255.0
    return pixels, pixels

# Build (file name, label_group) datasets for each split and map them to
# (image, image) pairs; the label is dropped by train_preprocess.
train_ds = tf.data.Dataset.from_tensor_slices(
    (train["image"].values, train["label_group"].values)
).map(train_preprocess)
valid_ds = tf.data.Dataset.from_tensor_slices(
    (valid["image"].values, valid["label_group"].values)
).map(train_preprocess)

In [None]:
# Sanity check: show the first preprocessed training image and its shape.
image, _ = next(iter(train_ds))
fig, ax = plt.subplots()
ax.imshow(image)
plt.show()
print(image.shape)

In [None]:
# Sanity check: show the first preprocessed validation image and its shape.
image, _ = next(iter(valid_ds))
fig, ax = plt.subplots()
ax.imshow(image)
plt.show()
print(image.shape)

In [None]:
# Batch both pipelines. The datasets are rebound in place, so running this
# cell a second time would batch the already-batched datasets again.
train_ds = train_ds.batch(64)
train_ds = train_ds.prefetch(buffer_size = tf.data.experimental.AUTOTUNE)
valid_ds = valid_ds.batch(128)

In [None]:
import tensorflow.keras.layers as L
import tensorflow.keras.models as M

def autoencoder(input_shape):
    """Build a small convolutional autoencoder together with its encoder half.

    The encoder stacks three Conv2D -> BatchNormalization -> MaxPooling2D
    stages (16, 32, 64 filters). The decoder mirrors it with three
    Conv2D -> UpSampling2D stages (64, 32, 16 filters) and ends in a
    3-filter sigmoid convolution.

    Args:
        input_shape: shape handed to the Keras Input layer.

    Returns:
        (autoencoder, encoder): the reconstruction model, compiled with Adam
        and binary cross-entropy, and the encoder model (not compiled) built
        on the same input and encoder layers.
    """
    conv_kwargs = dict(kernel_size = (3, 3), padding = "same", activation = "relu")

    inputs = L.Input(shape = input_shape)

    # Encoder: each stage halves the spatial resolution.
    x = inputs
    for n_filters in (16, 32, 64):
        x = L.Conv2D(filters = n_filters, **conv_kwargs)(x)
        x = L.BatchNormalization()(x)
        x = L.MaxPooling2D(pool_size = (2, 2), padding = "same")(x)
    bottleneck = x

    # Decoder: each stage doubles the spatial resolution again.
    for n_filters in (64, 32, 16):
        x = L.Conv2D(filters = n_filters, **conv_kwargs)(x)
        x = L.UpSampling2D(size = (2, 2))(x)
    reconstruction = L.Conv2D(filters = 3, kernel_size = (3, 3), padding = "same", activation = "sigmoid")(x)

    encoder_model = M.Model(inputs = inputs, outputs = bottleneck)
    full_model = M.Model(inputs = inputs, outputs = reconstruction)
    full_model.compile(optimizer = "Adam", loss = "binary_crossentropy")
    return full_model, encoder_model

# Reset Keras' global state before building the models.
tf.keras.backend.clear_session()
# Note: this rebinds the name `autoencoder` from the builder function to the
# model it returns, so the builder is no longer reachable under that name
# until its `def` is executed again.
autoencoder, encoder = autoencoder((256, 256, 3))
autoencoder.summary()

In [None]:
# Toggle between training from scratch and loading previously saved weights.
TRAINING = False

if not TRAINING:
    autoencoder.load_weights("../input/shoppee-autoencoder0324/autoencoder.h5")
else:
    # With epochs = 1 the early-stopping patience of 3 is never reached.
    early_stop = tf.keras.callbacks.EarlyStopping(monitor = "val_loss", patience = 3, mode = "min")
    # Keep only the weights of the epoch with the lowest validation loss.
    checkpoint = tf.keras.callbacks.ModelCheckpoint(
        filepath = "autoencoder.h5",
        monitor = "val_loss",
        mode = "min",
        save_best_only = True,
        save_weights_only = True,
    )
    history = autoencoder.fit(
        train_ds,
        validation_data = valid_ds,
        epochs = 1,
        callbacks = [early_stop, checkpoint],
    )

In [None]:
# Show the first input image of the first validation batch.
first_batch = next(iter(valid_ds))
image = first_batch[0][0]
fig, ax = plt.subplots()
ax.imshow(image)
plt.show()

In [None]:
# Show the autoencoder's reconstruction of the first image of the first validation batch.
reconstructed = autoencoder.predict(next(iter(valid_ds))[0])
fig, ax = plt.subplots()
ax.imshow(reconstructed[0])
plt.show()

In [None]:
from tqdm.notebook import tqdm, trange
# Prefer the cuML nearest-neighbour implementation when TensorFlow sees a GPU;
# otherwise, or when cuML is not installed, use the scikit-learn one.
gpus = tf.config.experimental.list_physical_devices('GPU')
NearestNeighbors = None
if gpus:
    try:
        import cuml
        from cuml.neighbors import NearestNeighbors
    except ImportError as e:
        print("cuML is not available (%s); falling back to scikit-learn" % e)
if NearestNeighbors is None:
    from sklearn.neighbors import NearestNeighbors
import gc

# 4. Inference

In [None]:
# When test.csv has at most 3 rows, evaluate on the held-out validation fold
# (CV mode) using the training images; otherwise run on the test set.
test = pd.read_csv("../input/shopee-product-matching/test.csv")
CV = test.shape[0] <= 3

if CV:
    test = valid.copy()
    DIR = "../input/shopee-product-matching/train_images"
else:
    DIR = "../input/shopee-product-matching/test_images"
print(test.shape, DIR)

In [None]:
# Rows processed per chunk.
CHUNK = 1024
# Despite its name, CHUNK_SIZE is the number of CHUNK-row slices needed to
# cover every row of `test` (ceiling division).
CHUNK_SIZE = (test.shape[0] + CHUNK - 1) // CHUNK

In [None]:
def test_preprocess(path):
    """Load one image from the module-level DIR for inference.

    Args:
        path: image file name relative to DIR.

    Returns:
        The decoded 3-channel image resized to 256x256, cast to float32
        and divided by 255.
    """
    raw_bytes = tf.io.read_file(DIR + "/" + path)
    pixels = tf.image.decode_jpeg(raw_bytes, channels = 3)
    pixels = tf.image.resize(pixels, [256, 256])
    return tf.cast(pixels, tf.float32) / 255.0

KNN = 50
# Neighbours closer than this distance in the flattened encoder space count as matches.
DISTANCE_THRESHOLD = 5.0
knn = NearestNeighbors(n_neighbors = KNN)

# Predictions are collected positionally and written to the frame once at the end.
# Assigning through `test.iloc[a : b,]["image_preds"] = ...` is chained assignment:
# it targets a temporary object and does not reliably update `test`.
image_preds = np.empty(test.shape[0], dtype = object)
posting_ids = test["posting_id"].values

# Embeddings are computed and matched one chunk at a time, so neighbours are
# only searched among rows of the same chunk.
for chunk in range(CHUNK_SIZE):
    a = chunk * CHUNK
    b = min((chunk + 1) * CHUNK, test.shape[0])

    test_ds = tf.data.Dataset.from_tensor_slices((test.iloc[a : b]["image"].values))
    test_ds = test_ds.map(test_preprocess)
    test_ds = test_ds.batch(64 * 2)

    # Encode every image of the chunk and flatten each feature map to a vector.
    test_encoded = []
    for image in tqdm(test_ds):
        batch_size = image.shape[0]
        encoded = encoder.predict(image)
        test_encoded.append(encoded.reshape(batch_size, -1))
    test_encoded = np.concatenate(test_encoded, axis = 0)

    knn.fit(test_encoded)
    # A chunk can hold fewer than KNN rows (e.g. the last one); asking for more
    # neighbours than there are samples raises, so cap the request.
    distances, indices = knn.kneighbors(test_encoded, n_neighbors = min(KNN, b - a))
    del test_encoded; gc.collect()

    # `indices` are positions within the chunk, so look them up in the chunk's ids.
    chunk_ids = posting_ids[a : b]
    for i in range(b - a):
        idx = np.where(distances[i] < DISTANCE_THRESHOLD)[0]
        image_preds[a + i] = chunk_ids[indices[i, idx]]

    del distances, indices; gc.collect()

test["image_preds"] = image_preds
test.head()

In [None]:
# Join each row's de-duplicated predicted posting ids into the space-separated
# string written to the submission file.
test["matches"] = test["image_preds"].map(lambda preds: " ".join(np.unique(preds)))
submit = test.loc[:, ["posting_id", "matches"]].copy()
submit.to_csv("submission.csv", index = False)
submit