In [None]:
import numpy as np 
import pandas as pd
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
import string
import re
import nltk
from nltk.corpus import stopwords
from tensorflow.keras.layers.experimental.preprocessing import TextVectorization
from tensorflow.keras.applications import EfficientNetB0
import matplotlib.pyplot as plt
import seaborn as sns;sns.set()
import glob
import multiprocessing
import os
import cuml, cudf, cupy
from cuml.neighbors import NearestNeighbors
from cuml.feature_extraction.text import TfidfVectorizer

gpus = tf.config.list_physical_devices('GPU')
if gpus:
  # Cap TensorFlow at a single 1GB virtual device on the first physical GPU.
  try:
    first_gpu_limit = [
        tf.config.experimental.VirtualDeviceConfiguration(memory_limit=1024)
    ]
    tf.config.experimental.set_virtual_device_configuration(gpus[0], first_gpu_limit)
    logical_gpus = tf.config.experimental.list_logical_devices('GPU')
    print(len(gpus), "Physical GPUs,", len(logical_gpus), "Logical GPUs")
  except RuntimeError as err:
    # Raised when the GPUs were already initialized: the virtual device
    # configuration has to be applied before that point.
    print(err)


In [None]:
# Sentinel passed to tf.data (map parallelism / prefetch) so it tunes the values itself.
AUTOTUNE = tf.data.AUTOTUNE
# Number of images per batch produced by image_dataset.
BATCH_SIZE=16
# load_image resizes every image to IMG_SIZE x IMG_SIZE pixels.
IMG_SIZE=256

In [None]:
# Load the listing metadata for the test set.
df = pd.read_csv('../input/shopee-product-matching/test.csv')
# Only a cell's last expression is rendered, so print the row count explicitly
# and let the preview be the cell output.
print(len(df))
df.head()

In [None]:
# Build the full path of every image. The trailing '/' matters: without it the
# directory name and the file name are glued together into a non-existent path.
df['image_path'] = '../input/shopee-product-matching/test_images/' + df['image']
files = df['image_path'].values
# The bare file name is no longer needed once the full path exists.
df.drop(['image'], axis=1, inplace=True)

In [None]:
def load_image(img_path):
    """Read one JPEG file and return it as a float32 RGB image tensor.

    Args:
        img_path: Path of the JPEG file (a string or scalar string tensor).

    Returns:
        A float32 tensor of shape (IMG_SIZE, IMG_SIZE, 3). Pixel values are
        only cast, not rescaled.
    """
    raw = tf.io.read_file(img_path)
    # Force three channels so grayscale/CMYK files decode to the same shape as
    # RGB ones; otherwise batching images with mixed channel counts fails.
    img = tf.image.decode_jpeg(raw, channels=3)
    img = tf.cast(img, tf.float32)
    img = tf.image.resize(img, [IMG_SIZE, IMG_SIZE])
    return img

def image_dataset(file_paths, ordered=True, encoder=None):
    """Build a batched, prefetched tf.data pipeline of decoded images.

    Args:
        file_paths: Sequence of image file paths, one dataset element each.
        ordered: When True (default) elements keep the order of `file_paths`.
            When False the parallel map may yield elements out of order.
        encoder: Optional Keras model applied to every batch; when given, the
            dataset yields the model outputs instead of the raw image batches.

    Returns:
        A `tf.data.Dataset` of batches of at most BATCH_SIZE elements.
    """
    ds = tf.data.Dataset.from_tensor_slices(file_paths)
    ds = ds.map(
        load_image,
        num_parallel_calls=AUTOTUNE,
        # None keeps the dataset's default ordering behaviour; False allows
        # out-of-order results.
        deterministic=None if ordered else False,
    )
    ds = ds.batch(BATCH_SIZE)
    if encoder is not None:
        # Call the model directly: Model.predict cannot be used inside the
        # traced function that Dataset.map builds.
        ds = ds.map(lambda batch: encoder(batch, training=False))
    return ds.prefetch(AUTOTUNE)

In [None]:
def encoder_factory():
    """Create an EfficientNetB0 backbone with ImageNet weights, no top layers and average pooling."""
    return EfficientNetB0(include_top=False, weights='imagenet', pooling='avg')

from pathlib import Path
# Load the saved Keras model that serves as the image encoder.
encoder = keras.models.load_model('../input/efficientb0/model.hdf5')
dataset = image_dataset(files)

# Embed every image and keep a copy of the embeddings on disk.
encoded = encoder.predict(dataset)
np.save('encoded.npy',encoded)

# 50 neighbours when more than 50 images are available, otherwise 3, and
# never more neighbours than there are rows to search.
KNN = 50 if len(files)>50 else min(3, len(files))
# Pass n_neighbors by keyword: recent cuML estimators accept keyword arguments only.
neighbors = NearestNeighbors(n_neighbors=KNN)
neighbors.fit(encoded)
distances, indexes = neighbors.kneighbors(encoded)

In [None]:
def predict_same_image(df,threshold, distances,indexes):
    """Collect, for every row, the posting ids of its neighbours closer than `threshold`.

    Args:
        df: DataFrame with a 'posting_id' column; row order must match the
            rows of `distances` / `indexes`.
        threshold: Neighbours with a distance strictly below this value match.
        distances: 2-D array; distances[i] holds the neighbour distances of row i.
        indexes: 2-D array of the same shape; indexes[i] holds the positional
            row numbers of those neighbours.

    Returns:
        A list with one array of matching posting ids per row of `df`.
    """
    # Look ids up by position: the neighbour indexes are row positions, not
    # index labels, so label-based `.loc` breaks on any non-default index.
    posting_ids = df['posting_id'].to_numpy()
    match_column = []
    for i in range(len(df)):
        close_enough = distances[i] < threshold
        match_column.append(posting_ids[indexes[i][close_enough]])

    return match_column

#matches = predict_same_image(df, 7.08, distances, indexes)
#df['matches']=matches

In [None]:
# GPU copy of the listing table for the cuML text pipeline.
df_g = cudf.DataFrame(df)

# Binary TF-IDF over the titles, vocabulary capped at 25000 terms, no stop-word
# filtering; the result is densified so it can be multiplied with cupy below.
model = TfidfVectorizer(max_features=25000, binary=True, stop_words=None)
text_embeddings = model.fit_transform(df_g['title']).toarray()

In [None]:


preds = []
CHUNK = 1024*4

print('Finding similar titles...')
n_titles = len(df_g)
# Ceiling division: a trailing partial chunk still counts as a chunk.
CTS = -(-n_titles // CHUNK)
for j in range(CTS):
    a = CHUNK * j
    b = min(a + CHUNK, n_titles)
    print('chunk',a,'to',b)

    # Dot products of the chunk's titles (rows) against every title (columns).
    # NOTE(review): this is cosine similarity only if the TF-IDF rows are
    # L2-normalised - presumably the vectorizer default; confirm.
    cts = cupy.matmul(text_embeddings, text_embeddings[a:b].T).T

    for k in range(b - a):
        # Positions of all titles whose similarity to title a+k exceeds 0.75.
        hit_positions = cupy.asnumpy(cupy.where(cts[k] > 0.75)[0])
        preds.append(df.iloc[hit_positions].posting_id.values)

df['matches'] = preds

In [None]:
# Flatten each row's matched posting ids into one space-separated string and
# write the two-column submission file.
joined_matches = df['matches'].str.join(" ")
submission_frame = pd.DataFrame({
    'posting_id': df['posting_id'],
    'matches': joined_matches,
})
submission_frame.to_csv('./submission.csv', index=False)

In [None]:
# Render the submission table as the cell output for a visual check.
submission_frame