In [None]:
!pip install ../input/shopee-external-models/Keras_Applications-1.0.8-py3-none-any.whl
!pip install ../input/shopee-external-models/efficientnet-1.1.0-py3-none-any.whl
import numpy as np 
import warnings
warnings.filterwarnings('ignore')
import pandas as pd 
import gc
from tqdm import tqdm
import tensorflow as tf
from tensorflow.keras.models import load_model
from efficientnet.tfkeras import EfficientNetB3
from sklearn.feature_extraction.text import TfidfVectorizer
import torch
# Seed PyTorch's RNG, then configure cuDNN: determinism is switched off and
# benchmark mode is switched on.
torch.manual_seed(0)
torch.backends.cudnn.deterministic = False
torch.backends.cudnn.benchmark = True

In [None]:
# Pick the set of postings to process. When test.csv has at most 3 rows
# (presumably the placeholder file of a non-scoring run — confirm) the training
# set is used instead; otherwise the test set itself is processed. Either way
# the working frame is bound to `train`, the name the later cells rely on.
test=pd.read_csv(r'../input/shopee-product-matching/test.csv')
if len(test)<=3:
    train=pd.read_csv(r'../input/shopee-product-matching/train.csv')
    image_paths='../input/shopee-product-matching/train_images/'+train.image
else:
    # Reuse the frame that was just parsed instead of reading test.csv again.
    train=test
    image_paths='../input/shopee-product-matching/test_images/'+train.image

train.shape

In [None]:
# The name `test` is not used again; drop it and run a garbage-collection pass.
del test
gc.collect()

In [None]:
import re
def preprocess(description):
    """Normalise a title into lowercase, space-separated alphanumeric tokens.

    Steps, in order: lowercase the text, turn runs of hyphens/newlines/tabs
    into a space, expand common English contractions, replace every remaining
    run of characters outside ``[a-z0-9]`` with a single space, then collapse
    whitespace and trim both ends.

    Note from the original author: the contraction handling is not strictly
    required, since titles rarely contain conversational wording.

    Args:
        description: Raw title text (``str``).

    Returns:
        The cleaned title; an empty string if nothing alphanumeric remains.
    """
    # Order matters: the specific forms ("won't", "can't") must be expanded
    # before the generic "n't" and "'t" rules get a chance to match them.
    contractions = (
        (r"won\'t", "will not"),
        (r"can\'t", "can not"),
        (r"n\'t", " not"),
        (r"\'re", " are"),
        (r"\'s", " is"),
        (r"\'d", " would"),
        (r"\'ll", " will"),
        (r"\'t", " not"),
        (r"\'ve", " have"),
        (r"\'m", " am"),
    )
    description = description.lower()
    description = re.sub('[-\n\t]+', ' ', description)
    for pattern, expansion in contractions:
        description = re.sub(pattern, expansion, description)
    description = re.sub('[^a-z0-9]+', ' ', description)
    # Raw string: "\s" inside a plain literal is an invalid escape sequence.
    description = re.sub(r'\s+', ' ', description)
    return description.strip()

In [None]:
# Run preprocess() over every title, with a progress bar over the iteration.
cleansed_train=list(map(preprocess,tqdm(train.title.values)))

In [None]:
%%time
# Fit a binary-term TF-IDF model on the cleaned titles, keeping at most 20000
# vocabulary features.
idf=TfidfVectorizer(binary=True, max_features=20000)
# fit_transform returns a sparse matrix; toarray() densifies it so that it can
# be wrapped by torch.from_numpy in a later cell.
embedded_train_idf=idf.fit_transform(cleansed_train).toarray()
embedded_train_idf.shape

In [None]:
# Only the dense TF-IDF matrix is needed from here on; release the cleaned
# titles and the fitted vectorizer.
del cleansed_train, idf
gc.collect()

In [None]:
# Wrap the dense TF-IDF matrix as a torch tensor and move it onto the GPU.
embedded_train_idf=torch.from_numpy(embedded_train_idf).cuda()

In [None]:
# Text matching: for every posting, collect the posting_ids whose TF-IDF dot
# product with it is >= 0.73 (a cosine similarity provided the rows are
# unit-length, which is TfidfVectorizer's default normalisation). When more
# than 50 postings qualify, only those scoring at least as high as the 50th
# best are kept; ties at that score are all kept, so a list may slightly
# exceed 50 entries. Ids are emitted in row order, not in similarity order.
idf_matches=[]
chunksize=512
# Number of chunks needed to cover every row (ceiling division).
chunks=(len(train)+chunksize-1)//chunksize
# Loop-invariant: a plain array of ids is much cheaper to index per posting
# than going through train.iloc[...] every time.
posting_ids=train.posting_id.values
for chunk in tqdm(range(chunks)):
    start=chunk*chunksize
    end=min(len(train),start+chunksize)
    # One row per posting of the current chunk, one column per posting overall.
    cossim=torch.matmul(embedded_train_idf,embedded_train_idf[start:end].T).T
    cossim=cossim.data.cpu().numpy()
    for per_posting in cossim:
        indices=np.where(per_posting>=0.73)[0]
        if len(indices)>50:
            # More than 50 rows pass the threshold, so the 50th-largest score
            # overall is also the 50th-largest among the candidates. A partial
            # partition finds it without fully sorting the row.
            candidates=per_posting[indices]
            kth_score=np.partition(candidates,-50)[-50]
            indices=indices[candidates>=kth_score]
        idf_matches.append(posting_ids[indices].tolist())

In [None]:
# Drop the TF-IDF tensor. Deleting it only returns the memory to PyTorch's
# caching allocator, so the cache is emptied explicitly to make that GPU
# memory available to TensorFlow, which is configured and used next.
del embedded_train_idf
gc.collect()
torch.cuda.empty_cache()

In [None]:
# Cap the amount of GPU memory TensorFlow may allocate on the first GPU.
# LIMIT is expressed in GB and multiplied by 1024 when passed as memory_limit.
LIMIT = 3.0
gpus = tf.config.experimental.list_physical_devices('GPU')
if gpus:
    try:
        tf.config.experimental.set_virtual_device_configuration(
            gpus[0],
            [tf.config.experimental.VirtualDeviceConfiguration(memory_limit=1024*LIMIT)])
        logical_gpus = tf.config.experimental.list_logical_devices('GPU')
        #print(len(gpus), "Physical GPUs,", len(logical_gpus), "Logical GPUs")
    except RuntimeError as e:
        # A configuration failure is printed rather than raised, so the
        # notebook keeps running without the cap.
        print(e)
print('TensorFlow Limit {}GB'.format(LIMIT))
# NOTE(review): this message assumes a 16 GB GPU and mentions RAPIDS, which is
# not imported in the visible cells; the remainder is presumably what PyTorch
# gets to use — confirm.
print('RAPIDS Limit {}GB'.format(16-LIMIT))

In [None]:
# Load the saved image-embedding model from its .h5 file.
# NOTE(review): presumably relies on the `efficientnet.tfkeras` import at the
# top of the notebook to resolve the model's custom layers — confirm.
eff_b3_embedder=load_model('../input/shopee-arcface-trained-image-embedders/EFNetb3_embedder_50_epochs.h5')

In [None]:
def read_image(image):
    """Load one image file and return it as a float32 tensor scaled to [0, 1].

    Args:
        image: Path of the image file to read.

    Returns:
        The image decoded as 3-channel JPEG, resized to 512x512, cast to
        float32 and divided by 255.
    """
    raw_bytes = tf.io.read_file(image)
    decoded = tf.image.decode_jpeg(raw_bytes, channels = 3)
    resized = tf.image.resize(decoded,(512,512))
    return tf.cast(resized, tf.float32) / 255.0

def get_dataset(image):
    """Build a batched, prefetching tf.data pipeline over image paths.

    Args:
        image: Sequence of image file paths, one dataset element per path.

    Returns:
        A dataset that maps each path through ``read_image`` in parallel,
        groups the results into batches of 32 and prefetches them.
    """
    autotune = tf.data.experimental.AUTOTUNE
    return (
        tf.data.Dataset.from_tensor_slices(image)
        .map(read_image, num_parallel_calls = autotune)
        .batch(32)
        .prefetch(autotune)
    )

In [None]:
# Embed every image with the loaded model, feeding the paths in slices of
# `chunksize` so that a separate dataset is built and predicted per slice.
image_embeddings_effb3=[]
chunksize=4096
# Number of chunks needed to cover every row (ceiling division).
chunks=(len(train)+chunksize-1)//chunksize
for chunk in tqdm(range(chunks)):
    start=chunk*chunksize
    end=min(len(train),start+chunksize)
    images=get_dataset(image_paths[start:end])
    embeddings=eff_b3_embedder.predict(images)
    image_embeddings_effb3.append(embeddings)
# Concatenate the per-chunk arrays directly. Wrapping the list in np.array()
# first builds a ragged array whenever the last chunk is shorter than the
# others, which recent NumPy versions reject.
image_embeddings_effb3=np.concatenate(image_embeddings_effb3)
image_embeddings_effb3.shape

In [None]:
# The model and the path list are not used again in the cells below; release them.
del eff_b3_embedder, image_paths
gc.collect()

In [None]:
# L2-normalise every embedding row so that the dot products computed later are
# cosine similarities. Done in one vectorised step over the (rows, features)
# matrix instead of a Python loop per row.
row_norms=np.linalg.norm(image_embeddings_effb3,axis=1,keepdims=True)
# Leave all-zero rows as zeros: dividing them by their zero norm would turn
# them into NaN and propagate NaN into every similarity row.
row_norms[row_norms==0]=1
image_embeddings_effb3=image_embeddings_effb3/row_norms
image_embeddings_effb3.shape

In [None]:
# Turn the normalised embedding matrix into a torch tensor living on the GPU.
embedded_train_eff=torch.from_numpy(image_embeddings_effb3).cuda()

In [None]:
# The GPU tensor is what the similarity search uses; drop the NumPy array.
del image_embeddings_effb3
gc.collect()

In [None]:
# Image matching: for every posting, collect the posting_ids whose embedding
# dot product with it is >= 0.6 (a cosine similarity, since the embeddings
# were L2-normalised before being moved to the GPU). When more than 50
# postings qualify, only those scoring at least as high as the 50th best are
# kept; ties at that score are all kept, so a list may slightly exceed 50
# entries. Ids are emitted in row order, not in similarity order.
image_matches=[]
chunksize=2048
# Number of chunks needed to cover every row (ceiling division).
chunks=(len(train)+chunksize-1)//chunksize
# Loop-invariant: a plain array of ids is much cheaper to index per posting
# than going through train.iloc[...] every time.
posting_ids=train.posting_id.values
for chunk in tqdm(range(chunks)):
    start=chunk*chunksize
    end=min(len(train),start+chunksize)
    # One row per posting of the current chunk, one column per posting overall.
    cossim=torch.matmul(embedded_train_eff,embedded_train_eff[start:end].T).T
    cossim=cossim.data.cpu().numpy()
    for per_posting in cossim:
        indices=np.where(per_posting>=0.6)[0]
        if len(indices)>50:
            # More than 50 rows pass the threshold, so the 50th-largest score
            # overall is also the 50th-largest among the candidates. A partial
            # partition finds it without fully sorting the row.
            candidates=per_posting[indices]
            kth_score=np.partition(candidates,-50)[-50]
            indices=indices[candidates>=kth_score]
        image_matches.append(posting_ids[indices].tolist())

In [None]:
# Merge the two match lists of every posting: image matches first, then text
# matches, keeping only the first occurrence of each posting_id, and join the
# result into one space-separated string.
combined=[]
for i in tqdm(range(len(image_matches))):
    merged_ids=image_matches[i]+idf_matches[i]
    # dict.fromkeys de-duplicates while preserving first-seen order, and works
    # on the plain lists, so an empty list on either side needs no special care.
    combined.append(' '.join(dict.fromkeys(merged_ids)))

In [None]:
# The per-modality match lists have been merged into `combined`; release them.
del image_matches, idf_matches
gc.collect()

In [None]:
# Attach the merged match strings to the frame as the `matches` column and
# preview the first rows.
train['matches']=combined
train.head()

In [None]:
# The strings now live in train['matches']; release the list.
del combined
gc.collect()

In [None]:
# Keep only the two columns that are written to the submission file.
submission=train[['posting_id','matches']]
submission.head()

In [None]:
# Only `submission` is needed from here on; release the full frame.
del train
gc.collect()

In [None]:
# Write the result to submission.csv without the DataFrame index column.
submission.to_csv('submission.csv',index=False)