# Importing Utils and Reading Data

In [None]:
import torch

# Select the compute device: "cuda" when a CUDA GPU is available, else "cpu".
dev = "cuda" if torch.cuda.is_available() else "cpu"
if dev == "cuda":
    print(f"Available gpu is :{torch.cuda.get_device_name(dev)}.")
else:
    # torch.cuda.get_device_name() only accepts CUDA devices, so it must not
    # be queried on the CPU fallback path; report the selected device instead.
    print(f"GPU not available :{dev}.")

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import cv2 as cv
import tensorflow as tf
from tensorflow.keras.applications import EfficientNetB0
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.neighbors import NearestNeighbors
from IPython.display import clear_output
from sklearn.metrics.pairwise import cosine_similarity
import sys, os

In [None]:
# When SUBMIT is True the notebook reads the test CSV and test images;
# otherwise it reads the training CSV and training images.

SUBMIT = True
if SUBMIT:
    csv = "../input/shopee-product-matching/test.csv"
    img_dir = "../input/shopee-product-matching/test_images/"
else:
    csv = "../input/shopee-product-matching/train.csv"
    img_dir = "../input/shopee-product-matching/train_images/"

In [None]:
# Load the selected CSV into a DataFrame and preview its first five rows
# (five is the default row count of DataFrame.head).
dataset = pd.read_csv(csv)
dataset.head()

In [None]:
# tmp = dataset.groupby('label_group').posting_id.agg('unique').to_dict()
# dataset['target'] = dataset.label_group.map(tmp)
# dataset.head(5)

# Text Embedding

In [None]:
# Fit a TF-IDF vectorizer on the product titles and keep the resulting
# document-term matrix in X.
text_model = TfidfVectorizer()
X = text_model.fit_transform(dataset["title"].values)

In [None]:
# Query 50 neighbours when the dataset has more than 50 rows; otherwise only
# a single neighbour is requested.
KNN = 50 if len(dataset) > 50 else 1

# Fit the neighbour index on the TF-IDF matrix and query it with the same rows.
neighbors_model = NearestNeighbors(n_neighbors=KNN)
nbrs = neighbors_model.fit(X)
text_distances, text_indices = nbrs.kneighbors(X)

In [None]:
# For every row keep only the neighbour positions whose distance is below 0.8.
idxs = [
    neighbour_idx[neighbour_dist < 0.8].tolist()
    for neighbour_idx, neighbour_dist in zip(text_indices, text_distances)
]

# Translate the kept row positions into posting ids and store them per row.
pst = [dataset.posting_id.iloc[positions].tolist() for positions in idxs]
dataset['textEm'] = pst

In [None]:
# Drop the text-pipeline intermediates now that 'textEm' is stored on the
# DataFrame. Targets are deleted left to right, matching the original order.
del pst, idxs, text_indices, text_distances, nbrs, neighbors_model, X, text_model

# Image Embedding

In [None]:
# Alias for the image directory selected by the SUBMIT switch.
BASE=img_dir

In [None]:
class DataGenerator(tf.keras.utils.Sequence):
    """Keras ``Sequence`` that yields batches of resized images.

    Each batch is a float32 array of shape
    ``(n, img_size, img_size, 3)`` built from the files named in the
    ``image`` column of ``df``. No labels are produced, so the generator
    is suited to inference (``model.predict``).

    Args:
        df: DataFrame with an ``image`` column holding file names.
        img_size: Side length, in pixels, that every image is resized to.
        batch_size: Number of images per batch (the last batch may be smaller).
        path: Prefix prepended to each file name; it is concatenated as-is,
            so it is expected to end with a path separator.
    """

    def __init__(self, df, img_size=256, batch_size=32, path=BASE):
        # Let the Keras base class initialise its own state; newer Keras
        # versions expect subclasses to call this.
        super().__init__()
        self.df = df
        self.img_size = img_size
        self.batch_size = batch_size
        self.path = path
        self.indexes = np.arange(len(self.df))

    def __len__(self):
        """Return the number of batches per epoch, counting a partial last batch."""
        full_batches, remainder = divmod(len(self.df), self.batch_size)
        return full_batches + int(remainder != 0)

    def __getitem__(self, index):
        """Return the image batch at position ``index``."""
        start = index * self.batch_size
        indexes = self.indexes[start:start + self.batch_size]
        return self.__data_generation(indexes)

    def __data_generation(self, indexes):
        """Load and resize the images at the given row positions.

        Raises:
            OSError: If an image file cannot be read by OpenCV.
        """
        X = np.zeros(
            (len(indexes), self.img_size, self.img_size, 3), dtype='float32'
        )
        target_size = (self.img_size, self.img_size)
        for i, image_name in enumerate(self.df.iloc[indexes]["image"]):
            image_path = self.path + image_name
            # NOTE(review): cv.imread yields BGR channel order; the pretrained
            # weights presumably expect RGB -- confirm before changing, since
            # downstream distance thresholds depend on the current behaviour.
            img = cv.imread(image_path)
            if img is None:
                # cv.imread signals an unreadable or missing file by returning
                # None; fail with the offending path instead of an opaque
                # cv.resize error.
                raise OSError(f"Could not read image: {image_path}")
            # Pixel values are stored unscaled (the /128.0 - 1.0
            # normalisation is intentionally left disabled).
            X[i,] = cv.resize(img, target_size)
        return X

In [None]:
# Build EfficientNetB0 without its classification head, using average
# pooling and weights loaded from the local .h5 file.
model = EfficientNetB0(
    include_top=False,
    weights="../input/effnetb0/efficientnetb0_notop.h5",
    input_shape=None,
    pooling='avg',
)

# Run every image of the dataset through the network in batches of 64.
train_gen = DataGenerator(dataset, batch_size=64)
image_embeddings = model.predict(train_gen, verbose=1)
print('image embeddings shape is', image_embeddings.shape)

In [None]:
# The embeddings are computed; drop the generator and the network.
del train_gen, model

In [None]:
# sim=cosine_similarity(image_embeddings)

In [None]:
# scores=[]
# threshold=[0.7]
# for thresh in threshold:
#   idxs=[]
#   for i in range (sim.shape[0]):
#     idxs.append(np.argwhere(sim[i]>=thresh).reshape(1, -1).tolist()[0])
#   pst=[]
#   for i in idxs:
#     pst.append(dataset.posting_id.iloc[i].tolist())
#   dataset['imgEm']=pst

In [None]:
# del sim

In [None]:
# Query 50 neighbours when the dataset has more than 50 rows; otherwise only
# a single neighbour is requested.
KNN = 50 if len(dataset) > 50 else 1

# Fit the neighbour index on the image embeddings and query it with the same rows.
neighbors_model = NearestNeighbors(n_neighbors=KNN)
nbrs = neighbors_model.fit(image_embeddings)
img_distances, img_indices = nbrs.kneighbors(image_embeddings)

In [None]:
# Drop the neighbour-search objects; only the distances and indices are needed next.
del neighbors_model, nbrs, KNN

In [None]:
# For every row keep only the neighbour positions whose distance is below 7.
idxs = [
    neighbour_idx[neighbour_dist < 7].tolist()
    for neighbour_idx, neighbour_dist in zip(img_indices, img_distances)
]

# Translate the kept row positions into posting ids and store them per row.
pst = [dataset.posting_id.iloc[positions].tolist() for positions in idxs]
dataset['imgEm'] = pst

# F1 Score

In [None]:
# tmp = dataset.groupby('label_group').posting_id.agg('unique').to_dict()
# dataset['target'] = dataset.label_group.map(tmp)

In [None]:
# dataset['matches']=0
# for i in range (dataset.shape[0]):
#   dataset.matches.iloc[i]=dataset.textEm.iloc[i]+dataset.imgEm.iloc[i]
#   dataset.matches.iloc[i]=np.unique(dataset.matches.iloc[i])

In [None]:
# np.unique(dataset.matches.values[0])

In [None]:
def concMet(row):
    """Merge a row's text and image matches into one space-separated string.

    The ids from ``row.textEm`` and ``row.imgEm`` are de-duplicated, sorted,
    and joined with single spaces.
    """
    merged_ids = set(row.textEm).union(row.imgEm)
    return ' '.join(sorted(merged_ids))

In [None]:
# Apply concMet to every row (axis=1) and store the result in the 'matches' column.
dataset['matches']=dataset.apply(concMet, axis=1)

In [None]:
# Preview the first five rows, now including the 'matches' column.
dataset.head(5)

In [None]:
# f1=[]
# for i in range (len(dataset)):
#     a=len(np.intersect1d(dataset.iloc[i].target, dataset.iloc[i].matches))
#     b=len(dataset.iloc[i].target) + len(dataset.iloc[i].matches)
#     f1.append(2*a/b)
# print ("F1 Score: ", np.mean(f1))

In [None]:
# Write only the posting id and its matches to the submission file, without
# the DataFrame index.
dataset.to_csv('submission.csv', columns=['posting_id', 'matches'], index=False)

# Read the file back and preview it as a sanity check.
sub = pd.read_csv('submission.csv')
sub.head()