In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import cuml
import spacy
import nltk
import re
import os
import gc
import cv2
import warnings
warnings.filterwarnings('ignore')

import tensorflow as tf
import tensorflow.keras as keras
import tensorflow.keras.backend as K
from tensorflow.keras import layers, regularizers

from cuml import metrics
from tqdm import tqdm
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
# from sklearn import metrics

In [None]:
# Upper bound (in GB) on the GPU memory TensorFlow may allocate; the rest of
# the card is left for the RAPIDS/cuML steps later in the notebook.
LIMIT = 6

gpus = tf.config.experimental.list_physical_devices('GPU')
if len(gpus) > 0:
    try:
        # memory_limit is 1024 * LIMIT, i.e. LIMIT GB expressed in MB
        tf_cap = tf.config.experimental.VirtualDeviceConfiguration(memory_limit=1024*LIMIT)
        tf.config.experimental.set_virtual_device_configuration(gpus[0], [tf_cap])
        logical_gpus = tf.config.experimental.list_logical_devices('GPU')
        print(len(gpus), "Physical GPUs,", len(logical_gpus), "Logical GPUs")
    except RuntimeError as e:
        # Only reported, not re-raised: the notebook carries on without the cap
        print(e)

# NOTE(review): the 16 below presumably is the total GPU memory in GB - confirm for the target card
print('We will restrict TensorFlow to max %iGB GPU RAM' % LIMIT)
print('then RAPIDS can use %iGB GPU RAM' % (16-LIMIT))

In [None]:
# Root folder of the competition data; the image sub-folders are resolved
# relative to it further down.
PATH = '../input/shopee-product-matching/'

# Read both metadata tables in one go and report their (rows, columns) shapes
train, test = (pd.read_csv(PATH + csv_name) for csv_name in ('train.csv', 'test.csv'))
print(train.shape, test.shape)

In [None]:
# train = train.groupby('label_group').head(2).reset_index(drop=True)
# train

In [None]:
# Image generator used for training batches.
# No `rescale` on purpose: keras.applications.EfficientNet* models carry their
# own rescaling/normalisation layers and expect raw pixel values in [0, 255].
# The embedding-extraction loop later in this notebook also feeds raw 0-255
# pixels, so training and inference now see the same input range.
gen = keras.preprocessing.image.ImageDataGenerator()
gen

In [None]:
def input_arcface(df, batch_size=32, image_dir=None, target_size=(256, 256)):
    """Endless generator of ``([images, labels], labels)`` batches for ArcFace training.

    The labels are yielded twice because the ArcFace head needs them as a
    second model input (to apply the margin) in addition to being the target.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain an ``image`` column (file names) and a ``label_group``
        column (class labels).
    batch_size : int
        Number of images per batch.
    image_dir : str, optional
        Folder holding the image files. Defaults to ``PATH + 'train_images/'``.
    target_size : tuple of int
        ``(height, width)`` every image is resized to.

    Yields
    ------
    ([images, labels], labels)
        Whatever ``gen.flow_from_dataframe`` produces for images and labels
        (presumably one-hot labels, since the default class mode is used and
        the model is compiled with categorical cross-entropy - confirm).
    """
    if image_dir is None:
        # Reuse the notebook-level data root instead of repeating the literal path
        image_dir = PATH + 'train_images/'

    img_gen = gen.flow_from_dataframe(df, image_dir,
                                      x_col='image',
                                      y_col='label_group',
                                      target_size=target_size,
                                      batch_size=batch_size,
                                      color_mode="rgb",
                                      )
    while True:
        # Built-in next() instead of the legacy iterator.next() method
        images, labels = next(img_gen)
        yield [images, labels], labels

In [None]:
class ArcFace(layers.Layer):
    """Classification head with an additive angular margin (ArcFace).

    The layer takes ``[features, labels]`` and returns softmax probabilities
    over ``n_classes``. Features and class weights are L2-normalised so their
    dot product is a cosine; for the entries selected by ``labels`` the angle
    is increased by ``m`` before everything is multiplied by ``s``.

    Parameters
    ----------
    n_classes : int
        Number of output classes (columns of the weight matrix).
    s : float
        Scale applied to the logits before the softmax.
    m : float
        Margin added to the angle of the labelled class.
    regularizer : optional
        Anything accepted by ``keras.regularizers.get``; applied to the weights.
    """

    def __init__(self, n_classes=10, s=30.0, m=0.50, regularizer=None, **kwargs):
        super(ArcFace, self).__init__(**kwargs)
        self.n_classes = n_classes
        self.s = s
        self.m = m
        self.regularizer = regularizers.get(regularizer)

    def build(self, input_shape):
        """Create the class-weight matrix; ``input_shape[0]`` is the feature shape."""
        super(ArcFace, self).build(input_shape[0])
        self.W = self.add_weight(name='W',
                                 shape=(input_shape[0][-1], self.n_classes),
                                 initializer='glorot_uniform',
                                 trainable=True,
                                 regularizer=self.regularizer)

    def call(self, inputs):
        """Return margin-adjusted softmax probabilities for ``[features, labels]``."""
        x, y = inputs
        # normalize feature
        x = tf.nn.l2_normalize(x, axis=1)
        # normalize weights
        W = tf.nn.l2_normalize(self.W, axis=0)
        # dot product of unit vectors == cosine of the angle to each class centre
        logits = x @ W
        # add margin
        # clip logits to prevent zero division when backward
        theta = tf.acos(K.clip(logits, -1.0 + K.epsilon(), 1.0 - K.epsilon()))
        target_logits = tf.cos(theta + self.m)
        # use the margin-adjusted value where y is 1, the plain cosine where y is 0
        logits = logits * (1 - y) + target_logits * y
        # feature re-scale
        logits *= self.s
        return tf.nn.softmax(logits)

    def compute_output_shape(self, input_shape):
        """One probability per class for every sample in the batch."""
        return (None, self.n_classes)

    def get_config(self):
        """Return the constructor arguments so the layer can be saved and reloaded."""
        config = super(ArcFace, self).get_config()
        config.update({
            'n_classes': self.n_classes,
            's': self.s,
            'm': self.m,
            'regularizer': regularizers.serialize(self.regularizer),
        })
        return config

In [None]:
IMG_SIZE = 256
n_classes = 11014

# Image branch
inp = layers.Input(shape=(IMG_SIZE, IMG_SIZE, 3))
# Label branch: ArcFace.call multiplies this tensor elementwise with the
# (batch, n_classes) logits, so it must carry one value per class
# (a scalar-per-sample `shape=()` input does not match that).
y = layers.Input(shape=(n_classes,))
# Backbone with global average pooling -> one embedding vector per image
x = keras.applications.EfficientNetB3(include_top=False, weights='../input/tfkerasefficientnetimagenetnotop/efficientnetb3_notop.h5', pooling='avg')(inp)
output = ArcFace(n_classes)([x, y])

model = keras.models.Model(inputs=[inp, y], outputs=[output])
keras.utils.plot_model(model, show_layer_names=True, show_shapes=True)

In [None]:
# Learning rate for Adam
lr = 0.0005

model.compile(
    loss='categorical_crossentropy',
    metrics=['categorical_accuracy', 'accuracy'],
    optimizer=keras.optimizers.Adam(learning_rate=lr),
)

# label_group is used as the label column by the training image generator;
# it is converted to strings before being handed over.
train['label_group'] = train.label_group.astype(str)

In [None]:
BATCH_SIZE = 16
EPOCHS = 6

# Model.fit accepts Python generators directly; fit_generator is deprecated.
# The generator is endless, so steps_per_epoch defines what one epoch is:
# as many full batches as fit into the training table.
history = model.fit(input_arcface(train, BATCH_SIZE),
                    epochs=EPOCHS,
                    steps_per_epoch=train.shape[0]//BATCH_SIZE)

In [None]:
# Embedding extractor: image input -> pooled backbone output (the tensor that
# feeds the ArcFace head). Built from the tensors created with the model
# rather than from a positional index into model.layers, which silently
# breaks if a layer is added or the layer ordering changes.
emb_model = keras.models.Model(inputs=inp, outputs=x)

In [None]:
# From here on `train` holds a copy of the *test* table: every later cell
# (embeddings, neighbours, submission) reads its rows from `train`.
train = test.copy(deep=True)
# Debug aid: duplicate the rows to exercise the pipeline on a larger frame
# train = pd.concat([train, train]).reset_index(drop=True)

In [None]:
# Compute an embedding for every image listed in `train`, `mul` images at a time.
embeddings = []
mul = 10  # images loaded and sent through the network per step
steps = len(train) // mul
if(len(train) % mul != 0):
    steps += 1
_ = gc.collect()
for i in tqdm(range(steps)):
    a = i*mul
    b = min((i+1)*mul, len(train))

    images = []
    for fname in train[a:b]['image']:
        img = cv2.imread(PATH + 'test_images/' + fname)
        if img is None:
            # cv2.imread returns None instead of raising; fail with the file name
            raise FileNotFoundError('Could not read image: ' + PATH + 'test_images/' + fname)
        # OpenCV loads BGR; convert to RGB and resize to the network input size.
        # Pixels are left in the raw 0-255 range (no rescaling is applied here).
        img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)
        img = cv2.resize(img, (IMG_SIZE, IMG_SIZE))
        images.append(img)

    # The input is an in-memory array, so the generator-only options
    # (use_multiprocessing / workers) are not passed.
    embeddings_t = emb_model.predict(np.array(images))
    embeddings.append(embeddings_t)

# Free the training model and the last batch's temporaries
del model, images, embeddings_t, img
_ = gc.collect()

In [None]:
# Stack the per-step embedding arrays row-wise into one matrix
embeddings_cat = np.concatenate(embeddings, axis=0)

# The list of chunks is no longer needed once the matrix exists
del embeddings
_ = gc.collect()
_

In [None]:
from cuml.neighbors import NearestNeighbors

In [None]:
# Look up at most 50 neighbours per image, but never more than there are rows
# to search: asking for more neighbours than fitted samples is an error, which
# the previous `len(test) <= 3` special case only avoided for 2-3 rows.
KNN = min(50, len(embeddings_cat))
model = NearestNeighbors(n_neighbors=KNN, metric='cosine')
model.fit(embeddings_cat)

In [None]:
# For every image collect the posting_ids of all neighbours whose cosine
# distance is below 0.1. The query is done CHUNK rows at a time.
preds = []
CHUNK = 1024*4

print('Finding similar images...')
n_rows = len(embeddings_cat)
CTS = n_rows // CHUNK + (1 if n_rows % CHUNK != 0 else 0)
for j in range(CTS):
    a = j*CHUNK
    b = min(a + CHUNK, n_rows)
    print('chunk',a,'to',b)
    distances, indices = model.kneighbors(embeddings_cat[a:b,])

    for row_dist, row_idx in zip(distances, indices):
        # positions (within the neighbour list) that are close enough
        close = np.where(row_dist < 0.1)[0]
        # map neighbour row numbers back to posting ids
        matched_ids = train.iloc[row_idx[close]].posting_id.values
        preds.append(matched_ids)

del model, distances, indices
_ = gc.collect()

In [None]:
# img_pred = []
# mul = 100

# steps = len(embeddings_cat) // mul
# if(len(embeddings_cat) % mul != 0):
#     steps += 1

# for i in tqdm(range(steps)):
#     a = i*mul
#     b = (i+1)*mul
#     b = min(b, len(embeddings_cat))
#     k = metrics.pairwise_distances(
#         embeddings_cat[a:b], embeddings_cat, metric='cosine')
#     k = 1-k
#     for p in range(b-a):
#         idx = np.where(k[p, ] > 0.9)[0]
#         tmp = train.iloc[idx].posting_id.values
#         img_pred.append(tmp)

In [None]:
# Store the image-based matches: one array of posting_ids per row, in the
# same row order as `train`.
train['img_preds'] = preds

# del embeddings_cat, k
# Run a garbage-collection pass; the return value is deliberately discarded.
_ = gc.collect()

In [None]:
# NOW TITLE COLUMN
train['title'] = train['title'].str.lower()
corpus = train['title']

cv = TfidfVectorizer(max_features=20_000) 
vectors = cv.fit_transform(corpus).toarray()

In [None]:
# For every title collect the posting_ids of all titles whose cosine
# similarity (1 - cosine distance) is above 0.7, `mul` query rows at a time.
title_preds = []
mul = 1000

n_titles = len(vectors)
steps = n_titles // mul + (1 if n_titles % mul != 0 else 0)

for i in tqdm(range(steps)):
    a = i*mul
    b = min(a + mul, n_titles)
    # distances of this slice against all titles, turned into similarities
    k = 1 - metrics.pairwise_distances(vectors[a:b], vectors, metric='cosine')

    for sims in k:
        idx = np.where(sims > 0.7)[0]
        title_preds.append(train.iloc[idx].posting_id.values)

In [None]:
# Store the title-based matches: one array of posting_ids per row.
train['title_preds'] = title_preds

In [None]:
# tmp = train.groupby('label_group').posting_id.agg('unique').to_dict()
# train['target'] = train.label_group.map(tmp)

# Rows sharing the same image_phash are treated as matches: build a
# phash -> array-of-posting_ids lookup and attach it to every row.
phash_to_ids = train.groupby('image_phash')['posting_id'].unique().to_dict()
train['oof'] = train['image_phash'].map(phash_to_ids)

In [None]:
# Final prediction per row = sorted union of the phash, image-embedding and
# title matches.
train['final_preds'] = train.apply(
    lambda row: np.union1d(row['title_preds'],
                           np.union1d(row['oof'], row['img_preds'])),
    axis=1)

In [None]:
# def getMetric(col):
#     def f1score(row):
#         n = len( np.intersect1d(row.target,row[col]) )
#         return 2*n / (len(row.target)+len(row[col]))
#     return f1score

In [None]:
# train['f1'] = train.apply(getMetric('oof'),axis=1)
# train['f1'].mean()

In [None]:
# train['f1'] = train.apply(getMetric('title_preds'),axis=1)
# train['f1'].mean()

In [None]:
# train['f1'] = train.apply(getMetric('img_preds'),axis=1)
# train['f1'].mean()

In [None]:
# train['f1'] = train.apply(getMetric('final_preds'),axis=1)
# train['f1'].mean()

In [None]:
# Submission format: posting_id plus a single space-separated string of matches
train['matches'] = train['final_preds'].map(' '.join)
submission = train[['posting_id', 'matches']]
submission.to_csv('submission.csv', index=False)

In [None]:
# Display the frame with all prediction columns attached (rich notebook output)
train

In [None]:
# Persist the full frame, including the per-row prediction columns, without the index.
# NOTE: despite the file name, `train` holds the test rows at this point.
train.to_csv('train.csv', index=False)