In [None]:
import os
# Set before TensorFlow is imported in the next cell, so the flag is already
# present in the environment when TensorFlow loads.
os.environ['TF_XLA_FLAGS'] = '--tf_xla_enable_xla_devices'

In [None]:
import pandas as pd
import math
import numpy as np
import cudf
from keras.models import Model

from keras.layers import Input
from keras.layers import Conv2D, GlobalAveragePooling2D,Dropout,Flatten
from keras.layers import MaxPooling2D,Dense
from keras.layers.merge import concatenate
from keras.utils import plot_model

import tensorflow as tf
from tensorflow import keras
from keras import layers, models
from keras.preprocessing.image import ImageDataGenerator

from sklearn.model_selection import train_test_split



from tqdm import tqdm  

from sklearn.utils import shuffle

import itertools

import random

In [None]:
# Fail fast: stop the notebook here when TensorFlow reports no GPU device.
gpu_devices = tf.config.experimental.list_physical_devices('GPU')
if not gpu_devices:
    raise Exception("Sorry, no GPU found")

In [None]:
# Metadata tables for the training and test postings.
train_df = pd.read_csv('../input/shopee-product-matching/train.csv')
test_df = pd.read_csv('../input/shopee-product-matching/test.csv')

# Row counts; they size the image buffer and the batch arithmetic below.
len_data_train, len_data_test = len(train_df), len(test_df)

BATCH_SIZE = 32

# Rounded up so that a trailing partial batch is still counted.
TRAIN_BATCHES, TEST_BATCHES = (
    math.ceil(row_count / BATCH_SIZE)
    for row_count in (len_data_train, len_data_test)
)

In [None]:
# FRONT_END switches on the optional plotting / inspection cells further down.
# It is True only when the test table has at most 3 rows. It must always be
# defined: later cells test `if FRONT_END:` unconditionally, so leaving it
# unset for larger test sets would raise NameError there.
FRONT_END = len_data_test <= 3

In [None]:
# Directories holding the image files referenced by the `image` column.
test_images = '../input/shopee-product-matching/test_images'
train_images = '../input/shopee-product-matching/train_images'

# Every image is resized to IMG_SIZE x IMG_SIZE before it is stored or predicted on.
IMG_SIZE = 32

# NOTE(review): looks like a learning rate, but no cell visible here reads it - confirm.
eta = 0.001

In [None]:
def myzip(s,t):
    """Pair up the elements of two indexable sequences position by position.

    Args:
        s: sequence that determines how many pairs are produced (``len(s)``).
        t: sequence indexed at the same positions as ``s``.

    Returns:
        A list of ``(s[i], t[i])`` tuples for ``i`` in ``range(len(s))``.

    Raises:
        IndexError: if ``t`` is shorter than ``s``.
    """
    pairs = []
    for position in range(len(s)):
        pairs.append((s[position], t[position]))
    return pairs

In [None]:
# Distinct raw label_group ids, sorted so that the id -> class index
# assignment does not depend on set iteration order.
labels = sorted(set(train_df.label_group.tolist()))
no_classes = len(labels)

# Parallel lists: label[i] is the raw id that receives class index mapped[i].
label = list(labels)
mapped = list(range(no_classes))
zipper = myzip(label, mapped)
reverse = myzip(mapped, label)
label_dict = dict(zipper)      # raw label_group id -> class index
reverse_dict = dict(reverse)   # class index -> raw label_group id

# Re-encode the whole column in one vectorised pass. The previous row-by-row
# loop reused the name `label` as its loop variable (overwriting the list
# above) and wrote through `.at[position, ...]`, which is only correct while
# the frame still has its default 0..n-1 index.
train_df['label_group'] = train_df.label_group.map(label_dict)

In [None]:
# Training targets: the encoded class index of every posting, as float32.
ys = train_df['label_group'].to_numpy().astype(np.float32)

# File names of the training and test images, in table order.
xs_train_image = train_df['image'].to_numpy()
xs_test_image = test_df['image'].to_numpy()

# Shuffle file names and targets together so they stay aligned.
xs_train_image, ys = shuffle(xs_train_image, ys)

In [None]:
def get_test_images(test_batch):
    """Load test images into one array, resized and divided by 255.

    Args:
        test_batch: a single file name (``str``) or a sequence of file names,
            each relative to the ``test_images`` directory.

    Returns:
        Array of shape ``(len(test_batch), IMG_SIZE, IMG_SIZE, 3)``; a single
        file name is treated as a batch of one.
    """
    file_names = [test_batch] if isinstance(test_batch, str) else test_batch
    keras_image = tf.keras.preprocessing.image

    batch = np.empty([len(file_names), IMG_SIZE, IMG_SIZE, 3])
    for slot, file_name in enumerate(file_names):
        pixels = keras_image.load_img(os.path.join(test_images, file_name))
        pixels = keras_image.img_to_array(pixels)
        pixels = keras_image.smart_resize(pixels, size=(IMG_SIZE, IMG_SIZE))
        # The extra leading axis of length 1 is dropped on assignment.
        batch[slot] = np.array([pixels]) / 255.0
    return batch

In [None]:
# Buffer for every resized training image, filled in place by get_xs().
# Allocated as float32 instead of NumPy's default float64: this halves the
# memory of the buffer, and (per the Keras docs - verify for the installed
# version) img_to_array already yields float32 and the generators' flow()
# casts its input to float32, so no precision is lost.
xs = np.empty([len_data_train, IMG_SIZE, IMG_SIZE, 3], dtype=np.float32)

In [None]:
def get_xs(xs_batch,batch):
    """Load one batch of training images into the module-level ``xs`` buffer.

    Images are resized to ``IMG_SIZE`` x ``IMG_SIZE`` and stored without
    dividing by 255.

    Args:
        xs_batch: file names of the batch, relative to ``train_images``.
        batch: zero-based batch number; image ``k`` of the batch is written to
            row ``BATCH_SIZE * batch + k`` of ``xs``.

    Returns:
        None. The result is the side effect on ``xs``.
    """
    keras_image = tf.keras.preprocessing.image
    first_row = BATCH_SIZE * batch
    for offset, file_name in enumerate(xs_batch):
        pixels = keras_image.load_img(os.path.join(train_images, file_name))
        pixels = keras_image.img_to_array(pixels)
        pixels = keras_image.smart_resize(pixels, size=(IMG_SIZE, IMG_SIZE))
        # The extra leading axis of length 1 is dropped on assignment.
        xs[first_row + offset] = np.array([pixels])

In [None]:
# Fill `xs` one batch at a time; tqdm only supplies the progress bar.
for batch in tqdm(range(TRAIN_BATCHES)):
    start = batch * BATCH_SIZE
    stop = min(start + BATCH_SIZE, len_data_train)
    get_xs(xs_train_image[start:stop], batch)

In [None]:
# Augmenting generator for the training pass.
train_datagen = ImageDataGenerator(
    # `xs` holds unscaled pixel values, so scaling happens here.
    rescale=1./255,
    # Random geometric perturbations.
    zoom_range=0.23,
    rotation_range=23,
    height_shift_range=0.23,
    width_shift_range=0.23,
    horizontal_flip=True,
)

In [None]:
# Generator used for the validation and evaluation passes. It has the same
# rescaling as train_datagen but wider augmentation ranges.
# NOTE(review): augmenting validation data makes the reported validation
# numbers vary from run to run - confirm this is intended.
test_datagen = ImageDataGenerator(
    rescale=1./255,
    zoom_range=0.29,
    rotation_range=29,
    height_shift_range=0.29,
    width_shift_range=0.29,
    horizontal_flip=True,
)

In [None]:
def divergence(layer_in, f1, f2_in, f2_out, f3_in, f3_out, f4_out):
    """Build a four-branch convolution block followed by 2x2 max pooling.

    All four branches read ``layer_in``. Their outputs are concatenated on
    the last axis and the result is down-sampled with a stride-2 max pool.

    Args:
        layer_in: input tensor of the block.
        f1: filters of the 1x1 branch.
        f2_in: filters of the 1x1 convolution feeding the 3x3 convolution.
        f2_out: filters of the 3x3 convolution.
        f3_in: filters of the 1x1 convolution feeding the 5x5 convolution.
        f3_out: filters of the 5x5 convolution.
        f4_out: filters of the 1x1 convolution applied after the pooling branch.

    Returns:
        The pooled, concatenated output tensor.
    """
    # Every convolution in the block shares these settings.
    relu_same = dict(padding='same', activation='relu')

    # Branch 1: 1x1 convolution only.
    branch_1x1 = Conv2D(f1, (1,1), **relu_same)(layer_in)

    # Branch 2: 1x1 convolution, then 3x3 convolution.
    branch_3x3 = Conv2D(f2_in, (1,1), **relu_same)(layer_in)
    branch_3x3 = Conv2D(f2_out, (3,3), **relu_same)(branch_3x3)

    # Branch 3: 1x1 convolution, then 5x5 convolution.
    branch_5x5 = Conv2D(f3_in, (1,1), **relu_same)(layer_in)
    branch_5x5 = Conv2D(f3_out, (5,5), **relu_same)(branch_5x5)

    # Branch 4: stride-1 max pooling (size preserved), then 1x1 convolution.
    branch_pool = MaxPooling2D((3,3), strides=(1,1), padding='same')(layer_in)
    branch_pool = Conv2D(f4_out, (1,1), **relu_same)(branch_pool)

    # Stack the branches on the last axis, then halve height and width.
    merged = concatenate([branch_1x1, branch_3x3, branch_5x5, branch_pool], axis=-1)
    return MaxPooling2D((2,2), strides=(2,2))(merged)

In [None]:
# Output block that ends the network.
def out_block(layer_in,classes):
    """Map a feature tensor to ``classes`` outputs per sample.

    Applies dropout (rate 0.31), flattens, and finishes with a Dense layer
    that has no activation.

    Args:
        layer_in: feature tensor produced by the preceding layers.
        classes: number of units in the final Dense layer.

    Returns:
        Output tensor of the final Dense layer.
    """
    dropped = Dropout(0.31)(layer_in)
    flattened = Flatten()(dropped)
    return Dense(classes)(flattened)

In [None]:
def create_model():
    """Assemble the network: input, two ``divergence`` blocks, output block.

    Returns:
        An uncompiled ``Model`` whose input has shape
        ``(IMG_SIZE, IMG_SIZE, 3)`` and whose output has ``no_classes`` units.
    """
    image_in = Input(shape=(IMG_SIZE, IMG_SIZE, 3))

    # The second block doubles every filter count of the first.
    features = divergence(image_in, f1=128, f2_in=64, f2_out=128,
                          f3_in=32, f3_out=64, f4_out=64)
    features = divergence(features, f1=256, f2_in=128, f2_out=256,
                          f3_in=64, f3_out=128, f4_out=128)

    outputs = out_block(features, no_classes)
    return Model(inputs=image_in, outputs=outputs)

In [None]:
model = create_model()

# The final Dense layer has no activation, so the loss is configured to
# receive raw logits; the targets are integer class indices.
sparse_ce = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)
model.compile(loss=sparse_ce, optimizer='adam', metrics=['accuracy'])

In [None]:
# Save a diagram of the layer graph, annotated with tensor shapes.
plot_model(model, to_file='divergence.png', show_shapes=True)

In [None]:
# steps_per_epoch must be a whole number of batches. The plain division used
# before produced a float whenever len(xs) is not a multiple of BATCH_SIZE;
# rounding up keeps the final partial batch in every epoch.
# NOTE(review): validation_data is drawn from the same (xs, ys) arrays as the
# training data, so val_accuracy is not a held-out estimate - consider a
# proper train/validation split.
history = model.fit(
    train_datagen.flow(xs, ys, batch_size=BATCH_SIZE),
    steps_per_epoch=math.ceil(len(xs) / BATCH_SIZE),
    epochs=45,
    validation_data=test_datagen.flow(xs, ys, batch_size=BATCH_SIZE),
)

In [None]:
# matplotlib is only used by the FRONT_END-guarded plotting cells below, so it
# is imported only when FRONT_END is truthy.
if FRONT_END:
    import matplotlib.pyplot as plt

In [None]:
if FRONT_END:
    # Training vs. validation accuracy per epoch, as recorded by model.fit().
    plt.plot(history.history['accuracy'])
    plt.plot(history.history['val_accuracy'])
    plt.title('model accuracy')
    plt.ylabel('accuracy')
    plt.xlabel('epoch')
    plt.legend(['train', 'test'], loc='upper left')
    plt.show()

    # Use the shared BATCH_SIZE constant rather than a second hard-coded 32,
    # so this pass cannot drift from the batch size used for training.
    # NOTE(review): this evaluates on the training arrays (xs, ys) passed
    # through test_datagen, not on held-out data.
    test_loss, test_acc = model.evaluate(test_datagen.flow(xs, ys, batch_size=BATCH_SIZE), verbose=2)

    print(test_acc)

In [None]:
def predict(xs_test):
    """Return the highest-scoring class index for every image in a batch.

    Args:
        xs_test: batch of images accepted by the module-level ``model``.

    Returns:
        1-D array with one class index per row of ``model.predict`` output.
    """
    scores = model.predict(xs_test)
    return np.argmax(scores, axis=1)

In [None]:
# Predict a class index for every test posting, unless the table already
# carries a label_group column.
# Images are loaded and predicted one BATCH_SIZE chunk at a time. The previous
# chained `.map` calls decoded every test image into a Series first and then
# called model.predict once per row.
if 'label_group' not in test_df:
    test_file_names = test_df.image.to_numpy()
    batch_predictions = [
        predict(get_test_images(test_file_names[start:start + BATCH_SIZE]))
        for start in range(0, len(test_file_names), BATCH_SIZE)
    ]
    # np.concatenate rejects an empty list, so an empty test table is handled
    # explicitly.
    test_df['label_group'] = (
        np.concatenate(batch_predictions)
        if batch_predictions
        else np.array([], dtype=np.int64)
    )

In [None]:
# match_post: for each row, the unique posting ids that share its label_group.
ids_by_label = test_df.groupby('label_group')['posting_id'].agg('unique').to_dict()
test_df['match_post'] = test_df['label_group'].map(ids_by_label)

# match_phash: for each row, the unique posting ids that share its image_phash.
ids_by_phash = test_df.groupby('image_phash')['posting_id'].agg('unique').to_dict()
test_df['match_phash'] = test_df['image_phash'].map(ids_by_phash)

test_df.head()

In [None]:
#https://www.kaggle.com/cdeotte
def getMetric(col):
    """Create a per-row scorer comparing ``match_post`` with column ``col``.

    Args:
        col: name of the row field holding the ids to compare against
            ``row.match_post``.

    Returns:
        A function ``f1score(row)`` returning
        ``2 * |intersection| / (len(row.match_post) + len(row[col]))``.
    """
    def f1score(row):
        reference, candidate = row.match_post, row[col]
        shared = np.intersect1d(reference, candidate)
        return 2*len(shared) / (len(reference)+len(candidate))
    return f1score

def combine_for_sub(row):
    """Merge a row's two match lists into one space-separated string.

    Args:
        row: object exposing ``match_post`` and ``match_phash`` id arrays.

    Returns:
        The sorted, de-duplicated ids of both arrays joined by single spaces.
    """
    merged_ids = np.unique(np.concatenate([row.match_post, row.match_phash]))
    return ' '.join(merged_ids)

In [None]:
if FRONT_END:
    # Row-wise overlap score between match_post and match_phash, then its mean.
    score_row = getMetric('match_phash')
    test_df['f1'] = test_df.apply(score_row, axis=1)
    print('CV score for baseline =',test_df['f1'].mean())

In [None]:
# One space-separated string of matching posting ids per row.
test_df['matches'] = test_df.apply(combine_for_sub, axis='columns')

In [None]:
# Write posting ids with their match strings, without the index column.
submission = test_df[['posting_id','matches']]
submission.to_csv('submission.csv',index=False)

In [None]:
# Inspection mode only: load every test image into one array and predict a
# class index for each, for the visualisation cell that follows.
if FRONT_END:
    xs_test = get_test_images(xs_test_image)
    predictions = predict(xs_test)

In [None]:
# Inspection mode only: for each test image, show up to 25 training images of
# its predicted class, then the test image itself.
if FRONT_END:
    for index, prediction in enumerate(predictions):
        # Select the file names with a boolean mask through .loc. The previous
        # code collected index *labels* and fed them to positional .iloc,
        # which is only correct while train_df has its default 0..n-1 index.
        same_class = train_df['label_group'] == prediction
        gallery = train_df.loc[same_class, 'image'].head(25)

        plt.figure(figsize=(10,10))
        for i, image in enumerate(gallery):
            path = os.path.join(train_images, image)
            img = tf.keras.preprocessing.image.load_img(path)

            plt.subplot(5,5,i+1)
            plt.xticks([])
            plt.yticks([])
            plt.grid(False)
            plt.imshow(img, cmap=plt.cm.binary)
            # Label each tile with the original (un-encoded) label_group id.
            plt.xlabel(reverse_dict[prediction])
        plt.show()

        # `predictions` is in test_df row order, so the row is fetched by
        # position.
        query = test_df.iloc[index]
        path = os.path.join(test_images, query.image)
        img = tf.keras.preprocessing.image.load_img(path)

        plt.figure(figsize=(32,32))
        plt.subplot(5,5,1)
        plt.xticks([])
        plt.yticks([])
        plt.grid(False)
        plt.imshow(img, cmap=plt.cm.binary)
        plt.xlabel(query.posting_id)
        plt.show()