In [None]:
import os

In [None]:
import math
from tqdm import tqdm  
from keras.preprocessing.text import Tokenizer

import numpy as np, pandas as pd
import cv2,matplotlib.pyplot as plt
import tensorflow as tf
from tensorflow.keras.applications.vgg16 import VGG16
from tensorflow.keras.applications.vgg16 import preprocess_input
from tensorflow.keras.preprocessing import image

In [None]:
# Load the train and test metadata tables of the Shopee product-matching data.
# No index column is given, so both frames get pandas' default 0..n-1 index.
train_df = pd.read_csv('../input/shopee-product-matching/train.csv')
test_df = pd.read_csv('../input/shopee-product-matching/test.csv')

In [None]:
# Number of distinct label_group values present in the training table.
no_classes = len(train_df.label_group.unique())

In [None]:
# Mini-batch size shared by the image generator and the test scoring loop.
BATCH_SIZE = 32
# Default side length (in pixels) of the square images fed to the network.
IMG_SIZE = 32

# Row counts of both tables, and how many BATCH_SIZE-sized batches cover each.
len_data_train, len_data_test = len(train_df), len(test_df)
TRAIN_BATCHES = math.ceil(len_data_train / BATCH_SIZE)
TEST_BATCHES = math.ceil(len_data_test / BATCH_SIZE)

In [None]:
class DataGenerator(tf.keras.utils.Sequence):
    """Keras ``Sequence`` that yields batches of preprocessed images.

    Each batch is a float32 array of shape
    ``(rows_in_batch, img_size, img_size, 3)``. Only inputs are produced (no
    targets), which is what ``model.predict`` needs.

    Parameters
    ----------
    df : pandas.DataFrame
        Table with an ``image`` column holding file names relative to ``path``.
    img_size : int
        Side length every image is resized to.
    batch_size : int
        Maximum number of rows per batch; the last batch may be shorter.
    path : str
        Prefix prepended to each file name (include the trailing separator).
    """
    def __init__(self, df, img_size=IMG_SIZE, batch_size=BATCH_SIZE, path=''):
        self.df = df
        self.img_size = img_size
        self.batch_size = batch_size
        self.path = path
        # Positional row numbers; batches are contiguous slices of this array.
        self.indexes = np.arange(len(self.df))

    def __len__(self):
        """Return the number of batches needed to cover every row once."""
        return math.ceil(len(self.df) / self.batch_size)

    def __getitem__(self, batch):
        """Return the image array for batch number ``batch`` (0-based)."""
        start = batch * self.batch_size
        stop = min(start + self.batch_size, len(self.df))
        return self.__data_generation(self.indexes[start:stop])

    def __data_generation(self, indexes):
        """Load, resize and preprocess the images at the given row positions."""
        side = self.img_size
        X = np.zeros((len(indexes), side, side, 3), dtype='float32')
        batch_df = self.df.iloc[indexes]
        for i, file_name in enumerate(batch_df['image']):
            # Resize to the configured side length; a fixed (32, 32) here would
            # not fit into X whenever img_size differs from 32.
            img = image.load_img(self.path + file_name, target_size=(side, side))
            X[i] = preprocess_input(image.img_to_array(img))
        return X

In [None]:
# Directory prefixes of the train and test image files; they are concatenated
# directly with the file name, so the trailing slash is required.
TRAIN = '../input/shopee-product-matching/train_images/'
TEST = '../input/shopee-product-matching/test_images/'

# Number of table rows embedded per model.predict call in get_embeds.
CHUNK_SIZE = 1024*4

In [None]:


def get_embeds(df,image_path):
    """Compute VGG16 (ImageNet, global-average-pooled) embeddings for ``df``.

    The frame is processed in chunks of ``CHUNK_SIZE`` rows; each chunk is fed
    to the network through a ``DataGenerator`` in batches of ``BATCH_SIZE``.

    Parameters
    ----------
    df : pandas.DataFrame
        Table with an ``image`` column of file names.
    image_path : str
        Directory prefix prepended to every file name.

    Returns
    -------
    numpy.ndarray
        One embedding row per row of ``df``, in the same order. An empty
        frame yields an array with zero rows.
    """
    WIMGN = '../input/vgg16imagenet/vgg16_weights_tf_dim_ordering_tf_kernels_notop.h5'

    model = VGG16(include_top=False, weights=WIMGN, input_shape=None,pooling='avg')

    embeds = []

    print('Computing image embeddings...')
    n_rows = len(df)

    for a in range(0, n_rows, CHUNK_SIZE):
        # Clamp to the length of *this* frame. Clamping to the train length
        # would drop rows (and then create empty chunks) whenever df is longer
        # than the training table.
        b = min(a + CHUNK_SIZE, n_rows)

        print('chunk',a,'to',b)

        image_gen = DataGenerator(df.iloc[a:b], batch_size=BATCH_SIZE, path=image_path)
        image_embeddings = model.predict(image_gen,verbose=1,use_multiprocessing=True, workers=4)
        embeds.append(image_embeddings)

    if not embeds:
        # np.concatenate rejects an empty list; return a zero-row array instead.
        return np.zeros((0, model.output_shape[-1]), dtype='float32')

    return np.concatenate(embeds)

In [None]:
# Image embeddings for the training table, one row per train_df row.
train_embeds = get_embeds(train_df,TRAIN)

In [None]:
# Image embeddings for the test table, one row per test_df row.
test_embeds = get_embeds(test_df,TEST)

In [None]:
# Stack the test embeddings below the train embeddings.
# NOTE(review): the combined array is stored back under the name train_embeds,
# so this cell is not idempotent; recompute both embedding cells before re-running it.
train_embeds = np.concatenate((train_embeds,test_embeds), axis=0)

In [None]:
# Drop the separate test array; its rows now sit at the end of train_embeds.
del test_embeds

In [None]:
from keras.preprocessing.text import text_to_word_sequence

In [None]:
def get_words():
    """Load the word list file and return its unique tokens as a NumPy array.

    The tokens are sorted so that the position of every word is the same on
    every run. Iterating a ``set`` of strings directly gives an order that
    changes between interpreter sessions (string hash randomization), which
    would make every index-based lookup into this array irreproducible.

    Returns
    -------
    numpy.ndarray
        1-D array of distinct lower-cased tokens in ascending order.
    """
    with open('../input/infochimps/words_alpha.txt', 'r') as file:
        data = file.read().replace('\n', ' ')
    vocabulary = np.array(sorted(set(text_to_word_sequence(data))))
    return vocabulary

In [None]:
# Vocabulary array that word_embed indexes into to turn bin numbers into word tokens.
words = get_words()

In [None]:
def double(x):
    """Return ``x`` added to itself (works for anything supporting ``+``)."""
    doubled = x + x
    return doubled

In [None]:
def _bin_edges(low, high, step):
    """Return bin edges ``low, low+step, ...`` that do not exceed ``high``."""
    edges = []
    edge = low
    while edge <= high:
        edges.append(edge)
        following = edge + step
        # A zero (constant feature) or vanishing step never moves the edge
        # forward; stop instead of looping forever.
        if not following > edge:
            break
        edge = following
    return edges


def word_embed(image_embeddings, vocabulary=None, n_train=None):
    """Translate every embedding row into a sequence of vocabulary words.

    Each feature (column) is cut into bins of width ``2 * std`` starting at the
    column minimum. The bin number of a value, shifted by a per-feature offset,
    is used as an index into ``vocabulary``, so different features draw from
    disjoint ranges of words.

    Parameters
    ----------
    image_embeddings : numpy.ndarray
        2-D array ``(rows, features)``; train rows first, then test rows.
    vocabulary : indexable of str, optional
        Word lookup table. Defaults to the module-level ``words`` array.
    n_train : int, optional
        Number of leading rows that belong to the train set. Defaults to the
        module-level ``len_data_train``.

    Returns
    -------
    (dict, dict)
        ``train_words`` and ``test_words``; each maps a 0-based row position
        within its split to the array of words for that row.

    Raises
    ------
    IndexError
        If the vocabulary has fewer entries than the bins require.
    """
    if vocabulary is None:
        vocabulary = words
    if n_train is None:
        n_train = len_data_train

    highs = np.array(np.max(image_embeddings, axis=0))
    lows = np.array(np.min(image_embeddings, axis=0))
    spread = np.array(np.std(image_embeddings, axis=0))
    steps = spread + spread

    n_features = len(image_embeddings.T)

    columns = []
    offset = 0
    for feature in range(n_features):
        edges = _bin_edges(lows[feature], highs[feature], steps[feature])
        codes = np.digitize(image_embeddings[:, feature], edges)
        columns.append(np.array([vocabulary[int(code + offset)] for code in codes]))
        # Reserve this feature's index range (0..max code) before moving on.
        offset += int(codes.max()) + 1

    # (features, rows) -> (rows, features)
    word_matrix = np.array(columns, dtype=object).T
    del columns

    train_rows = word_matrix[:n_train, :]
    test_rows = word_matrix[n_train:, :]
    del word_matrix

    train_words = dict(enumerate(train_rows))
    test_words = dict(enumerate(test_rows))

    return train_words,test_words

In [None]:
# train_embeds holds train rows followed by test rows; word_embed splits the
# result at len_data_train into a train dict and a test dict.
train_words,test_words = word_embed(train_embeds)

In [None]:
# Attach each row's word tokens. train_words is keyed 0..n-1, so this relies on
# train_df still having its default 0..n-1 index.
train_df['image_net']=train_df.index.map(train_words)

In [None]:
# The tokens now live in train_df['image_net']; drop the dict reference.
del train_words

In [None]:
# Attach each row's word tokens. test_words is keyed 0..n-1, so this relies on
# test_df still having its default 0..n-1 index.
test_df['image_net']=test_df.index.map(test_words)

In [None]:

# The tokens now live in test_df['image_net']; drop the dict reference.
del test_words

In [None]:
def unpack(parte):
    """Return the second element of every 2-item pair in ``parte``, in order."""
    seconds = []
    for _first, second in parte:
        seconds.append(second)
    return seconds

In [None]:
def transformer(dictd,col1,col2):
    """Convert a ``{key: value}`` mapping into a DataFrame indexed by key.

    Keys that are purely numeric strings or only one character long are
    dropped. The remaining ``[value, key]`` rows are sorted in descending order
    (by value, then by key) before the frame is built.

    Parameters
    ----------
    dictd : dict
        Mapping with string keys.
    col1 : str
        Column name for the values.
    col2 : str
        Column name for the keys; this column becomes the index.

    Returns
    -------
    pandas.DataFrame
        Frame indexed by ``col2`` with a single ``col1`` column. For
        compatibility with existing calls, a column literally named
        ``'words'`` is still used as the index whenever one is present.
    """
    rows = [
        [value, key]
        for key, value in dictd.items()
        if not key.isnumeric() and len(key) > 1
    ]
    rows.sort(reverse=True)
    frame = pd.DataFrame(rows, columns=[col1, col2])
    # Setting the index on a fixed 'words' name raised KeyError for any other
    # column names; fall back to the key column the caller asked for.
    index_col = 'words' if 'words' in (col1, col2) else col2
    return frame.set_index(index_col)

In [None]:
def candy_man(batch_tokens,batch,y_hats,refs):
    """Score each token sequence of a batch against every reference sequence.

    For one query sequence and one reference, the words of the query are
    visited in order and each is searched for in the reference from its start.
    A hit adds one point. Scoring of that reference ends as soon as a search
    runs through the whole reference (no hit, or a hit on its last element).
    References with at least one point are ranked by ``(points, position)`` in
    descending order.

    Parameters
    ----------
    batch_tokens : iterable of sequences
        Query token sequences of the current batch.
    batch : int
        0-based batch number, used to compute the output key.
    y_hats : dict
        Filled in place: key ``BATCH_SIZE * batch + offset`` maps to a list of
        ``[points, label_group]`` pairs, best match first.
    refs : iterable of sequences
        Reference token sequences.

    NOTE(review): labels are read from the global ``train_df`` using positions
    within ``refs`` as row labels -- confirm ``refs`` lines up with ``train_df``.
    """
    for offset, query in enumerate(batch_tokens):
        points_by_ref = {}
        for ref_pos, reference in enumerate(refs):
            points = 0
            for word in query:
                scanned = 0
                for candidate in reference:
                    scanned += 1
                    if word == candidate:
                        points += 1
                        break
                # The search consumed the whole reference: stop scoring it.
                if scanned >= len(reference):
                    break

            if points > 0:
                points_by_ref[ref_pos] = points

        ranked = [[points, ref_pos] for ref_pos, points in points_by_ref.items()]
        ranked.sort(reverse=True)

        labelled = []
        for points, ref_pos in ranked:
            labelled.append([points, train_df.loc[ref_pos,].label_group])
        y_hats[BATCH_SIZE * batch + offset] = labelled

In [None]:
def anotate_y_hat(sampler):
    """Return the label of the best-ranked pair, or ``''`` when there is none.

    ``sampler`` is a list of 2-item pairs; the second item of the first pair is
    returned.
    """
    if len(sampler) != 0:
        labels = [label for _weight, label in sampler]
        return labels[0]
    return ''

In [None]:
def prime_test():
    """Score all test rows batch by batch and return the collected results.

    Every batch of ``test_df.image_net`` is passed to ``candy_man`` with the
    full ``test_df.image_net`` column as reference set.

    Returns
    -------
    dict
        The ``y_hats`` mapping filled by ``candy_man``.
    """
    y_hats = {}
    reference_tokens = test_df.image_net

    for batch in range(TEST_BATCHES):
        start = batch * BATCH_SIZE
        stop = min(BATCH_SIZE * batch + BATCH_SIZE, len_data_test)
        candy_man(test_df.image_net[start:stop], batch, y_hats, reference_tokens)

    return y_hats

In [None]:
def predict_test(y_hats):
    """Store the raw scores and the chosen label for every test row.

    Adds two columns to the global ``test_df``: ``y_hat_weights`` (the entry of
    ``y_hats`` for the row's index value) and ``y_hat_labels`` (the result of
    ``anotate_y_hat`` on that entry). Returns ``None``.
    """
    weights = test_df.index.map(y_hats)
    test_df['y_hat_weights'] = weights
    test_df['y_hat_labels'] = test_df['y_hat_weights'].map(anotate_y_hat)
    return None

In [None]:
def my_concatenate(array1,array2):
    """Return the distinct values of both inputs, sorted and space-separated."""
    flattened = np.concatenate((array1, array2), axis=None)
    distinct = np.unique(flattened)
    return ' '.join(distinct)

In [None]:
def submit_test():
    """Build the ``matches`` column of the global ``test_df``.

    Steps:
    1. Rows whose ``y_hat_labels`` is the empty string get a distinct
       placeholder label (``-100000``, ``-99999``, ...) so each of them forms
       its own group.
    2. ``y_hat_postings``: all posting ids sharing the row's label.
    3. ``duplicates``: all posting ids sharing the row's ``image_phash``.
    4. ``matches``: the sorted union of both id sets, joined by spaces.
    """
    unlabeled = test_df.index[test_df['y_hat_labels'] == ""]
    for offset, row_label in enumerate(unlabeled):
        test_df.loc[row_label, 'y_hat_labels'] = -100000 + offset

    ids_by_label = test_df.groupby('y_hat_labels')['posting_id'].agg('unique').to_dict()
    test_df['y_hat_postings'] = test_df['y_hat_labels'].map(ids_by_label)

    ids_by_phash = test_df.groupby('image_phash')['posting_id'].agg('unique').to_dict()
    test_df['duplicates'] = test_df['image_phash'].map(ids_by_phash)

    pairs = test_df[['duplicates','y_hat_postings']].values
    test_df['matches'] = [my_concatenate(pair[0], pair[1]) for pair in pairs]

In [None]:
def run_test():
    """Run the test pipeline: score rows, pick labels, build ``matches``."""
    predict_test(prime_test())
    submit_test()

In [None]:
# Execute the whole test pipeline; it adds the result columns (including 'matches') to test_df.
run_test()

In [None]:
# Write the submission file: one row per posting with its space-separated matches.
test_df[['posting_id','matches']].to_csv('submission.csv',index=False)