# 0. Introduction
Welcome to the competition 'Shopee - Price Match Guarantee'!  
Also, welcome to this source code.  
This notebook provides concise code that lets users produce a submission as simply as possible.  
Try this source code and upvote if you like it!  
Have a nice day, and good luck to you.

# 1. Preparation
In this section, we import the required Python packages and define a few custom helper functions.

In [None]:
import os, glob, re
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from skimage.transform import resize

In [None]:
def sorted_list(path):
    """Return the paths matching the glob pattern ``path``, sorted ascending."""
    return sorted(glob.glob(path))

def read_csv(path, head=5):
    """Load a CSV file into a DataFrame and print its shape.

    Args:
        path: Location of the CSV file.
        head: Accepted by the signature, but the body never reads it.

    Returns:
        The loaded ``pandas.DataFrame``.
    """
    frame = pd.read_csv(path)
    print("Shape:", frame.shape)
    return frame

# 2. Exploration of Given Dataset
Section 2 contains a simple EDA of the training and test data for this competition.
1. CSV exploration
2. Training set exploration  
    2.1. Image presentation  
    2.2. Word extraction  
    2.3. Statistics of unique group  

## Show CSV

In [None]:
# Sorted paths of every entry directly under the competition input directory.
list_in = sorted_list(os.path.join('../input/shopee-product-matching', '*'))
list_in

In [None]:
# Sorted paths of the entries under train_images; display the first ten.
list_tr = sorted_list(os.path.join('../input/shopee-product-matching/train_images', '*'))
list_tr[:10]

In [None]:
# Sorted paths of the entries under test_images; display all of them.
list_te = sorted_list(os.path.join('../input/shopee-product-matching/test_images', '*'))
list_te

In [None]:
# Load train.csv (read_csv also prints its shape) and display the frame.
df_tr = read_csv('../input/shopee-product-matching/train.csv')
df_tr

In [None]:
# Load test.csv (read_csv also prints its shape) and display the frame.
df_te = read_csv('../input/shopee-product-matching/test.csv')
df_te

In [None]:
# Load sample_submission.csv (read_csv also prints its shape) and display the frame.
df_sb = read_csv('../input/shopee-product-matching/sample_submission.csv')
df_sb

## Training Set
1. Identify the unique groups using the 'label_group' column of df_tr (training data).  
2. Present the rows of one of the unique groups as an example.

In [None]:
# Distinct label groups of the training set, in ascending order.
uniq_tr = sorted(set(df_tr['label_group']))
print(len(uniq_tr))
print(uniq_tr[:10])

# Rows of the last group in that ordering; displayed here as an example.
tmp_group = df_tr.loc[df_tr['label_group'] == uniq_tr[-1]]
tmp_group

Images of the training set.  
The images are stored in varying sizes.  
This code block shows each image, a resized copy, and the histogram of every color channel.

In [None]:
# For every posting in tmp_group draw one figure: the original image, a
# 128 x 128 resized copy, and one histogram per colour channel.
for name_image, name_group in zip(tmp_group['image'], tmp_group['label_group']):
    tmp_path = os.path.join('../input/shopee-product-matching/train_images', name_image)
    tmp_image = plt.imread(tmp_path)

    plt.figure(figsize=(15, 10))

    # Top row, first panel: the image as stored on disk.
    plt.subplot(2, 3, 1)
    h, w, c = tmp_image.shape
    plt.title("Group: %s (%d x %d x %d)" %(name_group, h, w, c))
    plt.imshow(tmp_image)

    # Top row, second panel: the same image resized to 128 x 128.
    plt.subplot(2, 3, 2)
    tmp_image_r = resize(tmp_image, (128, 128))
    h, w, c = tmp_image_r.shape
    plt.title("Resized (%d x %d x %d)" %(h, w, c))
    plt.imshow(tmp_image_r)

    # Bottom row: channel 0, 1, 2 go to panels 4, 5, 6.
    for idx_channel, (name_title, name_color) in enumerate((
            ("Histogram-Red", '#ff0000'),
            ("Histogram-Green", '#00ff00'),
            ("Histogram-Blue", '#0000ff'))):
        plt.subplot(2, 3, 4 + idx_channel)
        plt.title(name_title)
        plt.hist(tmp_image[:, :, idx_channel].reshape(-1), color=name_color)

    plt.tight_layout()
    plt.show()

Extract the words from the title column of df_tr (the training set).  
The following code block also aggregates the frequency of each word extracted from the titles.

In [None]:
def get_words(df_group):
    """Count how often each word occurs in the titles of ``df_group``.

    Every title is lower-cased and split into runs of word characters.
    Entries that are not strings (e.g. NaN for a missing title) are skipped
    instead of raising ``AttributeError`` on ``.lower()``.

    Args:
        df_group: DataFrame with a 'title' column.

    Returns:
        dict mapping word -> occurrence count, ordered from most to least
        frequent; words with equal counts keep their first-seen order.
    """
    from collections import Counter  # local import: the cell stays self-contained

    counts = Counter()
    for title in df_group['title']:
        if isinstance(title, str):
            counts.update(re.findall(r'\w+', title.lower()))

    # most_common() sorts by count, descending, and is stable for ties.
    return dict(counts.most_common())

def get_keys(dictionary, sort=False):
    """Return the keys of ``dictionary`` as a new list.

    With ``sort`` truthy the keys are sorted ascending; otherwise they keep
    the dictionary's own iteration order.
    """
    return sorted(dictionary) if sort else list(dictionary)

def show_dict(dictionary):
    """Print each entry of ``dictionary`` as ``key: value`` with five decimals."""
    for key in get_keys(dictionary):
        value = dictionary[key]
        print("%s: %.5f" % (key, value))

In [None]:
# Word counts over the titles of tmp_group, printed one word per line.
dict_word = get_words(tmp_group)
show_dict(dict_word)

Aggregation over every group.  
The aggregated information is as follows.  
* NUM_GROUP: number of samples in each unique label group  
* HEGHIT: height for each image
* WIDTH: width for each image
* CHANNEL: number of the channel for each image

In [None]:
# Gather the sample count of every label group and the height / width /
# channel count of every training image, then print min / max / average and
# draw a histogram for each of the four quantities.
dict_inform = {
    'NUM_GROUP': [],
    'HEGHIT': [],
    'WIDTH': [],
    'CHANNEL': [],
}

# Group df_tr once; get_group() then avoids a full boolean-mask scan of the
# frame for every label group.
groups_tr = df_tr.groupby('label_group')
for uniq_id in uniq_tr:
    tmp_group = groups_tr.get_group(uniq_id)
    dict_inform['NUM_GROUP'].append(tmp_group.shape[0])

    for name_image in tmp_group['image']:
        tmp_path = os.path.join('../input/shopee-product-matching/train_images', name_image)
        tmp_image = plt.imread(tmp_path)
        # A grayscale file is read as a 2-D array, so a three-way unpack of
        # .shape would raise; count such an image as having one channel.
        h, w = tmp_image.shape[:2]
        c = tmp_image.shape[2] if tmp_image.ndim == 3 else 1
        dict_inform['HEGHIT'].append(h)
        dict_inform['WIDTH'].append(w)
        dict_inform['CHANNEL'].append(c)

for name_key, list_value in dict_inform.items():
    print("* %s" %(name_key))
    print("MIN:", np.min(list_value))
    print("MAX:", np.max(list_value))
    print("AVG:", np.average(list_value))

    plt.figure(figsize=(8, 4))
    plt.title("Histogram: %s" %(name_key))
    plt.hist(list_value)
    plt.show()

# 3. Training (Word Embedding)

In [None]:
# For every label group build a dictionary word -> relative frequency, i.e.
# the word's count divided by the total word count of the group's titles.
dict_uniq = {}

# Group df_tr once instead of masking the whole frame for every label group.
groups_tr = df_tr.groupby('label_group')
for uniq_id in uniq_tr:
    tmp_group = groups_tr.get_group(uniq_id)
    dict_word = get_words(tmp_group)
    val_sum = np.sum(list(dict_word.values()))
    # The comprehension keeps get_words' most-frequent-first ordering; for a
    # group without any word it is empty, so no division by zero happens.
    dict_word = {
        name_key: float(val_count / val_sum)
        for name_key, val_count in dict_word.items()
    }

    dict_uniq[uniq_id] = dict_word

In [None]:
# Sorted label-group ids; print the word frequencies of the first four groups.
list_uniq = get_keys(dict_uniq, sort=True)
for name_uniq in list_uniq[:4]:
    print("\n* ID: %s" %(name_uniq))
    show_dict(dict_uniq[name_uniq])

# 4. Test

In [None]:
# For every test posting build a dictionary word -> relative frequency of the
# word within the posting's title, and print it.
dict_te = {}

# Group df_te once instead of masking the whole frame for every posting.
groups_te = df_te.groupby('posting_id')
for name_id in df_te['posting_id']:
    dict_word = get_words(groups_te.get_group(name_id))

    print("\n* ID: %s" %(name_id))
    val_sum = np.sum(list(dict_word.values()))
    # Empty for a title without any word, so no division by zero happens.
    dict_word = {
        name_key: float(val_count / val_sum)
        for name_key, val_count in dict_word.items()
    }

    dict_te[name_id] = dict_word
    show_dict(dict_te[name_id])

# 5. Make Submission

In [None]:
# Display the submission frame as loaded from sample_submission.csv.
df_sb

In [None]:
# Score every test posting against every training label group, show the test
# image next to the images of the best-scoring group(s), and write the
# postings of those groups into df_sb['matches'].
list_te = get_keys(dict_te, sort=True)
for name_te in list_te:
    words_te = dict_te[name_te]

    # Score of a group = sum over the words shared with the test title of
    # (frequency in the group) * (frequency in the test title).
    # One dict lookup per test word replaces comparing every test word with
    # every training word.
    dict_score = {}
    for name_uniq in list_uniq:
        words_tr = dict_uniq[name_uniq]
        val_score = 0
        for name_word, val_te in words_te.items():
            val_tr = words_tr.get(name_word)
            if val_tr is not None:
                val_score += val_tr * val_te
        dict_score[name_uniq] = val_score

    max_score = max(dict_score.values(), default=0)

    group_te = df_te[df_te['posting_id'] == name_te]
    path_te = os.path.join('../input/shopee-product-matching/test_images', group_te['image'].iloc[0])
    image_te = plt.imread(path_te)

    plt.title("Test")
    plt.imshow(image_te)
    plt.show()
    plt.close()

    list_match = []
    # A best score of 0 means the title shares no word with any group. Without
    # this guard every group would tie at 0 and all training postings would be
    # reported (and plotted) as matches.
    if max_score > 0:
        for name_score in get_keys(dict_score, sort=True):
            if dict_score[name_score] != max_score:
                continue

            group_tr = df_tr[df_tr['label_group'] == name_score]
            for id_tr, name_image in zip(group_tr['posting_id'], group_tr['image']):
                list_match.append(str(id_tr))

                path_tr = os.path.join('../input/shopee-product-matching/train_images', name_image)
                image_tr = plt.imread(path_tr)

                plt.title("Train: %s" %(name_image))
                plt.imshow(image_tr)
                plt.show()
                plt.close()

    if not list_match:
        # NOTE(review): falls back to the posting itself so the row is never
        # empty -- assumes a posting may be listed as its own match; confirm
        # against the competition's submission rules.
        list_match.append(str(name_te))

    # Space-separated ids, joined once instead of growing a string in the loop.
    matches = " ".join(list_match)

    print("Test ID: %s\nMached ID: %s" %(name_te, matches))
    df_sb.loc[df_sb['posting_id'] == name_te, 'matches'] = matches


In [None]:
# Display the submission frame after the matching loop has written 'matches'.
df_sb

In [None]:
# Write the submission file; index=False leaves the row index out of the CSV.
df_sb.to_csv('submission.csv', index=False)