# Table of Contents
* [Preparations and first glance](#1)
* [Explore training data table](#2)
* [Images with same label group - Examples](#3)
* [Compare color distributions](#4)

In [None]:
# packages

# standard
import numpy as np
import pandas as pd
import random

# plots
from matplotlib import pyplot as plt
from mpl_toolkits.mplot3d import Axes3D 
import seaborn as sns
import plotly.express as px

# image specific
from PIL import Image
import imagehash

# file search
import glob

<a id='1'></a>
# Preparations and first glance

In [None]:
# collect the paths of all training image files in a list
# NOTE: glob returns the matches in arbitrary (file-system dependent) order
train_images_path = '../input/shopee-product-matching/train_images/'
train_files_pattern = train_images_path + '*'
train_files = glob.glob(train_files_pattern)
print('Number of training images:', len(train_files))

In [None]:
# function for displaying an image via filename (incl. path)
# in addition we also calculate a hash value
def show_image(filename):
    """Display the image stored at ``filename`` and return its difference hash.

    Parameters
    ----------
    filename : str
        Path of the image file (including the directory).

    Returns
    -------
    tuple
        ``(im_hash_bits, im_hash_hex, im_hash_dec)``: the hash object returned
        by ``imagehash.dhash``, its string form (a hexadecimal string) and
        that hexadecimal string parsed as an integer.
    """
    # Image.open keeps the file open; the context manager releases the
    # handle again as soon as plotting and hashing are done.
    with Image.open(filename) as img:
        # show image
        plt.imshow(img)
        plt.show()

        # calc custom hash value (perceptual hash is already in the data);
        # other hash functions such as imagehash.phash or
        # imagehash.colorhash could be used here instead
        im_hash_bits = imagehash.dhash(img)

    im_hash_hex = str(im_hash_bits)  # hash object => hex string
    im_hash_dec = int(im_hash_hex, 16)  # hex string => integer
    # return different versions of the hash
    return im_hash_bits, im_hash_hex, im_hash_dec

In [None]:
# demo of function call; as the last expression of the cell, the returned
# tuple (hash object, hex string, decimal value) is shown below the image
show_image(train_files[42])

#### We can see the different return values: the hash object (shown as an array of bits), the corresponding hexadecimal string and the corresponding decimal representation.

In [None]:
# show the first ten training images and collect their custom hashes
# (hash object and hex string; the decimal value is not needed here)
my_hash_vals_bit, my_hash_vals_hex = [], []
for idx in range(10):
    print('Image {}:'.format(idx))
    hash_obj, hash_hex, _ = show_image(train_files[idx])
    my_hash_vals_bit.append(hash_obj)
    my_hash_vals_hex.append(hash_hex)

In [None]:
# show the custom hash values (hex strings) collected for the images above
print(my_hash_vals_hex)

In [None]:
# measure a few distances between selected image pairs, based on the hash
# objects: number of differing bits divided by the number of bits (64)
for left, right in [(0, 1), (3, 4), (3, 6), (4, 6), (7, 8), (8, 9)]:
    rel_distance = (my_hash_vals_bit[left] - my_hash_vals_bit[right]) / 64
    print('Diff {}/{}:'.format(left, right), rel_distance)

<a id='2'></a>
# Explore training data table

### Let's have a look at the structured training data

In [None]:
# read the structured training data (CSV file) into a data frame
train_csv_path = '../input/shopee-product-matching/train.csv'
df = pd.read_csv(train_csv_path)

In [None]:
# first rows of the training data table
df.head()

In [None]:
# structure of the data frame (summary printed by pandas)
df.info()

#### Images can occur multiple times in the table:

In [None]:
# how often does each image filename occur in the table?
df['image'].value_counts()

#### Hashes can occur multiple times in the table:

In [None]:
# how often does each perceptual hash occur in the table?
df['image_phash'].value_counts()

#### Check one exemplary group with a high frequency of the phash:

In [None]:
# all rows sharing one particular perceptual hash, listed by image filename
phash_mask = df['image_phash'] == 'e992966d4ba49761'
df_multi_hash = df[phash_mask]
df_multi_hash.sort_values(by='image')

In [None]:
# compare the number of rows with the number of distinct image files
n_rows = len(df_multi_hash)
print('Number of rows :', n_rows)
n_unique_images = df_multi_hash['image'].nunique()
print('Unique images  :', n_unique_images)

#### => We have 13 images here having the same hash value (and the same label).

In [None]:
# display every distinct image that carries this perceptual hash
my_images = df_multi_hash['image'].unique().tolist()  # filenames w/o duplicates
for image_name in my_images:
    print('Image ' + image_name + ':')
    _ = show_image(train_images_path + image_name)

#### Label groups:

In [None]:
# how often does each label group occur in the table?
df['label_group'].value_counts()

#### Each label group occurs at least twice.

<a id='3'></a>
# Images with same label group - Examples

### Example 1

In [None]:
# rows of one label group, ordered by perceptual hash
demo_1_mask = df['label_group'] == 3915137548
df_demo_1 = df[demo_1_mask].sort_values(by='image_phash').reset_index(drop=True)
display(df_demo_1)

# show each image of the group and collect its custom hash
# (hash object and hex string)
my_hash_vals_bit, my_hash_vals_hex = [], []
for image_name in df_demo_1['image']:
    hash_obj, hash_hex, _ = show_image(train_images_path + image_name)
    print('Hash Value:', hash_hex)
    my_hash_vals_bit.append(hash_obj)
    my_hash_vals_hex.append(hash_hex)

In [None]:
# custom hash values (hex strings) of the images shown in the previous cell
my_hash_vals_hex

In [None]:
# measure distances between the first three images of the group, based on
# the hash objects: number of differing bits divided by the number of bits (64)
for left, right in [(0, 1), (0, 2), (1, 2)]:
    rel_distance = (my_hash_vals_bit[left] - my_hash_vals_bit[right]) / 64
    print('Diff {}/{}:'.format(left, right), rel_distance)

### Example 2

In [None]:
# rows of one label group, ordered by perceptual hash
demo_2_mask = df['label_group'] == 3661848281
df_demo_2 = df[demo_2_mask].sort_values(by='image_phash').reset_index(drop=True)
display(df_demo_2)

# show each image of the group and collect its custom hash
# (hash object and hex string)
my_hash_vals_bit, my_hash_vals_hex = [], []
for image_name in df_demo_2['image']:
    hash_obj, hash_hex, _ = show_image(train_images_path + image_name)
    print('Hash Value:', hash_hex)
    my_hash_vals_bit.append(hash_obj)
    my_hash_vals_hex.append(hash_hex)

In [None]:
# custom hash values (hex strings) of the images shown in the previous cell
my_hash_vals_hex

In [None]:
# measure distances between the first three images of the group, based on
# the hash objects: number of differing bits divided by the number of bits (64)
for left, right in [(0, 1), (0, 2), (1, 2)]:
    rel_distance = (my_hash_vals_bit[left] - my_hash_vals_bit[right]) / 64
    print('Diff {}/{}:'.format(left, right), rel_distance)

### Example 3

In [None]:
# here is an example with lots of rather different images:
# rows of one label group, ordered by perceptual hash
demo_3_mask = df['label_group'] == 645628
df_demo_3 = df[demo_3_mask].sort_values(by='image_phash').reset_index(drop=True)
display(df_demo_3)

# show each image of the group and collect its custom hash
# (hash object and hex string)
my_hash_vals_bit, my_hash_vals_hex = [], []
for image_name in df_demo_3['image']:
    hash_obj, hash_hex, _ = show_image(train_images_path + image_name)
    print('Hash Value:', hash_hex)
    my_hash_vals_bit.append(hash_obj)
    my_hash_vals_hex.append(hash_hex)

In [None]:
# custom hash values (hex strings) of the images shown in the previous cell
my_hash_vals_hex

<a id='4'></a>
# Compare color distributions

In [None]:
# for every image of the label group: show the picture next to the
# histograms of its red, green and blue channel
for image_name in df_demo_1.image:
    # the context manager closes the image file again; converting to RGB
    # guarantees three colour channels even for grayscale / palette / RGBA
    # files, so the channel indexing below cannot fail
    with Image.open(train_images_path + image_name) as pil_img:
        # convert image to numeric array (3D: height x width x channel)
        img = np.asarray(pil_img.convert('RGB'))

    # extract RGB components
    img_R = img[:, :, 0]
    img_G = img[:, :, 1]
    img_B = img[:, :, 2]

    # the figure gets its own name so that the loop variable is not overwritten
    fig, (ax1, ax2, ax3, ax4) = plt.subplots(1, 4, figsize=(16, 3))

    ax1.imshow(img)

    ax2.hist(img_R.flatten(), bins=16, density=True, color='red')
    ax2.set_title('Histogram - Red')

    ax3.hist(img_G.flatten(), bins=16, density=True, color='green')
    ax3.set_title('Histogram - Green')

    ax4.hist(img_B.flatten(), bins=16, density=True, color='blue')
    ax4.set_title('Hist - Blue')
    plt.show()

In [None]:
# for every image of the label group: show the picture next to the
# histograms of its red, green and blue channel
for image_name in df_demo_2.image:
    # the context manager closes the image file again; converting to RGB
    # guarantees three colour channels even for grayscale / palette / RGBA
    # files, so the channel indexing below cannot fail
    with Image.open(train_images_path + image_name) as pil_img:
        # convert image to numeric array (3D: height x width x channel)
        img = np.asarray(pil_img.convert('RGB'))

    # extract RGB components
    img_R = img[:, :, 0]
    img_G = img[:, :, 1]
    img_B = img[:, :, 2]

    # the figure gets its own name so that the loop variable is not overwritten
    fig, (ax1, ax2, ax3, ax4) = plt.subplots(1, 4, figsize=(16, 3))

    ax1.imshow(img)

    ax2.hist(img_R.flatten(), bins=16, density=True, color='red')
    ax2.set_title('Histogram - Red')

    ax3.hist(img_G.flatten(), bins=16, density=True, color='green')
    ax3.set_title('Histogram - Green')

    ax4.hist(img_B.flatten(), bins=16, density=True, color='blue')
    ax4.set_title('Hist - Blue')
    plt.show()

In [None]:
# for every image of the label group: show the picture next to the
# histograms of its red, green and blue channel
for image_name in df_demo_3.image:
    # the context manager closes the image file again; converting to RGB
    # guarantees three colour channels even for grayscale / palette / RGBA
    # files, so the channel indexing below cannot fail
    with Image.open(train_images_path + image_name) as pil_img:
        # convert image to numeric array (3D: height x width x channel)
        img = np.asarray(pil_img.convert('RGB'))

    # extract RGB components
    img_R = img[:, :, 0]
    img_G = img[:, :, 1]
    img_B = img[:, :, 2]

    # the figure gets its own name so that the loop variable is not overwritten
    fig, (ax1, ax2, ax3, ax4) = plt.subplots(1, 4, figsize=(16, 3))

    ax1.imshow(img)

    ax2.hist(img_R.flatten(), bins=16, density=True, color='red')
    ax2.set_title('Histogram - Red')

    ax3.hist(img_G.flatten(), bins=16, density=True, color='green')
    ax3.set_title('Histogram - Green')

    ax4.hist(img_B.flatten(), bins=16, density=True, color='blue')
    ax4.set_title('Hist - Blue')
    plt.show()