## The aim of this notebook is to make a pHash distance matrix which can help in the analysis of the dataset, and modelling the solution.

In [None]:
import matplotlib.pyplot as plt
import matplotlib.image as mpimg
import numpy as np
import pandas as pd

import hashlib 

from scipy.spatial.distance import pdist

import pickle 

In [None]:
# Dataset locations on Kaggle.
root_dir = '/kaggle/input/shopee-product-matching/'
train_imgs_dir = root_dir + 'train_images/'
test_imgs_dir = root_dir + 'test_images/'

# Load the three competition tables in one pass.
train, test, submission = (
    pd.read_csv(root_dir + csv_name)
    for csv_name in ('train.csv', 'test.csv', 'sample_submission.csv')
)

# Add a target column to the training set: for every posting, the array of
# posting_ids that share its label_group.
tmp = train.groupby('label_group')['posting_id'].unique().to_dict()
train['target'] = train['label_group'].map(tmp)

#### The Hamming distance is a glorified way of saying how many character positions differ between two strings of equal length.

In [None]:
def hamming_distance(hash1, hash2):
    """Calculates hamming distance between two hashes.

    Args:
        hash1: First hash, a sequence of characters (e.g. a hex string).
        hash2: Second hash; must have the same length as ``hash1``.

    Returns:
        int: Number of positions at which the two hashes differ.

    Raises:
        ValueError: If the hashes have different lengths. ``zip`` would
            otherwise silently drop the surplus characters and under-report
            the distance.
    """
    if len(hash1) != len(hash2):
        raise ValueError(
            'hashes must have the same length, got %d and %d'
            % (len(hash1), len(hash2))
        )
    # Booleans sum as 0/1, so this counts the mismatching positions.
    return sum(c1 != c2 for c1, c2 in zip(hash1, hash2))

In [None]:
# example for hamming distance between two images of the same label
train_rand_label = np.random.choice(train.label_group.value_counts().index)
tmp = train[train.label_group == train_rand_label]
# Draw both postings in a single call so a posting is not compared with
# itself; only fall back to sampling with replacement when the group holds
# a single posting.
pair = tmp[['image', 'image_phash']].sample(2, replace=len(tmp) < 2).values
(img1, hash1), (img2, hash2) = pair

print('Hamming distance:', hamming_distance(hash1, hash2))

# Use explicit axes so that the axis decorations are hidden on BOTH panels
# (a single plt.axis('off') only affects the most recently created subplot).
fig, axes = plt.subplots(1, 2, figsize=(10, 5))
for ax, img_name, img_hash in zip(axes, (img1, img2), (hash1, hash2)):
    ax.imshow(mpimg.imread(train_imgs_dir + img_name))
    ax.set_title(img_hash)
    ax.axis('off')

In [None]:
# example for hamming distance between two random images
# Sample both postings in one call (without replacement) so the same posting
# cannot be drawn twice.
pair = train[['image', 'image_phash']].sample(2).values
(img1, hash1), (img2, hash2) = pair

print('Hamming distance:', hamming_distance(hash1, hash2))

# Use explicit axes so that the axis decorations are hidden on BOTH panels
# (a single plt.axis('off') only affects the most recently created subplot).
fig, axes = plt.subplots(1, 2, figsize=(10, 5))
for ax, img_name, img_hash in zip(axes, (img1, img2), (hash1, hash2)):
    ax.imshow(mpimg.imread(train_imgs_dir + img_name))
    ax.set_title(img_hash)
    ax.axis('off')

#### At first I tried using scipy's pdist, but a 32450x32450 pairwise distance matrix adds up to more than a billion pairs.

#### After that I devised a solution which converts the pHash to a one-hot encoded version consisting of 16 features for each of the 16 characters of the hash (256 features in total). With this encoding, the Hamming distance is just 16 - dot(hash1, hash2).

In [None]:
# %%time

# # calculate pairwise distance matrix between postings using image_phash
# hashes = train.image_phash.values.reshape(-1, 1)
# dm = pdist(hashes, hamming_distance)

# save the matrix for later use
# pickle.dump(dm, open('phash_dist_mat.pkl', 'wb'))

In [None]:
# dm = squareform(pdist(hashes[:10000], lambda s1, s2: sum([c1 != c2 for h1, h2 in zip(s1, s2) for c1, c2 in zip(h1, h2)])))


In [None]:
class HashOneHot:
    """One-hot encoder/decoder for lowercase hexadecimal hash strings.

    Every character of a hash is mapped to a block of 16 consecutive
    features (one per hex digit), so a hash of ``n`` characters becomes a
    vector of ``16 * n`` features containing exactly ``n`` ones.
    """

    def __init__(self):
        hexa = '0123456789abcdef'
        self.hexa_to_idx = {c: i for i, c in enumerate(hexa)}
        self.idx_to_hexa = {i: c for i, c in enumerate(hexa)}
        # Width of the one-hot block reserved for each hash character.
        self.block_size = len(hexa)

    def encode_hash(self, hash_string):
        """One-hot encode a hash string.

        Args:
            hash_string: Hash made of lowercase hex digits.

        Returns:
            np.ndarray: Float array of shape ``(1, 16 * len(hash_string))``
            with a single 1 inside each 16-wide block.

        Raises:
            KeyError: If a character is not one of ``0123456789abcdef``.
        """
        encoding = np.zeros((1, self.block_size * len(hash_string)))
        for position, hexa in enumerate(hash_string):
            encoding[0, position * self.block_size + self.hexa_to_idx[hexa]] = 1
        return encoding

    def decode_hash(self, encoding):
        """Recover the hash string from a one-hot encoding.

        Args:
            encoding: Array produced by ``encode_hash``; either the
                ``(1, 16 * n)`` array itself or its flattened ``(16 * n,)`` row.

        Returns:
            str: The decoded hash, one hex digit per 16-wide block.
        """
        encoding = np.asarray(encoding)
        # Column indices of the hot entries, in ascending order.
        hot_columns = np.where(encoding == 1)[-1]
        # Block offsets are derived from the encoding width instead of a
        # hard-coded 256, so hashes of any length decode completely.
        offsets = range(0, encoding.shape[-1], self.block_size)
        return ''.join(
            self.idx_to_hexa[int(column) - offset]
            for offset, column in zip(offsets, hot_columns)
        )

In [None]:
# testing: a complete hash must survive an encode/decode round trip
hashes = train.image_phash.values

encoder = HashOneHot()
# hashes is a 1-D array of strings, so hashes[0] is the whole first hash
# (hashes[0][0] would only be its first character).
phash = hashes[0]
encoding = encoder.encode_hash(phash)
decoding = encoder.decode_hash(encoding)
assert decoding == phash, f'round trip failed: {decoding!r} != {phash!r}'

In [None]:
# Stack the one-hot encoding of every hash into one matrix with a row per
# posting and 16 * 16 feature columns.
n_hashes = len(hashes)
phash_onehot = np.empty((n_hashes, 16 * 16))

# Iterating over the matrix yields row views, so writing into `row` fills
# phash_onehot in place.
for row, phash in zip(phash_onehot, hashes):
    row[:] = encoder.encode_hash(phash)[0]

In [None]:
# another test: every encoded row must decode back to its original hash
# (the comparison is asserted; a bare `==` expression inside a loop is
# evaluated and then thrown away, so it would verify nothing).
for i, onehot_row in enumerate(phash_onehot):
    assert encoder.decode_hash(onehot_row) == hashes[i], f'round trip failed for row {i}'

In [None]:
# Pre-allocate the square posting-by-posting distance matrix with int8 entries.
n_posts = len(train)
phash_dist = np.empty((n_posts, n_posts), dtype=np.int8)

#### Vanilla matrix multiplication was too much for the memory to handle, so I opted for building the matrix 1000 posts at a time, which takes around 7 minutes.

In [None]:
%%time
for i in np.arange(0, 32450, 1000):
    phash_dist[i:i+1000] = 16 - (np.matmul(phash_onehot[i:i+1000], phash_onehot.T))

#### Using this matrix, we can find the n closest posts to any given post. For example, this function plots the 50 closest posts and puts each post's hash and Hamming distance in the title.

In [None]:
def plot_50_closest_posts(i):
    """Plot the 50 postings whose pHash is closest to posting ``i``.

    Reads the module-level ``phash_dist`` matrix, the ``train`` frame and
    ``train_imgs_dir``. Images are laid out on a 5 x 10 grid, ordered by
    increasing distance, and each title shows ``<hash>-<distance>``.

    Args:
        i: Row position of the query posting in ``phash_dist`` / ``train``.
    """
    closest_50_posts = np.argsort(phash_dist[i])[:50]
    # argsort yields row positions, so select rows positionally (.iloc);
    # label-based .loc is only equivalent while train keeps its default
    # 0..n-1 index.
    closest_50_rows = train.iloc[closest_50_posts]
    closest_50_images = closest_50_rows['image'].tolist()
    closest_50_hashes = closest_50_rows['image_phash'].tolist()

    n_rows = 5
    n_cols = 10

    plt.figure(figsize=(n_cols*3.2, n_rows*3.2))
    # Iterating over the selected postings (instead of a fixed 5 x 10 range)
    # also works when fewer than 50 postings are available.
    neighbours = zip(closest_50_posts, closest_50_images, closest_50_hashes)
    for idx, (post, image_name, image_hash) in enumerate(neighbours):
        plt.subplot(n_rows, n_cols, idx+1)

        img = mpimg.imread(train_imgs_dir+image_name)
        plt.imshow(img)
        plt.axis('off')
        plt.title(image_hash + '-' + str(phash_dist[i, post]))

In [None]:
# Pick a random posting by row position; the upper bound comes from the
# training frame rather than a hard-coded row count.
rand_post = np.random.choice(train.shape[0])
plot_50_closest_posts(rand_post)

#### One thing I noticed while fooling around with this function is that the similar posts probably won't appear in the top 50 closest posts. Definitely more analysis could be carried out using this matrix, so I hope you enjoy it.