In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the input directory tree and print the full path of every file found.
for folder, _subdirs, files in os.walk('/kaggle/input'):
    for name in files:
        print(os.path.join(folder, name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

### Loading all the required Libraries

In [None]:
from PIL import Image
import random
import matplotlib.pyplot as plt
%matplotlib inline

from sklearn.metrics.pairwise import cosine_similarity

### Now let's analyse the data a little bit..

In [None]:
# Root of the competition data; every other path below is derived from it.
image_dir = '../input/shopee-product-matching'

# Image folders.
train_images = f'{image_dir}/train_images'
test_images = f'{image_dir}/test_images'

# Metadata tables.
train_csv = f'{image_dir}/train.csv'
test_csv = f'{image_dir}/test.csv'

In [None]:
# Read the training metadata CSV into a DataFrame and preview its first rows.
train = pd.read_csv(train_csv)
train.head()

In [None]:
# Read the test metadata CSV into a DataFrame and preview its first rows.
test = pd.read_csv(test_csv)
test.head()

In [None]:
# os.listdir returns entries in arbitrary order; sort them so the preview
# below is the same on every run.
images = sorted(os.listdir(train_images))
images[:2]

In [None]:
def show_image(class_num, examples = 2, train_df = train, train_images_path = train_images):
    """Plot up to ``examples`` randomly chosen images of one label group side by side.

    Parameters
    ----------
    class_num
        Value of the ``label_group`` column whose images are shown.
    examples : int
        Maximum number of images to draw; also the number of subplot columns.
    train_df : pandas.DataFrame
        Table with ``label_group`` and ``image`` columns. Defaults to the
        notebook-level ``train`` frame.
    train_images_path : str
        Directory that the file names in the ``image`` column are relative to.

    Returns
    -------
    None
        The images are drawn on a newly created matplotlib figure.
    """
    # Filter the frame that was passed in (not the global `train`), so that
    # the `train_df` argument is actually honoured.
    group_images = train_df.loc[train_df['label_group'] == class_num, 'image']
    # Shuffle the group's file names and keep at most `examples` of them.
    image_list = group_images.sample(frac=1)[:examples].to_list()

    plt.figure(figsize=(20,10))
    for i, image_name in enumerate(image_list):
        full_path = os.path.join(train_images_path, image_name)
        plt.subplot(1, examples, i + 1)
        plt.axis('off')
        # Open the file in a context manager so the handle is released once
        # the pixels have been handed to matplotlib.
        with Image.open(full_path) as img:
            plt.imshow(img)
        plt.title(f'Class: {class_num}')

In [None]:
# Draw two distinct label groups at random and plot example images for each.
label_pool = list(train['label_group'].unique())
nums = random.sample(label_pool, 2)
for num in nums:
    show_image(num)

In [None]:
# Count the number of distinct values in each column of the training table.
train.nunique()

### So what is a phash?

A perceptual hash is a fingerprint of a multimedia file derived from various features from its content. Unlike cryptographic hash functions which rely on the avalanche effect of small changes in input leading to drastic changes in the output, perceptual hashes are "close" to one another if the features are similar.

Perceptual hashes are robust enough to take into account transformations or "attacks" on a given input and yet be flexible enough to distinguish between dissimilar files. Such attacks can include rotation, skew, contrast adjustment and different compression/formats. All of these challenges make perceptual hashing an interesting problem.

### Now let's try to see the relationship between image label group and phash.. 

Selecting two random images from the train csv

In [None]:
# Pick two distinct posting ids at random.
imageSample = random.sample(list(train.posting_id.unique()), 2)

# Collect the matching rows in sample order with a single concat. Growing a
# frame by repeatedly concatenating onto an empty DataFrame copies the data on
# every iteration and upcasts the columns to object dtype.
sampleDf = pd.concat([train[train.posting_id == im] for im in imageSample])

In [None]:
# Display the rows selected for the sampled postings.
sampleDf

Now, let's take the two sampled images and look at the titles of the other images that belong to the same label group as each of them.

In [None]:
# Label group of each sampled posting (first matching row), in sample order.
sampleLabels = [
    train.loc[train.posting_id == imageId, 'label_group'].iloc[0]
    for imageId in imageSample
]

# All training rows that belong to those label groups, one chunk per sampled
# posting, assembled with a single concat instead of growing a frame in a loop.
identicalDf = pd.concat([train[train['label_group'] == label] for label in sampleLabels])


In [None]:
# Display every training row that shares a label group with a sampled posting.
identicalDf

### Naive Model 

If two images have the same phash, we treat them as the same product. The code below matches postings on the phash only.
This is our naive baseline approach.

In [None]:
# Naive matcher: two postings "match" when their image_phash values are equal.

# For every phash, join ALL posting ids that carry it, so each posting is
# matched with itself and with every duplicate rather than only the first row
# that happens to share the hash.
# NOTE(review): ids are joined with a single space, assuming that is the
# separator the submission format expects — confirm.
phashToPostings = test.groupby('image_phash')['posting_id'].agg(
    lambda ids: ' '.join(map(str, ids))
)

# First phash recorded for each posting id.
postingToPhash = test.drop_duplicates('posting_id').set_index('posting_id')['image_phash']

# Unique posting ids in sorted order; one output row per posting.
imagePosting = sorted(test['posting_id'].dropna().unique())

postings = list(imagePosting)
# Two indexed lookups per posting instead of rescanning the whole table.
matches = [phashToPostings[postingToPhash[imagePost]] for imagePost in imagePosting]
    

In [None]:
# Assemble the two-column submission table in one step.
submission = pd.DataFrame({'posting_id': postings, 'matches': matches})

In [None]:
# Preview only the first few entries; the full list has one item per posting
# and would flood the cell output.
matches[:5]

In [None]:
# Write the submission table to submission.csv in the current working
# directory, leaving out the DataFrame index column.
submission.to_csv('submission.csv', index=False)