In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
os.listdir()

# Importing Necessary libraries

In [None]:
import os
import matplotlib.pyplot as plt
import seaborn as sns
from wordcloud import WordCloud, STOPWORDS, ImageColorGenerator
import cv2
import imagehash
from fuzzywuzzy import fuzz

# Helper Functions

In [None]:
def make_wordcloud(df):
    """Draw a word cloud from every word found in ``df['title']``.

    Each title is split on whitespace and the resulting words are joined
    with single spaces before being passed to ``WordCloud().generate``.
    The cloud is shown with the axes hidden; nothing is returned.
    """
    words = (word for title in df['title'] for word in title.split())
    cloud = WordCloud().generate(' '.join(words))

    # Render the generated cloud as an image without axis decorations.
    plt.imshow(cloud, interpolation='bilinear')
    plt.axis("off")
    plt.show()
    
    

def hamming_distance(val1,val2):
    """Count the positions at which ``val1`` and ``val2`` differ.

    The inputs are compared element by element (character by character for
    strings), and the comparison stops at the end of the shorter input.
    For hex strings this counts differing hex digits, not differing bits.
    """
    return sum(1 for left, right in zip(val1, val2) if left != right)

        
def image_matrix(hash_values):
    """Build the pairwise hash-difference matrix for a Series of hex hashes.

    Parameters
    ----------
    hash_values : pandas.Series of str
        Hex-encoded hashes, each converted with ``imagehash.hex_to_hash``.

    Returns
    -------
    pandas.DataFrame
        Rows keep the index of ``hash_values``; columns are relabelled
        ``0 .. len(hash_values) - 1``. Column ``j`` holds ``hash - hash_j``
        for every hash in the input.
    """
    phashs = hash_values.apply(imagehash.hex_to_hash)

    # Build one difference column per reference hash and concatenate once.
    # Calling pd.concat inside the loop re-copies the growing frame on every
    # iteration.
    distance_columns = [phashs - reference for reference in phashs]

    if distance_columns:
        hash_matrix = pd.concat(distance_columns, axis=1)
    else:
        # pd.concat rejects an empty list; keep the empty-input result.
        hash_matrix = pd.DataFrame()

    hash_matrix.columns = range(len(hash_values))
    return hash_matrix


def fuzz_calculation(idx,df):
    """Score every title in ``df`` against the title stored at label ``idx``.

    Both titles are lower-cased before ``fuzz.ratio`` is applied.

    Returns
    -------
    list of (ratio, position)
        One tuple per row of ``df['title']`` in iteration order, where
        ``position`` is the 0-based enumeration position of the row.
    """
    # Look up and lower-case the query title once instead of once per row.
    query=str.lower(df['title'][idx])
    return [(fuzz.ratio(query,str.lower(title)),i)
            for i,title in enumerate(df['title'])]

def fuzz_calculation_with_sep(idx,df):
    """Score every title against the title at label ``idx``, segment by segment.

    Some titles hold several alternative titles separated by ``/``. Both the
    query title and each candidate title are split on ``/``, every segment is
    lower-cased, and the highest ``fuzz.ratio`` over all segment pairs is kept.

    Returns
    -------
    list of (best_ratio, position)
        One tuple per row of ``df['title']`` in iteration order, where
        ``position`` is the 0-based enumeration position of the row.
    """
    # The query segments do not change between rows, so prepare them once.
    query_parts=[str.lower(part) for part in df['title'][idx].split('/')]

    val=[]
    for i,ele in enumerate(df['title']):
        candidate_parts=[str.lower(part) for part in ele.split('/')]
        # 0 is the floor, as in the original running-maximum loop.
        best=max(
            (fuzz.ratio(sent1,sent2)
             for sent1 in query_parts
             for sent2 in candidate_parts),
            default=0,
        )
        val.append((best,i))

    return val

# Exploring Train Data

In [None]:
train_df=pd.read_csv(r'/kaggle/input/shopee-product-matching/train.csv')

In [None]:
train_df.shape

In [None]:
train_df.head()

In [None]:
train_df.info()

# Observation:
We have 34,250 training images. The dataframe contains the image title as a text feature, plus image_phash and label_group as additional features. For each image we need to predict the posting_ids of the similar products.

# Analyzing Test Data

In [None]:
test_df=pd.read_csv(r'/kaggle/input/shopee-product-matching/test.csv')

In [None]:
test_df.shape

In [None]:
test_df.head()

In [None]:
test_df.info()

# Exploring Train DataFrame

Similar images belong to the same label_group. Hence, let us first begin analyzing this column

In [None]:
# Number of postings per label_group, largest groups first.
label_group_df=(
    train_df.groupby('label_group')['posting_id']
    .count()
    .reset_index(name='count')
    .sort_values(by=['count'],ascending=False)
)
label_group_df

In [None]:
# Count rows whose label_group value already appeared in an earlier row.
var=int(train_df["label_group"].duplicated().sum())
print('Number of duplicated label_groups: ',var)


In [None]:
# Count rows whose posting_id already appeared in an earlier row.
var=int(train_df["posting_id"].duplicated().sum())
print('Number of duplicated posting_id: ',var)

In [None]:
print('Number of unique label groups: ',label_group_df.shape[0])

In [None]:
# label_group_df is sorted by count (descending), so head(50) is the 50 largest groups.
sns.barplot(data=label_group_df.head(50),x='label_group',y='count')
plt.title('Number of pictures in same label for first 50 label_groups')

In [None]:
# label_group_df is sorted by count (descending), so tail(50) is the 50 smallest groups.
sns.barplot(data=label_group_df.tail(50),x='label_group',y='count')
plt.title('Number of pictures in same label for last 50 label_groups')

# Observation:
1. From the above information we can see that the largest label_group contains 51 products.
2. There are 23236 rows whose label_group already appears earlier in the train dataset, because many products share the same label_group.


# Exploring Title
Now, we have been given the title of each image as well. We will explore this area and find out how similar these titles are for similar products/images.

***We now see the length of every title***

In [None]:
# Number of whitespace-separated words in each title, computed with
# vectorised string operations instead of one label-based lookup per row.
train_df['title_length']=train_df['title'].str.split().str.len()


In [None]:
train_df.sort_values(by='title_length',ascending=False).head()

In [None]:
train_df.sort_values(by='title_length',ascending=False).tail()

# Observation:
We see that maximum number of words in any title is 61(not a very large number) and minimum is 1.

In [None]:
#selecting a label_group at random and seeing the titles for that group
sample_df=train_df[train_df['label_group']==1163569239]

In [None]:
sample_df.head()

**For a clearer idea let us draw a wordcloud on the titles for this label group and see if we find something**

In [None]:
print('Wordcloud for images from group_label: 1163569239')
# Print the scalar count; printing the filtered Series would also dump its
# index, name and dtype.
print('Total number of images for this group_label are: ',
      label_group_df.loc[label_group_df['label_group']==1163569239,'count'].iloc[0])
make_wordcloud(sample_df)

In [None]:
sns.barplot(x=sample_df['posting_id'],y=sample_df['title_length'])

In [None]:

sample_df.value_counts(['image_phash']).plot(kind='bar')

**Let us try making wordcloud for more group_labels**

In [None]:
#selecting a label_group at random and seeing the titles for that group
sample_df2=train_df[train_df['label_group']==2126962532]
print('Wordcloud for images from group_label: 2126962532')
# Print the scalar count; printing the filtered Series would also dump its
# index, name and dtype.
print('Total number of images for this group_label are: ',
      label_group_df.loc[label_group_df['label_group']==2126962532,'count'].iloc[0])
make_wordcloud(sample_df2)

In [None]:
sns.barplot(x=sample_df2['posting_id'],y=sample_df2['title_length'])

In [None]:
sample_df2.value_counts(['image_phash']).plot(kind='bar')

In [None]:
#selecting a label_group at random and seeing the titles for that group
sample_df3=train_df[train_df['label_group']==2357508171]
print('Wordcloud for images from group_label: 2357508171')
# Print the scalar count; printing the filtered Series would also dump its
# index, name and dtype.
print('Total number of images for this group_label are: ',
      label_group_df.loc[label_group_df['label_group']==2357508171,'count'].iloc[0])
make_wordcloud(sample_df3)

In [None]:
sns.barplot(x=sample_df3['posting_id'],y=sample_df3['title_length'])

In [None]:
sample_df3.value_counts(['image_phash']).plot(kind='bar')

In [None]:
#selecting a label_group at random and seeing the titles for that group
sample_df4=train_df[train_df['label_group']==3627744656]
print('Wordcloud for images from group_label: 3627744656')
# Print the scalar count; printing the filtered Series would also dump its
# index, name and dtype.
print('Total number of images for this group_label are: ',
      label_group_df.loc[label_group_df['label_group']==3627744656,'count'].iloc[0])
make_wordcloud(sample_df4)

In [None]:
sns.barplot(x=sample_df4['posting_id'],y=sample_df4['title_length'])

In [None]:
sample_df4.value_counts(['image_phash']).plot(kind='bar')

# Observations:
1. Products belonging to different label_groups have quite different words. Hence, it can be concluded that title plays a major role in this problem.
2. Also, for products belonging to same label_group there are few words appearing more frequently this once again supports the above statement.
3. There is no relation between number of words in title for images that belong to the same label_group.
4. The hash_value is widely scattered for images in the same label_group.

# Hash_Value Check:
We have seen that images belonging to the same label_group might have different hash_values. Let us check whether images with the same hash_value have different label_groups or not.

In [None]:
train_df.head()

In [None]:
# Number of postings that share each perceptual hash.
hash_df=(
    train_df.groupby('image_phash')['posting_id']
    .count()
    .reset_index(name='count')
)
hash_df.sort_values(by='count',ascending=False)

In [None]:
hash_df.sort_values(by='count',ascending=False)[10:20]

In [None]:
hash_df_sample=train_df[train_df['image_phash']=='fad28daa2ad05595'].groupby(['label_group']).count()
hash_df_sample

In [None]:
hash_df_sample=train_df[train_df['image_phash']=='d0c0ea37bd9acce0'].groupby(['label_group']).count()
hash_df_sample

In [None]:
hash_df_sample=train_df[train_df['image_phash']=='f6d98134b904b56b'].groupby(['label_group']).count()
hash_df_sample

In [None]:
hash_df_sample=train_df[train_df['image_phash']=='be12e12f9ec1e198'].groupby(['label_group']).count()
hash_df_sample

In [None]:
hash_df_sample=train_df[train_df['image_phash']=='ada4c4781f93686e'].groupby(['label_group']).count()
hash_df_sample

In [None]:
hash_df_sample=train_df[train_df['image_phash']=='ad29e81e92b295b5'].groupby(['label_group']).count()
hash_df_sample

# Observation:
1. There are cases in which the same hash_value images belong to different groups. But it can be said that the majority belong to the same label_group only.

2. We are given perceptual hashing value, hence for calculating the similarity between two images we will consider hamming distance.

# Plotting Some Images:


In [None]:
#plt.figure(figsize=(30,30))
# Show the first 12 training images on a 3x4 grid.
for i,image_name in enumerate(train_df['image'].head(12)):
    img=cv2.imread(r'/kaggle/input/shopee-product-matching/train_images/'+image_name)
    plt.subplot(3,4,i+1)
    # cv2.imread returns BGR channel order; matplotlib expects RGB.
    plt.imshow(cv2.cvtColor(img,cv2.COLOR_BGR2RGB))

For better observation let us try plotting images that belong to the same label_group

In [None]:
#plt.figure(figsize=(30,30))
# drop=True keeps this cell re-runnable: a plain reset_index() inserts another
# index column on every execution and eventually raises.
sample_df=sample_df.reset_index(drop=True)
# head(12) shows at most 12 images and does not fail on smaller groups.
for i,image_name in enumerate(sample_df['image'].head(12)):
    img=cv2.imread(r'/kaggle/input/shopee-product-matching/train_images/'+image_name)
    plt.subplot(3,4,i+1)
    # cv2.imread returns BGR channel order; matplotlib expects RGB.
    plt.imshow(cv2.cvtColor(img,cv2.COLOR_BGR2RGB))

In [None]:
#plt.figure(figsize=(30,30))
# drop=True keeps this cell re-runnable: a plain reset_index() inserts another
# index column on every execution and eventually raises.
sample_df2=sample_df2.reset_index(drop=True)
# head(8) shows at most 8 images and does not fail on smaller groups.
for i,image_name in enumerate(sample_df2['image'].head(8)):
    img=cv2.imread(r'/kaggle/input/shopee-product-matching/train_images/'+image_name)
    plt.subplot(3,4,i+1)
    # cv2.imread returns BGR channel order; matplotlib expects RGB.
    plt.imshow(cv2.cvtColor(img,cv2.COLOR_BGR2RGB))

In [None]:
#plt.figure(figsize=(30,30))
# drop=True keeps this cell re-runnable: a plain reset_index() inserts another
# index column on every execution and eventually raises.
sample_df3=sample_df3.reset_index(drop=True)
# head(2) shows at most 2 images and does not fail on smaller groups.
for i,image_name in enumerate(sample_df3['image'].head(2)):
    img=cv2.imread(r'/kaggle/input/shopee-product-matching/train_images/'+image_name)
    plt.subplot(1,2,i+1)
    # cv2.imread returns BGR channel order; matplotlib expects RGB.
    plt.imshow(cv2.cvtColor(img,cv2.COLOR_BGR2RGB))

# Images with Similar Hash Value
Now we will see few image plots where the image had the same hash_value

In [None]:
hash_df_sample1=train_df[train_df['image_phash']=='ada4c4781f93686e'].reset_index()
#plt.figure(figsize=(30,30))
# head(12) shows at most 12 images and does not fail if fewer share this hash.
for i,image_name in enumerate(hash_df_sample1['image'].head(12)):
    img=cv2.imread(r'/kaggle/input/shopee-product-matching/train_images/'+image_name)
    plt.subplot(3,4,i+1)
    # cv2.imread returns BGR channel order; matplotlib expects RGB.
    plt.imshow(cv2.cvtColor(img,cv2.COLOR_BGR2RGB))

In [None]:
hash_df_sample2=train_df[train_df['image_phash']=='f6d98134b904b56b'].reset_index()
#plt.figure(figsize=(30,30))
# head(12) shows at most 12 images and does not fail if fewer share this hash.
for i,image_name in enumerate(hash_df_sample2['image'].head(12)):
    img=cv2.imread(r'/kaggle/input/shopee-product-matching/train_images/'+image_name)
    plt.subplot(3,4,i+1)
    # cv2.imread returns BGR channel order; matplotlib expects RGB.
    plt.imshow(cv2.cvtColor(img,cv2.COLOR_BGR2RGB))

# Observation:
1. Hash_value of each image is an important feature for any given image and it helps in grouping similar images together.
2. For perceptual hashing hamming distance is the main measure of calculating the similarity between two given images.
3. The shorter (smaller value) the value of hamming distance the more similar are the images.

# Phash Analysis
Based on the phash values we will first calculate the matrix of pairwise hash differences, starting with the first 1,000 training images.

In [None]:
hash_matrix_1_1000 = image_matrix(train_df['image_phash'][:1000])
hash_matrix_1_1000.head()

# Further Steps:
1. We are looking for similar product images. To begin with, for every product we take the 50 most similar products based on the hash differences calculated above.

In [None]:
simialrity_dic={}
for i in range(len(hash_matrix_1_1000)):
    # Pull column i out once; chained hash_matrix_1_1000[i][j] lookups cost
    # a Series access for every single element.
    distances=hash_matrix_1_1000[i].tolist()
    # sorted() is stable, so rows with equal distance stay in ascending row
    # order, the same ordering as sorting (distance, row) tuples.
    ranked=sorted(range(len(distances)),key=distances.__getitem__)
    # Slicing keeps at most 50 neighbours and does not fail on smaller matrices.
    simialrity_dic[i]=ranked[:50]

# Plotting Similar Products Based on Above Observation

In [None]:
# File names of the 12 nearest neighbours of image 0 (by hash difference).
similar_pdts_0=train_df['image'][simialrity_dic[0][:12]].reset_index()
plt.figure(figsize=(10,10))
for i,image_name in enumerate(similar_pdts_0['image']):
    img=cv2.imread(r'/kaggle/input/shopee-product-matching/train_images/'+image_name)
    plt.subplot(3,4,i+1)
    # cv2.imread returns BGR channel order; matplotlib expects RGB.
    plt.imshow(cv2.cvtColor(img,cv2.COLOR_BGR2RGB))

In [None]:
# File names of the 12 nearest neighbours of image 1 (by hash difference).
similar_pdts_1=train_df['image'][simialrity_dic[1][:12]].reset_index()
plt.figure(figsize=(10,10))
for i,image_name in enumerate(similar_pdts_1['image']):
    img=cv2.imread(r'/kaggle/input/shopee-product-matching/train_images/'+image_name)
    plt.subplot(3,4,i+1)
    # cv2.imread returns BGR channel order; matplotlib expects RGB.
    plt.imshow(cv2.cvtColor(img,cv2.COLOR_BGR2RGB))

In [None]:
# File names of the 12 nearest neighbours of image 11 (by hash difference).
similar_pdts_11=train_df['image'][simialrity_dic[11][:12]].reset_index()
plt.figure(figsize=(10,10))
for i,image_name in enumerate(similar_pdts_11['image']):
    img=cv2.imread(r'/kaggle/input/shopee-product-matching/train_images/'+image_name)
    plt.subplot(3,4,i+1)
    # cv2.imread returns BGR channel order; matplotlib expects RGB.
    plt.imshow(cv2.cvtColor(img,cv2.COLOR_BGR2RGB))

# Observation:
Based on these the results are not very promising. This might also be because we are only taking a sample of images. Let us try with first 10 images taking their matching hash_value with all the images in the dataset.

In [None]:
hash_dic = {}
val=imagehash.hex_to_hash(train_df['image_phash'][0])
# Pair the hash difference between image 0 and every training image with
# that image's position.
var=[(val-imagehash.hex_to_hash(ele),idx)
     for idx,ele in enumerate(train_df['image_phash'])]
hash_dic[0]=var

In [None]:
# Row positions ordered from smallest to largest hash difference.
val=[position for _,position in sorted(hash_dic[0])]

similar_pdts_0_all=train_df['image'][val[:12]].reset_index()
plt.figure(figsize=(10,10))
for i,image_name in enumerate(similar_pdts_0_all['image']):
    img=cv2.imread(r'/kaggle/input/shopee-product-matching/train_images/'+image_name)
    plt.subplot(3,4,i+1)
    # cv2.imread returns BGR channel order; matplotlib expects RGB.
    plt.imshow(cv2.cvtColor(img,cv2.COLOR_BGR2RGB))

# Let us now check for these 50 images, how many of them have same label_group

In [None]:
# Compare the label_group of each of the 50 nearest candidates with image 0's.
query_label=train_df['label_group'][0]
print('Label_group of image_0 is: ',query_label)
for ele in val[:50]:
    candidate_label=train_df['label_group'][ele]
    print(ele,candidate_label,candidate_label==query_label)

# For Index 2937


In [None]:
hash_dic = {}
val=imagehash.hex_to_hash(train_df['image_phash'][2937])
# Pair the hash difference between image 2937 and every training image with
# that image's position.
var=[(val-imagehash.hex_to_hash(ele),idx)
     for idx,ele in enumerate(train_df['image_phash'])]
hash_dic[2937]=var

In [None]:
# Row positions ordered from smallest to largest hash difference.
val=[position for _,position in sorted(hash_dic[2937])]

similar_pdts_2937_all=train_df['image'][val[:12]].reset_index()
plt.figure(figsize=(10,10))
for i,image_name in enumerate(similar_pdts_2937_all['image']):
    img=cv2.imread(r'/kaggle/input/shopee-product-matching/train_images/'+image_name)
    plt.subplot(3,4,i+1)
    # cv2.imread returns BGR channel order; matplotlib expects RGB.
    plt.imshow(cv2.cvtColor(img,cv2.COLOR_BGR2RGB))

In [None]:
# Compare the label_group of each of the 50 nearest candidates with image 2937's.
# The printed label now names image 2937; it previously said image_0.
query_label=train_df['label_group'][2937]
print('Label_group of image_2937 is: ',query_label)
for ele in val[:50]:
    candidate_label=train_df['label_group'][ele]
    print(ele,candidate_label,candidate_label==query_label)

# Observation:
1. We get good results but only in the case when there are images with same hash_value. Hence, one could rely on this technique till some extent. This also clearly shows that hash_value is an important parameter to consider here.
2. We see that none of the top 50 similar images based on hash difference belongs to the same label_group. We will repeat these steps with Hamming distance and see if we find something.

# Analyzing How Good Hamming Distance is Performing


In [None]:
labelgroup_sample_0=train_df[train_df['label_group']==train_df['label_group'][0]].reset_index()
labelgroup_sample_0

In [None]:
hamming_dic = {}
# Fetch the query hash once rather than on every loop iteration.
query_phash=train_df['image_phash'][0]
# Pair the distance between image 0 and every training image with that
# image's position.
var=[(hamming_distance(query_phash,ele),idx)
     for idx,ele in enumerate(train_df['image_phash'])]
hamming_dic[0]=var

In [None]:
temp=sorted(hamming_dic[0])
print(temp[:200])

# Observation:
1. The image at index 33161 has the same label_group as the image at index 0, but the Hamming distance between the two is 14. This is quite a large value, so the image at index 33161 does not appear even among the 200 images most similar to the image at index 0 by Hamming distance. This suggests that we cannot rely on the Hamming distance alone.

In [None]:
# Row positions ordered from smallest to largest Hamming distance.
val=[position for _,position in sorted(hamming_dic[0])]

similar_pdts_0_all=train_df['image'][val[:12]].reset_index()
plt.figure(figsize=(10,10))
for i,image_name in enumerate(similar_pdts_0_all['image']):
    img=cv2.imread(r'/kaggle/input/shopee-product-matching/train_images/'+image_name)
    plt.subplot(3,4,i+1)
    # cv2.imread returns BGR channel order; matplotlib expects RGB.
    plt.imshow(cv2.cvtColor(img,cv2.COLOR_BGR2RGB))

In [None]:
# Compare the label_group of each of the 50 nearest candidates with image 0's.
query_label=train_df['label_group'][0]
print('Label_group of image_0 is: ',query_label)
for ele in val[:50]:
    candidate_label=train_df['label_group'][ele]
    print(ele,candidate_label,candidate_label==query_label)

# For Index 2937

In [None]:
# Fetch the query hash once rather than on every loop iteration.
query_phash=train_df['image_phash'][2937]
# Pair the distance between image 2937 and every training image with that
# image's position.
var=[(hamming_distance(query_phash,ele),idx)
     for idx,ele in enumerate(train_df['image_phash'])]
hamming_dic[2937]=var

In [None]:
# Row positions ordered from smallest to largest Hamming distance.
val=[position for _,position in sorted(hamming_dic[2937])]

similar_pdts_2937_all=train_df['image'][val[:12]].reset_index()
plt.figure(figsize=(10,10))
for i,image_name in enumerate(similar_pdts_2937_all['image']):
    img=cv2.imread(r'/kaggle/input/shopee-product-matching/train_images/'+image_name)
    plt.subplot(3,4,i+1)
    # cv2.imread returns BGR channel order; matplotlib expects RGB.
    plt.imshow(cv2.cvtColor(img,cv2.COLOR_BGR2RGB))

In [None]:
plt.figure(figsize=(10,10))
# head(2) shows at most 2 images and does not fail on a single-image group.
for i,image_name in enumerate(labelgroup_sample_0['image'].head(2)):
    img=cv2.imread(r'/kaggle/input/shopee-product-matching/train_images/'+image_name)
    plt.subplot(3,4,i+1)
    # cv2.imread returns BGR channel order; matplotlib expects RGB.
    plt.imshow(cv2.cvtColor(img,cv2.COLOR_BGR2RGB))

# Analyzing Similarity Based on Titles:
We have considered the hamming distance and hash difference. Let us now see how good it performs for titles.

In [None]:
fuzz.ratio(str.lower(train_df['title'][0]),str.lower(train_df['title'][33161]))

In [None]:
# Score every training title against each of the four query rows.
fuzz_dic={
    query_idx:fuzz_calculation(query_idx,train_df)
    for query_idx in (0,10,30,2937)
}

# For Index 0:

In [None]:
# Row positions ordered from highest to lowest fuzz ratio.
val_0=[position for _,position in sorted(fuzz_dic[0],reverse=True)]

similar_pdts_0_all=train_df['image'][val_0[:12]].reset_index()
plt.figure(figsize=(10,10))
for i,image_name in enumerate(similar_pdts_0_all['image']):
    img=cv2.imread(r'/kaggle/input/shopee-product-matching/train_images/'+image_name)
    plt.subplot(3,4,i+1)
    # cv2.imread returns BGR channel order; matplotlib expects RGB.
    plt.imshow(cv2.cvtColor(img,cv2.COLOR_BGR2RGB))

In [None]:
# Compare the label_group of each of the 12 best title matches with image 0's.
query_label=train_df['label_group'][0]
print('Label_group of image_0 is: ',query_label)
for ele in val_0[:12]:
    candidate_label=train_df['label_group'][ele]
    print(ele,candidate_label,candidate_label==query_label)

# For Index 10:

In [None]:
# Row positions ordered from highest to lowest fuzz ratio.
val_10=[position for _,position in sorted(fuzz_dic[10],reverse=True)]

similar_pdts_10_all=train_df['image'][val_10[:12]].reset_index()
plt.figure(figsize=(10,10))
for i,image_name in enumerate(similar_pdts_10_all['image']):
    img=cv2.imread(r'/kaggle/input/shopee-product-matching/train_images/'+image_name)
    plt.subplot(3,4,i+1)
    # cv2.imread returns BGR channel order; matplotlib expects RGB.
    plt.imshow(cv2.cvtColor(img,cv2.COLOR_BGR2RGB))

In [None]:
# Compare the label_group of each of the 12 best title matches with image 10's.
# The printed label now names image 10; it previously said image_0.
query_label=train_df['label_group'][10]
print('Label_group of image_10 is: ',query_label)
for ele in val_10[:12]:
    candidate_label=train_df['label_group'][ele]
    print(ele,candidate_label,candidate_label==query_label)

In [None]:
train_df[train_df['label_group']==train_df['label_group'][10]]

# For Index 30:

In [None]:
# Row positions ordered from highest to lowest fuzz ratio.
val_30=[position for _,position in sorted(fuzz_dic[30],reverse=True)]

similar_pdts_30_all=train_df['image'][val_30[:12]].reset_index()
plt.figure(figsize=(10,10))
for i,image_name in enumerate(similar_pdts_30_all['image']):
    img=cv2.imread(r'/kaggle/input/shopee-product-matching/train_images/'+image_name)
    plt.subplot(3,4,i+1)
    # cv2.imread returns BGR channel order; matplotlib expects RGB.
    plt.imshow(cv2.cvtColor(img,cv2.COLOR_BGR2RGB))

In [None]:
# Compare the label_group of each of the 12 best title matches with image 30's.
query_label=train_df['label_group'][30]
print('Label_group of image_30 is: ',query_label)
for ele in val_30[:12]:
    candidate_label=train_df['label_group'][ele]
    print(ele,candidate_label,candidate_label==query_label)

In [None]:
train_df[train_df['label_group']==train_df['label_group'][30]]

In [None]:
fuzz_dic[30]=fuzz_calculation_with_sep(30,train_df)

In [None]:
# Row positions ordered from highest to lowest fuzz ratio.
val_30=[position for _,position in sorted(fuzz_dic[30],reverse=True)]

similar_pdts_30_all=train_df['image'][val_30[:12]].reset_index()
plt.figure(figsize=(10,10))
for i,image_name in enumerate(similar_pdts_30_all['image']):
    img=cv2.imread(r'/kaggle/input/shopee-product-matching/train_images/'+image_name)
    plt.subplot(3,4,i+1)
    # cv2.imread returns BGR channel order; matplotlib expects RGB.
    plt.imshow(cv2.cvtColor(img,cv2.COLOR_BGR2RGB))

In [None]:
# Compare the label_group of each of the 12 best title matches with image 30's.
query_label=train_df['label_group'][30]
print('Label_group of image_30 is: ',query_label)
for ele in val_30[:12]:
    candidate_label=train_df['label_group'][ele]
    print(ele,candidate_label,candidate_label==query_label)

# For Index 2937 

In [None]:
# Row positions ordered from highest to lowest fuzz ratio.
val_2937=[position for _,position in sorted(fuzz_dic[2937],reverse=True)]

similar_pdts_2937_all=train_df['image'][val_2937[:20]].reset_index()
plt.figure(figsize=(10,10))
for i,image_name in enumerate(similar_pdts_2937_all['image']):
    img=cv2.imread(r'/kaggle/input/shopee-product-matching/train_images/'+image_name)
    plt.subplot(4,5,i+1)
    # cv2.imread returns BGR channel order; matplotlib expects RGB.
    plt.imshow(cv2.cvtColor(img,cv2.COLOR_BGR2RGB))

In [None]:
# Compare the label_group of each of the 20 best title matches with image 2937's.
query_label=train_df['label_group'][2937]
print('Label_group of image_2937 is: ',query_label)
for ele in val_2937[:20]:
    candidate_label=train_df['label_group'][ele]
    print(ele,candidate_label,candidate_label==query_label)

In [None]:
train_df[train_df['label_group']==train_df['label_group'][2937]]

# Observation:
1. Title is an important feature. We should not ignore words in the title. The higher the fuzz ratio the more similar the images are turning out to be.