In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
#for dirname, _, filenames in os.walk('/kaggle/input'):
#    for filename in filenames:
#        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

We start by loading the data.

In [None]:
import numpy as np
import pandas as pd
import time

In [None]:
# Read the training listings from the competition input folder and preview
# the first five rows as a quick sanity check of the columns.
train_csv_path = '/kaggle/input/shopee-product-matching/train.csv'
train = pd.read_csv(filepath_or_buffer=train_csv_path)
train.head(5)

In [None]:
# Read the test listings from the competition input folder and preview the
# first five rows; every matching cell below operates on this frame.
test_csv_path = '/kaggle/input/shopee-product-matching/test.csv'
test = pd.read_csv(filepath_or_buffer=test_csv_path)
test.head(5)

# Initial EDA

Let's start by looking for missing values.

In [None]:
# Per-column count of missing entries in the training frame.
train.isna().sum()

Looks like there are no missing values! Now let's look for unique values in each column.

In [None]:
# Compare the number of rows with the number of distinct values per column:
# fewer distinct values than rows means some listings share that value.
# Series.nunique() counts distinct non-null values directly; it avoids sorting
# the column (as np.unique does) and does not fail if a text column ever
# contains missing values next to strings.
print(train.shape)
print(f"Unique phashes: {train['image_phash'].nunique()}")
print(f"Unique images: {train['image'].nunique()}")
print(f"Unique titles: {train['title'].nunique()}")

Here we can already see that while there are 34,250 listings in the training set, there are only 28,735 unique phashes and 33,117 unique titles, suggesting that several listings have the exact same image or title. In terms of image ids, it looks like a few thousand listings also share the exact same image id.

# A Basic Submission

Let's make a submission in which we assume all items with the same phash are the same item.

In [None]:
start_time = time.time()

# Match every posting with all postings (itself included) that share its
# perceptual hash.
#
# The grouping is done once for the whole frame instead of re-scanning every
# row for every posting, and it no longer assumes that the frame has a default
# 0..n-1 index (the previous np.where + .loc combination did).
ids = test['posting_id'].tolist()

# phash -> "id1 id2 ..." with ids kept in their original row order.
phash_to_matches = test.groupby('image_phash', sort=False)['posting_id'].agg(' '.join)

# Rows with a missing phash have no group; they fall back to matching only
# themselves rather than producing an empty/NaN entry.
matches = (
    test['image_phash']
    .map(phash_to_matches)
    .fillna(test['posting_id'])
    .tolist()
)

print(time.time() - start_time)

In [None]:
# Pair each posting id with its space-separated match list, write the CSV that
# gets submitted, and show the resulting table.
submission = pd.DataFrame(data={'posting_id': ids, 'matches': matches})
submission.to_csv(path_or_buf='submission.csv', index=False)
submission

This code had a submission score of 0.559 and took about 20-30 minutes to score. Now let's try matching all products with identical titles, images, OR phashes.

In [None]:
start_time = time.time()

# Match every posting with all postings that share its phash, its title, OR
# its image file. This is a one-hop union: matches of matches are not added.
#
# Changes from the earlier loop:
#   * no set is passed to .loc (recent pandas versions reject set indexers),
#   * matches are emitted in row order, so the output is deterministic,
#   * each column is indexed once instead of being re-scanned for every row,
#   * row positions are used throughout, so a non-default index is fine.
match_columns = ('image_phash', 'title', 'image')

ids = test['posting_id'].tolist()

# For each column: the raw values by row position, and value -> row positions.
values_by_column = {}
positions_by_value = {}
for column in match_columns:
    column_values = test[column].tolist()
    lookup = {}
    for position, value in enumerate(column_values):
        # Missing values never count as "equal" to each other.
        if not pd.isna(value):
            lookup.setdefault(value, []).append(position)
    values_by_column[column] = column_values
    positions_by_value[column] = lookup

matches = []
for position in range(len(ids)):
    # A posting always matches itself, even if all of its keys are missing.
    matched_positions = {position}
    for column in match_columns:
        value = values_by_column[column][position]
        matched_positions.update(positions_by_value[column].get(value, ()))
    matches.append(' '.join(ids[p] for p in sorted(matched_positions)))

print(time.time() - start_time)

In [None]:
# Pair each posting id with its space-separated match list, write the CSV that
# gets submitted, and show the resulting table.
submission = pd.DataFrame(data={'posting_id': ids, 'matches': matches})
submission.to_csv(path_or_buf='submission.csv', index=False)
submission

This code took about 30 mins to run after submission and scored 0.573. Let's try only matching identical titles.

In [None]:
start_time = time.time()

# Match every posting with all postings (itself included) that have exactly
# the same title.
#
# The grouping is done once for the whole frame instead of re-scanning every
# row for every posting, and it no longer assumes that the frame has a default
# 0..n-1 index (the previous np.where + .loc combination did).
ids = test['posting_id'].tolist()

# title -> "id1 id2 ..." with ids kept in their original row order.
title_to_matches = test.groupby('title', sort=False)['posting_id'].agg(' '.join)

# Rows with a missing title have no group; they fall back to matching only
# themselves rather than producing an empty/NaN entry.
matches = (
    test['title']
    .map(title_to_matches)
    .fillna(test['posting_id'])
    .tolist()
)

print(time.time() - start_time)

In [None]:
# Pair each posting id with its space-separated match list, write the CSV that
# gets submitted, and show the resulting table.
submission = pd.DataFrame(data={'posting_id': ids, 'matches': matches})
submission.to_csv(path_or_buf='submission.csv', index=False)
submission

This resulted in a submission score of 0.481. Let's try refining the code that matches images, phashes, and titles so that matching is transitive: any two listings that match each other should also match everything the other one matches. For example, if item A matches item B and item B matches item C, then A and C should also be listed as matches of each other.

In [None]:
start_time = time.time()

# Group postings transitively: two postings end up in the same group when they
# are linked by a chain of identical phashes, titles, or image files
# (A~B and B~C  =>  A, B and C all list each other).
#
# This is computed as connected components with a union-find structure. The
# earlier incremental approach attached a posting to only ONE already-seen
# group, so two groups that were later bridged by a shared posting were never
# merged and the result was not actually transitive. It also passed a set to
# .loc (rejected by recent pandas versions) and searched the growing `ids`
# list on every iteration.


def _find_root(parent, node):
    """Return the representative of the set containing ``node``.

    ``parent`` is a list where ``parent[i]`` is the parent of element ``i``
    and a root is its own parent. Every element visited on the way up is
    re-pointed directly at the root so later lookups are short.
    """
    root = node
    while parent[root] != root:
        root = parent[root]
    while parent[node] != root:
        # Right-hand side is evaluated first, so `node` advances to its old
        # parent after being re-pointed at the root.
        parent[node], node = root, parent[node]
    return root


ids = test['posting_id'].tolist()

# One union-find element per row position; initially every row is alone.
parent = list(range(len(ids)))
for column in ('image_phash', 'title', 'image'):
    first_position = {}  # value -> first row position where it was seen
    for position, value in enumerate(test[column].tolist()):
        # Missing values never link two postings together.
        if pd.isna(value):
            continue
        anchor = first_position.setdefault(value, position)
        if anchor != position:
            anchor_root = _find_root(parent, anchor)
            position_root = _find_root(parent, position)
            if anchor_root != position_root:
                parent[position_root] = anchor_root

item_group = []  # tracks which group an item is in (parallel to `ids`)
groups = {}  # tracks which items are in each group, in row order
group_of_root = {}  # union-find root -> sequential group number
for position, posting_id in enumerate(ids):
    root = _find_root(parent, position)
    group = group_of_root.setdefault(root, len(group_of_root))
    item_group.append(group)
    groups.setdefault(group, []).append(posting_id)

matches = [' '.join(groups[group]) for group in item_group]

print(time.time() - start_time)

In [None]:
# Pair each posting id with its space-separated match list, write the CSV that
# gets submitted, and show the resulting table.
submission = pd.DataFrame(data={'posting_id': ids, 'matches': matches})
submission.to_csv(path_or_buf='submission.csv', index=False)
submission

The code above took just over 30 minutes to score after submission and resulted in a score of 0.576. We can build on this approach going forward.

# Checking Runtime on Train Set

The test set we are scored on contains more than twice as much data as the train set. To make sure the code finishes in a reasonable time and runs without memory issues, we can run it on the training set concatenated with itself, which simulates a dataset roughly as large as the test set.

In [None]:
#train.loc[(train['image_phash'] == 'be9c90c3c16f631c')].index

In [None]:
#train_2 = pd.concat([train,train],axis=0,ignore_index=True)

In [None]:
#start_time = time.time()

#ids = []
#item_group = [] # tracks which group an item is in
#groups = {} # tracks which items are in each group
#for ind in train_2.index:
    
#    indices_phash = train_2.loc[(train_2['image_phash'] == train_2['image_phash'][ind])].index #np.where(train_2['image_phash'] == train_2['image_phash'][ind])[0]
#    indices_title = train_2.loc[(train_2['title'] == train_2['title'][ind])].index #np.where(train_2['title'] == train_2['title'][ind])[0]
#    indices_image = train_2.loc[(train_2['image'] == train_2['image'][ind])].index #np.where(train_2['image'] == train_2['image'][ind])[0]
#    indices = set(indices_phash).union(set(indices_title)).union(set(indices_image))
    
    # check if an item this matches already has a group
#    match_ids = list(train_2.loc[indices]['posting_id'])
#    if set(match_ids).intersection(ids):
#        existing_group = item_group[ids.index(list(set(match_ids).intersection(ids))[0])]
#        item_group.append(existing_group)
#        groups[existing_group] = set(groups[existing_group]).union(match_ids)
#    else:
#        item_group.append(len(groups))
#        groups[len(groups)] = match_ids
        
#    ids.append(train_2['posting_id'][ind])
    
#matches = [' '.join(list(groups[ind])) for ind in item_group]

#print(time.time() - start_time)

In [None]:
#start_time = time.time()

#ids = []
#item_group = [] # tracks which group an item is in
#groups = {} # tracks which items are in each group
#for ind in train_2.index:
    
#    indices_phash = np.where(train_2['image_phash'] == train_2['image_phash'][ind])[0]
#    indices_title = np.where(train_2['title'] == train_2['title'][ind])[0]
#    indices_image = np.where(train_2['image'] == train_2['image'][ind])[0]
#    indices = set(indices_phash).union(set(indices_title)).union(set(indices_image))
    
    # check if an item this matches already has a group
#    match_ids = list(train_2.loc[indices]['posting_id'])
#    if set(match_ids).intersection(ids):
#        existing_group = item_group[ids.index(list(set(match_ids).intersection(ids))[0])]
#        item_group.append(existing_group)
#        groups[existing_group] = set(groups[existing_group]).union(match_ids)
#    else:
#        item_group.append(len(groups))
#        groups[len(groups)] = match_ids
        
#    ids.append(train_2['posting_id'][ind])
    
#matches = [' '.join(list(groups[ind])) for ind in item_group]

#print(time.time() - start_time)

In [None]:
#train_submission = pd.DataFrame({'posting_id': ids, 'matches': matches})
#train_submission.head()

Some helpful resources: 

https://www.kaggle.com/ishandutta/v7-shopee-indepth-eda-one-stop-for-all-your-needs#notebook-container

https://www.kaggle.com/maksymshkliarevskyi/shopee-before-we-start-eda-phash-baseline#Shopee-Price-Match-Guarantee:-Before-we-start

https://www.kaggle.com/isaienkov/shopee-data-understanding-and-analysis#notebook-container