In [None]:
import cv2
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from glob import glob

# Inspect dataset

- Train set has about 30k images
- Only 3 test images are publicly available
- Columns
    - posting_id: unique id for the posted image
    - image: filename of each image. Most are unique, but not all
    - image_phash: perceptual hash value associated with an image, see details in https://en.wikipedia.org/wiki/Perceptual_hashing
    - title: description sentence
    - label_group: ID of identical product (can be posted with different image). Only in train set.

In [None]:
# Root of the dataset and the two image folders beneath it.
datadir = '/kaggle/input/shopee-product-matching'
train_image_dir = datadir + '/train_images'
test_image_dir = datadir + '/test_images'

# Collect every .jpg path in each folder, then report how many were found.
train_image_paths = glob(train_image_dir + '/*.jpg')
test_image_paths = glob(test_image_dir + '/*.jpg')

print('Train images:', len(train_image_paths))
print('Test images:', len(test_image_paths))

In [None]:
# Read the three CSV tables that sit directly under `datadir`.
df_train, df_test, df_sub = (
    pd.read_csv(f'{datadir}/{stem}.csv')
    for stem in ('train', 'test', 'sample_submission')
)

# Preview the first rows of the training table.
df_train.head()

In [None]:
# Preview the first five rows of the test table.
df_test.head(n=5)

In [None]:
# Preview the first five rows of the sample submission.
df_sub.head(n=5)

# Explore posting_id and image columns

- posting_id: unique
- image: most are unique, but not all

In [None]:
# Tally how often each posting_id occurs in the training table.
df_train['posting_id'].value_counts()

In [None]:
# Number of training rows that share each image filename.
image_counts = df_train['image'].value_counts()
print(image_counts.head(20))

# Histogram of those per-filename row counts.
fig, ax = plt.subplots(figsize=(10, 8))
ax.set(xlabel='N appearances of image filename', ylabel='Counts')
ax.hist(image_counts, bins=10, range=(0, 10))

## Visualize train images

In [None]:
# Show the first `n_visualize` training images side by side, each titled with
# its perceptual hash, and print the matching product titles.
n_visualize = 5

# squeeze=False keeps `axes` two-dimensional, so the indexing below also works
# when n_visualize is 1 (plt.subplots would otherwise return a bare Axes).
fig, axes = plt.subplots(1, n_visualize, figsize=(n_visualize*5, 5), squeeze=False)
for i in range(n_visualize):
    record = df_train.iloc[i]
    filename = record.image
    phash = record.image_phash
    title = record.title

    path = f'{train_image_dir}/{filename}'
    image = cv2.imread(path)
    # cv2.imread reports failure by returning None instead of raising; stop
    # here with the offending path rather than on a later subscript error.
    if image is None:
        raise FileNotFoundError(f'Could not read image: {path}')
    # OpenCV decodes to BGR channel order; matplotlib expects RGB.
    image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)
    image = cv2.resize(image, (256, 256))

    ax = axes[0, i]
    ax.imshow(image)
    ax.set_title(f'phash: {phash}')
    print(f'Title {i+1}:', title)

## Visualize test images

In [None]:
# Show the first `n_visualize` test images side by side, each titled with its
# perceptual hash, and print the matching product titles.
n_visualize = 3

# squeeze=False keeps `axes` two-dimensional, so the indexing below also works
# when n_visualize is 1 (plt.subplots would otherwise return a bare Axes).
fig, axes = plt.subplots(1, n_visualize, figsize=(n_visualize*5, 5), squeeze=False)
for i in range(n_visualize):
    record = df_test.iloc[i]
    filename = record.image
    title = record.title
    phash = record.image_phash

    path = f'{test_image_dir}/{filename}'
    image = cv2.imread(path)
    # cv2.imread reports failure by returning None instead of raising; stop
    # here with the offending path rather than on a later subscript error.
    if image is None:
        raise FileNotFoundError(f'Could not read image: {path}')
    # OpenCV decodes to BGR channel order; matplotlib expects RGB.
    image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)
    image = cv2.resize(image, (256, 256))

    ax = axes[0, i]
    ax.imshow(image)
    ax.set_title(f'phash: {phash}')
    print(f'Title {i+1}:', title)

# Explore image_phash column

phash (perceptual hash) associates the same image from different posts.

In [None]:
# Number of training rows carrying each perceptual-hash value.
phash_counts = df_train['image_phash'].value_counts()
print(phash_counts)

# Histogram of those per-hash row counts.
fig, ax = plt.subplots(figsize=(10, 8))
ax.set(xlabel='N appearances of image_phash value', ylabel='Counts')
ax.hist(phash_counts, bins=20, range=(0, 20))

In [None]:
# For the first `n_vis_phash` entries of phash_counts, show up to `n_samples`
# training images that share that hash (one hash per row of the grid).
n_vis_phash = 3
n_samples = 3

# squeeze=False keeps `axes` two-dimensional even when either grid dimension is 1.
fig, axes = plt.subplots(n_vis_phash, n_samples, figsize=(n_samples*5, n_vis_phash*5), squeeze=False)
for i in range(n_vis_phash):
    sample_phash = phash_counts.index[i]
    df_sample = df_train[df_train.image_phash == sample_phash]

    for j in range(n_samples):
        ax = axes[i, j]
        # A hash may occur in fewer than n_samples rows; blank the unused
        # panel instead of indexing past the end of df_sample.
        if j >= len(df_sample):
            ax.axis('off')
            continue

        record = df_sample.iloc[j]
        filename = record.image
        title = record.title
        phash = record.image_phash

        path = f'{train_image_dir}/{filename}'
        image = cv2.imread(path)
        # cv2.imread reports failure by returning None instead of raising;
        # stop here with the offending path rather than on a subscript error.
        if image is None:
            raise FileNotFoundError(f'Could not read image: {path}')
        # OpenCV decodes to BGR channel order; matplotlib expects RGB.
        image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)
        image = cv2.resize(image, (256, 256))

        ax.imshow(image)
        ax.set_title(f'phash: {phash}')
        print(f'Title {i+1}:', title)

# Explore title column

- The title column is a short description of the product. Each title contains at most ~70 words (see the histogram below).
- Mostly English, but other languages and proper nouns are also included.

In [None]:
# Print the first 20 titles to get a feel for the text.
print(df_train['title'].head(20))

def count_words(sentence):
    """Return the number of whitespace-separated words in `sentence`.

    Uses ``str.split()`` with no separator so that runs of consecutive
    whitespace (double spaces, tabs, leading or trailing blanks) are not
    counted as empty "words". An empty or all-whitespace string yields 0.
    """
    return len(sentence.split())

# Attach a per-title word count, then plot how those counts are distributed.
df_train['n_words'] = df_train['title'].map(count_words)

fig, ax = plt.subplots(figsize=(10, 8))
ax.set(xlabel='N words of title', ylabel='Counts')
ax.hist(df_train['n_words'], bins=20, range=(0, 40))

The maximum number of words is below 70. Recent deep NLP models can easily handle sentences of this length (e.g. BERT commonly accepts inputs of up to 512 tokens).

In [None]:
# The ten largest word counts, each with the number of titles that have it.
(
    df_train['n_words']
    .value_counts()
    .sort_index(ascending=False)
    .head(10)
)

# Explore label_group column

label_group connects posts of the same product (possibly with different images). Because our goal is to predict which posts show the same product, label_group seems to be a key variable in this competition.

When predicting label_group from the other variables, the number of samples in each label_group value must be taken into account.
Some label_group values (e.g. 854939659 below) appear only twice. Splitting the train set into train/val subsets can then leave such a group with (1) one sample in the train subset and one in the val subset, or (2) both samples on the same side. This situation requires a careful split and evaluation strategy.

Additionally, this can be viewed as a ***many classes, few samples per class*** situation. I think metric learning is a possible approach for this, as in https://www.kaggle.com/c/humpback-whale-identification

In [None]:
# Number of training rows in every label_group, plus how many distinct groups exist.
label_counts = df_train['label_group'].value_counts()
print(label_counts)
print('Total:', len(label_counts))

# Histogram of those per-group row counts.
fig, ax = plt.subplots(figsize=(10, 8))
ax.set(xlabel='N appearances of label_group', ylabel='Counts')
ax.hist(label_counts, bins=20, range=(0, 20))

In [None]:
# For the first `n_vis_phash` entries of label_counts, show up to `n_samples`
# training images belonging to that label_group (one group per row of the grid).
# NOTE: despite its name, n_vis_phash counts label groups in this cell.
n_vis_phash = 5
n_samples = 4

# squeeze=False keeps `axes` two-dimensional even when either grid dimension is 1.
fig, axes = plt.subplots(n_vis_phash, n_samples, figsize=(n_samples*5, n_vis_phash*5), constrained_layout=True, squeeze=False)
fig.suptitle('Different image but same product', fontsize=16)
for i in range(n_vis_phash):
    sample_label = label_counts.index[i]
    df_sample = df_train[df_train.label_group == sample_label]

    for j in range(n_samples):
        ax = axes[i, j]
        # A group may have fewer than n_samples rows; blank the unused panel
        # instead of indexing past the end of df_sample.
        if j >= len(df_sample):
            ax.axis('off')
            continue

        record = df_sample.iloc[j]
        filename = record.image
        title = record.title
        phash = record.image_phash

        path = f'{train_image_dir}/{filename}'
        image = cv2.imread(path)
        # cv2.imread reports failure by returning None instead of raising;
        # stop here with the offending path rather than on a subscript error.
        if image is None:
            raise FileNotFoundError(f'Could not read image: {path}')
        # OpenCV decodes to BGR channel order; matplotlib expects RGB.
        image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)
        image = cv2.resize(image, (256, 256))

        ax.imshow(image)
        ax.set_title(f'phash: {phash}')
        print(f'Title {i+1}:', title)

I hope this notebook may help you ;)