In [1]:
import random
import glob
import os
import sys
import json
import math
import numpy as np
import pandas as pd
from sklearn.neighbors import NearestNeighbors
from pathlib import Path
import lightgbm as lgb
from typing import Iterable, Dict, Set, List
from tqdm import tqdm
import tensorflow as tf
from tensorflow import keras

In [2]:
# --- Matching threshold -----------------------------------------------------
# Upper bound on the neighbour distance accepted as a phash match
# (compared against the kneighbors distances further down).
PHASH_THRESHOLD = 0.3

# --- Filesystem locations ---------------------------------------------------
INPUT = "/kaggle/input"
OUTPUT = "/kaggle/temp"
DATA = INPUT + "/shopee-product-matching"
RESOURCE_DIR = INPUT + "/shopee-product-matching-lib/kaggle-shopee-product-matching-1.0"
# Alternative resource locations, currently disabled:
#LGB_MODEL_DIR = f'{RESOURCE_DIR}/models/lgb/20210220_213935'
#LGB_MODEL_DIR = f'{RESOURCE_DIR}/models/lgb/20210220_130330'
#MLP_MODEL_DIR = f'{RESOURCE_DIR}/models/mlp_20210222_221918'
#FEATURES_DIR = f'{RESOURCE_DIR}/features'

# --- pandas behaviour and display -------------------------------------------
for option, value in (
    ("use_inf_as_na", True),
    ("display.max_columns", 9999),
    ("display.max_rows", 9999),
):
    pd.set_option(option, value)

# --- Extra import roots (appended in this order) ----------------------------
sys.path.extend([INPUT + "/sgcharts-ml/src", RESOURCE_DIR + "/src"])

In [3]:
# Project-local modules. They are imported in their own cell, presumably
# because they only become importable once the sys.path entries added in the
# configuration cell are in place — TODO confirm.
import mylib
import scml
# NOTE(review): by its name this fixes the random seeds for reproducibility;
# which libraries it seeds is not visible here — verify in scml.
scml.seed_everything()

In [4]:
# Read the training postings and add a per-row "target" column.
# NOTE(review): mylib.target_label is opaque here; the preview below suggests
# it yields the list of posting_ids expected to match each row — confirm.
train_csv = f"{DATA}/train.csv"
train = pd.read_csv(train_csv, engine="c", low_memory=False)
train["target"] = mylib.target_label(train)
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 34250 entries, 0 to 34249
Data columns (total 6 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   posting_id   34250 non-null  object
 1   image        34250 non-null  object
 2   image_phash  34250 non-null  object
 3   title        34250 non-null  object
 4   label_group  34250 non-null  int64 
 5   target       34250 non-null  object
dtypes: int64(1), object(5)
memory usage: 1.6+ MB


In [5]:
# Preview the first five training rows, including the new "target" column.
train.head()

Unnamed: 0,posting_id,image,image_phash,title,label_group,target
0,train_129225211,0000a68812bc7e98c42888dfb1c07da0.jpg,94974f937d4c2433,Paper Bag Victoria Secret,249114794,"[train_129225211, train_2278313361]"
1,train_3386243561,00039780dfc94d01db8676fe789ecd05.jpg,af3f9460c2838f0f,"Double Tape 3M VHB 12 mm x 4,5 m ORIGINAL / DO...",2937985045,"[train_3386243561, train_3423213080]"
2,train_2288590299,000a190fdd715a2a36faed16e2c65df7.jpg,b94cb00ed3e50f78,Maling TTS Canned Pork Luncheon Meat 397 gr,2395904891,"[train_2288590299, train_3803689425]"
3,train_2406599165,00117e4fc239b1b641ff08340b429633.jpg,8514fc58eafea283,Daster Batik Lengan pendek - Motif Acak / Camp...,4093212188,"[train_2406599165, train_3342059966]"
4,train_3369186413,00136d1cf4edede0203f32f05f660588.jpg,a6f319f924ad708c,Nescafe \xc3\x89clair Latte 220ml,3648931069,"[train_3369186413, train_921438619]"


In [6]:
# Expand every image_phash (a 16-character hex string) into 16 integer
# columns ph0..ph15, one per hex digit, so the hashes can be compared
# position by position.
rows = []
for record in tqdm(train.itertuples()):
    phash = record.image_phash
    if len(phash) != 16:
        raise ValueError(f"expected len(ph) is 16 but found {len(phash)}")
    rows.append({f"ph{pos}": int(digit, 16) for pos, digit in enumerate(phash)})
# A hex digit is 0..15, so int8 is wide enough.
df = pd.DataFrame.from_records(rows).astype(np.int8)
df.info()

34250it [00:00, 64206.76it/s]


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 34250 entries, 0 to 34249
Data columns (total 16 columns):
 #   Column  Non-Null Count  Dtype
---  ------  --------------  -----
 0   ph0     34250 non-null  int8 
 1   ph1     34250 non-null  int8 
 2   ph2     34250 non-null  int8 
 3   ph3     34250 non-null  int8 
 4   ph4     34250 non-null  int8 
 5   ph5     34250 non-null  int8 
 6   ph6     34250 non-null  int8 
 7   ph7     34250 non-null  int8 
 8   ph8     34250 non-null  int8 
 9   ph9     34250 non-null  int8 
 10  ph10    34250 non-null  int8 
 11  ph11    34250 non-null  int8 
 12  ph12    34250 non-null  int8 
 13  ph13    34250 non-null  int8 
 14  ph14    34250 non-null  int8 
 15  ph15    34250 non-null  int8 
dtypes: int8(16)
memory usage: 535.3 KB


In [7]:
# Preview the per-digit phash feature columns (ph0..ph15).
df.head()

Unnamed: 0,ph0,ph1,ph2,ph3,ph4,ph5,ph6,ph7,ph8,ph9,ph10,ph11,ph12,ph13,ph14,ph15
0,9,4,9,7,4,15,9,3,7,13,4,12,2,4,3,3
1,10,15,3,15,9,4,6,0,12,2,8,3,8,15,0,15
2,11,9,4,12,11,0,0,14,13,3,14,5,0,15,7,8
3,8,5,1,4,15,12,5,8,14,10,15,14,10,2,8,3
4,10,6,15,3,1,9,15,9,2,4,10,13,7,0,8,12


In [8]:
%%time
model = NearestNeighbors(n_neighbors=49, metric="hamming")
model.fit(df)
distances, indices = model.kneighbors()

CPU times: user 31.8 s, sys: 3.63 s, total: 35.5 s
Wall time: 35.5 s


In [9]:
# For every training row, keep the neighbours whose distance is within
# PHASH_THRESHOLD and record their posting_ids in "phash_matches".
# kneighbors() returns each row's neighbours in ascending distance order, so
# scanning can stop at the first neighbour beyond the threshold.
#
# The posting_id column is pulled out once as an array: indices are
# positional, and looking each one up with train.iloc[k]["posting_id"] would
# build a whole row Series per neighbour.
posting_ids = train["posting_id"].to_numpy()
ps: List[List[str]] = []
for i in tqdm(range(len(indices))):
    matched: List[str] = []
    for dist, idx in zip(distances[i], indices[i]):
        if dist > PHASH_THRESHOLD:
            break
        matched.append(posting_ids[idx])
    ps.append(matched)
train["phash_matches"] = ps

100%|██████████| 34250/34250 [00:05<00:00, 6092.89it/s]


In [10]:
# Preview the training rows with the new "phash_matches" column.
train.head()

Unnamed: 0,posting_id,image,image_phash,title,label_group,target,phash_matches
0,train_129225211,0000a68812bc7e98c42888dfb1c07da0.jpg,94974f937d4c2433,Paper Bag Victoria Secret,249114794,"[train_129225211, train_2278313361]",[]
1,train_3386243561,00039780dfc94d01db8676fe789ecd05.jpg,af3f9460c2838f0f,"Double Tape 3M VHB 12 mm x 4,5 m ORIGINAL / DO...",2937985045,"[train_3386243561, train_3423213080]",[]
2,train_2288590299,000a190fdd715a2a36faed16e2c65df7.jpg,b94cb00ed3e50f78,Maling TTS Canned Pork Luncheon Meat 397 gr,2395904891,"[train_2288590299, train_3803689425]",[]
3,train_2406599165,00117e4fc239b1b641ff08340b429633.jpg,8514fc58eafea283,Daster Batik Lengan pendek - Motif Acak / Camp...,4093212188,"[train_2406599165, train_3342059966]",[]
4,train_3369186413,00136d1cf4edede0203f32f05f660588.jpg,a6f319f924ad708c,Nescafe \xc3\x89clair Latte 220ml,3648931069,"[train_3369186413, train_921438619]",[]


In [11]:
def combine_as_list(row) -> List[str]:
    """Return the row's own posting_id merged with its phash matches.

    Args:
        row: Mapping-like row (e.g. a DataFrame row) providing
            "posting_id" (a string) and "phash_matches" (an iterable of
            posting_id strings).

    Returns:
        The distinct posting_ids as a list in ascending order. Sorting makes
        the result reproducible: iterating a set of strings directly yields
        an order that can change from one interpreter run to the next.
    """
    merged = {row["posting_id"], *row["phash_matches"]}
    return sorted(merged)

def combine_as_string(row) -> str:
    """Return the row's own posting_id and its phash matches as one string.

    Args:
        row: Mapping-like row (e.g. a DataFrame row) providing
            "posting_id" (a string) and "phash_matches" (an iterable of
            posting_id strings).

    Returns:
        The distinct posting_ids joined by single spaces, in ascending
        order. Sorting keeps the generated text identical across runs:
        iterating a set of strings directly yields an order that can change
        from one interpreter run to the next.
    """
    merged = {row["posting_id"], *row["phash_matches"]}
    return " ".join(sorted(merged))

In [12]:
%%time
train["matches"] = train.apply(combine_as_list, axis=1)
train["f1"] = train.apply(mylib.metric_per_row("matches"), axis=1)
print(f"Final score={train.f1.mean():.3f}")

Final score=0.586
CPU times: user 3.24 s, sys: 24.7 ms, total: 3.26 s
Wall time: 3.26 s


In [13]:
# Preview the training rows with the new "matches" and "f1" columns.
train.head()

Unnamed: 0,posting_id,image,image_phash,title,label_group,target,phash_matches,matches,f1
0,train_129225211,0000a68812bc7e98c42888dfb1c07da0.jpg,94974f937d4c2433,Paper Bag Victoria Secret,249114794,"[train_129225211, train_2278313361]",[],[train_129225211],0.666667
1,train_3386243561,00039780dfc94d01db8676fe789ecd05.jpg,af3f9460c2838f0f,"Double Tape 3M VHB 12 mm x 4,5 m ORIGINAL / DO...",2937985045,"[train_3386243561, train_3423213080]",[],[train_3386243561],0.666667
2,train_2288590299,000a190fdd715a2a36faed16e2c65df7.jpg,b94cb00ed3e50f78,Maling TTS Canned Pork Luncheon Meat 397 gr,2395904891,"[train_2288590299, train_3803689425]",[],[train_2288590299],0.666667
3,train_2406599165,00117e4fc239b1b641ff08340b429633.jpg,8514fc58eafea283,Daster Batik Lengan pendek - Motif Acak / Camp...,4093212188,"[train_2406599165, train_3342059966]",[],[train_2406599165],0.666667
4,train_3369186413,00136d1cf4edede0203f32f05f660588.jpg,a6f319f924ad708c,Nescafe \xc3\x89clair Latte 220ml,3648931069,"[train_3369186413, train_921438619]",[],[train_3369186413],0.666667


In [14]:
# Read the test postings with the same parser settings used for train.csv.
test_csv = f"{DATA}/test.csv"
test = pd.read_csv(test_csv, engine="c", low_memory=False)
test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3 entries, 0 to 2
Data columns (total 4 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   posting_id   3 non-null      object
 1   image        3 non-null      object
 2   image_phash  3 non-null      object
 3   title        3 non-null      object
dtypes: object(4)
memory usage: 224.0+ bytes


In [15]:
# Repeat the phash matching on the test set and build the submission strings.

# 1) Expand every 16-character hex image_phash into 16 integer columns
#    ph0..ph15, one per hex digit.
rows = []
for t in test.itertuples():
    ph = t.image_phash
    if len(ph) != 16:
        raise ValueError(f"expected len(ph) is 16 but found {len(ph)}")
    rows.append({f"ph{i}": int(digit, 16) for i, digit in enumerate(ph)})
df = pd.DataFrame.from_records(rows)
df = df.astype(np.int8)

# 2) Nearest-neighbour search over the digit columns. kneighbors() without
#    arguments never returns a row as its own neighbour, so at most
#    len(test) - 1 neighbours exist. With a single test row that is 0, which
#    NearestNeighbors rejects, so the search is skipped and every row keeps
#    an empty match list.
n_neighbors = min(49, len(test) - 1)
ps: List[List[str]] = [[] for _ in range(len(test))]
if n_neighbors > 0:
    model = NearestNeighbors(n_neighbors=n_neighbors, metric="hamming")
    model.fit(df)
    distances, indices = model.kneighbors()
    # Positional lookup table; avoids building a row Series per neighbour.
    posting_ids = test["posting_id"].to_numpy()
    for i in range(len(indices)):
        # Neighbours arrive in ascending distance order, so stop at the
        # first one beyond the threshold.
        for dist, idx in zip(distances[i], indices[i]):
            if dist > PHASH_THRESHOLD:
                break
            ps[i].append(posting_ids[idx])

# 3) Attach the matches and the space-separated submission string.
test["phash_matches"] = ps
test["matches"] = test.apply(combine_as_string, axis=1)
test.head()

Unnamed: 0,posting_id,image,image_phash,title,phash_matches,matches
0,test_2255846744,0006c8e5462ae52167402bac1c2e916e.jpg,ecc292392dc7687a,Edufuntoys - CHARACTER PHONE ada lampu dan mus...,[],test_2255846744
1,test_3588702337,0007585c4d0f932859339129f709bfdc.jpg,e9968f60d2699e2c,(Beli 1 Free Spatula) Masker Komedo | Blackhea...,[],test_3588702337
2,test_4015706929,0008377d3662e83ef44e1881af38b879.jpg,ba81c17e3581cabe,READY Lemonilo Mie instant sehat kuah dan goreng,[],test_4015706929


# Submission

In [16]:
# Keep only the two columns that are written to the submission file.
sub = test.loc[:, ["posting_id", "matches"]]
sub.head()

Unnamed: 0,posting_id,matches
0,test_2255846744,test_2255846744
1,test_3588702337,test_3588702337
2,test_4015706929,test_4015706929


In [17]:
# Write the submission to the working directory without the index column.
sub.to_csv("submission.csv", index=False)

# Debug

In [18]:
# List the installed packages and their versions for debugging purposes.
!pip list

Package                        Version             Location
------------------------------ ------------------- --------------
absl-py                        0.12.0
adal                           1.2.6
affine                         2.3.0
aiobotocore                    1.2.2
aiohttp                        3.7.3
aiohttp-cors                   0.7.0
aioitertools                   0.7.1
aioredis                       1.3.1
albumentations                 0.5.2
alembic                        1.5.8
allennlp                       2.2.0
altair                         4.1.0
annoy                          1.17.0
ansiwrap                       0.8.4
appdirs                        1.4.4
argon2-cffi                    20.1.0
arrow                          0.17.0
arviz                          0.11.2
asn1crypto                     1.4.0
astropy                        4.2
astunparse                     1.6.3
async-generator                1.10
async-timeout                  3.0