In [1]:
import random
import glob
import os
import sys
import json
import math
import numpy as np
import pandas as pd
import sklearn
from sklearn.neighbors import NearestNeighbors
from pathlib import Path
import lightgbm as lgb
from typing import Iterable, Dict, Set, List
from tqdm import tqdm
import tensorflow as tf
from tensorflow import keras

In [2]:
pd.set_option("use_inf_as_na", True)
pd.set_option("display.max_columns", 9999)
pd.set_option("display.max_rows", 9999)
pd.set_option('max_colwidth', 400)
INPUT = '/kaggle/input'
DATA = f'{INPUT}/shopee-product-matching'
OUTPUT = '/kaggle/temp'
RESOURCE_DIR = f'{INPUT}/shopee-product-matching-lib/kaggle-shopee-product-matching-1.0'
#LGB_MODEL_DIR = f'{RESOURCE_DIR}/models/lgb/20210220_213935'
#LGB_MODEL_DIR = f'{RESOURCE_DIR}/models/lgb/20210220_130330'
#MLP_MODEL_DIR = f'{RESOURCE_DIR}/models/mlp_20210222_221918'
#FEATURES_DIR = f'{RESOURCE_DIR}/features'
sys.path.append(f'{INPUT}/sgcharts-ml/src')
sys.path.append(f"{INPUT}/sentence-transformers/sentence-transformers-1.0.4")
sys.path.append(f'{RESOURCE_DIR}/src')

In [3]:
from sentence_transformers import SentenceTransformer
import mylib
import scml
scml.seed_everything()
test = pd.read_csv(f"{DATA}/test.csv", engine="c", low_memory=False)

# phash embedding

In [4]:
test["phash_matches"] = mylib.phash_matches(test, threshold=0.35)

# sentence embedding

In [5]:
train = pd.read_csv(f"{DATA}/train.csv", engine="c", low_memory=False)
train["target"] = mylib.target_label(train)
sentences = train["title"].to_numpy()

In [6]:
model = SentenceTransformer(f"{RESOURCE_DIR}/pretrained/sentence-transformers/stsb-distilbert-base")
model.max_seq_length = 256
em = model.encode(sentences, show_progress_bar=True, convert_to_numpy=True, normalize_embeddings=True)
print(f"em.shape={em.shape}")

Batches:   0%|          | 0/1071 [00:00<?, ?it/s]

em.shape=(34250, 768)


In [7]:
%%time
d = sklearn.metrics.pairwise_distances(em, metric="cosine")

CPU times: user 29.3 s, sys: 4.43 s, total: 33.7 s
Wall time: 12.9 s


In [8]:
def combine_as_list(cols) -> List[str]:
    def fn(row):
        s: Set[str] = set()
        s.add(row["posting_id"])
        for col in cols:
            s |= set(row[col])
        return list(s)
    return fn


def combine_as_string(cols) -> List[str]:
    def fn(row):
        s: Set[str] = set()
        s.add(row["posting_id"])
        for col in cols:
            s |= set(row[col])
        return " ".join(s)
    return fn

In [9]:
%%time
knn = NearestNeighbors(n_neighbors=50, metric="precomputed")
knn.fit(d)
distances, indices = knn.kneighbors()
res: List[List[str]] = [[] for _ in range(len(indices))]
for i in range(len(indices)):
    for j in range(len(indices[0])):
        if distances[i][j] > 0.2:
            break
        res[i].append(train.iloc[indices[i][j]]["posting_id"])

CPU times: user 32.2 s, sys: 4.32 s, total: 36.5 s
Wall time: 36.5 s


In [10]:
train["phash_matches"] = mylib.phash_matches(train, threshold=0.35)

In [11]:
train["sbert_matches"] = res
cols = ["phash_matches", "sbert_matches"]
train["matches"] = train.apply(combine_as_list(cols), axis=1)
cols = ["target", "matches", "phash_matches", "sbert_matches"]
train[cols].head(20)

Unnamed: 0,target,matches,phash_matches,sbert_matches
0,"[train_129225211, train_2278313361]","[train_129225211, train_2278313361]",[],[train_2278313361]
1,"[train_3386243561, train_3423213080]",[train_3386243561],[],[]
2,"[train_2288590299, train_3803689425]","[train_3803689425, train_2288590299]",[],[train_3803689425]
3,"[train_2406599165, train_3342059966]","[train_1508100548, train_2043094887, train_2406599165, train_3576714541, train_1744956981, train_1593362411, train_3722433776, train_3526771004]",[],"[train_1744956981, train_3576714541, train_3526771004, train_1508100548, train_2043094887, train_1593362411, train_3722433776]"
4,"[train_3369186413, train_921438619]",[train_3369186413],[],[]
5,"[train_2464356923, train_2753295474, train_305884580]","[train_2464356923, train_2753295474]",[train_2753295474],[]
6,"[train_1802986387, train_1396161074, train_713073906, train_1275191373, train_2490201622, train_2411544001, train_1859060005]","[train_1802986387, train_409855776, train_1396161074, train_2490201622]",[],"[train_1396161074, train_2490201622, train_409855776]"
7,"[train_1806152124, train_3227306976]",[train_1806152124],[],[]
8,"[train_86570404, train_2837452969, train_77364776]","[train_86570404, train_115157077]",[],[train_115157077]
9,"[train_831680791, train_3031035861]",[train_831680791],[],[]


In [12]:
train["f1"] = train.apply(mylib.metric_per_row("matches"), axis=1)
print(f"Combined score={train.f1.mean():.3f}")

Combined score=0.674


In [13]:
sentences = test["title"].to_numpy()
em = model.encode(sentences, show_progress_bar=True, convert_to_numpy=True, normalize_embeddings=True)
d = sklearn.metrics.pairwise_distances(em, metric="cosine")
knn = NearestNeighbors(n_neighbors=min(50, len(test) - 1), metric="precomputed")
knn.fit(d)
distances, indices = knn.kneighbors()
res: List[List[str]] = [[] for _ in range(len(indices))]
for i in range(len(indices)):
    for j in range(len(indices[0])):
        if distances[i][j] > 0.2:
            break
        res[i].append(test.iloc[indices[i][j]]["posting_id"])
test["sbert_matches"] = res
cols = ["phash_matches", "sbert_matches"]
test["matches"] = test.apply(combine_as_string(cols), axis=1)

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

# Submission

In [14]:
#test["matches"] = test.apply(mylib.combine_as_string, axis=1)
sub = test[["posting_id", "matches"]]
sub.head()

Unnamed: 0,posting_id,matches
0,test_2255846744,test_2255846744
1,test_3588702337,test_3588702337
2,test_4015706929,test_4015706929


In [15]:
sub.to_csv("submission.csv", index = False)

# Debug

In [16]:
!pip list

Package                        Version             Location
------------------------------ ------------------- --------------
absl-py                        0.12.0
adal                           1.2.6
affine                         2.3.0
aiobotocore                    1.2.2
aiohttp                        3.7.3
aiohttp-cors                   0.7.0
aioitertools                   0.7.1
aioredis                       1.3.1
albumentations                 0.5.2
alembic                        1.5.8
allennlp                       2.2.0
altair                         4.1.0
annoy                          1.17.0
ansiwrap                       0.8.4
appdirs                        1.4.4
argon2-cffi                    20.1.0
arrow                          0.17.0
arviz                          0.11.2
asn1crypto                     1.4.0
astropy                        4.2
astunparse                     1.6.3
async-generator                1.10
async-timeout                  3.0