In [None]:
import random
import glob
import os
import sys
import json
import math
import configparser
import numpy as np
import pandas as pd
import sklearn
from sklearn.neighbors import NearestNeighbors
from pathlib import Path
import lightgbm as lgb
from typing import Iterable, Dict, Set, List, Optional
from tqdm import tqdm
import tensorflow as tf
from tensorflow import keras
import cv2 as cv
import pytesseract

In [None]:
# Filesystem layout. Every path below is derived from INPUT.
INPUT = '/kaggle/input'
# Competition dataset: test.csv and the test_images/ directory are read from here.
DATA = f'{INPUT}/shopee-product-matching'
# NOTE(review): OUTPUT is not referenced by any cell visible in this notebook.
OUTPUT = '/kaggle/temp'
# Root of the attached resource bundle: app.ini, models/, pretrained/ and src/
# are all read relative to this directory.
RESOURCE_DIR = f'{INPUT}/shopee-product-matching-lib/kaggle-shopee-product-matching-1.0'
# Disabled model/feature locations, kept for reference only.
#LGB_MODEL_DIR = f'{RESOURCE_DIR}/models/lgb/20210220_213935'
#LGB_MODEL_DIR = f'{RESOURCE_DIR}/models/lgb/20210220_130330'
#MLP_MODEL_DIR = f'{RESOURCE_DIR}/models/mlp_20210222_221918'
#FEATURES_DIR = f'{RESOURCE_DIR}/features'
# Extend the import search path before the next cell imports sentence_transformers,
# mylib and scml — presumably these directories hold those packages; confirm.
sys.path.append(f'{INPUT}/sgcharts-ml/src')
sys.path.append(f"{INPUT}/sentence-transformers/sentence-transformers-1.0.4")
sys.path.append(f'{RESOURCE_DIR}/src')

In [None]:
# These imports come after the sys.path.append(...) calls of the previous cell,
# presumably because the packages live in the attached input directories — confirm.
from sentence_transformers import SentenceTransformer
import mylib
import scml
from scml.nlp import strip_punctuation, to_ascii_str
# Called without arguments, so whatever seed scml uses by default applies.
# NOTE(review): presumably seeds the random number generators for reproducibility — confirm in scml.
scml.seed_everything()

In [None]:
# Feature switches. Each flag gates one matcher cell below and decides whether
# that matcher's "*_matches" column is handed to the post-processing cell.
# IMAGE -> "image_matches" (CNN embedding + nearest neighbours)
IMAGE = True
# TITLE -> "title_matches" (sentence embedding of the preprocessed title)
TITLE = True
# PHASH -> "phash_matches"
PHASH = True
# OCR -> "text_matches" (title plus text read from the image); currently disabled.
OCR = False
# Name of the app.ini section from which the input resolution is read.
MODEL = 'efficientnetb3'
# Pandas behaviour and display options for the whole notebook.
pd.set_option("use_inf_as_na", True)
pd.set_option("display.max_columns", 9999)
pd.set_option("display.max_rows", 9999)
pd.set_option('max_colwidth', 9999)
#os.environ["OMP_THREAD_LIMIT"] = "1"
# Read the project configuration shipped with the resource bundle.
CONF = configparser.ConfigParser()
CONF.read(f"{RESOURCE_DIR}/app.ini")
# Side length in pixels used as target_size when images are loaded for the model.
resolution = int(CONF[MODEL]["resolution"])
print(f"resolution={resolution}")

In [None]:
# Load the test set from the competition dataset.
test = pd.read_csv(f"{DATA}/test.csv", engine="c", low_memory=False)
# Posting ids in row order: the image matcher indexes this list by row position
# to turn neighbour indices into ids, and it is also passed to the SBERT matcher.
posting_ids = test["posting_id"].tolist()
test.head()

In [None]:
if IMAGE:
    # Load the saved Keras model and rewire it into an embedding extractor: only the
    # first of the saved model's inputs is kept, and the output is taken from the
    # layer named "embedding_output".
    model_dir = f"{RESOURCE_DIR}/models/eb3_arc_20210510_2300"
    m0 = keras.models.load_model(f"{model_dir}/trial_0/model.h5")
    # NOTE(review): m0.input[0] implies the saved model has several inputs; presumably
    # the others are only needed for training (e.g. labels) — confirm.
    m0 = keras.models.Model(inputs=m0.input[0], outputs=m0.get_layer("embedding_output").output)
    m0.summary()

In [None]:
if IMAGE:
    # Generator that only rescales pixel values by 1/255; no augmentation is configured.
    idg = keras.preprocessing.image.ImageDataGenerator(
        rescale=1./255,
        data_format="channels_last",
        dtype=np.float32
    )
    # Stream the test images listed in the "image" column, resized to the
    # resolution read from app.ini. shuffle=False keeps the batches in dataframe
    # order, which the nearest-neighbour cell relies on when it maps row indices
    # back to posting_ids.
    # NOTE(review): if the generator skips any row (e.g. a file name it cannot
    # validate), em would no longer line up with posting_ids — confirm this cannot happen.
    data = idg.flow_from_dataframe(
        dataframe=test,
        x_col="image",
        y_col="posting_id",  # y_col not needed for inference
        directory=f"{DATA}/test_images",
        target_size=(resolution, resolution),
        color_mode="rgb",
        batch_size=400,
        shuffle=False,
        class_mode="raw",
        interpolation="bicubic",
    )
    # One embedding row per image, produced by the truncated model m0.
    y0 = m0.predict(data, verbose=1)
    # Cast to float32 before the embeddings are fed to the neighbour search.
    em = y0.astype(np.float32)
    print(f"em.shape={em.shape}")

In [None]:
if IMAGE:
    # Largest euclidean distance between two embeddings that still counts as a match.
    threshold = 1e-4
    # Ask for at most 49 neighbours, fewer when the test set is smaller than that.
    nn = NearestNeighbors(
        n_neighbors=min(49, len(posting_ids) - 1), metric="euclidean"
    )
    nn.fit(em)
    # Called without a query: neighbours are looked up for the fitted rows themselves.
    distances, indices = nn.kneighbors()
    res: List[List[str]] = []
    for row_distances, row_indices in zip(distances, indices):
        matched: List[str] = []
        for dist, idx in zip(row_distances, row_indices):
            # Stop at the first neighbour beyond the threshold; this relies on each
            # row of distances being in ascending order.
            if dist > threshold:
                break
            matched.append(posting_ids[idx])
        res.append(matched)
    test["image_matches"] = res

# pHash (perceptual hash) matching

In [None]:
if PHASH:
    # Matching on perceptual hashes is delegated to mylib; the per-row result is
    # stored in "phash_matches".
    # NOTE(review): the meaning of threshold=0.3 is defined inside mylib.phash_matches — confirm there.
    test["phash_matches"] = mylib.phash_matches(test, threshold=0.3)

# SBERT sentence embedding

In [None]:
# required for post-processing
# Runs regardless of the TITLE flag: "title_p" feeds both the SBERT title matcher
# and the post-processing cell. mylib.preprocess("title") returns the function
# that is applied to every row.
test["title_p"] = test.apply(mylib.preprocess("title"), axis=1)

In [None]:
if TITLE:
    # Name of the pretrained sentence-transformers model directory inside the
    # resource bundle; the commented line is a disabled alternative.
    st_name = "stsb-distilbert-base"
    #st_name = "paraphrase-xlm-r-multilingual-v1"
    # Matching on the preprocessed titles is delegated to mylib; the per-row result
    # is stored in "title_matches".
    # NOTE(review): the meaning of threshold=0.5 is defined inside mylib.sbert_matches — confirm there.
    test["title_matches"] = mylib.sbert_matches(
        model_path=f"{RESOURCE_DIR}/pretrained/sentence-transformers/{st_name}",
        sentences=test["title_p"].tolist(),
        posting_ids=posting_ids,
        threshold=0.5
    )

# OCR Image to Text

In [None]:
def erode_dilate(img):
    """Erode and then dilate ``img``, one iteration each, with a 2x2 all-ones uint8 kernel.

    Returns the result of the dilation.
    """
    structuring_element = np.ones((2, 2), np.uint8)
    eroded = cv.erode(img, structuring_element, iterations=1)
    return cv.dilate(eroded, structuring_element, iterations=1)


def image_to_text(img_path, mode: str, timeout: float, neighbours: int=41, psm: int=3) -> Optional[str]:
    """Read the text printed on an image with Tesseract.

    The image is loaded as grayscale, median-blurred, binarised with an adaptive
    Gaussian threshold and cleaned with ``erode_dilate`` before each OCR pass.

    Args:
        img_path: Path handed to ``cv.imread``.
        mode: ``"binary"`` runs one pass on the thresholded image, ``"inverted"``
            one pass on the inverse-thresholded image, ``"binary_inverted"`` runs
            both (binary first). Any other value runs no pass.
        timeout: Forwarded to ``pytesseract.image_to_string`` for each pass.
        neighbours: Block size argument of ``cv.adaptiveThreshold``.
        psm: Tesseract page segmentation mode, passed as ``--psm``.

    Returns:
        The recognised tokens of all successful passes joined by single spaces
        (may be an empty string), or ``None`` when the image cannot be read or
        no pass produced a result.
    """
    img = cv.imread(img_path, cv.IMREAD_GRAYSCALE)
    if img is None:
        # cv.imread reports a missing or unreadable file by returning None instead
        # of raising; treat it like any other "no text available" outcome.
        return None
    #img = cv.resize(img, None, fx=0.5, fy=0.5, interpolation=cv.INTER_AREA)
    img = cv.medianBlur(img, 3)
    config = f"--psm {psm}"

    threshold_types = []
    if mode in ("binary", "binary_inverted"):
        threshold_types.append(cv.THRESH_BINARY)
    if mode in ("inverted", "binary_inverted"):
        threshold_types.append(cv.THRESH_BINARY_INV)

    texts = []
    for threshold_type in threshold_types:
        th = cv.adaptiveThreshold(img, 255, cv.ADAPTIVE_THRESH_GAUSSIAN_C, threshold_type, neighbours, 2)
        th = erode_dilate(th)
        try:
            text = pytesseract.image_to_string(th, timeout=timeout, config=config)
        except Exception:
            # Best effort: a timed-out or failed pass is skipped. Exception (not a
            # bare except) so that KeyboardInterrupt/SystemExit still propagate.
            continue
        if text is not None:
            texts.append(text)

    if not texts:
        return None
    tokens = []
    for text in texts:
        tokens += strip_punctuation(to_ascii_str(text)).split()
    return " ".join(tokens)

In [None]:
if OCR:
    # No visible cell creates an "image_path" column, while the image cell resolves
    # files as <DATA>/test_images/<image>. Use "image_path" when it exists and fall
    # back to that layout otherwise, so enabling OCR does not fail on a missing column.
    if "image_path" in test.columns:
        img_paths = test["image_path"].tolist()
    else:
        img_paths = [f"{DATA}/test_images/{name}" for name in test["image"]]
    # Text read from each image; an image without OCR output contributes "".
    itexts = []
    for img_path in img_paths:
        s = image_to_text(img_path, mode="inverted", timeout=0.4, neighbours=41, psm=11)
        itexts.append("" if s is None else s)
    test["itext"] = itexts
    # Title and image text are matched together as one sentence.
    test["text"] = test["title"] + " " + test["itext"]
    st_name = "stsb-distilbert-base"
    #st_name = "paraphrase-xlm-r-multilingual-v1"
    test["text_p"] = test.apply(mylib.preprocess("text"), axis=1)
    test["text_matches"] = mylib.sbert_matches(
        model_path=f"{RESOURCE_DIR}/pretrained/sentence-transformers/{st_name}",
        sentences=test["text_p"].tolist(),
        posting_ids=posting_ids,
        threshold=0.5
    )

# Post-processing

In [None]:
# Result of mylib.extract on each preprocessed title, keyed by posting id.
imap = {
    row.posting_id: mylib.extract(row.title_p)
    for row in test.itertuples()
}
# Match columns to merge, in a fixed order; a column is listed only when its
# feature switch is on.
fs = [
    column
    for enabled, column in (
        (IMAGE, "image_matches"),
        (TITLE, "title_matches"),
        (PHASH, "phash_matches"),
        (OCR, "text_matches"),
    )
    if enabled
]
# mylib.combine_as_string returns the row-wise function that produces the final
# "matches" value from the listed columns and imap.
combine_row = mylib.combine_as_string(
    fs,
    imap=imap,
    brand_threshold=0.3,
    measurement_threshold=0.3
)
test["matches"] = test.apply(combine_row, axis=1)

# Submission

In [None]:
# Keep only the two columns written to the submission file.
sub = test[["posting_id", "matches"]]
sub.head()

In [None]:
# Write the submission to the working directory, without the dataframe index.
sub.to_csv("submission.csv", index = False)

# Debug

In [None]:
#!pip list