In [1]:
import os
import math
import configparser
import numpy as np
import pandas as pd
from sklearn.neighbors import NearestNeighbors
import tensorflow as tf
from tensorflow import keras
import py
import mylib
import cv2 as cv
import pytesseract
from tqdm import tqdm
from typing import Optional, List, Dict, Set, Tuple
from scml.nlp import strip_punctuation, to_ascii_str

In [2]:
# Feature switches: each flag gates one matching stage further down and decides
# whether that stage's column is merged into the combined result.
IMAGE = True
TITLE = True
PHASH = True
OCR = False
# Name of the app.ini section that holds the image-model settings.
MODEL = 'efficientnetb3'
# NOTE(review): "use_inf_as_na" is deprecated in recent pandas releases — confirm the pinned version.
pd.set_option("use_inf_as_na", True)
# Raise the display limits so wide/long frames are shown without truncation.
pd.set_option("display.max_columns", 9999)
pd.set_option("display.max_rows", 9999)
pd.set_option('max_colwidth', 9999)
#os.environ["OMP_THREAD_LIMIT"] = "1"
CONF = configparser.ConfigParser()
# ConfigParser.read() skips files it cannot open and returns the list of files
# it actually parsed, so an empty result means app.ini was never loaded.
if not CONF.read("app.ini"):
    raise FileNotFoundError("app.ini not found or unreadable in the working directory")
resolution = int(CONF[MODEL]["resolution"])
print(f"resolution={resolution}")

resolution=300


In [3]:
# Read the training postings, then attach the derived columns in one step:
#   target     - produced by mylib.target_label from the raw frame
#   image_path - image file name prefixed with the training image directory
train = pd.read_csv("input/train.csv", low_memory=False, engine="c")
train = train.assign(
    target=mylib.target_label(train),
    image_path="input/train_images/" + train["image"],
)
# Posting ids in row order; later stages use them to translate row positions
# back into posting ids.
posting_ids = train["posting_id"].to_list()
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 34250 entries, 0 to 34249
Data columns (total 7 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   posting_id   34250 non-null  object
 1   image        34250 non-null  object
 2   image_phash  34250 non-null  object
 3   title        34250 non-null  object
 4   label_group  34250 non-null  int64 
 5   target       34250 non-null  object
 6   image_path   34250 non-null  object
dtypes: int64(1), object(6)
memory usage: 1.8+ MB


In [4]:
%%time
# The preprocessed title is needed later for post-processing (and for the
# title-matching stage), so it is computed regardless of the feature flags.
preprocess_title = mylib.preprocess("title")
train["title_p"] = train.apply(preprocess_title, axis="columns")

CPU times: user 33.9 s, sys: 281 ms, total: 34.2 s
Wall time: 34.2 s


In [5]:
# Map every posting id to whatever mylib.extract derives from its preprocessed
# title; the mapping is handed to mylib.combine_as_list in the result cell.
imap = {
    row.posting_id: mylib.extract(row.title_p)
    for row in tqdm(train.itertuples())
}

34250it [02:04, 275.74it/s]


# PHash
F1 by pHash match threshold: th=0.25 → 0.586 | th=0.30 → 0.586 | th=0.35 → 0.587 | th=0.40 → 0.583

In [6]:
%%time
if PHASH:
    # Per-row pHash matches computed by mylib at a threshold of 0.3.
    phash_result = mylib.phash_matches(train, threshold=0.3)
    train["phash_matches"] = phash_result

CPU times: user 56.6 s, sys: 51.1 s, total: 1min 47s
Wall time: 54.4 s


# Title

In [7]:
%%time
if TITLE:
    # Sentence-transformer checkpoint used for the title embeddings.
    # Alternatives that were tried:
    #   "paraphrase-distilroberta-base-v1"
    #   "paraphrase-xlm-r-multilingual-v1"
    st_name = "stsb-distilbert-base"
    title_sentences = train["title_p"].tolist()
    title_result = mylib.sbert_matches(
        threshold=0.5,
        posting_ids=posting_ids,
        sentences=title_sentences,
        model_path=f"pretrained/sentence-transformers/{st_name}",
    )
    train["title_matches"] = title_result

CPU times: user 21min, sys: 33.5 s, total: 21min 33s
Wall time: 4min 14s


# Image 

In [8]:
if IMAGE:
    model_dir = "models/eb3_arc_20210510_1800"
    # Load the trained model, then rebuild it as an embedding extractor that
    # maps the first model input to the "embedding_output" layer.
    m0 = keras.models.load_model(f"{model_dir}/trial_0/model.h5")
    image_input = m0.input[0]
    embedding_output = m0.get_layer("embedding_output").output
    m0 = keras.models.Model(inputs=image_input, outputs=embedding_output)
    m0.summary()

Model: "model"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
image_input (InputLayer)     [(None, 300, 300, 3)]     0         
_________________________________________________________________
efficientnetb3 (Functional)  (None, 1536)              10783535  
_________________________________________________________________
layer_normalization_1 (Layer (None, 1536)              3072      
_________________________________________________________________
dense_1 (Dense)              (None, 1536)              2360832   
_________________________________________________________________
embedding_output (LayerNorma (None, 1536)              3072      
Total params: 13,150,511
Trainable params: 2,366,976
Non-trainable params: 10,783,535
_________________________________________________________________


In [None]:
if IMAGE:
    # Generator that multiplies pixel values by 1/255 and yields float32,
    # channels-last batches.
    idg = keras.preprocessing.image.ImageDataGenerator(
        dtype=np.float32,
        data_format="channels_last",
        rescale=1.0 / 255,
    )
    # shuffle=False keeps the batches in dataframe row order, so row i of the
    # predictions lines up with row i of `train` / `posting_ids`.
    flow_options = dict(
        dataframe=train,
        directory="input/train_images",
        x_col="image",
        y_col="label_group",
        class_mode="raw",
        target_size=(resolution, resolution),
        color_mode="rgb",
        interpolation="nearest",
        batch_size=1024,
        shuffle=False,
    )
    data = idg.flow_from_dataframe(**flow_options)
    y0 = m0.predict(data, verbose=1)
    # Only m0 is used; the predictions of the further ensemble members
    # (m1..m4) are currently disabled.
    em = y0.astype(np.float32)
    print(f"em.shape={em.shape}")

Found 34250 validated image filenames.

In [None]:
#res = []
#for i in range(len(y0)):
    #a = np.vstack((y0[i], y1[i], y2[i], y3[i], y4[i]))
    #a = np.vstack((y0[i], y1[i]))
    #m = np.mean(a, axis=0)
    #res.append(m)
#em = np.array(res, dtype=np.float32)
#assert y0.shape == em.shape
#print(f"em.shape={em.shape}")

In [None]:
%%time
if IMAGE:
    # Maximum euclidean distance between embeddings for two images to match.
    threshold = 1e-4
    nn = NearestNeighbors(
        metric="euclidean",
        n_neighbors=min(49, len(posting_ids) - 1),
        n_jobs=-1,
    )
    nn.fit(em)
    # Called without a query, i.e. on the fitted embeddings themselves.
    distances, indices = nn.kneighbors()
    res: List[List[str]] = []
    for row_distances, row_indices in zip(distances, indices):
        row_matches: List[str] = []
        for distance, neighbour in zip(row_distances, row_indices):
            # Stop at the first neighbour beyond the threshold (assumes the
            # distances come back in ascending order — TODO confirm).
            if distance > threshold:
                break
            row_matches.append(posting_ids[neighbour])
        res.append(row_matches)
    train["image_matches"] = res

# OCR

In [None]:
def erode_dilate(img):
    """Erode and then dilate `img` once each with the same 2x2 kernel of ones.

    Returns the processed image; the input array is not reassigned in place
    by this function.
    """
    structuring_element = np.ones((2, 2), np.uint8)
    eroded = cv.erode(img, structuring_element, iterations=1)
    return cv.dilate(eroded, structuring_element, iterations=1)


def image_to_text(img_path, mode: str, timeout: float, neighbours: int=41, psm: int=3) -> Optional[str]:
    """OCR the image at `img_path` and return its cleaned, space-joined tokens.

    The image is read as grayscale, median-blurred, adaptively thresholded and
    passed through `erode_dilate` before being handed to Tesseract. Depending on
    `mode`, one or two thresholded variants are recognised; their tokens are
    concatenated, plain-binary variant first.

    Args:
        img_path: path of the image file to read.
        mode: "binary" (THRESH_BINARY only), "inverted" (THRESH_BINARY_INV
            only) or "binary_inverted" (both). Any other value runs no pass.
        timeout: timeout passed to each pytesseract call.
        neighbours: block size passed to cv.adaptiveThreshold.
        psm: Tesseract page segmentation mode, passed as "--psm {psm}".

    Returns:
        The recognised tokens joined by single spaces (possibly an empty
        string), or None when no pass produced a result: every attempted pass
        failed or timed out, or `mode` selected no pass.

    Raises:
        OSError: if the image cannot be read from `img_path`.
    """
    img = cv.imread(img_path, cv.IMREAD_GRAYSCALE)
    if img is None:
        # cv.imread signals failure by returning None instead of raising;
        # fail here with the offending path rather than deep inside OpenCV.
        raise OSError(f"could not read image: {img_path}")
    img = cv.medianBlur(img, 3)
    config = f"--psm {psm}"

    # Order matters: tokens of the plain binary pass precede the inverted pass.
    threshold_types = []
    if mode in ("binary_inverted", "binary"):
        threshold_types.append(cv.THRESH_BINARY)
    if mode in ("binary_inverted", "inverted"):
        threshold_types.append(cv.THRESH_BINARY_INV)

    tokens = []
    recognised = False
    for threshold_type in threshold_types:
        th = cv.adaptiveThreshold(img, 255, cv.ADAPTIVE_THRESH_GAUSSIAN_C, threshold_type, neighbours, 2)
        th = erode_dilate(th)
        try:
            raw = pytesseract.image_to_string(th, timeout=timeout, config=config)
        except Exception:
            # Best effort: a failed or timed-out pass contributes no text.
            # Unlike a bare `except:`, this lets KeyboardInterrupt/SystemExit
            # through so a long OCR loop can still be interrupted.
            continue
        if raw is None:
            continue
        recognised = True
        tokens += strip_punctuation(to_ascii_str(raw)).split()

    if not recognised:
        return None
    return " ".join(tokens)

In [None]:
if OCR:
    # Run OCR on every image; image_to_text returns None when it produced no
    # result for an image.
    ocr_raw = [
        image_to_text(row.image_path, mode="inverted", timeout=0.4, neighbours=41, psm=11)
        for row in tqdm(train.itertuples())
    ]
    # Count the images without a result, then store them as empty strings.
    n_timeout = sum(1 for text in ocr_raw if text is None)
    res = ["" if text is None else text for text in ocr_raw]
    print(f"n_timeout={n_timeout}")

In [None]:
if OCR:
    # `res` holds the per-row OCR strings produced by the OCR loop cell.
    train["itext"] = res
    # Title and OCR text joined by a single space.
    train["text"] = train["title"] + " " + train["itext"]
    cols = ["text", "itext", "title"]
    # Jupyter only echoes a cell's last top-level expression; inside an `if`
    # block the preview must be displayed explicitly (`display` is provided
    # by the IPython kernel).
    display(train[cols].head())

In [None]:
%%time
if OCR:
    # Same preprocessing as for the titles, applied to the combined
    # title + OCR text column.
    preprocess_text = mylib.preprocess("text")
    train["text_p"] = train.apply(preprocess_text, axis="columns")

In [None]:
if OCR:
    # Sentence-transformer checkpoint used for the combined-text embeddings.
    # Alternatives that were tried:
    #   "paraphrase-distilroberta-base-v1"
    #   "paraphrase-xlm-r-multilingual-v1"
    st_name = "stsb-distilbert-base"
    text_sentences = train["text_p"].tolist()
    text_result = mylib.sbert_matches(
        threshold=0.5,
        posting_ids=posting_ids,
        sentences=text_sentences,
        model_path=f"pretrained/sentence-transformers/{st_name}",
    )
    train["text_matches"] = text_result

# Result

In [None]:
# Match columns of the enabled stages, in the order they are combined.
stage_columns = (
    (IMAGE, "image_matches"),
    (TITLE, "title_matches"),
    (PHASH, "phash_matches"),
    (OCR, "text_matches"),
)
fs = [column for enabled, column in stage_columns if enabled]

# Row-wise combiner built by mylib from the enabled columns and the per-posting
# `imap` lookup.
combine_row = mylib.combine_as_list(
    fs,
    imap=imap,
    brand_threshold=0.5,
    measurement_threshold=0.5,
)
train["matches"] = train.apply(combine_row, axis=1)

# Per-row metric on the combined matches; the reported score is its mean.
score_row = mylib.metric_per_row("matches")
train["f1"] = train.apply(score_row, axis=1)
print(f"Combined score={train.f1.mean():.3f}")

In [None]:
def _experiment(score, **settings):
    """Build one experiment record.

    Every known setting is present in the returned dict, in a fixed key order;
    settings that are not supplied are recorded as None. Unknown setting names
    raise KeyError so that a misspelled key cannot silently create a new column.
    """
    record = {
        "score": score,
        "phash_threshold": None,
        "title_threshold": None,
        "image_threshold": None,
        "image_pretrained": None,
        "brand_threshold": None,
        "measurement_threshold": None,
        "text_threshold": None,
        "ocr_threshold": None,
        "ocr_timeout": None,
        "ocr_neighbours": None,
        "ocr_psm": None,
    }
    unknown = sorted(set(settings) - set(record))
    if unknown:
        raise KeyError(f"unknown experiment setting(s): {unknown}")
    record.update(settings)
    return record


# Manually recorded scores of earlier runs and the settings they used.
res = [
    # phash + title + image (enb3), with brand / measurement thresholds
    _experiment(0.654, phash_threshold=0.3, title_threshold=0.5, image_threshold=1e-6,
                image_pretrained="enb3", brand_threshold=0.3, measurement_threshold=0.3),
    _experiment(0.654, phash_threshold=0.3, title_threshold=0.5, image_threshold=1e-5,
                image_pretrained="enb3", brand_threshold=0.3, measurement_threshold=0.3),
    _experiment(0.654, phash_threshold=0.3, title_threshold=0.5, image_threshold=1e-4,
                image_pretrained="enb3", brand_threshold=0.3, measurement_threshold=0.3),
    _experiment(0.645, phash_threshold=0.3, title_threshold=0.5, image_threshold=1e-3,
                image_pretrained="enb3", brand_threshold=0.3, measurement_threshold=0.3),
    # phash + title + image (enb3)
    _experiment(0.656, phash_threshold=0.3, title_threshold=0.5, image_threshold=5e-3,
                image_pretrained="enb3"),
    # image only (enb3)
    _experiment(0.522, image_threshold=5e-3, image_pretrained="enb3"),
    _experiment(0.473, image_threshold=0.01, image_pretrained="enb3"),
    _experiment(0.502, image_threshold=1e-3, image_pretrained="enb3"),
    # phash + title + image (enb3), phash threshold 0.2
    _experiment(0.651, phash_threshold=0.2, title_threshold=0.5, image_threshold=1e-4,
                image_pretrained="enb3"),
    _experiment(0.654, phash_threshold=0.2, title_threshold=0.5, image_threshold=1e-5,
                image_pretrained="enb3"),
    # phash + title + image (enb3), phash threshold 0.3
    _experiment(0.658, phash_threshold=0.3, title_threshold=0.5, image_threshold=1e-5,
                image_pretrained="enb3"),
    _experiment(0.656, phash_threshold=0.3, title_threshold=0.5, image_threshold=1e-4,
                image_pretrained="enb3"),
    _experiment(0.562, phash_threshold=0.3, title_threshold=0.5, image_threshold=0.001,
                image_pretrained="enb3"),
    # phash + title + image (enb0)
    _experiment(0.514, phash_threshold=0.3, title_threshold=0.5, image_threshold=0.001,
                image_pretrained="enb0"),
    _experiment(0.498, phash_threshold=0.3, title_threshold=0.5, image_threshold=0.01,
                image_pretrained="enb0"),
    _experiment(0.136, phash_threshold=0.3, title_threshold=0.5, image_threshold=0.05,
                image_pretrained="enb0"),
    # phash + title + OCR text
    _experiment(0.674, phash_threshold=0.3, title_threshold=0.5, text_threshold=0.5,
                ocr_threshold="inverted", ocr_timeout=0.4, ocr_neighbours=41, ocr_psm=11),
    _experiment(0.674, phash_threshold=0.3, title_threshold=0.5, text_threshold=0.5,
                ocr_threshold="binary", ocr_timeout=0.4, ocr_neighbours=41, ocr_psm=11),
    # phash + title only
    _experiment(0.674, phash_threshold=0.3, title_threshold=0.5),
]
# Best score first; transposed so that each experiment is shown as one column.
df = pd.DataFrame.from_records(res).sort_values("score", ascending=False, ignore_index=True)
df.T.head(30)

In [None]:
# Score, ground truth and combined matches, followed by each enabled stage's
# match column.
cols = ["f1", "target", "matches", *fs]
train.loc[:, cols].head(30)

In [None]:
# Lowest-scoring rows first, for error analysis.
df = train.sort_values(by="f1", ignore_index=True, ascending=True)
df.loc[:, cols].head(5)