In [15]:
import numpy as np
import pandas as pd
import py
import mylib
import cv2 as cv
import pytesseract
from tqdm import tqdm
from typing import Optional

In [2]:
# Notebook display: effectively unlimited rows/columns and long cell text.
pd.set_option("display.max_columns", 9999)
pd.set_option("display.max_rows", 9999)
pd.set_option("display.max_colwidth", 1000)

# Treat +/-inf as missing values where pandas still supports it.
# `mode.use_inf_as_na` was deprecated in pandas 2.1 and removed in pandas 3.0;
# on versions without it set_option raises OptionError (a KeyError subclass),
# which would otherwise abort the notebook in this cell.
try:
    pd.set_option("mode.use_inf_as_na", True)
except KeyError:
    print("pandas has no 'mode.use_inf_as_na' option; inf values are NOT treated as NA")

In [3]:
# Read the training metadata, then add the label column produced by
# mylib.target_label and the relative on-disk path of each image.
train = pd.read_csv("input/train.csv", engine="c", low_memory=False)
train = train.assign(
    target=mylib.target_label(train),
    image_path="input/train_images/" + train["image"],
)
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 34250 entries, 0 to 34249
Data columns (total 7 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   posting_id   34250 non-null  object
 1   image        34250 non-null  object
 2   image_phash  34250 non-null  object
 3   title        34250 non-null  object
 4   label_group  34250 non-null  int64 
 5   target       34250 non-null  object
 6   image_path   34250 non-null  object
dtypes: int64(1), object(6)
memory usage: 1.8+ MB


th=.25, f1=.586
th=.30, f1=.586
th=.35, f1=.587
th=.40, f1=.583

In [4]:
%%time
train["phash_matches"] = mylib.phash_matches(train, threshold=0.3)

CPU times: user 27.9 s, sys: 14.1 s, total: 42 s
Wall time: 42.1 s


In [5]:
%%time
train["title_p"] = train.apply(mylib.preprocess("title"), axis=1)

CPU times: user 30 s, sys: 93.8 ms, total: 30.1 s
Wall time: 30.2 s


In [6]:
# Plain Python list of posting ids, in the current row order of `train`.
posting_ids = train["posting_id"].to_list()

In [7]:
%%time
st_name = "stsb-distilbert-base"
#st_name = "paraphrase-distilroberta-base-v1"
#st_name = "paraphrase-xlm-r-multilingual-v1"
train["title_matches"] = mylib.sbert_matches(
    model_path=f"pretrained/sentence-transformers/{st_name}",
    sentences=train["title_p"].tolist(),
    posting_ids=posting_ids,
    threshold=0.5
)

CPU times: user 22min 17s, sys: 1min 10s, total: 23min 28s
Wall time: 4min 37s


In [27]:
def dilate_erode(img):
    """Dilate ``img`` once, then erode the result once, and return it.

    Both steps use the same 2x2 all-ones ``uint8`` kernel and a single
    iteration.
    """
    structuring_element = np.ones((2, 2), dtype=np.uint8)
    dilated = cv.dilate(img, structuring_element, iterations=1)
    return cv.erode(dilated, structuring_element, iterations=1)


def _ocr_or_none(img, timeout: float, config: str) -> Optional[str]:
    """Run Tesseract on ``img`` and return its text, or ``None`` on RuntimeError."""
    try:
        return pytesseract.image_to_string(img, timeout=timeout, config=config)
    except RuntimeError:
        # pytesseract raises RuntimeError when ``timeout`` is exceeded.
        # NOTE(review): pytesseract.TesseractError appears to derive from
        # RuntimeError as well, so other Tesseract failures would also end up
        # here - confirm that treating them like a timeout is intended.
        return None


def image_to_text(img_path, timeout: float = 0.4, config: str = r"--psm 11") -> Optional[str]:
    """Extract text from an image file with Tesseract OCR.

    The image is read as grayscale, shrunk to half size, median-blurred, and
    binarised twice with a Gaussian adaptive threshold (normal and inverted).
    Each binarised image goes through ``dilate_erode`` and is OCR'd separately.

    Args:
        img_path: Path of the image file to read.
        timeout: Per-OCR-call time limit passed to pytesseract.
        config: Extra Tesseract command-line configuration.

    Returns:
        The whitespace-separated tokens of both OCR passes (normal first, then
        inverted) joined by single spaces, or ``None`` when neither pass
        produced a result.

    Raises:
        OSError: If ``cv.imread`` cannot load ``img_path``.
    """
    img = cv.imread(img_path, cv.IMREAD_GRAYSCALE)
    if img is None:
        # cv.imread reports an unreadable file by returning None; fail with
        # the offending path instead of an opaque error inside cv.resize.
        raise OSError(f"could not read image: {img_path!r}")
    img = cv.resize(img, None, fx=0.5, fy=0.5, interpolation=cv.INTER_AREA)
    img = cv.medianBlur(img, 3)

    texts = []
    for threshold_type in (cv.THRESH_BINARY, cv.THRESH_BINARY_INV):
        binary = cv.adaptiveThreshold(
            img, 255, cv.ADAPTIVE_THRESH_GAUSSIAN_C, threshold_type, 41, 2
        )
        text = _ocr_or_none(dilate_erode(binary), timeout, config)
        if text is not None:
            texts.append(text)

    if not texts:
        return None
    # split()/join collapses newlines and runs of whitespace into single spaces.
    return " ".join(token for text in texts for token in text.split())

In [None]:
# OCR every training image; images for which image_to_text returns None are
# recorded as an empty string and counted.
# Iterating the column with an explicit total (instead of the itertuples()
# generator) lets tqdm show the total and an ETA.
res = []
n_timeout = 0
for img_path in tqdm(train["image_path"], total=len(train)):
    s = image_to_text(img_path)
    if s is None:
        s = ""
        n_timeout += 1
    res.append(s)
print(f"n_timeout={n_timeout}")

202it [02:16,  1.56it/s]

In [None]:
# Attach the OCR output as "itext" and build "text" as title + " " + itext.
train = train.assign(
    itext=res,
    text=lambda frame: frame["title"] + " " + frame["itext"],
)
cols = ["text", "itext", "title"]
train.loc[:, cols].head()

In [None]:
%%time
train["text_p"] = train.apply(mylib.preprocess("text"), axis=1)

In [None]:
# Sentence-transformer checkpoint used for the preprocessed title + OCR text.
# Other checkpoints kept here for quick switching:
#   st_name = "paraphrase-distilroberta-base-v1"
#   st_name = "paraphrase-xlm-r-multilingual-v1"
st_name = "stsb-distilbert-base"
train = train.assign(
    text_matches=mylib.sbert_matches(
        model_path="pretrained/sentence-transformers/" + st_name,
        sentences=train["text_p"].to_list(),
        posting_ids=posting_ids,
        threshold=0.5,
    )
)

In [None]:
# Merge the three candidate columns per row with mylib.combine_as_list, score
# each row with mylib.metric_per_row, and report the mean score.
fs = ["phash_matches", "title_matches", "text_matches"]
train = train.assign(matches=train.apply(mylib.combine_as_list(fs), axis=1))
train = train.assign(f1=train.apply(mylib.metric_per_row("matches"), axis=1))
print(f"Combined score={train['f1'].mean():.3f}")

In [None]:
# Score, label and every match column side by side for the first 30 rows.
cols = ["f1", "target", "matches", *fs]
train.loc[:, cols].head(n=30)

In [None]:
# Lowest-scoring rows first, with a fresh 0..n-1 index.
# NOTE: this reorders `train`, so lists built earlier from its previous row
# order (e.g. `posting_ids`, `res`) no longer line up with it.
train = train.sort_values(by="f1", ascending=True, ignore_index=True)
train[cols].head(n=5)