In [11]:
import warnings

import cv2
import numpy as np
import pandas as pd
import py
import pytesseract

import mylib

In [12]:
# Notebook-wide pandas configuration. Every option is addressed by its full
# dotted name: abbreviated names only resolve through pattern matching and can
# become ambiguous when pandas adds options with similar names.
# NOTE: "mode.use_inf_as_na" is deprecated in recent pandas releases (2.1+);
# when upgrading, convert inf values to NaN explicitly instead.
pd.set_option("mode.use_inf_as_na", True)
# Effectively remove the truncation limits when displaying frames.
pd.set_option("display.max_columns", 9999)
pd.set_option("display.max_rows", 9999)
pd.set_option("display.max_colwidth", 1000)

In [13]:
# Read the training metadata and attach two derived columns:
#   target     - whatever mylib.target_label computes from the frame
#                (presumably the ground-truth match list; confirm in mylib)
#   image_path - image file name prefixed with the training image directory
train = pd.read_csv("input/train.csv", engine="c", low_memory=False)
target_labels = mylib.target_label(train)
train["target"] = target_labels
image_paths = "input/train_images/" + train["image"]
train["image_path"] = image_paths
# Print the column / dtype / memory summary of the resulting frame.
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 34250 entries, 0 to 34249
Data columns (total 7 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   posting_id   34250 non-null  object
 1   image        34250 non-null  object
 2   image_phash  34250 non-null  object
 3   title        34250 non-null  object
 4   label_group  34250 non-null  int64 
 5   target       34250 non-null  object
 6   image_path   34250 non-null  object
dtypes: int64(1), object(6)
memory usage: 1.8+ MB


th=.25, f1=.586
th=.30, f1=.586
th=.35, f1=.587
th=.40, f1=.583

In [14]:
%%time
# Candidate matches derived from the image_phash column via mylib.phash_matches
# (exact matching rule lives in mylib; the threshold is passed through as-is).
PHASH_THRESHOLD = 0.3
train["phash_matches"] = mylib.phash_matches(train, threshold=PHASH_THRESHOLD)

CPU times: user 27.8 s, sys: 14.9 s, total: 42.7 s
Wall time: 42.7 s


In [15]:
%%time
# mylib.preprocess("title") returns the row-wise callable that is applied to
# every row; its result is stored as the "title_p" column.
preprocess_title = mylib.preprocess("title")
train["title_p"] = train.apply(preprocess_title, axis=1)

CPU times: user 29.7 s, sys: 141 ms, total: 29.8 s
Wall time: 29.8 s


In [16]:
# Plain Python list of posting ids, in the current row order of `train`.
# It is passed to mylib.sbert_matches alongside the sentence lists below.
posting_ids = train["posting_id"].to_list()

In [17]:
%%time
# Sentence-transformers checkpoint used for the title embeddings.
# Other checkpoints that can be swapped in:
#   "paraphrase-distilroberta-base-v1"
#   "paraphrase-xlm-r-multilingual-v1"
st_name = "stsb-distilbert-base"
title_sentences = train["title_p"].tolist()
# Matches computed by mylib.sbert_matches over the preprocessed titles.
train["title_matches"] = mylib.sbert_matches(
    model_path=f"pretrained/sentence-transformers/{st_name}",
    sentences=title_sentences,
    posting_ids=posting_ids,
    threshold=0.5,
)

CPU times: user 22min 45s, sys: 1min 12s, total: 23min 57s
Wall time: 4min 48s


In [18]:
def image_to_text(row) -> str:
    """Run Tesseract OCR on the image referenced by ``row["image_path"]``.

    The image is loaded with OpenCV, converted to grayscale and handed to
    ``pytesseract.image_to_string`` with a 10 second timeout. Every run of
    whitespace in the recognised text is collapsed to a single space.

    Args:
        row: Mapping-like record (e.g. a DataFrame row) with an
            ``"image_path"`` entry.

    Returns:
        The whitespace-normalised OCR text. An empty string is returned, and
        a warning emitted, when the image cannot be read or when Tesseract
        exceeds the timeout, so one bad image does not abort a long
        ``DataFrame.apply`` run.

    Raises:
        pytesseract.TesseractError: Tesseract itself reported a failure.
    """
    path = row["image_path"]
    img = cv2.imread(path)
    if img is None:
        # cv2.imread reports a missing or undecodable file by returning None;
        # passing that on to cvtColor would fail with an opaque cv2.error.
        warnings.warn(f"image_to_text: could not read image {path!r}")
        return ""

    gray = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)
    try:
        raw_text = pytesseract.image_to_string(gray, timeout=10)
    except pytesseract.TesseractError:
        # TesseractError subclasses RuntimeError; genuine OCR failures must
        # stay visible, only the timeout below is treated as "no text".
        raise
    except RuntimeError:
        # pytesseract signals an expired timeout with a plain RuntimeError.
        warnings.warn(f"image_to_text: OCR timed out for {path!r}")
        return ""

    return " ".join(raw_text.split())

In [None]:
%%time
# OCR every training image row by row; the recognised text becomes "itext".
ocr_text = train.apply(image_to_text, axis=1)
train["itext"] = ocr_text

In [None]:
%%time
# Same row-wise preprocessing as for titles, here driven by the "itext" column;
# the result is stored as "itext_p".
preprocess_itext = mylib.preprocess("itext")
train["itext_p"] = train.apply(preprocess_itext, axis=1)

In [None]:
# Sentence-transformers checkpoint used for the OCR-text embeddings.
# Other checkpoints that can be swapped in:
#   "paraphrase-distilroberta-base-v1"
#   "paraphrase-xlm-r-multilingual-v1"
st_name = "stsb-distilbert-base"
itext_sentences = train["itext_p"].tolist()
# Matches computed by mylib.sbert_matches over the preprocessed OCR text.
train["itext_matches"] = mylib.sbert_matches(
    model_path=f"pretrained/sentence-transformers/{st_name}",
    sentences=itext_sentences,
    posting_ids=posting_ids,
    threshold=0.5,
)

In [None]:
# Columns holding the per-source match lists that get merged into "matches".
fs = ["phash_matches", "title_matches", "itext_matches"]

# Row-wise merge of the three sources via the callable built by mylib.
combine_row = mylib.combine_as_list(fs)
train["matches"] = train.apply(combine_row, axis=1)

# Row-wise score of the merged matches, stored as "f1"; the scoring rule is
# defined in mylib.metric_per_row.
score_row = mylib.metric_per_row("matches")
train["f1"] = train.apply(score_row, axis=1)

print(f"Combined score={train.f1.mean():.3f}")

In [None]:
# Columns shown when inspecting results: score, ground truth, merged matches,
# followed by each individual match source.
cols = ["f1", "target", "matches", *fs]
train[cols].head(30)

In [None]:
# Inspect the lowest-scoring rows first.
# The sorted frame is bound to a new name instead of sorting `train` in place:
# an in-place sort with ignore_index=True reorders and re-indexes `train`, so
# the `posting_ids` list captured earlier would no longer line up with its rows
# if any earlier cell were re-run. The displayed rows are the same.
worst_first = train.sort_values("f1", ascending=True, ignore_index=True)
worst_first[cols].head()