In [1]:
import os
import numpy as np
import pandas as pd
import py
import mylib
import cv2 as cv
import pytesseract
from tqdm import tqdm
from typing import Optional
from scml.nlp import strip_punctuation, to_ascii_str

In [2]:
# Disabled toggle kept for reference: caps the OpenMP thread count through the environment.
# NOTE(review): presumably meant for the OCR subprocesses started later in this notebook - confirm before enabling.
#os.environ["OMP_THREAD_LIMIT"] = "1"

# Treat +/-inf as missing values in pandas null handling.
pd.set_option("use_inf_as_na", True)

# Lift the display limits so wide frames and long cell values (the match lists
# printed further down) are rendered without truncation.
pd.set_option(
    "display.max_columns", 9999,
    "display.max_rows", 9999,
    "display.max_colwidth", 1000,
)

In [3]:
# Load the training postings from the competition CSV.
train = pd.read_csv(
    "input/train.csv",
    low_memory=False,
    engine="c",
)

# Ground-truth column produced by the project helper; the tables further down
# show it holding a list of posting ids per row.
train["target"] = mylib.target_label(train)

# Relative path of each posting's image file (directory prefix + file name).
train["image_path"] = train["image"].radd("input/train_images/")

train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 34250 entries, 0 to 34249
Data columns (total 7 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   posting_id   34250 non-null  object
 1   image        34250 non-null  object
 2   image_phash  34250 non-null  object
 3   title        34250 non-null  object
 4   label_group  34250 non-null  int64 
 5   target       34250 non-null  object
 6   image_path   34250 non-null  object
dtypes: int64(1), object(6)
memory usage: 1.8+ MB


th=.25, f1=.586
th=.30, f1=.586
th=.35, f1=.587
th=.40, f1=.583

In [4]:
%%time
# Per-row match candidates returned by mylib.phash_matches at threshold 0.3.
# NOTE(review): the "th=..., f1=..." notes just above this cell presumably record
# a sweep of this threshold - confirm, since 0.35 scored marginally higher there.
train["phash_matches"] = mylib.phash_matches(
    train,
    threshold=0.3,
)

CPU times: user 28.6 s, sys: 15.1 s, total: 43.7 s
Wall time: 43.8 s


In [5]:
%%time
# Row-wise preprocessing of the "title" column; mylib.preprocess("title") builds
# the callable that DataFrame.apply invokes once per row.
train["title_p"] = train.apply(
    mylib.preprocess("title"),
    axis="columns",
)

CPU times: user 30.1 s, sys: 93.8 ms, total: 30.2 s
Wall time: 30.3 s


In [6]:
# Snapshot of the posting ids in the frame's current row order; the SBERT cells
# pass it alongside sentence lists taken from the same frame.
posting_ids = train.loc[:, "posting_id"].to_list()

In [7]:
%%time
# Sentence-transformers checkpoint directory name; the commented lines are
# alternative checkpoints kept for quick switching.
st_name = "stsb-distilbert-base"
#st_name = "paraphrase-distilroberta-base-v1"
#st_name = "paraphrase-xlm-r-multilingual-v1"

# Matches computed from the preprocessed titles with threshold 0.5.
train["title_matches"] = mylib.sbert_matches(
    sentences=train["title_p"].to_list(),
    posting_ids=posting_ids,
    threshold=0.5,
    model_path=f"pretrained/sentence-transformers/{st_name}",
)

CPU times: user 20min 56s, sys: 1min 11s, total: 22min 7s
Wall time: 4min 17s


In [8]:
def erode_dilate(img):
    """Apply one erosion followed by one dilation with a 2x2 all-ones kernel.

    Erosion followed by dilation with the same kernel is a morphological
    opening.

    Args:
        img: Image array accepted by ``cv.erode`` / ``cv.dilate``.

    Returns:
        The processed image.
    """
    structuring_element = np.ones((2, 2), np.uint8)
    eroded = cv.erode(img, structuring_element, iterations=1)
    return cv.dilate(eroded, structuring_element, iterations=1)


def image_to_text(img_path, mode: str, timeout: float, neighbours: int=41) -> Optional[str]:
    """Extract text from an image with Tesseract after adaptive thresholding.

    The image is read as grayscale, median-blurred (kernel size 3) and binarised
    with a Gaussian adaptive threshold. Each binarised variant is cleaned with
    ``erode_dilate`` and handed to Tesseract with ``--psm 11``. The raw OCR
    output is passed through ``to_ascii_str`` and ``strip_punctuation`` and
    split on whitespace.

    Args:
        img_path: Path of the image file to read.
        mode: Which thresholded variant(s) to OCR: ``"binary"``, ``"inverted"``
            or ``"binary_inverted"`` (both, binary tokens first).
        timeout: Time limit handed to each ``pytesseract.image_to_string`` call.
        neighbours: Block size passed to ``cv.adaptiveThreshold``.

    Returns:
        The OCR tokens joined by single spaces (possibly an empty string), or
        ``None`` when every attempted OCR pass failed or timed out.

    Raises:
        ValueError: If ``mode`` is not one of the supported values. Previously
            this silently returned ``None``, indistinguishable from a timeout.
        OSError: If the image cannot be read (``cv.imread`` returned ``None``).
    """
    threshold_types_by_mode = {
        "binary": (cv.THRESH_BINARY,),
        "inverted": (cv.THRESH_BINARY_INV,),
        "binary_inverted": (cv.THRESH_BINARY, cv.THRESH_BINARY_INV),
    }
    if mode not in threshold_types_by_mode:
        raise ValueError(
            f"unknown mode {mode!r}; expected one of {sorted(threshold_types_by_mode)}"
        )

    img = cv.imread(img_path, cv.IMREAD_GRAYSCALE)
    if img is None:
        # cv.imread signals failure by returning None instead of raising.
        raise OSError(f"cannot read image: {img_path!r}")
    #img = cv.resize(img, None, fx=0.5, fy=0.5, interpolation=cv.INTER_AREA)
    img = cv.medianBlur(img, 3)

    tokens = []
    any_pass_succeeded = False
    for threshold_type in threshold_types_by_mode[mode]:
        th = cv.adaptiveThreshold(
            img, 255, cv.ADAPTIVE_THRESH_GAUSSIAN_C, threshold_type, neighbours, 2
        )
        th = erode_dilate(th)
        try:
            raw = pytesseract.image_to_string(th, timeout=timeout, config=r"--psm 11")
        except RuntimeError:
            # pytesseract reports timeouts (and Tesseract run failures) as
            # RuntimeError. Anything else - Ctrl-C, a missing Tesseract binary -
            # must surface instead of being counted as a per-image failure.
            continue
        any_pass_succeeded = True
        tokens += strip_punctuation(to_ascii_str(raw)).split()

    if not any_pass_succeeded:
        return None
    return " ".join(tokens)

In [9]:
# OCR every training image, in row order, collecting one string per row.
# Iterating the column itself (instead of a bare itertuples() generator) lets
# tqdm read the total from len() and show a real progress bar with an ETA, and
# avoids building a full row tuple just to read one field.
res = []
n_timeout = 0
for img_path in tqdm(train["image_path"], desc="OCR"):
    s = image_to_text(img_path, mode="inverted", timeout=0.4)
    if s is None:
        # image_to_text returns None when OCR yielded no result for this image;
        # store an empty string so `res` stays aligned with the rows of `train`.
        s = ""
        n_timeout += 1
    res.append(s)
print(f"n_timeout={n_timeout}")

14812it [1:57:21,  2.05it/s]Exception in thread Thread-14818:
Traceback (most recent call last):
  File "/usr/lib/python3.7/threading.py", line 926, in _bootstrap_inner
    self.run()
  File "/usr/lib/python3.7/threading.py", line 1177, in run
    self.function(*self.args, **self.kwargs)
  File "/mnt/s/dev/seahrh/kaggle-shopee-product-matching/venv/lib/python3.7/site-packages/pytesseract/pytesseract.py", line 109, in kill
    process.terminate()
  File "/usr/lib/python3.7/subprocess.py", line 1785, in terminate
    self.send_signal(signal.SIGTERM)
  File "/usr/lib/python3.7/subprocess.py", line 1780, in send_signal
    os.kill(self.pid, sig)
ProcessLookupError: [Errno 3] No such process

16184it [2:08:19,  2.34it/s]Exception in thread Thread-16190:
Traceback (most recent call last):
  File "/usr/lib/python3.7/threading.py", line 926, in _bootstrap_inner
    self.run()
  File "/usr/lib/python3.7/threading.py", line 1177, in run
    self.function(*self.args, **self.kwargs)
  File "/mnt/s

n_timeout=0





In [10]:
# Attach the OCR output (one string per row, collected in `res`).
train["itext"] = res

# Combined text: the title, a single space, then the OCR text.
train["text"] = train["title"].add(" ").add(train["itext"])

# Preview the combined text next to the two columns it is built from.
cols = ["text", "itext", "title"]
train.loc[:, cols].head()

Unnamed: 0,text,itext,title
0,Paper Bag Victoria Secret Vw ie AM KG Ji Ge UC Ny ai oh if H i diy Yu LHI y AN na FON rt fo AZZ OY Ss,Vw ie AM KG Ji Ge UC Ny ai oh if H i diy Yu LHI y AN na FON rt fo AZZ OY Ss,Paper Bag Victoria Secret
1,"Double Tape 3M VHB 12 mm x 4,5 m ORIGINAL / DOUBLE FOAM TAPE",,"Double Tape 3M VHB 12 mm x 4,5 m ORIGINAL / DOUBLE FOAM TAPE"
2,Maling TTS Canned Pork Luncheon Meat 397 gr,,Maling TTS Canned Pork Luncheon Meat 397 gr
3,Daster Batik Lengan pendek - Motif Acak / Campur - Leher Kancing (DPT001-00) Batik karakter Alhadi re se Y ai eat Se a cs Yy tre i a a mon 3 ee iW i 4 4 a f ieee I i i i Multa Cc a i q ti Wy a,re se Y ai eat Se a cs Yy tre i a a mon 3 ee iW i 4 4 a f ieee I i i i Multa Cc a i q ti Wy a,Daster Batik Lengan pendek - Motif Acak / Campur - Leher Kancing (DPT001-00) Batik karakter Alhadi
4,Nescafe \xc3\x89clair Latte 220ml SS newts OFFICIAL STORE eRD Dae cod Ga eia3 as,SS newts OFFICIAL STORE eRD Dae cod Ga eia3 as,Nescafe \xc3\x89clair Latte 220ml


In [11]:
%%time
# Row-wise preprocessing of the combined "text" column (title + OCR text),
# using the same mylib.preprocess factory as for the titles.
train["text_p"] = train.apply(
    mylib.preprocess("text"),
    axis="columns",
)

CPU times: user 1min 10s, sys: 125 ms, total: 1min 10s
Wall time: 1min 10s


In [12]:
# Sentence-transformers checkpoint directory name; the commented lines are
# alternative checkpoints kept for quick switching.
st_name = "stsb-distilbert-base"
#st_name = "paraphrase-distilroberta-base-v1"
#st_name = "paraphrase-xlm-r-multilingual-v1"

# Matches computed from the preprocessed title + OCR text with threshold 0.5.
train["text_matches"] = mylib.sbert_matches(
    sentences=train["text_p"].to_list(),
    posting_ids=posting_ids,
    threshold=0.5,
    model_path=f"pretrained/sentence-transformers/{st_name}",
)

In [13]:
# Match columns to merge into the final prediction.
fs = ["phash_matches", "title_matches", "text_matches"]

# Row-wise combination of the columns in `fs`, via the project helper.
train["matches"] = train.apply(mylib.combine_as_list(fs), axis="columns")

# Row-wise score of the combined matches, via the project helper; the mean
# over all rows is the reported score.
train["f1"] = train.apply(mylib.metric_per_row("matches"), axis="columns")
print(f"Combined score={train['f1'].mean():.3f}")

Combined score=0.674


In [14]:
# Score, ground truth and combined matches, followed by the individual match
# columns listed in `fs`.
cols = ["f1", "target", "matches", *fs]
train.loc[:, cols].head(30)

Unnamed: 0,f1,target,matches,phash_matches,title_matches,text_matches
0,1.0,"[train_129225211, train_2278313361]","[train_2278313361, train_129225211]",[],[train_2278313361],[]
1,1.0,"[train_3386243561, train_3423213080]","[train_3423213080, train_3386243561]",[],[train_3423213080],[train_3423213080]
2,1.0,"[train_2288590299, train_3803689425]","[train_2288590299, train_3803689425]",[],[train_3803689425],[]
3,0.333333,"[train_2406599165, train_3342059966]","[train_3526771004, train_3576714541, train_1744956981, train_2406599165]",[],"[train_1744956981, train_3576714541, train_3526771004]",[train_3526771004]
4,1.0,"[train_3369186413, train_921438619]","[train_921438619, train_3369186413]",[],[train_921438619],[]
5,0.8,"[train_2464356923, train_2753295474, train_305884580]","[train_2753295474, train_2464356923]",[train_2753295474],[],[]
6,0.444444,"[train_1802986387, train_1396161074, train_713073906, train_1275191373, train_2490201622, train_2411544001, train_1859060005]","[train_1396161074, train_1802986387]",[],[train_1396161074],[]
7,0.666667,"[train_1806152124, train_3227306976]",[train_1806152124],[],[],[]
8,0.5,"[train_86570404, train_2837452969, train_77364776]",[train_86570404],[],[],[]
9,0.666667,"[train_831680791, train_3031035861]",[train_831680791],[],[],[]


In [15]:
# Put the lowest-scoring rows first to inspect the worst predictions.
# NOTE(review): this re-orders `train` in place and resets its index, so the
# `posting_ids` list captured earlier no longer lines up row-for-row with the
# frame; rebuild it before re-running any matching cell after this one.
train.sort_values(by="f1", inplace=True, ignore_index=True)
train.loc[:, cols].head()

Unnamed: 0,f1,target,matches,phash_matches,title_matches,text_matches
0,0.037037,"[train_207039286, train_1619338643, train_3721308865, train_1851882944, train_1236075269, train_2855329700, train_1997808959, train_2840508723, train_2729160884, train_2902736490, train_1789512220, train_2172967167, train_3083046401, train_3596545373, train_1673660532, train_675983017, train_799363063, train_2034561720, train_1459295979, train_1473844208, train_2283849789, train_30488954, train_2937433253, train_3851641413, train_3274592078, train_2847244918, train_4286138740, train_3560974982, train_2072061027, train_3134779648, train_3633103106, train_1906102508, train_3869843407, train_4280580992, train_3671554064, train_2683750575, train_1142799664, train_2513892071, train_3980820034, train_3055192895, train_1495753847, train_2266041454, train_2036340727, train_2338443281, train_467196729, train_1061695682, train_2258929039, train_4238545699, train_2160269935, train_1141098045, train_3414293340]","[train_1495753847, train_3345376128, train_1780574535]",[],"[train_1780574535, train_3345376128]",[]
1,0.038462,"[train_3497907844, train_4175229751, train_2930186018, train_515008716, train_1354360830, train_516663932, train_2677100375, train_3480877149, train_2041937727, train_1699906038, train_3534764813, train_664339299, train_1296711926, train_1591104350, train_2918032299, train_2531777612, train_3731734583, train_1086903685, train_658862317, train_2123729460, train_2708192355, train_3454652975, train_1913449144, train_1540761220, train_3195325438, train_365461299, train_4060693827, train_2664816648, train_3332324456, train_1850201761, train_3956550694, train_1799291694, train_3178715139, train_2230152472, train_3945800827, train_2789820394, train_4003658555, train_240158006, train_2497676427, train_1943472851, train_711042017, train_2146279151, train_2813067780, train_846192553, train_627475146, train_3366673512, train_4199111972, train_2114123891, train_4178955354, train_112182868, train_3668806308]",[train_2708192355],[],[],[]
2,0.038462,"[train_4184037897, train_1010868925, train_1561375840, train_2963630570, train_2382946865, train_2486671168, train_61811259, train_2358518833, train_1521931249, train_408229003, train_543954089, train_3486600899, train_417261045, train_2344885693, train_226112794, train_1423119969, train_1374193074, train_880338666, train_210204308, train_2073151758, train_303073611, train_1750065022, train_59393500, train_2864046583, train_2566709185, train_3307737696, train_47863261, train_11694834, train_3434987130, train_1000804730, train_1500350068, train_1641622956, train_3173013958, train_784571098, train_2074205316, train_1302902041, train_1785829508, train_1714224702, train_2919386341, train_3897008118, train_64918479, train_843663648, train_477561587, train_1643485252, train_889614566, train_118604281, train_1157582002, train_1710200794, train_3099122287, train_1431505272, train_2767483557]",[train_1302902041],[],[],[]
3,0.038462,"[train_37143550, train_513102842, train_3524722132, train_983584924, train_305118117, train_3082994608, train_2281819964, train_406475183, train_2440428831, train_1795803441, train_3012501582, train_2191483137, train_2735482142, train_931662080, train_35171381, train_3998154540, train_353877543, train_2005660717, train_438036326, train_1190240122, train_3785317484, train_1357739170, train_1156325383, train_3254405700, train_90356826, train_49782600, train_163710515, train_2200661537, train_1975054729, train_901922858, train_1775427584, train_3382714342, train_2326532166, train_253683720, train_232189854, train_548782513, train_2548010081, train_463544835, train_2870807441, train_1062046612, train_2102271992, train_1905898995, train_2880913122, train_2494367387, train_3903258334, train_3441737294, train_1874494631, train_1944956984, train_595049990, train_1364655400, train_4103350755]",[train_1357739170],[],[],[]
4,0.038462,"[train_3497907844, train_4175229751, train_2930186018, train_515008716, train_1354360830, train_516663932, train_2677100375, train_3480877149, train_2041937727, train_1699906038, train_3534764813, train_664339299, train_1296711926, train_1591104350, train_2918032299, train_2531777612, train_3731734583, train_1086903685, train_658862317, train_2123729460, train_2708192355, train_3454652975, train_1913449144, train_1540761220, train_3195325438, train_365461299, train_4060693827, train_2664816648, train_3332324456, train_1850201761, train_3956550694, train_1799291694, train_3178715139, train_2230152472, train_3945800827, train_2789820394, train_4003658555, train_240158006, train_2497676427, train_1943472851, train_711042017, train_2146279151, train_2813067780, train_846192553, train_627475146, train_3366673512, train_4199111972, train_2114123891, train_4178955354, train_112182868, train_3668806308]",[train_3480877149],[],[],[]
