In [1]:
import pandas as pd
import mylib

In [2]:
# Notebook-wide pandas configuration. Option keys are spelled out in full:
# pandas resolves abbreviated keys by pattern match, which can break or become
# ambiguous when a similarly named option is added in a later release.

# Make pandas null checks treat +/-inf like NaN.
# NOTE(review): this option is deprecated in recent pandas releases; when the
# environment is upgraded, convert inf to NaN explicitly instead — confirm
# against the pandas version this notebook is pinned to.
pd.set_option("mode.use_inf_as_na", True)

# Effectively disable truncation when frames are displayed, so wide tables and
# long list-valued cells (e.g. lists of posting ids) are shown in full.
pd.set_option("display.max_columns", 9999)
pd.set_option("display.max_rows", 9999)
pd.set_option("display.max_colwidth", 1000)

In [3]:
# Read the training table with the C parser; low_memory=False makes pandas
# parse the file in one pass rather than in chunks.
train = pd.read_csv(
    "input/train.csv",
    low_memory=False,
    engine="c",
)

# Add the `target` column produced by the project helper from the loaded frame.
# NOTE(review): presumably the per-row list of true matching posting ids —
# the exact semantics live in mylib; confirm there.
train["target"] = mylib.target_label(train)

# Print column names, non-null counts, dtypes and memory footprint.
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 34250 entries, 0 to 34249
Data columns (total 6 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   posting_id   34250 non-null  object
 1   image        34250 non-null  object
 2   image_phash  34250 non-null  object
 3   title        34250 non-null  object
 4   label_group  34250 non-null  int64 
 5   target       34250 non-null  object
dtypes: int64(1), object(5)
memory usage: 1.6+ MB


th=.25, f1=.586
th=.30, f1=.586
th=.35, f1=.587
th=.40, f1=.583

In [4]:
%%time
train["phash_matches"] = mylib.phash_matches(train, threshold=0.3)

CPU times: user 28 s, sys: 14.9 s, total: 42.9 s
Wall time: 43 s


In [5]:
%%time
train["title_p"] = train.apply(mylib.preprocess, axis=1)

CPU times: user 7.08 s, sys: 46.9 ms, total: 7.12 s
Wall time: 7.13 s


In [6]:
# Snapshot the id and preprocessed-title columns as plain Python lists, in the
# frame's current row order, for use by the sentence-embedding matcher.
posting_ids, sentences = (
    train[column].tolist() for column in ("posting_id", "title_p")
)

In [7]:
%%time
st_name = "stsb-distilbert-base"
#st_name = "paraphrase-distilroberta-base-v1"
train["sbert_matches"] = mylib.sbert_matches(
    model_path=f"pretrained/sentence-transformers/{st_name}",
    sentences=sentences,
    posting_ids=posting_ids,
    threshold=0.6
)

CPU times: user 29min 44s, sys: 1min 15s, total: 30min 59s
Wall time: 6min 9s


In [8]:
# Columns holding the individual matchers' outputs.
fs = ["phash_matches", "sbert_matches"]

# Build one `matches` value per row from the columns listed in `fs`, using the
# row-wise callable returned by the project helper.
train["matches"] = train.apply(mylib.combine_as_list(fs), axis="columns")

# Score each row's `matches` with the project metric, then report the mean.
train["f1"] = train.apply(mylib.metric_per_row("matches"), axis="columns")
print(f"Combined score={train['f1'].mean():.3f}")

Combined score=0.682


In [9]:
# Columns to inspect: score, target, combined matches, then each matcher's
# own output for comparison.
cols = ["f1", "target", "matches", *fs]

# Show the first 30 rows of that column subset.
train.loc[:, cols].head(30)

Unnamed: 0,f1,target,matches,phash_matches,sbert_matches
0,1.0,"[train_129225211, train_2278313361]","[train_2278313361, train_129225211]",[],[train_2278313361]
1,0.666667,"[train_3386243561, train_3423213080]",[train_3386243561],[],[]
2,1.0,"[train_2288590299, train_3803689425]","[train_2288590299, train_3803689425]",[],[train_3803689425]
3,0.285714,"[train_2406599165, train_3342059966]","[train_3576714541, train_3526771004, train_1744956981, train_2406599165, train_1508100548]",[],"[train_1744956981, train_3576714541, train_3526771004, train_1508100548]"
4,0.666667,"[train_3369186413, train_921438619]",[train_3369186413],[],[]
5,0.8,"[train_2464356923, train_2753295474, train_305884580]","[train_2753295474, train_2464356923]",[train_2753295474],[]
6,0.444444,"[train_1802986387, train_1396161074, train_713073906, train_1275191373, train_2490201622, train_2411544001, train_1859060005]","[train_1802986387, train_1396161074]",[],[train_1396161074]
7,0.666667,"[train_1806152124, train_3227306976]",[train_1806152124],[],[]
8,0.4,"[train_86570404, train_2837452969, train_77364776]","[train_86570404, train_115157077]",[],[train_115157077]
9,0.666667,"[train_831680791, train_3031035861]",[train_831680791],[],[]


In [10]:
# Reorder `train` by ascending f1 (worst-scoring rows first) and renumber the
# index from 0. Rebinding the name replaces the in-place sort.
# Gotcha: the `posting_ids` / `sentences` lists extracted in an earlier cell
# keep the pre-sort row order, so they no longer line up with `train`.
train = train.sort_values(by="f1", ignore_index=True)

# Show the five lowest-scoring rows.
train.loc[:, cols].head()

Unnamed: 0,f1,target,matches,phash_matches,sbert_matches
0,0.035088,"[train_1251926547, train_576940044, train_2821917682, train_3942479788, train_2262165918, train_2713325894, train_4024729812, train_3500331962, train_1833257672, train_239849978, train_2416568905, train_1605540675, train_2411860849, train_1533892478, train_2921972011, train_3901859980, train_4084569444, train_1359948092, train_2361997442, train_2836494610, train_2767216329, train_1130274962, train_1642454560, train_1625513302, train_2997354311, train_2811016096, train_1518569647, train_4128551344, train_2475719623, train_3178048969, train_389026721, train_3609122916, train_155628598, train_1025570379, train_876376110, train_319894214, train_2573060829, train_745074374, train_3307385969, train_378911549, train_422235145, train_2980795248, train_2419943180, train_493684647, train_2343771203, train_4071624010, train_2032306771, train_4153223724, train_221812587, train_1796048911, train_3730694224]","[train_2851836923, train_2695709813, train_2921972011, train_2135675768, train_3411347209, train_1776665772]",[],"[train_2851836923, train_3411347209, train_2135675768, train_2695709813, train_1776665772]"
1,0.037037,"[train_37143550, train_513102842, train_3524722132, train_983584924, train_305118117, train_3082994608, train_2281819964, train_406475183, train_2440428831, train_1795803441, train_3012501582, train_2191483137, train_2735482142, train_931662080, train_35171381, train_3998154540, train_353877543, train_2005660717, train_438036326, train_1190240122, train_3785317484, train_1357739170, train_1156325383, train_3254405700, train_90356826, train_49782600, train_163710515, train_2200661537, train_1975054729, train_901922858, train_1775427584, train_3382714342, train_2326532166, train_253683720, train_232189854, train_548782513, train_2548010081, train_463544835, train_2870807441, train_1062046612, train_2102271992, train_1905898995, train_2880913122, train_2494367387, train_3903258334, train_3441737294, train_1874494631, train_1944956984, train_595049990, train_1364655400, train_4103350755]","[train_1357739170, train_4121616168, train_2514049518]",[],"[train_4121616168, train_2514049518]"
2,0.037037,"[train_4184037897, train_1010868925, train_1561375840, train_2963630570, train_2382946865, train_2486671168, train_61811259, train_2358518833, train_1521931249, train_408229003, train_543954089, train_3486600899, train_417261045, train_2344885693, train_226112794, train_1423119969, train_1374193074, train_880338666, train_210204308, train_2073151758, train_303073611, train_1750065022, train_59393500, train_2864046583, train_2566709185, train_3307737696, train_47863261, train_11694834, train_3434987130, train_1000804730, train_1500350068, train_1641622956, train_3173013958, train_784571098, train_2074205316, train_1302902041, train_1785829508, train_1714224702, train_2919386341, train_3897008118, train_64918479, train_843663648, train_477561587, train_1643485252, train_889614566, train_118604281, train_1157582002, train_1710200794, train_3099122287, train_1431505272, train_2767483557]","[train_2677414178, train_3486600899, train_242776411]",[],"[train_242776411, train_2677414178]"
3,0.037736,"[train_4184037897, train_1010868925, train_1561375840, train_2963630570, train_2382946865, train_2486671168, train_61811259, train_2358518833, train_1521931249, train_408229003, train_543954089, train_3486600899, train_417261045, train_2344885693, train_226112794, train_1423119969, train_1374193074, train_880338666, train_210204308, train_2073151758, train_303073611, train_1750065022, train_59393500, train_2864046583, train_2566709185, train_3307737696, train_47863261, train_11694834, train_3434987130, train_1000804730, train_1500350068, train_1641622956, train_3173013958, train_784571098, train_2074205316, train_1302902041, train_1785829508, train_1714224702, train_2919386341, train_3897008118, train_64918479, train_843663648, train_477561587, train_1643485252, train_889614566, train_118604281, train_1157582002, train_1710200794, train_3099122287, train_1431505272, train_2767483557]","[train_2149797782, train_1302902041]",[],[train_2149797782]
4,0.037736,"[train_37143550, train_513102842, train_3524722132, train_983584924, train_305118117, train_3082994608, train_2281819964, train_406475183, train_2440428831, train_1795803441, train_3012501582, train_2191483137, train_2735482142, train_931662080, train_35171381, train_3998154540, train_353877543, train_2005660717, train_438036326, train_1190240122, train_3785317484, train_1357739170, train_1156325383, train_3254405700, train_90356826, train_49782600, train_163710515, train_2200661537, train_1975054729, train_901922858, train_1775427584, train_3382714342, train_2326532166, train_253683720, train_232189854, train_548782513, train_2548010081, train_463544835, train_2870807441, train_1062046612, train_2102271992, train_1905898995, train_2880913122, train_2494367387, train_3903258334, train_3441737294, train_1874494631, train_1944956984, train_595049990, train_1364655400, train_4103350755]","[train_2765497796, train_2870807441]",[],[train_2765497796]
