In [1]:
import os
import math
import numpy as np
import pandas as pd
from sklearn.neighbors import NearestNeighbors
import tensorflow as tf
from tensorflow import keras
import py
import mylib
import cv2 as cv
import pytesseract
from tqdm import tqdm
from typing import Optional, List, Dict, Set, Tuple
from scml.nlp import strip_punctuation, to_ascii_str

In [2]:
# Notebook-wide settings.
# Disabled: would cap OpenMP threads (presumably for the Tesseract OCR calls further down -- confirm before enabling).
#os.environ["OMP_THREAD_LIMIT"] = "1"
# Ask pandas to treat +/-inf as missing values.
pd.set_option("use_inf_as_na", True)
# Raise the display limits so wide frames and long list-valued cells print without truncation.
pd.set_option("display.max_columns", 9999)
pd.set_option("display.max_rows", 9999)
pd.set_option('max_colwidth', 9999)

In [3]:
# Load the training metadata and derive the helper columns used by the later cells.
train = pd.read_csv("input/train.csv", engine="c", low_memory=False)
# Ground-truth match list per row, produced by the project helper
# (presumably all posting_ids sharing the row's label_group -- confirm in mylib).
train["target"] = mylib.target_label(train)
# Relative on-disk path of each posting's image.
train["image_path"] = "input/train_images/" + train["image"]
# Posting ids in row order; later cells index this list with row positions.
posting_ids = train["posting_id"].tolist()
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 34250 entries, 0 to 34249
Data columns (total 7 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   posting_id   34250 non-null  object
 1   image        34250 non-null  object
 2   image_phash  34250 non-null  object
 3   title        34250 non-null  object
 4   label_group  34250 non-null  int64 
 5   target       34250 non-null  object
 6   image_path   34250 non-null  object
dtypes: int64(1), object(6)
memory usage: 1.8+ MB


In [4]:
class ArcMarginProduct(keras.layers.Layer):
    '''
    Implements large margin arc distance.

    The layer takes ``[X, y]`` (embeddings and integer class labels) and
    returns cosine logits scaled by ``s``, where the logit of the labelled
    class has the additive angular margin ``m`` applied.

    Args:
        n_classes: number of classes, i.e. columns of the weight matrix.
        s: scale multiplied into the final logits.
        m: additive angular margin in radians.
        easy_margin: if True the margin is applied only where cosine > 0;
            otherwise ``cosine - mm`` is used wherever ``cosine <= th``.
        ls_eps: label-smoothing factor applied to the one-hot labels;
            values <= 0 disable smoothing.

    Reference:
        https://arxiv.org/pdf/1801.07698.pdf
        https://github.com/lyakaap/Landmark2019-1st-and-3rd-Place-Solution/
            blob/master/src/modeling/metric_learning.py
    '''
    def __init__(self, n_classes, s=30, m=0.50, easy_margin=False,
                 ls_eps=0.0, **kwargs):

        super(ArcMarginProduct, self).__init__(**kwargs)

        self.n_classes = n_classes
        self.s = s
        self.m = m
        self.ls_eps = ls_eps
        self.easy_margin = easy_margin
        # Constants derived from the margin, computed once.
        self.cos_m = tf.math.cos(m)
        self.sin_m = tf.math.sin(m)
        # Threshold and penalty used by the non-easy-margin branch in call().
        self.th = tf.math.cos(math.pi - m)
        self.mm = tf.math.sin(math.pi - m) * m

    def get_config(self):
        '''Return the constructor arguments so the layer can be re-created on load.'''
        config = super().get_config().copy()
        config.update({
            'n_classes': self.n_classes,
            's': self.s,
            'm': self.m,
            'ls_eps': self.ls_eps,
            'easy_margin': self.easy_margin,
        })
        return config

    def build(self, input_shape):
        '''Create the class-centre matrix W of shape (embedding_dim, n_classes).

        ``input_shape`` is a pair of shapes; the first one (the embeddings)
        determines the embedding dimension.
        '''
        super(ArcMarginProduct, self).build(input_shape[0])

        self.W = self.add_weight(
            name='W',
            shape=(int(input_shape[0][-1]), self.n_classes),
            initializer='glorot_uniform',
            dtype='float32',
            trainable=True,
            regularizer=None)

    def call(self, inputs):
        '''Compute the margin-adjusted, scaled cosine logits for ``inputs = (X, y)``.

        NOTE(review): assumes y is a 1-D vector of class indices so that the
        one-hot tensor has the same shape as the cosine matrix -- confirm.
        '''
        X, y = inputs
        y = tf.cast(y, dtype=tf.int32)
        # Cosine similarity between L2-normalised embeddings and class centres.
        cosine = tf.matmul(
            tf.math.l2_normalize(X, axis=1),
            tf.math.l2_normalize(self.W, axis=0)
        )
        # Rounding can push cosine**2 marginally above 1; clip the argument so
        # sqrt never receives a negative value (which would yield NaN).
        sine = tf.math.sqrt(
            tf.clip_by_value(1.0 - tf.math.pow(cosine, 2), 0.0, 1.0)
        )
        # cos(theta + m) via the angle-addition identity.
        phi = cosine * self.cos_m - sine * self.sin_m
        if self.easy_margin:
            phi = tf.where(cosine > 0, phi, cosine)
        else:
            phi = tf.where(cosine > self.th, phi, cosine - self.mm)
        one_hot = tf.cast(
            tf.one_hot(y, depth=self.n_classes),
            dtype=cosine.dtype
        )
        if self.ls_eps > 0:
            one_hot = (1 - self.ls_eps) * one_hot + self.ls_eps / self.n_classes

        # Margin-adjusted logit for the labelled class, plain cosine elsewhere.
        output = (one_hot * phi) + ((1.0 - one_hot) * cosine)
        output *= self.s
        return output

In [5]:
model_dir = "models/eb3_arc_20210502_0000"
# Load the fold-0 training model. compile=False: the model is only used for
# inference here, so the saved training configuration need not be restored.
m0 = keras.models.load_model(
    f"{model_dir}/trial_0/fold_0/model.h5",
    custom_objects={"ArcMarginProduct": ArcMarginProduct},
    compile=False,
)
# Keep only the path from the first model input to the "embedding_output" layer.
m0 = keras.models.Model(inputs=m0.input[0], outputs=m0.get_layer("embedding_output").output)
m0.summary()

Model: "model"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
image_input (InputLayer)     [(None, 300, 300, 3)]     0         
_________________________________________________________________
efficientnetb3 (Functional)  (None, 1536)              10783535  
_________________________________________________________________
layer_normalization_1 (Layer (None, 1536)              3072      
_________________________________________________________________
dense_1 (Dense)              (None, 1536)              2360832   
_________________________________________________________________
embedding_output (LayerNorma (None, 1536)              3072      
Total params: 13,150,511
Trainable params: 2,366,976
Non-trainable params: 10,783,535
_________________________________________________________________


In [6]:
# Load the fold-1 training model. compile=False: the model is only used for
# inference here, so the saved training configuration need not be restored.
m1 = keras.models.load_model(
    f"{model_dir}/trial_0/fold_1/model.h5",
    custom_objects={"ArcMarginProduct": ArcMarginProduct},
    compile=False,
)
# Keep only the path from the first model input to the "embedding_output" layer.
m1 = keras.models.Model(inputs=m1.input[0], outputs=m1.get_layer("embedding_output").output)
m1.summary()

Model: "model_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
image_input (InputLayer)     [(None, 300, 300, 3)]     0         
_________________________________________________________________
efficientnetb3 (Functional)  (None, 1536)              10783535  
_________________________________________________________________
layer_normalization_2 (Layer (None, 1536)              3072      
_________________________________________________________________
dense_2 (Dense)              (None, 1536)              2360832   
_________________________________________________________________
embedding_output (LayerNorma (None, 1536)              3072      
Total params: 13,150,511
Trainable params: 2,366,976
Non-trainable params: 10,783,535
_________________________________________________________________


In [7]:
# Load the fold-2 training model. compile=False: the model is only used for
# inference here, so the saved training configuration need not be restored.
# NOTE: the prediction cell currently has the m2 call commented out.
m2 = keras.models.load_model(
    f"{model_dir}/trial_0/fold_2/model.h5",
    custom_objects={"ArcMarginProduct": ArcMarginProduct},
    compile=False,
)
# Keep only the path from the first model input to the "embedding_output" layer.
m2 = keras.models.Model(inputs=m2.input[0], outputs=m2.get_layer("embedding_output").output)
m2.summary()

Model: "model_2"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
image_input (InputLayer)     [(None, 300, 300, 3)]     0         
_________________________________________________________________
efficientnetb3 (Functional)  (None, 1536)              10783535  
_________________________________________________________________
layer_normalization_3 (Layer (None, 1536)              3072      
_________________________________________________________________
dense_3 (Dense)              (None, 1536)              2360832   
_________________________________________________________________
embedding_output (LayerNorma (None, 1536)              3072      
Total params: 13,150,511
Trainable params: 2,366,976
Non-trainable params: 10,783,535
_________________________________________________________________


In [8]:
# Load the fold-3 training model. compile=False: the model is only used for
# inference here, so the saved training configuration need not be restored.
# NOTE: the prediction cell currently has the m3 call commented out.
m3 = keras.models.load_model(
    f"{model_dir}/trial_0/fold_3/model.h5",
    custom_objects={"ArcMarginProduct": ArcMarginProduct},
    compile=False,
)
# Keep only the path from the first model input to the "embedding_output" layer.
m3 = keras.models.Model(inputs=m3.input[0], outputs=m3.get_layer("embedding_output").output)
m3.summary()

Model: "model_3"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
image_input (InputLayer)     [(None, 300, 300, 3)]     0         
_________________________________________________________________
efficientnetb3 (Functional)  (None, 1536)              10783535  
_________________________________________________________________
layer_normalization_4 (Layer (None, 1536)              3072      
_________________________________________________________________
dense_4 (Dense)              (None, 1536)              2360832   
_________________________________________________________________
embedding_output (LayerNorma (None, 1536)              3072      
Total params: 13,150,511
Trainable params: 2,366,976
Non-trainable params: 10,783,535
_________________________________________________________________


In [9]:
# Load the fold-4 training model. compile=False: the model is only used for
# inference here, so the saved training configuration need not be restored.
# NOTE: the prediction cell currently has the m4 call commented out.
m4 = keras.models.load_model(
    f"{model_dir}/trial_0/fold_4/model.h5",
    custom_objects={"ArcMarginProduct": ArcMarginProduct},
    compile=False,
)
# Keep only the path from the first model input to the "embedding_output" layer.
m4 = keras.models.Model(inputs=m4.input[0], outputs=m4.get_layer("embedding_output").output)
m4.summary()

Model: "model_4"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
image_input (InputLayer)     [(None, 300, 300, 3)]     0         
_________________________________________________________________
efficientnetb3 (Functional)  (None, 1536)              10783535  
_________________________________________________________________
layer_normalization_5 (Layer (None, 1536)              3072      
_________________________________________________________________
dense_5 (Dense)              (None, 1536)              2360832   
_________________________________________________________________
embedding_output (LayerNorma (None, 1536)              3072      
Total params: 13,150,511
Trainable params: 2,366,976
Non-trainable params: 10,783,535
_________________________________________________________________


In [10]:
# Image pipeline: rescale pixel values by 1/255 and emit float32, channels-last batches.
idg = keras.preprocessing.image.ImageDataGenerator(
    dtype=np.float32,
    data_format="channels_last",
    rescale=1.0 / 255,
)
# shuffle=False so the batches follow the row order of `train`.
data = idg.flow_from_dataframe(
    dataframe=train,
    directory="input/train_images",
    x_col="image",
    y_col="label_group",
    class_mode="raw",
    color_mode="rgb",
    target_size=(300, 300),
    interpolation="nearest",
    batch_size=1024,
    shuffle=False,
)
# Embed every image with each fold model in turn (m0 first, then m1).
# Folds 2-4 are left out for now; to include them, predict with m2, m3, m4
# as well and extend the shape check below.
y0, y1 = (fold_model.predict(data, verbose=1) for fold_model in (m0, m1))
assert y1.shape == y0.shape
print(f"y0.shape={y0.shape}")

Found 34250 validated image filenames.
y0.shape=(34250, 1536)


In [11]:
# Ensemble the fold models: element-wise mean of their embeddings, computed in
# one vectorised call instead of a Python loop over every row.
fold_embeddings = (y0, y1)  # extend with y2, y3, y4 once those folds are predicted
em = np.mean(np.stack(fold_embeddings, axis=0), axis=0).astype(np.float32, copy=False)
assert y0.shape == em.shape
print(f"em.shape={em.shape}")

em.shape=(34250, 1536)


In [12]:
%%time
# Maximum euclidean distance for a neighbour to count as an image match.
threshold = 0.1
nn = NearestNeighbors(
    metric="euclidean",
    n_neighbors=min(49, len(posting_ids) - 1),
)
nn.fit(em)
# Called without a query set: the neighbours of each fitted point are returned.
distances, indices = nn.kneighbors()
res: List[List[str]] = []
for row_distances, row_indices in zip(distances, indices):
    matched: List[str] = []
    for distance, neighbour in zip(row_distances, row_indices):
        # Stop at the first neighbour beyond the threshold.
        if distance > threshold:
            break
        matched.append(posting_ids[neighbour])
    res.append(matched)
train["image_matches"] = res

CPU times: user 2min 21s, sys: 1min 21s, total: 3min 42s
Wall time: 1min 8s


th=.25, f1=.586
th=.30, f1=.586
th=.35, f1=.587
th=.40, f1=.583

In [13]:
%%time
# Match postings on the image_phash column via the project helper.
# NOTE(review): threshold=0.3 is presumably a normalised hash-distance cut-off -- confirm in mylib.phash_matches.
train["phash_matches"] = mylib.phash_matches(train, threshold=0.3)

CPU times: user 27.7 s, sys: 14.6 s, total: 42.3 s
Wall time: 42.3 s


In [14]:
%%time
# Row-wise title preprocessing: mylib.preprocess("title") supplies the callable that apply() runs on every row.
train["title_p"] = train.apply(mylib.preprocess("title"), axis=1)

CPU times: user 32.8 s, sys: 125 ms, total: 33 s
Wall time: 33 s


In [15]:
%%time
# Sentence-transformer checkpoint used for the titles; the alternatives tried are kept below.
st_name = "stsb-distilbert-base"
#st_name = "paraphrase-distilroberta-base-v1"
#st_name = "paraphrase-xlm-r-multilingual-v1"
# Match postings on their preprocessed titles via the project helper.
# NOTE(review): threshold=0.5 is presumably a similarity/distance cut-off inside mylib.sbert_matches -- confirm.
train["title_matches"] = mylib.sbert_matches(
    model_path=f"pretrained/sentence-transformers/{st_name}",
    sentences=train["title_p"].tolist(),
    posting_ids=posting_ids,
    threshold=0.5
)

CPU times: user 20min 19s, sys: 1min 4s, total: 21min 23s
Wall time: 4min 3s


In [16]:
def erode_dilate(img):
    """Erode then dilate `img` once each with a 2x2 all-ones kernel and return the result."""
    structuring_element = np.ones((2, 2), np.uint8)
    eroded = cv.erode(img, structuring_element, iterations=1)
    return cv.dilate(eroded, structuring_element, iterations=1)


def _ocr_thresholded(img, threshold_type: int, neighbours: int, timeout: float, config: str) -> Optional[str]:
    """Adaptive-threshold `img`, apply erode_dilate and OCR it; return None if the OCR call fails."""
    th = cv.adaptiveThreshold(img, 255, cv.ADAPTIVE_THRESH_GAUSSIAN_C, threshold_type, neighbours, 2)
    th = erode_dilate(th)
    try:
        return pytesseract.image_to_string(th, timeout=timeout, config=config)
    except Exception:
        # Best-effort OCR: a failed or timed-out call is reported as None.
        # `Exception` (not a bare except) lets KeyboardInterrupt/SystemExit propagate.
        return None


def _ocr_tokens(raw: str) -> List[str]:
    """Convert OCR output to ASCII, strip punctuation and split it on whitespace."""
    return strip_punctuation(to_ascii_str(raw)).split()


def image_to_text(img_path, mode: str, timeout: float, neighbours: int=41, psm: int=3) -> Optional[str]:
    """Extract text from an image file with Tesseract.

    The image is read as grayscale, median-blurred, binarised with an adaptive
    Gaussian threshold and passed to pytesseract.

    Args:
        img_path: path of the image file.
        mode: "binary" (THRESH_BINARY), "inverted" (THRESH_BINARY_INV) or
            "binary_inverted" (both passes, binary first). Any other value
            runs no pass and therefore returns None.
        timeout: timeout forwarded to pytesseract for each OCR call.
        neighbours: block size passed to cv.adaptiveThreshold.
        psm: Tesseract page segmentation mode (passed as ``--psm``).

    Returns:
        The cleaned tokens joined by single spaces, or None when no OCR pass
        produced a result (failure, timeout or unrecognised mode).

    Raises:
        OSError: if the image cannot be read from `img_path`.
    """
    config = f"--psm {psm}"
    img = cv.imread(img_path, cv.IMREAD_GRAYSCALE)
    if img is None:
        # cv.imread signals failure by returning None; fail with a clear
        # message instead of an opaque OpenCV error in the next call.
        raise OSError(f"Could not read image (missing or undecodable): {img_path}")
    img = cv.medianBlur(img, 3)

    s1, s2 = None, None
    if mode in ("binary", "binary_inverted"):
        s1 = _ocr_thresholded(img, cv.THRESH_BINARY, neighbours, timeout, config)
    if mode in ("inverted", "binary_inverted"):
        s2 = _ocr_thresholded(img, cv.THRESH_BINARY_INV, neighbours, timeout, config)
    if s1 is None and s2 is None:
        return None

    tokens: List[str] = []
    for raw in (s1, s2):
        if raw is not None:
            tokens += _ocr_tokens(raw)
    return " ".join(tokens)

In [17]:
# Master switch for the (slow) OCR stage; the cells below check it too.
OCR = False
if OCR:
    res = []
    n_timeout = 0
    # Iterate the path column directly: tqdm can then read the length and show
    # a total, and no full row tuples are built just to read one field.
    for img_path in tqdm(train["image_path"]):
        s = image_to_text(img_path, mode="inverted", timeout=0.4, neighbours=41, psm=11)
        if s is None:
            # image_to_text returned no text; store an empty string and count it.
            s = ""
            n_timeout += 1
        res.append(s)
    print(f"n_timeout={n_timeout}")

In [18]:
if OCR:
    # OCR text per row, plus title and OCR text combined into one column.
    train["itext"] = res
    train["text"] = train["title"] + " " + train["itext"]
    cols = ["text", "itext", "title"]
    # An expression nested inside `if` is not the cell's last top-level
    # expression, so the notebook would not render it; display it explicitly.
    display(train[cols].head())

In [19]:
%%time
if OCR:
    # Same row-wise preprocessing helper as for titles, applied to the combined title + OCR text column.
    train["text_p"] = train.apply(mylib.preprocess("text"), axis=1)

CPU times: user 0 ns, sys: 0 ns, total: 0 ns
Wall time: 10.5 µs


In [20]:
if OCR:
    # Sentence-transformer checkpoint used for the combined text; the alternatives tried are kept below.
    st_name = "stsb-distilbert-base"
    #st_name = "paraphrase-distilroberta-base-v1"
    #st_name = "paraphrase-xlm-r-multilingual-v1"
    # Match postings on the preprocessed title + OCR text via the project helper.
    # NOTE(review): threshold=0.5 is presumably a similarity/distance cut-off inside mylib.sbert_matches -- confirm.
    train["text_matches"] = mylib.sbert_matches(
        model_path=f"pretrained/sentence-transformers/{st_name}",
        sentences=train["text_p"].tolist(),
        posting_ids=posting_ids,
        threshold=0.5
    )

In [21]:
# Columns holding the per-method match lists to be merged.
fs = ["phash_matches", "title_matches", "image_matches"]
if OCR:
    fs.append("text_matches")
# Merge the per-method lists row by row, then score every row with the project helpers
# (presumably the row-level F1 of "matches" against "target" -- confirm in mylib.metric_per_row).
train["matches"] = train.apply(mylib.combine_as_list(fs), axis=1)
train["f1"] = train.apply(mylib.metric_per_row("matches"), axis=1)
print(f"Combined score={train.f1.mean():.3f}")

Combined score=0.090


In [22]:
def _record(score, text_threshold=None, ocr_threshold=None, ocr_timeout=None,
            ocr_neighbours=None, ocr_psm=None):
    """Build one experiment record; the phash/title/image thresholds are the same in every run."""
    return {
        "score": score,
        "phash_threshold": 0.3,
        "title_threshold": 0.5,
        "image_threshold": 0.5,
        "text_threshold": text_threshold,
        "ocr_threshold": ocr_threshold,
        "ocr_timeout": ocr_timeout,
        "ocr_neighbours": ocr_neighbours,
        "ocr_psm": ocr_psm,
    }


# Hand-maintained log of experiment runs; the "ocr_threshold" field stores the OCR mode used.
res = [
    _record(0.088),
    _record(0.674, text_threshold=0.5, ocr_threshold="inverted",
            ocr_timeout=0.4, ocr_neighbours=41, ocr_psm=11),
    _record(0.674, text_threshold=0.5, ocr_threshold="binary",
            ocr_timeout=0.4, ocr_neighbours=41, ocr_psm=11),
    _record(0.674),
]
# Best score first; transposed so each run reads as a column.
df = pd.DataFrame.from_records(res).sort_values(
    "score", ascending=False, ignore_index=True
)
df.T.head(30)

Unnamed: 0,0,1,2,3
score,0.674,0.674,0.674,0.088
phash_threshold,0.3,0.3,0.3,0.3
title_threshold,0.5,0.5,0.5,0.5
image_threshold,0.5,0.5,0.5,0.5
text_threshold,0.5,0.5,,
ocr_threshold,inverted,binary,,
ocr_timeout,0.4,0.4,,
ocr_neighbours,41.0,41.0,,
ocr_psm,11.0,11.0,,


In [23]:
# Show score, ground truth and the combined matches next to each per-method match column listed in `fs`.
cols = ["f1", "target", "matches"] + fs
train[cols].head(30)

Unnamed: 0,f1,target,matches,phash_matches,title_matches,image_matches
0,0.075472,"[train_129225211, train_2278313361]","[train_129225211, train_1309289735, train_2366339891, train_243513028, train_200903631, train_2748139905, train_2573648452, train_2454588595, train_1787280150, train_991683036, train_3088297643, train_3041150821, train_1439009721, train_1146285530, train_432509608, train_2278313361, train_2760452222, train_1861408010, train_3293716060, train_3359300733, train_1899884457, train_2159791112, train_3265145891, train_1388515987, train_753533759, train_2328684746, train_32476901, train_1952799344, train_4180620038, train_2360647092, train_3357267393, train_1938566453, train_685214387, train_13672904, train_2954084829, train_2120095754, train_2745874524, train_3435796905, train_3556769241, train_3980754834, train_1310160910, train_3122164478, train_3036907789, train_3668342417, train_1645355366, train_335719894, train_55865111, train_274485277, train_6477270, train_2795921944, train_1340791296]",[],[train_2278313361],"[train_200903631, train_6477270, train_1861408010, train_3435796905, train_2159791112, train_13672904, train_3357267393, train_1952799344, train_3036907789, train_2745874524, train_2454588595, train_3293716060, train_3359300733, train_1787280150, train_432509608, train_3041150821, train_991683036, train_1146285530, train_1340791296, train_335719894, train_753533759, train_2120095754, train_1309289735, train_55865111, train_3122164478, train_3556769241, train_1899884457, train_1938566453, train_1645355366, train_274485277, train_3088297643, train_4180620038, train_1439009721, train_3980754834, train_2366339891, train_2328684746, train_685214387, train_32476901, train_243513028, train_2954084829, train_2748139905, train_2360647092, train_1310160910, train_2573648452, train_1388515987, train_3265145891, train_2795921944, train_2760452222, train_3668342417]"
1,0.075472,"[train_3386243561, train_3423213080]","[train_3133415900, train_2202259592, train_1395269509, train_2323869681, train_1164304703, train_1968568473, train_3586681855, train_2807111137, train_3024467494, train_2147846099, train_1081931791, train_290486421, train_1927662855, train_2403269575, train_755003837, train_3386243561, train_3430807642, train_1378218030, train_3730004566, train_580639524, train_2591684002, train_3281335331, train_3243886803, train_3423213080, train_626830736, train_2371105215, train_3938323603, train_1188715677, train_3544995555, train_690190523, train_3267429805, train_1909423994, train_2838573674, train_3605286262, train_658167853, train_1026884481, train_2994234361, train_910933116, train_4044410214, train_426195351, train_3441550980, train_3548269661, train_2545348563, train_2758967773, train_2509069129, train_1264882101, train_753570045, train_2558647836, train_2522158364, train_900519743, train_632745654]",[],[train_3423213080],"[train_2545348563, train_2147846099, train_3441550980, train_2202259592, train_1081931791, train_910933116, train_580639524, train_1968568473, train_2807111137, train_3243886803, train_2371105215, train_2994234361, train_3544995555, train_2838573674, train_2323869681, train_626830736, train_632745654, train_3024467494, train_2558647836, train_2403269575, train_290486421, train_2758967773, train_3548269661, train_1927662855, train_1188715677, train_1164304703, train_900519743, train_1378218030, train_4044410214, train_690190523, train_3730004566, train_426195351, train_3133415900, train_3586681855, train_1264882101, train_1026884481, train_1395269509, train_753570045, train_2509069129, train_3281335331, train_2591684002, train_658167853, train_3605286262, train_3267429805, train_1909423994, train_755003837, train_2522158364, train_3430807642, train_3938323603]"
2,0.075472,"[train_2288590299, train_3803689425]","[train_63996627, train_3803689425, train_2185892330, train_4139437520, train_2161215884, train_885706109, train_1569846146, train_646522662, train_2166590500, train_2564717706, train_174183094, train_3592352534, train_1751020705, train_754866683, train_2462443684, train_1689011486, train_3571679985, train_1522436210, train_1129270557, train_236443778, train_3924805715, train_2288590299, train_3256768265, train_1889634838, train_2014708552, train_1823753834, train_1027285265, train_3939596147, train_1026730946, train_921763449, train_2858688688, train_1489322613, train_3340794377, train_1866688264, train_3139218180, train_3098628279, train_1496467514, train_1619919993, train_1550228268, train_2416390351, train_862768853, train_3212356069, train_2348128412, train_1455336593, train_2265651088, train_1144774622, train_3016630764, train_1624112584, train_3934337015, train_3035141351, train_239432105]",[],[train_3803689425],"[train_3924805715, train_2185892330, train_2265651088, train_1569846146, train_239432105, train_1489322613, train_1522436210, train_1823753834, train_3592352534, train_1496467514, train_1026730946, train_63996627, train_3098628279, train_2564717706, train_2858688688, train_1129270557, train_885706109, train_1624112584, train_1455336593, train_3212356069, train_754866683, train_3016630764, train_236443778, train_646522662, train_3340794377, train_862768853, train_3934337015, train_1550228268, train_921763449, train_1866688264, train_2462443684, train_3571679985, train_2166590500, train_1689011486, train_2416390351, train_3035141351, train_3256768265, train_174183094, train_2348128412, train_3139218180, train_1751020705, train_2014708552, train_1027285265, train_4139437520, train_3939596147, train_2161215884, train_1889634838, train_1619919993, train_1144774622]"
3,0.036364,"[train_2406599165, train_3342059966]","[train_3248441377, train_243513028, train_1837520086, train_2454588595, train_2760497146, train_3088297643, train_1056425019, train_2485621759, train_2704088184, train_2476652138, train_686441719, train_3067118885, train_2288514435, train_376321406, train_3952641780, train_3265145891, train_2018871378, train_3090266623, train_3526771004, train_1949045793, train_1744956981, train_205126562, train_2029562712, train_4148791584, train_2010316654, train_1952799344, train_4180620038, train_2698985189, train_1601836260, train_424680584, train_3576714541, train_3513028143, train_2833966941, train_3786847172, train_2799780712, train_1252383802, train_2336064338, train_170639565, train_1761051175, train_3402391379, train_4253798517, train_627328419, train_1364742229, train_861349498, train_1672586850, train_1092687314, train_2406599165, train_2240175202, train_4091790877, train_2045279456, train_274485277, train_2817530009, train_2096937844]",[],"[train_1744956981, train_3576714541, train_3526771004]","[train_4148791584, train_4253798517, train_2096937844, train_3248441377, train_2336064338, train_1952799344, train_3513028143, train_1761051175, train_2454588595, train_3088297643, train_4180620038, train_274485277, train_3402391379, train_4091790877, train_861349498, train_686441719, train_1056425019, train_376321406, train_3786847172, train_2799780712, train_243513028, train_1672586850, train_2010316654, train_2288514435, train_1092687314, train_2018871378, train_1364742229, train_627328419, train_3952641780, train_2817530009, train_3265145891, train_2698985189, train_2760497146, train_3090266623, train_1949045793, train_2476652138, train_170639565, train_1601836260, train_424680584, train_205126562, train_2045279456, train_2240175202, train_2833966941, train_2704088184, train_2029562712, train_2485621759, train_1837520086, train_3067118885, train_1252383802]"
4,0.075472,"[train_3369186413, train_921438619]","[train_3790921618, train_3829394802, train_234261561, train_3809881145, train_316108212, train_1225687579, train_518928170, train_813852942, train_2907999856, train_1074605792, train_489884886, train_2530551114, train_933366685, train_1480852021, train_1217335117, train_921438619, train_2816366923, train_3369186413, train_1601238489, train_2347604359, train_578470285, train_3851556677, train_2748376682, train_3410558364, train_1997808959, train_1736549347, train_3425409472, train_1177695212, train_522807539, train_930458613, train_3852005057, train_4206380801, train_2909113620, train_3289840180, train_3848732432, train_476013554, train_3767027123, train_3653943439, train_3576182994, train_2945969300, train_3466253051, train_1225847450, train_3010916169, train_1248358305, train_1443731780, train_1913761923, train_293196592, train_182839633, train_488709541, train_1154619027, train_3730229113]",[],[train_921438619],"[train_518928170, train_1480852021, train_2816366923, train_2909113620, train_3010916169, train_3809881145, train_1248358305, train_3730229113, train_1154619027, train_1177695212, train_3852005057, train_316108212, train_813852942, train_3425409472, train_182839633, train_2748376682, train_3410558364, train_2945969300, train_1601238489, train_1225687579, train_522807539, train_930458613, train_2907999856, train_578470285, train_3848732432, train_3289840180, train_3790921618, train_3829394802, train_1074605792, train_2347604359, train_489884886, train_933366685, train_3767027123, train_488709541, train_1225847450, train_3851556677, train_476013554, train_1443731780, train_1997808959, train_3653943439, train_4206380801, train_3576182994, train_1217335117, train_1913761923, train_3466253051, train_1736549347, train_2530551114, train_293196592, train_234261561]"
5,0.075472,"[train_2464356923, train_2753295474, train_305884580]","[train_1142799664, train_339248773, train_702690747, train_2691971460, train_470018962, train_1527680237, train_3695485002, train_804658471, train_2974807196, train_490281509, train_2848298126, train_477105242, train_345455943, train_979993973, train_1144979117, train_2150673056, train_2057764463, train_1059874205, train_4223515304, train_3781417721, train_1424741044, train_2372612525, train_361766242, train_3056131110, train_306951335, train_2464356923, train_1286268457, train_3696129043, train_2154507196, train_2753295474, train_3215382846, train_258004981, train_4208062667, train_3243826013, train_1029900113, train_3934526303, train_2891225845, train_581104351, train_962986655, train_46854445, train_3341330490, train_1635265624, train_1141098045, train_629893333, train_89792685, train_177144190, train_2783841899, train_2118604314, train_4026430365, train_3000574708]",[train_2753295474],[],"[train_470018962, train_1029900113, train_3215382846, train_4223515304, train_2154507196, train_962986655, train_2118604314, train_1527680237, train_3000574708, train_306951335, train_979993973, train_3056131110, train_3934526303, train_3695485002, train_46854445, train_2691971460, train_1144979117, train_3341330490, train_629893333, train_3696129043, train_89792685, train_3781417721, train_339248773, train_2891225845, train_2753295474, train_1141098045, train_1286268457, train_1142799664, train_2372612525, train_2974807196, train_1635265624, train_2150673056, train_345455943, train_477105242, train_177144190, train_702690747, train_3243826013, train_804658471, train_2783841899, train_1424741044, train_4208062667, train_2848298126, train_4026430365, train_361766242, train_581104351, train_258004981, train_2057764463, train_1059874205, train_490281509]"
6,0.068966,"[train_1802986387, train_1396161074, train_713073906, train_1275191373, train_2490201622, train_2411544001, train_1859060005]","[train_684593423, train_3390572471, train_4026347562, train_3082358948, train_2042278632, train_4253011463, train_1540889923, train_1168724268, train_300916181, train_1546390308, train_2098272543, train_748404710, train_2816424204, train_32208584, train_3359890363, train_1184289466, train_1634320893, train_3164517139, train_1379958338, train_4130662197, train_3138102712, train_1802986387, train_362302335, train_598260672, train_1516573836, train_3626316894, train_30790664, train_1265602885, train_4168582129, train_3444022543, train_1680347015, train_1392640327, train_3263469218, train_1696744036, train_280192455, train_4003609264, train_3227177026, train_1396161074, train_1816968361, train_492093082, train_692598438, train_580680223, train_3948327382, train_1873419172, train_2005531157, train_283454397, train_2329961706, train_3113328007, train_3630367129, train_3741046028, train_3580750137]",[],[train_1396161074],"[train_3626316894, train_692598438, train_3948327382, train_1546390308, train_32208584, train_684593423, train_1379958338, train_1634320893, train_1184289466, train_2329961706, train_283454397, train_1516573836, train_1168724268, train_300916181, train_1392640327, train_2005531157, train_3444022543, train_492093082, train_3390572471, train_1540889923, train_2098272543, train_598260672, train_580680223, train_3741046028, train_3263469218, train_4130662197, train_4168582129, train_4003609264, train_3082358948, train_1680347015, train_3164517139, train_3138102712, train_2816424204, train_1696744036, train_1873419172, train_30790664, train_3113328007, train_3359890363, train_4026347562, train_362302335, train_1265602885, train_1816968361, train_3580750137, train_280192455, train_3227177026, train_3630367129, train_2042278632, train_4253011463, train_748404710]"
7,0.038462,"[train_1806152124, train_3227306976]","[train_3579646090, train_1119950228, train_73914758, train_3138803353, train_1806152124, train_185348797, train_164400283, train_399435290, train_1579287993, train_2811725149, train_2502395000, train_3900072982, train_756122932, train_746062062, train_3234552667, train_2850622756, train_1437647096, train_2490092661, train_4028538739, train_177122973, train_120583569, train_182522005, train_429068901, train_166625623, train_545334724, train_3024280986, train_3181393727, train_2948591420, train_1421006265, train_4263385637, train_3884157725, train_2770305290, train_1531383583, train_3715501661, train_1262620130, train_1927779663, train_1013348276, train_644504147, train_2051260634, train_4198413354, train_1767090565, train_1061185057, train_964720681, train_1535611996, train_3979129442, train_3616034922, train_3480573800, train_1203829705, train_1218725871, train_4251674985]",[],[],"[train_73914758, train_2770305290, train_3024280986, train_177122973, train_1119950228, train_2051260634, train_1013348276, train_3138803353, train_3616034922, train_2811725149, train_1927779663, train_4198413354, train_185348797, train_545334724, train_644504147, train_1218725871, train_166625623, train_1535611996, train_164400283, train_1262620130, train_3480573800, train_756122932, train_120583569, train_399435290, train_964720681, train_3900072982, train_1437647096, train_1767090565, train_3884157725, train_3979129442, train_4028538739, train_2948591420, train_3181393727, train_3234552667, train_3715501661, train_4251674985, train_1531383583, train_1061185057, train_2490092661, train_746062062, train_182522005, train_3579646090, train_2502395000, train_1203829705, train_1579287993, train_1421006265, train_2850622756, train_4263385637, train_429068901]"
8,0.037736,"[train_86570404, train_2837452969, train_77364776]","[train_1074170511, train_2995246306, train_1007547921, train_460355753, train_3340567571, train_86570404, train_402969553, train_3220223860, train_595612322, train_1537873079, train_1189949692, train_519564161, train_3432898912, train_3439319573, train_2504152429, train_3526414204, train_1879498153, train_555959245, train_1164430959, train_3837417318, train_993188081, train_618728083, train_1082064373, train_1794636981, train_4289550952, train_1509228178, train_1060609338, train_3837607576, train_1930027239, train_2645427269, train_1443907224, train_177300313, train_1449432418, train_4164132244, train_2867388227, train_1759206121, train_693336917, train_420249086, train_1549316072, train_1397646595, train_140778492, train_4080544101, train_3704978123, train_3373500830, train_2946106600, train_13334119, train_318366004, train_3057739889, train_274036399, train_4230504640]",[],[],"[train_519564161, train_4080544101, train_1082064373, train_1794636981, train_3837417318, train_3526414204, train_3837607576, train_1164430959, train_3373500830, train_4289550952, train_2946106600, train_2645427269, train_402969553, train_2867388227, train_318366004, train_1007547921, train_3340567571, train_1930027239, train_1879498153, train_420249086, train_177300313, train_2504152429, train_595612322, train_1074170511, train_1537873079, train_1443907224, train_618728083, train_3439319573, train_4164132244, train_4230504640, train_1509228178, train_993188081, train_274036399, train_3704978123, train_1549316072, train_1759206121, train_140778492, train_3432898912, train_3220223860, train_13334119, train_693336917, train_1189949692, train_555959245, train_2995246306, train_460355753, train_3057739889, train_1397646595, train_1449432418, train_1060609338]"
9,0.038462,"[train_831680791, train_3031035861]","[train_123960860, train_3717130264, train_2012413405, train_3031235277, train_397285818, train_1670080376, train_2532354710, train_3247918077, train_4076652844, train_113472922, train_2583451782, train_1297449548, train_2983420652, train_2950073828, train_718919006, train_737471793, train_495294808, train_3011620107, train_1341171751, train_3237721314, train_282615373, train_543815701, train_3669906467, train_4066627404, train_4247731633, train_335040229, train_3942959422, train_3335337784, train_1219083130, train_3112943472, train_2154758897, train_4218151880, train_546070236, train_3550756971, train_33810486, train_2141708149, train_705512967, train_1492916855, train_1963165572, train_658404593, train_1140345955, train_1266761007, train_2065100009, train_2999166368, train_213029798, train_831680791, train_2773405758, train_2078625559, train_2743017919, train_2307037398]",[],[],"[train_4247731633, train_3717130264, train_705512967, train_123960860, train_213029798, train_4076652844, train_1963165572, train_2950073828, train_658404593, train_2983420652, train_1492916855, train_3942959422, train_1670080376, train_2999166368, train_3112943472, train_2012413405, train_2307037398, train_113472922, train_3011620107, train_3550756971, train_2773405758, train_1266761007, train_2583451782, train_737471793, train_3335337784, train_1297449548, train_4218151880, train_1140345955, train_1219083130, train_495294808, train_546070236, train_335040229, train_2154758897, train_2743017919, train_1341171751, train_2532354710, train_718919006, train_3669906467, train_543815701, train_4066627404, train_2065100009, train_33810486, train_2141708149, train_397285818, train_3237721314, train_3031235277, train_3247918077, train_282615373, train_2078625559]"


In [24]:
# Order the rows by F1 score, lowest first, so the poorest-scoring postings
# surface at the top. The sort is applied in place and the index is rebuilt
# as 0..n-1 (ascending order is the pandas default).
train.sort_values(by="f1", inplace=True, ignore_index=True)
# Show the five lowest-F1 rows, restricted to the inspection columns.
train[cols].head(5)

Unnamed: 0,f1,target,matches,phash_matches,title_matches,image_matches
0,0.019802,"[train_1671347582, train_1175819799, train_1525859862, train_1261404451, train_1865413913, train_943427380, train_3789517483, train_3171223531, train_623546295, train_3598710032, train_2375162891, train_3071392783, train_2960536609, train_1149371320, train_219504178, train_518547082, train_242506613, train_349984975, train_1848632939, train_3967271817, train_2309045515, train_100918337, train_612230357, train_913153922, train_450257935, train_1639533062, train_2378747866, train_2305367958, train_726415384, train_948110932, train_1041715554, train_149240701, train_4070413770, train_2901880479, train_897619948, train_1935153677, train_4211797822, train_1369435429, train_613832447, train_2624032383, train_2280558444, train_1182906906, train_1191962025, train_3700503493, train_3207964616, train_2529405184, train_4139159388, train_799917659, train_1630060072, train_282632898, train_3240884969]","[train_4223316428, train_2347650205, train_127090228, train_2167669212, train_1990118232, train_4266957336, train_1369435429, train_1146850576, train_3499107485, train_4252770623, train_4077115599, train_4045506704, train_3041237187, train_1398823299, train_2391922308, train_1607881011, train_1450330719, train_2628505961, train_1273780316, train_462541577, train_3759944592, train_728530820, train_3097670231, train_3227882310, train_3062074862, train_914489157, train_27726943, train_3097411763, train_3542363901, train_3680814205, train_1492895690, train_2102271992, train_818478179, train_574540438, train_617293422, train_515373139, train_2016221292, train_2788214893, train_2212043415, train_3375374759, train_3904448127, train_1643513946, train_4208761047, train_2625177498, train_1488886434, train_3647730166, train_461131136, train_2186527082, train_2044224389, train_635864748]",[],[],"[train_1607881011, train_2044224389, train_3680814205, train_2212043415, train_3499107485, train_1488886434, train_3375374759, train_2628505961, train_1990118232, train_515373139, 
train_27726943, train_2016221292, train_3904448127, train_3097670231, train_2167669212, train_127090228, train_617293422, train_1643513946, train_4208761047, train_2625177498, train_4252770623, train_2186527082, train_3041237187, train_4045506704, train_1398823299, train_4077115599, train_635864748, train_728530820, train_1492895690, train_3227882310, train_2788214893, train_3647730166, train_3097411763, train_1450330719, train_2347650205, train_574540438, train_1273780316, train_3062074862, train_818478179, train_2102271992, train_462541577, train_461131136, train_1146850576, train_3759944592, train_4266957336, train_3542363901, train_2391922308, train_914489157, train_4223316428]"
1,0.019802,"[train_207039286, train_1619338643, train_3721308865, train_1851882944, train_1236075269, train_2855329700, train_1997808959, train_2840508723, train_2729160884, train_2902736490, train_1789512220, train_2172967167, train_3083046401, train_3596545373, train_1673660532, train_675983017, train_799363063, train_2034561720, train_1459295979, train_1473844208, train_2283849789, train_30488954, train_2937433253, train_3851641413, train_3274592078, train_2847244918, train_4286138740, train_3560974982, train_2072061027, train_3134779648, train_3633103106, train_1906102508, train_3869843407, train_4280580992, train_3671554064, train_2683750575, train_1142799664, train_2513892071, train_3980820034, train_3055192895, train_1495753847, train_2266041454, train_2036340727, train_2338443281, train_467196729, train_1061695682, train_2258929039, train_4238545699, train_2160269935, train_1141098045, train_3414293340]","[train_252985147, train_957290611, train_1877277803, train_3608372934, train_1203116238, train_226883072, train_901065118, train_3272596230, train_1381454307, train_3629499796, train_87681822, train_3628020090, train_1824565844, train_1794537377, train_3449179875, train_3284124770, train_3368756977, train_3872408179, train_2653817710, train_1137495971, train_3002937896, train_3773611946, train_3281036138, train_3728970541, train_1944468611, train_3082338701, train_4036886801, train_2643153468, train_1458435983, train_778085922, train_3904838612, train_4072058526, train_3671554064, train_2248630472, train_1192550962, train_2645497743, train_3675691968, train_2326578941, train_709873163, train_2070416681, train_789885897, train_3967631975, train_3042616951, train_2049480052, train_1500963217, train_3746535839, train_4224187112, train_2068975294, train_3358036755, train_4002392833]",[],[],"[train_3904838612, train_789885897, train_3629499796, train_2068975294, train_1381454307, train_1500963217, train_2248630472, train_226883072, train_87681822, 
train_709873163, train_3675691968, train_3272596230, train_3746535839, train_3281036138, train_2070416681, train_2326578941, train_2653817710, train_3608372934, train_4002392833, train_3628020090, train_1203116238, train_2643153468, train_3872408179, train_901065118, train_3284124770, train_3002937896, train_1192550962, train_3449179875, train_778085922, train_1824565844, train_957290611, train_3368756977, train_2645497743, train_3358036755, train_3967631975, train_3773611946, train_3728970541, train_4036886801, train_4072058526, train_3042616951, train_2049480052, train_1794537377, train_3082338701, train_1944468611, train_1877277803, train_252985147, train_1458435983, train_1137495971, train_4224187112]"
2,0.019802,"[train_4206743389, train_984412308, train_121852154, train_762535630, train_3517115705, train_2262735115, train_3755300225, train_368430234, train_2317288464, train_3208392688, train_1458933053, train_4282331229, train_3949355563, train_1086352645, train_1446216827, train_2642455336, train_3838978688, train_98545496, train_3569591480, train_797513730, train_863149549, train_1580073987, train_2081303877, train_3194609880, train_678936454, train_2677777576, train_864736904, train_2184916710, train_1551941123, train_902429172, train_3338833167, train_1079287407, train_1966427691, train_2974564568, train_225670337, train_1905044128, train_1806160391, train_592604757, train_2128389901, train_4028145462, train_2333705544, train_686541160, train_2710828031, train_3340371184, train_1724556465, train_3003421228, train_1166656904, train_150004523, train_2057764463, train_1273401808, train_219281992]","[train_1446806219, train_2890840693, train_2625628124, train_3897715894, train_1184106017, train_521084183, train_4043119471, train_2275369863, train_2215579597, train_4220964327, train_589019250, train_349715306, train_2255128830, train_3098810504, train_2846404271, train_2333705544, train_1450257251, train_2446616959, train_131056969, train_486863369, train_4120272994, train_3077233694, train_2633099892, train_2871123156, train_1393926975, train_366905830, train_3027859528, train_4203480636, train_2151437244, train_720704979, train_1540799062, train_3458709564, train_2839644821, train_3089680369, train_1170782124, train_3475188104, train_1916430386, train_3336598360, train_410895741, train_277615634, train_4279405182, train_3558252318, train_1325200458, train_331269043, train_457446751, train_1255642456, train_1297112083, train_1469169243, train_2868779529, train_171012661]",[],[],"[train_3475188104, train_1446806219, train_3336598360, train_349715306, train_410895741, train_2890840693, train_486863369, train_277615634, train_4279405182, train_3027859528, 
train_1450257251, train_1916430386, train_1184106017, train_1469169243, train_366905830, train_131056969, train_2255128830, train_3098810504, train_2633099892, train_3558252318, train_1170782124, train_4043119471, train_1540799062, train_2625628124, train_4220964327, train_1255642456, train_171012661, train_3897715894, train_2215579597, train_3089680369, train_720704979, train_2868779529, train_1325200458, train_2871123156, train_4203480636, train_331269043, train_2151437244, train_2275369863, train_1297112083, train_2446616959, train_2846404271, train_589019250, train_1393926975, train_4120272994, train_3077233694, train_521084183, train_3458709564, train_457446751, train_2839644821]"
3,0.019802,"[train_3497907844, train_4175229751, train_2930186018, train_515008716, train_1354360830, train_516663932, train_2677100375, train_3480877149, train_2041937727, train_1699906038, train_3534764813, train_664339299, train_1296711926, train_1591104350, train_2918032299, train_2531777612, train_3731734583, train_1086903685, train_658862317, train_2123729460, train_2708192355, train_3454652975, train_1913449144, train_1540761220, train_3195325438, train_365461299, train_4060693827, train_2664816648, train_3332324456, train_1850201761, train_3956550694, train_1799291694, train_3178715139, train_2230152472, train_3945800827, train_2789820394, train_4003658555, train_240158006, train_2497676427, train_1943472851, train_711042017, train_2146279151, train_2813067780, train_846192553, train_627475146, train_3366673512, train_4199111972, train_2114123891, train_4178955354, train_112182868, train_3668806308]","[train_504965579, train_4071413192, train_795716890, train_1876077383, train_1211423023, train_2591707079, train_4231666385, train_1698846674, train_2070644662, train_2440532965, train_1347964248, train_2758645861, train_2091748956, train_913343891, train_3383491465, train_1923525047, train_2304779029, train_1319813882, train_3611663610, train_1932392577, train_4142808846, train_623713390, train_4097304947, train_2323168031, train_463489902, train_2958095326, train_1329909739, train_1952195376, train_2302420400, train_3068162759, train_2900038174, train_582378833, train_112182868, train_268024497, train_2568522711, train_4172997341, train_1923788617, train_1386324696, train_1321032898, train_578456215, train_4049723728, train_1276207556, train_4166405545, train_3336909646, train_1571033966, train_4292360632, train_2255217317, train_2702771053, train_3006397311, train_2929639614]",[],[],"[train_1276207556, train_1347964248, train_2070644662, train_913343891, train_4172997341, train_2758645861, train_1571033966, train_2302420400, train_3336909646, 
train_3068162759, train_2255217317, train_1329909739, train_1321032898, train_3383491465, train_2568522711, train_1923525047, train_463489902, train_1932392577, train_623713390, train_2091748956, train_1319813882, train_1211423023, train_582378833, train_3006397311, train_4071413192, train_3611663610, train_4231666385, train_1923788617, train_2929639614, train_4142808846, train_1386324696, train_4097304947, train_1952195376, train_2304779029, train_504965579, train_4292360632, train_2900038174, train_2440532965, train_4049723728, train_268024497, train_795716890, train_2702771053, train_2958095326, train_1876077383, train_2323168031, train_578456215, train_2591707079, train_4166405545, train_1698846674]"
4,0.019802,"[train_37143550, train_513102842, train_3524722132, train_983584924, train_305118117, train_3082994608, train_2281819964, train_406475183, train_2440428831, train_1795803441, train_3012501582, train_2191483137, train_2735482142, train_931662080, train_35171381, train_3998154540, train_353877543, train_2005660717, train_438036326, train_1190240122, train_3785317484, train_1357739170, train_1156325383, train_3254405700, train_90356826, train_49782600, train_163710515, train_2200661537, train_1975054729, train_901922858, train_1775427584, train_3382714342, train_2326532166, train_253683720, train_232189854, train_548782513, train_2548010081, train_463544835, train_2870807441, train_1062046612, train_2102271992, train_1905898995, train_2880913122, train_2494367387, train_3903258334, train_3441737294, train_1874494631, train_1944956984, train_595049990, train_1364655400, train_4103350755]","[train_2893123996, train_2323306906, train_3076387099, train_3169113646, train_3120519708, train_3541926042, train_3361682975, train_1943466047, train_494093746, train_2437293213, train_2492678162, train_240158006, train_4294536428, train_1174931869, train_2144881134, train_1964108397, train_478960551, train_4264226281, train_3405008424, train_2870807441, train_2936578934, train_1280676096, train_4223951707, train_3226910378, train_343332249, train_3496323471, train_2774250857, train_208589774, train_3903410467, train_3470882559, train_2720921085, train_3781848847, train_3584399802, train_2895720387, train_1056765120, train_3090681724, train_2755546390, train_3227446527, train_3506697307, train_565785054, train_2179692783, train_2124608846, train_2109114219, train_2981505684, train_921352661, train_3351248344, train_1497731060, train_2600079121, train_3643272318, train_3057844057]",[],[],"[train_240158006, train_2323306906, train_3227446527, train_2720921085, train_2124608846, train_2895720387, train_3781848847, train_1943466047, train_3541926042, train_2144881134, 
train_3470882559, train_3584399802, train_1964108397, train_3057844057, train_343332249, train_3405008424, train_565785054, train_3120519708, train_2755546390, train_1497731060, train_2179692783, train_3506697307, train_1056765120, train_2437293213, train_2600079121, train_3169113646, train_1174931869, train_3643272318, train_2936578934, train_4264226281, train_2981505684, train_4223951707, train_1280676096, train_921352661, train_208589774, train_4294536428, train_478960551, train_3496323471, train_2492678162, train_3226910378, train_2774250857, train_3090681724, train_494093746, train_2109114219, train_3351248344, train_3361682975, train_3076387099, train_3903410467, train_2893123996]"
