# Parts of 3rd Place solution

This is a simplified version of my 3rd place solution (train + inference) without trained embeddings; here I use LightAutoML instead of CatBoost.  
I also increased the individual embedding cutoffs so that the models can be trained within the Kaggle Kernels memory limit.


The general approach is described here: https://www.kaggle.com/c/shopee-product-matching/discussion/238515


Even simplified, it still scores in the Gold range. Good luck!

In [None]:
%%time
!pip install ../input/lama-whl/efficientnet_pytorch-0.7.0/dist/efficientnet_pytorch-0.7.0.tar ../input/lama-whl/log_calls-0.3.2/log_calls-0.3.2/ ../input/lama-whl/sphinxcontrib_devhelp-1.0.2-py2.py3-none-any.whl ../input/lama-whl/sphinxcontrib_htmlhelp-1.0.3-py2.py3-none-any.whl ../input/lama-whl/sphinxcontrib_jsmath-1.0.1-py2.py3-none-any.whl ../input/lama-whl/sphinxcontrib_qthelp-1.0.3-py2.py3-none-any.whl ../input/lama-whl/sphinxcontrib_applehelp-1.0.2-py2.py3-none-any.whl ../input/lama-whl/sphinxcontrib_serializinghtml-1.1.4-py2.py3-none-any.whl ../input/lama-whl/importlib_metadata-1.7.0-py2.py3-none-any.whl ../input/lama-whl/poetry_core-1.0.3-py2.py3-none-any.whl ../input/lama-whl/imagesize-1.2.0-py2.py3-none-any.whl ../input/lama-whl/docutils-0.16-py2.py3-none-any.whl ../input/lama-whl/alabaster-0.7.12-py2.py3-none-any.whl ../input/lama-whl/snowballstemmer-2.1.0-py2.py3-none-any.whl ../input/lama-whl/Sphinx-3.5.4-py3-none-any.whl ../input/lama-whl/sphinx_autodoc_typehints-1.11.1-py3-none-any.whl ../input/lama-whl/nbsphinx-0.8.0-py3-none-any.whl ../input/lama-whl/nbsphinx_link-1.3.0-py2.py3-none-any.whl ../input/lama-whl/cssselect-1.1.0-py2.py3-none-any.whl ../input/lama-whl/pyquery-1.4.3-py3-none-any.whl ../input/lama-whl/chuanconggao-html2json-0.2.4.1-0-g99d7fbb/chuanconggao-html2json-99d7fbb/ ../input/lama-whl/json2html-1.3.0/json2html-1.3.0 ../input/lama-whl/lightgbm-2.3.1-py2.py3-none-manylinux1_x86_64.whl ../input/lama-whl/AutoWoE-1.2.1-py3-none-any.whl ../input/lama-whl/LightAutoML-0.2.14-py3-none-any.whl

In [None]:
%%time
!pip install /kaggle/input/nvidia-dali/nvidia_dali_cuda100-1.1.0-2239998-py3-none-manylinux2014_x86_64.whl

In [None]:
%%time
import sys
!cp -r ../input/clip-pretrained/CLIP/CLIP-main /tmp/

# Kaggle likes to unpack .gz files in datasets... so we have to pack it back
!gzip -c /tmp/CLIP-main/clip/bpe_simple_vocab_16e6.txt > /tmp/CLIP-main/clip/bpe_simple_vocab_16e6.txt.gz
sys.path.append('/tmp/CLIP-main')
!cp -r /tmp/CLIP-main/clip /opt/conda/lib/python3.7/site-packages/

!pip install ../input/clip-pretrained/ftfy-5.9/ftfy-5.9 \
             ../input/clip-pretrained/torchvision-0.8.2+cu110-cp37-cp37m-linux_x86_64.whl \
             ../input/clip-pretrained/torch-1.7.1+cu110-cp37-cp37m-linux_x86_64.whl 

In [None]:
# Input locations used later in the notebook
EFFNET_PATH = '../input/shopee-effnet/effnet_b2.pth'
ID_BERT_PATH = '../input/shopee-id-bert/'
ML_PATH = '../input/shopee-ml/'

# With DEBUG switched on, the name stored in `test` is 'train' instead of 'test'
DEBUG = False

test = 'train' if DEBUG else 'test'

In [None]:
import sys

sys.path.append('../input/shopee-effnet/efficientnet_pytorch-0.7.0/')

import numpy as np
import pandas as pd
import joblib

import tqdm

import torch
import os
import gc
import networkx as nx
import treelite
import json
import cuml
import clip
import catboost as cb

import nvidia.dali.ops as ops
import nvidia.dali.types as types

from pandas import Series, DataFrame
from efficientnet_pytorch import EfficientNet
from transformers import AlbertTokenizer, AlbertModel, BertTokenizer, BertModel, \
        AutoModel, AutoTokenizer, BertTokenizerFast

from lightautoml.automl.presets.tabular_presets import TabularAutoML
from lightautoml.tasks import Task

from copy import deepcopy

from sklearn.metrics import roc_auc_score
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer


from nvidia.dali.pipeline import Pipeline
from nvidia.dali.plugin.pytorch import DALIGenericIterator
from torch.utils.data import DataLoader, Dataset
from torch.cuda.amp import autocast

### Define utils

In [None]:
def get_validation_folds(df, nfolds=5, random_state=42):
    """
    Build validation fold ids so that linked postings always share a fold.

    Postings with the same label group, title, image phash or image file are chained together
    in a graph; every connected component of that graph is assigned to exactly one fold.
    Returns an int32 array with one fold id per row of df.
    """
    np.random.seed(random_state)
    graph = nx.Graph()

    # link all postings that share a value in any of the key columns
    for key in ['label_group', 'title', 'image_phash', 'image']:
        for members in df.groupby(key)['posting_id'].agg(list).tolist():
            nx.add_path(graph, members)

    # every posting is labelled with the smallest posting id of its component
    component_of = {}
    for component in nx.connected_components(graph):
        anchor = min(component)
        for node in component:
            component_of[node] = anchor

    group_idx = df['posting_id'].map(component_of).values
    shuffled = np.unique(group_idx)
    np.random.shuffle(shuffled)

    folds = np.zeros(df.shape[0], dtype=np.int32)
    for fold_id, chunk in enumerate(np.array_split(shuffled, nfolds)):
        folds[np.isin(group_idx, chunk)] = fold_id

    return folds


def phash_to_bag(x):
    """
    Encode one hex phash string as an L2-normalised indicator vector.

    Character number n with hex value v sets slot v + 16 * n of a 256-long vector.
    """
    bag = np.zeros(16 * 16, dtype=np.int32)

    for pos, char in enumerate(x):
        slot = pos * 16 + int(char, 16)
        bag[slot] += 1

    norm = (bag ** 2).sum() ** .5
    return bag / norm


def get_phash_embed(df):
    """
    Build the float32 phash embedding matrix: one phash_to_bag row per row of df['image_phash']
    """
    bags = df['image_phash'].map(phash_to_bag).tolist()
    stacked = np.stack(bags)
    return stacked.astype(np.float32)


def union_pred(*preds):
    """
    Merge per-point predictions coming from several embeddings.

    For each point the candidate lists of all sources are pooled and de-duplicated
    (the order of the merged list is the iteration order of a set).
    """
    merged = []

    for per_source in zip(*preds):
        pooled = [item for chunk in per_source for item in chunk]
        merged.append(list(set(pooled)))

    return merged

def get_dist_features(D):
    """
    Density features of every point computed from its row of the distance matrix D.

    Produces, per row: the mean of columns 1..k-1 for several k (column 0 is skipped),
    followed by the number of entries that reach each of several thresholds.
    Returns a float32 array with 14 columns.
    """
    top_k = [2, 3, 5, 10, 20, 50]
    thresholds = [.5, .6, .7, .8, .9, .95, .97, .99]

    columns = [D[:, 1: k].mean(axis=1) for k in top_k]
    columns += [(D >= t).sum(axis=1) for t in thresholds]

    return np.stack(columns, axis=1).astype(np.float32)


def get_paired_indexes(pred, folds=None):
    """
    Turn per-point candidate lists into arrays of candidate pairs.

    The first half of the returned arrays holds left/right pairs, the second half holds the
    same pairs mirrored (right/left). A pair is skipped when it links a point to itself or
    when its mirror image was already added. The third array carries the fold of the point
    the pair was generated from (zeros when folds is None).
    """
    if folds is None:
        folds = np.zeros(len(pred))

    lefts, rights, pair_folds = [], [], []
    seen = set()

    for src, (candidates, fold_id) in enumerate(zip(pred, folds)):
        for dst in candidates:
            if src != dst and (dst, src) not in seen:
                lefts.append(src)
                rights.append(dst)
                pair_folds.append(fold_id)
                seen.add((src, dst))

    # mirrored pairs are appended after the direct ones
    return (np.array(lefts + rights),
            np.array(rights + lefts),
            np.array(pair_folds + pair_folds))


def get_pairwise_dist(left, right, embed, folds=None, batch_size=20000):
    """
    Compute pairwise distance features

    For every (left[i], right[i]) pair six values are produced. Columns 0-2 describe the
    elementwise product of the two embeddings, columns 3-5 the squared difference; in each
    triple the first value is the sum over coordinates and the other two are the per-coordinate
    values found at positions 0.25 and 0.975 of the sorted coordinate range.

    Args:
        left, right: integer index arrays of equal length selecting rows of embed.
        embed: dense numpy array. When it is 3D and folds is given, the last axis is indexed
            by the fold id of each pair (out-of-fold embeddings).
        folds: optional array with one fold id per pair.
        batch_size: number of pairs processed on the GPU at once.

    Returns:
        float32 numpy array of shape (len(left), 6).
    """
    res = np.zeros((left.shape[0], 6), dtype=np.float32)

    # No pairs: return the empty result before any GPU work. Without this guard the cleanup
    # `del` at the end fails, because the loop variables are never bound when the loop is skipped.
    if left.shape[0] == 0:
        return res

    embed = torch.from_numpy(embed).cuda()

    # positions of the two order statistics inside the sorted coordinates
    q_int = (np.array([0.25, 0.975]) * (embed.shape[1] - 1)).astype(np.int32)
    f = None

    for i in range(0, left.shape[0], batch_size):
        left_embed, right_embed = embed[left[i: i + batch_size]], embed[right[i: i + batch_size]]
        # in case of out-of-fold embeddings - keep only the slice that belongs to the pair's fold
        if folds is not None and len(embed.shape) == 3:
            f = torch.from_numpy(folds[i: i + batch_size].astype(np.int64)
                                 ).view(-1, 1, 1).cuda().repeat(1, embed.shape[1], 1)
            left_embed = torch.gather(left_embed, dim=2, index=f).squeeze(dim=2)
            right_embed = torch.gather(right_embed, dim=2, index=f).squeeze(dim=2)

        for n, fn in enumerate([lambda x, y: x * y, lambda x, y: (x - y) ** 2]):
            coords = fn(left_embed, right_embed)
            # distance
            Dist = coords.sum(dim=1)
            if len(Dist.shape) == 2:
                Dist = Dist.mean(dim=1)

            res[i: i + batch_size, n * 3] = Dist.cpu().numpy()
            # quantiles
            idx = coords.argsort(dim=1)[:, q_int]
            Qs = torch.gather(coords, dim=1, index=idx)
            if len(Qs.shape) == 3:
                Qs = Qs.mean(dim=2)
            res[i: i + batch_size, (n * 3) + 1: (n * 3) + 3] = Qs.cpu().numpy()

    # drop the GPU tensors explicitly so the memory can be reused by the next embedding
    del embed, left_embed, right_embed, coords, f, Dist, Qs, idx
    torch.cuda.empty_cache()
    gc.collect()

    return res


def get_cross_pairwise_dist(left, right, embed_pair, batch_size=20000):
    """
    Compute text to image/image to text pairwise features (for CLIP embedding)

    Column n * 2 + k of the result holds the dot product between rows `left` of embed_pair[n]
    and rows `right` of embed_pair[k]. The output has 4 columns, so embed_pair is expected to
    contain exactly two dense numpy arrays.

    Args:
        left, right: integer index arrays of equal length.
        embed_pair: list of two dense numpy embeddings.
        batch_size: number of pairs processed on the GPU at once.

    Returns:
        float32 numpy array of shape (len(left), 4).
    """
    res = np.zeros((left.shape[0], 4), dtype=np.float32)

    # No pairs: return before any GPU work. Without this guard the cleanup `del` below fails,
    # because left_embed / right_embed are never bound when the batch loop is skipped.
    if left.shape[0] == 0:
        return res

    embed_pair = [torch.from_numpy(x).cuda() for x in embed_pair]

    for n, emb0 in enumerate(embed_pair):
        for k, emb1 in enumerate(embed_pair):
            for i in range(0, left.shape[0], batch_size):
                left_embed, right_embed = emb0[left[i: i + batch_size]], emb1[right[i: i + batch_size]]
                res[i: i + batch_size, n * 2 + k] = (left_embed * right_embed).sum(dim=1).cpu().numpy()

    # drop the GPU tensors explicitly so the memory can be reused
    del embed_pair, emb0, emb1, left_embed, right_embed
    torch.cuda.empty_cache()
    gc.collect()

    return res


def get_sparse_pairwise_dist(left, right, embed):
    """
    Pairwise feature for a sparse embedding (tfidf): the row-wise sum of the elementwise
    product of the left and right rows, returned as a single-column array
    """
    lhs_rows, rhs_rows = embed[left], embed[right]
    overlap = lhs_rows.multiply(rhs_rows)

    return np.array(overlap.sum(axis=1))


def get_pairwise(left, right, *embeds, folds=None, scores_only=False):
    """
    Build pairwise features for a list of embeddings and concatenate them column-wise.

    Each embedding may be a path (loaded with joblib), a dense numpy array, a list of arrays
    (cross features) or anything else, which is treated as a sparse matrix.
    With scores_only only the first column of every feature block is kept.
    """
    blocks = []

    for embed in embeds:
        # for inference - embeddings are read from disk one by one instead of being kept in memory
        if type(embed) is str:
            embed = joblib.load(embed)

        if isinstance(embed, np.ndarray):
            block = get_pairwise_dist(left, right, embed, folds=folds, batch_size=10000)
        elif isinstance(embed, list):
            block = get_cross_pairwise_dist(left, right, embed, batch_size=10000)
        else:
            block = get_sparse_pairwise_dist(left, right, embed)

        blocks.append(block[:, [0]] if scores_only else block)

    return np.concatenate(blocks, axis=1)


def get_features(left, right, texts, points_feats_list, embed_list, count_params, bypass_feats=None, folds=None):
    """
    Assemble the GBM feature matrix for left/right pair indices.

    Column blocks, in order: point features of the left and of the right point for every entry
    of points_feats_list, pairwise embedding features, text length features for every
    vectorizer setting in count_params, and finally the optional bypass_feats blocks.
    """
    parts = [points_feats[index] for points_feats in points_feats_list for index in (left, right)]

    parts.append(get_pairwise(left, right, *embed_list, folds=folds))
    parts.extend(get_length_features(left, right, texts, param) for param in count_params)

    if bypass_feats is not None:
        parts.extend(bypass_feats)

    return np.concatenate(parts, axis=1).astype(np.float32)


def get_prediction_index(left, right, prob, k, cutoff=0.3, hard_cutoff=False, exact_add=2):
    """
    Transform paired prediction to list of product predictions

    Args:
        left, right: index arrays describing the scored pairs (left[i], right[i]).
        prob: array of pair scores aligned with left/right.
        k: total number of points; a prediction list is produced for every point 0..k-1.
        cutoff: only candidates scored above this value take part in the selection.
        hard_cutoff: when True every candidate above the cutoff is kept; otherwise candidates
            are added in descending score order only while the proxy F1 does not decrease.
        exact_add: minimal length of a prediction list (filled with the best-scored candidates,
            limited by how many candidates the point has).

    Returns:
        (res, probs): per-point lists of selected indices and the matching scores.
        Every point is its own first candidate with score 1.
    """
    res = [[x] for x in range(k)]
    probs = [[1] for _ in range(k)]

    for l, r, p in zip(left, right, prob):
        res[l].append(r)
        probs[l].append(p)

    # calc proxy f1 score and decide - if it worth to add next point
    # TODO: Check it later !!! small score decrease
    for i in range(k):
        point_probs = np.array(probs[i])

        # best 50 candidates, highest score first
        orders = point_probs.argsort()[::-1][:50]
        indexes = np.array(res[i])[orders]
        ps = point_probs[orders]

        # ps is sorted in descending order, so the confident scores are a prefix of it
        confident = ps[ps > cutoff]
        proxy_total_yt = confident.sum()

        result = []
        prev_proxy_f1 = 0
        proxy_total_inter = 0
        for len_yp, (idx, p) in enumerate(zip(indexes, confident), start=1):
            proxy_total_inter += 2 * p
            proxy_f1 = proxy_total_inter / (len_yp + proxy_total_yt)

            if (proxy_f1 >= prev_proxy_f1) or hard_cutoff:
                result.append(idx)
                prev_proxy_f1 = proxy_f1
            else:
                break

        if len(result) < exact_add:
            result = list(indexes[:exact_add])

        res[i] = result
        probs[i] = ps[:len(result)]

    return res, probs


def cutoff_prediction(D, I, cutoff, exact_add=2):
    """
    Select candidates from the distances/indices matrices.

    From every row of I the entries whose value in D exceeds cutoff are kept;
    the first exact_add columns are always kept. Returns a list with one array per row.
    """
    forced = np.arange(D.shape[1]) < exact_add

    return [idx_row[(dist_row > cutoff) | forced] for dist_row, idx_row in zip(D, I)]


def get_y_true(df):
    """
    Ground truth for every row: the list of row positions that share its label_group
    """
    labels = df['label_group']
    positions = Series(np.arange(len(df)))
    members = positions.groupby(labels.values).agg(list)

    return labels.map(members).tolist()


def f1_score(y_true, y_pred):
    """
    Mean per-row F1 between the true and the predicted index lists
    """
    total = 0

    for truth, guess in zip(y_true, y_pred):
        hits = len(np.intersect1d(truth, guess))
        total += 2 * hits / (len(truth) + len(guess))

    return total / len(y_true)


def f1_score_co_search(y_true, D, I, cutoffs=None):
    """
    Search the cutoff that maximises the mean F1 for distances/indices matrices.

    For every cutoff (sorted ascending) the candidates whose distance reaches the cutoff form
    the prediction. Returns the best mean F1 together with the cutoff that produced it.
    """
    cutoffs = np.linspace(0.5, 1, 20)[:-1] if cutoffs is None else np.sort(cutoffs)

    totals = np.zeros_like(cutoffs)

    for truth, cand, dist in zip(y_true, I, D):
        if not isinstance(cand, np.ndarray):
            cand = np.array(cand)

        # one boolean row per cutoff
        masks = cutoffs[:, np.newaxis] <= dist[np.newaxis, :]
        kept_counts = masks.sum(axis=1)

        last_count = -np.inf
        for pos, (mask, count) in enumerate(zip(masks, kept_counts)):
            # the score is recomputed only when the number of kept candidates changes
            if count != last_count:
                kept = cand[mask]
                hits = np.intersect1d(truth, kept)
                score = 2 * len(hits) / (len(truth) + len(kept))
                last_count = count

            totals[pos] += score

    totals = totals / len(y_true)
    best = totals.argmax()

    return totals[best], cutoffs[best]


def get_di_torch(embed, n_candidates=50, batch_size=1000):
    """
    Compute distances/indices matrices from an embedding on the GPU.

    For each row the dot products with all rows are sorted in descending order and the first
    n_candidates are kept. embed is either a dense numpy array or a sparse matrix (converted
    with csr_to_torch_sparse). Returns D (float32 scores) and I (int32 row indices).
    """
    n_rows = embed.shape[0]
    D = np.zeros((n_rows, n_candidates), dtype=np.float32)
    I = np.zeros((n_rows, n_candidates), dtype=np.int32)

    flg_dense = isinstance(embed, np.ndarray)
    source = torch.from_numpy(embed) if flg_dense else csr_to_torch_sparse(embed)
    embed_cuda = source.cuda()

    for start in range(0, n_rows, batch_size):
        stop = start + batch_size

        if flg_dense:
            embed_batch = embed_cuda[start: stop]
            d = torch.matmul(embed_cuda, embed_batch.T).T
        else:
            # the batch is densified, the full matrix stays sparse
            embed_batch = torch.from_numpy(embed[start: stop].toarray().T).cuda()
            d = torch.matmul(embed_cuda, embed_batch).T

        idx = torch.argsort(d, dim=1, descending=True)[:, :n_candidates]
        # there may be fewer rows than n_candidates - the remaining columns stay zero
        width = idx.shape[1]
        I[start: stop, :width] = idx.cpu().numpy()
        D[start: stop, :width] = torch.gather(d, 1, idx).cpu().numpy()

    del d, idx, embed_cuda, embed_batch
    torch.cuda.empty_cache()

    return D, I


def get_cross_di_torch(embed_x, embed_y, n_candidates=50, batch_size=1000):
    """
    Compute cross distances/indices matrices (for CLIP).

    Row i of the result describes row i of embed_y: the n_candidates rows of embed_x with the
    largest dot product (I, int32) and the products themselves (D, float32).
    """
    n_queries = embed_y.shape[0]
    D = np.zeros((n_queries, n_candidates), dtype=np.float32)
    I = np.zeros((n_queries, n_candidates), dtype=np.int32)

    flg_dense = isinstance(embed_x, np.ndarray)

    if flg_dense:
        # embed_x is stored transposed so that a batch of embed_y can be multiplied directly
        embed_x = torch.from_numpy(embed_x).T.cuda()
        embed_y = torch.from_numpy(embed_y).cuda()
    else:
        embed_x = csr_to_torch_sparse(embed_x).cuda()

    for start in range(0, n_queries, batch_size):
        stop = start + batch_size

        if flg_dense:
            embed_batch = embed_y[start: stop]
            d = torch.matmul(embed_batch, embed_x)
        else:
            embed_batch = torch.from_numpy(embed_y[start: stop].toarray().T).cuda()
            d = torch.matmul(embed_x, embed_batch).T

        idx = torch.argsort(d, dim=1, descending=True)[:, :n_candidates]
        width = idx.shape[1]
        I[start: stop, :width] = idx.cpu().numpy()
        D[start: stop, :width] = torch.gather(d, 1, idx).cpu().numpy()

    del d, idx, embed_x, embed_y, embed_batch
    torch.cuda.empty_cache()

    return D, I



def csr_to_torch_sparse(csr_mat):
    """
    Transform csr matrix to torch Sparse format

    Args:
        csr_mat: scipy sparse matrix (anything providing astype(...).tocoo()).

    Returns:
        float32 torch sparse COO tensor of the same shape, located on the CPU.
    """
    coo_mat = csr_mat.astype(np.float32).tocoo()

    # torch expects the coordinates as one int64 tensor of shape (2, nnz)
    row = torch.from_numpy(coo_mat.row).type(torch.int64)
    col = torch.from_numpy(coo_mat.col).type(torch.int64)
    edge_index = torch.stack([row, col], dim=0)

    val = torch.from_numpy(coo_mat.data)
    # torch.sparse_coo_tensor is used instead of the legacy torch.sparse.FloatTensor constructor
    out = torch.sparse_coo_tensor(edge_index, val, torch.Size(coo_mat.shape))

    return out


class ReadPipeline(Pipeline):
    """
    DALI Image read pipeline for torch

    Reads the files of img_list, decodes them to RGB, resizes the shorter side to img_size,
    crops to img_size x img_size, mirrors with probability hflip_p and normalises with
    mean/std (given for the 0..1 range and scaled by 255 here).

    Args:
        img_list: list of image file paths.
        batch_size, num_threads, device_id: passed to the DALI Pipeline constructor.
        img_size: target size of the shorter side and of the square crop.
        num_gpus: number of shards of the file reader.
        shuffle: passed to the reader as shuffle_after_epoch.
        name: name of the reader operator; it has to match the reader_name given to the
            DALI iterator built on top of this pipeline.
        hflip_p: probability of a horizontal flip.
        scale: only used for the upper bound of the (currently unused) random resize range.
        mean, std: per-channel normalisation constants.
    """
    def __init__(self, img_list, batch_size, img_size=300, num_threads=2, device_id=0, num_gpus=1, shuffle=False,
                 name='Reader', hflip_p=0, scale=1, mean=(0.485, 0.456, 0.406), std=(0.229, 0.224, 0.225)):
        super().__init__(batch_size, num_threads, device_id)

        self.input = ops.FileReader(files=img_list, random_shuffle=False,
                                    shard_id=device_id, shuffle_after_epoch=shuffle, num_shards=num_gpus)

        self.decode = ops.ImageDecoder(device="mixed", output_type=types.RGB)
        self.resize = ops.Resize(device="gpu", resize_shorter=img_size,
                                 interp_type=types.INTERP_LINEAR)
        self.cmn = ops.CropMirrorNormalize(device="gpu",
                                           dtype=types.FLOAT,
                                           crop=(img_size, img_size),
                                           mean=[255 * x for x in mean],
                                           std=[255 * x for x in std])

        # NOTE(review): uniform and resize_rng are created but not used in define_graph
        # (their usages are commented out there)
        self.uniform = ops.random.Uniform(range=(0.0, 1.0))
        self.resize_rng = ops.random.Uniform(range=(300, int(img_size * scale) + 1))
        self.coin = ops.random.CoinFlip(probability=hflip_p)

        # honour the requested reader name (it used to be hard-coded to 'Reader', which broke
        # any caller passing a different name and then looking the reader up by that name)
        self.name = name

    def define_graph(self):
        inputs, labels = self.input(name=self.name)
        images = self.decode(inputs)
        images = self.resize(images  # , resize_shorter=self.resize_rng()
                             )
        output = self.cmn(images, mirror=self.coin()
                          # , out_of_bounds_policy='trim_to_shape', crop_pos_x=self.uniform(),
                          # crop_pos_y=self.uniform()
                          )
        return (output, labels)


class DaliTorchLoader(DataLoader):
    """
    Torch DataLoader facade over the DALI ReadPipeline.

    DataLoader.__init__ is not called: all state (length, last_batch, drop_last, dali_iter)
    is created by _initialize, and iteration is delegated to the DALI iterator.
    """
    def __init__(self, image_list, image_size=300, batch_size=128, num_threads=2, n_gpus=1, shuffle=False,
                 drop_last=False, mean=(0.485, 0.456, 0.406), std=(0.229, 0.224, 0.225), name='Reader'):

        self._initialize(image_list, image_size=image_size, batch_size=batch_size, num_threads=num_threads,
                         n_gpus=n_gpus, shuffle=shuffle,
                         drop_last=drop_last, mean=mean, std=std, name=name)

    def _initialize(self, image_list, image_size=300, batch_size=128, num_threads=2, n_gpus=1, shuffle=False,
                    drop_last=False, name='Reader', hflip_p=0, scale=1,
                    mean=(0.485, 0.456, 0.406), std=(0.229, 0.224, 0.225)):
        """
        Build the pipeline and the DALI iterator and work out the number of batches
        """
        assert n_gpus == 1, 'For now only 1 GPU'

        pipes = []
        for device_id in range(n_gpus):
            pipes.append(ReadPipeline(image_list, batch_size=batch_size, img_size=image_size,
                                      num_threads=num_threads, device_id=device_id, num_gpus=n_gpus,
                                      shuffle=shuffle, name=name, hflip_p=hflip_p,
                                      scale=scale, mean=mean, std=std))

        full_batches, remainder = divmod(len(image_list), batch_size)

        if remainder == 0 or drop_last:
            # no partial batch will be produced
            self.length = full_batches
            self.last_batch = batch_size
            self.drop_last = True
        else:
            # one extra, shorter batch at the end
            self.length = full_batches + 1
            self.last_batch = remainder
            self.drop_last = drop_last

        pipes[0].build()
        self.dali_iter = DALIGenericIterator(pipes, ['data', 'label'], reader_name=name)

    def __len__(self):
        """Number of batches produced per pass"""
        return self.length

    def _process_batch(self, batch):
        """Unwrap the per-GPU list returned by the DALI iterator (single GPU is assumed)"""
        return batch[0]

    def __iter__(self):
        for n, batch in enumerate(self.dali_iter):
            batch = self._process_batch(batch)
            at_end = n == (len(self) - 1)

            if at_end and not self.drop_last:
                # partial last batch: keep only the first last_batch items of every field
                for key in batch:
                    batch[key] = batch[key][:self.last_batch]
                yield batch
                continue

            yield batch
            # when the last full batch has been served - stop iterating
            if at_end and self.drop_last:
                return
                
                

def score_with_image_model_dali(df, model, image_path, image_size=300, batch_size=32, device='cuda:0', n_jobs=10,
                                mean=(0.485, 0.456, 0.406), std=(0.229, 0.224, 0.225)):
    """
    Compute L2-normalised image embeddings for the images listed in df['image'].

    Every image goes through model.extract_features; the resulting feature map is averaged
    over all dimensions after the first two.
    """
    files = df['image'].map(lambda x: os.path.join(image_path, x)).tolist()
    dl = DaliTorchLoader(files, image_size=image_size, batch_size=batch_size,
                         num_threads=n_jobs, mean=mean, std=std)

    model.eval()

    chunks = []

    with torch.no_grad():
        for batch in tqdm.tqdm(dl):
            with autocast():
                feats = model.extract_features(batch['data'].to(device))
            pooled = feats.view(*feats.shape[:2], -1).mean(dim=-1)
            chunks.append(pooled.detach().cpu().numpy().astype(np.float32))

    embed = np.concatenate(chunks, axis=0)
    norms = (embed ** 2).sum(axis=1, keepdims=True) ** .5
    return embed / norms


class MatchingTextTest(Dataset):
    """
    Torch Dataset that serves the 'title' text of every row of a DataFrame
    """
    def __init__(self, data, random_state=42):
        # random_state is accepted but not used
        self.data = data

    def __getitem__(self, idx):
        return self.data.iloc[idx]['title']

    def __len__(self):
        return self.data.shape[0]

    
def score_with_text_model(df, text_model, tokenizer=None, batch_size=32, device='cuda:0',
                          max_length=128, n_jobs=10, standartize=True, normalize=True):
    """
    Compute text embeddings for the titles of df.

    The embedding of a title is the last hidden state of the first token. With standartize
    every column is centred and scaled by its std; with normalize every row is scaled to unit
    L2 norm. When no tokenizer is given, the 'albert-base-v2' tokenizer is loaded.
    """
    loader = DataLoader(MatchingTextTest(df), batch_size=batch_size, num_workers=n_jobs,
                        drop_last=False, shuffle=False)

    if tokenizer is None:
        tokenizer = AlbertTokenizer.from_pretrained('albert-base-v2')
    text_model.eval()

    chunks = []

    with torch.no_grad():
        for batch in tqdm.tqdm(loader):
            encoded = tokenizer(batch, return_tensors="pt", padding=True, truncation=True, max_length=max_length)
            with autocast():
                encoded = {key: encoded[key].to(device) for key in encoded}
                hidden = text_model(**encoded).last_hidden_state[:, 0, :]

            chunks.append(hidden.detach().cpu().numpy().astype(np.float32))

    embed = np.concatenate(chunks, axis=0)

    if standartize:
        embed = (embed - embed.mean(axis=0)) / embed.std(axis=0)

    if normalize:
        embed = embed / ((embed ** 2).sum(axis=1, keepdims=True) ** .5)

    return embed



class TokenizeWrapper:
    """
    Tokenizer for CLIP: turns texts into a fixed-width tensor of token ids
    """
    def __init__(self, max_len=77):
        self.tokenizer = clip.simple_tokenizer.SimpleTokenizer()
        self.max_len = max_len

    def __call__(self, texts):
        """
        Encode texts into a (len(texts), max_len) long tensor padded with zeros.

        Every row starts with 49406 and the (truncated) encoding is followed by 49407 -
        presumably CLIP's start/end-of-text ids, TODO confirm against the vocabulary.
        """
        tokens = torch.zeros((len(texts), self.max_len), dtype=torch.long)
        tokens[:, 0] = 49406

        for row, text in enumerate(texts):
            # leave room for the leading and the trailing special token
            ids = self.tokenizer.encode(text)[:self.max_len - 2] + [49407]
            tokens[row, 1:len(ids) + 1] = torch.tensor(ids)

        return tokens


class DaliClipLoader(DaliTorchLoader):
    """
    DataLoader for CLIP: image batches from DALI plus the tokenized titles of the same rows
    """
    def __init__(self, train, image_path='train_images', batch_size=128, num_threads=2, n_gpus=1,
                 name='Reader', hflip_p=0.5, scale=1.3):
        self.train = train
        self.image_path = image_path
        self.params = dict(
            image_size=244,
            batch_size=batch_size,
            num_threads=num_threads,
            n_gpus=n_gpus,
            name=name,
            shuffle=False,
            drop_last=False,
            hflip_p=hflip_p,
            scale=scale,
            mean=(0.48145466, 0.4578275, 0.40821073),
            std=(0.26862954, 0.26130258, 0.27577711),
        )
        # all titles are tokenized once, batches pick their rows later
        self.texts = TokenizeWrapper()(self.train['title'])

        files = train['image'].map(lambda x: os.path.join(image_path, x)).tolist()
        self._initialize(files, **self.params)

    def _process_batch(self, batch):
        # assume single GPU
        batch = super()._process_batch(batch)
        # the first column of the DALI label output is used as the row index of the titles
        rows = batch['label'].type(torch.long)[:, 0]
        batch['text'] = self.texts[rows]
        return batch


def score_with_clip_model_dali(df, model, image_path, batch_size=32, device='cuda:0', n_jobs=10,
                               ):
    """
    Compute CLIP embeddings for df: returns (image embeddings, text embeddings),
    both as float32 arrays with rows scaled to unit L2 norm
    """
    def _unit_rows(chunks):
        # stack the batches and scale every row to unit L2 norm
        stacked = np.concatenate(chunks, axis=0)
        return stacked / ((stacked ** 2).sum(axis=1, keepdims=True) ** .5)

    dl = DaliClipLoader(df, image_path=image_path, batch_size=batch_size, num_threads=n_jobs)

    model.eval()

    img_chunks, text_chunks = [], []

    with torch.no_grad():
        for batch in tqdm.tqdm(dl):
            with autocast():
                img = model.encode_image(batch['data'].to(device))
                img_chunks.append(img.detach().cpu().numpy().astype(np.float32))
                txt = model.encode_text(batch['text'].to(device))
                text_chunks.append(txt.detach().cpu().numpy().astype(np.float32))

    res_img = _unit_rows(img_chunks)
    res_text = _unit_rows(text_chunks)

    return res_img, res_text


def get_length_features(left, right, texts, vect_params):
    """
    Get additional features from text

    A binary CountVectorizer (configured by vect_params) is fitted on texts and the token
    indicator rows of the left and right points are compared.

    Args:
        left, right: index arrays of the paired points.
        texts: texts of all points (rows are addressed by left/right).
        vect_params: keyword arguments for CountVectorizer.

    Returns:
        float32 array of shape (len(left), 3):
        column 0 - row sum of the elementwise product of the two indicator rows
                   (number of tokens present in both texts);
        column 1 - row sum of `left + right - inter`, evaluated on the boolean sparse matrices;
        column 2 - absolute difference between the numbers of distinct tokens of the two texts.
    """
    res = np.empty((len(left), 3), dtype=np.float32)

    # builtin bool replaces the np.bool alias, which was deprecated and later removed from NumPy;
    # both denote the same dtype
    token_counts = CountVectorizer(**vect_params, dtype=bool, binary=True).fit_transform(texts)
    left, right = token_counts[left], token_counts[right]

    inter = left.multiply(right)
    res[:, 0] = inter.sum(axis=1).ravel()

    diff = left + right - inter
    res[:, 1] = diff.sum(axis=1).ravel()

    res[:, 2] = np.abs(left.sum(axis=1).ravel() - right.sum(axis=1).ravel())

    return res


class FeaturesGenerator:
    """
    Features generator (for batch inference)

    Wraps a feature function and its fixed keyword arguments. Features can be computed in one
    go (features) or batch by batch (features_generator); with a cache_dir every batch is also
    dumped to disk and can be replayed with cached_generator.

    Args:
        feature_fn: callable invoked as
            feature_fn(left=..., right=..., folds=..., bypass_feats=..., **kwargs)
            that returns a 2D array.
        cache_dir: optional directory for the batch dumps; existing files whose name starts
            with 'batch' are removed from it on construction.
        **kwargs: extra keyword arguments forwarded to feature_fn on every call.
    """
    def __init__(self, feature_fn, cache_dir=None, **kwargs):

        self.kwargs = kwargs
        self.cache_dir = cache_dir
        self.feature_fn = feature_fn
        # created up front, so that cached_generator() called before features_generator()
        # fails with its own 'No cached files' assertion instead of an AttributeError
        self.cached_files = []

        if cache_dir is not None:

            self.batch = os.path.join(cache_dir, 'batch_{0}.pkl')
            os.makedirs(cache_dir, exist_ok=True)

            # clean up the dumps left by a previous run
            for f in os.listdir(cache_dir):
                if not f.startswith('batch'):
                    continue

                path = os.path.join(cache_dir, f)

                if os.path.exists(path):
                    os.remove(path)

    def features(self, left, right, folds=None, bypass_feats=None):
        """
        Compute the features of all pairs at once and return them as a DataFrame
        with columns feat_0, feat_1, ...
        """
        if bypass_feats is None:
            bypass_feats = []

        X = self.feature_fn(left=left, right=right, folds=folds,
                            bypass_feats=bypass_feats, **self.kwargs)

        X = DataFrame(X, columns=['feat_{0}'.format(x) for x in range(X.shape[1])])

        return X

    def features_generator(self, left, right, bypass_feats=None, batch_size=500000):
        """
        Yield the features batch by batch as DataFrames (folds are not used here).

        With a cache_dir the raw array of every batch is dumped with joblib before it is
        wrapped into a DataFrame; the list of dumps is reset at the start of every run.
        """
        if bypass_feats is None:
            bypass_feats = []

        self.cached_files = []

        for n, i in enumerate(range(0, len(left), batch_size)):

            X = self.feature_fn(left=left[i: i + batch_size], right=right[i: i + batch_size], folds=None,
                                bypass_feats=[x[i: i + batch_size] for x in bypass_feats], **self.kwargs)

            if self.cache_dir is not None:
                joblib.dump(X, self.batch.format(n))
                self.cached_files.append(self.batch.format(n))

            X = DataFrame(X, columns=['feat_{0}'.format(x) for x in range(X.shape[1])])

            yield X

    def cached_generator(self):
        """
        Yield the batches dumped by the last features_generator run.

        Note that the dumps hold the raw feature arrays, not DataFrames.
        """
        assert len(self.cached_files) > 0, 'No cached files'

        for fname in self.cached_files:
            yield joblib.load(fname)

def reflect_prediction(pred):
    """
    Average every left/right prediction with its right/left mirror.

    pred is expected to hold the direct pairs in its first half and the mirrored pairs in
    its second half; the input array is not modified.
    """
    half = pred.shape[0] // 2
    direct, mirrored = pred[:half], pred[half:]

    averaged = pred.copy()
    averaged[:half] += mirrored
    averaged[half:] += direct
    averaged /= 2

    return averaged



### Clustering functions


def calc_components_dist(comp0, comp1, orig_dist):
    """
    Score between two clusters: 0.75 * mean + 0.25 * max over the scores of all member pairs.

    The score of a pair is looked up in orig_dist in the direct direction first, then in the
    reverse one; a pair missing in both directions counts as 0.
    """
    def pair_score(a, b):
        score = orig_dist[a].get(b)
        if score is None:
            score = orig_dist[b].get(a, 0)
        return score

    scores = np.array([pair_score(c0, c1) for c0 in comp0 for c1 in comp1])

    return np.mean(scores) * 0.75 + scores.max() * 0.25


def upd_cutoff(co, cl_s0, cl_s1):
    """
    Raise the cutoff to a floor that depends on the size of the larger cluster:
    0.55 below 2 points, 0.45 below 5, 0.35 below 10 and 0.25 otherwise
    """
    cl_size = max(cl_s0, cl_s1)

    for size_bound, floor in ((2, 0.55), (5, 0.45), (10, 0.35)):
        if cl_size < size_bound:
            return max(co, floor)

    return max(co, 0.25)


def _default_cutoff_fn(co, *args, **kwargs):
    """
    Default cutoff function of update_clusters: returns the cutoff unchanged and ignores
    any further arguments
    """
    return co


def update_clusters(clusters, cluster_candidates, cluster_distances, cutoff=0.99, max_add=2, max_cl_size=50,
                    cutoff_fn=None):
    """
    Perform one merge step over the current clusters.

    Every cluster picks up to max_add candidate clusters whose score exceeds the cutoff
    (optionally adjusted by cutoff_fn for the sizes of the two clusters) and is linked to each
    of them as long as the joint size stays within max_cl_size. The connected components of
    the resulting graph become the new clusters.

    Args:
        clusters: dict cluster name -> list of member points.
        cluster_candidates: dict cluster name -> candidate cluster names.
        cluster_distances: dict cluster name -> scores aligned with the candidates.
        cutoff: minimal score of a merge.
        max_add: maximal number of candidates considered per cluster.
        max_cl_size: size limit checked before a candidate is attached.
        cutoff_fn: optional callable (cutoff, candidate_size, cluster_size) -> cutoff.

    Returns:
        (clusters, backmap): new clusters named by their smallest member and the mapping
        point -> cluster name.
    """
    if cutoff_fn is None:
        cutoff_fn = _default_cutoff_fn

    G = nx.Graph()

    for cname, members in clusters.items():

        accepted = [x for (x, y) in zip(cluster_candidates[cname], cluster_distances[cname])
                    if y > cutoff_fn(cutoff, len(clusters[x]), len(members))
                    ][:max_add]

        path = members.copy()
        for other in accepted:
            # the size check uses the path as grown so far
            if (len(clusters[other]) + len(path)) <= max_cl_size:
                path.extend(clusters[other])

        nx.add_path(G, path)

    merged = {}
    backmap = {}

    for comp in nx.connected_components(G):

        comp = list(comp)
        cname = min(comp)

        merged[cname] = comp

        for c in comp:
            backmap[c] = cname

    # Safety net: a point that did not make it into the graph stays a cluster of its own.
    # All members of the previous clusters are checked; the former `range(len(backmap))`
    # could never reach a missing point whose id was >= the number of mapped points.
    for members in clusters.values():
        for c in members:
            if c not in backmap:
                merged[c] = [c]
                backmap[c] = c

    return merged, backmap


def update_distances(clusters, backmap, orig_dist):
    """
    Recompute candidates and scores for every cluster.

    The candidates of a cluster are the clusters of all points listed in orig_dist for its
    members (points of the cluster itself excluded); they are scored with
    calc_components_dist and stored best first.
    """
    cluster_candidates, cluster_distances = {}, {}

    for cname, members in clusters.items():
        comp = set(members)

        neighbours = [backmap[x] for c in comp for x in orig_dist[c] if x not in comp]
        candidates = list(set(neighbours))

        distances = np.array([calc_components_dist(clusters[x], comp, orig_dist) for x in candidates])
        best_first = distances.argsort()[::-1]

        cluster_candidates[cname] = list(np.array(candidates)[best_first])
        cluster_distances[cname] = distances[best_first]

    return cluster_candidates, cluster_distances


def init_clusters(oof_pred, oof_probs):
    """
    Initial clustering state: every point is its own cluster.

    Returns orig_dist (per point: dict candidate -> score, the first entry of every
    prediction is skipped), the singleton clusters and the per-cluster candidates and scores
    taken directly from oof_pred / oof_probs.
    """
    orig_dist = [dict(zip(cand[1:], scores[1:])) for cand, scores in zip(oof_pred, oof_probs)]

    n_points = len(oof_pred)
    clusters = {c: [c] for c in range(n_points)}
    cluster_candidates = {c: oof_pred[c] for c in range(n_points)}
    cluster_distances = {c: oof_probs[c] for c in range(n_points)}

    return orig_dist, clusters, cluster_candidates, cluster_distances


def get_pred_from_cluster(clusters, orig_dist, cluster_candidates, cluster_distances,
                          cl_co=0.5, co=0.6, exact_add=1, max_pred=50):
    """
    Turn clusters into a per-item list of predicted matches

    For every item the prediction starts with the item itself, followed by the other
    members of its cluster and the members of merged neighbour clusters, ordered by
    the item's own pairwise score from ``orig_dist`` (highest first, missing pairs
    count as 0). The list is cut to ``max_pred`` entries; remaining slots are filled
    with items from ``orig_dist`` whose score is above ``co``.

    Args:
        clusters: dict {cluster name: list of member items}.
        orig_dist: per item, dict {other item: score}.
        cluster_candidates: dict {cluster name: neighbour cluster names}.
        cluster_distances: dict {cluster name: scores aligned with the candidates}.
        cl_co: a neighbour cluster is merged when its score is strictly above this value.
        co: an individual item is appended when its score is strictly above this value.
        exact_add: for single-item clusters, the first ``exact_add`` candidates
            are merged regardless of score.
        max_pred: maximum length of every prediction list.

    Returns:
        List indexed by item; each entry is the list of predicted items
        (``None`` for items that belong to no cluster).
    """
    pred = [None] * len(orig_dist)

    for cname, comp in clusters.items():
        cand = cluster_candidates[cname]
        dist = cluster_distances[cname]

        to_merge = []
        for n, (c, d) in enumerate(zip(cand, dist)):
            if (d > cl_co) or ((n < exact_add) and len(comp) == 1):
                to_merge.extend(clusters[c])

        for c in comp:
            item_dist = orig_dist[c]

            pp = [c] + [x for x in comp if x != c] + to_merge
            scores = np.array([1] + [item_dist.get(p, 0) for p in pp[1:]])

            # Highest score first. The previous ascending sort made the truncation
            # below keep the weakest members and could drop the item itself.
            # The stable sort keeps the item (score 1, position 0) ahead of ties.
            orders = np.argsort(-scores, kind='stable')
            pp = np.array(pp)[orders][:max_pred]

            s_pp = set(pp)

            additional = [x for x in item_dist if x not in s_pp and item_dist[x] > co]
            pred[c] = list(pp) + additional[:max_pred - len(pp)]

    return pred

### Train part

In [None]:
# Load the training table; the ground truth and the validation folds are derived from it.
data = pd.read_csv('../input/shopee-product-matching/train.csv')
# Device string used by the embedding functions of this notebook (`.to(device)`, `device=device`).
device = 'cuda:0'
y_true = get_y_true(data)
# NOTE(review): 5 and 42 are presumably the number of folds and the random seed —
# confirm against get_validation_folds.
folds = get_validation_folds(data, 5, 42)

In [None]:
def extract_feats_and_preds(embed, co):
    """
    Derive meta model features (points frequencies) and predictions from an embedding

    The embedding is passed to ``get_di_torch``; its first output feeds
    ``get_dist_features`` and both outputs, together with the cutoff ``co``,
    feed ``cutoff_prediction``. Returns ``(points, pred)``.
    """
    dists, indexes = get_di_torch(embed)
    return get_dist_features(dists), cutoff_prediction(dists, indexes, co)


### Image embedding

In [None]:
%%time
def get_effnet_embed(data, co=0.7, images_path='train'):
    """
    Get embeddings, predictions and frequency stats from Efficient Net

    ``images_path`` selects the ``{images_path}_images`` folder of the competition
    data, ``co`` is the cutoff handed to ``extract_feats_and_preds``.
    Returns ``(embed, pred, points)``.
    """
    images_dir = '../input/shopee-product-matching/{0}_images/'.format(images_path)

    # architecture is created by name, weights come from the EFFNET_PATH dump
    net = EfficientNet.from_name('efficientnet-b2')
    net.load_state_dict(joblib.load(EFFNET_PATH))
    net = net.to(device)

    embed = score_with_image_model_dali(data, net, images_dir, 380,
                                        batch_size=128, device=device, n_jobs=2)
    points, pred = extract_feats_and_preds(embed, co)

    # the network is not needed any more: drop it and release cached GPU memory
    del net
    gc.collect()
    torch.cuda.empty_cache()

    return embed, pred, points

# Score the training images; the function defaults apply (co=0.7, images_path='train').
image_embed, image_pred, img_points = get_effnet_embed(data)

### Multilang BERT

In [None]:
%%time
def get_multilang_embed(data, co=0.6):
    """
    Get embeddings, predictions and frequency stats from Multi language model setu4993/LaBSE

    Model and tokenizer are read from ``ML_PATH``; ``co`` is the cutoff handed to
    ``extract_feats_and_preds``. Returns ``(embed, pred, points)``.
    """
    model_dir = '{0}/ml.model'.format(ML_PATH)
    tokenizer_dir = '{0}/ml.token'.format(ML_PATH)

    bert = AutoModel.from_pretrained(model_dir, return_dict=True).to(device)
    tokenizer = BertTokenizerFast.from_pretrained(tokenizer_dir)

    embed = score_with_text_model(data, bert, tokenizer=tokenizer,
                                  batch_size=128, device=device, max_length=160, n_jobs=2)
    points, pred = extract_feats_and_preds(embed, co)

    # the model is not needed any more: drop it and release cached GPU memory
    del bert
    gc.collect()
    torch.cuda.empty_cache()

    return embed, pred, points


# Score the training titles with the multi-language model, candidate cutoff 0.6.
ml_embed, ml_pred, ml_points = get_multilang_embed(data, co=0.6)

### Indonesian BERT

In [None]:
%%time
def get_id_embed(data, co=0.6):
    """
    Get embeddings, predictions and frequency stats from Indonesian model cahya/bert-base-indonesian-522M

    Model and tokenizer are read from ``ID_BERT_PATH``; ``co`` is the cutoff handed
    to ``extract_feats_and_preds``. Returns ``(embed, pred, points)``.
    """
    model_dir = '{0}/id.model'.format(ID_BERT_PATH)
    tokenizer_dir = '{0}/id.token'.format(ID_BERT_PATH)

    bert = BertModel.from_pretrained(model_dir, return_dict=True).to(device)
    tokenizer = BertTokenizer.from_pretrained(tokenizer_dir)

    embed = score_with_text_model(data, bert, tokenizer=tokenizer,
                                  batch_size=128, device=device, max_length=160, n_jobs=2)
    points, pred = extract_feats_and_preds(embed, co)

    # the model is not needed any more: drop it and release cached GPU memory
    del bert
    gc.collect()
    torch.cuda.empty_cache()

    return embed, pred, points


# Score the training titles with the Indonesian BERT model, candidate cutoff 0.6.
id_embed, id_pred, id_points = get_id_embed(data, co=0.6)

### CLIP

In [None]:
%%time
def get_clip_embed(data, co_img, co_text, images_path='train'):
    """
    Get embeddings, predictions and frequency stats from CLIP

    Image and text embeddings are matched against each other in both directions
    (image -> text with cutoff ``co_img``, text -> image with cutoff ``co_text``).
    Returns ``(img_embed, text_embed, img_pred, text_pred, img_points, text_points)``.
    """
    net, _ = clip.load("../input/clip-pretrained/ViT-B-32.pt")
    net = net.to(device)

    images_dir = '../input/shopee-product-matching/{0}_images/'.format(images_path)
    img_embed, text_embed = score_with_clip_model_dali(data, net, images_dir)

    # image embeddings queried against text embeddings
    img_d, img_i = get_cross_di_torch(img_embed, text_embed)
    img_points = get_dist_features(img_d)

    # text embeddings queried against image embeddings
    text_d, text_i = get_cross_di_torch(text_embed, img_embed)
    text_points = get_dist_features(text_d)

    img_pred = cutoff_prediction(img_d, img_i, co_img)
    text_pred = cutoff_prediction(text_d, text_i, co_text)

    # intermediate results and the model are not needed any more
    del text_d, text_i, img_d, img_i, net
    torch.cuda.empty_cache()
    gc.collect()

    return img_embed, text_embed, img_pred, text_pred, img_points, text_points

# Score the training set with CLIP; 0.34 is used as cutoff for both the image and the text direction.
clip_img_embed, clip_text_embed, clip_img_pred, \
    clip_text_pred, clip_img_points, clip_text_points = get_clip_embed(data, 0.34, 0.34)

### TF IDFs

In [None]:
%%time
def get_tfidf_embed(data, param_list, cutoffs):
    """
    Get TfIdf embeddings with different tokenize params

    Args:
        data: table with a 'title' column the vectorizers are fitted on.
        param_list: keyword-argument dicts, one ``TfidfVectorizer`` is built per dict.
        cutoffs: prediction cutoffs, one per entry of ``param_list``.

    Returns:
        Three lists aligned with ``param_list``: the TfIdf matrices, the cutoff
        predictions and the frequency stats from ``get_dist_features``.

    Raises:
        ValueError: if ``param_list`` and ``cutoffs`` differ in length. Without the
            check ``zip`` would silently drop the unmatched entries and the returned
            predictions would no longer line up with the embeddings.
    """
    param_list, cutoffs = list(param_list), list(cutoffs)
    if len(param_list) != len(cutoffs):
        raise ValueError(
            'param_list and cutoffs must have the same length, got {0} and {1}'.format(
                len(param_list), len(cutoffs)))

    tfidf_embed, tfidf_D, tfidf_I, tfidf_points = [], [], [], []

    for params in param_list:
        vect = TfidfVectorizer(**params, dtype=np.float32)
        tfidf_embed.append(vect.fit_transform(data['title']))

        _d, _i = get_di_torch(tfidf_embed[-1])
        tfidf_D.append(_d)
        tfidf_I.append(_i)

        tfidf_points.append(get_dist_features(tfidf_D[-1]))
        print(params)

    tfidf_preds = []

    for d, i, co in zip(tfidf_D, tfidf_I, cutoffs):

        tfidf_preds.append(cutoff_prediction(d, i, co))
        # average number of candidates per item for this vectorizer
        print(sum(map(len, tfidf_preds[-1])) / len(data))

    # only the predictions and stats are returned, drop the intermediate results
    del tfidf_I, tfidf_D
    gc.collect()

    return tfidf_embed, tfidf_preds, tfidf_points
    

# Two TfIdf variants on the training titles: word unigrams and character 3-grams,
# each with candidate cutoff 0.45 (cutoffs are matched to param_list by position).
tfidf_embed, tfidf_preds, tfidf_points = get_tfidf_embed(data,                  
                                        param_list = [
                                            {'lowercase': True, 'ngram_range': (1, 1)}, 
                                            {'lowercase': True, 'ngram_range': (3, 3),
                                             'analyzer': 'char'},  
                                        ], 
                                        cutoffs=[0.45, 0.45])

### Union candidates

In [None]:
%%time
# Combine the candidate predictions of all individual models
# (image, both BERTs, both CLIP directions and every TfIdf variant).
total = union_pred(image_pred, 
                   ml_pred, 
                   id_pred, 
                   clip_img_pred, clip_text_pred,
                   *tfidf_preds)
# NOTE(review): presumably expands the candidates into (left, right) item pairs and
# tags each pair with a validation fold — confirm against get_paired_indexes.
left, right, fold = get_paired_indexes(total, folds)

In [None]:
# Display the shape of `left` as the cell output.
left.shape

In [None]:
# del image_pred, ml_pred, id_pred, tfidf_preds
# gc.collect()

### Get features

In [None]:
# Feature generator for candidate pairs of the training set.
# The CLIP embeddings are passed as one [image, text] pair inside embed_list;
# the CLIP frequency stats are not part of points_feats_list.
# count_params repeats the two tokenizer settings used for the TfIdf embeddings.
xgen = FeaturesGenerator(get_features, texts=data['title'], 
                         
                         points_feats_list=[img_points, id_points, ml_points] + tfidf_points, 
                         
                         embed_list=[image_embed, id_embed, ml_embed, 
                                     [clip_img_embed, clip_text_embed]] + tfidf_embed,
                         
                         count_params=[
                            {'lowercase': True, 'ngram_range': (1, 1)}, 
                            {'lowercase': True, 'ngram_range': (3, 3), 
                             'analyzer': 'char'}, 
                        ])

In [None]:
%%time
# Feature table for the candidate pairs.
X = xgen.features(left, right, fold)
# A pair is a positive example when both items carry the same label_group.
X['target'] = (data['label_group'].values[left] == data['label_group'].values[right]).astype(np.float32)
# The fold is stored as a column so it can be referenced by name in the AutoML roles.
X['fold'] = fold

In [None]:
# del xgen, img_points, id_points, ml_points,  tfidf_points, \
#     image_embed, id_embed, ml_embed, tfidf_embed, total

# gc.collect()

### LightAutoML fit predict

In [None]:
%%time

# Binary classification of candidate pairs (match / no match).
task = Task('binary')
# 'target' is the label column, 'fold' is passed as the group column.
roles = {'target': 'target', 'group': 'fold'}

# Meta model: a single level with a linear model ('linear_l2') and CatBoost ('cb').
# NOTE(review): timeout is presumably in seconds (one hour) — confirm in the LightAutoML docs.
automl = TabularAutoML(task = task, 
                       timeout = 3600,
                       cpu_limit = 2,
                       general_params = {'nested_cv': False, 'use_algos': [['linear_l2', 'cb']]},
                       reader_params = {'cv': 5, 'random_state': 42, 'advanced_roles': False}, 
                       cb_params = {'default_params': {'learning_rate': 0.03, "od_wait": 300, "max_bin": 128, 
                                                       "min_data_in_leaf": 10, "max_depth": 8}, 'freeze_defaults': True},
                       selection_params = {'mode': 0},
                       verbose=2)

# First column of the fit_predict output; used below as the pair probabilities.
preds = automl.fit_predict(X, roles=roles).data[:, 0]

In [None]:
# The training feature table is not used after fitting; drop it to free memory.
del X
gc.collect()

### Make prediction

In [None]:
%%time
# Convert the pair probabilities into per-item prediction lists, using cutoff .4.
# NOTE(review): meaning of hard_cutoff / exact_add is defined in get_prediction_index,
# which is outside this section.
oof_pred, oof_probs = get_prediction_index(left, right, 
                                           prob=preds,
                                           k=data.shape[0], cutoff = .4, hard_cutoff=True, 
                                           exact_add=2)

In [None]:
# Score of the meta model predictions before any clustering is applied.
print('OOF score with no post processing {0}'.format(f1_score(y_true, oof_pred)))

### Make clustering for train data

In [None]:
# Re-extract the predictions with a much lower cutoff (.1 instead of the .4 used for
# the raw score) so that the clustering has more pairs to work with.
oof_pred, oof_probs = get_prediction_index(left, right, 
                                           prob=preds,
                                           k=data.shape[0], cutoff = .1, hard_cutoff=True, 
                                           exact_add=2)

# Start with every item as its own cluster.
orig_dist, clusters, cluster_candidates, cluster_distances = init_clusters(oof_pred, oof_probs)

# Merge clusters step by step: the cutoff starts at 1.00 and is lowered by 0.01
# per iteration, for at most 90 iterations.
cutoff = 1.00
for i in range(90 ):
    
    cutoff -= 0.01
        
    clusters, backmap = update_clusters(clusters, cluster_candidates, 
                                        cluster_distances, cutoff=cutoff, 
                                        max_add=2, max_cl_size=50, 
                                        cutoff_fn=upd_cutoff)
    
    # Neighbour lists have to be rebuilt for the merged clusters.
    cluster_candidates, cluster_distances = update_distances(clusters, backmap, orig_dist)
    
    print('Cutoff {0} done. N clusters = {1}'.format(round(cutoff, 3), len(clusters)))
    
    # With cl_co=1 and co=1 the "strictly greater" checks inside get_pred_from_cluster
    # cannot pass for scores that do not exceed 1, which leaves the cluster members
    # plus the exact_add merge for single-item clusters.
    new_oof_pred = get_pred_from_cluster(clusters, orig_dist, 
                                         cluster_candidates, cluster_distances, 
                                         cl_co=1, co=1, exact_add=1)
    sc = f1_score(y_true, new_oof_pred)
    
    print(' Score {0}'.format(sc))
    
    # Stop as soon as the average number of items per cluster reaches 2.82.
    if (len(data) / len(clusters)) >= 2.82:
        break

In [None]:
# Score of the predictions produced by the last clustering iteration.
print('OOF score with post processing {0}'.format(f1_score(y_true, new_oof_pred)))

In [None]:
# Drop the training-stage objects (including `data`, which is reloaded for inference)
# before the inference part starts.
del y_true, oof_pred, new_oof_pred, oof_probs, clusters, backmap, cluster_candidates, cluster_distances, orig_dist, data
gc.collect()

### Inference for test

In [None]:
# `test` is defined outside this section; it is formatted into the CSV file name here
# and passed as the images folder prefix to the embedding functions below.
data = pd.read_csv('../input/shopee-product-matching/{0}.csv'.format(test))

In [None]:
%%time
# Same embedding steps and cutoffs as in the train part, applied to the inference data.
image_embed, image_pred, img_points = get_effnet_embed(data, co=0.7, images_path=test)
ml_embed, ml_pred, ml_points = get_multilang_embed(data, co=0.6)
id_embed, id_pred, id_points = get_id_embed(data, co=0.6)

clip_img_embed, clip_text_embed, clip_img_pred, \
    clip_text_pred, clip_img_points, clip_text_points = get_clip_embed(data, 0.34, 0.34, test)

tfidf_embed, tfidf_preds, tfidf_points = get_tfidf_embed(data,                  
                                        param_list = [
                                            {'lowercase': True, 'ngram_range': (1, 1)}, 
                                            {'lowercase': True, 'ngram_range': (3, 3), 
                                             'analyzer': 'char'},  
                                        ], 
                                        cutoffs=[0.45, 0.45])


# Combine the candidate predictions of all individual models.
total = union_pred(image_pred, 
                   ml_pred, 
                   id_pred, 
                   clip_img_pred, clip_text_pred,
                   *tfidf_preds)

# No folds are passed at inference time, so the third return value is ignored.
left, right, _ = get_paired_indexes(total)

# Feature generator configured exactly like the one used for training.
xgen = FeaturesGenerator(get_features, texts=data['title'], 
                         
                         points_feats_list=[img_points, id_points, ml_points] + tfidf_points, 
                         
                         embed_list=[image_embed, id_embed, ml_embed, 
                                     [clip_img_embed, clip_text_embed]] + tfidf_embed,
                         
                         count_params=[
                            {'lowercase': True, 'ngram_range': (1, 1)}, 
                            {'lowercase': True, 'ngram_range': (3, 3), 
                             'analyzer': 'char'}, 
                        ])

In [None]:
# The per-model candidate lists are already merged into `total`; free them.
del image_pred,  ml_pred, id_pred, tfidf_preds
gc.collect()

In [None]:
%%time
# Score the candidate pairs in batches; skipped entirely when there are no pairs.
if len(left) > 0:
    
    prediction = []
    
    # Features are generated and scored 250000 pairs at a time.
    for batch in xgen.features_generator(left, right, batch_size=250000):
        
        print('Batch shape {0}'.format(batch.shape))
        
        # automl predict
        prediction.append(automl.predict(batch).data[:, 0])
        
        # release the batch before the next one is generated
        del batch
        gc.collect()
        torch.cuda.empty_cache()
    
    # NOTE(review): reflect_prediction is defined outside this section; presumably it
    # makes the scores of (a, b) and (b, a) consistent — confirm.
    prediction = reflect_prediction(np.concatenate(prediction))
    # calibrate predictions: p / (p + 2 * (1 - p)) halves the odds of a match
    # (original note: assume the same amount of positives and twice the amount of negatives)
    prediction = prediction / (prediction + (1 - prediction) * 2)

In [None]:
# Same clustering procedure as on the train data, applied to the inference predictions.
if len(left) > 0:
    pred_index, probs = get_prediction_index(left, right, prob=prediction,
                                             k=data.shape[0], 
                                             cutoff = .1, hard_cutoff=True, exact_add=2)

    # Start with every item as its own cluster.
    orig_dist, clusters, cluster_candidates, cluster_distances = init_clusters(pred_index, probs)

    # The cutoff starts at 1.00 and is lowered by 0.01 per iteration, at most 90 times.
    cutoff = 1.00
    for i in range(90):
        cutoff -= 0.01

        clusters, backmap = update_clusters(clusters, cluster_candidates, 
                                            cluster_distances, cutoff=cutoff, 
                                            max_add=2, max_cl_size=50, 
                                            cutoff_fn=upd_cutoff)
        
        # Neighbour lists have to be rebuilt for the merged clusters.
        cluster_candidates, cluster_distances = update_distances(clusters, backmap, orig_dist)

        print('Cutoff {0} done. N clusters = {1}'.format(round(cutoff, 3), len(clusters)))

        # Stop once the average number of items per cluster reaches 2.6
        # (the train part uses 2.82 for the same check).
        if (len(data) / len(clusters)) >= 2.6:
            break
            
    # Final predictions are built once, from the clusters of the last iteration.
    pred_index = get_pred_from_cluster(clusters, orig_dist, 
                                       cluster_candidates, cluster_distances, 
                                       cl_co=1, co=1, exact_add=1)

else:
    # No candidate pairs at all: every item only matches itself.
    # NOTE: every row of `probs` is the same list object here ([[1.0]] * n).
    pred_index, probs = [[x] for x in range(data.shape[0])], [[1.0]] * data.shape[0]

### Create submission

In [None]:
%%time
# Map positional row index -> posting_id (the index is reset so keys are 0..n-1).
posting_dict = data['posting_id'].reset_index(drop=True).to_dict()

# One space-separated string of posting ids per item.
pred_posting = []
for pp in pred_index:
    pred_posting.append(' '.join([posting_dict[x] for x in pp]))

In [None]:
# Submission table: posting_id plus the predicted matches.
# NOTE: this rebinds `prediction`, which held the pair probabilities in an earlier cell.
prediction = data[['posting_id']].copy()
prediction['matches'] = pred_posting

prediction

In [None]:
# Write the submission file and read it back to display the first rows.
prediction.to_csv('submission.csv',index=False)
sub = pd.read_csv('submission.csv')
sub.head()

In [None]:
%%time
# When DEBUG is set, the metric is computed on the inference data.
# get_y_true is given the same table, so this presumably requires a file with
# labels — TODO confirm.
if DEBUG:   
    y_true = get_y_true(data)
    metric_val = f1_score(y_true, pred_index)
    
    print(metric_val)