In [None]:
# Add the pytorch-image-models source directory to the import path — presumably so that the
# `import timm` in a later cell resolves to this copy (verify against the runtime environment).
import sys
sys.path.append('../input/timm-pytorch-image-models/pytorch-image-models-master')

In [None]:
from transformers import DistilBertModel, DistilBertTokenizer, DistilBertConfig
from transformers import AutoTokenizer, AutoModel, AutoConfig

# Token length every title is padded/truncated to when ShopeeDataset encodes it.
MAX_LEN = 128

# One tokenizer per text model, loaded from local directories. ShopeeDataset encodes each title
# with every tokenizer in this list, in this order.
tokenizers = [DistilBertTokenizer.from_pretrained("/kaggle/input/db-tokenizer", do_lower_case=True),
              AutoTokenizer.from_pretrained("/kaggle/input/db-tokenizer2", do_lower_case=True)]

In [None]:
import os
import cv2
import math
import random
import numpy as np
import pandas as pd
from tqdm import tqdm

import albumentations
from albumentations.pytorch.transforms import ToTensorV2

import torch
import timm
import torch
import torch.nn as nn
from torch.nn import functional as F
from torch.utils.data import Dataset,DataLoader

import gc
import matplotlib.pyplot as plt
import cudf
import cuml
import cupy
from cuml.feature_extraction.text import TfidfVectorizer
from cuml import PCA
from cuml.neighbors import NearestNeighbors

# Config

In [None]:
class CFG:
    """Notebook-wide configuration constants, read as class attributes (never instantiated)."""

    seed = 54  # passed to seed_torch
    # NOTE(review): the next six values are not read by any code visible in this notebook;
    # presumably they are carried over from the training setup — confirm before removing.
    classes = 11014
    scale = 30
    margin = 0.5
    model_name = 'tf_efficientnet_b4'
    fc_dim = 512
    img_size = 512
    batch_size = 20  # DataLoader batch size used in get_embeddings
    num_workers = 4  # DataLoader worker count used in get_embeddings
    # Use the GPU when torch can see one, otherwise fall back to the CPU.
    device = 'cuda' if torch.cuda.is_available() else 'cpu'

# Utils

In [None]:
import codecs
from unicodedata import normalize


# Number of train.csv rows read_dataset takes for each of its two frames when it substitutes
# train data for a three-row test.csv.
UNIT_TEST_SIZE = 256


def fix_encoding(x):
    """Decode backslash escape sequences in ``x`` and return the text in NFD form.

    ``x`` is run through ``codecs.escape_decode``, the resulting bytes are decoded as UTF-8,
    and the string is Unicode-normalised to NFD.

    NOTE(review): 'hex' is passed as the second positional argument of ``escape_decode``;
    that slot looks like the error-handler name rather than an encoding — confirm intent.
    """
    raw_bytes, _consumed = codecs.escape_decode(x, 'hex')
    text = raw_bytes.decode("utf-8")
    return normalize("NFD", text)

def read_dataset():
    """Load the frame to predict on and the reference ("fp") frame.

    Returns
    -------
    (df, fp_df) : tuple of pandas.DataFrame
        ``df`` is test.csv with ``path == "test"``. ``fp_df`` is train.csv with
        ``path == "train"``. When test.csv has exactly three rows (presumably the public
        placeholder file — confirm), ``df`` becomes the first ``UNIT_TEST_SIZE`` rows of
        train.csv and ``fp_df`` the last ``UNIT_TEST_SIZE`` rows, both without ``label_group``.
        Titles in both frames are passed through ``fix_encoding``.
    """
    df = pd.read_csv('../input/shopee-product-matching/test.csv')
    df["path"] = "test"
    # train.csv is parsed once and reused for both frames below (it used to be re-read
    # up to three times).
    train_df = pd.read_csv('../input/shopee-product-matching/train.csv')
    fp_df = train_df

    if df.shape[0] == 3:
        # Three-row test set: run the pipeline on disjoint head/tail slices of train instead.
        df = train_df.head(UNIT_TEST_SIZE).drop("label_group", axis=1)
        df["path"] = "train"
        fp_df = train_df.tail(UNIT_TEST_SIZE).drop("label_group", axis=1)

    df["title"] = df["title"].apply(fix_encoding)
    fp_df["title"] = fp_df["title"].apply(fix_encoding)
    fp_df["path"] = "train"
    return df, fp_df

# Load both frames once for the rest of the notebook and display their shapes.
df, fp_df = read_dataset()
df.shape, fp_df.shape

In [None]:
def seed_torch(seed=42):
    """Seed every random number generator this notebook touches.

    Sets ``PYTHONHASHSEED``, seeds Python's ``random``, NumPy, torch and torch's CUDA
    generator with ``seed``, and turns on cuDNN's deterministic mode.
    """
    os.environ['PYTHONHASHSEED'] = str(seed)
    seeders = (random.seed, np.random.seed, torch.manual_seed, torch.cuda.manual_seed)
    for apply_seed in seeders:
        apply_seed(seed)
    torch.backends.cudnn.deterministic = True
    
# Seed all generators with the configured seed before any model code runs.
seed_torch(CFG.seed)

# Image Predictions

In [None]:
# timm architecture names and the checkpoint files loaded into them. The two lists are parallel:
# get_embeddings builds backbone i and loads checkpoint i into it.
img_backbones = ["swin_base_patch4_window12_384", 
                 'tf_efficientnet_b4', 
                 "vit_base_resnet50_384"]
img_model_paths = ['../input/shopee-img-models/img_model_i15.pth', 
                   '../input/shopee-img-models/img_model_i04.pth',
                   '../input/shopee-img-models/img_model_i11.pth']


class ShopeeModel(nn.Module):
    """Image embedding network built on a timm backbone.

    The backbone's output layer is replaced by an identity; its features then go through
    dropout, a linear projection to ``fc_dim``, batch normalisation and L2 normalisation.

    Parameters
    ----------
    model_name : str
        timm architecture name. "tf_efficientnet_b4" is handled through its ``classifier``
        attribute and pooled in ``forward``; every other name through its ``head`` attribute.
    pretrained : bool
        Forwarded to ``timm.create_model``.
    fc_dim : int
        Size of the returned embedding.
    """

    def __init__(self, model_name, pretrained, fc_dim=512):
        super().__init__()
        self.model_name = model_name
        self.backbone = timm.create_model(model_name, pretrained=pretrained)

        # Attribute holding the backbone's own output layer; it is swapped for an identity
        # so the backbone returns features instead of class scores.
        head_attr = "classifier" if model_name == "tf_efficientnet_b4" else "head"
        in_features = getattr(self.backbone, head_attr).in_features
        setattr(self.backbone, head_attr, nn.Identity())

        self.backbone.global_pool = nn.Identity()
        self.pooling = nn.AdaptiveAvgPool2d(1)

        self.dropout = nn.Dropout(p=0.1)
        self.classifier = nn.Linear(in_features, fc_dim)
        self.bn = nn.BatchNorm1d(fc_dim)
        self._init_params()
        self.vector_size = fc_dim

    def _init_params(self):
        """Xavier-normal projection weights, zero biases, unit batch-norm scale."""
        nn.init.xavier_normal_(self.classifier.weight)
        nn.init.zeros_(self.classifier.bias)
        nn.init.ones_(self.bn.weight)
        nn.init.zeros_(self.bn.bias)

    def forward(self, x):
        """Return one L2-normalised embedding row per sample in ``x``."""
        n_samples = x.shape[0]
        feats = self.backbone(x)
        if self.model_name == "tf_efficientnet_b4":
            # Only this backbone has its output pooled here before flattening.
            feats = self.pooling(feats)
        feats = feats.view(n_samples, -1)

        projected = self.classifier(self.dropout(feats))
        return F.normalize(self.bn(projected))

In [None]:
from transformers import DistilBertModel, DistilBertTokenizer


class Text2Vec(nn.Module):
    """Text embedding head on top of a BERT-style encoder.

    Takes position 0 along the sequence axis of the encoder's first output, passes it through
    batch norm, dropout and a 768 -> 256 linear layer, and L2-normalises the result.

    Parameters
    ----------
    bert : callable
        Called as ``bert(ids, mask)``; element 0 of its return value is indexed as
        ``[:, 0, :]`` and must have 768 features on the last axis.
    """

    def __init__(self, bert):
        super().__init__()
        projection_layers = [nn.BatchNorm1d(768), nn.Dropout(0.2), nn.Linear(768, 256)]
        self.top = nn.Sequential(*projection_layers)
        self.bert = bert

    def forward(self, ids, mask):
        """Return one L2-normalised 256-dimensional row per input sequence."""
        hidden_states = self.bert(ids, mask)[0]
        first_token = hidden_states[:, 0, :]
        return F.normalize(self.top(first_token))

    
class ShopeeDataset(Dataset):
    """Dataset yielding, per posting, its image at two resolutions and its tokenised title.

    Parameters
    ----------
    df : pandas.DataFrame
        Needs the columns ``path``, ``image`` and ``title``; the index is reset.

    Each item is a tuple ``(img_512, img_384, ids_0, mask_0, ids_1, mask_1, ...)``: one float
    image tensor (channels first, scaled by 1/255) per entry of ``img_sizes``, followed by an
    ``input_ids`` / ``attention_mask`` pair of LongTensors per tokenizer.
    """

    def __init__(self, df):
        super().__init__()
        self.df = df.reset_index(drop=True)
        # Target sizes handed to cv2.resize; one image tensor is produced per entry.
        self.img_sizes = [(512, 512), (384, 384)]
        # Captured from the notebook-level `tokenizers` list at construction time.
        self.tokenizers = tokenizers

    def __getitem__(self, index):
        row = self.df.iloc[index]

        img_path = f"/kaggle/input/shopee-product-matching/{row.path}_images/{row.image}"
        img = cv2.imread(img_path)
        if img is None:
            # cv2.imread signals an unreadable file by returning None; fail here with the
            # offending path instead of with an opaque error inside cv2.resize.
            raise FileNotFoundError(f"Could not read image: {img_path}")

        outs = [torch.FloatTensor(cv2.resize(img, size)).permute(2, 0, 1)/255.0 for size in self.img_sizes]

        # Iterate the instance's own tokenizers (previously the loop length came from the
        # module-level list, which could disagree with self.tokenizers).
        for tokenizer in self.tokenizers:
            inputs = tokenizer.encode_plus(
                row.title,
                None,
                add_special_tokens=True,
                max_length=MAX_LEN,
                padding="max_length",
                return_token_type_ids=True,
                truncation=True
            )
            outs.append(torch.LongTensor(inputs['input_ids']))
            outs.append(torch.LongTensor(inputs['attention_mask']))
        return tuple(outs)

    def __len__(self):
        return self.df.shape[0]

In [None]:
def get_embeddings(df):
    """Embed every row of ``df`` with the three image models and the two text models.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame accepted by ``ShopeeDataset`` (columns ``path``, ``image``, ``title``).

    Returns
    -------
    (V_img, V_text) : tuple of numpy.ndarray
        ``V_img`` is the three image embeddings concatenated along axis 1 and divided by
        sqrt(3); ``V_text`` is the two text embeddings concatenated and divided by sqrt(2).
        Each individual embedding is L2-normalised by its model, so the scaling keeps the
        concatenated rows at unit length.

    All models and inputs are placed on ``CFG.device`` (the text models and inputs used to be
    hard-coded to ``.cuda()`` while the image models followed ``CFG.device``). Checkpoints are
    loaded with ``map_location="cpu"`` and the model is moved afterwards.
    """
    device = CFG.device

    # --- text models -------------------------------------------------------------------
    conf = AutoConfig.from_pretrained("/kaggle/input/db-tokenizer2/config.json")
    text2vecs = [Text2Vec(DistilBertModel(DistilBertConfig())), Text2Vec(DistilBertModel(conf))]
    text_model_paths = ["/kaggle/input/shopee-text-models/text_model_t08full_0.pth",
                        "/kaggle/input/shopee-text-models/text_model_t08full_1.pth"]
    for i, path in enumerate(text_model_paths):
        text2vecs[i].load_state_dict(torch.load(path, map_location="cpu"))
        text2vecs[i] = text2vecs[i].to(device)
        text2vecs[i].eval()

    # --- image models ------------------------------------------------------------------
    models = []
    for backbone_name, path in zip(img_backbones, img_model_paths):
        img_model = ShopeeModel(backbone_name, pretrained=False)
        img_model.load_state_dict(torch.load(path, map_location="cpu"))
        img_model = img_model.to(device)
        img_model.eval()
        models.append(img_model)

    image_dataset = ShopeeDataset(df)
    image_loader = torch.utils.data.DataLoader(
        image_dataset,
        batch_size=CFG.batch_size,
        num_workers=CFG.num_workers
    )

    # One list of per-batch arrays per model, in the order: img1, img2, img3, text1, text2.
    per_model_batches = [[] for _ in range(5)]
    with torch.no_grad():
        for x in tqdm(image_loader):
            # Batch layout from ShopeeDataset: [0] 512px image, [1] 384px image,
            # [2]/[3] ids and mask of tokenizer 0, [4]/[5] ids and mask of tokenizer 1.
            inputs = [inp.to(device) for inp in x]
            vectors = (
                models[0](inputs[1]),
                models[1](inputs[0]),
                models[2](inputs[1]),
                text2vecs[0](inputs[2], inputs[3]),
                text2vecs[1](inputs[4], inputs[5]),
            )
            for store, vec in zip(per_model_batches, vectors):
                store.append(vec.detach().cpu().numpy().astype(np.float32))

    V_img1, V_img2, V_img3, V_text1, V_text2 = [np.concatenate(store) for store in per_model_batches]

    return np.concatenate([V_img1, V_img2, V_img3], axis=1)/np.sqrt(3), np.concatenate([V_text1, V_text2], axis=1)/np.sqrt(2)

In [None]:
# Image and text embeddings of the frame being predicted on; display their shapes.
V_img, V_text = get_embeddings(df)
V_img.shape, V_text.shape

In [None]:
# Same embeddings for the reference ("fp") frame; display their shapes.
V_img_fp, V_text_fp = get_embeddings(fp_df)
V_img_fp.shape, V_text_fp.shape

In [None]:
def db_aug(V):
    """Replace every row of ``V`` by a weighted average of its two nearest neighbours.

    A 2-nearest-neighbour search with cosine metric is run on ``V`` against itself. Each
    neighbour gets the weight ``sqrt(clip(2 - distance, 0, 2))`` and the returned row is the
    weight-normalised sum of the two neighbour rows. The first neighbour is presumably the
    row itself (distance 0) — not enforced here.
    """
    knn = NearestNeighbors(n_neighbors=2, metric="cosine")
    knn.fit(V)
    nbr_dist, nbr_idx = knn.kneighbors(V)

    weights = np.power(np.clip(2.0 - nbr_dist, 0, 2.0), 0.5)

    first_term = weights[:, 0, None]*V[nbr_idx[:, 0]]
    second_term = weights[:, 1, None]*V[nbr_idx[:, 1]]
    weight_total = weights.sum(axis=1)[:, None]

    return (first_term + second_term)/weight_total


# Combined embedding: image and text vectors, each scaled by 1/sqrt(2), concatenated along
# axis 1 and then blended with their nearest neighbour by db_aug.
V = db_aug(np.concatenate([V_img/np.sqrt(2), V_text/np.sqrt(2)], axis=1))
V.shape

In [None]:
# 50-nearest-neighbour search (cosine metric) of the combined embeddings against themselves.
# `distances` and `indices` are consumed by the candidate-pair cell further down.
model = NearestNeighbors(n_neighbors=50, metric="cosine")
model.fit(V)
distances, indices = model.kneighbors(V)

In [None]:
def get_min_dist(V, V_fp, bs=256):
    """For each row of ``V``, return 1 minus its largest dot product with a column of ``V_fp``.

    Parameters
    ----------
    V : numpy.ndarray, shape (n, d)
        Query vectors, one per row.
    V_fp : numpy.ndarray, shape (d, m)
        Reference vectors, one per COLUMN (callers pass the transposed reference matrix).
    bs : int, default 256
        Number of rows of ``V`` multiplied at a time, which bounds the size of the
        intermediate (bs, m) product.

    Returns
    -------
    numpy.ndarray, shape (n,)
        ``1 - max_j (V[i] . V_fp[:, j])``. When the rows/columns are unit length this is the
        smallest cosine distance to any reference vector. An empty ``V`` yields an empty
        array instead of the ``np.concatenate`` error raised previously.
    """
    n_rows = V.shape[0]
    if n_rows == 0:
        return np.empty(0, dtype=np.result_type(V, V_fp))

    d = []
    for begin in tqdm(range(0, n_rows, bs)):
        # Slicing past the end is clamped, so the last chunk may simply be shorter.
        d.append(np.dot(V[begin:begin + bs], V_fp).max(axis=1))

    return 1 - np.concatenate(d)


# Per posting: distance to its closest reference ("fp") posting, separately for the image and
# the text embeddings. The reference matrices are transposed because get_min_dist expects one
# reference vector per column.
D_img_fp_min = get_min_dist(V_img, V_img_fp.T)
D_text_fp_min = get_min_dist(V_text, V_text_fp.T)

D_img_fp_min.shape, D_text_fp_min.shape

In [None]:
# Candidate-pair table: for every posting i keep the neighbours j from the 50-NN search whose
# combined-embedding distance is below 0.37, and record pairwise and per-posting features.
postings = df["posting_id"].values
res_df = []

for i in tqdm(range(df.shape[0])):
    # dix: positions inside row i's neighbour list that pass the cut-off; ix: their row indices.
    dix = np.where(distances[i] < 0.37)[0]
    ix = indices[i][dix]
    img_fp_min = D_img_fp_min[i]
    text_fp_min = D_text_fp_min[i]
    
    for index, j in enumerate(ix):
        # The *_min2 values are the same closest-reference distances, taken for neighbour j.
        img_fp_min2 = D_img_fp_min[j]
        text_fp_min2 = D_text_fp_min[j]
        # img_dist / text_dist: 1 minus the dot product of the two postings' embeddings.
        res_df.append({"posting_id": postings[i], "matches": postings[j], "dist": distances[i, dix[index]], 
                       "img_dist": 1 - (V_img[i]*V_img[j]).sum(), "text_dist": 1 - (V_text[i]*V_text[j]).sum(),
                       "img_fp_min": img_fp_min, "text_fp_min": text_fp_min, "img_fp_min2": img_fp_min2, "text_fp_min2": text_fp_min2})
        
# Build the frame once from the collected records.
res_df = pd.DataFrame(res_df)
print(res_df.shape)
res_df.head()

In [None]:
# Rank of each candidate within its posting's candidate list by combined distance
# (pandas' default ranking: ascending, ties averaged).
res_df["dist_rank"] = res_df.groupby("posting_id")["dist"].rank()

In [None]:
# NOTE: this import rebinds the name TfidfVectorizer, which an earlier cell imported from
# cuml.feature_extraction.text; from here on the name refers to the scikit-learn class.
from sklearn.feature_extraction.text import TfidfVectorizer

# Word-level vectoriser: unigrams with binary term counts, fitted on the titles of `df`.
tfidf = TfidfVectorizer(ngram_range=(1,1), binary=True)
tfidf.fit(df["title"])

# Character-level vectoriser: 5-character n-grams, fitted on the same titles.
tfidf2 = TfidfVectorizer(analyzer="char", ngram_range=(5, 5))
tfidf2.fit(df["title"])

In [None]:
# Keep each unordered pair once: posting_id < matches drops the mirrored row and self-pairs.
preds_df = res_df[res_df["posting_id"] < res_df["matches"]]
# Attach both titles: title_x belongs to posting_id, title_y to matches.
preds_df = preds_df.merge(df[["posting_id", "title"]], on="posting_id", how="left")
preds_df = preds_df.merge(df[["posting_id", "title"]].rename(columns={"posting_id": "matches"}), on="matches", how="left")


# Title similarity per pair: row-wise sum of the element-wise product of the two TF-IDF vectors
# (a cosine similarity if the vectoriser L2-normalises rows, its scikit-learn default — verify).
preds_df["cos_sim"] = tfidf.transform(preds_df["title_x"]).multiply(tfidf.transform(preds_df["title_y"])).sum(axis=1)
preds_df["cos_sim2"] = tfidf2.transform(preds_df["title_x"]).multiply(tfidf2.transform(preds_df["title_y"])).sum(axis=1)

preds_df.shape

In [None]:
import xgboost as xgb

# Feature columns fed to both XGBoost models, in this order.
features = ["img_dist", "text_dist", "dist", "dist_rank", "cos_sim", "cos_sim2",
            "img_fp_min", "text_fp_min", "img_fp_min2", "text_fp_min2"]

# Model 1 scores the raw feature values.
xgb_model = xgb.XGBClassifier()
xgb_model.load_model("/kaggle/input/shopee-xgb-models/xgb_821_new.json")
pred_raw = xgb_model.predict_proba(preds_df[features])[:, 1]

# Model 2 scores the column-wise percentile ranks of the same features. The ranks are kept in
# a separate frame instead of overwriting preds_df[features]: the overwrite made this cell
# non-idempotent (a second run would hand ranked values to model 1).
ranked_features = preds_df[features].rank(pct=True, axis=0)
xgb_model = xgb.XGBClassifier()
xgb_model.load_model("/kaggle/input/shopee-xgb-models/xgb_821b_new.json")
pred_ranked = xgb_model.predict_proba(ranked_features)[:, 1]

# Final pair score: mean of the two models' positive-class probabilities.
preds_df["pred"] = pred_raw/2 + pred_ranked/2

preds_df.head()

In [None]:
def agglomerative_clustering(preds_df, single_link_threshold=0.30, group_link_threshold=0.70, group_merge_threshold=0.80):
    """Greedily group postings from scored pairs, processing pairs from highest score down.

    Parameters
    ----------
    preds_df : pandas.DataFrame
        One row per candidate pair with columns ``posting_id``, ``matches`` and ``pred``.
    single_link_threshold : float
        Pairs scoring at or below this are ignored. Above it, two still-ungrouped postings
        start a new group.
    group_link_threshold : float
        Minimum score (exclusive) for an ungrouped posting to join its partner's group.
    group_merge_threshold : float
        Minimum score (exclusive) for two different existing groups to be merged.

    Returns
    -------
    pandas.DataFrame
        Columns ``posting_id`` and ``matches``: every ordered pair of distinct postings that
        ended up in the same group. The columns are present even when no pair qualifies.

    Prints the number of postings that were assigned to a group.
    """
    groups = dict()         # posting_id -> group id
    group_members = dict()  # group id -> set of posting_ids in that group

    gix = 0
    ordered = preds_df.sort_values("pred", ascending=False)
    # Iterate the three needed columns directly rather than building a Series per row.
    scored_pairs = zip(ordered["posting_id"], ordered["matches"], ordered["pred"])
    for posting_id, match_id, pred in tqdm(scored_pairs, total=preds_df.shape[0]):
        if not (pred > single_link_threshold):
            continue

        g1 = groups.get(posting_id)
        g2 = groups.get(match_id)

        if g1 is None and g2 is None:
            groups[posting_id] = gix
            groups[match_id] = gix
            group_members[gix] = {posting_id, match_id}
            gix += 1
        elif g1 is None:
            if pred > group_link_threshold:
                groups[posting_id] = g2
                group_members[g2].add(posting_id)
        elif g2 is None:
            if pred > group_link_threshold:
                groups[match_id] = g1
                group_members[g1].add(match_id)
        elif (g1 != g2) and (pred > group_merge_threshold):
            # Fold group g2 into g1. Only the absorbed members need relabelling, so walk that
            # set instead of scanning every entry of `groups`.
            absorbed = group_members.pop(g2)
            for member in absorbed:
                groups[member] = g1
            group_members[g1].update(absorbed)

    print(len(groups))

    out_rows = [
        {"posting_id": k, "matches": k2}
        for k, v in groups.items()
        for k2 in group_members[v]
        if k != k2
    ]

    return pd.DataFrame(out_rows, columns=["posting_id", "matches"])

# Group the scored pairs with the default thresholds; out_df has one row per ordered match.
out_df = agglomerative_clustering(preds_df)
out_df.shape

In [None]:
# Add a (posting, posting) row for every posting in df, so each one lists itself and postings
# without any grouped match still appear in the output.
same_df = df[["posting_id"]].copy()
same_df["matches"] = same_df["posting_id"].values

# pd.concat replaces DataFrame.append, which was deprecated in pandas 1.4 and removed in 2.0;
# with default arguments it stacks the rows the same way.
out_df = pd.concat([out_df, same_df])
out_df.shape

In [None]:
# Collapse to one row per posting: collect its matches into a list, then join the list into a
# single space-separated string.
out_df = out_df.groupby("posting_id")["matches"].agg(list).reset_index()
out_df["matches"] = out_df["matches"].apply(lambda x: " ".join(x))
out_df.head()

In [None]:
# Write the two submission columns to submission.csv without the index.
out_df.to_csv('submission.csv', index=False, columns=['posting_id', 'matches'])