In [None]:
%%capture
!pip install ../input/faiss-163/faiss_gpu-1.6.3-cp37-cp37m-manylinux2010_x86_64.whl
!pip install ../input/shopee-libs/editdistance-0.5.3-cp37-cp37m-manylinux1_x86_64.whl

In [None]:
%%writefile lyk_config.py

# Number of nearest neighbours kept per item: cells that import this module pass it
# to `index.search(...)` and use it to slice the saved neighbour arrays.
k = 50
# Confidence threshold. It is imported by later cells but not used in the part of the
# notebook visible here -- presumably consumed further down; TODO confirm.
conf_th = 0.7

import pandas as pd
from pathlib import Path

# True when test.csv holds exactly 3 rows.
# NOTE(review): presumably this identifies the small placeholder test set seen outside
# the scoring run -- confirm.
DEBUG = len(pd.read_csv('../input/shopee-product-matching/test.csv')) == 3

def load_data():
    """Load the item table and the matching image directory.

    When ``DEBUG`` is true the first 1000 rows of ``train.csv`` are used together
    with the training image directory; otherwise the full ``test.csv`` and the
    test image directory are used.

    Returns:
        tuple: ``(df, img_dir)`` where ``df`` is a DataFrame with the columns
        ``posting_id``, ``image`` and ``title`` and ``img_dir`` is a ``Path``.
    """
    columns = ['posting_id', 'image', 'title']
    if DEBUG:
        df = pd.read_csv('../input/shopee-product-matching/train.csv', nrows=1000, usecols=columns)
        img_dir = Path('../input/shopee-product-matching/train_images/')
    else:
        df = pd.read_csv('../input/shopee-product-matching/test.csv', usecols=columns)
        img_dir = Path('../input/shopee-product-matching/test_images/')
    return df, img_dir

# Image similarity, Multi-modal similarity

In [None]:
%%python
from lyk_config import k, conf_th, DEBUG, load_data

import sys
sys.path.append('../input/timm045/')
import timm

from itertools import zip_longest
import json
import math
import gc
import os
from pathlib import Path

import faiss
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.utils.data import DataLoader, Dataset
from torchvision.io import read_image
from torchvision.transforms import Resize, RandomHorizontalFlip, ColorJitter, Normalize, Compose, RandomResizedCrop, CenterCrop, ToTensor

from tqdm import tqdm
from PIL import Image
import joblib
from scipy.sparse import hstack, vstack, csc_matrix, csr_matrix
import editdistance
import networkx as nx
from transformers import BertConfig, BertModel, BertTokenizerFast

# Not referenced in this cell; presumably the number of training label groups -- TODO confirm.
NUM_CLASSES = 11014
# Number of worker processes used by the image DataLoader built in this cell.
NUM_WORKERS = 2
# Not referenced in this cell.
SEED = 0


def gem(x, p=3, eps=1e-6):
    """Generalized-mean pooling over the last two dimensions of ``x``.

    Values are clamped from below at ``eps``, raised to the power ``p``, averaged
    over a window covering the full last two dimensions, and the ``p``-th root of
    that average is returned (the pooled dimensions are kept with size 1).
    """
    window = (x.size(-2), x.size(-1))
    powered = x.clamp(min=eps).pow(p)
    pooled = F.avg_pool2d(powered, window)
    return pooled.pow(1./p)

    
class ShopeeNet(nn.Module):
    """Image embedding network built on top of a backbone model.

    The backbone's classifier is removed. ``extract_feat`` pools the backbone
    feature map with ``gem``, projects it with a linear layer and applies batch
    normalisation. When the backbone returns a tuple, its first two elements are
    averaged and only batch-normalised.

    ``num_classes``, ``s`` and ``margin`` are accepted but not used by the code
    in this class.
    """

    def __init__(self,
                 backbone,
                 num_classes,
                 fc_dim=512,
                 s=30, margin=0.5, p=3):
        super().__init__()

        backbone.reset_classifier(num_classes=0)  # drop the classification head
        self.backbone = backbone

        self.fc = nn.Linear(backbone.num_features, fc_dim)
        self.bn = nn.BatchNorm1d(fc_dim)
        self.p = p  # exponent handed to gem() in extract_feat
        self._init_params()

    def _init_params(self):
        """Xavier-normal init for the projection weight, constants for the rest."""
        nn.init.xavier_normal_(self.fc.weight)
        for tensor, value in ((self.fc.bias, 0), (self.bn.weight, 1), (self.bn.bias, 0)):
            nn.init.constant_(tensor, value)

    def extract_feat(self, x):
        """Return the batch-normalised embedding for the image batch ``x``."""
        feats = self.backbone.forward_features(x)
        if isinstance(feats, tuple):
            # Tuple output: average the first two elements, skip pooling and fc.
            return self.bn((feats[0] + feats[1]) / 2)
        pooled = gem(feats, p=self.p).view(x.shape[0], -1)
        return self.bn(self.fc(pooled))

    def forward(self, x, label):
        """Return ``(loss_module(feat, label), feat)``.

        NOTE(review): ``self.loss_module`` is not assigned anywhere in this class,
        so this method only works if that attribute is attached elsewhere.
        """
        feat = self.extract_feat(x)
        return self.loss_module(feat, label), feat


class ShopeeDataset(Dataset):
    """Dataset yielding one image plus metadata per row of ``df``.

    Each item is ``(img, title, h, w, st_size)``: the decoded image (after the
    optional ``transform``), the row's title, the height and width of the image
    as decoded (before any transform) and the file size in bytes reported by
    ``stat()``.
    """

    def __init__(self, df, img_dir, transform=None):
        self.df, self.img_dir, self.transform = df, img_dir, transform

    def __len__(self):
        return len(self.df)

    def __getitem__(self, index):
        row = self.df.iloc[index]
        img_path = self.img_dir / row['image']

        img = read_image(str(img_path))
        _, h, w = img.shape  # size is recorded before the transform runs
        st_size = img_path.stat().st_size

        if self.transform is not None:
            img = self.transform(img)
        return img, row['title'], h, w, st_size


class MultiModalNet(nn.Module):
    """Joint image + title embedding network.

    The image goes through ``backbone`` and ``gem`` pooling, the title through
    ``tokenizer`` and ``bert_model`` (token states averaged over the sequence
    dimension). Both vectors are concatenated, projected by a linear layer and
    batch-normalised.

    ``num_classes``, ``s``, ``margin`` and ``loss`` are accepted but not used by
    the code in this class.
    """

    def __init__(self,
                 backbone,
                 bert_model,
                 num_classes,
                 tokenizer,
                 max_len=32,
                 fc_dim=512,
                 s=30, margin=0.5, p=3, loss='ArcMarginProduct'):
        super().__init__()

        self.backbone = backbone
        self.backbone.reset_classifier(num_classes=0)  # remove classifier

        self.bert_model = bert_model
        self.tokenizer = tokenizer
        self.max_len = max_len
        self.fc = nn.Linear(self.bert_model.config.hidden_size + self.backbone.num_features, fc_dim)
        self.bn = nn.BatchNorm1d(fc_dim)
        self._init_params()
        self.p = p  # exponent handed to gem() in extract_feat

    def _init_params(self):
        """Xavier-normal init for the projection weight, constants for the rest."""
        nn.init.xavier_normal_(self.fc.weight)
        nn.init.constant_(self.fc.bias, 0)
        nn.init.constant_(self.bn.weight, 1)
        nn.init.constant_(self.bn.bias, 0)

    def extract_feat(self, img, title):
        """Return the fused embedding for an image batch and its titles.

        Args:
            img: image batch tensor; token tensors are created on its device.
            title: titles passed to ``self.tokenizer``, one per image.
        """
        batch_size = img.shape[0]
        # Follow the image batch instead of assuming 'cuda', so the module also
        # works on CPU or on a non-default GPU.
        device = img.device

        img = self.backbone.forward_features(img)
        img = gem(img, p=self.p).view(batch_size, -1)

        tokenizer_output = self.tokenizer(title, truncation=True, padding=True, max_length=self.max_len)
        input_ids = torch.LongTensor(tokenizer_output['input_ids']).to(device)
        token_type_ids = torch.LongTensor(tokenizer_output['token_type_ids']).to(device)
        attention_mask = torch.LongTensor(tokenizer_output['attention_mask']).to(device)
        title = self.bert_model(input_ids=input_ids, token_type_ids=token_type_ids, attention_mask=attention_mask)
        # Plain mean over every token position, padding included. Left as-is on
        # purpose: switching to a masked mean would change the features.
        title = title.last_hidden_state.mean(dim=1)

        x = torch.cat([img, title], dim=1)
        x = self.fc(x)
        x = self.bn(x)
        return x


####

# Item table and image directory as selected by lyk_config.load_data().
df, img_dir = load_data()
    
###

# Each checkpoint is indexed below by 'params' (hyper-parameters) and, when the
# models are built, by 'model' (the state dict).
checkpoint1 = torch.load('../input/shopee/v45.pth')
checkpoint2 = torch.load('../input/shopee/v34.pth')
checkpoint3 = torch.load('../input/shopee/v79.pth')
params1 = checkpoint1['params']
params2 = checkpoint2['params']
params3 = checkpoint3['params']

# Per-image preprocessing: resize to test_size + 32, centre-crop to test_size,
# then normalise each channel with the given mean/std.
# NOTE(review): only params1['test_size'] is used, so all three models receive
# the same input resolution.
transform = Compose([
    Resize(size=params1['test_size'] + 32, interpolation=Image.BICUBIC),
    CenterCrop((params1['test_size'], params1['test_size'])),
    Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]),
])
# transform=None: the dataset returns raw decoded images; `transform` is applied
# later, per image, inside the extraction loop.
dataset = ShopeeDataset(df=df, img_dir=img_dir, transform=None)
# collate_fn=lambda x: x keeps every batch as a plain list of samples
# (presumably because raw images differ in size and cannot be stacked).
data_loader = DataLoader(dataset, batch_size=8, shuffle=False,
                         drop_last=False, pin_memory=True, num_workers=NUM_WORKERS, collate_fn=lambda x: x)

# Image model 1: backbone architecture and fc_dim come from checkpoint 1.
# NOTE(review): strict=False means checkpoint keys that do not match the module
# (and module keys missing from the checkpoint) are not reported as errors.
backbone = timm.create_model(model_name=params1['backbone'], pretrained=False)
model1 = ShopeeNet(backbone, num_classes=0, fc_dim=params1['fc_dim'])
model1 = model1.to('cuda')
model1.load_state_dict(checkpoint1['model'], strict=False)
model1.train(False)
# Override the pooling exponent with the value stored for evaluation.
model1.p = params1['p_eval']

# Image model 2: same recipe with checkpoint 2.
backbone = timm.create_model(model_name=params2['backbone'], pretrained=False)
model2 = ShopeeNet(backbone, num_classes=0, fc_dim=params2['fc_dim'])
model2 = model2.to('cuda')
model2.load_state_dict(checkpoint2['model'], strict=False)
model2.train(False)
model2.p = params2['p_eval']

# Multi-modal model: image backbone plus a BERT built from a config file only.
# The BERT weights are presumably supplied by checkpoint3['model'] -- TODO confirm.
backbone = timm.create_model(model_name=params3['backbone'], pretrained=False)
tokenizer = BertTokenizerFast(vocab_file='../input/bert-indo/vocab.txt')
bert_config = BertConfig.from_json_file('../input/bert-indo/config.json')
bert_model = BertModel(bert_config)
model3 = MultiModalNet(backbone, bert_model, num_classes=0, tokenizer=tokenizer, max_len=params3['max_len'],
                       fc_dim=params3['fc_dim'], s=params3['s'], margin=params3['margin'], loss=params3['loss'])
model3 = model3.to('cuda')
model3.load_state_dict(checkpoint3['model'], strict=False)
model3.train(False)
model3.p = params3['p_eval']

# Per-batch outputs accumulated over the whole dataset.
img_feats1 = []
img_feats2 = []
mm_feats = []
img_hs = []
img_ws = []
st_sizes = []
for batch in tqdm(data_loader, total=len(data_loader), miniters=None, ncols=55):
    # `batch` is a list of (img, title, h, w, st_size) samples; transpose it into
    # one tuple per field.
    img, title, h, w, st_size = list(zip(*batch))
    # Move each image to the GPU, scale to [0, 1] by dividing by 255, apply
    # `transform`, then stack everything into a single batch tensor.
    img = torch.cat([transform(x.to('cuda').float() / 255)[None] for x in img], axis=0)
    title = list(title)
    with torch.no_grad():
        # The same preprocessed batch is fed to all three models.
        feats_minibatch1 = model1.extract_feat(img)
        img_feats1.append(feats_minibatch1.cpu().numpy())
        feats_minibatch2 = model2.extract_feat(img)
        img_feats2.append(feats_minibatch2.cpu().numpy())
        feats_minibatch3 = model3.extract_feat(img, title)
        mm_feats.append(feats_minibatch3.cpu().numpy())
    img_hs.extend(list(h))
    img_ws.extend(list(w))
    st_sizes.extend(list(st_size))

# Stack the per-batch arrays and L2-normalise every row of each feature set.
img_feats1 = np.concatenate(img_feats1)
img_feats1 /= np.linalg.norm(img_feats1, 2, axis=1, keepdims=True)
img_feats2 = np.concatenate(img_feats2)
img_feats2 /= np.linalg.norm(img_feats2, 2, axis=1, keepdims=True)
mm_feats = np.concatenate(mm_feats)
mm_feats /= np.linalg.norm(mm_feats, 2, axis=1, keepdims=True)

np.save('/tmp/img_feats1', img_feats1)
np.save('/tmp/img_feats2', img_feats2)

# Combined image descriptor: both image embeddings side by side (each with
# weight 1.0), re-normalised to unit length.
img_feats = np.concatenate([
    img_feats1 * 1.0,
    img_feats2 * 1.0,
], axis=1)
img_feats /= np.linalg.norm(img_feats, 2, axis=1, keepdims=True)
###

# Saved for the later cells, which read it back as '/tmp/img_feats.npy'.
np.save('/tmp/img_feats', img_feats)

# Inner-product search of the combined image features against themselves on
# GPU 0. The rows were L2-normalised above, so the scores are cosine similarities.
res = faiss.StandardGpuResources()
index_img = faiss.IndexFlatIP(params1['fc_dim'] + params2['fc_dim'])
index_img = faiss.index_cpu_to_gpu(res, 0, index_img)
index_img.add(img_feats)
similarities_img, indexes_img = index_img.search(img_feats, k)


# Neighbour lists and per-image metadata, stored for later cells.
joblib.dump([similarities_img, indexes_img], '/tmp/lyk_img_data.pkl')
joblib.dump([st_sizes, img_hs, img_ws], '/tmp/lyk_img_meta_data.pkl')

# Same self-search for the multi-modal features.
res = faiss.StandardGpuResources()
index_mm = faiss.IndexFlatIP(params3['fc_dim'])
index_mm = faiss.index_cpu_to_gpu(res, 0, index_mm)
index_mm.add(mm_feats)
similarities_mm, indexes_mm = index_mm.search(mm_feats, k)

joblib.dump([similarities_mm, indexes_mm], '/tmp/lyk_mm_data.pkl')

### for TKM
# Read back by the multi-modal query-expansion cell as '/tmp/mm_feats.npy'.
np.save('/tmp/mm_feats', mm_feats)

### image QE

In [None]:
%%python
import gc
import numpy as np
import faiss

def query_expansion(feats, sims, topk_idx, alpha=0.5, k=2):
    """Replace every feature by a similarity-weighted sum of its top-k neighbours.

    Args:
        feats: 2-D array of features, one row per item.
        sims: 2-D array of similarities; column ``c`` belongs to ``topk_idx[:, c]``.
        topk_idx: 2-D integer array of neighbour row indices into ``feats``.
        alpha: exponent applied to the similarities to obtain the weights.
        k: number of leading neighbours (columns) to combine.

    Returns:
        Array shaped like ``feats`` with the un-normalised weighted sums.
    """
    # A negative similarity raised to a fractional power is NaN and would poison
    # the whole row, so such neighbours get weight 0 instead.
    top_sims = np.clip(sims[:, :k], 0, None)
    weights = np.expand_dims(top_sims ** alpha, axis=-1).astype(np.float32)
    return (feats[topk_idx[:, :k]] * weights).sum(axis=1)

# Combined image features written by the feature-extraction cell.
img_feats = np.load('/tmp/img_feats.npy')

# First pass: inner-product self-search for the 60 nearest neighbours on GPU 0.
res = faiss.StandardGpuResources()
index_img = faiss.IndexFlatIP(img_feats.shape[1])
index_img = faiss.index_cpu_to_gpu(res, 0, index_img)
index_img.add(img_feats)
img_D, img_I = index_img.search(img_feats, 60)

np.save('/tmp/img_D', img_D)
np.save('/tmp/img_I', img_I)

# Drop the first GPU index before the second, twice-as-wide one is built, as the
# BERT and combined query-expansion cells already do.
del index_img
gc.collect()

# Query expansion: blend each feature with its top neighbours, re-normalise,
# append to the original feature and normalise the concatenation again.
img_feats_qe = query_expansion(img_feats, img_D, img_I)
img_feats_qe /= np.linalg.norm(img_feats_qe, 2, axis=1, keepdims=True)

img_feats = np.hstack([img_feats, img_feats_qe])
img_feats /= np.linalg.norm(img_feats, axis=1).reshape((-1, 1))

# Second pass: the same self-search on the expanded features.
index = faiss.IndexFlatIP(img_feats.shape[1])
res = faiss.StandardGpuResources()
index = faiss.index_cpu_to_gpu(res, 0, index)

index.add(img_feats)
img_D, img_I = index.search(img_feats, 60)

np.save('/tmp/img_D_qe', img_D)
np.save('/tmp/img_I_qe', img_I)

print('end')

### Multi-modal QE

In [None]:
%%python
import gc
import numpy as np
import faiss

def query_expansion(feats, sims, topk_idx, alpha=0.5, k=2):
    """Replace every feature by a similarity-weighted sum of its top-k neighbours.

    Args:
        feats: 2-D array of features, one row per item.
        sims: 2-D array of similarities; column ``c`` belongs to ``topk_idx[:, c]``.
        topk_idx: 2-D integer array of neighbour row indices into ``feats``.
        alpha: exponent applied to the similarities to obtain the weights.
        k: number of leading neighbours (columns) to combine.

    Returns:
        Array shaped like ``feats`` with the un-normalised weighted sums.
    """
    # A negative similarity raised to a fractional power is NaN and would poison
    # the whole row, so such neighbours get weight 0 instead.
    top_sims = np.clip(sims[:, :k], 0, None)
    weights = np.expand_dims(top_sims ** alpha, axis=-1).astype(np.float32)
    return (feats[topk_idx[:, :k]] * weights).sum(axis=1)

# Multi-modal features written by the feature-extraction cell.
mm_feats = np.load('/tmp/mm_feats.npy')

# First pass: inner-product self-search for the 60 nearest neighbours on GPU 0.
res = faiss.StandardGpuResources()
index_mm = faiss.IndexFlatIP(mm_feats.shape[1])
index_mm = faiss.index_cpu_to_gpu(res, 0, index_mm)
index_mm.add(mm_feats)
mm_D, mm_I = index_mm.search(mm_feats, 60)

np.save('/tmp/mut_D', mm_D)
np.save('/tmp/mut_I', mm_I)

# Drop the first GPU index before the second, twice-as-wide one is built, as the
# BERT and combined query-expansion cells already do.
del index_mm
gc.collect()

# Query expansion: blend each feature with its top neighbours, re-normalise,
# append to the original feature and normalise the concatenation again.
mm_feats_qe = query_expansion(mm_feats, mm_D, mm_I)
mm_feats_qe /= np.linalg.norm(mm_feats_qe, 2, axis=1, keepdims=True)

mm_feats = np.hstack([mm_feats, mm_feats_qe])
mm_feats /= np.linalg.norm(mm_feats, axis=1).reshape((-1, 1))

# Second pass: the same self-search on the expanded features.
index = faiss.IndexFlatIP(mm_feats.shape[1])
res = faiss.StandardGpuResources()
index = faiss.index_cpu_to_gpu(res, 0, index)

index.add(mm_feats)
mm_D, mm_I = index.search(mm_feats, 60)

np.save('/tmp/mut_D_qe', mm_D)
np.save('/tmp/mut_I_qe', mm_I)

# BERT similarity

In [None]:
%%python
from lyk_config import k, conf_th, DEBUG, load_data
import sys
sys.path.append('../input/timm045/')
import timm

from itertools import zip_longest
import json
import math
import gc
import os
from pathlib import Path

import faiss
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.utils.data import DataLoader, Dataset
from torchvision.io import read_image
from torchvision.transforms import Resize, RandomHorizontalFlip, ColorJitter, Normalize, Compose, RandomResizedCrop, CenterCrop, ToTensor

from tqdm import tqdm
from PIL import Image
import joblib
from scipy.sparse import hstack, vstack, csc_matrix, csr_matrix
import editdistance
import networkx as nx

from transformers import BertConfig, BertModel, BertTokenizerFast

# Not referenced in this cell; presumably the number of training label groups -- TODO confirm.
NUM_CLASSES = 11014
# Number of worker processes used by the title DataLoader built in this cell.
NUM_WORKERS = 2
# Not referenced in this cell.
SEED = 0


def gem(x, p=3, eps=1e-6):
    """Generalized-mean pooling over the last two dimensions of ``x``.

    Clamp at ``eps``, raise to ``p``, average over the full last two dimensions
    and take the ``p``-th root. Not called by the rest of this cell.
    """
    kernel = (x.size(-2), x.size(-1))
    raised = x.clamp(min=eps).pow(p)
    averaged = F.avg_pool2d(raised, kernel)
    return averaged.pow(1./p)


class BertNet(nn.Module):
    """Title embedding network: tokenizer -> BERT -> pooling -> linear -> batch norm.

    ``simple_mean=True`` averages the token states over every position;
    ``simple_mean=False`` averages only positions whose attention mask is 1.

    ``num_classes``, ``s``, ``margin`` and ``loss`` are accepted but not used by
    the code in this class.
    """

    def __init__(self,
                 bert_model,
                 num_classes,
                 tokenizer,
                 max_len=32,
                 fc_dim=512,
                 simple_mean=True,
                 s=30, margin=0.5, p=3, loss='ArcMarginProduct'):
        super().__init__()

        self.bert_model = bert_model
        self.tokenizer = tokenizer
        self.max_len = max_len
        self.fc = nn.Linear(self.bert_model.config.hidden_size, fc_dim)
        self.bn = nn.BatchNorm1d(fc_dim)
        self._init_params()
        self.p = p
        self.simple_mean = simple_mean

    def _init_params(self):
        """Xavier-normal init for the projection weight, constants for the rest."""
        nn.init.xavier_normal_(self.fc.weight)
        nn.init.constant_(self.fc.bias, 0)
        nn.init.constant_(self.bn.weight, 1)
        nn.init.constant_(self.bn.bias, 0)

    def extract_feat(self, x):
        """Return the batch-normalised embedding for the titles in ``x``.

        Token tensors are created on the device that holds this module's
        parameters, so the module works wherever it has been moved.
        """
        device = self.fc.weight.device
        tokenizer_output = self.tokenizer(x, truncation=True, padding=True, max_length=self.max_len)

        bert_inputs = {
            'input_ids': torch.LongTensor(tokenizer_output['input_ids']).to(device),
            'attention_mask': torch.LongTensor(tokenizer_output['attention_mask']).to(device),
        }
        # Some tokenizers do not emit token_type_ids; only pass them when present.
        if 'token_type_ids' in tokenizer_output:
            bert_inputs['token_type_ids'] = torch.LongTensor(tokenizer_output['token_type_ids']).to(device)
        attention_mask = bert_inputs['attention_mask']

        x = self.bert_model(**bert_inputs)
        if self.simple_mean:
            x = x.last_hidden_state.mean(dim=1)
        else:
            # Mean over real tokens only: padded positions are zeroed by the mask.
            x = torch.sum(x.last_hidden_state * attention_mask.unsqueeze(-1), dim=1) / attention_mask.sum(dim=1, keepdim=True)
        x = self.fc(x)
        x = self.bn(x)
        return x


class BertDataset(Dataset):
    """Dataset over the rows of ``df`` that yields titles.

    An item is just the row's title, or ``(title, target)`` when the row has a
    ``'y'`` entry, with ``target`` a ``torch.long`` tensor built from it.
    """

    def __init__(self, df):
        self.df = df

    def __len__(self):
        return len(self.df)

    def __getitem__(self, index):
        row = self.df.iloc[index]
        title = row['title']

        if 'y' not in row.keys():
            return title
        return title, torch.tensor(row['y'], dtype=torch.long)

# Item table as selected by lyk_config.load_data(); img_dir is not used in this cell.
df, img_dir = load_data()

# Three text-model checkpoints; each is indexed below by 'params' and 'model'.
checkpoint = torch.load('../input/shopee/v75.pth')
checkpoint2 = torch.load('../input/shopee/v102.pth')
checkpoint3 = torch.load('../input/shopee/v103.pth')

params_bert = checkpoint['params']
params_bert2 = checkpoint2['params']
params_bert3 = checkpoint3['params']

datasets = {
    'valid': BertDataset(df=df)
}
# Inference batch size is twice the 'batch_size' stored in the first checkpoint;
# shuffle=False keeps features aligned with the rows of df.
data_loaders = {
    'valid': DataLoader(datasets['valid'], batch_size=params_bert['batch_size'] * 2, shuffle=False,
                        drop_last=False, pin_memory=True, num_workers=NUM_WORKERS)
}

# Text model 1: BERT built from the bert-indo config and vocab, pooled with a
# plain mean (simple_mean=True).
# NOTE(review): strict=False means mismatching checkpoint keys are not reported.
tokenizer = BertTokenizerFast(vocab_file='../input/bert-indo/vocab.txt')
bert_config = BertConfig.from_json_file('../input/bert-indo/config.json')
bert_model = BertModel(bert_config)
model = BertNet(bert_model, num_classes=0, tokenizer=tokenizer, max_len=params_bert['max_len'], simple_mean=True,
                fc_dim=params_bert['fc_dim'], s=params_bert['s'], margin=params_bert['margin'], loss=params_bert['loss'])

model = model.to('cuda')
model.load_state_dict(checkpoint['model'], strict=False)
model.train(False)

from transformers import AutoTokenizer, AutoModel, AutoConfig

# Text model 2: built from '../input/bertmultilingual/', masked-mean pooling.
# NOTE(review): max_len, fc_dim, s, margin and loss are read from `params_bert`
# (checkpoint 1), not `params_bert2`, unlike model3 below which uses its own
# params -- confirm this is intentional. `model_name` is assigned but not used.
model_name = params_bert2['model_name']
tokenizer = AutoTokenizer.from_pretrained('../input/bertmultilingual/')
bert_config = AutoConfig.from_pretrained('../input/bertmultilingual/')
bert_model = AutoModel.from_config(bert_config)
model2 = BertNet(bert_model, num_classes=0, tokenizer=tokenizer, max_len=params_bert['max_len'], simple_mean=False,
                 fc_dim=params_bert['fc_dim'], s=params_bert['s'], margin=params_bert['margin'], loss=params_bert['loss'])
model2 = model2.to('cuda')
model2.load_state_dict(checkpoint2['model'], strict=False)
model2.train(False)

#########

# Text model 3: built from '../input/bertxlm/', masked-mean pooling, configured
# from its own checkpoint params.
model_name = params_bert3['model_name']
tokenizer = AutoTokenizer.from_pretrained('../input/bertxlm/')
bert_config = AutoConfig.from_pretrained('../input/bertxlm/')
bert_model = AutoModel.from_config(bert_config)
model3 = BertNet(bert_model, num_classes=0, tokenizer=tokenizer, max_len=params_bert3['max_len'], simple_mean=False,
                 fc_dim=params_bert3['fc_dim'], s=params_bert3['s'], margin=params_bert3['margin'], loss=params_bert3['loss'])
model3 = model3.to('cuda')
model3.load_state_dict(checkpoint3['model'], strict=False)
model3.train(False)

# One list of per-batch feature arrays per text model.
bert_feats1 = []
bert_feats2 = []
bert_feats3 = []
# `title` is one batch of titles; the loop index `i` is not used.
for i, title in tqdm(enumerate(data_loaders['valid']),
                     total=len(data_loaders['valid']), miniters=None, ncols=55):
    with torch.no_grad():
        # The same batch of titles goes through all three models in turn.
        bert_feats_minibatch = model.extract_feat(title)
        bert_feats1.append(bert_feats_minibatch.cpu().numpy())
        bert_feats_minibatch = model2.extract_feat(title)
        bert_feats2.append(bert_feats_minibatch.cpu().numpy())
        bert_feats_minibatch = model3.extract_feat(title)
        bert_feats3.append(bert_feats_minibatch.cpu().numpy())

# Stack the per-batch arrays and L2-normalise every row of each feature set.
bert_feats1 = np.concatenate(bert_feats1)
bert_feats1 /= np.linalg.norm(bert_feats1, 2, axis=1, keepdims=True)
bert_feats2 = np.concatenate(bert_feats2)
bert_feats2 /= np.linalg.norm(bert_feats2, 2, axis=1, keepdims=True)
bert_feats3 = np.concatenate(bert_feats3)
bert_feats3 /= np.linalg.norm(bert_feats3, 2, axis=1, keepdims=True)

# First-stage neighbour search uses the features of text model 1 only.
res = faiss.StandardGpuResources()
index_bert = faiss.IndexFlatIP(params_bert['fc_dim'])
index_bert = faiss.index_cpu_to_gpu(res, 0, index_bert)
index_bert.add(bert_feats1)
similarities_bert, indexes_bert = index_bert.search(bert_feats1, k)

np.save('/tmp/bert_feats1', bert_feats1)
np.save('/tmp/bert_feats2', bert_feats2)
np.save('/tmp/bert_feats3', bert_feats3)

# Combined text descriptor: all three embeddings side by side, re-normalised.
# An earlier two-model concatenation was removed because it was overwritten here
# before anything read it.
bert_feats = np.concatenate([bert_feats1, bert_feats2, bert_feats3], axis=1)
bert_feats /= np.linalg.norm(bert_feats, 2, axis=1, keepdims=True)

# Read back by the query-expansion cells as '/tmp/bert_feats.npy'.
np.save('/tmp/bert_feats', bert_feats)

joblib.dump([similarities_bert, indexes_bert], '/tmp/lyk_bert_data.pkl')

### Bert QE

In [None]:
%%python
import gc
import numpy as np
import faiss

def query_expansion(feats, sims, topk_idx, alpha=0.5, k=2):
    """Replace every feature by a similarity-weighted sum of its top-k neighbours.

    Args:
        feats: 2-D array of features, one row per item.
        sims: 2-D array of similarities; column ``c`` belongs to ``topk_idx[:, c]``.
        topk_idx: 2-D integer array of neighbour row indices into ``feats``.
        alpha: exponent applied to the similarities to obtain the weights.
        k: number of leading neighbours (columns) to combine.

    Returns:
        Array shaped like ``feats`` with the un-normalised weighted sums.
    """
    # A negative similarity raised to a fractional power is NaN and would poison
    # the whole row, so such neighbours get weight 0 instead.
    top_sims = np.clip(sims[:, :k], 0, None)
    weights = np.expand_dims(top_sims ** alpha, axis=-1).astype(np.float32)
    return (feats[topk_idx[:, :k]] * weights).sum(axis=1)

# Combined text features written by the BERT similarity cell.
brt_feats = np.load('/tmp/bert_feats.npy')

# First pass: inner-product self-search for the 60 nearest neighbours on GPU 0.
res = faiss.StandardGpuResources()
index_brt = faiss.IndexFlatIP(brt_feats.shape[1])
index_brt = faiss.index_cpu_to_gpu(res, 0, index_brt)
index_brt.add(brt_feats)
brt_D, brt_I = index_brt.search(brt_feats, 60)

np.save('/tmp/brt_D', brt_D)
np.save('/tmp/brt_I', brt_I)

# Drop the first index before the second, wider one is built.
del index_brt
gc.collect()

# Query expansion: blend each feature with its top neighbours, re-normalise,
# append to the original feature and normalise the concatenation again.
brt_feats_qe = query_expansion(brt_feats, brt_D, brt_I)
brt_feats_qe /= np.linalg.norm(brt_feats_qe, 2, axis=1, keepdims=True)

brt_feats = np.hstack([brt_feats, brt_feats_qe])
brt_feats /= np.linalg.norm(brt_feats, axis=1).reshape((-1, 1))

# Second pass: the same self-search on the expanded features.
index = faiss.IndexFlatIP(brt_feats.shape[1])
res = faiss.StandardGpuResources()
index = faiss.index_cpu_to_gpu(res, 0, index)

index.add(brt_feats)
brt_D, brt_I = index.search(brt_feats, 60)

np.save('/tmp/brt_D_qe', brt_D)
np.save('/tmp/brt_I_qe', brt_I)
print('end')

# Image & BERT similarity

In [None]:
%%python
import gc
import numpy as np
import faiss

def query_expansion(feats, sims, topk_idx, alpha=0.5, k=2):
    """Replace every feature by a similarity-weighted sum of its top-k neighbours.

    Args:
        feats: 2-D array of features, one row per item.
        sims: 2-D array of similarities; column ``c`` belongs to ``topk_idx[:, c]``.
        topk_idx: 2-D integer array of neighbour row indices into ``feats``.
        alpha: exponent applied to the similarities to obtain the weights.
        k: number of leading neighbours (columns) to combine.

    Returns:
        Array shaped like ``feats`` with the un-normalised weighted sums.
    """
    # A negative similarity raised to a fractional power is NaN and would poison
    # the whole row, so such neighbours get weight 0 instead.
    top_sims = np.clip(sims[:, :k], 0, None)
    weights = np.expand_dims(top_sims ** alpha, axis=-1).astype(np.float32)
    return (feats[topk_idx[:, :k]] * weights).sum(axis=1)


# Combined text and combined image features written by earlier cells.
feats_bert = np.load('/tmp/bert_feats.npy')
feats_img = np.load('/tmp/img_feats.npy')

# Joint descriptor: text and image features side by side, re-normalised to unit length.
bth_feats = np.hstack([feats_bert, feats_img])
bth_feats /= np.linalg.norm(bth_feats, 2, axis=1, keepdims=True)

print(bth_feats.shape)

# First pass: inner-product self-search for the 60 nearest neighbours on GPU 0.
res = faiss.StandardGpuResources()
index = faiss.IndexFlatIP(bth_feats.shape[1])
index = faiss.index_cpu_to_gpu(res, 0, index)
index.add(bth_feats)

bth_D, bth_I = index.search(bth_feats, 60)
np.save('/tmp/bth_D', bth_D)
np.save('/tmp/bth_I', bth_I)

# Drop the first index before the second, wider one is built.
del index
gc.collect()

# Query expansion: blend each feature with its top neighbours, re-normalise,
# append to the original feature and normalise the concatenation again.
bth_feats_qe = query_expansion(bth_feats, bth_D, bth_I)
bth_feats_qe /= np.linalg.norm(bth_feats_qe, 2, axis=1, keepdims=True)

bth_feats = np.hstack([bth_feats, bth_feats_qe])
bth_feats /= np.linalg.norm(bth_feats, axis=1).reshape((-1, 1))

# Second pass: the same self-search on the expanded features.
index = faiss.IndexFlatIP(bth_feats.shape[1])
res = faiss.StandardGpuResources()
index = faiss.index_cpu_to_gpu(res, 0, index)

index.add(bth_feats)
bth_D, bth_I = index.search(bth_feats, 60)

np.save('/tmp/bth_D_qe', bth_D)
np.save('/tmp/bth_I_qe', bth_I)
print('end')

# lyakaap Side (GCN)

In [None]:
%%python
from lyk_config import k, conf_th, DEBUG, load_data
import sys
sys.path.append('../input/timm045/')
import timm

from itertools import zip_longest
import json
import math
import gc
import os
from pathlib import Path

import faiss
import numpy as np
import cupy as cp
import pandas as pd

import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import DataLoader, Dataset

from tqdm import tqdm
from PIL import Image
import joblib
import lightgbm as lgb
from scipy.sparse import hstack, vstack, csc_matrix, csr_matrix
import editdistance
import networkx as nx

import string
import nltk
from nltk.tokenize.treebank import TreebankWordTokenizer
from nltk.tokenize import TweetTokenizer
from sklearn.feature_extraction.text import TfidfVectorizer

# None of these three constants is referenced in this cell (the DataLoader below
# passes num_workers=2 literally).
NUM_CLASSES = 11014
NUM_WORKERS = 2
SEED = 0

class GraphDataset(Dataset):
    """Dataset of item pairs together with the features of their neighbourhoods.

    ``feats[i][j]`` is the feature vector of pair ``(i, j)`` and
    ``top_neighbors[i]`` lists the neighbours of item ``i``. Each sample is
    ``(feat, neighbor_feats)`` where ``neighbor_feats`` stacks the pair's own
    feature, then ``k`` rows for ``i``'s neighbours and ``k`` rows for ``j``'s
    neighbours (zero rows fill up short neighbour lists). A label-equality flag
    and ``weights[i]`` are appended when ``labels`` / ``weights`` are given.
    """

    def __init__(self, feats=None, labels=None, weights=None, pair_tuples=None, k=50, top_neighbors=None):
        self.feats = feats
        self.labels = labels
        self.weights = weights
        self.pair_tuples = pair_tuples
        self.k = k
        self.top_neighbors = top_neighbors

    def __len__(self):
        return len(self.pair_tuples)

    def _padded_neighbor_feats(self, node, width):
        """Features of ``node`` w.r.t. each of its neighbours, zero-padded to ``k`` rows."""
        zero_rows = [[0] * width] * (self.k - len(self.top_neighbors[node]))
        rows = [self.feats[node][neighbor] for neighbor in self.top_neighbors[node]]
        return torch.FloatTensor(rows + zero_rows)

    def __getitem__(self, index):
        i, j = self.pair_tuples[index]
        feat = torch.FloatTensor(self.feats[i][j])
        width = feat.shape[0]

        neighbor_feats = torch.cat([
            feat.unsqueeze(0),
            self._padded_neighbor_feats(i, width),
            self._padded_neighbor_feats(j, width),
        ], dim=0)

        outputs = (feat, neighbor_feats)
        if self.labels is not None:
            outputs = outputs + (self.labels[i] == self.labels[j],)
        if self.weights is not None:
            outputs = outputs + (self.weights[i],)
        return outputs


class GraphAttentionLayer(nn.Module):
    """Single attention head over a batch of node sets.

    Input ``h`` has shape (B, N, in_features); the output has shape
    (B, N, out_features). Attention scores are computed for every ordered node
    pair, soft-maxed over dimension 1, passed through dropout and used to mix the
    projected node features. With ``concat=True`` an ELU is applied to the result.
    """

    def __init__(self, in_features, out_features, dropout=0.6, alpha=0.2, concat=True):
        super().__init__()
        self.in_features, self.out_features = in_features, out_features
        self.dropout, self.alpha, self.concat = dropout, alpha, concat

        # Projection matrix, then the attention vector (kept in this order so the
        # parameters are created and initialised exactly as before).
        self.W = nn.Parameter(torch.empty(size=(in_features, out_features)))
        nn.init.xavier_uniform_(self.W.data, gain=1.414)
        self.a = nn.Parameter(torch.empty(size=(2 * out_features, 1)))
        nn.init.xavier_uniform_(self.a.data, gain=1.414)

        self.leakyrelu = nn.LeakyReLU(self.alpha)

    def __repr__(self):
        return '{} ({} -> {})'.format(self.__class__.__name__, self.in_features, self.out_features)

    def _prepare_attentional_mechanism_input(self, Wh):
        """Return a (B, N, N, 2*D) tensor whose [b, i, j] entry is ``cat(Wh[b, i], Wh[b, j])``."""
        B, N, D = Wh.shape
        first = Wh.unsqueeze(2).expand(B, N, N, D)   # Wh[b, i] repeated along j
        second = Wh.unsqueeze(1).expand(B, N, N, D)  # Wh[b, j] repeated along i
        return torch.cat([first, second], dim=3).view(-1, N, N, 2 * D)

    def forward(self, h):
        Wh = torch.matmul(h, self.W)  # (B, N, in_features) -> (B, N, out_features)
        pair_inputs = self._prepare_attentional_mechanism_input(Wh)
        scores = self.leakyrelu(torch.matmul(pair_inputs, self.a).squeeze(3))

        attention = F.softmax(scores, dim=1)
        attention = F.dropout(attention, self.dropout, training=self.training)
        h_prime = torch.bmm(attention, Wh)

        return F.elu(h_prime) if self.concat else h_prime


class GATPairClassifier(nn.Module):
    """Pair classifier combining raw pair features with a multi-head GAT summary.

    ``forward_gat`` runs ``nheads`` attention layers over the neighbourhood
    features, concatenates their outputs, applies one more attention layer and
    pools the nodes (``'first'`` keeps node 0, ``'mean'`` averages all nodes).
    ``forward`` concatenates that summary with the pair's own features and feeds
    the result to a small MLP.
    """

    def __init__(self, nfeat, nhid=8, nclass=1, dropout=0.6, alpha=0.2, nheads=8, pooling='first'):
        super().__init__()
        self.dropout = dropout
        self.pooling = pooling

        # Heads live in a plain list and are registered one by one under the
        # names 'attention_<idx>', which fixes their state-dict keys.
        self.attentions = []
        for head_idx in range(nheads):
            head = GraphAttentionLayer(nfeat, nhid, dropout=dropout, alpha=alpha, concat=True)
            self.add_module('attention_{}'.format(head_idx), head)
            self.attentions.append(head)

        self.out_att = GraphAttentionLayer(nhid * nheads, nhid, dropout=dropout, alpha=alpha, concat=False)

        self.classifier = nn.Sequential(
            nn.Linear(nfeat + nhid, nhid),
            nn.PReLU(),
            nn.BatchNorm1d(nhid),
            nn.Linear(nhid, nclass),
        )

    def forward_gat(self, x):
        """Return the pooled GAT representation of the node features ``x``."""
        x = F.dropout(x, self.dropout, training=self.training)
        head_outputs = [head(x) for head in self.attentions]
        x = torch.cat(head_outputs, dim=2)
        x = F.dropout(x, self.dropout, training=self.training)
        x = F.elu(self.out_att(x))

        if self.pooling == 'first':
            return x[:, 0]
        if self.pooling == 'mean':
            return x.mean(dim=1)
        # Any other pooling name yields None, as before.
        return None

    def forward(self, feats, neighbor_feats):
        """Return one logit per pair (dimension 1 of the classifier output is squeezed)."""
        gat_feats = self.forward_gat(neighbor_feats)
        combined = torch.cat([feats, gat_feats], dim=1)
        logits = self.classifier(combined)
        return logits.squeeze(1)


import time
from contextlib import contextmanager
from collections import defaultdict
# Accumulated wall-clock seconds per timer title.
map_used_time = defaultdict(float)


@contextmanager
def timer(title):
    """Time the enclosed block, add the duration to ``map_used_time[title]`` and print it.

    Nothing is recorded or printed when the block raises.
    """
    started_at = time.time()
    yield
    elapsed = time.time() - started_at
    map_used_time[title] += elapsed
    print("  {} - done in {:.5f}s".format(title, elapsed))


# Item table as selected by lyk_config.load_data(); img_dir is not used here.
df, img_dir = load_data()

# Marketing filler words that are dropped before TF-IDF weighting.
stop_words = set([
    'promo','diskon','baik','terbaik', 'murah',
    'termurah', 'harga', 'price', 'best', 'seller',
    'bestseller', 'ready', 'stock', 'stok', 'limited',
    'bagus', 'kualitas', 'berkualitas', 'hari', 'ini',
    'jadi', 'gratis',
])


# Lower-case every title and turn punctuation into spaces. The translation table
# is built once instead of once per title.
punctuation_to_space = str.maketrans({_: ' ' for _ in string.punctuation})
titles = [
    title.translate(punctuation_to_space)
    for title in df['title'].str.lower().values
]

tokenizer = TweetTokenizer()
# stop_words is handed over as a list: recent scikit-learn releases validate the
# argument and accept only 'english', a list or None.
# NOTE(review): token_pattern presumably has no effect because a custom tokenizer
# is supplied -- confirm against the scikit-learn documentation.
tfidf_vectorizer = TfidfVectorizer(stop_words=list(stop_words),
                                   binary=True,
                                   min_df=2,
                                   token_pattern='(?u)\\b\\w+\\b',
                                   tokenizer=tokenizer.tokenize,
                                   dtype=np.float32,
                                   norm='l2')
tfidf_feats = tfidf_vectorizer.fit_transform(titles)
# Rows are L2-normalised (norm='l2'), so this product holds the cosine similarity
# of every pair of titles.
simmat_tfidf = tfidf_feats @ tfidf_feats.T

with timer('load'):
    # Image metadata and the query-expanded neighbour lists written by earlier
    # cells, truncated to the first k neighbours per item.
    st_sizes, img_hs, img_ws = joblib.load('/tmp/lyk_img_meta_data.pkl')
    similarities_img = np.load('/tmp/img_D_qe.npy')[:, :k]
    indexes_img = np.load('/tmp/img_I_qe.npy')[:, :k]

    similarities_bert = np.load('/tmp/brt_D_qe.npy')[:, :k]
    indexes_bert = np.load('/tmp/brt_I_qe.npy')[:, :k]

    similarities_mm = np.load('/tmp/mut_D_qe.npy')[:, :k]
    indexes_mm = np.load('/tmp/mut_I_qe.npy')[:, :k]
    
    # Build {(query, neighbour): similarity} lookups. Note the naming: `col` holds
    # the query index (each repeated k times) and `row` the neighbour index.
    row = indexes_bert.ravel()
    col = np.arange(len(indexes_bert)).repeat(k)
    data = similarities_bert.ravel()
    simmat_bert = {(i, j): d for i, j, d in zip(col, row, data)}

    row = indexes_img.ravel()
    col = np.arange(len(indexes_img)).repeat(k)
    data = similarities_img.ravel()
    simmat_img = {(i, j): d for i, j, d in zip(col, row, data)}

    row = indexes_mm.ravel()
    col = np.arange(len(indexes_mm)).repeat(k)
    data = similarities_mm.ravel()
    simmat_mm = {(i, j): d for i, j, d in zip(col, row, data)}

# The flattened helper arrays are no longer needed once the dicts exist.
del row, col, data
gc.collect()

# Checkpoint of the pair classifier; 'params' holds its hyper-parameters.
ckpt = torch.load('../input/shopee-meta-models/v135.pth')
params = ckpt['params']

# top_neighbors[i]: candidates of item i ordered by summed similarity (best first).
# feats[i][j]: [image, tf-idf, bert, multi-modal] similarity of the pair (i, j).
top_neighbors = defaultdict(list)
feats = defaultdict(lambda: defaultdict())

pair_tuples = []
for i in tqdm(range(len(df))):
    # Candidates are the union of the image and the text neighbours of item i.
    right_indexes = set(indexes_img[i, :k].tolist() + indexes_bert[i, :k].tolist())
    # discard() instead of remove(): an item is not guaranteed to appear in its
    # own neighbour list, and remove() would raise KeyError in that case.
    right_indexes.discard(i)  # remove self

    right_indexes = list(right_indexes)
    scores = {}
    for j in right_indexes:
        sim_img = simmat_img.get((i, j), 0)
        sim_bert = simmat_bert.get((i, j), 0)
        if sim_img == 0 and sim_bert == 0:
            # Skipped pairs are left out of pair_tuples as well, so every listed
            # pair is guaranteed to have an entry in feats and scores.
            continue
        sim_mm = simmat_mm.get((i, j), 0)
        sim_tfidf = simmat_tfidf[i, j]

        pair_tuples.append((i, j))
        feats[i][j] = [
            sim_img,
            sim_tfidf,
            sim_bert,
            sim_mm,
        ]
        scores[j] = sim_img + sim_tfidf + sim_bert + sim_mm

    # Only scored candidates are ranked; dict order equals candidate order, so
    # ties are resolved as before.
    top_neighbors[i] = sorted(scores, key=scores.get, reverse=True)[:params['k']]

# No labels or weights are passed, so every sample is (feat, neighbor_feats).
dataset = GraphDataset(
    feats=feats,
    pair_tuples=pair_tuples,
    k=params['k'],
    top_neighbors=top_neighbors,
)
loader = DataLoader(dataset, batch_size=2 ** 12, shuffle=False, drop_last=False, num_workers=2, pin_memory=True)

# NOTE(review): `i` and `j` are the values left over from the last iteration of
# the pair-building loop; they are only used to read the feature length.
gat = GATPairClassifier(nfeat=len(feats[i][j]), nhid=params['nhid'],
                        dropout=params['dropout'], nheads=params['nheads'], pooling=params['pooling'])
gat.to('cuda').eval()
gat.load_state_dict(ckpt['model'])

del tfidf_feats
gc.collect()
###

# Sigmoid probability for every pair, in the order of pair_tuples (shuffle=False).
# The loop variable `feats` rebinds the module-level name; the dataset keeps its
# own reference to the original dict.
preds = []
for feats, neighbor_feats in tqdm(loader, desc='predict', leave=False):
    feats = feats.to('cuda', non_blocking=True)
    neighbor_feats = neighbor_feats.to('cuda', non_blocking=True)
    with torch.no_grad():
        pred = gat(feats, neighbor_feats).sigmoid().detach().cpu().numpy().tolist()
        preds.extend(pred)

# Threshold subtracted from the predictions below, so a stored 'pred' above 0
# corresponds to a probability above 0.3.
conf_th_gcn = 0.3
df_pair = pd.DataFrame()
# pair_tuples holds (i, j) pairs: `col` receives the i values, `row` the j values.
col, row = list(zip(*pair_tuples))
df_pair['i'] = col
df_pair['j'] = row

# Translate row positions into posting ids.
df_pair['posting_id'] = df['posting_id'].values[df_pair['i'].values]
df_pair['posting_id_target'] = df['posting_id'].values[df_pair['j'].values]

df_pair = df_pair[['posting_id', 'posting_id_target']]
df_pair['pred'] = preds
df_pair['pred'] -= conf_th_gcn

df_pair.to_pickle('submission_lyak_gcn.pkl')
# NOTE(review): bare expression; under `%%python` this presumably displays nothing -- confirm.
df_pair

# lyakaap Side (LGB)

In [None]:
%%python
from lyk_config import k, conf_th, DEBUG, load_data
import sys
sys.path.append('../input/timm045/')
import timm

from itertools import zip_longest
import json
import math
import gc
import os
from pathlib import Path

import faiss
import numpy as np
import cupy as cp
import pandas as pd

from tqdm import tqdm
from PIL import Image
import joblib
import lightgbm as lgb
from scipy.sparse import hstack, vstack, csc_matrix, csr_matrix
import editdistance
import networkx as nx

import string
import nltk
from nltk.tokenize.treebank import TreebankWordTokenizer
from nltk.tokenize import TweetTokenizer
from sklearn.feature_extraction.text import TfidfVectorizer

# Module-level constants. None of the three is referenced in the visible part of this cell.
# NOTE(review): they look like leftovers from the training script — confirm before relying on them.
NUM_CLASSES = 11014
NUM_WORKERS = 2
SEED = 0

###
import time
from contextlib import contextmanager
from collections import defaultdict
# Accumulated wall-clock seconds per timer title (summed when a title is used more than once).
map_used_time = defaultdict(float)


@contextmanager
def timer(title):
    """Time the enclosed ``with`` block and report the duration.

    Adds the elapsed wall-clock seconds to ``map_used_time[title]`` and prints one
    line with the duration. The bookkeeping runs in a ``finally`` clause, so a block
    that raises is still recorded; the exception then propagates unchanged.

    Args:
        title: Label used as the ``map_used_time`` key and in the printed line.
    """
    t0 = time.time()
    try:
        yield
    finally:
        tt = time.time() - t0
        map_used_time[title] += tt
        print("  {} - done in {:.5f}s".format(title, tt))


df, img_dir = load_data()

# Generic promotional words; passed as stop_words to the TfidfVectorizer below.
stop_words = set([
    'promo','diskon','baik','terbaik', 'murah',
    'termurah', 'harga', 'price', 'best', 'seller',
    'bestseller', 'ready', 'stock', 'stok', 'limited',
    'bagus', 'kualitas', 'berkualitas', 'hari', 'ini',
    'jadi', 'gratis',
])

# Translation table mapping every ASCII punctuation character to a space.
# Built once here instead of once per title inside the comprehension.
_punct_to_space = str.maketrans({ch: ' ' for ch in string.punctuation})

# Lower-cased titles with punctuation blanked out; used for TF-IDF and the edit distance.
titles = [
    title.translate(_punct_to_space)
    for title in df['title'].str.lower().values
]

tokenizer = TweetTokenizer()
# stop_words is handed over as a list: scikit-learn documents 'english', a list or None for
# this parameter, and its parameter validation in newer releases rejects a set.
# NOTE(review): per the scikit-learn docs token_pattern is not used once a custom tokenizer
# is supplied; it is kept here unchanged.
tfidf_vectorizer = TfidfVectorizer(stop_words=sorted(stop_words),
                                   binary=True,
                                   min_df=2,
                                   token_pattern='(?u)\\b\\w+\\b',
                                   tokenizer=tokenizer.tokenize,
                                   dtype=np.float32,
                                   norm='l2')
# Sparse matrix with one L2-normalised row per title.
tfidf_feats = tfidf_vectorizer.fit_transform(titles)

def _pair_lookup(indexes, similarities):
    """Flatten neighbour-id / score arrays into a ``{(query, neighbour): score}`` dict.

    ``indexes`` and ``similarities`` are flattened row by row; every row is expected
    to hold ``k`` entries (``k`` comes from lyk_config).
    """
    query_ids = np.arange(len(indexes)).repeat(k)
    neighbour_ids = indexes.ravel()
    scores = similarities.ravel()
    return {(q, n): s for q, n, s in zip(query_ids, neighbour_ids, scores)}


with timer('load'):
    # Artefacts written to /tmp by earlier cells of this notebook.
    similarities_bert, indexes_bert = joblib.load('/tmp/lyk_bert_data.pkl')
    similarities_img, indexes_img = joblib.load('/tmp/lyk_img_data.pkl')
    st_sizes, img_hs, img_ws = joblib.load('/tmp/lyk_img_meta_data.pkl')
    similarities_mm, indexes_mm = joblib.load('/tmp/lyk_mm_data.pkl')

    simmat_bert = _pair_lookup(indexes_bert, similarities_bert)
    simmat_img = _pair_lookup(indexes_img, similarities_img)
    simmat_mm = _pair_lookup(indexes_mm, similarities_mm)

gc.collect()

def _zscore(values):
    """Shift to zero mean and scale by the array's own ``std()``."""
    return (values - values.mean()) / values.std()


def _top_mean(similarities, n):
    """Row-wise mean over the first ``n`` columns."""
    return similarities[:, :n].mean(1)


# Raw top-5 means are kept unnormalised until the second-order statistic below has been taken.
_img_top5_raw = _top_mean(similarities_img, 5)
_bert_top5_raw = _top_mean(similarities_bert, 5)
_mm_top5_raw = _top_mean(similarities_mm, 5)

# For every item: mean of the raw top-5 means of its own first five neighbours, then standardised.
mean_mean_sim_img_top5 = _zscore(_img_top5_raw[indexes_img[:, :5]].mean(1))
mean_mean_sim_bert_top5 = _zscore(_bert_top5_raw[indexes_bert[:, :5]].mean(1))
mean_mean_sim_mm_top5 = _zscore(_mm_top5_raw[indexes_mm[:, :5]].mean(1))

# Standardised top-N means per modality.
mean_sim_img_top5 = _zscore(_img_top5_raw)
mean_sim_bert_top5 = _zscore(_bert_top5_raw)
mean_sim_mm_top5 = _zscore(_mm_top5_raw)

mean_sim_img_top15 = _zscore(_top_mean(similarities_img, 15))
mean_sim_bert_top15 = _zscore(_top_mean(similarities_bert, 15))
mean_sim_mm_top15 = _zscore(_top_mean(similarities_mm, 15))

mean_sim_img_top30 = _zscore(_top_mean(similarities_img, 30))
mean_sim_bert_top30 = _zscore(_top_mean(similarities_bert, 30))
mean_sim_mm_top30 = _zscore(_top_mean(similarities_mm, 30))

# Untouched titles are used for the length / word-count features; `titles` (lower-cased,
# punctuation replaced by spaces, built above) is used for the edit distance.
row_titles = df['title'].values
posting_ids = df['posting_id'].values

# Candidate-pair rows are flushed to feather files in this directory to bound memory use.
tmp_dir = Path('/tmp/rows')
tmp_dir.mkdir(exist_ok=True, parents=True)

rows = []
for i in tqdm(range(len(df))):
    # Candidates for item i: union of its image and BERT neighbour lists
    # (the multi-modal list only contributes a score, not candidates).
    right_indexes = set(indexes_img[i].tolist() + indexes_bert[i].tolist())

    for _, j in enumerate(right_indexes):
        # Skip the trivial self pair.
        if i == j:
            continue
        # A pair missing from a modality's neighbour list gets similarity 0 for that modality.
        sim_img = simmat_img.get((i, j), 0)
        sim_bert = simmat_bert.get((i, j), 0)
        sim_mm = simmat_mm.get((i, j), 0)
        if sim_img == 0 and sim_bert == 0:
            continue

        # One feature row per ordered pair (i, j); "*_target" columns describe item j.
        rows.append({
            'i': i,
            'j': j,
            'posting_id': posting_ids[i],
            'posting_id_target': posting_ids[j],
            'sim_img': sim_img,
            'sim_bert': sim_bert,
            'sim_mm': sim_mm,
            'edit_distance': editdistance.eval(titles[i], titles[j]),
            'title_len': len(row_titles[i]),
            'title_len_target': len(row_titles[j]),
            'title_num_words': len(row_titles[i].split()),
            'title_num_words_target': len(row_titles[j].split()),
            'mean_sim_img_top5': mean_sim_img_top5[i],
            'mean_sim_img_target_top5': mean_sim_img_top5[j],
            'mean_sim_bert_top5': mean_sim_bert_top5[i],
            'mean_sim_bert_target_top5': mean_sim_bert_top5[j],
            'mean_sim_img_top15': mean_sim_img_top15[i],
            'mean_sim_img_target_top15': mean_sim_img_top15[j],
            'mean_sim_bert_top15': mean_sim_bert_top15[i],
            'mean_sim_bert_target_top15': mean_sim_bert_top15[j],
            'mean_sim_img_top30': mean_sim_img_top30[i],
            'mean_sim_img_target_top30': mean_sim_img_top30[j],
            'mean_sim_bert_top30': mean_sim_bert_top30[i],
            'mean_sim_bert_target_top30': mean_sim_bert_top30[j],
            'st_size': st_sizes[i],
            'st_size_target': st_sizes[j],
            'wxh/st_size': img_ws[i] * img_hs[i] / st_sizes[i],
            'wxh/st_size_target': img_ws[j] * img_hs[j] / st_sizes[j],
            'mean_mean_sim_img_top5': mean_mean_sim_img_top5[i],
            'mean_mean_sim_img_target_top5': mean_mean_sim_img_top5[j],
            'mean_mean_sim_bert_top5': mean_mean_sim_bert_top5[i],
            'mean_mean_sim_bert_target_top5': mean_mean_sim_bert_top5[j],
            'mean_sim_mm_top5': mean_sim_mm_top5[i],
            'mean_sim_mm_target_top5': mean_sim_mm_top5[j],
            'mean_sim_mm_top15': mean_sim_mm_top15[i],
            'mean_sim_mm_target_top15': mean_sim_mm_top15[j],
            'mean_sim_mm_top30': mean_sim_mm_top30[i],
            'mean_sim_mm_target_top30': mean_sim_mm_top30[j],
            'mean_mean_sim_mm_top5': mean_mean_sim_mm_top5[i],
            'mean_mean_sim_mm_target_top5': mean_mean_sim_mm_top5[j],
        })

    # Flush every 10000 items and after the last item: downcast 64-bit columns to 32-bit,
    # write one feather file named after the current item index, then start a new buffer.
    if i % 10000 == 9999 or i == len(df) - 1:
        tmp_df = pd.DataFrame(rows)
        for col in tmp_df.columns:
            if tmp_df[col].dtype == 'float64':
                tmp_df[col] = tmp_df[col].astype('float32')
            elif tmp_df[col].dtype == 'int64':
                tmp_df[col] = tmp_df[col].astype('int32')
        tmp_df.to_feather(tmp_dir / f'{i}.feather')
        rows = []

# The raw image/title columns are no longer needed once the pair rows are on disk.
df.drop(columns=['image', 'title'], inplace=True)

# Release every intermediate array and lookup table before the pair frame is materialised.
del mean_sim_img_top5, mean_sim_img_top15, mean_sim_img_top30, mean_mean_sim_img_top5
del mean_sim_bert_top5, mean_sim_bert_top15, mean_sim_bert_top30, mean_mean_sim_bert_top5
del mean_sim_mm_top5, mean_sim_mm_top15, mean_sim_mm_top30, mean_mean_sim_mm_top5
del simmat_img, simmat_bert, simmat_mm
del similarities_img, indexes_img
del similarities_bert, indexes_bert
del similarities_mm, indexes_mm
gc.collect()

with timer('to_frame'):
    # Read back every chunk written under tmp_dir and stack them into one frame.
    chunk_frames = [pd.read_feather(path) for path in tmp_dir.glob('**/*.feather')]
    df_pair = pd.concat(chunk_frames, axis=0).reset_index(drop=True)
    del chunk_frames
del rows
gc.collect()

with timer('sim_tfidf'):
    # Row-wise dot product of the two titles' TF-IDF vectors. The vectorizer above uses
    # norm='l2', so this is their cosine similarity.
    pair_products = tfidf_feats[df_pair['i'].values].multiply(tfidf_feats[df_pair['j'].values])
    # A sparse .sum(axis=1) returns an (n, 1) numpy.matrix; flatten it to a plain 1-D array
    # so the column assignment does not depend on how pandas treats 2-D matrix input.
    df_pair['sim_tfidf'] = np.asarray(pair_products.sum(axis=1)).ravel()
    del pair_products
# Absolute differences between the two titles' character and word counts.
df_pair['title_len_diff'] = np.abs(df_pair['title_len'] - df_pair['title_len_target'])
df_pair['title_num_words_diff'] = np.abs(df_pair['title_num_words'] - df_pair['title_num_words_target'])

del tfidf_feats
gc.collect()
###

from cuml import ForestInference
import treelite
# Convert every pickled LightGBM booster into a cuML ForestInference model:
# each booster is saved as a LightGBM model file, re-read through treelite and loaded into FIL.
list_clf = []
# NOTE(review): joblib.load unpickles the file; only safe for trusted inputs.
# The loop variable `clf` is read again after the loop (clf.feature_name()), i.e. the
# feature order of the last booster in the file is the one used to build X.
for clf in joblib.load('../input/shopee/boosters_v34_v45_mm.pickle'):
    # The same temporary path is overwritten for each booster.
    clf.save_model('/tmp/tmp.lgb')
    fi = ForestInference()
    fi.load_from_treelite_model(treelite.Model.load('/tmp/tmp.lgb', model_format='lightgbm'))
    list_clf.append(fi)

# Columns the models may consume; selecting them here raises a KeyError early if any
# feature is missing from df_pair.
X = df_pair[[
    'sim_img', 'sim_tfidf', 'sim_bert', 'sim_mm', 'edit_distance',
    'title_len', 'title_len_target', 'title_len_diff',
    'title_num_words', 'title_num_words_target', 'title_num_words_diff',
    'mean_sim_img_top5', 'mean_sim_img_target_top5',
    'mean_sim_bert_top5', 'mean_sim_bert_target_top5',
    'mean_sim_mm_top5', 'mean_sim_mm_target_top5',
    'mean_sim_img_top15', 'mean_sim_img_target_top15',
    'mean_sim_bert_top15', 'mean_sim_bert_target_top15',
    'mean_sim_mm_top15', 'mean_sim_mm_target_top15',
    'mean_sim_img_top30', 'mean_sim_img_target_top30',
    'mean_sim_bert_top30', 'mean_sim_bert_target_top30',
    'mean_sim_mm_top30', 'mean_sim_mm_target_top30',
    'st_size', 'st_size_target',
    'wxh/st_size', 'wxh/st_size_target',
    'mean_mean_sim_img_top5', 'mean_mean_sim_img_target_top5',
    'mean_mean_sim_bert_top5', 'mean_mean_sim_bert_target_top5',
    'mean_mean_sim_mm_top5', 'mean_mean_sim_mm_target_top5',
]]

## passing as cupy array might be able to avoid multiple copies to GPU.
# `clf` is the last booster from the loading loop above; its feature order defines the column order.
X = cp.asarray(X[clf.feature_name()].values.astype(np.float32))
# .copy() makes df_pair an independent frame, so adding 'pred' below does not write into a
# column subset of the wide feature frame (which raises pandas' SettingWithCopyWarning).
df_pair = df_pair[['posting_id', 'posting_id_target']].copy()

gc.collect()
with timer('predict'):
    # Average the models' outputs and shift by conf_th (from lyk_config).
    df_pair['pred'] = np.mean([clf.predict(X).get() for clf in list_clf], axis=0) - conf_th

df_pair.to_pickle('submission_lyak.pkl')

# TKM side

In [None]:
%%bash
# Install imagesize (wheel) and PyStemmer (source tarball) from local files under ../input/shopee-libs.
pip install ../input/shopee-libs/imagesize-1.2.0-py2.py3-none-any.whl \
../input/shopee-libs/PyStemmer-2.0.1/dist/PyStemmer-2.0.1.tar

In [None]:
import sys
# Copy the RAPIDS archive from the input dataset and unpack it under /opt/conda/envs/
# (tar's verbose listing is discarded).
!cp ../input/rapids/rapids.0.18.0 /opt/conda/envs/rapids.tar.gz
!cd /opt/conda/envs/ && tar -xzvf rapids.tar.gz > /dev/null
# Each line prepends to sys.path, so the entry added last (".../rapids/lib") ends up first.
sys.path = ["/opt/conda/envs/rapids/lib/python3.7/site-packages"] + sys.path
sys.path = ["/opt/conda/envs/rapids/lib/python3.7"] + sys.path
sys.path = ["/opt/conda/envs/rapids/lib"] + sys.path 
# Copy the environment's libxgboost.so into /opt/conda/lib/.
!cp /opt/conda/envs/rapids/lib/libxgboost.so /opt/conda/lib/

In [None]:
%%python
import pandas as pd
import numpy as np
import sys
import ast
import os
import time
import cv2
import PIL.Image
import random
import joblib

from multiprocessing import Pool
from sklearn.metrics import accuracy_score

import langid
import Levenshtein

#import albumentations
#from albumentations import *
from tqdm import tqdm
import matplotlib.pyplot as plt
import gc
from sklearn.metrics import roc_auc_score
from warnings import filterwarnings

from contextlib import contextmanager
from collections import defaultdict
# Accumulated wall-clock seconds per timer title (summed when a title is used more than once).
map_used_time = defaultdict(float)


@contextmanager
def timer(title):
    """Time the enclosed ``with`` block and report the duration.

    Adds the elapsed wall-clock seconds to ``map_used_time[title]`` and prints one
    line with the duration. The bookkeeping runs in a ``finally`` clause, so a block
    that raises is still recorded; the exception then propagates unchanged.

    Args:
        title: Label used as the ``map_used_time`` key and in the printed line.
    """
    t0 = time.time()
    try:
        yield
    finally:
        tt = time.time() - t0
        map_used_time[title] += tt
        print("  {} - done in {:.5f}s".format(title, tt))


# Silence all Python warnings for the rest of this cell.
filterwarnings("ignore")


###

import imagesize
import Stemmer
# Snowball stemmer for Indonesian; applied below to titles classified as 'id' or 'ms'.
stemmer = Stemmer.Stemmer('indonesian')
# DEBUG is true when test.csv has exactly 3 rows; train data is then used as a stand-in.
DEBUG = len(pd.read_csv('../input/shopee-product-matching/test.csv')) == 3


# Image directory matching the data set selected by DEBUG.
if DEBUG:
    data_dir = '../input/shopee-product-matching/train_images/'
else:
    data_dir = '../input/shopee-product-matching/test_images/'
    
###

def _fold_to_ascii(title):
    """Resolve backslash escapes in ``title`` and blank out everything that is not ASCII.

    Non-ASCII characters are first turned into backslash escapes so the initial ASCII
    encoding cannot fail, then the escapes are decoded and every character that is still
    non-ASCII becomes a space. For pure-ASCII input the result matches the previous
    ``encode('ascii')`` based chain; literal '?' characters are also replaced by spaces,
    as before.
    """
    unescaped = title.encode('ascii', 'backslashreplace').decode('unicode-escape')
    return unescaped.encode('ascii', 'replace').decode('ascii').replace('?', ' ')


if DEBUG:
    # Debug run: the first 1000 rows of the pre-processed training CSV stand in for the test set.
    # NOTE(review): this branch does not compute 'lang' or re-stem titles, so the CSV is
    # presumably expected to provide them already — confirm.
    nrows = 1000
    df_test = pd.read_csv('../input/shopee-libs/train_newfold_stmmedid.csv', nrows=nrows)

    # Dense integer label per label_group (sorted order).
    label_groups = np.sort(df_test['label_group'].unique())
    map_label2id = {g: i for i, g in enumerate(label_groups)}
    df_test['label'] = df_test['label_group'].map(map_label2id)
    df_test['file_path'] = df_test.image.apply(lambda x: os.path.join(data_dir, f'{x}'))
else:
    df_test = pd.read_csv('../input/shopee-product-matching/test.csv')
    df_test['file_path'] = df_test.image.apply(lambda x: os.path.join(data_dir, f'{x}'))

    titles = df_test['title'].str.lower().values

    with timer('get lang'):
        # Language code per title as predicted by langid.
        df_test['lang'] = [langid.classify(t)[0] for t in tqdm(titles)]
        list_lang = df_test['lang'].values
    with timer('lemmatize'):
        titles = np.array([_fold_to_ascii(t) for t in titles])
        # Stem only titles detected as Indonesian ('id') or Malay ('ms'); others stay as they are.
        titles = [' '.join(stemmer.stemWords(t.split())) if list_lang[i] in {'id', 'ms'} else t for i, t in enumerate(tqdm(titles))]
        df_test['title'] = titles

with timer('get image size'):
    # Per-image file size, height and width, read from a pickle in /tmp.
    # NOTE(review): assumes the arrays are aligned row-for-row with df_test — confirm.
    st_sizes, img_hs, img_ws = joblib.load('/tmp/lyk_img_meta_data.pkl')
    df_test['width'] = img_ws
    df_test['hight'] = img_hs
    df_test['st_size'] = st_sizes
    # Pixel count divided by file size.
    df_test['wxh/st_size'] = df_test['width'] * df_test['hight'] / df_test['st_size']

# Persist the prepared frame.
df_test.to_pickle('/tmp/df_test_tkm.pkl')
###

# Number of neighbours kept per item: 60, or fewer when the data set itself is smaller.
K = min(60, df_test.shape[0])

###
print('Computing text embeddings...')
import cupy as cp
import pickle
import gc
from cuml.feature_extraction.text import TfidfVectorizer
import cudf

# cuML TF-IDF vectorizer with binary term counts, a document-frequency window
# (min_df=2, max_df=0.3) and at most 100000 features.
model = TfidfVectorizer(stop_words=None, 
                        binary=True, 
                        max_features=100000,
                        max_df=0.3,
                        min_df=2,
                        dtype=np.float32)

with timer('tfidf fit'):
    # Fit on the training titles plus the titles being scored; transform only the latter.
    titles = pd.read_csv('../input/shopee-libs/train_newfold_stmmedid.csv', 
                         usecols=['title'])['title'].values.tolist()
    test_titles = df_test.title.values.tolist()
    titles += test_titles
    model.fit(cudf.Series(titles))
    text_embeddings = model.transform(cudf.Series(test_titles))
    print('text embeddings shape',text_embeddings.shape)

with timer('tfidf pred'):
    CHUNK = 1024*4
    print('Finding similar titles...')
    # Top-K neighbour similarities and row indices per title.
    text_D = np.zeros((df_test.shape[0], K), dtype=np.float32)
    text_I = np.zeros((df_test.shape[0], K), dtype=np.int32)

    n_rows = text_embeddings.shape[0]
    # Number of chunks, rounded up.
    CTS = (n_rows + CHUNK - 1) // CHUNK
    for j in range(CTS):
        a = j*CHUNK
        b = min((j+1)*CHUNK, n_rows)
        print('chunk',a,'to',b, text_embeddings.shape[0])

        # Dot products of the chunk's rows against every row; result has one row per chunk item.
        cts = (text_embeddings * text_embeddings[a:b].T).T.toarray()
        # Ascending argsort reversed gives descending similarity; keep the first K columns.
        # The whole chunk is selected and copied to the host at once instead of doing two
        # device-to-host copies per row; rows a..b-1 are the ones the row counter used to address.
        indices = cp.ascontiguousarray(cp.argsort(cts, axis=1)[:, ::-1][:, :K])
        text_I[a:b] = indices.get()
        text_D[a:b] = cts[cp.arange(b - a)[:, None], indices].get()

del text_embeddings, indices, cts
gc.collect()
###

# Neighbour similarity (D) and index (I) arrays read from /tmp, one pair per modality.
# Each pair is loaded in D-then-I order.
img_D, img_I = (np.load(path) for path in ('/tmp/img_D_qe.npy', '/tmp/img_I_qe.npy'))

bert_D, bert_I = (np.load(path) for path in ('/tmp/brt_D_qe.npy', '/tmp/brt_I_qe.npy'))

bth_D, bth_I = (np.load(path) for path in ('/tmp/bth_D_qe.npy', '/tmp/bth_I_qe.npy'))

mut_D, mut_I = (np.load(path) for path in ('/tmp/mut_D_qe.npy', '/tmp/mut_I_qe.npy'))
###

# Feature name -> column index in the feature matrix; filled lazily while features are generated.
map_col2id = {}
###

import langid
import Levenshtein
# Plain arrays for fast positional access inside the feature loop.
titles = df_test['title'].values
# Set of distinct characters per title (set() over a string iterates characters).
titles_set = [set(t) for t in titles]
langs = df_test['lang'].values
st_size = df_test['st_size'].values
wh_st_size = df_test['wxh/st_size'].values
###

# Digit characters; intersected with the character sets to measure how many shared or
# differing characters are digits.
numset = set('0123456789')

###
def _zscore(values):
    """Shift to zero mean and scale by the array's own ``std()``."""
    return (values - values.mean()) / values.std()


def _count_above(sim):
    """Per row, the number of entries above 0.9, 0.8, 0.7, 0.6 and 0.5 (one column each)."""
    return np.vstack([(sim > t).sum(axis=1) for t in [0.9, 0.8, 0.7, 0.6, 0.5]]).T


def _top_mean_zscores(sim):
    """Standardised row means over the first 5, 10, 15 and 30 columns, in that order."""
    return [_zscore(sim[:, :n].mean(axis=1)) for n in (5, 10, 15, 30)]


### TF-IDF text neighbours
text_D = np.array(text_D)
txt_cnt_all = _count_above(text_D)
txt_avg_raw_all = text_D.mean(axis=1)
txt_avg_all = _zscore(txt_avg_raw_all)
txt_std_all = text_D.std(axis=1)
txt_avg_5_all, txt_avg_10_all, txt_avg_15_all, txt_avg_30_all = _top_mean_zscores(text_D)

### BERT neighbours
brt_cnt_all = _count_above(bert_D)
brt_avg_raw_all = bert_D.mean(axis=1)
brt_avg_all = _zscore(brt_avg_raw_all)
brt_std_all = bert_D.std(axis=1)
brt_avg_5_all, brt_avg_10_all, brt_avg_15_all, brt_avg_30_all = _top_mean_zscores(bert_D)

### "bth" neighbours
bth_cnt_all = _count_above(bth_D)
bth_avg_raw_all = bth_D.mean(axis=1)
bth_avg_all = _zscore(bth_avg_raw_all)
bth_std_all = bth_D.std(axis=1)
bth_avg_5_all, bth_avg_10_all, bth_avg_15_all, bth_avg_30_all = _top_mean_zscores(bth_D)

### "mut" neighbours
mut_cnt_all = _count_above(mut_D)
mut_avg_raw_all = mut_D.mean(axis=1)
mut_avg_all = _zscore(mut_avg_raw_all)
mut_std_all = mut_D.std(axis=1)
mut_avg_5_all, mut_avg_10_all, mut_avg_15_all, mut_avg_30_all = _top_mean_zscores(mut_D)

### image neighbours
img_cnt_all = _count_above(img_D)
img_avg_raw_all = img_D.mean(axis=1)
img_avg_all = _zscore(img_avg_raw_all)
img_std_all = img_D.std(axis=1)
img_avg_5_all, img_avg_10_all, img_avg_15_all, img_avg_30_all = _top_mean_zscores(img_D)

# (width, height) per item as a 2-column array.
width_hight = df_test[['width', 'hight']].values

# One empty list per item; not filled in the visible part of this cell.
list_pred_id = [[] for _ in range(df_test.shape[0])]

# NOTE(review): index labels are used as positions into the numpy arrays below, which
# assumes df_test has a default 0..n-1 index.
indices = df_test.index.values

# Next free row in all_feat.
ptr = 0
# Disk-backed feature matrix sized for the worst case of 60 * 5 candidate rows per item
# (five neighbour lists of at most 60 entries each) and 150 feature columns.
# NOTE(review): more than 150 distinct feature names would overflow the column axis.
all_feat = np.memmap('/tmp/tkm_feat.dat', dtype='float32', mode='w+', shape=(df_test.shape[0] * 60 * 5, 150), order='F')

# Reusable per-item scratch buffer with the same worst-case row count.
feat = np.zeros((60 * 5, 150), dtype='float32')

# Query row and candidate row for every feature row written to all_feat.
list_idx = []
list_idx2 = []
# Never appended to in the visible code; deleted after the loop.
list_feats = []
# Build one feature row per (query item i, candidate) pair. Candidates come from five
# neighbour lists (img, txt, bth, mut, brt); features for the same candidate are merged into
# one dict keyed by the candidate's row index. The lists are processed in a fixed order and
# later lists only add the shared text / image-size features if an earlier list has not
# already provided them.
for i in tqdm(indices):
    # ---- per-query values: neighbour scores / ids and row statistics of item i ----
    img_d = img_D[i]
    img_i = img_I[i]

    img_cnt = img_cnt_all[i]
    img_avg = img_avg_all[i]
    img_std = img_std_all[i]

    img_width ,img_hight = width_hight[i]

    ###
    txt_d = text_D[i]
    txt_i = text_I[i]

    txt_cnt = txt_cnt_all[i]
    txt_avg = txt_avg_all[i]
    txt_std = txt_std_all[i]

    # Character set of the query title (same value is rebuilt for each modality below).
    txt_set = set(titles[i])
    ###
    brt_d = bert_D[i]
    brt_i = bert_I[i]

    brt_cnt = brt_cnt_all[i]
    brt_avg = brt_avg_all[i]
    brt_std = brt_std_all[i]

    brt_set = set(titles[i])
    bth_d = bth_D[i]
    bth_i = bth_I[i]

    bth_cnt = bth_cnt_all[i]
    bth_avg = bth_avg_all[i]
    bth_std = bth_std_all[i]

    bth_set = set(titles[i])
    mut_d = mut_D[i]
    mut_i = mut_I[i]

    mut_cnt = mut_cnt_all[i]
    mut_avg = mut_avg_all[i]
    mut_std = mut_std_all[i]

    mut_set = set(titles[i])

    # candidate row index -> {feature name: value}
    map_feat = {}
    # ---- image neighbours: always create a fresh dict per candidate ----
    for j in range(K):
        _w, _h = width_hight[img_i[j]]
        _img_cnt = img_cnt_all[img_i[j]]
        _img_avg = img_avg_all[img_i[j]]
        _img_std = img_std_all[img_i[j]]

        diff_width = abs(img_width - _w)
        diff_hight = abs(img_hight - _h)
        # Names ending in "2" describe the candidate, the others the query item.
        d = {
            'img_sim': img_d[j],
            'img_avg': img_avg, 
            'img_std': img_std,
            'img_avg2': _img_avg, 
            'img_std2': _img_std,

            'img_avg_raw': img_avg_raw_all[i],
            'img_avg2_raw': img_avg_raw_all[img_i[j]],

            'diff_width': diff_width,
            'diff_hight': diff_hight,
            'img_width': img_width,
            'img_hight': img_hight,
            'img_width2': _w,
            'img_hight2': _h,

            'st_size': st_size[i],
            'st_size2': st_size[img_i[j]],
            'wh_st_size': wh_st_size[i],
            'wh_st_size2': wh_st_size[img_i[j]]
        }
        d.update({f'img_cnt_{ii}': img_cnt[ii] for ii in range(img_cnt.shape[0])})
        d.update({f'img_cnt2_{ii}': _img_cnt[ii] for ii in range(_img_cnt.shape[0])})
        map_feat[img_i[j]] = d
        
    # ---- TF-IDF text neighbours: merge into an existing candidate dict or create one ----
    for j in range(K):
        _txt_set = titles_set[txt_i[j]]
        _txt_cnt = txt_cnt_all[txt_i[j]]
        _txt_avg = txt_avg_all[txt_i[j]]
        _txt_std = txt_std_all[txt_i[j]]
        # Share of digit characters among the characters common to both titles ...
        diff_txt_set = set(titles[txt_i[j]]) & txt_set
        diff_txt_set = len(numset & diff_txt_set) / (len(diff_txt_set) + 1)
        # ... and among the characters present in exactly one of them.
        xor_txt_set = set(titles[txt_i[j]]) ^ txt_set
        xor_txt_set = len(numset & xor_txt_set) / (len(xor_txt_set) + 1)
        # Character-level Jaccard-style overlap (denominator +1).
        jac_txt = len(txt_set & _txt_set) / (len(txt_set | _txt_set) + 1)
        lev_dist = Levenshtein.distance(titles[i], titles[txt_i[j]])
        d = {
            'txt_sim': txt_d[j],
            'txt_avg': txt_avg, 
            'txt_std': txt_std,
            'txt_avg2': _txt_avg,
            'txt_std2': _txt_std,

            'txt_avg_raw': txt_avg_raw_all[i],
            'txt_avg2_raw': txt_avg_raw_all[txt_i[j]],

            'jac_txt': jac_txt,
            'diff_txt_set': diff_txt_set, 
            'xor_txt_set': xor_txt_set,
            'lev_dist': lev_dist,
            'len_txt': len(titles[i]), 
            'len_txt2': len(titles[txt_i[j]]),
            'lang_en': int(langs[i] == 'en'),
            'lang_en2': int(langs[txt_i[j]] == 'en'),
        }
        d.update({f'txt_cnt_{ii}': txt_cnt[ii] for ii in range(txt_cnt.shape[0])})
        d.update({f'txt_cnt2_{ii}': _txt_cnt[ii] for ii in range(_txt_cnt.shape[0])})
        if txt_i[j] in map_feat:
            map_feat[txt_i[j]].update(d)
        else:
            map_feat[txt_i[j]] = d
            
    # ---- "bth" neighbours: add bth features; fill text / image-size features only if missing ----
    for j in range(K):
        _bth_cnt = bth_cnt_all[bth_i[j]]
        _bth_avg = bth_avg_all[bth_i[j]]
        _bth_std = bth_std_all[bth_i[j]]
        if bth_i[j] in map_feat:
            d = map_feat[bth_i[j]]
        else:
            d = {}
        d.update({
            'bth_sim': bth_d[j],
            'bth_avg': bth_avg, 
            'bth_std': bth_std,
            'bth_avg2': _bth_avg,
            'bth_std2': _bth_std,

            'bth_avg_raw': bth_avg_raw_all[i],
            'bth_avg2_raw': bth_avg_raw_all[bth_i[j]],
        })
        d.update({f'bth_cnt_{ii}': bth_cnt[ii] for ii in range(bth_cnt.shape[0])})
        d.update({f'bth_cnt2_{ii}': _bth_cnt[ii] for ii in range(_bth_cnt.shape[0])})
        # Text features are absent when the candidate was not among the TF-IDF neighbours.
        if 'lev_dist' not in d:
            _bth_set = titles_set[bth_i[j]] #set(titles[bth_i[j]])
            diff_bth_set = set(titles[bth_i[j]]) & bth_set
            diff_bth_set = len(numset & diff_bth_set) / (len(diff_bth_set) + 1)
            xor_bth_set = set(titles[bth_i[j]]) ^ bth_set
            xor_bth_set = len(numset & xor_bth_set) / (len(xor_bth_set) + 1)
            jac_bth = len(bth_set & _bth_set) / (len(bth_set | _bth_set) + 1)
            lev_dist = Levenshtein.distance(titles[i], titles[bth_i[j]])
            d.update({
                'jac_txt': jac_bth,
                'diff_txt_set': diff_bth_set, 
                'xor_txt_set': xor_bth_set,
                'lev_dist': lev_dist,
                'len_txt': len(titles[i]), 
                'len_txt2': len(titles[bth_i[j]]),
                'lang_en': int(langs[i] == 'en'),
                'lang_en2': int(langs[bth_i[j]] == 'en'),
            })
        # Image-size features are absent when the candidate was not among the image neighbours.
        if 'img_width' not in d:    
            _w, _h = width_hight[bth_i[j]]
            diff_width = abs(img_width - _w)
            diff_hight = abs(img_hight - _h)
            d.update({
                'diff_width': diff_width,
                'diff_hight': diff_hight,
                 'img_width': img_width,
                 'img_hight': img_hight,
                 'img_width2': _w,
                 'img_hight2': _h,
                
                     'st_size': st_size[i],
                     'st_size2': st_size[bth_i[j]],
                     'wh_st_size': wh_st_size[i],
                     'wh_st_size2': wh_st_size[bth_i[j]]
                     })
        map_feat[bth_i[j]] = d
            
    # ---- "mut" neighbours: same scheme as "bth" ----
    for j in range(K):
        _mut_cnt = mut_cnt_all[mut_i[j]]
        _mut_avg = mut_avg_all[mut_i[j]]
        _mut_std = mut_std_all[mut_i[j]]
        if mut_i[j] in map_feat:
            d = map_feat[mut_i[j]]
        else:
            d = {}
        d.update({
            'mut_sim': mut_d[j],
            'mut_avg': mut_avg, 
            'mut_std': mut_std,
            'mut_avg2': _mut_avg,
            'mut_std2': _mut_std,
            'mut_avg_raw': mut_avg_raw_all[i],
            'mut_avg2_raw': mut_avg_raw_all[mut_i[j]],
        })
        d.update({f'mut_cnt_{ii}': mut_cnt[ii] for ii in range(mut_cnt.shape[0])})
        d.update({f'mut_cnt2_{ii}': _mut_cnt[ii] for ii in range(_mut_cnt.shape[0])})
        if 'lev_dist' not in d:
            _mut_set = titles_set[mut_i[j]]#set(titles[mut_i[j]])
            diff_mut_set = set(titles[mut_i[j]]) & mut_set
            diff_mut_set = len(numset & diff_mut_set) / (len(diff_mut_set) + 1)
            xor_mut_set = set(titles[mut_i[j]]) ^ mut_set
            xor_mut_set = len(numset & xor_mut_set) / (len(xor_mut_set) + 1)
            jac_mut = len(mut_set & _mut_set) / (len(mut_set | _mut_set) + 1)
            lev_dist = Levenshtein.distance(titles[i], titles[mut_i[j]])
            d.update({
                'jac_txt': jac_mut,
                'diff_txt_set': diff_mut_set, 
                'xor_txt_set': xor_mut_set,
                'lev_dist': lev_dist,
                'len_txt': len(titles[i]), 
                'len_txt2': len(titles[mut_i[j]]),
                'lang_en': int(langs[i] == 'en'),
                'lang_en2': int(langs[mut_i[j]] == 'en'),
            })
        if 'img_width' not in d:    
            _w, _h = width_hight[mut_i[j]]
            diff_width = abs(img_width - _w)
            diff_hight = abs(img_hight - _h)
            d.update({
                'diff_width': diff_width,
                'diff_hight': diff_hight,
                'img_width': img_width,
                'img_hight': img_hight,
                'img_width2': _w,
                'img_hight2': _h,
                'st_size': st_size[i],
                'st_size2': st_size[mut_i[j]],
                'wh_st_size': wh_st_size[i],
                'wh_st_size2': wh_st_size[mut_i[j]]
            })
        map_feat[mut_i[j]] = d

    # ---- BERT neighbours: text fallback only. Unlike "bth"/"mut" there is no image-size
    # fallback here, so those columns stay 0 for candidates found only through BERT.
    # NOTE(review): presumably this matches how the model was trained — do not change without checking.
    for j in range(K):
        _brt_cnt = brt_cnt_all[brt_i[j]]
        _brt_avg = brt_avg_all[brt_i[j]]
        _brt_std = brt_std_all[brt_i[j]]
        if brt_i[j] in map_feat:
            d = map_feat[brt_i[j]]
        else:
            d = {}
        d.update({
            'brt_sim': brt_d[j],
            'brt_avg': brt_avg, 
            'brt_std': brt_std,
            'brt_avg2': _brt_avg,
            'brt_std2': _brt_std,
            'brt_avg_raw': brt_avg_raw_all[i],
            'brt_avg2_raw': brt_avg_raw_all[brt_i[j]],
        })
        d.update({f'brt_cnt_{ii}': brt_cnt[ii] for ii in range(brt_cnt.shape[0])})
        d.update({f'brt_cnt2_{ii}': _brt_cnt[ii] for ii in range(_brt_cnt.shape[0])})
        if 'lev_dist' not in d:
            _brt_set = titles_set[brt_i[j]]
            diff_brt_set = set(titles[brt_i[j]]) & brt_set
            diff_brt_set = len(numset & diff_brt_set) / (len(diff_brt_set) + 1)
            xor_brt_set = set(titles[brt_i[j]]) ^ brt_set
            xor_brt_set = len(numset & xor_brt_set) / (len(xor_brt_set) + 1)
            jac_brt = len(brt_set & _brt_set) / (len(brt_set | _brt_set) + 1)
            lev_dist = Levenshtein.distance(titles[i], titles[brt_i[j]])
            d.update({
                'jac_txt': jac_brt,
                'diff_txt_set': diff_brt_set, 
                'xor_txt_set': xor_brt_set,
                'lev_dist': lev_dist,
                'len_txt': len(titles[i]), 
                'len_txt2': len(titles[brt_i[j]]),
                'lang_en': int(langs[i] == 'en'),
                'lang_en2': int(langs[brt_i[j]] == 'en'),
            })
        map_feat[brt_i[j]] = d

    # ---- write this item's candidate rows: clear the scratch buffer, assign column ids on
    # first sight of each feature name, then copy the used rows into the memmap ----
    feat[:] = 0 
    for ii, (k, map_val) in enumerate(map_feat.items()):
        list_idx.append(i)
        list_idx2.append(k)
        for c, v in map_val.items():
            if c not in map_col2id:
                map_col2id[c] = len(map_col2id)
            feat[ii, map_col2id[c]] = v

    all_feat[ptr:ptr + len(map_feat)] = feat[:len(map_feat)]
    ptr += len(map_feat)
    
# The neighbour arrays are no longer needed once all_feat is filled.
del img_D, img_I, text_D, text_I, bert_D, bert_I, bth_D, bth_I, mut_D, mut_I
gc.collect()

del list_feats
gc.collect()

# Per-modality similarity column for the ptr rows written so far; a value of 0 means the
# pair was not in that modality's neighbour list (the scratch buffer is zeroed per item).
map_weights = {sim: all_feat[:ptr, map_col2id[f'{sim}_sim']] for sim in ['img', 'bth', 'mut', 'txt', 'brt']}

del feat
gc.collect()

import networkx as nx


# Arrays allow the boolean-mask and fancy indexing used for the graph features below.
list_idx = np.array(list_idx)
list_idx2 = np.array(list_idx2)

from igraph import Graph
# For each modality: build a weighted graph over all items from the pairs with positive
# similarity, run PageRank, and record the score of both ends of every pair.
map_sim = {}
for sim in tqdm(['img', 'bth', 'mut', 'txt', 'brt'], desc='graph'):
    weights = map_weights[sim]
    # Pairs that have a positive similarity in this modality.
    idx = weights > 0
    with timer('add edges'):
        g = Graph()
        g.add_vertices(len(df_test))
        g.add_edges(list(zip(list_idx[idx], list_idx2[idx])), {'weight': weights[idx]})
    with timer('pagerank'):
        map_pr = np.array(g.pagerank(damping=0.85, weights='weight', niter=100, eps=1e-06, directed=False))
    with timer('pagerank get'):
        # Fancy indexing returns copies, so zeroing below does not touch map_pr.
        data1 = map_pr[list_idx]
        data2 = map_pr[list_idx2]
        # Pairs without an edge in this modality get 0 for both PageRank features.
        data1[weights <= 0] = 0
        data2[weights <= 0] = 0
        map_sim[f'{sim}_pagerank'] = data1
        map_sim[f'{sim}_pagerank2'] = data2
    del map_pr, g
    gc.collect()

# Append the PageRank features as new columns of the feature matrix.
for c, v in tqdm(map_sim.items()):
    map_col2id[c] = len(map_col2id)
    all_feat[:ptr, map_col2id[c]] = v

import treelite_runtime
from cuml import ForestInference
import treelite
import pickle
import lightgbm as lgb

# Relative ensemble weight per model directory; normalised right below so that
# the weights sum to 1.
all_weights = {
    '../input/shopee-metric-resnet50d512-0328-newfold/0508_qe_best_0.345/': 1,
}

total_weight = sum(all_weights.values())
all_weights = {model_dir: w / total_weight for model_dir, w in all_weights.items()}

list_clf = []
weights = []
thresholds = [] #[0.358, 0.361, 0.350, 0.336, 0.348, 0.346]

candidate_dirs = [
    '../input/shopee-metric-resnet50d512-0328-newfold/0508_qe_best_0.345/',
]
for path in candidate_dirs:
    # The threshold is parsed from the last `_`-separated token of the
    # directory name (e.g. `..._0.345` -> 0.345).
    name = os.path.dirname(path).rsplit('/', 1)[-1]
    th = float(name.rsplit('_', 1)[-1])

    # Directories without a (non-zero) weight do not take part in the ensemble.
    if all_weights.get(path, 0) == 0:
        continue

    fi = ForestInference()
    fi.load_from_treelite_model(
        treelite.Model.load(f'{path}/all_data_clf_norm.lgb', model_format='lightgbm')
    )

    list_clf.append(fi)
    thresholds.append(th)
    weights.append(all_weights[path])

print(weights)
print(thresholds)

# Feature names as stored in the LightGBM model file; `path` is whatever
# directory the loop above visited last.
col = lgb.Booster(model_file=f'{path}/all_data_clf_norm.lgb').feature_name()

# Overwrite each `<kind>_avg` / `<kind>_avg2` column with its `_raw` counterpart
# for all rows filled so far.
for sf in ['img', 'txt', 'mut', 'bth', 'brt']:
    column_pairs = [
        (f'{sf}_avg', f'{sf}_avg_raw'),
        (f'{sf}_avg2', f'{sf}_avg2_raw'),
    ]
    for dst_name, src_name in column_pairs:
        all_feat[:ptr, map_col2id[dst_name]] = all_feat[:ptr, map_col2id[src_name]]

# Score the candidate pairs in fixed-size chunks so that only one chunk of
# features has to be resident on the GPU at a time.
CHUNK = 1000000
preds = []
# Positions in all_feat of the model's features, in the order given by `col`.
col_idx = [map_col2id[c] for c in col]

for ch in tqdm(range(0, ptr, CHUNK), desc='pred chunk'):
    # Only rows [0, ptr) of all_feat hold candidate pairs, so the final chunk
    # must stop at ptr; slicing up to ch + CHUNK would also score whatever
    # rows lie beyond ptr, only for them to be thrown away afterwards.
    stop = min(ch + CHUNK, ptr)
    feat = cp.asarray(all_feat[ch:stop, col_idx]).astype('float32')
    # One row per classifier: its output shifted by that classifier's threshold
    # and scaled by its ensemble weight. Rows are summed into the final score.
    probs = np.vstack([(c.predict(feat).get() - thresholds[ii]) * weights[ii] for ii, c in enumerate(list_clf)])
    preds += probs.sum(axis=0).tolist()
    del feat
    gc.collect()

# One row per candidate pair: both endpoints (still as row positions in
# df_test) together with the ensemble score.
df_pred = pd.DataFrame({
    'posting_id': list_idx,
    'posting_id_target': list_idx2,
    'pred': preds[:ptr],
})

# Translate row positions into the actual posting_id values.
idx = df_test.posting_id.values
for id_column in ('posting_id', 'posting_id_target'):
    df_pred[id_column] = [idx[row] for row in df_pred[id_column].values]

df_pred.to_pickle('submission_tkm.pkl')

# Postprocess

In [None]:
import pandas as pd

# Pair-level predictions written by the three model pipelines.
df_lyk = pd.read_pickle('submission_lyak.pkl')
df_lyk_gcn = pd.read_pickle('submission_lyak_gcn.pkl')
df_tkm = pd.read_pickle('submission_tkm.pkl')

# Scale each model's score by its blending weight (1 : 3 : 2).
for frame, blend_weight in ((df_lyk, 1), (df_lyk_gcn, 3), (df_tkm, 2)):
    frame['pred'] *= blend_weight

In [None]:
# Blend the three models: sum the (already weighted) scores of every
# (posting_id, posting_id_target) pair and divide by 6, the total of the
# weights 1 + 3 + 2 applied to df_lyk, df_lyk_gcn and df_tkm.
df_pred = pd.concat([df_lyk, df_lyk_gcn, df_tkm], axis=0, ignore_index=True).groupby(['posting_id', 'posting_id_target'])[['pred']].sum() / 6

# A posting paired with itself gets a fixed score of 0.5.
src_ids = df_pred.index.get_level_values('posting_id')
dst_ids = df_pred.index.get_level_values('posting_id_target')
df_pred.loc[src_ids == dst_ids, 'pred'] = 0.5

# Keep positively scored pairs only ...
df_pred = df_pred.query('pred > 0')
# ... and of those only the mutual ones, i.e. (a, b) survives iff (b, a) is
# present as well. The membership test is done in one vectorised call on the
# swapped index instead of a Python-level lookup per row.
reversed_pairs = df_pred.index.swaplevel(0, 1)
df_pred = df_pred[reversed_pairs.isin(df_pred.index)].reset_index()

df_pred

In [None]:
import networkx as nx
from tqdm import tqdm
from cugraph.centrality.betweenness_centrality import edge_betweenness_centrality

# Undirected match graph: one edge per surviving pair, with the blended score
# stored in the edge attribute 'weight'.
G = nx.Graph()
G.add_weighted_edges_from(
    df_pred[['posting_id', 'posting_id_target', 'pred']].values,
    weight='weight',
)

# Edge lists shared with split_graph (list_add_edges is not filled in this cell).
list_remove_edges, list_add_edges = [], []
def split_graph(G):
    """Cut weak, high-betweenness edges of ``G`` and record every cut.

    While ``G`` is a single connected component, the edge with the highest
    edge-betweenness among the edges whose 'weight' is below a fixed limit is
    removed from ``G`` (in place) and appended to the module-level
    ``list_remove_edges``; this repeats until no such edge exceeds the
    betweenness limit. As soon as ``G`` consists of several components, every
    component with more than 6 nodes is processed recursively on an
    independent copy, so cuts found there are only visible through
    ``list_remove_edges``.

    The single-component case is handled with a loop rather than by
    re-entering the function after every removed edge, so the call depth
    grows with the nesting of components, not with the number of cuts.

    Parameters
    ----------
    G : networkx.Graph
        Graph whose edges carry a 'weight' attribute. May be modified.

    Returns
    -------
    None
    """
    # Only edges scoring below this value are candidates for removal.
    weak_edge_weight = 0.15780210284453428
    # A candidate is cut only if its normalised betweenness exceeds this value.
    min_cut_betweenness = 0.11766651703447985
    # Components with at most this many nodes are left untouched.
    max_unsplit_size = 6

    while True:
        list_comp = list(nx.connected_components(G))

        if len(list_comp) != 1:
            for comp in list_comp:
                if len(comp) > max_unsplit_size:
                    # nx.Graph(...) makes a mutable copy; a subgraph view is read-only.
                    split_graph(nx.Graph(G.subgraph(comp)))
            return

        map_bet = edge_betweenness_centrality(G, normalized=True)
        map_bet = {(i, j): w for (i, j), w in map_bet.items()
                   if G[i][j]['weight'] < weak_edge_weight}
        if len(map_bet) == 0:
            return

        edge, val = max(map_bet.items(), key=lambda x: x[1])
        # Written as `not val > ...` so that a NaN score stops the loop,
        # exactly like the plain `if val > ...` test would.
        if not val > min_cut_betweenness:
            return

        G.remove_edge(*edge)
        list_remove_edges.append(edge)
                
split_graph(G)
# Apply every recorded cut to the full graph. Cuts found inside component
# copies are still present in G, but when G itself is a single component
# split_graph has already removed those edges from G in place. Calling
# remove_edge per edge would then raise NetworkXError; remove_edges_from
# silently skips edges that are no longer there.
G.remove_edges_from(list_remove_edges)

def get_score(i, j):
    """Return the 'weight' of edge ``(i, j)`` in the global graph ``G``.

    Yields -1 when the lookup fails with a KeyError (a node or the edge is
    missing, or the edge has no 'weight' attribute).
    """
    try:
        edge_attrs = G[i][j]
        score = edge_attrs['weight']
    except KeyError:
        score = -1
    return score

# One submission row per posting that occurs as `posting_id` in df_pred.
posting_ids = df_pred['posting_id'].unique()
matches = []

for query_id in posting_ids:
    # A posting always matches itself; add its graph neighbours when it has any.
    group = [query_id]
    if query_id in G:
        group = list(set(group + list(G.neighbors(query_id))))
    # Oversized groups keep only the 51 entries with the highest edge score.
    if len(group) > 51:
        group.sort(key=lambda other: get_score(query_id, other), reverse=True)
        group = group[:51]
    matches.append(' '.join(group))

matched = pd.DataFrame({'posting_id': posting_ids, 'matches': matches})

matched.to_csv('submission.csv', index=False)
matched