# Install dependencies

In [None]:
%%bash
pip install ../input/shopee-libs/imagesize-1.2.0-py2.py3-none-any.whl \
../input/shopee-libs/PyStemmer-2.0.1/dist/PyStemmer-2.0.1.tar
pip install ../input/faiss-163/faiss_gpu-1.6.3-cp37-cp37m-manylinux2010_x86_64.whl

# Load libraries

In [None]:
import pandas as pd
import numpy as np
import sys
sys.path.append('../input/shopee-libs')
import ast
import os
import time
import cv2
import PIL.Image
import random
import joblib
import pickle
from multiprocessing import Pool
import matplotlib.pyplot as plt
import gc

from sklearn.metrics import roc_auc_score
import lightgbm as lgb


import cudf
from cuml.feature_extraction.text import TfidfVectorizer
import cupy as cp
from igraph import Graph
import numba


import faiss
import langid
import Levenshtein
from tqdm import tqdm


from warnings import filterwarnings
from contextlib import contextmanager
from collections import defaultdict, Counter
# Accumulated seconds per timer title, summed over every use of `timer(title)`.
map_used_time = defaultdict(float)


@contextmanager
def timer(title):
    """Context manager that times its body and reports the duration.

    The elapsed seconds are added to ``map_used_time[title]`` and printed.
    Both happen in a ``finally`` clause, so a body that raises is still
    accounted for before the exception propagates to the caller.

    Args:
        title: Label used as the key in ``map_used_time`` and in the printed line.
    """
    # perf_counter is monotonic, so the measurement cannot go negative or jump
    # when the system clock is adjusted.
    t0 = time.perf_counter()
    try:
        yield
    finally:
        tt = time.perf_counter() - t0
        map_used_time[title] += tt
        print("  {} - done in {:.5f}s".format(title, tt))


# Silence every Python warning for the rest of the notebook.
filterwarnings("ignore")

# Number of nearest neighbours retrieved per item by each similarity search below.
K = 60

# Due to the memory limit, model training cannot be done in a Kaggle kernel.
# Please use a local machine with the input dataset.
# While DEBUG is True the notebook runs a cut-down pass: feature creation keeps only the
# first 1000 items of each fold and LightGBM trains for 101 iterations instead of 10000.
DEBUG = True

# Load CSV

In [None]:
import imagesize
import Stemmer
stemmer = Stemmer.Stemmer('indonesian')

data_dir = '../input/shopee-product-matching/train_images/'

df_test = pd.read_csv('../input/shopee-feats/train_with_fold.csv')
df_test['file_path'] = [os.path.join(data_dir, str(name)) for name in df_test.image]

# Dense integer id per label group, numbered in sorted label_group order.
label_groups = np.sort(df_test['label_group'].unique())
map_label2id = dict(zip(label_groups, range(len(label_groups))))
df_test['label'] = df_test['label_group'].map(map_label2id)

# For every row, the list of row indices that share its label group.
members_by_group = df_test.groupby('label_group').apply(lambda grp: grp.index.values.tolist())
df_test['target'] = df_test['label_group'].map(members_by_group)

titles = df_test['title'].str.lower().values


def _to_plain_ascii(title):
    """Resolve backslash escapes in `title`, then blank out anything non-ASCII."""
    unescaped = title.encode('ascii').decode('unicode-escape')
    return unescaped.encode('ascii', 'replace').decode('ascii').replace('?', ' ')


with timer('get lang'):
    df_test['lang'] = [langid.classify(t)[0] for t in tqdm(titles)]
    list_lang = df_test['lang'].values

with timer('lemmatize'):
    titles = np.array([_to_plain_ascii(t) for t in titles])
    # Only titles detected as Indonesian ('id') or Malay ('ms') go through the stemmer.
    titles = [
        ' '.join(stemmer.stemWords(t.split())) if lang in {'id', 'ms'} else t
        for lang, t in zip(list_lang, tqdm(titles))
    ]
    df_test['title'] = titles

with timer('get image size'):
    ## Getting image size and file size. Please see lyakaap's code
    st_sizes, img_hs, img_ws = joblib.load('../input/shopee-cache/lyk_img_meta_data.pkl')
    n_items = df_test.shape[0]
    df_test['width'] = img_ws[:n_items]
    df_test['hight'] = img_hs[:n_items]
    df_test['st_size'] = st_sizes[:n_items]
    # Pixels per byte on disk.
    df_test['wxh/st_size'] = df_test['width'] * df_test['hight'] / df_test['st_size']

# Similarity: Text TFIDF

In [None]:



# Binary TF-IDF over whole-word tokens; terms that occur in a single title are dropped.
model = TfidfVectorizer(stop_words=None,
                        binary=True,
                        min_df=2,
                        dtype=np.float32)

with timer('tfidf fit'):
    langs = df_test.lang.values
    titles = df_test.title.values

    text_embeddings = model.fit_transform(cudf.Series(titles))
    print('text embeddings shape', text_embeddings.shape)


with timer('tfidf pred'):
    CHUNK = 1024*4
    print('Finding similar titles...')
    n_titles = text_embeddings.shape[0]
    text_D = np.zeros((df_test.shape[0], K), dtype=np.float32)
    text_I = np.zeros((df_test.shape[0], K), dtype=np.int32)

    # Process CHUNK query rows at a time so the dense (chunk x n_titles) score
    # matrix fits in GPU memory.
    for a in range(0, n_titles, CHUNK):
        b = min(a + CHUNK, n_titles)
        print('chunk', a, 'to', b, n_titles)

        # Sparse matrix product of all titles with this chunk, transposed so that
        # every row holds the scores of one query title against all titles.
        cts = (text_embeddings * text_embeddings[a:b].T).T.toarray()

        # Ascending argsort reversed -> best match first; keep the K best per row.
        top = cp.argsort(cts, axis=1)[:, ::-1][:, :K]

        # One device-to-host transfer per chunk instead of two per row.
        text_I[a:b] = top.get()
        text_D[a:b] = cp.take_along_axis(cts, top, axis=1).get()


# Similarity: Multi modal (NFNet-F0 and Indonesian BERT (concatenated at final feature layers))

In [None]:
list_preds1 = np.load('../input/shopee-feats/v79_mm_feats_2fold.npy').astype(np.float32)

mut_D, mut_I = [], []

for fold_id in tqdm(range(2)):
    preds = list_preds1[fold_id]
    # L2-normalise in place (this writes into list_preds1) so that the inner
    # product computed by the index is a cosine similarity.
    preds /= np.linalg.norm(preds, axis=1, keepdims=True)

    # Exact inner-product index, moved to GPU 0.
    gpu_res = faiss.StandardGpuResources()
    index = faiss.index_cpu_to_gpu(gpu_res, 0, faiss.IndexFlatIP(preds.shape[1]))
    index.add(preds)

    # Every item is queried against the same set it was indexed from.
    D, I = index.search(preds, K)
    mut_D.append(D)
    mut_I.append(I)

fold_mut_I = np.stack(mut_I)
fold_mut_D = np.stack(mut_D)

# Similarity: Bert (Indonesian-BERT, Multilingual-BERT, and Paraphrase-XLM embeddings)

In [None]:
list_preds_txt = [
    np.load('../input/shopee-feats/v75_bert_feats_2fold.npy').astype(np.float32),
    np.load('../input/shopee-feats/v102_bert_feats_2fold.npy').astype(np.float32),
    np.load('../input/shopee-feats/v103_bert_feats_2fold.npy').astype(np.float32),
]

brt_D, brt_I = [], []

for fold_id in tqdm(range(2)):
    # Concatenate this fold's embeddings from every text model along the feature axis,
    # then L2-normalise the combined vector.
    preds = np.hstack([feats[fold_id] for feats in list_preds_txt])
    preds /= np.linalg.norm(preds, axis=1, keepdims=True)

    # Exact inner-product index, moved to GPU 0.
    gpu_res = faiss.StandardGpuResources()
    index = faiss.index_cpu_to_gpu(gpu_res, 0, faiss.IndexFlatIP(preds.shape[1]))
    index.add(preds)

    D, I = index.search(preds, K)
    brt_D.append(D)
    brt_I.append(I)

fold_brt_I = np.stack(brt_I)
fold_brt_D = np.stack(brt_D)

# Similarity: Image (NFNet-F0 and ViT)

In [None]:
list_preds1 = np.load('../input/shopee-feats/v34_image_feats_2fold.npy').astype(np.float32)
list_preds2 = np.load('../input/shopee-feats/v45_image_feats_2fold.npy').astype(np.float32)

img_D, img_I = [], []

for fold_id in tqdm(range(2)):
    # Join both image models' embeddings along the feature axis, then L2-normalise.
    preds = np.concatenate([list_preds1[fold_id], list_preds2[fold_id]], axis=1)
    preds /= np.linalg.norm(preds, axis=1, keepdims=True)

    # Exact inner-product index, moved to GPU 0.
    gpu_res = faiss.StandardGpuResources()
    index = faiss.index_cpu_to_gpu(gpu_res, 0, faiss.IndexFlatIP(preds.shape[1]))
    index.add(preds)

    D, I = index.search(preds, K)
    img_D.append(D)
    img_I.append(I)

fold_img_I = np.stack(img_I)
fold_img_D = np.stack(img_D)

# Similarity: Image+BERT

In [None]:
list_preds_txt = [
    np.load('../input/shopee-feats/v75_bert_feats_2fold.npy').astype(np.float32),
    np.load('../input/shopee-feats/v102_bert_feats_2fold.npy').astype(np.float32),
    np.load('../input/shopee-feats/v103_bert_feats_2fold.npy').astype(np.float32),
]
list_preds_img = [
    np.load('../input/shopee-feats/v34_image_feats_2fold.npy').astype(np.float32),
    np.load('../input/shopee-feats/v45_image_feats_2fold.npy').astype(np.float32),
]

bth_D, bth_I = [], []

for fold_id in tqdm(range(2)):
    # Normalise the text part and the image part separately, so each contributes a
    # unit-length vector before they are joined.
    preds1 = np.hstack([feats[fold_id] for feats in list_preds_txt])
    preds1 /= np.linalg.norm(preds1, axis=1, keepdims=True)

    preds2 = np.hstack([feats[fold_id] for feats in list_preds_img])
    preds2 /= np.linalg.norm(preds2, axis=1, keepdims=True)

    # Join both parts and normalise the combined vector again.
    preds = np.concatenate([preds1, preds2], axis=1)
    preds /= np.linalg.norm(preds, axis=1, keepdims=True)

    # Exact inner-product index, moved to GPU 0.
    gpu_res = faiss.StandardGpuResources()
    index = faiss.index_cpu_to_gpu(gpu_res, 0, faiss.IndexFlatIP(preds.shape[1]))
    index.add(preds)

    D, I = index.search(preds, K)
    bth_D.append(D)
    bth_I.append(I)

fold_bth_I = np.stack(bth_I)
fold_bth_D = np.stack(bth_D)

# Create Features

In [None]:
# Per-item columns of df_test as plain arrays. Every `i` / `j` below is a row position
# in df_test, which is also what the neighbour-index arrays contain.
TARGETS = df_test['label_group'].values

width_hight = df_test[['width', 'hight']].values
st_size = df_test['st_size'].values
wh_st_size = df_test['wxh/st_size'].values

titles = df_test['title'].values
langs = df_test['lang'].values

numset = set('0123456789')

# Similarity cut-offs used for the "how many neighbours are closer than t" counts.
SIM_THRESHOLDS = [0.9, 0.8, 0.7, 0.6, 0.5]


def _neighbor_stats(dist):
    """Summarise the first K similarities of every row of `dist`.

    Returns:
        (cnt, avg, std) where cnt[i, t] is the number of row i's first K similarities
        above SIM_THRESHOLDS[t], and avg / std are their mean and standard deviation.
    """
    top = dist[:, :K]
    cnt = np.vstack([(top > t).sum(axis=1) for t in SIM_THRESHOLDS]).T
    return cnt, top.mean(axis=1), top.std(axis=1)


def _pair_text_feats(i, j):
    """Title features for the pair (i, j). The sets are sets of *characters*."""
    chars_i, chars_j = set(titles[i]), set(titles[j])
    shared = chars_i & chars_j
    differing = chars_i ^ chars_j
    return {
        'jac_txt': len(shared) / (len(chars_i | chars_j) + 1),
        # Share of digits among the characters both titles have / only one title has.
        'diff_txt_set': len(numset & shared) / (len(shared) + 1),
        'xor_txt_set': len(numset & differing) / (len(differing) + 1),
        'lev_dist': Levenshtein.distance(titles[i], titles[j]),
        'len_txt': len(titles[i]),
        'len_txt2': len(titles[j]),
        'lang_en': int(langs[i] == 'en'),
        'lang_en2': int(langs[j] == 'en'),
    }


def _pair_image_feats(i, j):
    """Image-size and file-size features for the pair (i, j)."""
    img_width, img_hight = width_hight[i]
    _w, _h = width_hight[j]
    return {
        'diff_width': abs(img_width - _w),
        'diff_hight': abs(img_hight - _h),
        'img_width': img_width,
        'img_hight': img_hight,
        'img_width2': _w,
        'img_hight2': _h,
        'st_size': st_size[i],
        'st_size2': st_size[j],
        'wh_st_size': wh_st_size[i],
        'wh_st_size2': wh_st_size[j],
    }


def _candidate_row(prefix, fold_id, i, j, sim, stats, use_text, use_image):
    """Build the feature dict that similarity source `prefix` contributes to pair (i, j).

    Keys are inserted in a fixed order (ids, similarity stats, text, image, counts)
    because the resulting DataFrame column order, and therefore the model's feature
    order, follows dict insertion order.
    """
    cnt_all, avg_all, std_all = stats
    row = {
        'idx': i, 'idx2': j,
        'fold': fold_id,
        'label': TARGETS[j] == TARGETS[i],
        f'{prefix}_sim': sim,
        f'{prefix}_avg': avg_all[i],
        f'{prefix}_std': std_all[i],
        f'{prefix}_avg2': avg_all[j],
        f'{prefix}_std2': std_all[j],
    }
    if use_text:
        row.update(_pair_text_feats(i, j))
    if use_image:
        row.update(_pair_image_feats(i, j))
    row.update({f'{prefix}_cnt_{t}': c for t, c in enumerate(cnt_all[i])})
    row.update({f'{prefix}_cnt2_{t}': c for t, c in enumerate(cnt_all[j])})
    return row


# TF-IDF neighbours are shared by both folds, so their statistics are computed once.
txt_stats = _neighbor_stats(text_D)
txt_cnt_all, txt_avg_all, txt_std_all = txt_stats

# Not used by the features below; kept so any other cell referencing them keeps working.
txt_avg_5_all = text_D[:, :5].mean(axis=1)
txt_avg_10_all = text_D[:, :10].mean(axis=1)
txt_avg_15_all = text_D[:, :15].mean(axis=1)
txt_avg_30_all = text_D[:, :30].mean(axis=1)

fold_frames = []
for fold_id in range(2):
    # (prefix, similarities, neighbour ids, add text feats?, add image feats?).
    # The order matters: it fixes the row order and the column order of the output.
    fold_sources = [
        ('img', fold_img_D[fold_id], fold_img_I[fold_id], False, True),
        ('brt', fold_brt_D[fold_id], fold_brt_I[fold_id], True, False),
        ('mut', fold_mut_D[fold_id], fold_mut_I[fold_id], True, True),
        ('bth', fold_bth_D[fold_id], fold_bth_I[fold_id], True, True),
    ]
    sources = [(prefix, dist, nbrs, _neighbor_stats(dist), use_text, use_image)
               for prefix, dist, nbrs, use_text, use_image in fold_sources]
    sources.append(('txt', text_D, text_I, txt_stats, True, False))

    indices = df_test[df_test['fold'] == fold_id].index.values
    if DEBUG:
        indices = indices[:1000]

    list_feat = []
    for i in tqdm(indices):
        # One dict per candidate j; sources that propose the same j are merged into
        # it, and features a source did not provide are filled with 0 further down.
        map_feat = {}
        for prefix, dist, nbrs, stats, use_text, use_image in sources:
            for sim, j in zip(dist[i][:K], nbrs[i][:K]):
                row = _candidate_row(prefix, fold_id, i, j, sim, stats, use_text, use_image)
                map_feat.setdefault(j, {}).update(row)
        list_feat += list(map_feat.values())

    df_feat = pd.DataFrame(list_feat).fillna(0)
    df_feat['fold_id'] = fold_id
    fold_frames.append(df_feat)

# DataFrame.append was removed in pandas 2.0; a single concat is its replacement.
# As before, each fold keeps its own 0..n-1 index, so index labels repeat across folds.
df_feat_all = pd.concat(fold_frames)

### Graph Features

In [None]:
# The endpoints of every candidate pair are the same for all similarity types.
list_idx = df_feat_all['idx'].values
list_idx2 = df_feat_all['idx2'].values

for sim in tqdm(['img', 'bth', 'mut', 'brt', 'txt'], desc='graph'):
    weights = df_feat_all[f'{sim}_sim'].values
    # Only pairs with a strictly positive similarity of this type become edges.
    has_edge = weights > 0

    g = Graph()
    g.add_vertices(len(df_test))
    g.add_edges(list(zip(list_idx[has_edge], list_idx2[has_edge])), {'weight': weights[has_edge]})

    with timer('pagerank'):
        map_pr = np.array(g.pagerank(damping=0.85, weights='weight', niter=100, eps=1e-06, directed=False))

    with timer('pagerank reg'):
        # PageRank of both endpoints, zeroed for rows whose similarity is not positive.
        no_edge = weights <= 0
        df_feat_all[f'{sim}_pagerank'] = np.where(no_edge, 0, map_pr[list_idx])
        df_feat_all[f'{sim}_pagerank2'] = np.where(no_edge, 0, map_pr[list_idx2])

# Sample weighting according to label group size

In [None]:
# Size of the label group of every df_test row, looked up for each pair's query item.
group_size_per_item = df_test['label_group'].map(df_test['label_group'].value_counts())
group_size = df_feat_all['idx'].map(group_size_per_item)

# 'weight' is the group size raised to a fixed exponent; 'group' is the raw group size.
df_feat_all['weight'] = group_size ** 0.44636585418558483
df_feat_all['group'] = group_size

# Train LGB model

### F1 Optimization for stopping criteria

https://www.kaggle.com/c/instacart-market-basket-analysis/discussion/37221

In [None]:
%load_ext Cython

In [None]:
%%cython

from cython.parallel cimport parallel, prange
from libc.stdlib cimport abort, malloc, free
cimport cython
import numpy as np
cimport numpy as np
from sklearn.metrics import f1_score


@cython.boundscheck(False)
@cython.wraparound(False)
def f1_opt(np.ndarray[double, ndim=1] preds):
    """Expected F1 for every possible number of predicted positives.

    Treats ``preds`` as per-item probabilities of being positive and returns an
    array ``expectations`` of length ``n + 1`` where ``expectations[k]`` is the
    expected F1 of predicting the first ``k`` items as positive.

    NOTE(review): the input is not sorted here; callers presumably pass
    probabilities in descending order, as ``f1_score`` does -- confirm.

    Args:
        preds: 1-D float64 array of probabilities.

    Returns:
        1-D float64 array of length ``len(preds) + 1``.
    """
    cdef int i, j, k, k1
    cdef double f1
    cdef long n = preds.shape[0]

    # DP_C[i, j]: probability that exactly i of the first j items are positive.
    # np.float was removed in NumPy 1.24; np.float64 is the same dtype.
    cdef np.ndarray[double, ndim = 2] DP_C = np.zeros((n + 2, n + 1), dtype=np.float64)

    DP_C[0, 0] = 1.0
    for j in range(1, n):
        DP_C[0, j] = (1.0 - preds[j - 1]) * DP_C[0, j - 1]
    for i in range(1, n + 1):
        DP_C[i, i] = DP_C[i - 1, i - 1] * preds[i - 1]
        for j in range(i + 1, n + 1):
            DP_C[i, j] = preds[j - 1] * DP_C[i - 1, j - 1] + (1.0 - preds[j - 1]) * DP_C[i, j - 1]

    # DP_S starts as 1 / i and is folded once per step of the loop below.
    cdef np.ndarray[double, ndim = 1] DP_S = np.zeros((2 * n + 1,))

    for i in range(1, 2 * n + 1):
        DP_S[i] = 1. / (1. * i)

    cdef np.ndarray[double, ndim= 1] expectations = np.zeros(n + 1)

    for k in range(n + 1)[::-1]:
        f1 = 0
        for k1 in range(n + 1):
            # Tuple indexing keeps the access on the typed-buffer fast path.
            f1 += 2 * k1 * DP_C[k1, k] * DP_S[k + k1]
        for i in range(1, 2 * k - 1):
            DP_S[i] = (1 - preds[k - 1]) * DP_S[i] + preds[k - 1] * DP_S[i + 1]

        expectations[k] = f1

    return expectations


@cython.boundscheck(False)
@cython.wraparound(False)
def f1_score(np.ndarray[long, ndim=1] label, np.ndarray[double, ndim=1] preds):
    """Actual F1 obtained when the cut-off is chosen by maximising expected F1.

    Items are sorted by descending ``preds``; the number of predicted positives
    ``k`` is the one with the highest expected F1 (same recurrences as
    ``f1_opt``), and the F1 of labelling the top ``k`` items positive is
    computed against ``label``.

    A second "None" expectation uses ``1 / (i + 1)`` weights and, when it wins,
    divides precision by one extra item. NOTE(review): this looks like the
    extra "None" item of the Instacart expected-F1 discussion linked in the
    notebook -- confirm it is intended here. ``pNone`` is fixed to 0.

    Args:
        label: 1-D C-long array of 0/1 ground-truth flags.
        preds: 1-D float64 array of predicted probabilities, same length.

    Returns:
        The F1 value as a float (0 when no true positive is selected).
    """
    cdef int i, j, k, k1
    cdef double f1, f1None, pNone
    cdef long n = preds.shape[0]

    pNone = 0.

    # Work on the items in descending order of predicted probability.
    cdef np.ndarray[long, ndim= 1] idx = np.argsort(preds)[::-1]
    label = label[idx]
    preds = preds[idx]

    # DP_C[i, j]: probability that exactly i of the first j items are positive.
    # np.float was removed in NumPy 1.24; np.float64 is the same dtype.
    cdef np.ndarray[double, ndim = 2] DP_C = np.zeros((n + 2, n + 1), dtype=np.float64)

    DP_C[0, 0] = 1.0
    for j in range(1, n):
        DP_C[0, j] = (1.0 - preds[j - 1]) * DP_C[0, j - 1]
    for i in range(1, n + 1):
        DP_C[i, i] = DP_C[i - 1, i - 1] * preds[i - 1]
        for j in range(i + 1, n + 1):
            DP_C[i, j] = preds[j - 1] * DP_C[i - 1, j - 1] + (1.0 - preds[j - 1]) * DP_C[i, j - 1]

    cdef np.ndarray[double, ndim = 1] DP_S = np.zeros((2 * n + 1,))
    cdef np.ndarray[double, ndim = 1] DP_SNone = np.zeros((2 * n + 1,))
    for i in range(1, 2 * n + 1):
        DP_S[i] = 1. / (1. * i)
        DP_SNone[i] = 1. / (1. * i + 1)

    cdef np.ndarray[double, ndim= 1] expectations = np.zeros(n + 1)
    cdef np.ndarray[double, ndim= 1] expectationsNone = np.zeros(n + 1)

    for k in range(n + 1)[::-1]:
        f1 = 0
        f1None = 0
        for k1 in range(n + 1):
            # Tuple indexing keeps the access on the typed-buffer fast path.
            f1 += 2 * k1 * DP_C[k1, k] * DP_S[k + k1]
            f1None += 2 * k1 * DP_C[k1, k] * DP_SNone[k + k1]
        for i in range(1, 2 * k - 1):
            DP_S[i] = (1 - preds[k - 1]) * DP_S[i] + preds[k - 1] * DP_S[i + 1]
            DP_SNone[i] = (1 - preds[k - 1]) * DP_SNone[i] + preds[k - 1] * DP_SNone[i + 1]
        expectations[k] = f1
        expectationsNone[k] = f1None + 2 * pNone / (2 + k)

    if expectations.max() > expectationsNone.max():
        # argsort(...)[n] is the position of the maximum, i.e. the best k;
        # i is the index of the last item predicted positive.
        i = np.argsort(expectations)[n] - 1
        tp = label[:i + 1].sum()
        if tp > 0:
            precision = tp / (i + 1)
            recall = tp / label.sum()
            f1 = (2 * precision * recall) / (precision + recall)
        else:
            f1 = 0
    else:
        i = np.argsort(expectationsNone)[n] - 1
        # With no positive label at all, one true positive is assumed.
        tp = label[:i + 1].sum() if label.sum() != 0 else 1
        if tp > 0:
            precision = tp / (i + 2)
            recall = tp / max(label.sum(), 1)
            f1 = (2 * precision * recall) / (precision + recall)
        else:
            f1 = 0

    return f1


from multiprocessing import Pool


@cython.boundscheck(False)
@cython.wraparound(False)
def f1_group(np.ndarray[long, ndim=1] label, np.ndarray[double, ndim=1] preds, np.ndarray[long, ndim=1] group):
    """Mean of ``f1_score`` over consecutive groups of rows.

    ``group[i]`` is the number of rows in the i-th group; groups are laid out
    back to back in ``label`` / ``preds``. Each group is scored by ``f1_score``
    in a worker process of a ``multiprocessing.Pool``.

    Args:
        label: 1-D C-long array of 0/1 ground-truth flags for all rows.
        preds: 1-D float64 array of predictions for all rows.
        group: 1-D C-long array of group sizes.

    Returns:
        The arithmetic mean of the per-group F1 values.
    """
    cdef int i, start, end
    cdef long m = group.shape[0]
    start = 0

    p = Pool()
    try:
        list_p = []
        for i in range(m):
            end = start + group[i]
            list_p.append(p.apply_async(f1_score, (label[start:end], preds[start:end],)))
            start = end
        scores = [a.get() for a in list_p]
    finally:
        # Always release the worker processes, including when a worker raised
        # and .get() re-raised its exception here.
        p.close()
        p.join()
    return np.mean(scores)


### Training

In [None]:
# LightGBM parameters for the pair classifier.
param = {
    'objective': 'binary',
    'verbosity': -1,
    'boosting_type': 'gbdt',
    'bagging_freq': 1,
    # DEBUG runs only long enough to reach the first real F1 evaluation.
    'num_iterations': 101 if DEBUG else 10000,
    'early_stopping_round': 201,
    'n_jobs': 32,
    'seed': 114,
    # Replaced by "None" in the training loop, where a custom F1 metric is used instead.
    'metric': 'auc',
    'learning_rate': 0.01,
    'lambda_l1': 4.8179519901479875,
    'lambda_l2': 7.805340795613767,
    'num_leaves': 43,
    'feature_fraction': 0.48605812738261606,
    'min_child_samples': 14,
    'bagging_fraction': 0.7692769160924116,
}

def f1_metric(pred, dmat):
    """Custom LightGBM eval function: grouped expected-F1, computed sparingly.

    The iteration counter and the last score are kept in ``/tmp/eval.npy``
    between calls (the training loop resets the file before each fit).

    * During the first 2500 iterations (100 when DEBUG) the score is a tiny,
      strictly increasing number, so early stopping cannot trigger.
    * After that, the real F1 is recomputed on every 50th iteration; in
      between, the previously stored score is returned unchanged.

    Args:
        pred: Predictions for the validation set.
        dmat: The validation ``lgb.Dataset``; its labels and group sizes are used.

    Returns:
        ``('f1', score, True)`` -- metric name, value, and higher-is-better flag.
    """
    itr, sc = np.load('/tmp/eval.npy')
    if itr < (2500 if not DEBUG else 100):
        sc = itr * 1.0e-5
    elif itr % 50 == 0:
        # np.int was removed in NumPy 1.24; builtin int is what it aliased.
        label = dmat.get_label().astype(int)
        # f1_group is declared with C-long buffers, so make sure the group sizes
        # have that dtype regardless of how LightGBM stored them.
        group = np.asarray(dmat.get_group(), dtype=int)
        sc = f1_group(label, pred, group)
    np.save('/tmp/eval', [itr + 1, sc])
    return 'f1', sc, True

list_clf = []

# Columns of df_feat_all that are identifiers, targets or weights rather than features.
NON_FEATURE_COLS = ['label', 'fold_id', 'fold', 'idx', 'idx2', 'weight', 'group']


def _make_lgb_dataset(frame):
    """Turn a slice of df_feat_all into an lgb.Dataset with labels, weights and groups."""
    # Sort by query item so that all candidates of one item are contiguous.
    frame = frame.sort_values(['idx', 'idx2']).reset_index(drop=True)
    features = frame.drop(NON_FEATURE_COLS, axis=1).fillna(0).astype(np.float32)
    # Number of candidate rows per query item, in sorted `idx` order.
    group_sizes = frame.groupby('idx').size().values
    return lgb.Dataset(features,
                       label=frame['label'].values,
                       weight=1. / frame['weight'].values,
                       group=group_sizes)


for fold_id in tqdm(range(2)):
    dtrain = _make_lgb_dataset(df_feat_all[df_feat_all['fold_id'] != fold_id])
    eval_data = _make_lgb_dataset(df_feat_all[df_feat_all['fold_id'] == fold_id])

    # Disable the built-in metric; f1_metric is the only stopping criterion.
    param['metric'] = "None"
    # Reset the [iteration, score] state that f1_metric keeps on disk.
    np.save('/tmp/eval', [0, 0])
    best = lgb.train(param,
                     dtrain,
                     valid_sets=eval_data,
                     early_stopping_rounds=201,
                     feval=f1_metric,
                     verbose_eval=50)
    list_clf.append(best)


# Persist the two cross-validation boosters.
with open('list_clf_tune_balanced.pkl', 'wb') as f:
    pickle.dump(list_clf, f, pickle.HIGHEST_PROTOCOL)

# Refit on every pair from both folds.
non_feature_cols = ['label', 'fold_id', 'fold', 'idx', 'idx2', 'weight', 'group']
x = df_feat_all.drop(non_feature_cols, axis=1).fillna(0).astype(np.float32)
y = df_feat_all['label'].values
w = 1. / df_feat_all['weight'].values
dtrain = lgb.Dataset(x, label=y, weight=w)

# No validation set here: train for 1.1x the mean best iteration of the CV boosters
# and drop the early-stopping setting.
best_param = dict(param)
best_param['num_iterations'] = int(np.mean([clf.best_iteration for clf in list_clf]) * 1.1)
del best_param['early_stopping_round']
best = lgb.train(best_param, dtrain)

with open('all_data_clf.pkl', 'wb') as f:
    pickle.dump(best, f, pickle.HIGHEST_PROTOCOL)

best.save_model('all_data_clf_norm.lgb')

# Predict 

In [None]:
# For every item: the candidate row positions and the predicted match probability of each.
n_items = df_test.shape[0]
list_pred_idx = [[] for _ in range(n_items)]
list_pred_val = [[] for _ in range(n_items)]

for fold_id in range(2):
    # Pairs whose query item is in this fold are scored by list_clf[fold_id], the
    # booster that used this fold only for validation.
    df = df_feat_all[df_feat_all['fold'] == fold_id].reset_index(drop=True)
    clf = list_clf[fold_id]
    prob = clf.predict(df[clf.feature_name()])

    for (j, j2), p in zip(df[['idx', 'idx2']].values, prob):
        list_pred_idx[j].append(j2)
        list_pred_val[j].append(p)

# Evaluate

In [None]:
# (threshold, fold_id) -> mean F1, filled in by the threshold sweep below.
map_result = {}


@numba.jit
def calc_f1(sc, total_cnt):
    """F1 of one item's predicted match list.

    Args:
        sc: Boolean array with one flag per predicted match, True where the match
            has the same label as the query item.
        total_cnt: Number of items that carry the query item's label.

    Returns:
        tp / (tp + 0.5 * (fp + fn)).
    """
    hits = sc.sum()
    false_pos = sc.shape[0] - hits
    false_neg = total_cnt - hits
    return hits / (hits + 0.5 * (false_pos + false_neg))


targets = df_test.label.values
map_cnt = Counter(targets)
map_id2label = dict(tuple(x) for x in df_test[['posting_id', 'label']].values)
posting_ids = df_test['posting_id'].values

# Loop-invariant work, done once instead of once per threshold.
cand_idx = [np.array(v) for v in list_pred_idx]
cand_prob = [np.array(v) for v in list_pred_val]
fold_indices = [df_test[df_test['fold'] == fold_id].index.values for fold_id in range(2)]

tk0 = tqdm(np.arange(0.3, 0.5, 0.02))
for avg_threshold in tk0:
    # Candidates whose predicted probability exceeds the threshold, per item.
    list_pred_id = [idx[prob > avg_threshold].tolist()
                    for idx, prob in zip(cand_idx, cand_prob)]

    # Same shape as a submission: posting_id plus the posting_ids of its matches.
    list_res = [dict(posting_id=posting_ids[i], matches=posting_ids[list_pred_id[i]])
                for i in range(df_test.shape[0])]

    for fold_id in range(2):
        list_sc = []
        for i in fold_indices[fold_id]:
            row = [map_id2label[j] for j in list_res[i]['matches']]
            # NumPy scalar == list broadcasts to one boolean per predicted match.
            sc = targets[i] == row
            list_sc.append(calc_f1(sc, map_cnt[targets[i]]))

        map_result[avg_threshold, fold_id] = np.mean(list_sc)

# Rows: threshold (level_0); columns: fold (level_1). Built once, after the sweep.
df_res = pd.pivot_table(pd.Series(map_result).to_frame(name='score').reset_index(),
                        index='level_0', columns='level_1', values='score')
# Mean F1 over both folds per threshold, in column 0.
tmp = df_res.mean(axis=1).to_frame()

# Best threshold of each fold on its own.
print(df_res.index.values[df_res.values.argmax(axis=0)])
tmp['rank'] = tmp[0].rank(ascending=False)
print(tmp[tmp['rank'] < 6])

In [None]:
# `tmp` is indexed by threshold and its column 0 holds the mean F1, so the best
# threshold is the index label of the maximum, not the value stored in the column.
# idxmax also stays well-defined when several thresholds tie for the best score.
print('Best threshold is', float(tmp[0].idxmax()))