## 导入包

In [None]:
!pip install ../input/shopeeexternalmodels/Keras_Applications-1.0.8-py3-none-any.whl
!pip install ../input/shopeeexternalmodels/efficientnet-1.1.0-py3-none-any.whl
import numpy as np
import pandas as pd
import gc
import matplotlib.pyplot as plt
import cudf
import cuml
import cupy
from cuml.feature_extraction.text import TfidfVectorizer
from cuml import PCA
from cuml.neighbors import NearestNeighbors
import tensorflow as tf
import efficientnet.tfkeras as efn
from tqdm.notebook import tqdm
import math
from shutil import copyfile
import tensorflow_hub as hub
import transformers
from collections import Counter

transformers.__version__

In [None]:
# For tf.dataset: let tf.data choose parallelism / prefetch depth automatically
AUTO = tf.data.experimental.AUTOTUNE

# Configuration
# Batch size used by the TensorFlow image dataset and by the BERT predict calls
BATCH_SIZE = 8
# Size passed to tf.image.resize and used as the image model's input shape
IMAGE_SIZE = [512, 512]
# Seed
SEED = 42
# Verbosity
VERBOSE = 1
# Number of classes
# NOTE(review): the ArcMargin heads built further down hard-code 11014 classes,
# not 11011 -- confirm which value is intended before relying on N_CLASSES.
N_CLASSES = 11011

In [None]:
# Restrict TensorFlow to LIMIT GB of GPU RAM (4 GB with the value below) so that
# the rest of the card is left for RAPIDS. The final print assumes a 16 GB GPU,
# i.e. 12 GB for RAPIDS with LIMIT = 4.0.
LIMIT = 4.0
gpus = tf.config.experimental.list_physical_devices('GPU')
if gpus:
    try:
        # memory_limit is passed as 1024*LIMIT, i.e. LIMIT expressed in MB
        tf.config.experimental.set_virtual_device_configuration(
            gpus[0],
            [tf.config.experimental.VirtualDeviceConfiguration(memory_limit=1024*LIMIT)])
        logical_gpus = tf.config.experimental.list_logical_devices('GPU')
        #print(len(gpus), "Physical GPUs,", len(logical_gpus), "Logical GPUs")
    except RuntimeError as e:
        # Best effort: the error is only reported and the notebook carries on.
        # Presumably raised when the GPU was already initialised -- TODO confirm.
        print(e)
print('We will restrict TensorFlow to max %iGB GPU RAM'%LIMIT)
print('then RAPIDS can use %iGB GPU RAM'%(16-LIMIT))

## Part1 TensorFlow-Hub Bert & EfficientNetB3

In [None]:
# Debug switch: when True, read_dataset() duplicates the training frame to
# check RAM allocations.
CHECK_SUB = False

# Decide whether to compute a local CV score. A test.csv with more than 3 rows
# is treated as a submission run: the test set is used and no CV is computed.
df = pd.read_csv('../input/shopee-product-matching/test.csv')
print(len(df))
GET_CV = len(df) <= 3
del df

# Function to get our f1 score
def f1_score(y_true, y_pred):
    """Row-wise F1 between two Series of whitespace-separated id strings.

    Each string is split into a set of ids; the score for a row is
    2 * |true & pred| / (|true| + |pred|).

    Returns:
        NumPy array with one F1 value per row.
    """
    true_sets = y_true.apply(lambda ids: set(ids.split()))
    pred_sets = y_pred.apply(lambda ids: set(ids.split()))
    overlap = np.array([len(t & p) for t, p in zip(true_sets, pred_sets)])
    total = pred_sets.map(len).values + true_sets.map(len).values
    return 2 * overlap / total



# Function to read out dataset
def read_dataset():
    """Load the train (GET_CV) or test CSV and build the image paths.

    In CV mode a 'matches' column is added holding, for every row, the
    space-joined posting_ids that share its label_group; with CHECK_SUB the
    frame is additionally stacked on itself.

    Returns:
        (df, df_cu, image_paths): the pandas frame (with a 'file_path' column),
        a cudf copy taken before 'file_path' was added, and the Series of
        image paths.
    """
    if GET_CV:
        print("GET_CV",GET_CV)
        df = pd.read_csv('../input/shopee-product-matching/train.csv')
        ids_by_group = df.groupby(['label_group'])['posting_id'].unique().to_dict()
        df['matches'] = df['label_group'].map(ids_by_group).apply(lambda ids: ' '.join(ids))
        if CHECK_SUB:
            df = pd.concat([df, df], axis = 0).reset_index(drop = True)
        image_dir = '../input/shopee-product-matching/train_images/'
    else:
        df = pd.read_csv('../input/shopee-product-matching/test.csv')
        image_dir = '../input/shopee-product-matching/test_images/'

    # The GPU copy is made first, so it does not carry the 'file_path' column.
    df_cu = cudf.DataFrame(df)
    image_paths = image_dir + df['image']
    df['file_path'] = image_paths
    return df, df_cu, image_paths

# Function to decode our images
def decode_image(image_data):
    """Decode JPEG bytes to 3 channels, resize to IMAGE_SIZE and scale by 1/255."""
    decoded = tf.image.decode_jpeg(image_data, channels = 3)
    resized = tf.image.resize(decoded, IMAGE_SIZE)
    return tf.cast(resized, tf.float32) / 255.0

# Function to read our test image and return image
def read_image(image):
    """Read the file at path `image` and return it through decode_image."""
    return decode_image(tf.io.read_file(image))

# Function to get our dataset that read images
def get_dataset(image):
    """Build a batched, prefetched tf.data pipeline that loads the given image paths."""
    return (
        tf.data.Dataset.from_tensor_slices(image)
        .map(read_image, num_parallel_calls = AUTO)
        .batch(BATCH_SIZE)
        .prefetch(AUTO)
    )

# Arcmarginproduct class keras layer
class ArcMarginProduct(tf.keras.layers.Layer):
    '''
    Implements large margin arc distance.

    The layer takes `[features, labels]` and returns scaled cosine logits in
    which the target class has the angular margin `m` applied.

    Args:
        n_classes: number of classes (columns of the weight matrix).
        s: scale applied to the final logits.
        m: angular margin in radians.
        easy_margin: if True, apply the margin only where cosine > 0.
        ls_eps: label-smoothing epsilon applied to the one-hot targets.

    Reference:
        https://arxiv.org/pdf/1801.07698.pdf
        https://github.com/lyakaap/Landmark2019-1st-and-3rd-Place-Solution/
            blob/master/src/modeling/metric_learning.py
    '''
    def __init__(self, n_classes, s=30, m=0.50, easy_margin=False,
                 ls_eps=0.0, **kwargs):

        super(ArcMarginProduct, self).__init__(**kwargs)

        self.n_classes = n_classes
        self.s = s
        self.m = m
        self.ls_eps = ls_eps
        self.easy_margin = easy_margin
        # Constants of cos(theta + m) = cos(theta)cos(m) - sin(theta)sin(m)
        self.cos_m = tf.math.cos(m)
        self.sin_m = tf.math.sin(m)
        # Threshold / fallback offset used when theta + m would exceed pi
        self.th = tf.math.cos(math.pi - m)
        self.mm = tf.math.sin(math.pi - m) * m

    def get_config(self):
        """Return the constructor arguments so the layer can be re-created."""
        config = super().get_config().copy()
        config.update({
            'n_classes': self.n_classes,
            's': self.s,
            'm': self.m,
            'ls_eps': self.ls_eps,
            'easy_margin': self.easy_margin,
        })
        return config

    def build(self, input_shape):
        """Create the (feature_dim, n_classes) class-centre matrix W."""
        # input_shape is a pair: [features shape, labels shape]
        super(ArcMarginProduct, self).build(input_shape[0])

        self.W = self.add_weight(
            name='W',
            shape=(int(input_shape[0][-1]), self.n_classes),
            initializer='glorot_uniform',
            dtype='float32',
            trainable=True,
            regularizer=None)

    def call(self, inputs):
        """Compute the margin-adjusted, scaled cosine logits for `[X, y]`."""
        X, y = inputs
        y = tf.cast(y, dtype=tf.int32)
        cosine = tf.matmul(
            tf.math.l2_normalize(X, axis=1),
            tf.math.l2_normalize(self.W, axis=0)
        )
        # Clamp before the sqrt: rounding can push cosine^2 slightly above 1,
        # which would otherwise yield NaN (the torch ArcMargin in this
        # notebook clamps the same way).
        sine = tf.math.sqrt(
            tf.clip_by_value(1.0 - tf.math.pow(cosine, 2), 0.0, 1.0)
        )
        phi = cosine * self.cos_m - sine * self.sin_m
        if self.easy_margin:
            phi = tf.where(cosine > 0, phi, cosine)
        else:
            phi = tf.where(cosine > self.th, phi, cosine - self.mm)
        one_hot = tf.cast(
            tf.one_hot(y, depth=self.n_classes),
            dtype=cosine.dtype
        )
        if self.ls_eps > 0:
            one_hot = (1 - self.ls_eps) * one_hot + self.ls_eps / self.n_classes

        # Margin logits for the target class, plain cosine everywhere else
        output = (one_hot * phi) + ((1.0 - one_hot) * cosine)
        output *= self.s
        return output



# Function to get the embeddings of our images with the fine-tuned model
def get_image_embeddings(image_paths,model_type='B0'):
    """Embed images with a fine-tuned EfficientNet + ArcMargin checkpoint.

    Args:
        image_paths: sliceable sequence of image file paths (sliced with
            `image_paths[a:b]` and fed to get_dataset).
        model_type: one of 'B0', 'B1', 'B2', 'B3', 'B4'; any other value
            selects the B7 backbone and checkpoint.

    Returns:
        NumPy array with one embedding row per image path, taken from
        `model.layers[-4]` of the full classification model.
        NOTE(review): presumably the pooled backbone features -- confirm
        against the layer ordering of the loaded model.
    """
    print("get_image_embeddings model type =========>",model_type)
    embeds = []

    # model_type -> (backbone constructor, fine-tuned weights file)
    backbones = {
        'B0': (efn.EfficientNetB0, '../input/shopee-efficientnetb0-arcmarginproduct/EfficientNetB0_512_42.h5'),
        'B1': (efn.EfficientNetB1, '../input/shopee-efficientnetb1-arcmarginproduct/EfficientNetB1_512_42.h5'),
        'B2': (efn.EfficientNetB2, '../input/shopee-efficientnetb2-arcmarginproduct/EfficientNetB2_512_42.h5'),
        'B3': (efn.EfficientNetB3, '../input/shopee-efficientnetb3-arcmarginproduct/EfficientNetB3_512_42.h5'),
        'B4': (efn.EfficientNetB4, '../input/shopee-efficientnetb4-arcmarginproduct/EfficientNetB4_512_42.h5'),
        'B7': (efn.EfficientNetB7, '../input/shopee-efficientnetb7-arcmarginproduct/EfficientNetB7_512_42.h5'),
    }
    # Unknown model types fall back to B7, exactly as the original else-branch did.
    build_backbone, model_path = backbones.get(model_type, backbones['B7'])

    margin = ArcMarginProduct(
            n_classes = 11014, 
            s = 30, 
            m = 0.7, 
            name='head/arc_margin', 
            dtype='float32'
            )

    inp = tf.keras.layers.Input(shape = (*IMAGE_SIZE, 3), name = 'inp1')
    label = tf.keras.layers.Input(shape = (), name = 'inp2')
    x = build_backbone(weights = None, include_top = False)(inp)
    x = tf.keras.layers.GlobalAveragePooling2D()(x)
    x = margin([x, label])

    output = tf.keras.layers.Softmax(dtype='float32')(x)
    # The full training graph is rebuilt only so the checkpoint can be loaded ...
    model = tf.keras.models.Model(inputs = [inp, label], outputs = [output])
    model.load_weights(model_path)

    # ... then it is cut down to image input -> embedding layer.
    model = tf.keras.models.Model(inputs = model.input[0], outputs = model.layers[-4].output)

    # Predict in chunks. The chunk count is derived from the argument itself
    # rather than from the global `df`, so the function also works for path
    # lists whose length differs from the global frame.
    chunk = 5000
    n_chunks = int(np.ceil(len(image_paths) / chunk))
    for j in tqdm(range(n_chunks)):
        a = j * chunk
        b = (j + 1) * chunk
        image_dataset = get_dataset(image_paths[a:b])
        image_embeddings = model.predict(image_dataset)
        embeds.append(image_embeddings)
    del model
    image_embeddings = np.concatenate(embeds)
    print(f'Our image embeddings shape is {image_embeddings.shape}')
    del embeds
    gc.collect()
    return image_embeddings


# Return tokens, masks and segments from a text array or series
def bert_encode(texts, tokenizer, max_len=512):
    """Turn texts into fixed-length BERT inputs.

    Every text is tokenized, truncated to `max_len - 2` tokens, wrapped in
    [CLS] / [SEP] and right-padded with id 0 up to `max_len`.

    Returns:
        Tuple of three NumPy arrays (token ids, attention masks, segment ids),
        each with one row per text; segment ids are all zero.
    """
    token_rows, mask_rows, segment_rows = [], [], []

    for raw_text in texts:
        pieces = ["[CLS]"] + tokenizer.tokenize(raw_text)[:max_len-2] + ["[SEP]"]
        n_real = len(pieces)
        n_pad = max_len - n_real

        ids = tokenizer.convert_tokens_to_ids(pieces)
        ids += [0] * n_pad

        token_rows.append(ids)
        mask_rows.append([1] * n_real + [0] * n_pad)
        segment_rows.append([0] * max_len)

    return np.array(token_rows), np.array(mask_rows), np.array(segment_rows)

# Function to get our text title embeddings using a pre-trained bert model
def get_text_embeddings(df, max_len = 70):
    """Embed `df['title']` with the fine-tuned TF-Hub BERT + ArcMargin model.

    The full classification graph is rebuilt so the checkpoint can be loaded,
    then reduced to the three BERT inputs -> `layers[-4]` output and run over
    the titles in chunks of 5000 rows.

    Returns:
        NumPy array with one embedding row per row of `df`.
    """
    module_url = "/kaggle/input/shopeeexternalmodels/bert_en_uncased_L-24_H-1024_A-16_1"
    bert_layer = hub.KerasLayer(module_url, trainable = True)
    # The tokenizer is configured from the assets shipped with the hub module.
    # NOTE(review): `tokenization` is not imported in the visible cells -- confirm
    # it is brought into scope elsewhere before this function is called.
    vocab_file = bert_layer.resolved_object.vocab_file.asset_path.numpy()
    do_lower_case = bert_layer.resolved_object.do_lower_case.numpy()
    tokenizer = tokenization.FullTokenizer(vocab_file, do_lower_case)
    word_ids, masks, segments = bert_encode(df['title'].values, tokenizer, max_len = max_len)

    margin = ArcMarginProduct(
            n_classes = 11014, 
            s = 30, 
            m = 0.5, 
            name='head/arc_margin', 
            dtype='float32'
            )

    input_word_ids = tf.keras.layers.Input(shape=(max_len,), dtype=tf.int32, name="input_word_ids")
    input_mask = tf.keras.layers.Input(shape=(max_len,), dtype=tf.int32, name="input_mask")
    segment_input = tf.keras.layers.Input(shape=(max_len,), dtype=tf.int32, name="segment_ids")
    label = tf.keras.layers.Input(shape = (), name = 'label')

    _, sequence_output = bert_layer([input_word_ids, input_mask, segment_input])
    cls_vector = sequence_output[:, 0, :]   # state of the first ([CLS]) position
    logits = margin([cls_vector, label])
    probabilities = tf.keras.layers.Softmax(dtype='float32')(logits)
    model = tf.keras.models.Model(
        inputs = [input_word_ids, input_mask, segment_input, label],
        outputs = [probabilities])

    model.load_weights('../input/bert-baseline/Bert_123.h5')
    model = tf.keras.models.Model(inputs = model.input[0:3], outputs = model.layers[-4].output)

    chunk = 5000
    embeds = []
    for j in tqdm(np.arange(np.ceil(len(df) / chunk))):
        start, stop = int(j * chunk), int((j + 1) * chunk)
        batch_inputs = (word_ids[start:stop], masks[start:stop], segments[start:stop])
        embeds.append(model.predict(batch_inputs, batch_size = BATCH_SIZE))
    del model
    text_embeddings = np.concatenate(embeds)
    print(f'Our text embeddings shape is {text_embeddings.shape}')
    del embeds
    gc.collect()
    return text_embeddings
    

## Part2 Huggingface Indonesian-Distilbert

In [None]:

# https://www.kaggle.com/moeinshariatnia/indonesian-distilbert-finetuning-with-arcmargin
import math
import torch
import torch.nn as nn
import torch.nn.functional as F
import transformers
from transformers import (BertTokenizer, BertModel,
                          DistilBertTokenizer, DistilBertModel)
class TextDataset(torch.utils.data.Dataset):
    """Dataset that tokenizes the 'title' column once and serves tensors per row.

    Args:
        dataframe: frame with a 'title' column (and 'label_code' unless
            mode == "test").
        tokenizer: callable invoked as
            `tokenizer(texts, padding=True, truncation=True, max_length=...)`
            whose result exposes `.items()` of per-row sequences.
        mode: "test" disables targets; any other value enables them.
        max_length: forwarded to the tokenizer.
    """

    def __init__(self, dataframe, tokenizer, mode="train", max_length=None):
        self.mode = mode
        self.dataframe = dataframe
        if mode != "test":
            self.targets = dataframe['label_code'].values
        titles = list(dataframe['title'].apply(str).values)
        self.encodings = tokenizer(titles,
                                   padding=True,
                                   truncation=True,
                                   max_length=max_length)

    def __len__(self):
        return len(self.dataframe)

    def __getitem__(self, idx):
        # One tensor per tokenizer output key, keyed exactly as the tokenizer
        # returned them so the dict can be handed to the model as-is.
        item = {}
        for key, rows in self.encodings.items():
            item[key] = torch.tensor(rows[idx])
        # Targets exist only outside test mode
        if self.mode != "test":
            item['labels'] = torch.tensor(self.targets[idx]).long()
        return item
    
    
    
class CFG:
    """Settings for the Indonesian DistilBERT text model."""

    # --- model ---
    DistilBERT = True # if set to False, BERT model will be used
    bert_hidden_size = 768   # default input width of the ArcMargin head in Model
    max_length = 60          # passed to the tokenizer as max_length
    dropout = 0.5

    # --- data loading / device ---
    batch_size = 16
    num_workers = 4
    device = torch.device("cuda" if torch.cuda.is_available() else 'cpu')

    # --- optimisation settings (not referenced by the visible inference code) ---
    epochs = 50
    learning_rate = 1e-5 #3e-5
    scheduler = "ReduceLROnPlateau"
    step = 'epoch'
    patience = 2
    factor = 0.8

    # --- output locations ---
    model_path = "outputs/"
    model_save_name = "model.pt"

class ArcMarginProduct_torch(nn.Module):
    r"""Implement of large margin arc distance: :
        Args:
            in_features: size of each input sample
            out_features: size of each output sample
            s: norm of input feature
            m: margin
            cos(theta + m)
            easy_margin: if True, apply the margin only where cosine > 0

        forward(input, label) returns the scaled logits: the target class of
        every row uses cos(theta + m), all other classes use cos(theta).
        """
    def __init__(self, in_features, out_features, s=30.0, m=0.50, easy_margin=False):
        super(ArcMarginProduct_torch, self).__init__()
        self.in_features = in_features
        self.out_features = out_features
        self.s = s
        self.m = m
        self.weight = nn.Parameter(torch.FloatTensor(out_features, in_features))
        nn.init.xavier_uniform_(self.weight)

        self.easy_margin = easy_margin
        self.cos_m = math.cos(m)
        self.sin_m = math.sin(m)
        # Threshold / fallback offset used when theta + m would exceed pi
        self.th = math.cos(math.pi - m)
        self.mm = math.sin(math.pi - m) * m

    def forward(self, input, label):
        # --------------------------- cos(theta) & phi(theta) ---------------------------
        cosine = F.linear(F.normalize(input), F.normalize(self.weight))
        sine = torch.sqrt((1.0 - torch.pow(cosine, 2)).clamp(0, 1))
        phi = cosine * self.cos_m - sine * self.sin_m
        if self.easy_margin:
            phi = torch.where(cosine > 0, phi, cosine)
        else:
            phi = torch.where(cosine > self.th, phi, cosine - self.mm)
        # --------------------------- convert label to one-hot ---------------------------
        # Allocate on the device the logits live on instead of a global
        # config device, so the head works wherever the module was moved.
        one_hot = torch.zeros(cosine.size(), device=cosine.device)
        one_hot.scatter_(1, label.view(-1, 1).long().to(cosine.device), 1)
        # Margin logits for the target class, plain cosine for every other class
        output = (one_hot * phi) + ((1.0 - one_hot) * cosine)
        output *= self.s

        return output
    
class Model(nn.Module):
    """BERT-style encoder followed by an ArcMargin classification head.

    Args:
        bert_model: encoder called with `input_ids` / `attention_mask` whose
            output exposes `last_hidden_state`.
        num_classes: number of ArcMargin classes.
        last_hidden_size: width of the encoder's hidden state.
    """

    def __init__(self,
                 bert_model,
                 num_classes=11014,
                 last_hidden_size=CFG.bert_hidden_size):
        super().__init__()
        self.bert_model = bert_model
        self.arc_margin = ArcMarginProduct_torch(
            last_hidden_size,
            num_classes,
            s=30.0,
            m=0.50,
            easy_margin=False,
        )

    def get_bert_features(self, batch):
        """Return the hidden state of the first (CLS) token for every sample."""
        encoded = self.bert_model(
            input_ids=batch['input_ids'],
            attention_mask=batch['attention_mask'],
        )
        # last_hidden_state is indexed as (batch, position, hidden); position 0 is CLS
        return encoded.last_hidden_state[:, 0, :]

    def forward(self, batch):
        """Return ArcMargin logits for a batch that also carries 'labels'."""
        return self.arc_margin(self.get_bert_features(batch), batch['labels'])

print("image bert")

In [None]:
def get_indoesian_text_embedding(df):
    """Embed `df['title']` with the fine-tuned Indonesian DistilBERT model.

    Loads the tokenizer and base encoder from the local dataset directory,
    restores the fine-tuned ArcMargin checkpoint and collects the CLS hidden
    state of every title.

    Returns:
        NumPy array with one embedding row per row of `df`.
    """
    # model_name='cahya/distilbert-base-indonesian'
    model_name='../input/indonesian-distilbert-arcmargin/distilbert-base-indonesian/'
    tokenizer = DistilBertTokenizer.from_pretrained(model_name,local_files_only=True)
    bert_model = DistilBertModel.from_pretrained(model_name,local_files_only=True)

    model = Model(bert_model).to(CFG.device)
    del bert_model
    # map_location keeps the load working when CFG.device fell back to CPU;
    # without it a checkpoint saved from GPU tensors cannot be restored there.
    state_dict = torch.load('../input/indonesian-arc/model11.pt', map_location=CFG.device)
    model.load_state_dict(state_dict)
    del state_dict

    model.eval()

    embeds=[]
    test_dataset = TextDataset(df, tokenizer, mode='test',max_length=CFG.max_length)
    test_loader = torch.utils.data.DataLoader(test_dataset,
                                               batch_size=CFG.batch_size, 
                                               num_workers=CFG.num_workers, 
                                               shuffle=False)

    tqdm_object = tqdm(test_loader, total=len(test_loader))
    with torch.no_grad():
        for batch in tqdm_object:
            batch = {k: v.to(CFG.device) for k, v in batch.items()}
            # Only the encoder is run; the ArcMargin head needs labels
            preds = model.get_bert_features(batch)
            # Convert explicitly so np.concatenate receives real NumPy arrays
            embeds.append(preds.cpu().numpy())

    del model
    text_embeddings = np.concatenate(embeds)
    print(f'Our text embeddings shape is {text_embeddings.shape}')
    del embeds
    gc.collect()
    return text_embeddings

print("indonesian")

## Part3 Densenet ArcFace

In [None]:
# Settings for the DenseNet ArcFace part.
# image_size is the side length used by the Resize step of transforms_valid below;
# the other values are not referenced in the visible part of the notebook.
image_size = 512
batch_size = 32
num_workers = 4
n_batch = 10 # to avoid oom, split 70000+ images into 10 batches
sim_thresh = 0.9


import pandas as pd
import numpy as np
import sys
sys.path.append('../input/timm-pytorch-image-models/pytorch-image-models-master')
import os
import sys
import time
import cv2
import PIL.Image
import random
from sklearn.metrics import accuracy_score
from tqdm.notebook import tqdm
import torch
from torch.utils.data import DataLoader, Dataset
import torch.nn as nn
import torch.nn.functional as F
import torchvision
import torchvision.transforms as transforms
import torch.optim as optim
from torch.optim.lr_scheduler import CosineAnnealingLR
import albumentations
from tqdm.notebook import tqdm
import matplotlib.pyplot as plt
import gc
from sklearn.metrics import roc_auc_score
%matplotlib inline
import seaborn as sns
from pylab import rcParams
import timm
from warnings import filterwarnings
from sklearn.preprocessing import LabelEncoder
import math
import glob
filterwarnings("ignore")

# Device used by generate_desent_features; always CUDA, there is no CPU fallback here.
device = torch.device('cuda') 


def seed_everything(seed):
    """Seed Python, NumPy and PyTorch RNGs and make cuDNN deterministic."""
    os.environ['PYTHONHASHSEED'] = str(seed)

    # Every random number source used in this notebook gets the same seed
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)

    # Deterministic cuDNN kernels, no auto-tuner
    torch.backends.cudnn.benchmark = False
    torch.backends.cudnn.deterministic = True

    print(f'Setting all seeds to be {seed} to reproduce...')
seed_everything(42)


# Inference-time preprocessing: resize to image_size x image_size, then apply
# albumentations.Normalize with its default arguments.
transforms_valid = albumentations.Compose([
    albumentations.Resize(image_size, image_size),
    albumentations.Normalize()
])


class SHOPEEDataset(Dataset):
    """Image dataset that reads files listed in the 'file_path' column of a frame.

    Args:
        df: frame with a 'file_path' column; its index is reset to 0..n-1.
        mode: stored on the instance; both 'test' and any other value yield
            only the image tensor.
        transform: optional callable invoked as `transform(image=img)` whose
            result is indexed with 'image'.
    """

    def __init__(self, df, mode, transform=None):
        self.transform = transform
        self.mode = mode
        self.df = df.reset_index(drop=True)

    def __len__(self):
        return len(self.df)

    def __getitem__(self, index):
        record = self.df.loc[index]
        # OpenCV loads BGR; convert to RGB before any transform
        rgb = cv2.cvtColor(cv2.imread(record.file_path), cv2.COLOR_BGR2RGB)

        if self.transform is not None:
            rgb = self.transform(image=rgb)['image']

        # HWC -> CHW float32
        chw = rgb.astype(np.float32).transpose(2,0,1)

        # No label is returned in either mode
        return torch.tensor(chw).float()
        
        
        
        
class DesnetArcModule(nn.Module):
    """ArcFace margin head used by SHOPEEDenseNet.

    Args:
        in_features: width of the incoming embedding.
        out_features: number of classes.
        s: scale applied to the final logits.
        m: angular margin in radians.

    forward(inputs, labels) returns `s`-scaled logits where the target class
    of every row uses cos(theta + m) and all other classes use cos(theta).
    `inputs` are used as given (only the weights are normalised here).
    """
    def __init__(self, in_features, out_features, s=10, m=0.5):
        super().__init__()
        self.in_features = in_features
        self.out_features = out_features
        self.s = s
        self.m = m
        self.weight = nn.Parameter(torch.FloatTensor(out_features, in_features))
        nn.init.xavier_normal_(self.weight)

        self.cos_m = math.cos(m)
        self.sin_m = math.sin(m)
        # Threshold / fallback offset used when theta + m would exceed pi
        self.th = torch.tensor(math.cos(math.pi - m))
        self.mm = torch.tensor(math.sin(math.pi - m) * m)

    def forward(self, inputs, labels):
        cos_th = F.linear(inputs, F.normalize(self.weight))
        cos_th = cos_th.clamp(-1, 1)
        sin_th = torch.sqrt(1.0 - torch.pow(cos_th, 2))
        cos_th_m = cos_th * self.cos_m - sin_th * self.sin_m
        # Where theta + m would pass pi, fall back to cos(theta) - mm.
        # (The original repeated this selection with a masked assignment that
        # produced the same values; the single torch.where is sufficient.)
        cos_th_m = torch.where(cos_th > self.th, cos_th_m, cos_th - self.mm)

        if labels.dim() == 1:
            labels = labels.unsqueeze(-1)
        # Follow the device of the logits instead of hard-coding .cuda(), so
        # the head also runs on CPU or on a non-default GPU.
        onehot = torch.zeros(cos_th.size(), device=cos_th.device)
        labels = labels.long().to(cos_th.device)
        onehot.scatter_(1, labels, 1.0)
        outputs = onehot * cos_th_m + (1.0 - onehot) * cos_th
        outputs = outputs * self.s
        return outputs
class SHOPEEDenseNet(nn.Module):
    """timm backbone + BN/dropout/FC embedding head with an ArcFace margin.

    Args:
        channel_size: width of the output embedding.
        out_feature: number of classes of the margin head.
        dropout: probability for the Dropout2d layer.
        backbone: timm model name.
        pretrained: forwarded to timm.create_model.
    """

    def __init__(self, channel_size, out_feature, dropout=0.5, backbone='densenet121', pretrained=False):
        super(SHOPEEDenseNet, self).__init__()
        self.backbone = timm.create_model(backbone, pretrained=pretrained)
        self.channel_size = channel_size
        self.out_feature = out_feature
        self.in_features = self.backbone.classifier.in_features
        self.margin = DesnetArcModule(in_features=self.channel_size, out_features = self.out_feature)
        self.bn1 = nn.BatchNorm2d(self.in_features)
        self.dropout = nn.Dropout2d(dropout, inplace=True)
        # The FC layer expects the backbone feature map flattened from 16 x 16 positions
        self.fc1 = nn.Linear(self.in_features * 16 * 16 , self.channel_size)
        self.bn2 = nn.BatchNorm1d(self.channel_size)

    def forward(self, x, labels=None):
        """Return L2-normalised embeddings, or margin logits when labels are given."""
        feature_map = self.dropout(self.bn1(self.backbone.features(x)))
        flattened = feature_map.view(feature_map.size(0), -1)
        embedding = F.normalize(self.bn2(self.fc1(flattened)))
        if labels is None:
            return embedding
        return self.margin(embedding, labels)
    

    
    
def generate_desent_features(df_loader):
    """Run the DenseNet ArcFace checkpoint over a loader of image batches.

    Args:
        df_loader: iterable yielding image tensors (one batch per item).

    Returns:
        NumPy array with the embeddings of all batches concatenated in order.
    """
    model = SHOPEEDenseNet(512, 11014)
    model.load_state_dict(
        torch.load('../input/shopeeexternalmodels/baseline_fold0_densenet_512_epoch40.pth', map_location='cuda:0'))
    model.to(device)
    model.eval()

    collected = []
    with torch.no_grad():
        for images in tqdm(df_loader):
            # No labels are passed, so the model returns embeddings
            batch_features = model(images.to(device))
            collected.append(batch_features.detach().cpu())

    features = torch.cat(collected).cpu().numpy()
    del model
    gc.collect()
    return features




In [None]:
# Load the data once for every model below: pandas frame, cudf copy and image paths.
df, df_cu, image_paths = read_dataset()
print(df.shape)
gc.collect()

## TFIDF

In [None]:
from cuml.feature_extraction.text import TfidfVectorizer

# Binary TF-IDF over the titles on the GPU, materialised as a dense matrix.
model = TfidfVectorizer(stop_words=None, binary=True, max_features=25000)
text_embeddings2 = model.fit_transform(df_cu.title).toarray()
print('text embeddings shape',text_embeddings2.shape)


# preds collects, per row and in row order, the array of posting_ids judged similar.
preds = []
# Rows are compared against the whole matrix in chunks of CHUNK rows
CHUNK = 1024*4

print('Finding similar titles...')
# Number of chunks = ceil(len(df_cu) / CHUNK)
CTS = len(df_cu)//CHUNK
if len(df_cu)%CHUNK!=0: CTS += 1
for j in range( CTS ):
    
    a = j*CHUNK
    b = (j+1)*CHUNK
    b = min(b,len(df_cu))
    print('chunk',a,'to',b)
    
    # COSINE SIMILARITY DISTANCE
    # Dot product of every chunk row with every row; after the transpose, row k
    # holds the scores of item a+k against all items.
    # NOTE(review): this equals cosine similarity only if the TF-IDF rows are
    # L2-normalised -- presumably the vectorizer default, TODO confirm.
    # cts = np.dot( text_embeddings, text_embeddings[a:b].T).T
    cts = cupy.matmul(text_embeddings2, text_embeddings2[a:b].T).T
    
    for k in range(b-a):
        # First pass: everything scoring above 0.775
        # IDX = np.where(cts[k,]>0.7)[0]
        IDX = cupy.where(cts[k,]>0.775)[0]
        o = df_cu.iloc[cupy.asnumpy(IDX)].posting_id.to_pandas().values
        if len(o) > 1:
            # More than one hit: retry with the stricter 0.80 threshold and keep
            # the stricter set only if it still contains more than one posting.
            # NOTE(review): this lookup goes through the pandas `df` while the
            # first pass uses `df_cu`; both come from read_dataset() -- confirm
            # their row order is identical.
            IDX_again = cupy.where(cts[k,] > 0.80)[0]
            o_again = df.iloc[cupy.asnumpy(IDX_again)].posting_id.values
            if len(o_again) > 1 :
                preds.append(o_again)
            else:
                preds.append(o)
        else:
            preds.append(o)
                
# Release the vectorizer and the dense TF-IDF matrix
del model, text_embeddings2

# Attach the per-row match arrays to the GPU frame
df_cu['oof_text'] = preds

## eca-nfnet-l0

In [None]:
import sys
sys.path.append('../input/timm-pytorch-image-models/pytorch-image-models-master')
import numpy as np 
import pandas as pd 

import math
import random 
import os 
import cv2
import timm

from tqdm import tqdm 

import albumentations as A 
from albumentations.pytorch.transforms import ToTensorV2

import torch 
from torch.utils.data import Dataset 
from torch import nn
import torch.nn.functional as F 

import gc
import cudf
import cuml
import cupy
from cuml.feature_extraction.text import TfidfVectorizer
from cuml.neighbors import NearestNeighbors


class ECACFG:
    """Settings for the eca_nfnet_l0 ArcFace image model."""

    # --- backbone and checkpoint ---
    model_name = 'eca_nfnet_l0'
    model_path = '../input/shopee-pytorch-models/arcface_512x512_nfnet_l0 (mish).pt'
    classes = 11014

    # --- ArcFace head defaults used by ECAShopeeModel ---
    scale = 30 
    margin = 0.5

    # --- inference ---
    img_size = 512
    batch_size = 12
    device = 'cuda'
    seed = 2020



def get_test_transforms():
    """Return the inference pipeline: resize to ECACFG.img_size, normalise, to tensor."""
    side = ECACFG.img_size
    steps = [
        A.Resize(side, side, always_apply=True),
        A.Normalize(),
        ToTensorV2(p=1.0),
    ]
    return A.Compose(steps)


class ECAShopeeDataset(Dataset):
    """Image dataset over an array of file paths, yielding (image, dummy label).

    Args:
        image_paths: indexable object with a `.shape` attribute holding the
            image file paths.
        transforms: optional callable invoked as `transforms(image=img)` whose
            result is indexed with 'image'.
    """

    def __init__(self, image_paths, transforms=None):
        self.augmentations = transforms
        self.image_paths = image_paths

    def __len__(self):
        return self.image_paths.shape[0]

    def __getitem__(self, index):
        # OpenCV loads BGR; convert to RGB before augmenting
        bgr = cv2.imread(self.image_paths[index])
        image = cv2.cvtColor(bgr, cv2.COLOR_BGR2RGB)

        if self.augmentations:
            image = self.augmentations(image=image)['image']

        # The constant tensor stands in for a label
        return image,torch.tensor(1)
    
class ECAArcMarginProduct(nn.Module):
    """ArcFace margin head with optional label smoothing.

    Args:
        in_features: width of the incoming embedding.
        out_features: number of classes.
        scale: factor applied to the final logits.
        margin: angular margin in radians.
        easy_margin: if True, apply the margin only where cosine > 0.
        ls_eps: label-smoothing epsilon applied to the one-hot targets.

    forward(input, label) returns the scaled logits: the target class of each
    row uses cos(theta + margin), every other class uses cos(theta).
    """
    def __init__(self, in_features, out_features, scale=30.0, margin=0.50, easy_margin=False, ls_eps=0.0):
        super(ECAArcMarginProduct, self).__init__()
        self.in_features = in_features
        self.out_features = out_features
        self.scale = scale
        self.margin = margin
        self.ls_eps = ls_eps  # label smoothing
        self.weight = nn.Parameter(torch.FloatTensor(out_features, in_features))
        nn.init.xavier_uniform_(self.weight)

        self.easy_margin = easy_margin
        self.cos_m = math.cos(margin)
        self.sin_m = math.sin(margin)
        # Threshold / fallback offset used when theta + margin would exceed pi
        self.th = math.cos(math.pi - margin)
        self.mm = math.sin(math.pi - margin) * margin

    def forward(self, input, label):
        # --------------------------- cos(theta) & phi(theta) ---------------------------
        cosine = F.linear(F.normalize(input), F.normalize(self.weight))
        # Clamp before the sqrt: rounding can push cosine^2 slightly above 1,
        # which would otherwise produce NaN.
        sine = torch.sqrt((1.0 - torch.pow(cosine, 2)).clamp(0, 1))
        phi = cosine * self.cos_m - sine * self.sin_m
        if self.easy_margin:
            phi = torch.where(cosine > 0, phi, cosine)
        else:
            phi = torch.where(cosine > self.th, phi, cosine - self.mm)
        # --------------------------- convert label to one-hot ---------------------------
        # Allocate on the device of the logits rather than a hard-coded 'cuda'
        one_hot = torch.zeros(cosine.size(), device=cosine.device)
        one_hot.scatter_(1, label.view(-1, 1).long().to(cosine.device), 1)
        if self.ls_eps > 0:
            one_hot = (1 - self.ls_eps) * one_hot + self.ls_eps / self.out_features
        # Margin logits for the target class, plain cosine for every other class
        output = (one_hot * phi) + ((1.0 - one_hot) * cosine)
        output *= self.scale

        return output

class ECAShopeeModel(nn.Module):
    """timm backbone with an FC + BatchNorm embedding head and an ArcFace layer.

    Args:
        n_classes: number of classes of the ArcFace head.
        model_name: timm model name; supported are 'resnext50_32x4d',
            'efficientnet_b3', 'tf_efficientnet_b5_ns', 'eca_nfnet_l0' and any
            name containing 'swin'.
        fc_dim: width of the embedding produced by the FC layer.
        margin, scale: ArcFace hyper-parameters.
        use_fc: apply dropout -> fc -> bn on top of the pooled features.
        pretrained: forwarded to timm.create_model.
        is_swin: route forward() through extract_swin_feat.

    Raises:
        ValueError: if `model_name` is not one of the supported backbones.
    """

    def __init__(
        self,
        n_classes = ECACFG.classes,
        model_name = ECACFG.model_name,
        fc_dim = 512,
        margin = ECACFG.margin,
        scale = ECACFG.scale,
        use_fc = True,
        pretrained = False,
        is_swin=False
    ):

        super(ECAShopeeModel,self).__init__()
        print('Building Model Backbone for {} model'.format(model_name))

        self.backbone = timm.create_model(model_name, pretrained=pretrained)

        # Strip the backbone's own classifier (and pooling) and remember the
        # feature width it used to receive.
        if model_name == 'resnext50_32x4d':
            final_in_features = self.backbone.fc.in_features
            self.backbone.fc = nn.Identity()
            self.backbone.global_pool = nn.Identity()

        elif model_name in ('efficientnet_b3', 'tf_efficientnet_b5_ns'):
            final_in_features = self.backbone.classifier.in_features
            self.backbone.classifier = nn.Identity()
            self.backbone.global_pool = nn.Identity()

        elif model_name == 'eca_nfnet_l0':
            final_in_features = self.backbone.head.fc.in_features
            self.backbone.head.fc = nn.Identity()
            self.backbone.head.global_pool = nn.Identity()

        elif 'swin' in model_name:
            final_in_features = self.backbone.head.in_features
            self.backbone.norm = nn.Identity()
            self.backbone.head = nn.Identity()

        else:
            # Previously an unknown name fell through and failed a few lines
            # later with an UnboundLocalError on final_in_features.
            raise ValueError('Unsupported model_name for ECAShopeeModel: {!r}'.format(model_name))

        self.pooling =  nn.AdaptiveAvgPool2d(1)

        self.use_fc = use_fc
        self.is_swin = is_swin

        self.dropout = nn.Dropout(p=0.0)
        self.fc = nn.Linear(final_in_features, fc_dim)
        self.bn = nn.BatchNorm1d(fc_dim)
        self._init_params()
        final_in_features = fc_dim

        self.final = ECAArcMarginProduct(
            final_in_features,
            n_classes,
            scale = scale,
            margin = margin,
            easy_margin = False,
            ls_eps = 0.0
        )

    def _init_params(self):
        """Initialise the FC layer (Xavier normal, zero bias) and BatchNorm (1 / 0)."""
        nn.init.xavier_normal_(self.fc.weight)
        nn.init.constant_(self.fc.bias, 0)
        nn.init.constant_(self.bn.weight, 1)
        nn.init.constant_(self.bn.bias, 0)

    def forward(self, image, label):
        """Return the embedding for `image`; `label` is accepted but not used."""
        if not self.is_swin:
            feature = self.extract_feat(image)
        else:
            feature = self.extract_swin_feat(image)
            # Debug output kept from the original swin path
            print(feature.shape)
        # The ArcFace logits are not computed here; only the embedding is returned
        #logits = self.final(feature,label)
        return feature

    def extract_feat(self, x):
        """Backbone -> global average pool -> (dropout, fc, bn when use_fc)."""
        batch_size = x.shape[0]
        x = self.backbone(x)
        x = self.pooling(x).view(batch_size, -1)

        if self.use_fc:
            x = self.dropout(x)
            x = self.fc(x)
            x = self.bn(x)
        return x

    def extract_swin_feat(self, x):
        """Backbone -> (dropout, fc, bn when use_fc); no spatial pooling is applied."""
        x = self.backbone(x)

        if self.use_fc:
            x = self.dropout(x)
            x = self.fc(x)
            x = self.bn(x)

        return x
    
class Mish_func(torch.autograd.Function):

    """Mish activation, ``x * tanh(softplus(x))``, with a hand-written backward.

    Only the input tensor is saved for the backward pass; the gradient is
    recomputed from it analytically.

    Adapted from: https://github.com/tyunist/memory_efficient_mish_swish/blob/master/mish.py
    """

    @staticmethod
    def forward(ctx, i):
        ctx.save_for_backward(i)
        return i * torch.tanh(F.softplus(i))

    @staticmethod
    def backward(ctx, grad_output):
        # `saved_tensors` is the supported accessor; `saved_variables` is deprecated.
        (i,) = ctx.saved_tensors

        # h = softplus(x); F.softplus avoids the overflow of an explicit exp().
        h = F.softplus(i)
        tanh_h = torch.tanh(h)

        # d tanh(h) / dh = sech^2(h)
        grad_gh = 1. / h.cosh().pow_(2)
        # d softplus(x) / dx = sigmoid(x)
        grad_hx = i.sigmoid()
        grad_gx = grad_gh * grad_hx

        # Product rule on x * tanh(softplus(x)).
        grad_f = tanh_h + i * grad_gx

        return grad_output * grad_f


class Mish(nn.Module):
    """``nn.Module`` wrapper so ``Mish_func`` can be used as a layer."""

    def __init__(self, **kwargs):
        # Keyword arguments are accepted and ignored; the layer has no state.
        super(Mish, self).__init__()

    def forward(self, input_tensor):
        activated = Mish_func.apply(input_tensor)
        return activated


def replace_activations(model, existing_layer, new_layer):

    """Recursively swap every sub-module of type ``existing_layer`` for ``new_layer``.

    The same ``new_layer`` instance is installed at every matching position.
    The model is modified in place and also returned.
    """

    registry = model._modules
    for child_name, child in reversed(registry.items()):
        # Descend into containers first so nested activations are replaced too.
        if any(True for _ in child.children()):
            registry[child_name] = replace_activations(child, existing_layer, new_layer)

        # Exact type match on purpose: subclasses of existing_layer are left alone.
        if type(child) == existing_layer:
            registry[child_name] = new_layer
    return model


def get_ecanfnet_embeddings(image_paths, model_name = ECACFG.model_name):
    """Compute image embeddings with the ECA-NFNet ArcFace checkpoint.

    Builds ``ECAShopeeModel``, loads the weights from ``ECACFG.model_path``,
    runs every image in ``image_paths`` through it in batches under
    ``torch.no_grad()`` and returns the per-batch features concatenated into
    one NumPy array. The model, dataset and loader are released and the CUDA
    cache is emptied before returning.
    """
    net = ECAShopeeModel(model_name = model_name)
    net.eval()

    # For eca_nfnet_l0 the SiLU activations are swapped for Mish before the
    # weights are loaded.
    if model_name == 'eca_nfnet_l0':
        net = replace_activations(net, torch.nn.SiLU, Mish())

    net.load_state_dict(torch.load(ECACFG.model_path))
    net = net.to(ECACFG.device)

    image_dataset = ECAShopeeDataset(image_paths=image_paths, transforms=get_test_transforms())
    image_loader = torch.utils.data.DataLoader(
        image_dataset,
        batch_size=ECACFG.batch_size,
        pin_memory=True,
        drop_last=False,
        num_workers=4,
    )

    batch_features = []
    with torch.no_grad():
        for img, label in tqdm(image_loader):
            img = img.cuda()
            label = label.cuda()
            feat = net(img, label)
            batch_features.append(feat.detach().cpu().numpy())

    # Free the model and loader before building the final array.
    del image_dataset, image_loader
    del net
    image_embeddings = np.concatenate(batch_features)
    print(f'Our image embeddings shape is {image_embeddings.shape}')
    del batch_features
    gc.collect()
    torch.cuda.empty_cache()
    return image_embeddings


class ECACFG:
    """Inference settings for the ``eca_nfnet_l0`` image model."""

    # Architecture and checkpoint.
    model_name = 'eca_nfnet_l0'
    model_path = '../input/shopee-pytorch-models/arcface_512x512_nfnet_l0 (mish).pt'

    # Input pipeline.
    img_size, batch_size = 512, 12
    seed = 2020
    device = 'cuda'

    # ArcFace head.
    classes = 11014
    scale, margin = 30, 0.5
    


# Submission mode only: compute the ECA-NFNet embeddings now. When GET_CV is
# true this is skipped and `image_eca_embeddings` is loaded from a saved .npy
# file in a later cell.
if not GET_CV:
    
    image_eca_embeddings = get_ecanfnet_embeddings(image_paths.values)
    print("image_eca_embeddings.shape",image_eca_embeddings.shape)

## SWIN model

In [None]:
import sys
sys.path.append('../input/timm-pytorch-image-models/pytorch-image-models-master')
import numpy as np 
import pandas as pd 

import math
import random 
import os 
import cv2
import timm

from tqdm import tqdm 

import albumentations as A 
from albumentations.pytorch.transforms import ToTensorV2

import torch 
from torch.utils.data import Dataset 
from torch import nn
import torch.nn.functional as F 

import gc
import cudf
import cuml
import cupy
from cuml.feature_extraction.text import TfidfVectorizer
from cuml.neighbors import NearestNeighbors




class ECACFG:
    """Inference settings for the Swin transformer image model.

    Redefines the name ``ECACFG``, replacing any earlier definition.
    """

    # Architecture and checkpoint.
    model_name = 'swin_small_patch4_window7_224'
    model_path = '../input/shopee-pytorch-swin-transformer-image-training/swin_small_patch4_window7_224.pt'

    # Input pipeline.
    img_size, batch_size = 224, 12
    seed = 2020
    device = 'cuda'

    # ArcFace head.
    classes = 11014
    scale, margin = 30, 0.5

def get_test_transforms():
    """Build the inference-time albumentations pipeline.

    Resizes to ``ECACFG.img_size`` square, applies ``A.Normalize()`` with its
    default arguments and converts the image to a tensor.
    """
    side = ECACFG.img_size
    steps = [
        A.Resize(side, side, always_apply=True),
        A.Normalize(),
        ToTensorV2(p=1.0),
    ]
    return A.Compose(steps)


class ECAShopeeDataset(Dataset):
    """Dataset that reads images from disk and yields ``(image, dummy_label)``.

    ``image_paths`` must expose ``.shape`` and integer indexing (for example
    a NumPy array of path strings). ``transforms`` is an optional callable
    invoked as ``transforms(image=...)`` whose result is indexed with
    ``'image'``.
    """

    def __init__(self, image_paths, transforms=None):
        self.image_paths, self.augmentations = image_paths, transforms

    def __len__(self):
        n_images = self.image_paths.shape[0]
        return n_images

    def __getitem__(self, index):
        # OpenCV loads BGR; convert to RGB before any transform.
        bgr = cv2.imread(self.image_paths[index])
        image = cv2.cvtColor(bgr, cv2.COLOR_BGR2RGB)

        if self.augmentations:
            image = self.augmentations(image=image)['image']

        # The label is a constant placeholder; inference ignores it.
        return image, torch.tensor(1)
    
class ECAArcMarginProduct(nn.Module):
    """ArcFace (additive angular margin) classification head.

    Args:
        in_features: size of each input embedding.
        out_features: number of classes.
        scale: factor the final logits are multiplied by.
        margin: angular margin (radians) added to the target-class angle.
        easy_margin: if true, apply the margin only where ``cosine > 0``.
        ls_eps: label-smoothing epsilon; 0 disables smoothing.
    """

    def __init__(self, in_features, out_features, scale=30.0, margin=0.50, easy_margin=False, ls_eps=0.0):
        super(ECAArcMarginProduct, self).__init__()
        self.in_features = in_features
        self.out_features = out_features
        self.scale = scale
        self.margin = margin
        self.ls_eps = ls_eps  # label smoothing
        self.weight = nn.Parameter(torch.FloatTensor(out_features, in_features))
        nn.init.xavier_uniform_(self.weight)

        self.easy_margin = easy_margin
        # Constants for cos(theta + m) and for the fallback used when
        # theta + m would exceed pi.
        self.cos_m = math.cos(margin)
        self.sin_m = math.sin(margin)
        self.th = math.cos(math.pi - margin)
        self.mm = math.sin(math.pi - margin) * margin

    def forward(self, input, label):
        """Return scaled logits with the margin applied to each sample's target class."""
        # cos(theta) between L2-normalised embeddings and class weights.
        cosine = F.linear(F.normalize(input), F.normalize(self.weight))
        # Clamp before sqrt: rounding can push 1 - cos^2 slightly below 0,
        # which would otherwise produce NaN.
        sine = torch.sqrt((1.0 - torch.pow(cosine, 2)).clamp(0, 1))
        # phi = cos(theta + m)
        phi = cosine * self.cos_m - sine * self.sin_m
        if self.easy_margin:
            phi = torch.where(cosine > 0, phi, cosine)
        else:
            phi = torch.where(cosine > self.th, phi, cosine - self.mm)

        # One-hot targets, created on the same device as the logits rather
        # than a hard-coded 'cuda', so the head also runs on CPU.
        one_hot = torch.zeros_like(cosine)
        one_hot.scatter_(1, label.view(-1, 1).long(), 1)
        if self.ls_eps > 0:
            one_hot = (1 - self.ls_eps) * one_hot + self.ls_eps / self.out_features

        # Target class takes phi, every other class keeps the plain cosine.
        output = (one_hot * phi) + ((1.0 - one_hot) * cosine)
        output *= self.scale

        return output

class ECAShopeeModel(nn.Module):
    """timm backbone followed by an optional FC + BatchNorm embedding head.

    Used here as a feature extractor: ``forward`` returns the embedding and
    does not apply the ArcFace head stored in ``self.final``.

    Args:
        n_classes: number of classes for the ArcFace head.
        model_name: timm model name; must be one of the architectures
            handled in ``__init__`` (or contain ``'swin'``).
        fc_dim: output size of the FC projection.
        margin, scale: ArcFace head hyper-parameters.
        use_fc: apply dropout -> FC -> BatchNorm after the backbone.
        pretrained: forwarded to ``timm.create_model``.
        is_swin: route ``forward`` through ``extract_swin_feat`` (no pooling).

    Raises:
        ValueError: if ``model_name`` is not a supported backbone.
    """

    def __init__(
        self,
        n_classes = ECACFG.classes,
        model_name = ECACFG.model_name,
        fc_dim = 512,
        margin = ECACFG.margin,
        scale = ECACFG.scale,
        use_fc = True,
        pretrained = False,
        is_swin=False
    ):

        super(ECAShopeeModel,self).__init__()
        print('Building Model Backbone for {} model'.format(model_name))

        self.backbone = timm.create_model(model_name, pretrained=pretrained)

        # Replace each architecture's classifier (and built-in pooling, where
        # it has one) with Identity, remembering the feature width.
        if model_name == 'resnext50_32x4d':
            final_in_features = self.backbone.fc.in_features
            self.backbone.fc = nn.Identity()
            self.backbone.global_pool = nn.Identity()

        elif model_name in ('efficientnet_b3', 'tf_efficientnet_b5_ns'):
            final_in_features = self.backbone.classifier.in_features
            self.backbone.classifier = nn.Identity()
            self.backbone.global_pool = nn.Identity()

        elif model_name == 'eca_nfnet_l0':
            final_in_features = self.backbone.head.fc.in_features
            self.backbone.head.fc = nn.Identity()
            self.backbone.head.global_pool = nn.Identity()

        elif 'swin' in model_name:
            final_in_features = self.backbone.head.in_features
            self.backbone.norm = nn.Identity()
            self.backbone.head = nn.Identity()

        else:
            # Without this the code would fail further down with an
            # UnboundLocalError on `final_in_features`.
            raise ValueError('Unsupported backbone for ECAShopeeModel: {!r}'.format(model_name))

        self.pooling =  nn.AdaptiveAvgPool2d(1)

        self.use_fc = use_fc
        self.is_swin = is_swin

        self.dropout = nn.Dropout(p=0.0)
        self.fc = nn.Linear(final_in_features, fc_dim)
        self.bn = nn.BatchNorm1d(fc_dim)
        self._init_params()
        final_in_features = fc_dim

        self.final = ECAArcMarginProduct(
            final_in_features,
            n_classes,
            scale = scale,
            margin = margin,
            easy_margin = False,
            ls_eps = 0.0
        )

    def _init_params(self):
        """Xavier-normal FC weights, zero FC bias, BatchNorm weight 1 / bias 0."""
        nn.init.xavier_normal_(self.fc.weight)
        nn.init.constant_(self.fc.bias, 0)
        nn.init.constant_(self.bn.weight, 1)
        nn.init.constant_(self.bn.bias, 0)

    def forward(self, image, label):
        """Return the embedding for ``image``; ``label`` is accepted but unused."""
        if self.is_swin:
            return self.extract_swin_feat(image)
        return self.extract_feat(image)

    def _apply_head(self, x):
        """Apply dropout -> FC -> BatchNorm when ``use_fc`` is set, else pass through."""
        if self.use_fc:
            x = self.dropout(x)
            x = self.fc(x)
            x = self.bn(x)
        return x

    def extract_feat(self, x):
        """Backbone -> adaptive average pooling -> flatten -> optional head."""
        batch_size = x.shape[0]
        x = self.backbone(x)
        x = self.pooling(x).view(batch_size, -1)
        return self._apply_head(x)

    def extract_swin_feat(self, x):
        """Backbone -> optional head, with no pooling step.

        NOTE(review): presumably the Swin backbone already returns one vector
        per sample -- confirm against the timm version in use.
        """
        x = self.backbone(x)
        return self._apply_head(x)
    
class Mish_func(torch.autograd.Function):

    """Mish activation, ``x * tanh(softplus(x))``, with a hand-written backward.

    Only the input tensor is saved for the backward pass; the gradient is
    recomputed from it analytically.

    Adapted from: https://github.com/tyunist/memory_efficient_mish_swish/blob/master/mish.py
    """

    @staticmethod
    def forward(ctx, i):
        ctx.save_for_backward(i)
        return i * torch.tanh(F.softplus(i))

    @staticmethod
    def backward(ctx, grad_output):
        # `saved_tensors` is the supported accessor; `saved_variables` is deprecated.
        (i,) = ctx.saved_tensors

        # h = softplus(x); F.softplus avoids the overflow of an explicit exp().
        h = F.softplus(i)
        tanh_h = torch.tanh(h)

        # d tanh(h) / dh = sech^2(h)
        grad_gh = 1. / h.cosh().pow_(2)
        # d softplus(x) / dx = sigmoid(x)
        grad_hx = i.sigmoid()
        grad_gx = grad_gh * grad_hx

        # Product rule on x * tanh(softplus(x)).
        grad_f = tanh_h + i * grad_gx

        return grad_output * grad_f


class Mish(nn.Module):
    """``nn.Module`` wrapper so ``Mish_func`` can be used as a layer."""

    def __init__(self, **kwargs):
        # Keyword arguments are accepted and ignored; the layer has no state.
        super(Mish, self).__init__()

    def forward(self, input_tensor):
        activated = Mish_func.apply(input_tensor)
        return activated


def replace_activations(model, existing_layer, new_layer):

    """Recursively swap every sub-module of type ``existing_layer`` for ``new_layer``.

    The same ``new_layer`` instance is installed at every matching position.
    The model is modified in place and also returned.
    """

    registry = model._modules
    for child_name, child in reversed(registry.items()):
        # Descend into containers first so nested activations are replaced too.
        if any(True for _ in child.children()):
            registry[child_name] = replace_activations(child, existing_layer, new_layer)

        # Exact type match on purpose: subclasses of existing_layer are left alone.
        if type(child) == existing_layer:
            registry[child_name] = new_layer
    return model



def get_swin_embeddings(image_paths, model_name = ECACFG.model_name):
    """Compute image embeddings with the Swin transformer checkpoint.

    Builds ``ECAShopeeModel`` with ``is_swin=True``, loads the weights from
    ``ECACFG.model_path``, runs every image in ``image_paths`` through it in
    batches under ``torch.no_grad()`` and returns the per-batch features
    concatenated into one NumPy array. The model, dataset and loader are
    released and the CUDA cache is emptied before returning.
    """
    net = ECAShopeeModel(model_name = model_name, is_swin=True)
    net.eval()

    # NOTE(review): this is an exact comparison with 'swin', so with the
    # default 'swin_small_patch4_window7_224' the GELU -> Mish swap never
    # runs. Confirm that matches how the checkpoint was trained.
    if model_name == 'swin':
        net = replace_activations(net, torch.nn.GELU, Mish())

    net.load_state_dict(torch.load(ECACFG.model_path))
    net = net.to(ECACFG.device)

    image_dataset = ECAShopeeDataset(image_paths=image_paths, transforms=get_test_transforms())
    image_loader = torch.utils.data.DataLoader(
        image_dataset,
        batch_size=ECACFG.batch_size,
        pin_memory=True,
        drop_last=False,
        num_workers=4,
    )

    batch_features = []
    with torch.no_grad():
        for img, label in tqdm(image_loader):
            img = img.cuda()
            label = label.cuda()
            feat = net(img, label)
            batch_features.append(feat.detach().cpu().numpy())

    # Free the model and loader before building the final array.
    del image_dataset, image_loader
    del net
    image_embeddings = np.concatenate(batch_features)
    print(f'Our image embeddings shape is {image_embeddings.shape}')
    del batch_features
    gc.collect()
    torch.cuda.empty_cache()
    return image_embeddings




class ECACFG:
    """Inference settings for the Swin transformer image model.

    Same values as the ``ECACFG`` defined earlier in this cell; redefining it
    rebinds the name.
    """

    # Architecture and checkpoint.
    model_name = 'swin_small_patch4_window7_224'
    model_path = '../input/shopee-pytorch-swin-transformer-image-training/swin_small_patch4_window7_224.pt'

    # Input pipeline.
    img_size, batch_size = 224, 12
    seed = 2020
    device = 'cuda'

    # ArcFace head.
    classes = 11014
    scale, margin = 30, 0.5

    
# Submission mode only: compute the Swin embeddings now. When GET_CV is true
# this is skipped and `image_swin_embeddings` is loaded from a saved .npy
# file in the next cell.
if not GET_CV:
    
    image_swin_embeddings = get_swin_embeddings(image_paths.values)
    print("image_swin_embeddings.shape",image_swin_embeddings.shape)

In [None]:
# Build (submission mode) or load (CV mode) the remaining embedding sets.
if not GET_CV:
    
    
#     np.save("image_eca_embeddings.npy",image_eca_embeddings)    
    
#     np.save("image_swin_embeddings.npy",image_swin_embeddings)    
    # EfficientNet embeddings via the TensorFlow helper defined earlier.
    image_b3_embeddings = get_image_embeddings(image_paths,model_type="B3")
    image_b4_embeddings = get_image_embeddings(image_paths,model_type="B4")

    # Release TensorFlow state before the PyTorch models run.
    gc.collect()
    tf.keras.backend.clear_session()
    
    
    text_indonesian_embeddings = get_indoesian_text_embedding(df)
    torch.cuda.empty_cache()
    
    
    # NOTE(review): the split name passed here is 'train' even though this is
    # the submission branch -- confirm SHOPEEDataset handles that as intended.
    dataset_df = SHOPEEDataset(df, 'train', transform=transforms_valid)
    df_loader = torch.utils.data.DataLoader(dataset_df, batch_size=batch_size, 
                                            shuffle=False, num_workers=num_workers, 
                                            pin_memory=True)
    image_desnet512_embeddings = generate_desent_features(df_loader)
    torch.cuda.empty_cache()
    
    
else:
    # CV mode: every embedding set, including the ECA and Swin ones, is read
    # from a saved .npy file instead of being recomputed.
    image_eca_embeddings=np.load('../input/shopeeexternalmodels/image_eca_embeddings.npy')    
    image_swin_embeddings=np.load('../input/shopeeexternalmodels/image_swin_embeddings.npy')
    image_b3_embeddings=np.load('../input/allembeddings/image_b3_embeddings.npy')    
    image_b4_embeddings=np.load('../input/allembeddings/image_b4_embeddings.npy')    
    text_indonesian_embeddings=np.load('../input/allembeddings/text_indonesian_embeddings.npy')
    image_desnet512_embeddings=np.load('../input/allembeddings/image_desnet512_embeddings.npy')
    
    print("image_eca_embeddings.shape",image_eca_embeddings.shape)    
    print("image_swin_embeddings.shape",image_swin_embeddings.shape)    

    print("image_b3_embeddings.shape",image_b3_embeddings.shape)    
    print("image_b4_embeddings.shape",image_b4_embeddings.shape)    
    print("text_indonesian_embeddings.shape",text_indonesian_embeddings.shape)    
    print("image_desnet512_embeddings.shape",image_desnet512_embeddings.shape)    

In [None]:
def find_strict_th(cnt=1,ths=[4.8,4.8,4.8,4.8,4.8]):
    """Pick a threshold from ``ths`` according to which size bucket ``cnt`` falls in.

    Buckets are [1, 10), [10, 20), [20, 30), [30, 40); any other value
    (including ``cnt < 1`` and ``cnt >= 40``) uses the last entry, ``ths[4]``.
    """
    if cnt >= 1:
        for bucket, upper_bound in enumerate((10, 20, 30, 40)):
            if cnt < upper_bound:
                return ths[bucket]
    return ths[4]


In [None]:
def _select_posting_ids(df, distances, indices, k, perfect, strict):
    """Return the posting ids matched to row ``k`` by the two-stage distance filter."""
    ids = indices[k, np.where(distances[k,] < perfect)[0]]
    posting_ids = df['posting_id'].iloc[ids].values

    # Noise filter: when the loose threshold yields several matches, prefer
    # the tighter `strict` threshold as long as it still keeps more than one.
    if len(posting_ids) > 1 and strict:
        ids_strict = indices[k, np.where(distances[k,] < strict)[0]]
        if len(ids_strict) > 1:
            return df['posting_id'].iloc[ids_strict].values
    return posting_ids


def _sweep_thresholds(df, distances, indices, thresholds):
    """Print the mean F1 for each candidate threshold and report the best one.

    Overwrites ``df['pred_matches']`` and ``df['f1']`` on every iteration.
    """
    scores = []
    for threshold in thresholds:
        predictions = []
        for k in range(distances.shape[0]):
            idx = np.where(distances[k,] < threshold)[0]
            ids = indices[k,idx]
            predictions.append(' '.join(df['posting_id'].iloc[ids].values))
        df['pred_matches'] = predictions
        df['f1'] = f1_score(df['matches'], df['pred_matches'])
        score = df['f1'].mean()
        print(f'Our f1 score for threshold {threshold} is {score}')
        scores.append(score)
    thresholds_scores = pd.DataFrame({'thresholds': thresholds, 'scores': scores})
    max_score = thresholds_scores[thresholds_scores['scores'] == thresholds_scores['scores'].max()]
    best_threshold = max_score['thresholds'].values[0]
    best_score = max_score['scores'].values[0]
    print(f'Our best score is {best_score} and has a threshold {best_threshold}')


def get_neighbors(df, 
                  embeddings, 
                  KNN = 50, 
                  image = True,
                  thresholds=None,
                  perfect=None,
                  strict=None,
                  strict_ths=None,
                  is_csr=False,
                  finetuneing=False
                 ):
    """Find, for every row, the posting ids of its nearest neighbours.

    Args:
        df: frame with a ``posting_id`` column (plus ``matches`` when the
            threshold sweep runs); row order must match ``embeddings``.
        embeddings: one embedding per row of ``df``.
        KNN: neighbours to retrieve; capped at the number of rows so small
            inputs do not make the neighbour search fail.
        image: unused; kept for call compatibility.
        thresholds: candidate distance thresholds for the diagnostic sweep.
        perfect: distance threshold used for the returned predictions.
        strict: optional tighter threshold, see ``_select_posting_ids``.
        strict_ths: unused; kept for call compatibility.
        is_csr: use scikit-learn's ``NearestNeighbors`` instead of cuML's.
        finetuneing: when true and ``GET_CV`` is set, run the sweep, which
            prints F1 scores and overwrites ``df['pred_matches']`` and
            ``df['f1']``. Skipped when ``thresholds`` is empty or ``None``.

    Returns:
        ``(df, predictions)`` where ``predictions`` holds one array of
        posting ids per row.
    """
    if is_csr:
        from sklearn.neighbors import NearestNeighbors
    else:
        from cuml.neighbors import NearestNeighbors

    n_rows = embeddings.shape[0]
    model = NearestNeighbors(n_neighbors = min(KNN, n_rows))
    model.fit(embeddings)
    distances, indices = model.kneighbors(embeddings)

    if GET_CV and finetuneing and thresholds is not None and len(thresholds) > 0:
        _sweep_thresholds(df, distances, indices, thresholds)

    # Same selection in both modes; only submission mode shows a progress bar.
    rows = range(n_rows) if GET_CV else tqdm(range(n_rows))
    predictions = [
        _select_posting_ids(df, distances, indices, k, perfect, strict)
        for k in rows
    ]

    del model, distances, indices
    gc.collect()
    return df, predictions



In [None]:
# ====================eca================
# Strict recall for the ECA-NFNet embeddings. `eca_thresholds_range` is only
# used by the diagnostic F1 sweep inside get_neighbors (GET_CV with
# finetuneing=True); the returned predictions use `perfect` and `strict`.
eca_thresholds_range = list(np.arange(20, 23, 0.5))
perfect_eca_threshold=16

df, image_eca_predictions = get_neighbors(
    df, image_eca_embeddings, 
    KNN = 100, image = True,
    thresholds=eca_thresholds_range,
    perfect=perfect_eca_threshold,
    strict=15,
    finetuneing=True
)

In [None]:


# ====================swin================
print("# ===============swin=============")

swin_thresholds_range = list(np.arange(0.4, 1.0, 0.1))
perfect_swin_threshold=0.4

# NOTE(review): the "swin" predictions below are computed from
# `image_desnet512_embeddings`, not `image_swin_embeddings`. This looks like
# a copy/paste slip, but the 0.4 / 0.3 thresholds were presumably tuned on
# what actually runs -- confirm and re-tune before switching the input.
df, image_swin_predictions = get_neighbors(
    df, image_desnet512_embeddings, 
    KNN = 100, image = True,
    thresholds=swin_thresholds_range,
    perfect=perfect_swin_threshold,
    strict=0.3,
    finetuneing=True
)

# ===============efficientnet=============
print("# ===============efficientnet=============")

b3_thresholds_range = list(np.arange(4.0, 5.0, 0.1))
perfect_b3_threshold=4.0

df, image_b3_predictions = get_neighbors(
    df, image_b3_embeddings, 
    KNN = 100, image = True,
    thresholds=b3_thresholds_range,
    perfect=4.0,
    strict=3.9,
    finetuneing=True
    
#     strict_ths=[3.85,3.85,3.90,3.90,3.90]
)
# The B4 embeddings reuse the B3 sweep range and use a smaller KNN (50).
df, image_b4_predictions = get_neighbors(
    df, image_b4_embeddings, 
    KNN = 50, image = True,
    thresholds=b3_thresholds_range,
    perfect=4.1,
    strict=4.0,
    finetuneing=True
    
#     strict_ths=[3.95,3.95,4.0,4.0,4.0]
    
)



# ================desnet512================
print("# ===============desnet512=============")

desnet_thresholds_range = list(np.arange(0.2, 0.7, 0.05))
perfect_desnet_threshold=0.3

df, image_predictions_desnet = get_neighbors(
    df, image_desnet512_embeddings, 
    KNN = 100, image = True,
    thresholds=desnet_thresholds_range,
    perfect=perfect_desnet_threshold,
    strict=0.295,
    finetuneing=True
    
#     strict_ths=[0.291,0.291,0.295,0.295,0.295]
    
)



# ===============indonesian bert=============
print("# ===============indonesian bert=============")
text_thresholds_range = list(np.arange(20, 28, 1))
perfect_text_threshold=14.0
df, text_indonesian_predictions = get_neighbors(
    df, text_indonesian_embeddings, 
    KNN = 100, image = False,
    thresholds=text_thresholds_range,
    perfect=perfect_text_threshold,
    strict=13.5,
    finetuneing=True
    
#     strict_ths=[13.2,13.2,13.5,13.5,13.5]
    
)


In [None]:
def cal_result_metrics(df):
    """Print match and recall diagnostics comparing ``pred_matches`` with ``matches``.

    Both columns hold space-separated posting ids. Ids are compared as whole
    tokens, so an id that is a prefix of another is not counted as a hit.

    Side effects: prints the statistics and adds ``matches_count`` and
    ``pred_count`` columns to ``df``. Returns ``None``. An empty frame
    reports ratios of 0.0 instead of raising ``ZeroDivisionError``.
    """
    print(df.shape)

    full_right = 0          # rows whose prediction equals the ground truth exactly
    full_recall_right = 0   # rows where every true id was predicted
    right_recall_1 = 0      # true ids that appear in the prediction
    right_recall_2 = 0      # predicted ids that appear in the ground truth
    not_recall = 0          # true ids missing from the prediction
    error_cnt = 0           # predicted ids absent from the ground truth

    for _, row in df.iterrows():
        true_ids = row['matches'].split()
        pred_ids = row['pred_matches'].split()
        true_set, pred_set = set(true_ids), set(pred_ids)

        if sorted(true_ids) == sorted(pred_ids):
            full_right += 1

        recalled = sum(1 for x in true_ids if x in pred_set)
        right_recall_1 += recalled
        not_recall += len(true_ids) - recalled
        if recalled == len(true_ids):
            full_recall_right += 1

        correct = sum(1 for x in pred_ids if x in true_set)
        right_recall_2 += correct
        error_cnt += len(pred_ids) - correct

    df['matches_count']=df['matches'].apply(lambda x:len(x.split()))
    df['pred_count']=df['pred_matches'].apply(lambda x:len(x.split()))

    total_true = df['matches_count'].sum()
    print("真实样本匹配的个数：",total_true)
    print("候选集预测的个数：",df['pred_count'].sum())

    n_rows = df.shape[0]
    full_right_ratio = full_right / n_rows if n_rows else 0.0
    print("候选集pred_matches完全与matches相等的概率率：",full_right_ratio)
    full_right_recall_ratio = full_recall_right / n_rows if n_rows else 0.0
    # This line previously printed full_right_ratio by mistake.
    print("每一个matches中的都出现在pred_matches的概率：",full_right_recall_ratio)
    print("候选集中没有出现在matches中的个数，相当于是预测错误的个数：",error_cnt)
    print("候选集中没有预测到matches样本的个数，相当于没有召回到的个数：",not_recall)
    print("候选集中预测正确的个数：",right_recall_1)
    print("候选集中预测正确的个数：",right_recall_2)

    print("真实召回率",right_recall_2 / total_true if total_true else 0.0)

In [None]:
# Function to combine predictions
def combine_predictions(row):
    """Merge the per-model predictions of one row into a single id string.

    Takes at most the first 48 ids from each prediction column, removes
    duplicates with ``np.unique`` (which also sorts them) and joins the
    result with spaces.
    """
    sources = (
        'image_eca_predictions',
        'image_swin_predictions',
        'image_b3_predictions',
        'image_b4_predictions',
        'image_predictions_desnet',
        'text_indonesian_predictions',
        'oof_text',
    )
    pooled = np.concatenate([row[name][:48] for name in sources])
    merged = np.unique(pooled)
    return ' '.join(merged)

# Attach each model's strict predictions as a column so combine_predictions
# can merge them row by row.
df['image_eca_predictions'] = image_eca_predictions
df['image_swin_predictions'] = image_swin_predictions
df['image_b3_predictions'] = image_b3_predictions
df['image_b4_predictions'] = image_b4_predictions
df['image_predictions_desnet'] = image_predictions_desnet

df['text_indonesian_predictions'] = text_indonesian_predictions
# NOTE(review): `df_cu` comes from an earlier cell; `.to_pandas()` suggests a
# cuDF frame, and this assumes its row order matches `df` -- confirm.
df['oof_text'] = df_cu['oof_text'].to_pandas().values
df['pred_matches'] = df.apply(combine_predictions, axis = 1)

if GET_CV:
    df['f1'] = f1_score(df['matches'], df['pred_matches'])
    score = df['f1'].mean()
    print(f'1111 Our final f1 cv score is {score}')
    cal_result_metrics(df)
    
    # Predictions per row; later cells bucket rows by this count.
    df['pred_count']=df['pred_matches'].apply(lambda x:len(x.split()))
    print("原始匹配结果数量分布",df['pred_count'].describe())
    

In [None]:
# CV only: report the metrics separately for rows grouped by how many
# matches were predicted. Each pair is a half-open range [start, end) on
# `pred_count`.
if GET_CV:
    pari_counts=[
        (1,10),
        (10,20),
        (20,30),
        (30,40),
        (40,50),
        (50,100)
    ]

    for start,end in pari_counts:
        print("==============>",start,end,"====================>")
        tmp=df.loc[(start<=df['pred_count'])&(df['pred_count']<end),:]
        cal_result_metrics(tmp)

In [None]:
# 获取tfidf向量
def get_tfidf_embeddings(df, max_features = 15500):
    """Fit a TF-IDF vectoriser on ``df['title']`` and return the transformed titles.

    The function-local import uses scikit-learn's ``TfidfVectorizer`` and
    shadows any module-level name of the same name inside this function.
    ``max_features`` is passed to the vectoriser.
    """
    from sklearn.feature_extraction.text import TfidfVectorizer

    vectorizer = TfidfVectorizer(max_features = max_features)
    text_embeddings = vectorizer.fit_transform(df['title'])
    print(f'Our title text embedding shape is {text_embeddings.shape}')

    # Drop the local references before collecting garbage.
    del vectorizer, df
    gc.collect()
    return text_embeddings


# TF-IDF title vectors; later passed to get_neighbors with is_csr=True.
text_tfidf_embeddings = get_tfidf_embeddings(df, max_features = 21500)
print("text_tfidf_embeddings.shape",text_tfidf_embeddings.shape)

In [None]:
# Second, looser recall pass: the `perfect` thresholds here are larger than
# in the strict pass and no `strict` filter is given. `finetuneing` keeps its
# default of False, so the `thresholds` arguments are not used by these calls.
df, big_eca_pred = get_neighbors(df, image_eca_embeddings, KNN = 50, image = True,
                                      thresholds=eca_thresholds_range,
                                      perfect=21.5)


df, big_swin_pred = get_neighbors(df, image_swin_embeddings, KNN = 50, image = True,
                                      thresholds=swin_thresholds_range,
                                      perfect=0.6)


df, big_b3_pred = get_neighbors(df, image_b3_embeddings, KNN = 50, image = True,
                                      thresholds=b3_thresholds_range,
                                      perfect=4.8)


df, big_b4_pred = get_neighbors(df, image_b4_embeddings, KNN = 50, image = True,
                                      thresholds=b3_thresholds_range,
                                      perfect=4.9)



df, big_desnet_pred = get_neighbors(df, image_desnet512_embeddings, KNN = 100, image = True,
                                             thresholds=desnet_thresholds_range,
                                             perfect=0.6)

df, big_indonsian_pred = get_neighbors(df, text_indonesian_embeddings, KNN = 100, image = False,
                                                thresholds=text_thresholds_range,
                                                perfect=25)
# is_csr=True selects the scikit-learn neighbour search for the TF-IDF vectors.
df, big_tfidf_pred = get_neighbors(df, text_tfidf_embeddings, KNN = 100, image = True,
                                             thresholds=desnet_thresholds_range,
                                             perfect=0.8,is_csr=True)

# These embeddings are not needed after this point; drop the references.
del image_b3_embeddings,image_b4_embeddings,text_indonesian_embeddings
del text_tfidf_embeddings


df['big_eca_pred']=big_eca_pred
df['big_swin_pred']=big_swin_pred
df['big_b3_pred']=big_b3_pred
df['big_b4_pred']=big_b4_pred
df['big_desnet_pred']=big_desnet_pred
# oof_hash: all posting ids that share this row's image_phash.
tmp = df.groupby('image_phash').posting_id.agg('unique').to_dict()
df['oof_hash'] = df.image_phash.map(tmp)

df['big_indonsian_pred']=big_indonsian_pred
df['big_tfidf_pred']=big_tfidf_pred

del big_eca_pred,big_swin_pred
del big_b3_pred,big_b4_pred,big_indonsian_pred
del big_desnet_pred,tmp,big_tfidf_pred

gc.collect()
df.head()

In [None]:
# def combine_predictions_v2(row,topn):
#     #     print(row['text_predictions_indonesian'])
#     x = np.concatenate([
#         row['big_b3_pred'][:topn],
#         row['big_b4_pred'][:topn],
#         row['big_indonsian_pred'][:topn],
#         row['big_desnet_pred'][:topn],
#         row['oof_hash'][:topn],
#         row['big_tfidf_pred'][:topn],
#     ])
#     return ' '.join(np.unique(x))

# df['big_pred_matches'] = df.apply(lambda row:combine_predictions_v2(row,2), axis=1)
# df.head()

In [None]:
# 投票召回频次最大的两个
from collections import Counter
def vote_recall_predictions(row):
    """Build a fallback prediction for one row by voting across the image models.

    The first three loose-threshold neighbours of each of the five image
    models are pooled and the (up to) two most frequent posting ids are kept.
    They are merged with the first three ``oof_hash`` and ``big_tfidf_pred``
    ids. If that still gives fewer than two distinct ids, the result is
    replaced by the first three ``big_indonsian_pred`` and ``big_tfidf_pred``
    ids.

    Returns the distinct ids, sorted by ``np.unique`` and joined with spaces.
    """
    votes = Counter(np.concatenate([
        row['big_eca_pred'][:3],
        row['big_swin_pred'][:3],
        row['big_b3_pred'][:3],
        row['big_b4_pred'][:3],
        row['big_desnet_pred'][:3],
    ]))

    # most_common(2) returns fewer entries when fewer distinct ids exist, so
    # no exception handling is needed for the one-candidate case.
    top_voted = [posting_id for posting_id, _ in votes.most_common(2)]

    new_x = np.concatenate([row['oof_hash'][:3], row['big_tfidf_pred'][:3], top_voted])

    # The BERT text neighbours are a last resort, used only when everything
    # else found nothing beyond a single id.
    if len( np.unique(new_x) ) < 2:
        new_x = np.concatenate([row['big_indonsian_pred'][:3], row['big_tfidf_pred'][:3]])

    return ' '.join( np.unique(new_x) )

# Fallback prediction for every row; a later cell uses it only for rows whose
# main prediction contains a single id.
df['big_pred_matches'] = df.apply(vote_recall_predictions, axis=1)


In [None]:
# Concatenate image predctions with text predictions
if GET_CV:
    # Rows whose merged prediction has exactly one id take the looser
    # fallback prediction (`big_pred_matches`) instead.
    only_one_df=df[df['pred_count']==1]
    print(only_one_df.shape)
    df['big_pred_count']=df['big_pred_matches'].apply(lambda x:len(x.split()))
    stats=df[df['posting_id'].isin(only_one_df['posting_id'])]['big_pred_count'].describe()
    print(stats)
    
    df.loc[only_one_df.index,'pred_matches']=df.loc[only_one_df.index,'big_pred_matches']
    df['f1'] = f1_score(df['matches'], df['pred_matches'])
    score = df['f1'].mean()
    print(f'2222 Our final f1 cv score is {score}')
    cal_result_metrics(df)
    # From here on the ground-truth `matches` column is overwritten with the
    # predictions so the submission file can be written.
    df['matches'] = df['pred_matches']
    df[['posting_id', 'matches']].to_csv('submission.csv', index = False)
    df['pred_count']=df['pred_matches'].apply(lambda x:len(x.split()))
    only_one_df=df[df['pred_count']==1]
    print("合并大阈值结果之后的1样本个数",only_one_df.shape)
else:
    # Submission mode: start from the merged prediction ...
    df['matches'] = df['pred_matches']

    # ... and replace single-id rows with the fallback prediction.
    df['pred_count']=df['matches'].apply(lambda x:len(x.split()))
    only_one_df=df[df['pred_count']==1]
    df.loc[only_one_df.index,'matches']=df.loc[only_one_df.index,'big_pred_matches']
    
#     only_two_df=df[df['pred_count']==2]
#     df.loc[only_two_df.index,'matches']=df.loc[only_two_df.index,'big_pred_matches3']
    
    
    df[['posting_id', 'matches']].to_csv('submission.csv', index = False)

In [None]:
!rm ./tokenization.py
!rm ./__pycache__/tokenization.cpython-37.pyc


In [None]:
df[['posting_id', 'matches']].head()

In [None]:
df.shape