## Directory settings

In [None]:
# ====================================================
# Directory settings
# ====================================================
import os

# Where logs and other artefacts produced by this notebook are written.
OUTPUT_DIR = './'
if not os.path.exists(OUTPUT_DIR):
    os.makedirs(OUTPUT_DIR)

# Competition data locations, all derived from a single root.
ROOT_DIR = '../input/shopee-product-matching/'
TRAIN_PATH = f'{ROOT_DIR}train_images/'
TEST_PATH = f'{ROOT_DIR}test_images/'

## CFG

In [None]:
# ====================================================
# CFG
# ====================================================
class CFG:
    # Global configuration namespace; attributes are read directly as CFG.<name>.
    debug = False  # when True, read_dataset() samples 300 training rows
    CHECK_SUB = False  # when True, read_dataset() duplicates the CV frame (concatenated with itself)
    GET_CV = False  # when True, read_dataset() loads train.csv with folds instead of test.csv
    num_workers = 4  # DataLoader worker count
    model_name_cnn = 'tf_efficientnet_b3_ns'  # default timm backbone name for the image models
    # Local directory passed to AutoTokenizer / AutoModel.from_pretrained.
    model_name_bert = '../input/sentence-transformer-models/paraphrase-xlm-r-multilingual-v1/0_Transformer'
    size = 512  # square side length used by the Resize / RandomResizedCrop transforms
    batch_size = 8  # DataLoader batch size for both image and text inference
    seed = 42  # seed passed to seed_torch() and to the debug sample
    target_size = 8811  # default number of ArcFace classes for most model wrappers
    classes = 11014  # default number of ArcFace classes for ShopeeModel
    target_size_list = [8811, 8812, 8811, 8811, 8811]  # presumably per-fold class counts; not read in the visible code
    target_col = 'label_group'  # column handed to GroupKFold.split as y
    use_fc = False  # default for CustomBERT(use_fc=...)
    use_arcface = True  # default for CustomBERT(use_arcface=...)
    scale = 30  # default ArcFace logit scale
    margin = 0.5  # default ArcFace angular margin
    fc_dim = 512  # default width of the optional embedding layer
    n_fold = 5  # number of GroupKFold splits
    trn_fold = [0, 1, 2, 3, 4]  # presumably folds used for training; not read in the visible code
    train = False  # not read in the visible code
    inference = True  # not read in the visible code

In [None]:
import pandas as pd
# Load the test set; its row count decides whether CV scoring is switched off.
test = pd.read_csv('../input/shopee-product-matching/test.csv')
if len(test)>3: 
    # More than 3 rows (presumably the full hidden test set): disable CV.
    CFG.GET_CV = False
else: 
    # NOTE(review): this branch only prints and never sets CFG.GET_CV = True,
    # so with the default CFG.GET_CV = False no CV score is computed despite
    # the message — confirm intent.
    print('this submission notebook will compute CV score, but commit notebook will not')

## Library

In [None]:
# ====================================================
# Library
# ====================================================
import sys
sys.path.append('../input/timm-pytorch-image-models/pytorch-image-models-master')

import os
import math
import time
import random
import shutil
from pathlib import Path
from contextlib import contextmanager
from collections import defaultdict, Counter

import scipy as sp
import numpy as np
import pandas as pd

from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import f1_score
from sklearn.model_selection import StratifiedKFold, GroupKFold, KFold

from tqdm.auto import tqdm
from functools import partial

import cv2
from PIL import Image

import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.optim import Adam, SGD
import torchvision.models as models
from torch.nn.parameter import Parameter
from torch.utils.data import DataLoader, Dataset
from torch.optim.lr_scheduler import CosineAnnealingWarmRestarts, CosineAnnealingLR, ReduceLROnPlateau, _LRScheduler

import transformers

from albumentations import (
    Compose, OneOf, Normalize, Resize, RandomResizedCrop, RandomCrop, HorizontalFlip, VerticalFlip, 
    RandomBrightness, RandomContrast, RandomBrightnessContrast, Rotate, ShiftScaleRotate, Cutout, 
    IAAAdditiveGaussianNoise, Transpose
    )
from albumentations.pytorch import ToTensorV2
from albumentations import ImageOnlyTransform

import gc
import matplotlib.pyplot as plt
import cudf
import cuml
import cupy
from cuml.feature_extraction.text import TfidfVectorizer
from cuml import PCA
from cuml.neighbors import NearestNeighbors

import timm

import warnings
# Silence every Python warning for the rest of the run.
warnings.filterwarnings('ignore')

# Use the GPU when CUDA is available, otherwise fall back to the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

## Utils

In [None]:
# ====================================================
# Utils
# ====================================================
def f1_score(y_true, y_pred):
    """Return the row-wise F1 score between two Series of id strings.

    Each element of ``y_true`` / ``y_pred`` is a whitespace-separated string
    of ids. Both are turned into sets and the per-row score is
    ``2 * |true & pred| / (|true| + |pred|)``.

    Returns:
        A numpy array with one F1 value per row.
    """
    true_sets = y_true.apply(lambda s: set(s.split()))
    pred_sets = y_pred.apply(lambda s: set(s.split()))
    overlap = np.array([len(t & p) for t, p in zip(true_sets, pred_sets)])
    total_size = pred_sets.apply(lambda s: len(s)).values + true_sets.apply(lambda s: len(s)).values
    return 2 * overlap / total_size

def combine_predictions(row):
    """Merge a row's image and text predictions into one space-separated string.

    The two prediction arrays are concatenated, de-duplicated and sorted by
    ``np.unique``, then joined with single spaces.
    """
    merged = np.concatenate((row['image_predictions'], row['text_predictions']))
    unique_ids = np.unique(merged)
    return ' '.join(unique_ids)

@contextmanager
def timer(name):
    """Context manager that logs the start and the elapsed seconds of a block.

    Args:
        name: Label placed in square brackets in both log messages.

    The messages go to the module-level ``LOGGER`` when one has been created
    (e.g. via ``init_logger()``). If ``LOGGER`` does not exist, a standard
    ``logging`` logger for this module is used instead of raising NameError.
    The "done" message is only emitted when the body finishes without raising.
    """
    import logging

    # LOGGER is optional at module level; look it up lazily so the timer
    # still works when init_logger() was never called.
    logger = globals().get('LOGGER') or logging.getLogger(__name__)
    t0 = time.time()
    logger.info(f'[{name}] start')
    yield
    logger.info(f'[{name}] done in {time.time() - t0:.0f} s.')

def init_logger(log_file=OUTPUT_DIR+'inference.log'):
    """Configure and return this module's logger.

    The logger is set to INFO and gets two handlers, both printing the bare
    message: a stream handler and a file handler writing to ``log_file``.
    Every call adds a new pair of handlers to the same logger.
    """
    import logging

    logger = logging.getLogger(__name__)
    logger.setLevel(logging.INFO)
    new_handlers = (logging.StreamHandler(), logging.FileHandler(filename=log_file))
    for handler in new_handlers:
        handler.setFormatter(logging.Formatter("%(message)s"))
    for handler in new_handlers:
        logger.addHandler(handler)
    return logger

#LOGGER = init_logger()

def seed_torch(seed=42):
    """Seed Python, NumPy and PyTorch RNGs and request deterministic cuDNN.

    Args:
        seed: Integer seed applied to ``random``, ``PYTHONHASHSEED``,
            ``numpy.random``, ``torch`` (CPU) and ``torch.cuda``.
    """
    os.environ['PYTHONHASHSEED'] = str(seed)
    # Python and NumPy generators.
    random.seed(seed)
    np.random.seed(seed)
    # PyTorch generators (CPU and current CUDA device).
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    torch.backends.cudnn.deterministic = True

# Seed every RNG once, using the configured seed.
seed_torch(seed=CFG.seed)

# Tokenizer loaded from the transformer directory named in CFG; used by
# TestDataset_BERT to encode titles.
tokenizer = transformers.AutoTokenizer.from_pretrained(CFG.model_name_bert)

## Data Loading

In [None]:
def read_dataset():
    """Load the frame to run inference on, as pandas and cuDF copies.

    When ``CFG.GET_CV`` is false, ``test.csv`` is read and a ``file_path``
    column pointing into ``TEST_PATH`` is added.

    When ``CFG.GET_CV`` is true, ``train.csv`` is read (sampled to 300 rows
    in debug mode), GroupKFold fold ids are assigned on ``label_group``, a
    ``matches`` column with the space-joined posting ids of each label group
    is built, ``file_path`` points into ``TRAIN_PATH``, and with
    ``CFG.CHECK_SUB`` the frame is concatenated with itself.

    Returns:
        ``(folds, folds_cu)``: the pandas DataFrame and a ``cudf.DataFrame``
        built from it.
    """
    if not CFG.GET_CV:
        folds = pd.read_csv('../input/shopee-product-matching/test.csv')
        folds['file_path'] = folds['image'].apply(lambda x: TEST_PATH + x)
        return folds, cudf.DataFrame(folds)

    # Create folds; they are split the same way as during training.
    folds = pd.read_csv('../input/shopee-product-matching/train.csv')
    if CFG.debug:
        folds = folds.sample(n=300, random_state=CFG.seed).reset_index(drop=True)

    splitter = GroupKFold(n_splits=CFG.n_fold)
    group_ids = folds['label_group'].values
    splits = splitter.split(folds, folds[CFG.target_col], group_ids)
    for fold_no, (_, val_index) in enumerate(splits):
        folds.loc[val_index, 'fold'] = int(fold_no)
    folds['fold'] = folds['fold'].astype(int)
    display(folds.groupby('fold').size())

    # Ground-truth matches: every posting id sharing the row's label group.
    ids_by_group = folds.groupby('label_group')['posting_id'].unique().to_dict()
    folds['matches'] = folds['label_group'].map(ids_by_group)
    folds['matches'] = folds['matches'].apply(lambda ids: ' '.join(ids))
    folds['file_path'] = folds['image'].apply(lambda x: TRAIN_PATH + x)

    if CFG.CHECK_SUB:
        folds = pd.concat([folds, folds], axis=0)
        folds.reset_index(drop=True, inplace=True)

    return folds, cudf.DataFrame(folds)

## Dataset

In [None]:
class TestDataset(Dataset):
    """Image dataset that reads files listed in the ``file_path`` column.

    Args:
        df: DataFrame with a ``file_path`` column.
        transform: Optional callable invoked as ``transform(image=...)``
            whose result's ``'image'`` entry replaces the loaded image.
    """

    def __init__(self, df, transform=None):
        self.df = df
        self.transform = transform
        self.file_paths = df['file_path'].values

    def __len__(self):
        return len(self.df)

    def __getitem__(self, idx):
        """Return ``(image, torch.tensor(1))``; the second item is a dummy label."""
        bgr = cv2.imread(self.file_paths[idx])
        # OpenCV loads BGR; convert to RGB before applying the transform.
        image = cv2.cvtColor(bgr, cv2.COLOR_BGR2RGB)
        if self.transform:
            image = self.transform(image=image)['image']
        return image, torch.tensor(1)

In [None]:
class TestDataset_BERT(Dataset):
    """Text dataset that tokenizes the ``title`` column with the module tokenizer.

    Args:
        df: DataFrame with a ``title`` column, indexed so that
            ``df.loc[idx, 'title']`` works for ``idx`` in ``range(len(df))``.
        transform: Stored but not used by ``__getitem__``.
    """

    def __init__(self, df, transform=None):
        self.transform = transform
        self.df = df

    def __len__(self):
        return len(self.df)

    def __getitem__(self, idx):
        """Return ``(input_ids, attention_mask)`` for row ``idx``, padded/truncated to 64 tokens."""
        title = self.df.loc[idx, 'title']
        encoded = tokenizer(
            title,
            padding='max_length',
            truncation=True,
            max_length=64,
            return_tensors='pt',  # 'pt': return PyTorch tensors
        )
        # Drop the leading batch dimension added by return_tensors='pt'.
        return encoded['input_ids'][0], encoded['attention_mask'][0]

## Data Loader

In [None]:
# ====================================================
# Transforms
# ====================================================
def get_transforms(*, data):
    """Build the albumentations pipeline for the given split.

    Args:
        data: ``'train'`` for the augmenting pipeline, ``'valid'`` for plain
            resize + normalise. Any other value yields ``None``.

    Both pipelines end with the same mean/std ``Normalize`` and
    ``ToTensorV2``.
    """
    norm_kwargs = dict(
        mean=[0.485, 0.456, 0.406],
        std=[0.229, 0.224, 0.225],
    )

    if data == 'valid':
        steps = [
            Resize(CFG.size, CFG.size),
            Normalize(**norm_kwargs),
            ToTensorV2(),
        ]
        return Compose(steps)

    if data == 'train':
        steps = [
            RandomResizedCrop(CFG.size, CFG.size),
            Transpose(p=0.5),
            HorizontalFlip(p=0.5),
            VerticalFlip(p=0.5),
            ShiftScaleRotate(p=0.5),
            Normalize(**norm_kwargs),
            ToTensorV2(),
        ]
        return Compose(steps)

    return None

## Model

In [None]:
class ArcMarginProduct(nn.Module):
    """ArcFace-style additive angular margin head.

    Computes cosine similarities between L2-normalised inputs and
    L2-normalised class weights, applies an angular margin to the target
    class, scales the result and returns it with its cross-entropy loss.

    Args:
        in_features: Size of each input embedding.
        out_features: Number of classes.
        scale: Multiplier applied to the final logits.
        margin: Angular margin (radians) added to the target-class angle.
        easy_margin: If True, apply the margin only where ``cosine > 0``;
            otherwise use the ``th`` / ``mm`` fallback below.
        ls_eps: Label-smoothing epsilon applied to the one-hot targets
            (0 disables smoothing).
    """

    def __init__(self, in_features, out_features, scale=30.0, margin=0.50, easy_margin=False, ls_eps=0.0):
        super(ArcMarginProduct, self).__init__()
        self.in_features = in_features
        self.out_features = out_features
        self.scale = scale
        self.margin = margin
        self.ls_eps = ls_eps
        self.weight = nn.Parameter(torch.FloatTensor(out_features, in_features))
        nn.init.xavier_uniform_(self.weight)

        self.easy_margin = easy_margin
        # Precomputed constants for cos(theta + margin) and its fallback.
        self.cos_m = math.cos(margin)
        self.sin_m = math.sin(margin)
        self.th = math.cos(math.pi - margin)
        self.mm = math.sin(math.pi - margin) * margin

    def forward(self, input, label):
        """Return ``(scaled_logits, cross_entropy_loss)``.

        Args:
            input: Float tensor of embeddings, one row per sample.
            label: Integer class indices, one per sample.
        """
        cosine = F.linear(F.normalize(input), F.normalize(self.weight))
        # Clamp before sqrt: rounding can push cosine**2 slightly above 1,
        # which would otherwise produce NaN.
        sine = torch.sqrt((1.0 - torch.pow(cosine, 2)).clamp(0, 1))
        # cos(theta + m) = cos(theta)cos(m) - sin(theta)sin(m)
        phi = cosine * self.cos_m - sine * self.sin_m
        if self.easy_margin:
            phi = torch.where(cosine > 0, phi, cosine)
        else:
            phi = torch.where(cosine > self.th, phi, cosine - self.mm)

        # Build the one-hot mask on the same device as the logits so the
        # head works on CPU as well as on CUDA.
        one_hot = torch.zeros(cosine.size(), device=cosine.device)
        one_hot.scatter_(1, label.view(-1, 1).long(), 1)
        if self.ls_eps > 0:
            one_hot = (1 - self.ls_eps) * one_hot + self.ls_eps / self.out_features

        # Margin-adjusted value for the target class, plain cosine elsewhere.
        output = (one_hot * phi) + ((1.0 - one_hot) * cosine)
        output *= self.scale

        return output, nn.CrossEntropyLoss()(output,label)

class CustomEfficientNet(nn.Module):
    """timm backbone with average pooling, optional FC neck and an ArcFace head.

    The backbone's own classifier and global pool are replaced by identities
    so it returns feature maps; pooling is done by ``self.pooling``.

    Args:
        n_classes: Number of classes for the ArcFace head.
        model_name: timm model name (must expose a ``classifier`` attribute).
        fc_dim: Output width of the optional FC neck.
        margin: ArcFace angular margin.
        scale: ArcFace logit scale.
        use_fc: Whether to create the dropout / linear / batch-norm neck.
        pretrained: Passed through to ``timm.create_model``.
    """

    def __init__(
        self,
        n_classes = CFG.target_size,
        model_name = CFG.model_name_cnn,
        fc_dim = CFG.fc_dim,
        margin = CFG.margin,
        scale = CFG.scale,
        use_fc = True,
        pretrained = True):

        super().__init__()
        print('Building Model Backbone for {} model'.format(model_name))

        self.backbone = timm.create_model(model_name, pretrained=pretrained)
        embed_width = self.backbone.classifier.in_features
        # Strip the stock head so the backbone returns unpooled feature maps.
        self.backbone.classifier = nn.Identity()
        self.backbone.global_pool = nn.Identity()
        self.pooling = nn.AdaptiveAvgPool2d(1)
        self.use_fc = use_fc

        if use_fc:
            self.dropout = nn.Dropout(p=0.1)
            self.classifier = nn.Linear(embed_width, fc_dim)
            self.bn = nn.BatchNorm1d(fc_dim)
            self._init_params()
            embed_width = fc_dim

        self.final = ArcMarginProduct(
            embed_width,
            n_classes,
            scale = scale,
            margin = margin,
            easy_margin = False,
            ls_eps = 0.0
        )

    def _init_params(self):
        """Initialise the FC neck: Xavier-normal weights, zero biases, unit BN scale."""
        nn.init.xavier_normal_(self.classifier.weight)
        nn.init.constant_(self.classifier.bias, 0)
        nn.init.constant_(self.bn.weight, 1)
        nn.init.constant_(self.bn.bias, 0)

    def forward(self, image, label):
        """Return the ArcFace head output in training mode, raw features otherwise."""
        features = self.extract_features(image)
        if not self.training:
            return features
        return self.final(features, label)

    def extract_features(self, x):
        """Return pooled backbone features, passed through the FC neck only while training."""
        n_samples = x.shape[0]
        pooled = self.pooling(self.backbone(x)).view(n_samples, -1)

        # The neck is skipped in eval mode, so inference embeddings are the
        # pooled backbone features.
        if not (self.use_fc and self.training):
            return pooled

        projected = self.classifier(self.dropout(pooled))
        return self.bn(projected)

In [None]:
class ShopeeModel(nn.Module):
    """timm backbone with average pooling, FC neck and an ArcFace head.

    Only the backbones listed in ``__init__`` are supported, because the
    location of the stock classifier differs between architectures.

    Args:
        n_classes: Number of classes for the ArcFace head.
        model_name: One of ``'resnext50_32x4d'``, ``'efficientnet_b3'``,
            ``'tf_efficientnet_b5_ns'`` or ``'eca_nfnet_l0'``.
        fc_dim: Output width of the FC neck.
        margin: ArcFace angular margin.
        scale: ArcFace logit scale.
        use_fc: Whether ``extract_feat`` applies the FC neck. The neck layers
            are always created regardless of this flag.
        pretrained: Passed through to ``timm.create_model``.

    Raises:
        ValueError: If ``model_name`` is not one of the supported backbones.
    """

    def __init__(
        self,
        n_classes = CFG.classes,
        model_name = CFG.model_name_cnn,
        fc_dim = 512,
        margin = CFG.margin,
        scale = CFG.scale,
        use_fc = True,
        pretrained = False):


        super(ShopeeModel,self).__init__()
        print('Building Model Backbone for {} model'.format(model_name))

        self.backbone = timm.create_model(model_name, pretrained=pretrained)

        # Strip the stock head so the backbone returns unpooled feature maps.
        if model_name == 'resnext50_32x4d':
            final_in_features = self.backbone.fc.in_features
            self.backbone.fc = nn.Identity()
            self.backbone.global_pool = nn.Identity()

        elif model_name in ('efficientnet_b3', 'tf_efficientnet_b5_ns'):
            final_in_features = self.backbone.classifier.in_features
            self.backbone.classifier = nn.Identity()
            self.backbone.global_pool = nn.Identity()

        elif model_name == 'eca_nfnet_l0':
            final_in_features = self.backbone.head.fc.in_features
            self.backbone.head.fc = nn.Identity()
            self.backbone.head.global_pool = nn.Identity()

        else:
            # Without this branch an unknown name left final_in_features
            # unbound and failed later with an opaque UnboundLocalError.
            raise ValueError(
                'Unsupported model_name for ShopeeModel: {!r}'.format(model_name)
            )

        self.pooling =  nn.AdaptiveAvgPool2d(1)

        self.use_fc = use_fc

        self.dropout = nn.Dropout(p=0.0)
        self.fc = nn.Linear(final_in_features, fc_dim)
        self.bn = nn.BatchNorm1d(fc_dim)
        self._init_params()
        final_in_features = fc_dim

        self.final = ArcMarginProduct(
            final_in_features,
            n_classes,
            scale = scale,
            margin = margin,
            easy_margin = False,
            ls_eps = 0.0
        )

    def _init_params(self):
        """Initialise the FC neck: Xavier-normal weights, zero biases, unit BN scale."""
        nn.init.xavier_normal_(self.fc.weight)
        nn.init.constant_(self.fc.bias, 0)
        nn.init.constant_(self.bn.weight, 1)
        nn.init.constant_(self.bn.bias, 0)

    def forward(self, image, label):
        """Return the embedding for ``image``; ``label`` is accepted but unused."""
        feature = self.extract_feat(image)
        return feature

    def extract_feat(self, x):
        """Return pooled backbone features, passed through the FC neck when ``use_fc`` is set."""
        batch_size = x.shape[0]
        x = self.backbone(x)
        x = self.pooling(x).view(batch_size, -1)

        if self.use_fc:
            x = self.dropout(x)
            x = self.fc(x)
            x = self.bn(x)
        return x
class Mish_func(torch.autograd.Function):

    """Memory-efficient Mish activation: ``x * tanh(softplus(x))``.

    Only the input is saved for the backward pass; the gradient is
    recomputed analytically instead of storing intermediates.

    from: https://github.com/tyunist/memory_efficient_mish_swish/blob/master/mish.py
    """

    @staticmethod
    def forward(ctx, i):
        result = i * torch.tanh(F.softplus(i))
        ctx.save_for_backward(i)
        return result

    @staticmethod
    def backward(ctx, grad_output):
        # saved_tensors is the supported accessor; saved_variables is deprecated.
        i = ctx.saved_tensors[0]

        # h = softplus(i) = log(1 + exp(i))
        v = 1. + i.exp()
        h = v.log()
        # d tanh(h) / dh = 1 / cosh(h)^2
        grad_gh = 1./h.cosh().pow_(2)

        # d softplus(i) / di = sigmoid(i)
        grad_hx = i.sigmoid()

        grad_gx = grad_gh * grad_hx

        # Product rule on i * tanh(softplus(i)).
        grad_f = torch.tanh(F.softplus(i)) + i * grad_gx

        return grad_output * grad_f


class Mish(nn.Module):
    """Module wrapper around ``Mish_func``; keyword arguments are accepted and ignored."""

    def __init__(self, **kwargs):
        super().__init__()

    def forward(self, input_tensor):
        activated = Mish_func.apply(input_tensor)
        return activated


def replace_activations(model, existing_layer, new_layer):
    """Recursively swap every ``existing_layer`` submodule for ``new_layer``.

    Args:
        model: Module whose children are rewritten in place.
        existing_layer: Exact class to replace (subclasses are not matched).
        new_layer: Module instance installed at every matching position; the
            same instance is shared by all replaced slots.

    Returns:
        ``model`` itself, after modification.
    """
    for child_name, child in reversed(model._modules.items()):
        # Descend first into containers that have children of their own.
        if next(child.children(), None) is not None:
            model._modules[child_name] = replace_activations(child, existing_layer, new_layer)

        if type(child) == existing_layer:
            model._modules[child_name] = new_layer
    return model

In [None]:
class CustomSEResNeXt(nn.Module):
    """timm backbone (``fc``-style head) with pooling, optional FC neck and ArcFace head.

    Args:
        n_classes: Number of classes for the ArcFace head.
        model_name: timm model name (must expose an ``fc`` attribute).
        fc_dim: Output width of the optional FC neck.
        margin: ArcFace angular margin.
        scale: ArcFace logit scale.
        use_fc: Whether to create and apply the dropout / linear / batch-norm neck.
        pretrained: Passed through to ``timm.create_model``.
    """

    def __init__(
        self,
        n_classes = CFG.target_size,
        model_name = CFG.model_name_cnn,
        fc_dim = CFG.fc_dim,
        margin = CFG.margin,
        scale = CFG.scale,
        use_fc = True,
        pretrained = True):

        super().__init__()
        print(f'Building Model Backbone for {model_name} model')
        self.backbone = timm.create_model(model_name, pretrained=pretrained)
        embed_width = self.backbone.fc.in_features
        # Strip the stock head so the backbone returns unpooled feature maps.
        self.backbone.fc = nn.Identity()
        self.backbone.global_pool = nn.Identity()
        self.pooling = nn.AdaptiveAvgPool2d(1)
        self.use_fc = use_fc

        if use_fc:
            self.dropout = nn.Dropout(p=0.1)
            self.fc = nn.Linear(embed_width, fc_dim)
            self.bn = nn.BatchNorm1d(fc_dim)
            self._init_params()
            embed_width = fc_dim

        self.final = ArcMarginProduct(
            embed_width,
            n_classes,
            scale = scale,
            margin = margin,
            easy_margin = False,
            ls_eps = 0.0
        )

    def _init_params(self):
        """Initialise the FC neck: Xavier-normal weights, zero biases, unit BN scale."""
        nn.init.xavier_normal_(self.fc.weight)
        nn.init.constant_(self.fc.bias, 0)
        nn.init.constant_(self.bn.weight, 1)
        nn.init.constant_(self.bn.bias, 0)

    def forward(self, image, label):
        """Return the ArcFace head output in training mode, embeddings otherwise."""
        features = self.extract_features(image)
        if not self.training:
            return features
        return self.final(features, label)

    def extract_features(self, x):
        """Return pooled backbone features, passed through the FC neck when ``use_fc`` is set."""
        n_samples = x.shape[0]
        pooled = self.pooling(self.backbone(x)).view(n_samples, -1)

        if not self.use_fc:
            return pooled

        projected = self.fc(self.dropout(pooled))
        return self.bn(projected)

In [None]:
class CustomSEResNet152D(nn.Module):
    """timm backbone (``fc``-style head) with pooling, optional FC neck and ArcFace head.

    Args:
        n_classes: Number of classes for the ArcFace head.
        model_name: timm model name (must expose an ``fc`` attribute).
        fc_dim: Output width of the optional FC neck.
        margin: ArcFace angular margin.
        scale: ArcFace logit scale.
        use_fc: Whether to create and apply the dropout / linear / batch-norm neck.
        pretrained: Passed through to ``timm.create_model``.
    """

    def __init__(
        self,
        n_classes = CFG.target_size,
        model_name = CFG.model_name_cnn,
        fc_dim = CFG.fc_dim,
        margin = CFG.margin,
        scale = CFG.scale,
        use_fc = True,
        pretrained = True):

        super().__init__()
        print(f'Building Model Backbone for {model_name} model')
        self.backbone = timm.create_model(model_name, pretrained=pretrained)
        neck_in = self.backbone.fc.in_features
        # Replace the stock head with identities; pooling is done below.
        self.backbone.fc = nn.Identity()
        self.backbone.global_pool = nn.Identity()
        self.pooling = nn.AdaptiveAvgPool2d(1)
        self.use_fc = use_fc

        if use_fc:
            self.dropout = nn.Dropout(p=0.1)
            self.fc = nn.Linear(neck_in, fc_dim)
            self.bn = nn.BatchNorm1d(fc_dim)
            self._init_params()
            neck_in = fc_dim

        self.final = ArcMarginProduct(
            neck_in,
            n_classes,
            scale = scale,
            margin = margin,
            easy_margin = False,
            ls_eps = 0.0
        )

    def _init_params(self):
        """Initialise the FC neck: Xavier-normal weights, zero biases, unit BN scale."""
        nn.init.xavier_normal_(self.fc.weight)
        nn.init.constant_(self.fc.bias, 0)
        nn.init.constant_(self.bn.weight, 1)
        nn.init.constant_(self.bn.bias, 0)

    def forward(self, image, label):
        """Return the ArcFace head output in training mode, embeddings otherwise."""
        embedding = self.extract_features(image)
        if self.training:
            return self.final(embedding, label)
        return embedding

    def extract_features(self, x):
        """Return pooled backbone features, passed through the FC neck when ``use_fc`` is set."""
        n_samples = x.shape[0]
        feats = self.backbone(x)
        feats = self.pooling(feats).view(n_samples, -1)

        if self.use_fc:
            feats = self.bn(self.fc(self.dropout(feats)))
        return feats

In [None]:
class CustomModel(nn.Module):
    """Bundle of five image models whose embeddings are returned side by side.

    ``model0`` is an ``eca_nfnet_l0`` ShopeeModel with SiLU activations
    swapped for Mish; ``model1`` / ``model2`` are CustomEfficientNet
    instances; ``model3`` is a CustomSEResNeXt and ``model4`` a
    CustomSEResNet152D. Each loads its weights from the ``'model'`` entry of
    a checkpoint file.
    """

    def __init__(self):
        super().__init__()
        self.model0 = ShopeeModel(model_name = 'eca_nfnet_l0').to(device)
        self.model0 = replace_activations(self.model0, torch.nn.SiLU, Mish())
        self._load_weights(self.model0, '../input/shopee-nfnetl0-ver0506/eca_nfnet_l0_exp027_fold0_epoch15.pth')
        self.model1 = CustomEfficientNet(n_classes=8812, pretrained=False)
        self._load_weights(self.model1, '../input/shopee-002-data-ver37/tf_efficientnet_b3_ns_fold1_best.pth')
        self.model2 = CustomEfficientNet(n_classes=8811, pretrained=False)
        self._load_weights(self.model2, '../input/shopee-002-data-ver37/tf_efficientnet_b3_ns_fold2_best.pth')
        self.model3 = CustomSEResNeXt(model_name = 'seresnext50_32x4d', n_classes=CFG.target_size, pretrained=False)
        self._load_weights(self.model3, '../input/shopee-007-seresnext-ver3/seresnext50_32x4d_fold3_best.pth')
        self.model4 = CustomSEResNet152D(model_name = 'seresnet152d', n_classes=CFG.target_size, pretrained=False)
        self._load_weights(self.model4, '../input/shopee-009-seresnet152d-ver4/seresnet152d_fold4_best.pth')

    @staticmethod
    def _load_weights(model, checkpoint_path):
        """Load the ``'model'`` state dict stored at ``checkpoint_path`` into ``model``."""
        # map_location='cpu' lets checkpoints saved from a GPU load on
        # CPU-only hosts; load_state_dict copies the values onto whatever
        # device the model's parameters already live on.
        checkpoint = torch.load(checkpoint_path, map_location='cpu')
        model.load_state_dict(checkpoint['model'])

    def forward(self, image, label):
        """Run all five sub-models on the same batch and return their outputs as a 5-tuple."""
        x0 = self.model0(image, label)
        x1 = self.model1(image, label)
        x2 = self.model2(image, label)
        x3 = self.model3(image, label)
        x4 = self.model4(image, label)
        return x0,x1,x2,x3,x4

In [None]:
class CustomBERT(nn.Module):
    """Transformer text encoder with optional FC neck and a classification head.

    ``forward`` returns only the embedding (first-token hidden state,
    optionally passed through the neck); the ``final`` head is built but not
    applied here.

    Args:
        n_classes: Number of classes for the head.
        model_name: Path or name given to ``AutoModel.from_pretrained``.
        fc_dim: Output width of the optional FC neck.
        margin: ArcFace angular margin.
        scale: ArcFace logit scale.
        use_fc: Whether to create and apply the dropout / linear / batch-norm neck.
        use_arcface: Use an ArcMarginProduct head if True, else ``nn.Linear``.
        pretrained: Accepted but not read by this class.
    """

    def __init__(
        self,
        n_classes = CFG.target_size,
        model_name = CFG.model_name_bert,
        fc_dim = CFG.fc_dim,
        margin = CFG.margin,
        scale = CFG.scale,
        use_fc = CFG.use_fc,
        use_arcface = CFG.use_arcface,
        pretrained = True):

        super().__init__()
        print(f'Building Model Backbone for {model_name} model')
        self.bert = transformers.AutoModel.from_pretrained(model_name)
        head_in = self.bert.config.hidden_size
        self.use_fc = use_fc
        self.use_arcface = use_arcface

        if self.use_fc:
            self.dropout = nn.Dropout(p=0.1)
            self.classifier = nn.Linear(head_in, fc_dim)
            self.bn = nn.BatchNorm1d(fc_dim)
            self._init_params()
            head_in = fc_dim

        if not self.use_arcface:
            self.final = nn.Linear(head_in, n_classes)
        else:
            self.final = ArcMarginProduct(
                head_in,
                n_classes,
                scale = scale,
                margin = margin,
                easy_margin = False,
                ls_eps = 0.0
            )

    def _init_params(self):
        """Initialise the FC neck: Xavier-normal weights, zero biases, unit BN scale."""
        nn.init.xavier_normal_(self.classifier.weight)
        nn.init.constant_(self.classifier.bias, 0)
        nn.init.constant_(self.bn.weight, 1)
        nn.init.constant_(self.bn.bias, 0)

    def forward(self, input_ids, attention_mask):
        """Return the text embedding for the given token ids and mask."""
        return self.extract_features(input_ids, attention_mask)

    def extract_features(self, input_ids, attention_mask):
        """Encode the batch and return the first-token hidden state, optionally through the neck."""
        encoder_out = self.bert(input_ids=input_ids, attention_mask=attention_mask)
        # First output of the encoder, first token position of every sequence.
        features = encoder_out[0][:, 0, :]

        if not self.use_fc:
            return features

        projected = self.classifier(self.dropout(features))
        return self.bn(projected)

## inference functions

In [None]:
def get_image_embeddings(folds, fold):
    """Compute image embeddings from all five CustomModel sub-models.

    Args:
        folds: DataFrame with a ``file_path`` column.
        fold: Unused.

    Returns:
        A list of five numpy arrays, one per sub-model, each the
        concatenation of that model's batch outputs in loader order.
    """
    model = CustomModel()
    model.eval()
    model = model.to(device)

    loader = DataLoader(
        TestDataset(folds, transform=get_transforms(data='valid')),
        batch_size=CFG.batch_size,
        num_workers=CFG.num_workers,
        pin_memory=True,
        drop_last=False,
    )

    # One list of per-batch arrays for each of the five sub-models.
    per_model_batches = [[], [], [], [], []]
    with torch.no_grad():
        for img, label in tqdm(loader, total=len(loader)):
            img = img.to(device)
            label = label.to(device)
            f0, f1, f2, f3, f4 = model(img, label)
            for bucket, feat in zip(per_model_batches, (f0, f1, f2, f3, f4)):
                bucket.append(feat.detach().cpu().numpy())

    del model
    image_embeddings = [np.concatenate(bucket) for bucket in per_model_batches]
    del per_model_batches
    gc.collect()
    return image_embeddings

In [None]:
def get_text_embeddings(folds, fold):
    """Compute CustomBERT embeddings for every title in ``folds``.

    Args:
        folds: DataFrame with a ``title`` column.
        fold: Unused; the fold-0 checkpoint is always loaded.

    Returns:
        A numpy array with one embedding row per input row, in loader order.
    """
    checkpoint_path = '../input/shopee-bert/paraphrase-xlm-r-multilingual-v1_fold0_best.pth'
    model = CustomBERT(pretrained=False).to(device)
    model.load_state_dict(torch.load(checkpoint_path)['model'])
    model.eval()

    loader = DataLoader(
        TestDataset_BERT(folds),
        batch_size=CFG.batch_size,
        num_workers=CFG.num_workers,
        pin_memory=True,
        drop_last=False,
    )

    batches = []
    with torch.no_grad():
        for input_ids, attention_mask in tqdm(loader, total=len(loader)):
            features = model(input_ids.to(device), attention_mask.to(device))
            batches.append(features.detach().cpu().numpy())

    del model
    text_embeddings = np.concatenate(batches)
    print(f'Our text embeddings shape is {text_embeddings.shape}')
    del batches
    gc.collect()
    return text_embeddings



In [None]:
def get_text_predictions(df, df_cu, max_features=25_000, thresh=0.75):
    """Find, for every title, the postings whose TF-IDF similarity exceeds a threshold.

    Titles are vectorised with a binary TF-IDF using English stop words.
    Similarities are the dot products between TF-IDF rows (treated as cosine
    similarity by the original author — assumes the vectoriser normalises
    rows; TODO confirm). When exactly one posting passes the threshold, the
    threshold is lowered by 0.04 and, if still one, by 0.08.

    Args:
        df: pandas DataFrame with a ``posting_id`` column.
        df_cu: cuDF DataFrame with the matching ``title`` column.
        max_features: Vocabulary size limit for the vectoriser.
        thresh: Base similarity threshold.

    Returns:
        A list with one array of ``posting_id`` values per row of ``df``.
    """
    vectorizer = TfidfVectorizer(stop_words='english',
                                 binary=True,
                                 max_features=max_features)
    text_embeddings = vectorizer.fit_transform(df_cu['title']).toarray()

    print('Finding similar titles...')
    chunk_size = 1024 * 4
    n_rows = len(df)

    preds = []
    for start in range(0, n_rows, chunk_size):
        stop = min(start + chunk_size, n_rows)
        print('chunk', start, 'to', stop)

        # Similarity of every title against the titles in this chunk.
        sims = cupy.matmul(text_embeddings, text_embeddings[start:stop].T).T

        for row in range(stop - start):
            # Relax the threshold step by step while only one posting
            # passes; the threshold could probably be tuned.
            for relax in (0.0, 0.04, 0.08):
                hits = cupy.where(sims[row] > (thresh - relax))[0]
                if len(hits) != 1:
                    break
            preds.append(df.iloc[cupy.asnumpy(hits)].posting_id.values)

    del vectorizer, text_embeddings
    gc.collect()
    return preds



def get_text_predictions_nostopwords(df, df_cu, max_features=25_000, thresh=0.75):
    """Find similar titles with a binary TF-IDF that removes no stop words.

    Similarities are the dot products between TF-IDF rows (treated as cosine
    similarity by the original author — assumes the vectoriser normalises
    rows; TODO confirm). When exactly one posting passes the threshold, the
    threshold is lowered by 0.04 and, if still one, by 0.08.

    Args:
        df: pandas DataFrame with a ``posting_id`` column.
        df_cu: cuDF DataFrame with the matching ``title`` column.
        max_features: Vocabulary size limit for the vectoriser.
        thresh: Base similarity threshold.

    Returns:
        A list with one array of ``posting_id`` values per row of ``df``.
    """
    vectorizer = TfidfVectorizer(stop_words=None,
                                 binary=True,
                                 max_features=max_features)
    text_embeddings = vectorizer.fit_transform(df_cu['title']).toarray()

    print('Finding similar titles...')
    chunk_size = 1024 * 4
    n_rows = len(df)

    preds = []
    for start in range(0, n_rows, chunk_size):
        stop = min(start + chunk_size, n_rows)
        print('chunk', start, 'to', stop)

        # Similarity of every title against the titles in this chunk.
        sims = cupy.matmul(text_embeddings, text_embeddings[start:stop].T).T

        for row in range(stop - start):
            # Relax the threshold step by step while only one posting
            # passes; the threshold could probably be tuned.
            for relax in (0.0, 0.04, 0.08):
                hits = cupy.where(sims[row] > (thresh - relax))[0]
                if len(hits) != 1:
                    break
            preds.append(df.iloc[cupy.asnumpy(hits)].posting_id.values)

    del vectorizer, text_embeddings
    gc.collect()
    return preds


from stop_words import get_stop_words

def get_text_predictions_stopid(df, df_cu, max_features=25_000, thresh=0.75):
    """Find similar titles with a binary TF-IDF that removes Indonesian stop words.

    The stop-word list comes from ``get_stop_words('indonesian')``.
    Similarities are the dot products between TF-IDF rows (treated as cosine
    similarity by the original author — assumes the vectoriser normalises
    rows; TODO confirm). When exactly one posting passes the threshold, the
    threshold is lowered by 0.04 and, if still one, by 0.08.

    Args:
        df: pandas DataFrame with a ``posting_id`` column.
        df_cu: cuDF DataFrame with the matching ``title`` column.
        max_features: Vocabulary size limit for the vectoriser.
        thresh: Base similarity threshold.

    Returns:
        A list with one array of ``posting_id`` values per row of ``df``.
    """
    indonesian_stop_words = get_stop_words('indonesian')
    vectorizer = TfidfVectorizer(stop_words=indonesian_stop_words,
                                 binary=True,
                                 max_features=max_features)
    text_embeddings = vectorizer.fit_transform(df_cu['title']).toarray()

    print('Finding similar titles...')
    chunk_size = 1024 * 4
    n_rows = len(df)

    preds = []
    for start in range(0, n_rows, chunk_size):
        stop = min(start + chunk_size, n_rows)
        print('chunk', start, 'to', stop)

        # Similarity of every title against the titles in this chunk.
        sims = cupy.matmul(text_embeddings, text_embeddings[start:stop].T).T

        for row in range(stop - start):
            # Relax the threshold step by step while only one posting
            # passes; the threshold could probably be tuned.
            for relax in (0.0, 0.04, 0.08):
                hits = cupy.where(sims[row] > (thresh - relax))[0]
                if len(hits) != 1:
                    break
            preds.append(df.iloc[cupy.asnumpy(hits)].posting_id.values)

    del vectorizer, text_embeddings
    gc.collect()
    return preds

# 数字
# def get_text_predictions_stopnumber(df, df_cu, max_features=25_000, thresh=0.75):
    
#     model = TfidfVectorizer(stop_words='english',
#                             binary=True,
#                             max_features=max_features)
#     _ = model.fit_transform(df_cu['title']).toarray()
    
#     vocabulary = model.vocabulary_.to_pandas()
#     vocabulary = [s for s in vocabulary if not s.isdigit()]
    
#     model_ = TfidfVectorizer(vocabulary=vocabulary)
#     text_embeddings = model_.fit_transform(df_cu['title']).toarray()
    
    
#     print('Finding similar titles...')
#     CHUNK = 1024 * 4
#     CTS = len(df) // CHUNK
#     if (len(df)%CHUNK) != 0:
#         CTS += 1
        
#     preds = []
#     for j in range( CTS ):
#         a = j * CHUNK
#         b = (j+1) * CHUNK
#         b = min(b, len(df))
#         print('chunk', a, 'to', b)
        
#         # COSINE SIMILARITY DISTANCE
#         cts = cupy.matmul(text_embeddings, text_embeddings[a:b].T).T
        
#         for k in range(b-a):
#             IDX = cupy.where(cts[k,]>thresh)[0]  # 変える余地がありそう
#             if len(IDX) == 1:
#                 #print('置き換える１')
#                 IDX = cupy.where(cts[k,] > (thresh-0.04))[0]
#                 if len(IDX) == 1:
#                     #print('置き換える２')
#                     IDX = cupy.where(cts[k,] > (thresh-0.08))[0]
#             o = df.iloc[cupy.asnumpy(IDX)].posting_id.values
#             preds.append(o)
            
#     del model, text_embeddings
#     gc.collect()
#     return preds

In [None]:
# English (stop words)
def get_text_predictions_additional(df, df_cu, max_features=25_000, thresh=0.75):
    
    model = TfidfVectorizer(stop_words='english',
                            binary=True,
                            max_features=max_features)
    text_embeddings = model.fit_transform(df_cu['title']).toarray()
    
    print('Finding similar titles...')
    CHUNK = 1024 * 4
    CTS = len(df) // CHUNK
    if (len(df)%CHUNK) != 0:
        CTS += 1
        
    preds = []
    for j in range( CTS ):
        a = j * CHUNK
        b = (j+1) * CHUNK
        b = min(b, len(df))
        print('chunk', a, 'to', b)
        
        # COSINE SIMILARITY DISTANCE
        cts = cupy.matmul(text_embeddings, text_embeddings[a:b].T).T
        
        for k in range(b-a):
            IDX = cupy.where(cts[k,]>thresh)[0]  # 変える余地がありそう
            if len(IDX) == 1:
                #print('置き換える１')
                IDX = cupy.where(cts[k,] > (thresh-0.05))[0]
                if len(IDX) == 1:
                    #print('置き換える２')
                    IDX = cupy.where(cts[k,] > (thresh-0.1))[0]
                    if len(IDX) == 1:
                        #print('置き換える3')
                        IDX = cupy.where(cts[k,] > (thresh-0.15))[0]
                        if len(IDX) == 1:
                            #print('置き換える5')
                            IDX = cupy.where(cts[k,] > (thresh-0.2))[0]
            o = df.iloc[cupy.asnumpy(IDX)].posting_id.values
            preds.append(o)
            
    del model, text_embeddings
    gc.collect()
    return preds

# Additional algorithm by beluga
# Indonesian
def get_text_predictions_additional_nomatch_id(df, df_cu, max_features=25_000, thresh=0.75):
    """Find postings with similar titles via TF-IDF with Indonesian stop words.

    Same procedure as the English variant, except that the stop-word list
    comes from ``get_stop_words('indonesian')``. Rows scoring above ``thresh``
    against a given row are its matches; while exactly one row passes, the
    cut-off is lowered by 0.05, 0.1, 0.15 and finally 0.2.

    Args:
        df: frame providing ``posting_id``; its row order is assumed to match
            ``df_cu`` -- TODO confirm against the caller.
        df_cu: frame providing the ``title`` column fed to the vectoriser.
        max_features: vocabulary size limit passed to ``TfidfVectorizer``.
        thresh: initial similarity cut-off.

    Returns:
        A list holding one array of matching ``posting_id`` values per row.
    """
    indonesian_stop_words = get_stop_words('indonesian')
    vectorizer = TfidfVectorizer(stop_words=indonesian_stop_words,
                                 binary=True,
                                 max_features=max_features)
    title_vectors = vectorizer.fit_transform(df_cu['title']).toarray()

    print('Finding similar titles...')
    chunk_size = 1024 * 4
    total = len(df)
    # Successive reductions of the cut-off tried while only one row matches.
    fallback_drops = (0.05, 0.1, 0.15, 0.2)

    preds = []
    for start in range(0, total, chunk_size):
        stop = min(start + chunk_size, total)
        print('chunk', start, 'to', stop)

        # Similarity of every row in the chunk against all rows.
        chunk_sims = cupy.matmul(title_vectors, title_vectors[start:stop].T).T

        for offset in range(stop - start):
            scores = chunk_sims[offset,]
            hits = cupy.where(scores > thresh)[0]  # the threshold looks tunable
            for drop in fallback_drops:
                if len(hits) != 1:
                    break
                hits = cupy.where(scores > (thresh - drop))[0]
            matched_ids = df.iloc[cupy.asnumpy(hits)].posting_id.values
            preds.append(matched_ids)

    del vectorizer, title_vectors
    gc.collect()
    return preds

In [None]:
def _ensemble_neighbor_ids(k, distances_list, indices_list, cutoff, fallback_steps=()):
    """Select the neighbours of query row ``k`` shared by all models.

    One small frame per model (neighbour index plus that model's distance) is
    joined on the neighbour index with ``repeate_merge``; the per-model
    distances of the surviving candidates are then averaged.

    Args:
        k: position of the query row.
        distances_list: per-model distance matrices from ``kneighbors``.
        indices_list: per-model neighbour-index matrices from ``kneighbors``.
        cutoff: keep candidates whose averaged distance is strictly below it.
        fallback_steps: increments added to ``cutoff``, tried in order while
            exactly one candidate is selected.

    Returns:
        ``(ids, merged)``: the selected neighbour positions and the merged
        per-candidate frame (including the averaged ``dis`` column).
    """
    n_models = len(distances_list)
    frames = [
        pd.DataFrame({'ind': indices_list[i][k], f'dis{i}': distances_list[i][k]})
        for i in range(n_models)
    ]
    merged = repeate_merge(frames)

    # Average the per-model distances (same accumulation order as before).
    merged['dis'] = 0
    for i in range(n_models):
        merged['dis'] += merged[f'dis{i}'] / n_models
    dis = merged['dis'].values

    idx = np.where(dis < cutoff)[0]
    for step in fallback_steps:
        if len(idx) != 1:
            break
        idx = np.where(dis < (cutoff + step))[0]
    idx = idx.astype(int)

    ids = merged['ind'].values[idx]
    return ids, merged


def get_neighbors(df, embeddings, KNN = 100, image = True, thres = 0.28):
    """Predict matching postings from an ensemble of embedding matrices.

    A cosine ``NearestNeighbors`` model is fitted on every matrix in
    ``embeddings``. For each row, the candidates returned by the models are
    merged on the neighbour index, their distances averaged, and those below
    a cut-off are kept.

    With ``CFG.GET_CV`` set, a grid of thresholds is scored first with
    ``f1_score`` against ``df['matches']`` (this writes ``pred_matches`` and
    ``f1`` columns into ``df``) and the best one is printed. The returned
    predictions still use ``thres``; the sweep is informational only.
    Without ``CFG.GET_CV`` and with ``image=True``, a row with exactly one
    hit is retried with ``thres`` raised by 0.04, 0.08 and 0.12.

    Args:
        df: frame with ``posting_id`` (and ``matches`` in CV mode).
        embeddings: sequence of embedding matrices, one per model, all
            describing the same rows in the same order.
        KNN: number of neighbours requested from each model.
        image: ``True`` uses ``thres`` (and the swept thresholds in CV mode);
            ``False`` uses a fixed cut-off of 0.60.
        thres: distance cut-off for the returned predictions when ``image``.

    Returns:
        ``(df, predictions)`` where ``predictions`` holds one array of
        ``posting_id`` values per row.
    """
    distances_list = []
    indices_list = []
    for emb in embeddings:
        model = NearestNeighbors(n_neighbors = KNN, metric='cosine')
        model.fit(emb)
        distances_tmp, indices_tmp = model.kneighbors(emb)
        distances_list.append(distances_tmp)
        indices_list.append(indices_tmp)
        del distances_tmp, indices_tmp

    n_rows = embeddings[0].shape[0]
    # The non-image path previously indexed the per-model *list* with
    # ``distances[k,]`` and always raised TypeError; the same 0.60 cut-off is
    # now applied to the averaged distance.
    text_cutoff = 0.60
    final_cutoff = thres if image else text_cutoff

    # Iterate through different thresholds to maximize cv, run this in interactive mode,
    # then replace the final cut-off with a solid threshold.
    if CFG.GET_CV:
        if image:
            thresholds = list(np.arange(0.3, 0.6, 0.02))
        else:
            thresholds = list(np.arange(0.1, 1, 0.05))  # changed
        scores = []
        for threshold in thresholds:
            # NOTE(review): with image=False the cut-off stays at 0.60, so
            # every threshold in the sweep yields the same score.
            sweep_cutoff = threshold if image else text_cutoff
            predictions = []
            for k in tqdm(range(n_rows)):
                ids, _ = _ensemble_neighbor_ids(k, distances_list, indices_list, sweep_cutoff)
                posting_ids = ' '.join(df['posting_id'].iloc[ids].values)
                predictions.append(posting_ids)
            df['pred_matches'] = predictions
            df['f1'] = f1_score(df['matches'], df['pred_matches'])
            score = df['f1'].mean()
            print(f'Our f1 score for threshold {threshold} is {score}')
            scores.append(score)
        thresholds_scores = pd.DataFrame({'thresholds': thresholds, 'scores': scores})
        max_score = thresholds_scores[thresholds_scores['scores'] == thresholds_scores['scores'].max()]
        best_threshold  = max_score['thresholds'].values[0]
        best_score = max_score['scores'].values[0]
        print(f'Our best score is {best_score} and has a threshold {best_threshold}')

        # Final predictions with the configured cut-off (no fallback steps).
        predictions = []
        for k in range(n_rows):
            ids, _ = _ensemble_neighbor_ids(k, distances_list, indices_list, final_cutoff)
            predictions.append(df['posting_id'].iloc[ids].values)

    # Because we are predicting the test set that have 70K images and different label groups, confidence should be smaller
    else:
        fallback_steps = (0.04, 0.08, 0.12) if image else ()
        predictions = []
        for k in range(n_rows):
            ids, merged = _ensemble_neighbor_ids(
                k, distances_list, indices_list, final_cutoff, fallback_steps
            )
            # Debug aid: show the merged candidates when the notebook-level
            # `test` frame has exactly three rows.
            if len(test) == 3:
                display(merged)
            predictions.append(df['posting_id'].iloc[ids].values)

    del distances_list, indices_list
    gc.collect()
    return df, predictions

In [None]:
def get_neighbors_for_txt(df, embeddings, KNN = 50, image = True):
    """Predict matching postings from a single embedding matrix.

    A cosine ``NearestNeighbors`` model is fitted on ``embeddings`` and, for
    every row, the neighbours closer than 0.3 are returned.

    With ``CFG.GET_CV`` set, a small grid of thresholds is scored first with
    ``f1_score`` against ``df['matches']`` (writing ``pred_matches`` and
    ``f1`` into ``df``) and the best one is printed; the returned predictions
    nevertheless use the fixed 0.3 cut-off.

    Args:
        df: frame with ``posting_id`` (and ``matches`` in CV mode).
        embeddings: one embedding matrix, one row per row of ``df``.
        KNN: number of neighbours requested.
        image: only selects which threshold grid is swept in CV mode.

    Returns:
        ``(df, predictions)`` where ``predictions`` holds one array of
        ``posting_id`` values per row.
    """
    knn = NearestNeighbors(n_neighbors = KNN, metric='cosine')
    knn.fit(embeddings)
    distances, indices = knn.kneighbors(embeddings)
    n_rows = embeddings.shape[0]

    # Iterate through different thresholds to maximize cv, run this in interactive mode,
    # then replace the fixed cut-off below with a solid threshold.
    if CFG.GET_CV:
        thresholds = list(np.arange(0.4, 0.5, 0.1)) if image else list(np.arange(0.4, 0.6, 0.1))  # changed
        scores = []
        for threshold in thresholds:
            joined_predictions = []
            for k in range(n_rows):
                ids = indices[k, np.where(distances[k,] < threshold)[0]]
                # Additional heuristic: when only one neighbour passes, use
                # the neighbours at positions 1 and 2 instead.
                if len(ids) == 1:
                    ids = indices[k, [1,2]]
                joined_predictions.append(' '.join(df['posting_id'].iloc[ids].values))
            df['pred_matches'] = joined_predictions
            df['f1'] = f1_score(df['matches'], df['pred_matches'])
            score = df['f1'].mean()
            print(f'Our f1 score for threshold {threshold} is {score}')
            scores.append(score)
        thresholds_scores = pd.DataFrame({'thresholds': thresholds, 'scores': scores})
        is_best = thresholds_scores['scores'] == thresholds_scores['scores'].max()
        max_score = thresholds_scores[is_best]
        best_threshold  = max_score['thresholds'].values[0]
        best_score = max_score['scores'].values[0]
        print(f'Our best score is {best_score} and has a threshold {best_threshold}')
        row_positions = range(n_rows)
    else:
        row_positions = tqdm(range(n_rows))

    # Because we are predicting the test set that have 70K images and different label groups,
    # confidence should be smaller. The cut-off is 0.3 whatever `image` is.
    predictions = []
    for k in row_positions:
        ids = indices[k, np.where(distances[k,] < 0.3)[0]]
        predictions.append(df['posting_id'].iloc[ids].values)

    del knn, distances, indices
    gc.collect()
    return df, predictions

## Calculating Predictions

In [None]:
# Load the data: `folds` is the frame used for indexing and results, and
# `folds_cu` is the companion frame later passed as `df_cu` to the TF-IDF
# helpers (presumably a cuDF copy of the same rows -- confirm in read_dataset).
folds, folds_cu = read_dataset()
folds.head()

In [None]:
def repeate_merge(dfs):
    """Merge a list of frames on their shared ``ind`` column.

    The frames are joined left to right with ``pd.merge(..., on='ind')``
    (an inner join), so only ``ind`` values present in every frame survive.

    Args:
        dfs: non-empty list of DataFrames, each with an ``ind`` column.

    Returns:
        A new DataFrame. A single-element list yields a copy of that frame;
        previously that case failed with ``IndexError`` on ``dfs[1]``.

    Raises:
        IndexError: if ``dfs`` is empty.
    """
    merged = dfs[0]
    if len(dfs) == 1:
        # Copy so callers can add columns without touching their input.
        return merged.copy()
    for frame in dfs[1:]:
        merged = pd.merge(merged, frame, on='ind')
    return merged

In [None]:
def match_algo_interpolation_train(submit_df):
    """Extend predictions by linking postings that share the same other matches.

    For every row with more than one predicted id, the ids other than the
    row's own ``posting_id`` form a key. Rows with an identical key are
    grouped, and every member of a group has the group's posting ids added
    to its prediction. F1 before and after is printed using ``f1_score``
    against ``matches``.

    Side effect: ``match_num`` and ``pred_match`` columns are added to the
    frame passed in.

    Args:
        submit_df: frame with ``posting_id``, ``pred_matches`` (space-joined
            ids) and ``matches`` (ground truth).

    Returns:
        A new frame with ``posting_id`` and the extended ``pred_matches``.
    """
    def _extra_count(row):
        # Number of predicted ids beyond the first.
        return len(row["pred_matches"].split()) - 1

    def _other_ids_key(row):
        # Space-joined predicted ids without the row's own id; None when the
        # row has no additional prediction.
        if row["match_num"] > 0:
            predicted = np.array(row["pred_matches"].split())
            others = np.setdiff1d(predicted, np.array(row["posting_id"]))
            return " ".join(np.array(others).tolist())
        return None

    def _extended_prediction(row):
        # Union of the row's own prediction and the ids of its group.
        if row["match_num"] > 0:
            tokens = (row['pred_matches'] + " " + row['posting_id_2']).split()
            return ' '.join(np.unique(np.array(tokens)))
        return row['pred_matches']

    submit_df['match_num'] = submit_df.apply(_extra_count, axis=1)
    submit_df['pred_match'] = submit_df.apply(_other_ids_key, axis=1)

    # key -> posting ids of all rows that produced this key
    groups = {}
    for key, posting_id, extra in zip(submit_df['pred_match'],
                                      submit_df['posting_id'],
                                      submit_df['match_num']):
        if extra > 0:
            groups.setdefault(key, []).append(posting_id)

    match_df = pd.DataFrame({
        'pred_match': list(groups),
        'posting_id_2': [" ".join(members) for members in groups.values()],
    })

    submit_df = pd.merge(submit_df, match_df, on='pred_match', how='left')
    submit_df["pred_matches_2"] = submit_df.apply(_extended_prediction, axis=1)

    submit_df['f1_1'] = f1_score(submit_df['matches'], submit_df['pred_matches'])
    submit_df['f1_2'] = f1_score(submit_df['matches'], submit_df['pred_matches_2'])

    score_1 = submit_df['f1_1'].mean()
    score_2 = submit_df['f1_2'].mean()

    print(f'Our final f1 origin cv score is {score_1}')
    print(f'Our final f1 match algo cv score is {score_2}')

    return pd.DataFrame({
        'posting_id': submit_df['posting_id'],
        'pred_matches': submit_df['pred_matches_2'],
    })

def match_algo_interpolation_inference(submit_df):
    """Extend ``matches`` by linking postings that share the same other matches.

    For every row with more than one id in ``matches``, the ids other than
    the row's own ``posting_id`` form a key. Rows with an identical key are
    grouped, and each member's result becomes the union of that key and the
    posting ids of the whole group. Rows with a single id are left as they
    are.

    Side effect: the frame passed in gets a ``match_num`` column, and its
    ``matches`` column is overwritten with the key (own id removed) for rows
    that have more than one id.

    Args:
        submit_df: frame with ``posting_id`` and ``matches`` (space-joined ids).

    Returns:
        A new frame with ``posting_id`` and the extended ``matches``.
    """
    def _extra_count(row):
        # Number of ids in `matches` beyond the first.
        return len(row["matches"].split()) - 1

    def _other_ids_key(row):
        # Ids without the row's own id; untouched string when there is
        # nothing besides a single id.
        if row["match_num"] > 0:
            listed = np.array(row["matches"].split())
            others = np.setdiff1d(listed, np.array(row["posting_id"]))
            return " ".join(np.array(others).tolist())
        return row["matches"]

    def _extended_matches(row):
        # Union of the key and the posting ids of every row sharing it.
        if row["match_num"] > 0:
            tokens = (row['matches'] + " " + row['posting_id_2']).split()
            return ' '.join(np.unique(np.array(tokens)))
        return row['matches']

    submit_df['match_num'] = submit_df.apply(_extra_count, axis=1)
    submit_df['matches'] = submit_df.apply(_other_ids_key, axis=1)

    # key -> posting ids of all rows that produced this key
    groups = {}
    for key, posting_id, extra in zip(submit_df['matches'],
                                      submit_df['posting_id'],
                                      submit_df['match_num']):
        if extra > 0:
            groups.setdefault(key, []).append(posting_id)

    match_df = pd.DataFrame({
        'matches': list(groups),
        'posting_id_2': [" ".join(members) for members in groups.values()],
    })

    submit_df = pd.merge(submit_df, match_df, on='matches', how='left')
    submit_df["pred_matches_2"] = submit_df.apply(_extended_matches, axis=1)

    return pd.DataFrame({
        'posting_id': submit_df['posting_id'],
        'matches': submit_df['pred_matches_2'],
    })

In [None]:
# Compute image and text neighbour predictions (per fold in CV mode,
# once over the whole frame otherwise).
if CFG.GET_CV:
    oof_df = pd.DataFrame()
    for fold in CFG.trn_fold:
        folds_ = folds[folds['fold'] == fold].reset_index(drop=True)
        folds_cu_ = folds_cu[folds['fold'] == fold].reset_index(drop=True)
        image_embeddings = get_image_embeddings(folds_, fold)
        text_embeddings = get_text_embeddings(folds_, fold)
        text_predictions_tfidf = get_text_predictions(folds_, folds_cu_, max_features=25_000, thresh=0.75)
        oof_df_, image_predictions = get_neighbors(folds_, image_embeddings, KNN=50 if len(folds)>3 else 3, image=True)
        # The text embeddings are one matrix (the inference branch feeds them
        # to get_neighbors_for_txt, which uses `embeddings.shape`), whereas
        # get_neighbors iterates over a list of per-model matrices. Use the
        # same text helper as the inference branch.
        oof_df_, text_predictions_bert = get_neighbors_for_txt(folds_, text_embeddings, KNN=50 if len(folds) > 3 else 3, image=False)
        oof_df_['image_predictions'] = image_predictions
        oof_df_['text_predictions'] = text_predictions_tfidf
        oof_df_['text_predictions_bert'] = text_predictions_bert
        oof_df_['text_predictions_bert_len'] = oof_df_['text_predictions_bert'].apply(lambda x: len(x))
        # Where the BERT neighbours contain exactly two postings, prefer them
        # over the TF-IDF result. Assign the result instead of calling
        # mask(inplace=True) on a column selection, which is not guaranteed
        # to write back into the frame.
        oof_df_['text_predictions'] = oof_df_['text_predictions'].mask(
            oof_df_['text_predictions_bert_len'] == 2, oof_df_['text_predictions_bert']
        )
        oof_df_['pred_matches'] = oof_df_.apply(combine_predictions, axis = 1)
        # Keep only the out-of-fold rows
#         oof_df_ = oof_df_[folds['fold'] == fold]
        oof_df = pd.concat([oof_df, oof_df_])
    display(oof_df.head())
else:
    image_embeddings = get_image_embeddings(folds, fold=0)  # to be adjusted later
    text_embeddings = get_text_embeddings(folds, fold=0)
    
    # Additional algorithm by beluga
#     text_predictions_tfidf = get_text_predictions(folds, folds_cu, max_features=25_000, thresh=0.75)
#     text_predictions_tfidf_nostopwords = get_text_predictions_nostopwords(folds, folds_cu, max_features=25_000, thresh=0.75)
    text_predictions_tfidf_stopwordsid = get_text_predictions_stopid(folds, folds_cu, max_features=25_000, thresh=0.75)
#     text_predictions_tfidf_stopwordsnumber = get_text_predictions_stopnumber(folds, folds_cu, max_features=25_000, thresh=0.75)
    
    df, text_predictions_bert = get_neighbors_for_txt(folds, text_embeddings, KNN=50 if len(folds) > 3 else 3, image=False)
    df, image_predictions = get_neighbors(folds, image_embeddings, KNN=100 if len(folds)>3 else 3, image=True)
    
    df.head()

In [None]:
# Additional algorithm by beluga
# def combine_predictions_notmatch3(row):
# #     x = np.concatenate([row['text_predictions_en'], row['text_predictions_no'], row['text_predictions_id'], row['text_predictions_num']])
#     x = np.concatenate([row['text_predictions_id'], row['text_predictions_num']])
# #     return ' '.join(np.unique(x))
#     return np.unique(x)


# Combine the image / text predictions, post-process them and write
# submission.csv.
if CFG.GET_CV:
#     oof_df['image_predictions'] = image_predictions
#     oof_df['text_predictions'] = text_predictions
#     oof_df['pred_matches'] = oof_df.apply(combine_predictions, axis = 1)
    # match_algo_interpolation_train returns only posting_id / pred_matches,
    # so the ground truth has to be saved first; otherwise the f1 line below
    # fails with KeyError: 'matches'. The helper keeps row order and count
    # (left merge on unique keys), so positional re-assignment is safe.
    true_matches = oof_df['matches'].tolist()
    oof_df = match_algo_interpolation_train(oof_df) # match interpolation
    oof_df['matches'] = true_matches
    oof_df['f1'] = f1_score(oof_df['matches'], oof_df['pred_matches'])
    display(oof_df)
    score = oof_df['f1'].mean()
    print(f'Our final f1 cv score is {score}')
    oof_df[['posting_id', 'pred_matches']].to_csv('submission.csv', index = False)
else:
    df['image_predictions'] = image_predictions
    
    # Additional algorithm by beluga
#     df['text_predictions'] = text_predictions_tfidf
#     df['text_predictions_en'] = text_predictions_tfidf
#     df['text_predictions_no'] = text_predictions_tfidf_nostopwords
#     df['text_predictions_id'] = text_predictions_tfidf_stopwordsid
#     df['text_predictions_num'] = text_predictions_tfidf_stopwordsnumber
#     df['text_predictions'] = df.apply(combine_predictions_notmatch3, axis=1)
    
#     df['text_predictions'] = text_predictions_tfidf_stopwordsnumber
    df['text_predictions'] = text_predictions_tfidf_stopwordsid

    df['text_predictions_bert'] = text_predictions_bert
    df['text_predictions_bert_len'] = df['text_predictions_bert'].apply(len)
    # Where the BERT neighbours contain exactly two postings, prefer them over
    # the TF-IDF result. Assign the result instead of calling
    # mask(inplace=True) on a column selection, which is not guaranteed to
    # write back into the frame.
    df['text_predictions'] = df['text_predictions'].mask(
        df['text_predictions_bert_len'] == 2, df['text_predictions_bert']
    )
    df['matches'] = df.apply(combine_predictions, axis = 1)
    # =================================================================
    # additional prediction
    # =================================================================
    # First pass: rows whose combined prediction holds a single id get another
    # TF-IDF search restricted to those rows.
    df['matches_len'] = df['matches'].apply(lambda x: len(x.split()))
    # .copy() so the column writes below act on an independent frame rather
    # than on a slice of df.
    df_notmatch = df[df['matches_len']==1].copy()
    df_notmatch_cu = cudf.DataFrame(df_notmatch)
#     additional_predictions = get_text_predictions_additional(df_notmatch, df_notmatch_cu, max_features=25000, thresh=0.6)
    additional_predictions = get_text_predictions_additional_nomatch_id(df_notmatch, df_notmatch_cu, max_features=25000, thresh=0.6)# additional algorithm by beluga
    
    df_notmatch['matches'] = additional_predictions
    df_notmatch['matches_len'] = df_notmatch['matches'].apply(len)
    df_notmatch_ = df_notmatch[df_notmatch['matches_len']>1].copy()
    df_notmatch_['matches'] = df_notmatch_['matches'].apply(lambda x: ' '.join(x))
    index_notmatch = df_notmatch_.index
    df.loc[index_notmatch, 'matches'] = df_notmatch_['matches']
    
    # Second pass: repeat the search among the rows that still have a single hit.
    df_notmatch2 = df_notmatch[df_notmatch['matches_len']==1].copy()
    df_notmatch_cu2 = cudf.DataFrame(df_notmatch2)
#     additional_predictions2 = get_text_predictions_additional(df_notmatch2, df_notmatch_cu2, max_features=25000, thresh=0.6)
    additional_predictions2 = get_text_predictions_additional_nomatch_id(df_notmatch2, df_notmatch_cu2, max_features=25000, thresh=0.6)# additional algorithm by beluga
    df_notmatch2['matches'] = additional_predictions2
    df_notmatch2['matches_len'] = df_notmatch2['matches'].apply(len)
    df_notmatch2_ = df_notmatch2[df_notmatch2['matches_len']>1].copy()
    df_notmatch2_['matches'] = df_notmatch2_['matches'].apply(lambda x: ' '.join(x))
    index_notmatch2 = df_notmatch2_.index
    df.loc[index_notmatch2, 'matches'] = df_notmatch2_['matches']
    
    df_afteralgo = match_algo_interpolation_inference(df) # match interpolation
    # The helper returns a freshly indexed frame in the same row order, so
    # write the result back by position rather than by index label.
    df['matches'] = df_afteralgo['matches'].values
    df[['posting_id', 'matches']].to_csv('submission.csv', index = False)

In [None]:
# Show the first three rows of `df` as a quick sanity check.
df.head(3)

In [None]:
# Read the written submission file back and show its first rows.
pd.read_csv('submission.csv').head()