# Load Libraries

In [None]:
timm_path = "../input/timm-pytorch-image-models/pytorch-image-models-master"
import sys
sys.path.append(timm_path)
import timm
import time

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import sklearn
import os
from tqdm.notebook import tqdm

import cv2
import albumentations as A
from albumentations.pytorch import ToTensorV2

import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import Dataset,DataLoader
from torch import optim

import numpy as np, pandas as pd, gc
import cv2, matplotlib.pyplot as plt
import cudf, cuml, cupy
from cuml.feature_extraction.text import TfidfVectorizer
from cuml.neighbors import NearestNeighbors
import tensorflow as tf
# from tensorflow.keras.applications import EfficientNetB0
print('RAPIDS',cuml.__version__)
print('TF',tf.__version__)

import warnings
warnings.filterwarnings('ignore')

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

In [None]:
import math
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import KFold, train_test_split

from torch.nn import Parameter
from torch.nn import functional as F
from torch.utils.data import Dataset,DataLoader
from torch.optim import Adam
from torch.optim.lr_scheduler import _LRScheduler
from torch.optim import Adam, lr_scheduler

import transformers
from transformers import AdamW
from transformers import get_linear_schedule_with_warmup,get_cosine_schedule_with_warmup
from transformers import get_cosine_with_hard_restarts_schedule_with_warmup

# transformer_model = 'sentence-transformers/paraphrase-xlm-r-multilingual-v1'
# TOKENIZER = transformers.AutoTokenizer.from_pretrained(transformer_model)

# transformer_model = '../input/shopee-embedding-df/paraphrase-xlm-r-multilingual-v1/0_Transformer'
# TOKENIZER = transformers.AutoTokenizer.from_pretrained(transformer_model)



# Load Train Data

In [None]:
# Flag selecting whether the train set is used as a stand-in test set
# (see the cell that builds `test` / `test_gf` below).
COMPUTE_CV = False

# Load the test CSV only to look at its row count.
test = pd.read_csv('../input/shopee-product-matching/test.csv')
# NOTE(review): COMPUTE_CV is initialised to False above, so neither branch can
# ever enable CV; the message printed by the else-branch is misleading as written.
if len(test)>3: COMPUTE_CV = False
else: print('this submission notebook will compute CV score, but commit notebook will not')

In [None]:
def getMetric(col):
    """Build a row-wise F1 scorer for the prediction column ``col``.

    The returned callable takes a row exposing ``row.target`` and ``row[col]``
    (both collections of ids) and returns
    ``2 * |target ∩ prediction| / (|target| + |prediction|)``.
    """
    def f1score(row):
        actual = row.target
        predicted = row[col]
        # intersect1d de-duplicates, so each shared id is counted once
        overlap = np.intersect1d(actual, predicted)
        return 2*len(overlap) / (len(actual) + len(predicted))
    return f1score

# Compute RAPIDS Model CV and Infer Submission

In [None]:
# Choose the frame that plays the role of "test" for the rest of the notebook:
# the train CSV when COMPUTE_CV is set, otherwise the real test CSV.
# `test` is the pandas copy and `test_gf` the cuDF copy of the same data.
if COMPUTE_CV:
    test = pd.read_csv('../input/shopee-product-matching/train.csv')
    test_gf = cudf.DataFrame(test)
    print('Using train as test to compute CV (since commit notebook). Shape is', test_gf.shape )
else:
    test = pd.read_csv('../input/shopee-product-matching/test.csv')
    test_gf = cudf.read_csv('../input/shopee-product-matching/test.csv')
    print('Test shape is', test_gf.shape )
test_gf.head()

# 1. Use Image Embeddings (model 1)

In [None]:
# Directory that file names from the `image` column are resolved against
# (train images when the train set stands in for the test set).
BASE = '../input/shopee-product-matching/test_images/'
if COMPUTE_CV: BASE = '../input/shopee-product-matching/train_images/'

# image_size = 192#256
# Batch size of the inference DataLoader built in get_image_embeddings
valid_batch_size = 64



class Shopee(Dataset):
    """Image-only dataset over a dataframe with an ``image`` column.

    Each item is the image found at ``BASE + df.loc[idx, 'image']``, converted
    from OpenCV's BGR order to RGB and, when ``augs`` is given, passed through
    ``augs(image=...)['image']``.

    Args:
        df: dataframe with an ``image`` column; it is indexed with ``.loc``
            using the integer positions handed out by the DataLoader, so it
            should carry a 0..n-1 index.
        augs: optional callable following the albumentations convention
            (called as ``augs(image=...)`` and returning a dict with ``'image'``).
    """

    def __init__(self, df, augs=None):
        self.df = df
        self.augs = augs

    def __len__(self):
        return len(self.df)

    def __getitem__(self, idx):
        img_path = BASE + self.df.loc[idx, 'image']
        image = cv2.imread(img_path)
        # cv2.imread signals a missing/unreadable file by returning None rather
        # than raising; fail here with the path instead of inside cvtColor.
        if image is None:
            raise FileNotFoundError(f"Could not read image: {img_path}")
        image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB).astype(np.uint8)

        if self.augs:
            image = self.augs(image=image)['image']

        return image
    
# Inference helper
def test_predict(model, dataloader, device):
    """Run ``model`` over every batch of ``dataloader`` and stack the outputs.

    The model is switched to eval mode and called under ``torch.no_grad()``.
    A 2-D output is used as is; any other rank is first reduced with
    ``AdaptiveAvgPool2d(1)`` and flattened to ``(-1, channels)``.

    Returns:
        One numpy array holding the per-batch outputs concatenated along axis 0.
    """
    model.eval()
    collected = []

    with torch.no_grad():
        for batch in tqdm(dataloader):
            output = model(batch.to(device)).detach()
            if len(output.shape) == 2:
                batch_embeds = output.detach().cpu().numpy()
            else:
                pooled = torch.nn.AdaptiveAvgPool2d(1)(output)
                batch_embeds = pooled.cpu().view(-1, output.shape[1]).detach().numpy()

            # Same (rows, features) reshape the original applied before collecting
            n_rows, n_feats = batch_embeds.shape[0], batch_embeds.shape[1]
            collected.append(batch_embeds.reshape(n_rows, n_feats))

    return np.concatenate(collected)


def get_image_embeddings(model_name, weights_path, image_size):
    """Embed every image of the global ``test`` frame with a fine-tuned timm model.

    Args:
        model_name: timm architecture name. The name also decides which
            attribute holds the classification layer that gets replaced by a
            512-unit linear layer (``classifier``, ``head`` or ``head.fc``).
        weights_path: path to the checkpoint (a state dict) for that model.
            Keys carrying the ``module.`` prefix added by ``nn.DataParallel``
            are accepted.
        image_size: side length of the square crop fed to the model.

    Returns:
        numpy array with one embedding row per row of ``test``, in ``test`` order.
    """
    from collections import OrderedDict

    valid_aug = A.Compose([
        A.LongestMaxSize(max_size=image_size*1.2, p=1.0),
        A.PadIfNeeded(min_height=image_size, min_width=image_size, border_mode=0, p=1.0),
        A.Normalize(p=1.0),
        A.CenterCrop (image_size, image_size, always_apply=False, p=1.0),
        ToTensorV2(p=1.0)
        ])

    valid_data = Shopee(test.copy().reset_index(drop=True), augs = valid_aug)
    test_loader = DataLoader(valid_data,
                              shuffle=False,
                              num_workers=4,
                              batch_size=valid_batch_size)

    num_embeddings = 512#256
    model = timm.create_model(model_name, pretrained=False)

    # Swap the classification layer for the embedding layer; which attribute
    # holds it depends on the architecture family (same precedence as before).
    if "efficientnet" in model_name or "densenet" in model_name:
        model.classifier = nn.Linear(model.classifier.in_features, num_embeddings)
    elif "swin" in model_name or "vit" in model_name:
        model.head = nn.Linear(model.head.in_features, num_embeddings)
    else:
        model.head.fc = nn.Linear(model.head.fc.in_features, num_embeddings)
    _ = model.to(device)

    # Read the checkpoint once. map_location keeps this working when the file
    # was saved from a GPU that is not available here.
    state_dict = torch.load(weights_path, map_location=device)
    if any(key.startswith('module.') for key in state_dict):
        # Checkpoint was saved from a DataParallel wrapper: drop the prefix.
        state_dict = OrderedDict(
            (key[len('module.'):] if key.startswith('module.') else key, value)
            for key, value in state_dict.items()
        )
        print("except")  # kept: marks that the prefix-stripping path was taken
    # Any remaining key/shape mismatch is a real error and is raised as is,
    # instead of being swallowed by a bare except as it was before.
    model.load_state_dict(state_dict)

    # newmodel = torch.nn.Sequential(*(list(model.children())[:-1]))
    image_embeddings = test_predict(model, test_loader, device)
    print('image embeddings shape',image_embeddings.shape)

    del model
    _ = gc.collect()
    return image_embeddings

In [None]:
def get_similar(image_embeddings,threshold=0.36,KNN = 50):
    """For every embedding row, list the posting_ids of its close neighbours.

    A cosine-metric NearestNeighbors model is fitted on ``image_embeddings``.
    For each row, the neighbours among its ``KNN`` nearest whose distance is
    strictly below ``threshold`` are mapped to ``posting_id`` through the global
    ``test`` frame (row i of the embeddings corresponds to ``test.iloc[i]``).

    Args:
        image_embeddings: 2-D array, one row per row of ``test``.
        threshold: exclusive upper bound on the neighbour distance.
        KNN: number of neighbours to retrieve; forced to 2 when ``test`` has
            exactly 3 rows and never larger than the number of embedding rows.

    Returns:
        list with one array of posting_ids per embedding row.
    """
    n_samples = len(image_embeddings)
    if n_samples == 0:
        # Nothing to fit or query.
        return []

    if len(test)==3: KNN = 2
    # Asking for more neighbours than there are samples cannot be satisfied
    # (presumably an error in cuML, as in scikit-learn — TODO confirm).
    KNN = min(KNN, n_samples)
    model = NearestNeighbors(n_neighbors=KNN,metric='cosine')
    model.fit(image_embeddings)

    # Hoisted out of the loops: positional lookup into this array replaces a
    # DataFrame .iloc call per row.
    posting_ids = test.posting_id.values
    preds = []

    CHUNK = 1024*4

    print('Finding similar images...')
    # Query in chunks to bound the size of the distance/index matrices
    for a in tqdm(range(0, n_samples, CHUNK)):
        b = min(a + CHUNK, n_samples)
        distances, indices = model.kneighbors(image_embeddings[a:b,])

        for k in range(b-a):
            IDX = np.where(distances[k,]<threshold)[0]
            preds.append(posting_ids[indices[k,IDX]])

    print(f"embed={image_embeddings.shape[1]}_KNN={KNN}_distances={threshold}")
    del model
    _ = gc.collect()

    return preds

def get_similar_two(image_embeddings,threshold=100):
    """Return, per row, the posting_ids of its 2 nearest neighbours.

    Same search as ``get_similar`` with the neighbour count fixed at 2. The
    default ``threshold`` of 100 is far above any cosine distance, so by default
    no neighbour is filtered out.

    Args:
        image_embeddings: 2-D array, one row per row of the global ``test`` frame.
        threshold: exclusive upper bound on the neighbour distance.

    Returns:
        list with one array of posting_ids per embedding row.
    """
    # The body used to be a line-for-line copy of get_similar with KNN = 2;
    # delegating keeps the two from drifting apart.
    return get_similar(image_embeddings, threshold, KNN=2)

In [None]:
def get_bret_embeddings(weights_path,transformer_model='../input/shopee-embedding-df/paraphrase-xlm-r-multilingual-v1/0_Transformer'):
    """Embed every title of the global ``test`` frame with a fine-tuned transformer.

    Titles are tokenised to a fixed length of 64 and the hidden state of the
    first token of the model's first output is used as the embedding.

    Args:
        weights_path: path to the fine-tuned checkpoint (a state dict).
        transformer_model: directory/name passed to ``AutoTokenizer`` and
            ``AutoModel.from_pretrained``.

    Returns:
        numpy array with one embedding row per row of ``test``, in ``test`` order.
    """
    TOKENIZER = transformers.AutoTokenizer.from_pretrained(transformer_model)

    valid_batch_size = 64

    class ShopeeDataset(Dataset):
        """Yields ``(input_ids, attention_mask)`` for the ``title`` of each row."""

        def __init__(self, csv):
            self.csv = csv.reset_index()

        def __len__(self):
            return self.csv.shape[0]

        def __getitem__(self, index):
            row = self.csv.iloc[index]

            text = row.title

            text = TOKENIZER(text, padding='max_length', truncation=True, max_length=64, return_tensors="pt")
            # The tokenizer returns a batch of one; take its single element
            input_ids = text['input_ids'][0]
            attention_mask = text['attention_mask'][0]

            return input_ids, attention_mask

    # Inference helper
    def test_predict(model, dataloader, device):
        model.eval()
        _predicted_metrics = []

        with torch.no_grad():
            for i, (inputs, attention_masks) in enumerate(tqdm(dataloader)):
                inputs,attention_masks = inputs.to(device),attention_masks.to(device)
                # First output of the model, first token position of every sequence
                features = model(inputs,attention_masks)[0][:,0,:].detach()

                if len(features.shape) != 2:
                    features = torch.nn.AdaptiveAvgPool2d(1)(features).cpu().view(-1,features.shape[1]).detach().numpy()
                else:
                    features = features.detach().cpu().numpy()

                metric = features.reshape(features.shape[0], features.shape[1])
                _predicted_metrics.append(metric)

        return np.concatenate(_predicted_metrics)

    valid_data = test.copy()
    valid_data = ShopeeDataset(valid_data.reset_index(drop=True))
    test_loader = DataLoader(valid_data,
                              shuffle=False,
                              num_workers=4,
                              batch_size=valid_batch_size)

    num_embeddings = 768#256

    model = transformers.AutoModel.from_pretrained(transformer_model)
    num_features = model.config.hidden_size
    # Only for this model an extra `fc` layer is attached before loading
    # (presumably its checkpoint contains `fc.*` weights — TODO confirm).
    # The layer is not called by test_predict above.
    if transformer_model=='../input/shopee-embedding-df/paraphrase-xlm-r-multilingual-v1/0_Transformer':
        model.fc = nn.Linear(num_features, num_embeddings)

    _ = model.to(device)

    # map_location keeps loading working when the checkpoint was saved from a
    # GPU that is not available here.
    load_weghts = torch.load(weights_path, map_location=device)
    model.load_state_dict(load_weghts)

    bert_embeddings = test_predict(model, test_loader, device)
    print('bert embeddings shape',bert_embeddings.shape)

    del model
    _ = gc.collect()

    return bert_embeddings


In [None]:
!ls ../input/shopee-embedding-df/

# use word2vec

In [None]:
# from nltk.tokenize import word_tokenize
# test_title_token = test['title'].apply(lambda x: word_tokenize(x))
# test["test_title_token"] = test_title_token

# from gensim.test.utils import common_texts
# from gensim.models import Word2Vec

    
# def get_word_embeddings():
#     model = Word2Vec(sentences=test_title_token, size=512, window=5, min_count=1, workers=4)
#     model.train(test_title_token,total_examples=len(test_title_token),epochs=100)
    
#     def vectors_test(train_title_token): #test_df
#         word_embeddings_test = []
#         # Reading the each book description 
#         for line in train_title_token:
#             avgword2vec = None
#             count = 0
#             for word in line:
#                 if word in model.wv:
#                     count += 1
#                     if avgword2vec is None:
#                         avgword2vec = model.wv[word]
#                     else:
#                         avgword2vec = avgword2vec + model.wv[word]

#             if avgword2vec is not None:
#                 avgword2vec = avgword2vec / count
#                 word_embeddings_test.append(avgword2vec)
#             else:
#                 word_embeddings_test.append(np.array([0]*512, dtype='float32'))

#             return word_embeddings_test[0]

#     title_emb=[]
#     title_embs = test['test_title_token'].apply(lambda x: vectors_test(x))

#     for metric in title_embs:
#         title_emb.append([metric])
        
#     word_embeddings = np.concatenate(title_emb)
#     print('word embeddings shape',word_embeddings.shape)
        
#     del model
#     _ = gc.collect()

#     return word_embeddings

In [None]:
# word_embeddings_0 = get_word_embeddings()
# test['word_preds0'] = get_similar(word_embeddings_0,0.22)
# # # test['word_preds1'] = get_similar(word_embeddings_0,0.36)

# del word_embeddings_0

# use cnn

In [None]:
# Model 0: dm_nfnet_f0 at 192px. One neighbour list per cosine-distance
# threshold (0.27 / 0.36 / 0.45 / 0.50), stored as columns of `test`.
image_embeddings_0 = get_image_embeddings('dm_nfnet_f0',"../input/shopee-embedding-df/F002_ArcFaceLoss_tfold_5_T_False_dm_nfnet_f0_imgsize_192_nume_512_epoch_20_CV_0.9456.pth",192)
test['image_preds0'] = get_similar(image_embeddings_0,0.27)
test['image_preds3'] = get_similar(image_embeddings_0,0.36)
test['image_predsl1'] = get_similar(image_embeddings_0,0.45)
test['image_predsl4'] = get_similar(image_embeddings_0,0.50)

# Threshold 10 is above any cosine distance, so these keep all of the
# 13 / 30 / 40 nearest neighbours unfiltered.
test['image_preds6'] = get_similar(image_embeddings_0,10,13)
test['image_preds7'] = get_similar(image_embeddings_0,10,30)
test['image_preds8'] = get_similar(image_embeddings_0,10,40)



# Model 1: swin_small_patch4_window7_224 at 224px, same four thresholds.
image_embeddings_1 = get_image_embeddings('swin_small_patch4_window7_224',"../input/shopee-embedding-df/F005_ArcFaceLoss_tfold_5_T_False_swin_small_patch4_window7_224_imgsize_224_nume_512_epoch_7_CV_0.9372.pth",224)
test['image_preds1'] = get_similar(image_embeddings_1,0.27)
test['image_preds4'] = get_similar(image_embeddings_1,0.36)
test['image_predsl2'] = get_similar(image_embeddings_1,0.45)
test['image_predsl5'] = get_similar(image_embeddings_1,0.50)

# test['image_preds_13_1'] = get_similar(image_embeddings_1,10,13)
# test['image_preds_30_1'] = get_similar(image_embeddings_1,10,30)
# test['image_preds_50_1'] = get_similar(image_embeddings_1,10,40)



# Model 2: efficientnet_b0 at 224px, same four thresholds.
image_embeddings_2 = get_image_embeddings('efficientnet_b0',"../input/shopee-embedding-df/F003_ArcFaceLoss_tfold_5_T_False_efficientnet_b0_imgsize_224_nume_512_epoch_20_CV_0.9304.pth",224)
test['image_preds2'] = get_similar(image_embeddings_2,0.27)
test['image_preds5'] = get_similar(image_embeddings_2,0.36)
test['image_predsl3'] = get_similar(image_embeddings_2,0.45)
test['image_predsl6'] = get_similar(image_embeddings_2,0.50)

# test['image_preds_13_2'] = get_similar(image_embeddings_2,10,13)
# test['image_preds_30_2'] = get_similar(image_embeddings_2,10,30)
# test['image_preds_50_2'] = get_similar(image_embeddings_2,10,40)



# Loosest threshold (0.60) for each image model; these feed the vote in
# combine_for_sub.
test['image_predsb1'] = get_similar(image_embeddings_0,0.60)
test['image_predsb2'] = get_similar(image_embeddings_1,0.60)
test['image_predsb3'] = get_similar(image_embeddings_2,0.60)



# Drop the embedding matrices once every neighbour list has been computed
del image_embeddings_0,image_embeddings_1,image_embeddings_2

# use Bert

In [None]:
# Text model 1: distilbert-base-indonesian. One neighbour list per
# cosine-distance threshold (0.27 / 0.36 / 0.45 / 0.50).
bert_embeddings_1 = get_bret_embeddings("../input/shopee-embedding-df/G002_bert_indonesian_tfold_5_T_False_algo_ArcFaceLoss_nume_768_epoch_20_CV_0.9464.pth",'../input/shopee-embedding-df/distilbert-base-indonesian')
test['bert_preds5'] = get_similar(bert_embeddings_1,0.27)
test['bert_preds6'] = get_similar(bert_embeddings_1,0.36)
test['bert_predsl1'] = get_similar(bert_embeddings_1,0.45)
test['bert_predsl3'] = get_similar(bert_embeddings_1,0.50)


# 2 nearest neighbours with the default (non-filtering) threshold; used as the
# last-resort fallback in combine_for_sub.
test['bert_under'] = get_similar_two(bert_embeddings_1)
# Threshold 10 is above any cosine distance: unfiltered 13 / 30 / 40 neighbours
test['bert_preds2'] = get_similar(bert_embeddings_1,10,13)
test['bert_preds3'] = get_similar(bert_embeddings_1,10,30)
test['bert_preds4'] = get_similar(bert_embeddings_1,10,40)

# Loosest threshold (0.60); feeds the vote in combine_for_sub
test['bert_predsb1'] = get_similar(bert_embeddings_1,0.60)


# image_bert_embeddings_0 = np.concatenate([image_embeddings_0,bert_embeddings_1],axis=1)
# test['image_bert_preds0'] = get_similar(bert_embeddings_1,0.27)


del bert_embeddings_1

In [None]:
# Text model 2: albert-base-bahasa-cased.
berta_embeddings_0 = get_bret_embeddings("../input/shopee-embedding-df/G003_bert_hasa-cased_tfold_5_T_False_algo_ArcFaceLoss_nume_768_epoch_20_CV_0.9524.pth",'../input/shopee-embedding-df/albert-base-bahasa-cased')
test['berta_preds1'] = get_similar(berta_embeddings_0,0.27)
test['berta_preds2'] = get_similar(berta_embeddings_0,0.36)
# NOTE(review): 'berta_predsl1' and 'bert_predsl4' are not read by
# combine_for_sub; also note the 'bert_' (not 'berta_') prefix on the two
# columns below even though they hold ALBERT results.
test['berta_predsl1'] = get_similar(berta_embeddings_0,0.45)
test['bert_predsl4'] = get_similar(berta_embeddings_0,0.50)

# Loosest threshold (0.60); feeds the vote in combine_for_sub
test['bert_predsb3'] = get_similar(berta_embeddings_0,0.60)

# test['bert_preds_13_1'] = get_similar(berta_embeddings_0,10,13)
# test['bert_preds_30_1'] = get_similar(berta_embeddings_0,10,30)
# test['bert_preds_50_1'] = get_similar(berta_embeddings_0,10,50)

del berta_embeddings_0

In [None]:
# Text model 3: paraphrase-xlm-r-multilingual-v1. One neighbour list per
# cosine-distance threshold (0.27 / 0.36 / 0.45 / 0.50).
bert_embeddings_0 = get_bret_embeddings("../input/shopee-embedding-df/G001_bert_tfold_5_T_False_nume_768_epoch_19_CV_0.9541.pth",'../input/shopee-embedding-df/paraphrase-xlm-r-multilingual-v1/0_Transformer')
test['bert_preds0'] = get_similar(bert_embeddings_0,0.27)
test['bert_preds1'] = get_similar(bert_embeddings_0,0.36)
test['bert_predsl2'] = get_similar(bert_embeddings_0,0.45)
test['bert_predsl5'] = get_similar(bert_embeddings_0,0.50)

# Loosest threshold (0.60); feeds the vote in combine_for_sub
test['bert_predsb2'] = get_similar(bert_embeddings_0,0.60)

# test['bert_preds_13_2'] = get_similar(bert_embeddings_0,10,13)
# test['bert_preds_30_2'] = get_similar(bert_embeddings_0,10,30)
# test['bert_preds_50_2'] = get_similar(bert_embeddings_0,10,40)

del  bert_embeddings_0

# Use Text Embeddings

In [None]:
import string

# Maps every character of string.punctuation to a single space. Built once at
# definition time instead of on every call (the function runs once per title).
_PUNC_TRANSLATOR = str.maketrans(string.punctuation, ' '*len(string.punctuation))


def removePunctuation(text):
    """Return ``text`` with each ASCII punctuation character replaced by a space.

    The string length is unchanged; characters outside ``string.punctuation``
    are left untouched.
    """
    return text.translate(_PUNC_TRANSLATOR)

# Strip punctuation from the titles, then move the cleaned column to the GPU
test['title_clean'] = test['title'].apply(removePunctuation)
title_to_use = cudf.DataFrame(test).title_clean

print('Computing text embeddings...')
# tfidf_vec = TfidfVectorizer(stop_words='english', 
#                             binary=True, 
#                             max_features=21500)
# Binary TF-IDF over at most 21500 terms, without stop-word removal
tfidf_vec = TfidfVectorizer(stop_words=None, 
                            binary=True, 
                            max_features=21500)
# Densify: get_text_simier multiplies these rows with cupy.matmul
text_embeddings = tfidf_vec.fit_transform(title_to_use).toarray()
print('text embeddings shape',text_embeddings.shape)

In [None]:
def get_text_simier(text_embeddings,threshold=0.75):
    """For every title, list the posting_ids of titles with a similar TF-IDF vector.

    Similarity is the dot product between rows of ``text_embeddings`` (a cosine
    similarity assuming the rows are L2-normalised — TODO confirm against the
    vectoriser settings). Row i of ``text_embeddings`` corresponds to
    ``test.iloc[i]`` of the global ``test`` frame.

    Args:
        text_embeddings: dense cupy matrix, one row per row of ``test``.
        threshold: a title is kept when its similarity is strictly greater
            than this value.

    Returns:
        list with one array of posting_ids per row of ``test``.
    """
    preds = []
    CHUNK = 1024
    n_rows = len(test)
    # Hoisted out of the loops: positional lookup into this array replaces a
    # DataFrame .iloc call per row.
    posting_ids = test.posting_id.values

    print('Finding similar titles...')
    # Process CHUNK rows at a time so the similarity matrix stays small
    for a in tqdm(range(0, n_rows, CHUNK)):
        b = min(a + CHUNK, n_rows)

        # Similarity of rows a..b against every row: shape (b-a, n_rows)
        cts = cupy.matmul(text_embeddings, text_embeddings[a:b].T).T

        for k in range(b-a):
            # Previously compared against a hard-coded 0.65, which silently
            # ignored the `threshold` argument.
            IDX = cupy.where(cts[k,]>threshold)[0]
            preds.append(posting_ids[cupy.asnumpy(IDX)])

    del text_embeddings
    _ = gc.collect()

    return preds


_ = gc.collect()

# 0.65 is the cut-off that was effectively applied before `threshold` was
# honoured (the earlier call passed 0.75, which was ignored); passing it
# explicitly keeps the produced column unchanged.
test['text_preds'] = get_text_simier(text_embeddings,0.65)
# test['text_preds2'] = get_text_simier(text_embeddings,0.65)
# test['text_preds3'] = get_text_simier(text_embeddings,0.55)
# test.head()

del tfidf_vec, text_embeddings

# other 

In [None]:
# tmp = test.groupby('image_phash').posting_id.agg('unique').to_dict()
# test['o_image_phash_preds'] = test.image_phash.map(tmp)

# tmp = test.groupby('image').posting_id.agg('unique').to_dict()
# test['o_image_preds'] = test.image.map(tmp)

# tmp = test.groupby('title').posting_id.agg('unique').to_dict()
# test['o_title_preds'] = test.title.map(tmp)
# test.head()

# Compute CV Score

In [None]:
from functools import reduce
def intersect(*args):
    """Return the values present in every one of the given arrays.

    The arrays are folded left to right with ``np.intersect1d``, so with two or
    more inputs the result is sorted and free of duplicates. A single input is
    returned unchanged.
    """
    common = reduce(lambda left, right: np.intersect1d(left, right), args)
    return common


def higher(f,*args):
    """Return the values that occur at least ``f`` times across all ``args``.

    Every occurrence counts, including repeats inside a single array.

    Args:
        f: minimum number of occurrences required.
        *args: one or more 1-D arrays of ids.

    Returns:
        Sorted array of the qualifying values (empty when none qualify).
    """
    # One sort-based pass instead of re-concatenating and re-scanning all
    # inputs once per distinct value.
    values, counts = np.unique(np.concatenate(args), return_counts=True)
    return values[counts >= f]

def combine_for_sub(row):
    """Build the space-separated ``matches`` string for one row of ``test``.

    Candidate neighbour lists are tried in a fixed priority order (strictest
    thresholds first). The first candidate holding more than one distinct
    posting_id wins; it is merged with the vote-based ``base`` list,
    de-duplicated, sorted, and cut to the first 51 ids. If no candidate
    qualifies, ``base`` alone is used when it holds more than one id, and
    otherwise the 2-nearest-neighbour fallback ``row.bert_under``.

    Args:
        row: a row of ``test`` carrying all prediction columns read below.

    Returns:
        str of posting_ids joined by single spaces.
    """
    # NOTE(review): the cut keeps 51 ids, not 50 — confirm this is intended.
    max_ids = 51

    def _as_matches(ids):
        # np.unique sorts and de-duplicates before truncation
        return ' '.join( np.unique(ids)[:max_ids] )

    # Ids proposed by at least 5 of the 7 loosest lists (TF-IDF plus the
    # 0.60-threshold list of every image and text model).
    base = np.concatenate([higher(5,row.text_preds,row.image_predsb1,row.image_predsb2,row.image_predsb3,row.bert_predsb1,row.bert_predsb2,row.bert_predsb3)])

    # Priority order. A plain string is a single column; a pair is the
    # intersection of two columns.
    candidates = (
        # 0.27 zone
        'image_preds0',                   # image dm_nfnet_f0
        'bert_preds5',                    # distilbert indonesian
        'bert_preds0',                    # xlm-r multilingual
        'image_preds1',                   # image swin
        'image_preds2',                   # image efficientnet_b0
        # 0.36 zone
        'image_preds3',                   # image dm_nfnet_f0
        'bert_preds6',                    # distilbert indonesian
        'bert_preds1',                    # xlm-r multilingual
        'image_preds4',                   # image swin
        'image_preds5',                   # image efficientnet_b0
        # common zone: dm_nfnet_f0 AND distilbert indonesian, unfiltered KNN
        ('image_preds6', 'bert_preds2'),  # 13 neighbours
        ('image_preds7', 'bert_preds3'),  # 30 neighbours
        ('image_preds8', 'bert_preds4'),  # 40 neighbours
        # 0.45 zone
        'image_predsl1',                  # image dm_nfnet_f0
        'bert_predsl1',                   # distilbert indonesian
        'bert_predsl2',                   # xlm-r multilingual
        'image_predsl2',                  # image swin
        'image_predsl3',                  # image efficientnet_b0
        # albert
        'berta_preds1',                   # 0.27
        'berta_preds2',                   # 0.36
        # 0.50 zone
        'image_predsl4',                  # image dm_nfnet_f0
        'bert_predsl3',                   # distilbert indonesian
        'bert_predsl5',                   # xlm-r multilingual
        'image_predsl5',                  # image swin
        'image_predsl6',                  # image efficientnet_b0
    )

    for source in candidates:
        # Columns are read lazily, only when their turn comes
        if isinstance(source, tuple):
            x = np.intersect1d(getattr(row, source[0]), getattr(row, source[1]))
        else:
            x = np.concatenate([getattr(row, source)])
        # More than one distinct id means something beyond a lone match was found
        if len(np.unique(x)) > 1:
            return _as_matches(np.concatenate([x,base]))

    # No single model qualified: fall back to the vote result on its own
    if len(np.unique(base)) > 1:
        return _as_matches(base)

    # under zone: the 2 nearest neighbours from distilbert indonesian
    return _as_matches(np.concatenate([row.bert_under]))

In [None]:
# Build the final space-separated matches string for every row
test['matches'] = test.apply(combine_for_sub,axis=1)
test

In [None]:
# def split_data(x):
#     return x.split(" ")

# test["matches"] = test["matches"].map(split_data)
# matches_dict = test.set_index('posting_id').to_dict()['matches']

# def get_other(x,matches_dict):
#     matches_set = set()
#     for posting_id in matches_dict.keys():
#         sample = matches_dict[posting_id]
#         for xx in x:
#             if xx in sample:
#                 matches_set = matches_set | set(sample)
#     # marage other predict
#     return list(matches_set)[:50]


# for posting_id in tqdm(matches_dict.keys()):
#     x = matches_dict[posting_id]
#     if len(x) == 1:
#         y = get_other(x,matches_dict)
#         matches_dict[posting_id] = y[:51]

# for posting_id in tqdm(matches_dict.keys()):
#     matches_dict[posting_id] = " ".join(matches_dict[posting_id])
    
# test2 = pd.DataFrame.from_dict(matches_dict, orient='index').reset_index()
# test2.columns=['posting_id','matches']

In [None]:
# test2["matches"] = test2["matches"].map(split_data)
# test2 = pd.merge(test2, test[["posting_id","bert_under"]])
# test2

In [None]:
# def combine_for_sub2(row):
#     x = np.concatenate([row.matches])
#     if len(np.unique(row.matches)) == 1:
#         x = np.concatenate([row.matches,row.bert_under])
#     return ' '.join( np.unique(x)[:50] )

In [None]:
# test2['matches'] = test2.apply(combine_for_sub2,axis=1)

# Write Submission CSV

In [None]:
# Write the two submission columns, then read the file back to inspect it
test[['posting_id','matches']].to_csv('submission.csv',index=False)
sub = pd.read_csv('submission.csv')
sub.head()

In [None]:
def count_num(x):
    """Return how many space-separated fields the string ``x`` contains.

    Splitting on a single space yields one more piece than there are spaces,
    so an empty string counts as 1.
    """
    return x.count(" ") + 1
# Number of ids in each matches string; show the largest
sub["num"] = sub["matches"].map(count_num)
max(sub["num"].to_list())

In [None]:
# Smallest number of ids in any matches string
min(sub["num"].to_list())