In [None]:
!pip install ../input/shopee-external-models/Keras_Applications-1.0.8-py3-none-any.whl
!pip install ../input/shopee-external-models/efficientnet-1.1.0-py3-none-any.whl

import numpy as np
import pandas as pd
import gc
from collections import Counter
import matplotlib.pyplot as plt
import cudf
import cuml
import cupy
from cuml.feature_extraction.text import TfidfVectorizer
from cuml import PCA
from cuml.neighbors import NearestNeighbors
import tensorflow as tf
import efficientnet.tfkeras as efn
from tqdm.autonotebook import tqdm
import math
from sklearn.preprocessing import LabelEncoder
import tensorflow_hub as hub

# Config

In [None]:
# Distance cut-off used per model when turning nearest-neighbour distances into
# final predictions. Keys are the `model_type` labels passed to get_neighbors;
# a neighbour is kept when its distance is strictly below the value.
model_type_threshold = {
    'b0': 3.3,
    'b1': 3.4,
    'b2': 3.5,
    'b3': 3.7,
    'b4': 3.8,
    'b5': 3.9,
    'b6': 4.0,
    'b7': 3.9,
    'bert1': 16.0,
    'bert2': 14.0,
    'bert3': 8.0,
    'bert4': 9.0,
}

In [None]:
# Lets tf.data choose the parallelism / prefetch depth (used in get_dataset)
AUTO = tf.data.experimental.AUTOTUNE

# Configuration
# Number of images per batch produced by get_dataset
BATCH_SIZE = 8
# Target [height, width] images are resized to in decode_image; also the model input shape
IMAGE_SIZE = [512, 512]
# Seed (NOTE(review): not referenced by any other cell shown in this notebook)
SEED = 42
# Verbosity (NOTE(review): not referenced by any other cell shown in this notebook)
VERBOSE = 1
# Number of classes of the ArcMargin head built in get_image_embeddings
N_CLASSES = 11014

In [None]:
# RESTRICT TENSORFLOW TO 2GB OF GPU RAM
# SO THAT WE HAVE 14GB RAM FOR RAPIDS
# LIMIT is expressed in GB; it is multiplied by 1024 below before being handed
# to TensorFlow (presumably because memory_limit is in MB -- confirm in TF docs).
LIMIT = 2.0
gpus = tf.config.experimental.list_physical_devices('GPU')
if gpus:
    try:
        # Only the first physical GPU is capped.
        tf.config.experimental.set_virtual_device_configuration(
            gpus[0],
            [tf.config.experimental.VirtualDeviceConfiguration(memory_limit=1024*LIMIT)])
        logical_gpus = tf.config.experimental.list_logical_devices('GPU')
        #print(len(gpus), "Physical GPUs,", len(logical_gpus), "Logical GPUs")
    except RuntimeError as e:
        # The error is only printed; the notebook continues without the cap.
        print(e)
# These messages are printed unconditionally, even if no GPU was found or the
# configuration above failed. The 16 assumes a 16GB GPU.
print('We will restrict TensorFlow to max %iGB GPU RAM'%LIMIT)
print('then RAPIDS can use %iGB GPU RAM'%(16-LIMIT))

In [None]:
# Flag to get cv score: when True, read_dataset loads train.csv and
# get_neighbors sweeps thresholds and prints f1 scores. It is switched to
# False further down when test.csv has more than 3 rows.
GET_CV = True
# Flag to check ram allocations (debug): when True (and GET_CV is True),
# read_dataset duplicates the train frame to double the workload.
CHECK_SUB = False

In [None]:
%%time

# Dataset for images

# Peek at the test set only to decide which mode to run in.
df = pd.read_csv('../input/shopee-product-matching/test.csv')
# If we are committing (test.csv has more than 3 rows), predict on the test
# set instead of the train set and don't compute the cv score.
if len(df) > 3:
    GET_CV = False
# The frame was only needed for its length; read_dataset reloads the data.
del df

# Function to get our f1 score
def f1_score(y_true, y_pred):
    """Row-wise F1 between two Series of space-separated id strings.

    Each string is split on whitespace and treated as a set; the score for a
    row is 2 * |true & pred| / (|true| + |pred|).

    Args:
        y_true: pandas Series of ground-truth id strings.
        y_pred: pandas Series of predicted id strings, aligned by position.

    Returns:
        numpy array with one F1 value per row.
    """
    true_sets = y_true.apply(lambda ids: set(ids.split()))
    pred_sets = y_pred.apply(lambda ids: set(ids.split()))
    paired = list(zip(true_sets, pred_sets))
    overlap = np.array([len(truth & guess) for truth, guess in paired])
    pred_sizes = pred_sets.apply(len).values
    true_sizes = true_sets.apply(len).values
    return 2 * overlap / (pred_sizes + true_sizes)


# # Function to combine predictions
# def combine_predictions(row):
#     x = np.concatenate([
#         row['image_predictions_b3'], 
#         row['image_predictions_b4'], 
#         row['text_predictions_bert1'], 
#         row['text_predictions_bert2'],
#         row['text_predictions_tfidf']
#     ])
#     return ' '.join( np.unique(x) )

# Function to voting predictions
def voter(row,
          image_models = ('b3', 'b4', 'b5', 'b6'),
          text_models = ('bert3', 'bert4')):
    """Combine per-model predictions for one row into a single match string.

    An id is kept from the image models only if its vote count equals the
    number of image models (i.e. every model predicted it), and likewise for
    the text models. Every TF-IDF prediction is kept as-is. The required vote
    count is derived from the model lists, so enabling or disabling a model
    only means editing `image_models` / `text_models`.

    Args:
        row: mapping (e.g. a DataFrame row) holding, for each model name `m`,
            an iterable of ids under 'image_predictions_<m>' or
            'text_predictions_<m>', plus 'text_predictions_tfidf'.
        image_models: names of the image models that vote.
        text_models: names of the text models that vote.

    Returns:
        The unique selected ids, sorted and joined with single spaces.
    """
    def unanimous(prefix, models):
        # No voters means no votes; also avoids np.concatenate([]) raising.
        if not models:
            return set()
        votes = Counter(np.concatenate([row[prefix + name] for name in models]))
        return {item for item, count in votes.items() if count == len(models)}

    # Plain sets avoid concatenating an empty float array with string arrays.
    merged = (unanimous('image_predictions_', image_models)
              | unanimous('text_predictions_', text_models)
              | set(row['text_predictions_tfidf']))
    return ' '.join(sorted(merged))

# Function to read out dataset
def read_dataset():
    """Load the frame to predict on, its cudf copy and the image file paths.

    With GET_CV set, train.csv is loaded and a 'matches' column is added that
    holds, for every row, the space-joined posting_ids sharing its label_group
    (CHECK_SUB additionally stacks the frame on itself). Otherwise test.csv is
    loaded unchanged.

    Returns:
        (df, df_cu, image_paths): the pandas frame, a cudf.DataFrame built
        from it, and a Series of image paths.
    """
    if not GET_CV:
        df = pd.read_csv('../input/shopee-product-matching/test.csv')
        df_cu = cudf.DataFrame(df)
        image_paths = '../input/shopee-product-matching/test_images/' + df['image']
        return df, df_cu, image_paths

    df = pd.read_csv('../input/shopee-product-matching/train.csv')
    ids_by_group = df.groupby(['label_group'])['posting_id'].unique().to_dict()
    df['matches'] = df['label_group'].map(ids_by_group).apply(lambda ids: ' '.join(ids))
    if CHECK_SUB:
        # Debug mode: double the data to exercise memory usage.
        df = pd.concat([df, df], axis = 0).reset_index(drop = True)
    df_cu = cudf.DataFrame(df)
    image_paths = '../input/shopee-product-matching/train_images/' + df['image']
    return df, df_cu, image_paths

# Function to decode our images
def decode_image(image_data):
    """Decode JPEG bytes to a 3-channel float32 image resized to IMAGE_SIZE and divided by 255."""
    pixels = tf.image.decode_jpeg(image_data, channels = 3)
    resized = tf.image.resize(pixels, IMAGE_SIZE)
    return tf.cast(resized, tf.float32) / 255.0

# Function to read our test image and return image
def read_image(image):
    """Read the file at path `image` and return it decoded by decode_image."""
    raw_bytes = tf.io.read_file(image)
    return decode_image(raw_bytes)

# Function to get our dataset that read images
def get_dataset(image):
    """Build a batched, prefetched tf.data pipeline that loads the images at the given paths.

    Args:
        image: sequence of image file paths.

    Returns:
        tf.data.Dataset yielding batches of BATCH_SIZE decoded images.
    """
    return (
        tf.data.Dataset.from_tensor_slices(image)
        .map(read_image, num_parallel_calls = AUTO)
        .batch(BATCH_SIZE)
        .prefetch(AUTO)
    )


# Load the data once; df, df_cu and image_paths are read as globals by the cells below.
df, df_cu, image_paths = read_dataset()

In [None]:
%%time

# GET IMAGE EMBEDDINGS

# Arcmarginproduct class keras layer
class ArcMarginProduct(tf.keras.layers.Layer):
    '''
    Implements large margin arc distance.

    The layer takes `[features, labels]` and returns scaled cosine logits in
    which the logit of the labelled class has the angular margin `m` applied.

    Args:
        n_classes: number of classes (columns of the weight matrix).
        s: scale applied to the output logits.
        m: angular margin added to the labelled class.
        easy_margin: if True, the margin is only applied where cosine > 0.
        ls_eps: label-smoothing factor applied to the one-hot labels.

    Reference:
        https://arxiv.org/pdf/1801.07698.pdf
        https://github.com/lyakaap/Landmark2019-1st-and-3rd-Place-Solution/
            blob/master/src/modeling/metric_learning.py
    '''
    def __init__(self, n_classes, s=30, m=0.50, easy_margin=False,
                 ls_eps=0.0, **kwargs):

        super(ArcMarginProduct, self).__init__(**kwargs)

        self.n_classes = n_classes
        self.s = s
        self.m = m
        self.ls_eps = ls_eps
        self.easy_margin = easy_margin
        # Constants of cos(theta + m) = cos(theta)cos(m) - sin(theta)sin(m)
        self.cos_m = tf.math.cos(m)
        self.sin_m = tf.math.sin(m)
        # Below this cosine, theta + m would exceed pi; `mm` is subtracted instead.
        self.th = tf.math.cos(math.pi - m)
        self.mm = tf.math.sin(math.pi - m) * m

    def get_config(self):
        '''Return the layer config including the constructor arguments.'''
        config = super().get_config().copy()
        config.update({
            'n_classes': self.n_classes,
            's': self.s,
            'm': self.m,
            'ls_eps': self.ls_eps,
            'easy_margin': self.easy_margin,
        })
        return config

    def build(self, input_shape):
        '''Create the (feature_dim, n_classes) weight; input_shape[0] is the feature input's shape.'''
        super(ArcMarginProduct, self).build(input_shape[0])

        self.W = self.add_weight(
            name='W',
            shape=(int(input_shape[0][-1]), self.n_classes),
            initializer='glorot_uniform',
            dtype='float32',
            trainable=True,
            regularizer=None)

    def call(self, inputs):
        '''Return the margin-adjusted, scaled cosine logits for `[features, labels]`.'''
        X, y = inputs
        y = tf.cast(y, dtype=tf.int32)
        cosine = tf.matmul(
            tf.math.l2_normalize(X, axis=1),
            tf.math.l2_normalize(self.W, axis=0)
        )
        # Clamp before the square root: rounding can push cosine**2 slightly
        # above 1, which would otherwise produce NaN.
        sine = tf.math.sqrt(
            tf.clip_by_value(1.0 - tf.math.pow(cosine, 2), 0.0, 1.0)
        )
        phi = cosine * self.cos_m - sine * self.sin_m
        if self.easy_margin:
            phi = tf.where(cosine > 0, phi, cosine)
        else:
            phi = tf.where(cosine > self.th, phi, cosine - self.mm)
        one_hot = tf.cast(
            tf.one_hot(y, depth=self.n_classes),
            dtype=cosine.dtype
        )
        if self.ls_eps > 0:
            one_hot = (1 - self.ls_eps) * one_hot + self.ls_eps / self.n_classes

        # Margin logit for the labelled class, plain cosine for the others.
        output = (one_hot * phi) + ((1.0 - one_hot) * cosine)
        output *= self.s
        return output


# Function to get the embeddings of our images with the fine-tuned model
def get_image_embeddings(image_paths, 
                         model_type = 'b3',
                         model_path = '../input/shopee-efficientnetb3-arcmarginproduct-full/EfficientNetB3_M0.5_512_1024.h5'):
    """Embed every image in `image_paths` with a fine-tuned EfficientNet.

    The training-time graph (backbone -> global average pooling -> ArcMargin
    head -> softmax) is rebuilt so the saved weights can be loaded, then cut
    back to an intermediate layer that is used as the embedding.

    Args:
        image_paths: sliceable sequence of image file paths.
        model_type: 'b0' .. 'b7'; any other value falls back to EfficientNetB0.
        model_path: weights file matching `model_type`.

    Returns:
        numpy array with one embedding row per image path, in input order.
    """
    embeds = []

    margin = ArcMarginProduct(
            n_classes = N_CLASSES, 
            s = 30, 
            m = 0.5, 
            name='head/arc_margin', 
            dtype='float32'
            )

    inp = tf.keras.layers.Input(shape = (*IMAGE_SIZE, 3), name = 'inp1')
    label = tf.keras.layers.Input(shape = (), name = 'inp2')

    # Resolve the backbone class by name; unknown types use B0.
    known_types = ('b0', 'b1', 'b2', 'b3', 'b4', 'b5', 'b6', 'b7')
    variant = model_type if model_type in known_types else 'b0'
    backbone = getattr(efn, 'EfficientNet' + variant.upper())
    x = backbone(weights = None, include_top = False)(inp)
    x = tf.keras.layers.GlobalAveragePooling2D()(x)
    x = margin([x, label])

    output = tf.keras.layers.Softmax(dtype='float32')(x)

    model = tf.keras.models.Model(inputs = [inp, label], outputs = [output])
    model.load_weights(model_path)
    # Keep only the image input and cut the graph at layers[-4]
    # (presumably the pooled backbone features -- TODO confirm the layer order).
    model = tf.keras.models.Model(inputs = model.input[0], outputs = model.layers[-4].output)

    # Predict in chunks of paths. The chunk count comes from the argument
    # itself rather than from the global `df`.
    chunk = 5000
    for start in range(0, len(image_paths), chunk):
        image_dataset = get_dataset(image_paths[start:start + chunk])
        embeds.append(model.predict(image_dataset))
    del model
    image_embeddings = np.concatenate(embeds)
    print(f'Our image embeddings shape is {image_embeddings.shape}')
    del embeds
    gc.collect()
    return image_embeddings

# Compute embeddings with each enabled EfficientNet variant. The variants left
# commented out are disabled; the enabled set (b3-b6) matches the columns that
# voter reads.
# image_embeddings_b7 = get_image_embeddings(image_paths,
#                                            model_type = 'b7',
#                                            model_path = '../input/shopee-efficientnetb7-arcmarginproduct-full/EfficientNetB7_M0.5_512_1029.h5')
image_embeddings_b6 = get_image_embeddings(image_paths,
                                           model_type = 'b6',
                                           model_path = '../input/shopee-efficientnetb6-arcmarginproduct-full/EfficientNetB6_M0.5_512_520.h5')
image_embeddings_b5 = get_image_embeddings(image_paths,
                                           model_type = 'b5',
                                           model_path = '../input/shopee-efficientnetb5-arcmarginproduct-full/EfficientNetB5_M0.5_512_1314.h5')
image_embeddings_b4 = get_image_embeddings(image_paths,
                                           model_type = 'b4',
                                           model_path = '../input/shopee-efficientnetb4-arcmarginproduct-full/EfficientNetB4_M0.5_512_2021.h5')
image_embeddings_b3 = get_image_embeddings(image_paths,
                                           model_type = 'b3',
                                           model_path = '../input/shopee-efficientnetb3-arcmarginproduct-full/EfficientNetB3_M0.5_512_1024.h5')
# image_embeddings_b2 = get_image_embeddings(image_paths,
#                                            model_type = 'b2',
#                                            model_path = '../input/shopee-efficientnetb2-arcmarginproduct-full/EfficientNetB2_M0.5_512_666.h5')
# image_embeddings_b1 = get_image_embeddings(image_paths,
#                                            model_type = 'b1',
#                                            model_path = '../input/shopee-efficientnetb1-arcmarginproduct-full/EfficientNetB1_M0.5_512_42.h5')
# image_embeddings_b0 = get_image_embeddings(image_paths,
#                                            model_type = 'b0',
#                                            model_path = '../input/shopee-efficientnetb0-arcmarginproduct-full/EfficientNetB0_M0.5_512_777.h5')


# Release whatever the calls above left behind before the text models load.
gc.collect()

In [None]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import transformers
from transformers import (BertTokenizer, BertModel,
                          DistilBertTokenizer, DistilBertModel)

# For pytorch bert
class CFG:
    # Settings shared by the PyTorch text pipeline below.

    # Device the BERT models and batches are moved to: CUDA when available.
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

    # DataLoader settings used in get_text_embeddings.
    num_workers = 1
    batch_size = 64

    # Maximum token length passed to the tokenizer.
    max_length = 60

    # Default feature size and class count of Model's ArcMargin head.
    bert_hidden_size = 768
    N_CLASSES = 11014
    
    
class ArcMarginProduct(nn.Module):
    r"""Implement of large margin arc distance: :
        Args:
            in_features: size of each input sample
            out_features: size of each output sample
            s: norm of input feature
            m: margin
            easy_margin: if True, apply the margin only where cosine > 0
            cos(theta + m)

        NOTE: this class reuses the name of the Keras ArcMarginProduct layer
        defined earlier in this notebook; once this cell has run, the name
        refers to this PyTorch module.
        """
    def __init__(self, in_features, out_features, s=30.0, m=0.50, easy_margin=False):
        super(ArcMarginProduct, self).__init__()
        self.in_features = in_features
        self.out_features = out_features
        self.s = s
        self.m = m
        # One weight row per class; initialised by xavier_uniform_ below.
        self.weight = nn.Parameter(torch.empty(out_features, in_features))
        nn.init.xavier_uniform_(self.weight)

        self.easy_margin = easy_margin
        # Constants of cos(theta + m) = cos(theta)cos(m) - sin(theta)sin(m)
        self.cos_m = math.cos(m)
        self.sin_m = math.sin(m)
        # Below this cosine, theta + m would exceed pi; `mm` is subtracted instead.
        self.th = math.cos(math.pi - m)
        self.mm = math.sin(math.pi - m) * m

    def forward(self, input, label):
        """Return scaled cosine logits with the margin applied to each row's labelled class.

        Args:
            input: (batch, in_features) feature tensor.
            label: tensor with one class index per row.

        Returns:
            (batch, out_features) tensor of logits multiplied by `s`.
        """
        # --------------------------- cos(theta) & phi(theta) ---------------------------
        cosine = F.linear(F.normalize(input), F.normalize(self.weight))
        sine = torch.sqrt((1.0 - torch.pow(cosine, 2)).clamp(0, 1))
        phi = cosine * self.cos_m - sine * self.sin_m
        if self.easy_margin:
            phi = torch.where(cosine > 0, phi, cosine)
        else:
            phi = torch.where(cosine > self.th, phi, cosine - self.mm)
        # --------------------------- convert label to one-hot ---------------------------
        # Allocate on the same device as the logits rather than on a global
        # device, so the module works wherever its inputs live.
        one_hot = torch.zeros(cosine.size(), device=cosine.device)
        one_hot.scatter_(1, label.view(-1, 1).long(), 1)
        # Margin logit for the labelled class, plain cosine for the others.
        output = (one_hot * phi) + ((1.0 - one_hot) * cosine)
        output *= self.s

        return output
    
    
class Model(nn.Module):
    """BERT-style encoder with an ArcMargin classification head.

    Args:
        bert_model: encoder called with `input_ids` and `attention_mask`.
        num_classes: number of output classes of the ArcMargin head.
        last_hidden_size: feature size fed to the ArcMargin head.
    """
    def __init__(self,
                 bert_model,
                 num_classes=CFG.N_CLASSES,
                 last_hidden_size=CFG.bert_hidden_size):
        super().__init__()
        self.bert_model = bert_model
        self.arc_margin = ArcMarginProduct(
            last_hidden_size, num_classes, s=30.0, m=0.50, easy_margin=False
        )

    def _encode(self, batch):
        """Run the encoder on the batch's token ids and attention mask."""
        return self.bert_model(
            input_ids=batch['input_ids'],
            attention_mask=batch['attention_mask'],
        )

    def get_bert_features(self, batch):
        """Return the last hidden state at the first token position.

        The last hidden state is (batch_size, seq_length, bert_hidden_dim).
        """
        return self._encode(batch).last_hidden_state[:, 0, :]

    def get_first_last_avg(self, batch):
        """Return the mean of the first-token states of `last_hidden_state` and `hidden_states[1]`."""
        encoded = self._encode(batch)
        last_cls = encoded.last_hidden_state[:, 0, :]
        first_cls = encoded.hidden_states[1][:, 0, :]
        return (last_cls + first_cls) / 2

    def forward(self, batch):
        """Return ArcMargin logits for the batch; needs `batch['labels']`."""
        features = self.get_bert_features(batch)
        return self.arc_margin(features, batch['labels'])

In [None]:
# Dataset for text

class TextDataset(torch.utils.data.Dataset):
    """Tokenised titles (and, outside test mode, encoded labels) for a DataLoader.

    Args:
        dataframe: frame with a 'title' column, plus 'label_group' unless
            mode is "test". The frame is not modified.
        tokenizer: callable taking the list of titles with `padding`,
            `truncation` and `max_length` keywords and returning a mapping of
            per-example sequences.
        mode: "test" disables labels; any other value enables them.
        max_length: maximum token length forwarded to the tokenizer.
    """
    def __init__(self, dataframe, tokenizer, mode="train", max_length=None):
        self.dataframe = dataframe
        self.mode = mode
        if mode != "test":
            # Encode each label_group as its rank among the sorted unique
            # groups (the same 0..n-1 codes a label encoder yields), without
            # writing a new column into the caller's dataframe.
            _, label_codes = np.unique(dataframe['label_group'].values,
                                       return_inverse=True)
            self.targets = label_codes
        texts = [str(title) for title in dataframe['title'].values]
        self.encodings = tokenizer(texts, 
                                   padding=True, 
                                   truncation=True, 
                                   max_length=max_length)

    def __getitem__(self, idx):
        """Return a dict of tensors for example `idx`, with 'labels' added outside test mode."""
        # One tensor per tokenizer output key (e.g. input ids, attention mask).
        item = {key: torch.tensor(values[idx]) for key, values in self.encodings.items()}
        # when testing, there are no targets so we won't do the following
        if self.mode != "test":
            item['labels'] = torch.tensor(self.targets[idx]).long()
        return item

    def __len__(self):
        """Number of examples (rows of the dataframe)."""
        return len(self.dataframe)

In [None]:
def get_text_embeddings(model_type = 'distil',
                        model_name = '../input/distilbert-base-indonesian', 
                        model_path = '../input/indonesian-distilbert-finetuning-with-arcmargin/final.pt',
                        emb_type = 'last'):
    """Embed every title in the global `df` with a fine-tuned (Distil)BERT.

    Args:
        model_type: 'distil' selects the DistilBert classes; any other value
            selects the Bert classes.
        model_name: path of the pretrained tokenizer / encoder.
        model_path: path of the fine-tuned state dict loaded into Model.
        emb_type: 'first_last_avg' uses Model.get_first_last_avg; any other
            value (including 'last') uses Model.get_bert_features.

    Returns:
        numpy array with one embedding row per row of `df`, in order.
    """
    if model_type == 'distil':
        tokenizer_cls, encoder_cls = DistilBertTokenizer, DistilBertModel
    else:
        tokenizer_cls, encoder_cls = BertTokenizer, BertModel
    tokenizer = tokenizer_cls.from_pretrained(model_name)
    bert_model = encoder_cls.from_pretrained(model_name, output_hidden_states=True)

    dataset = TextDataset(df,
                          tokenizer,
                          mode='train' if GET_CV else 'test',
                          max_length=CFG.max_length)

    dataloader = torch.utils.data.DataLoader(dataset, 
                                             batch_size=CFG.batch_size, 
                                             num_workers=CFG.num_workers, 
                                             shuffle=False)
    # Show the shape of the first batch as a sanity check.
    batch = next(iter(dataloader))
    print(batch['input_ids'].shape)

    model = Model(bert_model).to(CFG.device)
    model.load_state_dict(torch.load(model_path, map_location=CFG.device))
    model.eval()

    if emb_type == 'first_last_avg':
        extract = model.get_first_last_avg
    else:
        extract = model.get_bert_features

    embeds = []
    with torch.no_grad():
        for batch in tqdm(dataloader, total=len(dataloader)):
            batch = {k: v.to(CFG.device) for k, v in batch.items()}
            embeds.append(extract(batch).cpu())

    # Concatenate as tensors and convert explicitly, instead of relying on
    # numpy to coerce a list of torch tensors.
    text_embeddings = torch.cat(embeds).numpy()
    print(f'Our text embeddings shape is {text_embeddings.shape}')

    # Drop every reference to the model (including the encoder and the bound
    # method) so its memory can be reclaimed, then hand PyTorch's cached GPU
    # blocks back for the RAPIDS steps that follow.
    del model, bert_model, extract, batch, embeds
    gc.collect()
    if torch.cuda.is_available():
        torch.cuda.empty_cache()

    return text_embeddings

In [None]:
%%time

# Compute text embeddings. bert1/bert2 (emb_type 'last') are disabled; only
# the 'first_last_avg' variants bert3/bert4, which voter reads, are computed.
# text_embeddings_bert1 = get_text_embeddings(
#     model_type = 'bert',
#     model_name = '../input/bert-base-indonesian-522m',
#     model_path = '../input/indonesian-bert-finetuning-with-arcmargin-full/final.pt',
#     emb_type = 'last'
# )

# text_embeddings_bert2 = get_text_embeddings(
#     model_type = 'distil',
#     model_name = '../input/distilbert-base-indonesian',
#     model_path = '../input/indonesian-distilbert-finetuning-with-arcmargin/final.pt',
#     emb_type = 'last'
# )

# bert3: BERT checkpoint, first/last hidden-state average.
text_embeddings_bert3 = get_text_embeddings(
    model_type = 'bert',
    model_name = '../input/bert-base-indonesian-522m',
    model_path = '../input/indonesian-bert-finetuning-with-arcmargin-full/final.pt',
    emb_type = 'first_last_avg'
)

# bert4: DistilBERT checkpoint, first/last hidden-state average.
text_embeddings_bert4 = get_text_embeddings(
    model_type = 'distil',
    model_name = '../input/distilbert-base-indonesian',                                  
    model_path = '../input/indonesian-distilbert-finetuning-with-arcmargin/final.pt',
    emb_type = 'first_last_avg'
)

In [None]:
# TF-IDF

# NOTE(review): TfidfVectorizer is already imported in the first cell; this
# repeated import is redundant.
from cuml.feature_extraction.text import TfidfVectorizer

# Binary TF-IDF over the titles, limited to 25000 features, converted to a
# dense array for the matrix products in the next cell.
model = TfidfVectorizer(stop_words=None, binary=True, max_features=25000)
text_embeddings2 = model.fit_transform(df_cu.title).toarray()
print('text embeddings shape',text_embeddings2.shape)

In [None]:
# For every title, collect the posting_ids whose TF-IDF similarity to it
# exceeds 0.775, working through the rows CHUNK at a time.
preds = []
CHUNK = 1024*4

print('Finding similar titles...')
n_titles = len(df_cu)
for a in range(0, n_titles, CHUNK):
    b = min(a + CHUNK, n_titles)
    print('chunk',a,'to', b)

    # Similarity of the rows in this chunk against every title
    # (presumably cosine similarity, if the TF-IDF rows are unit-normalised).
    cts = cupy.matmul(text_embeddings2, text_embeddings2[a:b].T).T

    for k in range(b - a):
        close_rows = cupy.where(cts[k,]>0.775)[0]
        matched_ids = df_cu.iloc[cupy.asnumpy(close_rows)].posting_id.to_pandas().values
        preds.append(matched_ids)

# The vectorizer and the dense matrix are not needed past this point.
del model, text_embeddings2

df_cu['oof_text'] = preds

In [None]:
# Helpers to get the nearest neighbors of each embedding and apply a distance threshold
def _neighbors_within(posting_ids, distances, indices, threshold, rows):
    """For each row k in `rows`, return the posting_ids of its neighbours closer than `threshold`."""
    return [
        posting_ids[indices[k, np.where(distances[k,] < threshold)[0]]]
        for k in rows
    ]


def get_neighbors(df, embeddings, KNN = 50, model_type = 'b3'):
    """Find each row's nearest neighbours and keep those under the model's distance threshold.

    When GET_CV is set, a range of thresholds is also swept and the f1 score
    of each is printed (writing the 'pred_matches' and 'f1' columns of `df`);
    this is diagnostic only. The returned predictions always use the fixed
    threshold from `model_type_threshold` (20.0 for unknown model types).

    Args:
        df: frame with a 'posting_id' column (and 'matches' when GET_CV is
            set), aligned by position with `embeddings`.
        embeddings: 2-D array with one embedding per row of `df`.
        KNN: number of neighbours to retrieve; capped at the number of rows
            so that small inputs do not ask for more neighbours than exist.
        model_type: key used to pick the threshold (and the sweep range).

    Returns:
        (df, predictions) where predictions[k] is an array of the posting_ids
        matched to row k.
    """
    n_samples = embeddings.shape[0]
    model = NearestNeighbors(n_neighbors = min(KNN, n_samples))
    model.fit(embeddings)
    distances, indices = model.kneighbors(embeddings)
    # Hoisted once: positional lookup on this array replaces a per-row .iloc.
    posting_ids = df['posting_id'].values

    # Iterate through different thresholds to maximize cv; run this in
    # interactive mode, then update model_type_threshold with the best value.
    if GET_CV:
        if model_type in ['b0', 'b1', 'b2', 'b3', 'b4', 'b5', 'b6', 'b7']:
            thresholds = list(np.arange(3.0, 5.0, 0.1))
        elif model_type in ['bert1', 'bert2']:
            thresholds = list(np.arange(15, 35, 1))
        else:
            thresholds = list(np.arange(6, 18, 1))
        scores = []
        for threshold in thresholds:
            matched = _neighbors_within(posting_ids, distances, indices,
                                        threshold, range(n_samples))
            df['pred_matches'] = [' '.join(ids) for ids in matched]
            df['f1'] = f1_score(df['matches'], df['pred_matches'])
            score = df['f1'].mean()
            print(f'Our f1 score for threshold {threshold} is {score}')
            scores.append(score)
        thresholds_scores = pd.DataFrame({'thresholds': thresholds, 'scores': scores})
        # idxmax returns the first row holding the maximum score.
        best_row = thresholds_scores['scores'].idxmax()
        best_threshold = thresholds_scores.loc[best_row, 'thresholds']
        best_score = thresholds_scores.loc[best_row, 'scores']
        print(f'Our best score is {best_score} and has a threshold {best_threshold}')

    # Final predictions use the fixed per-model threshold in both modes.
    # Because we are predicting the test set that have 70K images and
    # different label groups, confidence should be smaller.
    final_threshold = model_type_threshold.get(model_type, 20.0)
    # A progress bar is shown only outside CV mode.
    rows = range(n_samples) if GET_CV else tqdm(range(n_samples))
    predictions = _neighbors_within(posting_ids, distances, indices,
                                    final_threshold, rows)

    del model, distances, indices
    gc.collect()
    return df, predictions

In [None]:
# Get neighbors for image_embeddings
# Up to 100 neighbours per image for each enabled EfficientNet variant; the
# commented-out variants are disabled, matching the embeddings computed above.
# df, image_predictions_b7 = get_neighbors(df, image_embeddings_b7, KNN = 100, model_type = 'b7')
df, image_predictions_b6 = get_neighbors(df, image_embeddings_b6, KNN = 100, model_type = 'b6')
df, image_predictions_b5 = get_neighbors(df, image_embeddings_b5, KNN = 100, model_type = 'b5')
df, image_predictions_b4 = get_neighbors(df, image_embeddings_b4, KNN = 100, model_type = 'b4')
df, image_predictions_b3 = get_neighbors(df, image_embeddings_b3, KNN = 100, model_type = 'b3')
# df, image_predictions_b2 = get_neighbors(df, image_embeddings_b2, KNN = 100, model_type = 'b2')
# df, image_predictions_b1 = get_neighbors(df, image_embeddings_b1, KNN = 100, model_type = 'b1')
# df, image_predictions_b0 = get_neighbors(df, image_embeddings_b0, KNN = 100, model_type = 'b0')

In [None]:
# Get neighbors for text_embeddings
# Up to 100 neighbours per title for the enabled text models (bert3, bert4);
# bert1 and bert2 are disabled.
# df, text_predictions_bert1 = get_neighbors(df, text_embeddings_bert1, KNN = 100, model_type = 'bert1')
# df, text_predictions_bert2 = get_neighbors(df, text_embeddings_bert2, KNN = 100, model_type = 'bert2')
df, text_predictions_bert3 = get_neighbors(df, text_embeddings_bert3, KNN = 100, model_type = 'bert3')
df, text_predictions_bert4 = get_neighbors(df, text_embeddings_bert4, KNN = 100, model_type = 'bert4')

In [None]:
# Attach every model's predictions to df, vote them into the final matches
# and write the submission file. In CV mode the final f1 score is printed too.
prediction_columns = {
    'image_predictions_b3': image_predictions_b3,
    'image_predictions_b4': image_predictions_b4,
    'image_predictions_b5': image_predictions_b5,
    'image_predictions_b6': image_predictions_b6,
    'text_predictions_bert3': text_predictions_bert3,
    'text_predictions_bert4': text_predictions_bert4,
    'text_predictions_tfidf': df_cu['oof_text'].to_pandas().values,
}
for column_name, column_values in prediction_columns.items():
    df[column_name] = column_values

voted_matches = df.apply(voter, axis = 1)
if GET_CV:
    # Score against the ground truth before 'matches' is overwritten below.
    df['pred_matches'] = voted_matches
    df['f1'] = f1_score(df['matches'], df['pred_matches'])
    score = df['f1'].mean()
    print(f'Our final f1 cv score is {score}')
df['matches'] = voted_matches
df[['posting_id', 'matches']].to_csv('submission.csv', index = False)

In [None]:
# Summary statistics of how many ids each row's final 'matches' string contains.
tmp = df[['matches']].copy()
tmp['item_count'] = tmp['matches'].apply(lambda x: len(x.split()))
tmp['item_count'].describe()

In [None]:
# Shape of the subset of rows whose 'matches' holds exactly one id.
tmp[tmp['item_count'] == 1].shape

In [None]:
# Preview the first lines of the written submission file.
!head submission.csv