# Imports

In [None]:
!pip install ../input/shopee-external-models/Keras_Applications-1.0.8-py3-none-any.whl
!pip install ../input/shopee-external-models/efficientnet-1.1.0-py3-none-any.whl
import numpy as np
import pandas as pd
import gc
import matplotlib.pyplot as plt
import cudf
import cuml
import cupy
from cuml.feature_extraction.text import TfidfVectorizer
from cuml.neighbors import NearestNeighbors
import tensorflow as tf
import efficientnet.tfkeras as efn

from tqdm.notebook import tqdm
import math
from shutil import copyfile
import tensorflow_hub as hub
# from shutil import copyfile
# copyfile(src = "../input/bert-baseline/tokenization.py", dst = "../working/tokenization.py")
# import tokenization
# import tensorflow_hub as hub

# Configurations

In [None]:
# Let tf.data choose parallelism / prefetch depth automatically
# (used by get_dataset for both map() and prefetch()).
AUTO = tf.data.experimental.AUTOTUNE

# Inference configuration
# Batch size used when batching the image dataset and when predicting text chunks.
BATCH_SIZE = 8
# Target [height, width] that decode_image resizes every image to.
IMAGE_SIZE = [512, 512]
# Random seed (not referenced by the code visible in this section).
SEED = 42
# Verbosity level (not referenced by the code visible in this section).
VERBOSE = 1
# Default number of label classes. NOTE: get_image_embeddings assigns its own
# local N_CLASSES per backbone, so this module-level value is not what it uses.
N_CLASSES = 11014

In [None]:
# Cap TensorFlow at LIMIT GB of GPU memory (memory_limit below is passed as
# 1024 * LIMIT, i.e. in MB) so that the rest of the GPU stays free for RAPIDS
# (cudf / cuml / cupy).
LIMIT = 4.0
gpus = tf.config.experimental.list_physical_devices('GPU')
if gpus:
    try:
        # Only the first physical GPU is configured.
        tf.config.experimental.set_virtual_device_configuration(
            gpus[0],
            [tf.config.experimental.VirtualDeviceConfiguration(memory_limit=1024*LIMIT)])
        logical_gpus = tf.config.experimental.list_logical_devices('GPU')
        #print(len(gpus), "Physical GPUs,", len(logical_gpus), "Logical GPUs")
    except RuntimeError as e:
        # Presumably raised when the GPU was already initialised before this
        # cell ran (virtual devices must be configured first) - the error is
        # only printed, so the run continues without the cap.
        print(e)
print('We will restrict TensorFlow to max %iGB GPU RAM'%LIMIT)
# The figure printed below assumes a 16GB GPU.
print('then RAPIDS can use %iGB GPU RAM'%(16-LIMIT))

# Getting Embeddings

In [None]:
# When True, read_dataset loads the training folds and get_neighbors sweeps
# thresholds to report a cross-validation F1 score.
GET_CV = False
# Debug flag: when True (and GET_CV is True) read_dataset duplicates the
# training frame to check memory allocations.
CHECK_SUB = False

df = pd.read_csv('../input/shopee-product-matching/test.csv')
# A test.csv with more than 3 rows is taken to mean a real submission run:
# predict on the test set and never compute the CV score.
# NOTE(review): GET_CV is already False above, so this guard only matters
# when the flag is flipped to True by hand.
if len(df) > 3:
    GET_CV = False
del df
# Function to get our f1 score
def f1_score(y_true, y_pred):
    """Compute the row-wise F1 score between two Series of id strings.

    Each element of ``y_true`` / ``y_pred`` is a whitespace-separated string
    of ids; it is turned into a set and the score for a row is
    ``2 * |true & pred| / (|true| + |pred|)``.

    Returns:
        A NumPy array with one score per row.
    """
    true_sets = y_true.apply(lambda text: set(text.split()))
    pred_sets = y_pred.apply(lambda text: set(text.split()))

    # Size of the overlap for every (true, predicted) pair, in row order.
    overlap = np.array([len(truth & guess) for truth, guess in zip(true_sets, pred_sets)])

    pred_sizes = pred_sets.apply(len).values
    true_sizes = true_sets.apply(len).values
    return 2 * overlap / (pred_sizes + true_sizes)

# Function to combine predictions
#row['image_predictions_1'], row['image_predictions_2'], row['image_predictions_3'],row['image_predictions_4']
def combine_predictions(row):
    """Merge a row's image and text matches into one string.

    Concatenates ``row['image_predictions']`` and ``row['text_predictions']``
    and returns the unique ids (sorted by ``np.unique``) joined by spaces.
    """
    merged = np.concatenate((row['image_predictions'], row['text_predictions']))
    unique_ids = np.unique(merged)
    return ' '.join(unique_ids)

def combine_predictions_test(row):
    """Merge all image and text prediction columns of a row into one string.

    Reads the three ``image_predictions_*`` and two ``text_predictions_*``
    entries of ``row``, concatenates them, and returns the unique ids
    (sorted by ``np.unique``) joined by spaces.
    """
    prediction_columns = (
        'image_predictions_0',
        'image_predictions_1',
        'image_predictions_2',
        'text_predictions_0',
        'text_predictions_1',
    )
    stacked = np.concatenate([row[column] for column in prediction_columns])
    return ' '.join(np.unique(stacked))

# Function to read out dataset
def read_dataset(fold):
    """Load the frame to predict on and the matching image paths.

    When the module-level ``GET_CV`` flag is False the test CSV is read.
    Otherwise the training CSV is read and, unless ``fold`` is 10, filtered
    to rows whose ``fold`` column equals ``fold``; with ``CHECK_SUB`` set the
    frame is additionally stacked on itself and re-indexed.

    Returns:
        (pandas DataFrame, cudf DataFrame built from it, Series of image paths)
    """
    if not GET_CV:
        frame = pd.read_csv('../input/shopee-product-matching/test.csv')
        gpu_frame = cudf.DataFrame(frame)
        paths = '../input/shopee-product-matching/test_images/' + frame['image']
        return frame, gpu_frame, paths

    frame = pd.read_csv('../input/shopee-384x384-tfrecs/train.csv')
    # fold == 10 is the sentinel for "use every fold".
    if fold != 10:
        frame = frame.loc[frame['fold'] == fold]

    if CHECK_SUB:
        frame = pd.concat([frame, frame], axis=0)
        frame.reset_index(drop=True, inplace=True)

    gpu_frame = cudf.DataFrame(frame)
    paths = '../input/shopee-product-matching/train_images/' + frame['image_name']
    return frame, gpu_frame, paths

# Function to decode our images
def decode_image(image_data):
    """Decode JPEG bytes into a 3-channel float32 tensor.

    The image is resized to ``IMAGE_SIZE`` and its values are divided by 255.
    """
    rgb = tf.image.decode_jpeg(image_data, channels=3)
    resized = tf.image.resize(rgb, IMAGE_SIZE)
    return tf.cast(resized, tf.float32) / 255.0

# Function to read our test image and return image
def read_image(image):
    """Read the file at path ``image`` and return it decoded by ``decode_image``."""
    raw_bytes = tf.io.read_file(image)
    return decode_image(raw_bytes)

# Function to get our dataset that read images
def get_dataset(image):
    """Build a batched, prefetched tf.data pipeline over image paths.

    ``image`` is sliced element-wise, each path is loaded with ``read_image``
    in parallel, and the result is batched by ``BATCH_SIZE``.
    """
    return (
        tf.data.Dataset.from_tensor_slices(image)
        .map(read_image, num_parallel_calls=AUTO)
        .batch(BATCH_SIZE)
        .prefetch(AUTO)
    )

# Arcmarginproduct class keras layer
class ArcMarginProduct(tf.keras.layers.Layer):
    '''
    Implements large margin arc distance.

    The layer takes ``[X, y]`` (features and integer class labels), computes
    the cosine between the L2-normalised features and the L2-normalised class
    weight columns, applies the additive angular margin ``m`` to the target
    class and scales the result by ``s``.

    Reference:
        https://arxiv.org/pdf/1801.07698.pdf
        https://github.com/lyakaap/Landmark2019-1st-and-3rd-Place-Solution/
            blob/master/src/modeling/metric_learning.py

    Args:
        n_classes: number of classes (columns of the weight matrix).
        s: scale applied to the final logits.
        m: angular margin in radians.
        easy_margin: if True, the margin is only applied where cosine > 0.
        ls_eps: label-smoothing epsilon applied to the one-hot targets.
    '''
    def __init__(self, n_classes, s=30, m=0.50, easy_margin=False,
                 ls_eps=0.0, **kwargs):

        super(ArcMarginProduct, self).__init__(**kwargs)

        self.n_classes = n_classes
        self.s = s
        self.m = m
        self.ls_eps = ls_eps
        self.easy_margin = easy_margin
        # Pre-computed terms of cos(theta + m) = cos(theta)cos(m) - sin(theta)sin(m).
        self.cos_m = tf.math.cos(m)
        self.sin_m = tf.math.sin(m)
        # Beyond this cosine, theta + m would exceed pi; `mm` is the fallback
        # penalty subtracted from the cosine in that case.
        self.th = tf.math.cos(math.pi - m)
        self.mm = tf.math.sin(math.pi - m) * m

    def get_config(self):
        """Return the layer config including the constructor arguments."""
        config = super().get_config().copy()
        config.update({
            'n_classes': self.n_classes,
            's': self.s,
            'm': self.m,
            'ls_eps': self.ls_eps,
            'easy_margin': self.easy_margin,
        })
        return config

    def build(self, input_shape):
        """Create the class weight matrix; ``input_shape[0]`` is the feature shape."""
        super(ArcMarginProduct, self).build(input_shape[0])

        self.W = self.add_weight(
            name='W',
            shape=(int(input_shape[0][-1]), self.n_classes),
            initializer='glorot_uniform',
            dtype='float32',
            trainable=True,
            regularizer=None)

    def call(self, inputs):
        """Return the scaled margin logits for ``inputs = [X, y]``."""
        X, y = inputs
        y = tf.cast(y, dtype=tf.int32)
        cosine = tf.matmul(
            tf.math.l2_normalize(X, axis=1),
            tf.math.l2_normalize(self.W, axis=0)
        )
        # Floating-point rounding can push |cosine| marginally above 1, which
        # would make the sqrt argument negative and produce NaN; clip it.
        sine = tf.math.sqrt(
            tf.clip_by_value(1.0 - tf.math.pow(cosine, 2), 0.0, 1.0)
        )
        phi = cosine * self.cos_m - sine * self.sin_m
        if self.easy_margin:
            phi = tf.where(cosine > 0, phi, cosine)
        else:
            phi = tf.where(cosine > self.th, phi, cosine - self.mm)
        one_hot = tf.cast(
            tf.one_hot(y, depth=self.n_classes),
            dtype=cosine.dtype
        )
        if self.ls_eps > 0:
            one_hot = (1 - self.ls_eps) * one_hot + self.ls_eps / self.n_classes

        # Margin logit for the target class, plain cosine for all others.
        output = (one_hot * phi) + ((1.0 - one_hot) * cosine)
        output *= self.s
        return output


# Function to get the embeddings of our images with the fine-tuned model
def _build_backbone(name):
    """Return a headless, randomly initialised backbone model for ``name``.

    EfficientNetB0-B7 come from the ``efficientnet.tfkeras`` package (``efn``);
    DenseNet201, ResNet101V2 and InceptionResNetV2 come from
    ``tf.keras.applications``.

    Raises:
        ValueError: if ``name`` is not one of the supported backbones.
    """
    efficientnet_names = {f'EfficientNetB{i}' for i in range(8)}
    keras_application_names = ('DenseNet201', 'ResNet101V2', 'InceptionResNetV2')
    if name in efficientnet_names:
        constructor = getattr(efn, name)
    elif name in keras_application_names:
        constructor = getattr(tf.keras.applications, name)
    else:
        raise ValueError(f'Unsupported backbone name: {name!r}')
    return constructor(weights = None, include_top = False)


# Function to get the embeddings of our images with the fine-tuned model
def get_image_embeddings(model_names, image_paths, model_path):
    """Compute image embeddings with a fine-tuned ArcFace model.

    The training-time architecture (backbone(s) -> global average pooling ->
    optional concatenation -> ArcMarginProduct -> softmax) is rebuilt so the
    checkpoint can be loaded, then the model is cut before the margin head and
    run over ``image_paths`` in chunks of 5000.

    Args:
        model_names: list with one or two backbone names; with exactly two
            names both backbones share the input and their pooled features
            are concatenated.
        image_paths: sliceable collection of image file paths.
        model_path: path of the weights file passed to ``load_weights``.

    Returns:
        NumPy array of embeddings, one row per image path.

    Raises:
        ValueError: if a backbone name is not supported.
    """
    # The EfficientNetB3 checkpoint is rebuilt with an 11011-class head,
    # every other checkpoint with 11014.
    n_classes = 11011 if model_names[0] == 'EfficientNetB3' else 11014

    margin = ArcMarginProduct(
            n_classes = n_classes,
            s = 30,
            m = 0.5,
            name='head/arc_margin',
            dtype='float32'
            )

    inp = tf.keras.layers.Input(shape = (*IMAGE_SIZE, 3), name = 'inp1')
    label = tf.keras.layers.Input(shape = (), name = 'inp2')

    x = _build_backbone(model_names[0])(inp)
    x = tf.keras.layers.GlobalAveragePooling2D()(x)

    if len(model_names) == 2:
        x1 = _build_backbone(model_names[1])(inp)
        x1 = tf.keras.layers.GlobalAveragePooling2D()(x1)
        x = tf.keras.layers.Concatenate()([x, x1])

    x = margin([x, label])

    output = tf.keras.layers.Softmax(dtype='float32')(x)

    model = tf.keras.models.Model(inputs = [inp, label], outputs = [output])
    model.load_weights(model_path)
    # Keep only the image input and cut the network four layers from the end.
    # NOTE(review): this relies on layers[-4] being the pooled / concatenated
    # feature layer that feeds the margin head - confirm if the graph changes.
    model = tf.keras.models.Model(inputs = model.input[0], outputs = model.layers[-4].output)

    chunk = 5000
    embeds = []
    # Chunk over the paths actually passed in (not a module-level frame).
    for start in tqdm(range(0, len(image_paths), chunk)):
        image_dataset = get_dataset(image_paths[start:start + chunk])
        embeds.append(model.predict(image_dataset))
    del model
    image_embeddings = np.concatenate(embeds)
    print(f'Our image embeddings shape is {image_embeddings.shape}')
    del embeds
    gc.collect()
    return image_embeddings


def get_text_predictions(df, post_dict=None, max_features = 25_000):
    """Match postings whose TF-IDF title vectors have similarity above 0.75.

    Titles are vectorised on the GPU with a binary TF-IDF model, then the
    similarity matrix is computed chunk by chunk to bound memory use.

    Args:
        df: pandas DataFrame with ``title`` and ``posting_id`` columns.
        post_dict: optional dict updated in place; for every posting id ``p``
            it collects the row positions whose prediction list contains ``p``.
            A new dict is created when omitted.
        max_features: vocabulary size cap for the TF-IDF vectoriser.

    Returns:
        (preds, post_dict) where ``preds[k]`` is the array of posting ids
        matched to row ``k``.
    """
    if post_dict is None:
        post_dict = {}

    model = TfidfVectorizer(stop_words = 'english', binary = True, max_features = max_features)
    # Vectorise the titles of the frame that was passed in, so the embeddings
    # always line up with `df` rather than with a module-level GPU copy.
    text_embeddings = model.fit_transform(cudf.Series(df['title'])).toarray()
    posting_ids = df['posting_id'].values
    n_rows = len(df)
    preds = []
    CHUNK = 1024*4

    print('Finding similar titles...')
    for a in range(0, n_rows, CHUNK):
        b = min(a + CHUNK, n_rows)
        print('chunk',a,'to',b)

        # COSINE SIMILARITY DISTANCE
        # (dot product of the TF-IDF rows; presumably L2-normalised by the
        # vectoriser's default settings - verify if the norm is changed)
        cts = cupy.matmul( text_embeddings, text_embeddings[a:b].T).T

        for k in range(b-a):
            IDX = cupy.where(cts[k,]>0.75)[0]
            o = posting_ids[cupy.asnumpy(IDX)]
            for p in o:
                post_dict.setdefault(p, []).append(a+k)
            preds.append(o)

    del model,text_embeddings
    gc.collect()
    return preds, post_dict

In [None]:
# Return tokens, masks and segments from a text array or series
def bert_encode(texts, tokenizer, max_len=512):
    """Encode texts into fixed-length BERT inputs.

    Each text is tokenised, truncated to ``max_len - 2`` tokens, wrapped in
    ``[CLS]`` / ``[SEP]`` and zero-padded to ``max_len``.

    Returns:
        Three NumPy arrays: token ids, attention masks (1 for real tokens,
        0 for padding) and segment ids (all zeros).
    """
    token_rows, mask_rows, segment_rows = [], [], []

    for raw_text in texts:
        pieces = tokenizer.tokenize(raw_text)
        # Leave room for the two special tokens.
        pieces = pieces[:max_len - 2]
        sequence = ["[CLS]"] + pieces + ["[SEP]"]
        used = len(sequence)
        padding = max_len - used

        ids = tokenizer.convert_tokens_to_ids(sequence)
        ids += [0] * padding

        token_rows.append(ids)
        mask_rows.append([1] * used + [0] * padding)
        segment_rows.append([0] * max_len)

    return np.array(token_rows), np.array(mask_rows), np.array(segment_rows)

# Function to get our text title embeddings using a pre-trained bert model
def get_text_embeddings(df, max_len = 70):
    """Compute title embeddings with the fine-tuned BERT + ArcFace model.

    The training-time graph (BERT -> [CLS] vector -> ArcMarginProduct ->
    softmax) is rebuilt so the checkpoint can be loaded, then the model is cut
    before the margin head and run over the titles in chunks of 5000.

    Args:
        df: frame with a ``title`` column.
        max_len: token sequence length used for encoding and model inputs.

    Returns:
        NumPy array with one embedding row per title.

    Raises:
        ImportError: if the ``tokenization`` helper module is not importable.
    """
    # `tokenization` is the BERT tokenizer helper script; it is not imported
    # at file level, so import it here. It must be on sys.path (e.g. copied
    # next to the notebook from the bert-baseline dataset).
    import tokenization

    module_url = "../input/shopee-external-models/bert_en_uncased_L-24_H-1024_A-16_1"
    bert_layer = hub.KerasLayer(module_url, trainable = True)
    vocab_file = bert_layer.resolved_object.vocab_file.asset_path.numpy()
    do_lower_case = bert_layer.resolved_object.do_lower_case.numpy()
    tokenizer = tokenization.FullTokenizer(vocab_file, do_lower_case)
    # (token ids, masks, segment ids), each with one row per title.
    text = bert_encode(df['title'].values, tokenizer, max_len = max_len)

    margin = ArcMarginProduct(
            n_classes = 11014,
            s = 30,
            m = 0.5,
            name='head/arc_margin',
            dtype='float32'
            )

    input_word_ids = tf.keras.layers.Input(shape=(max_len,), dtype=tf.int32, name="input_word_ids")
    input_mask = tf.keras.layers.Input(shape=(max_len,), dtype=tf.int32, name="input_mask")
    segment_ids = tf.keras.layers.Input(shape=(max_len,), dtype=tf.int32, name="segment_ids")
    label = tf.keras.layers.Input(shape = (), name = 'label')

    _, sequence_output = bert_layer([input_word_ids, input_mask, segment_ids])
    # Representation of the first ([CLS]) token.
    clf_output = sequence_output[:, 0, :]
    x = margin([clf_output, label])
    output = tf.keras.layers.Softmax(dtype='float32')(x)
    model = tf.keras.models.Model(inputs = [input_word_ids, input_mask, segment_ids, label], outputs = [output])

    model.load_weights('../input/bert-baseline/Bert_123.h5')
    # Drop the label input and cut the network four layers from the end.
    # NOTE(review): relies on layers[-4] being the [CLS] slice feeding the
    # margin head - confirm if the graph changes.
    model = tf.keras.models.Model(inputs = model.input[0:3], outputs = model.layers[-4].output)

    chunk = 5000
    embeds = []
    for start in range(0, len(df), chunk):
        stop = start + chunk
        text_chunk = (text[0][start:stop], text[1][start:stop], text[2][start:stop])
        embeds.append(model.predict(text_chunk, batch_size = BATCH_SIZE))
    del model
    text_embeddings = np.concatenate(embeds)
    print(f'Our text embeddings shape is {text_embeddings.shape}')
    del embeds
    gc.collect()
    return text_embeddings

In [None]:
def _relative_threshold_hits(row_distances, threshold):
    """Return the neighbour positions below the row's relative cutoff.

    The cutoff is ``min + threshold * (max - min)`` of ``row_distances``, so
    ``threshold`` is a fraction of the row's own distance range.
    """
    lowest = np.min(row_distances)
    highest = np.max(row_distances)
    return np.where(row_distances < lowest + threshold * (highest - lowest))[0]


def get_neighbors(df, embeddings, post_dict, KNN = 50, image = True, threshold=0.5):
    """Predict matches for every row from its cosine nearest neighbours.

    A neighbour is kept when its distance is below a per-row cutoff placed at
    ``threshold`` of the way between the row's smallest and largest neighbour
    distance.

    With the module-level ``GET_CV`` flag set, thresholds from 0.0 to 0.975
    are swept, the F1 score of each is printed (this writes ``pred_matches``
    and ``f1`` columns into ``df``), and the best threshold is used for the
    returned predictions; ``post_dict`` is not updated in that mode.

    Otherwise ``threshold`` is used directly. Rows that keep at most one
    neighbour are retried with a cutoff widened by a factor of 1.55, and
    ``post_dict`` is updated.

    Args:
        df: frame with a ``posting_id`` column (and ``matches`` when GET_CV).
        embeddings: 2-D array, one row per row of ``df``.
        post_dict: dict updated in place; maps a posting id to the row
            positions whose prediction list contains it.
        KNN: number of neighbours to retrieve per row.
        image: kept for backward compatibility; both modalities use the same
            rule, so the value has no effect.
        threshold: relative cutoff used when GET_CV is False.

    Returns:
        (df, predictions, post_dict) where ``predictions[k]`` is the array of
        posting ids matched to row ``k``.
    """
    model = NearestNeighbors(n_neighbors = KNN, metric='cosine')
    model.fit(embeddings)
    distances, indices = model.kneighbors(embeddings)
    posting_ids_all = df['posting_id'].values
    n_rows = embeddings.shape[0]

    # Iterate through different thresholds to maximize cv, run this in interactive mode
    if GET_CV:
        thresholds = list(np.arange(0.0, 1, 0.025))
        scores = []
        for candidate in thresholds:
            df['pred_matches'] = [
                ' '.join(posting_ids_all[indices[k, _relative_threshold_hits(distances[k,], candidate)]])
                for k in range(n_rows)
            ]
            df['f1'] = f1_score(df['matches'], df['pred_matches'])
            score = df['f1'].mean()
            print(f'Our f1 score for threshold {candidate} is {score}')
            scores.append(score)
        thresholds_scores = pd.DataFrame({'thresholds': thresholds, 'scores': scores})
        max_score = thresholds_scores[thresholds_scores['scores'] == thresholds_scores['scores'].max()]
        best_threshold = max_score['thresholds'].values[0]
        best_score = max_score['scores'].values[0]
        print(f'Our best score is {best_score} and has a threshold {best_threshold}')

        # Apply the best threshold with the SAME relative rule it was selected
        # under (it is a fraction of the distance range, not a raw distance).
        predictions = [
            posting_ids_all[indices[k, _relative_threshold_hits(distances[k,], best_threshold)]]
            for k in range(n_rows)
        ]

    else:
        predictions = []
        for k in tqdm(range(n_rows)):
            idx = _relative_threshold_hits(distances[k,], threshold)

            # Nothing beyond (at most) the row itself matched: retry once
            # with a wider cutoff.
            if len(idx)<=1:
                idx = _relative_threshold_hits(distances[k,], 1.55*threshold)

            posting_ids = posting_ids_all[indices[k,idx]]
            for post_id in posting_ids:
                post_dict.setdefault(post_id, []).append(k)
            predictions.append(posting_ids)

    del model, distances, indices
    gc.collect()
    return df, predictions, post_dict

In [None]:
# Directory holding the concatenated EfficientNet-B1 + B5 checkpoint.
model_paths = '../input/concated-efficientnet-b1-and-b5/'
image_embeddings = []
# For each model: posting id -> row positions whose predictions contain it.
post_dict_b1b5 = {}
post_dict_b3 = {}
post_dict_text = {}
if GET_CV:
    for i in range(1):
        df, df_cu, image_paths = read_dataset(fold=10)
        model_names=['EfficientNetB1', 'EfficientNetB5']
        model_path = model_paths + f'{model_names[0]}_{model_names[1]}_512_{i}.h5'
        image_embeddings = get_image_embeddings(model_names, image_paths, model_path)
        df, image_predictions, post_dict_b1b5 = get_neighbors(df, image_embeddings, post_dict_b1b5, KNN = 80 if len(df)>3 else 3, image = True)
        # get_text_predictions needs the dict argument and returns a
        # (predictions, dict) pair.
        text_predictions, post_dict_text = get_text_predictions(df, post_dict_text, max_features = 25_000)
        df['image_predictions'] = image_predictions
        df['text_predictions'] = text_predictions
        df['pred_matches'] = df.apply(combine_predictions, axis = 1)
        df['f1'] = f1_score(df['matches'], df['pred_matches'])
        score = df['f1'].mean()
        print(f'Our final f1 cv score is {score}')
else:

    df, df_cu, image_paths = read_dataset(fold=10)
    image_thresholds=[0.38, 0.38, 0.38, 0.38, 0.38, 0.38]

    # Image model 1: concatenated EfficientNet-B1 + B5.
    model_names=['EfficientNetB1', 'EfficientNetB5']
    model_path = model_paths + f'{model_names[0]}_{model_names[1]}_512_0.h5'
    image_embeddings = get_image_embeddings(model_names, image_paths, model_path)
    df, image_predictions, post_dict_b1b5 = get_neighbors(df, image_embeddings, post_dict_b1b5, KNN = 60 if len(df)>3 else 3, image = True, threshold=image_thresholds[0])

    # Image model 2: EfficientNet-B3.
    model_paths = '../input/shoppee-efficientnetb3-ragnar/'
    model_names=['EfficientNetB3']
    model_path = model_paths + 'EfficientNetB3_512_42.h5'
    image_embeddings = get_image_embeddings(model_names, image_paths, model_path)
    print(image_embeddings.shape)
    df, image_predictions_b3, post_dict_b3 = get_neighbors(df, image_embeddings, post_dict_b3, KNN = 60 if len(df)>3 else 3, image = True, threshold=0.37)

    # Text model: TF-IDF title similarity.
    text_predictions, post_dict_text = get_text_predictions(df, post_dict_text, max_features = 25_000)

    # Slots _2 (image) and _1 (text) repeat the first model of each kind;
    # combine_predictions_test de-duplicates, so they do not change the union.
    df['image_predictions_0'] = image_predictions
    df['image_predictions_1'] = image_predictions_b3
    df['image_predictions_2'] = image_predictions
    df['text_predictions_0'] = text_predictions
    df['text_predictions_1'] = text_predictions

    df['matches'] = df.apply(combine_predictions_test, axis = 1)

    # Post-processing: for every posting id, look at the rows that matched it
    # ONLY through the B1+B5 model. If such a row's final match set barely
    # overlaps (F1 < threshold) with the first such row's match set, the
    # posting id is removed from that row's matches.
    threshold = 0.25
    ids = df['posting_id'].values
    for post_id in tqdm(ids):
        if not (post_id in post_dict_b1b5
                and post_id in post_dict_b3
                and post_id in post_dict_text):
            print('No data')
            continue

        # np.setdiff1d takes exactly two arrays (its third parameter is the
        # `assume_unique` flag), so the rows confirmed by B3 or by text are
        # merged first and subtracted in one step.
        # NOTE(review): previously the text rows were passed as the third
        # argument and were therefore never subtracted.
        confirmed = np.union1d(post_dict_b3[post_id], post_dict_text[post_id])
        idx = np.setdiff1d(post_dict_b1b5[post_id], confirmed)
        if len(idx) <= 1:
            continue

        # Work on a copy so the assignments below never write through a view.
        temp = df.iloc[idx][['posting_id', 'matches']].copy()
        temp['matches_v1'] = temp.iloc[0]['matches']
        temp['f1_score'] = f1_score(temp['matches_v1'], temp['matches'])
        outliers = temp['f1_score'] < threshold
        if outliers.any():
            # Remove the id as a whole token (no substring hits, no stray
            # double spaces left behind).
            temp.loc[outliers, 'matches'] = temp.loc[outliers, 'matches'].apply(
                lambda x: ' '.join(token for token in x.split() if token != post_id))
            df.loc[temp.index, 'matches'] = temp['matches'].values

    df[['posting_id', 'matches']].to_csv('submission.csv', index = False)