In [None]:
# Install Rapids
# The RAPIDS environment is shipped as an archive in the attached input
# directory; it is copied next to the other conda envs and unpacked there.
import sys
!cp ../input/rapids/rapids.0.18.0 /opt/conda/envs/rapids.tar.gz
# Unpack the archive; tar's verbose file listing is discarded.
!cd /opt/conda/envs/ && tar -xzvf rapids.tar.gz > /dev/null
# Each assignment prepends to sys.path, so the entry added last is searched first.
sys.path = ["/opt/conda/envs/rapids/lib/python3.7/site-packages"] + sys.path
sys.path = ["/opt/conda/envs/rapids/lib/python3.7"] + sys.path
sys.path = ["/opt/conda/envs/rapids/lib"] + sys.path 
# Copy the bundled XGBoost shared library into the base conda lib directory.
!cp /opt/conda/envs/rapids/lib/libxgboost.so /opt/conda/lib/

In [None]:
# Install BERT
# All three packages are installed from local copies in the attached input
# directory rather than from a package index.
# NOTE(review): presumably because the notebook runs without internet access — confirm
!pip install ../input/bertfortf2/bert/py-params-0.10.2/py-params-0.10.2
!pip install ../input/bertfortf2/bert/params-flow-0.8.2/params-flow-0.8.2
!pip install ../input/bertfortf2/bert/bert-for-tf2-0.14.9/bert-for-tf2-0.14.9

In [None]:
# Imports
import pandas as pd
import numpy as np
import os
import gc
import nltk
import cuml, cupy
import tensorflow as tf
import tensorflow_hub as hub
import bert

from sklearn.preprocessing import StandardScaler
from sklearn.feature_extraction.text import TfidfVectorizer
from cuml.neighbors import NearestNeighbors
from tensorflow import keras
from PIL import Image

In [None]:
# RESTRICT TENSORFLOW TO 8GB OF GPU RAM
# SO THAT WE HAVE 8GB RAM FOR RAPIDS
# LIMIT is expressed in GB; it is multiplied by 1024 below before being passed as memory_limit.
LIMIT = 8
gpus = tf.config.experimental.list_physical_devices('GPU')
if gpus:
    try:
        # Only the first physical GPU is capped.
        tf.config.experimental.set_virtual_device_configuration(
            gpus[0],
            [tf.config.experimental.VirtualDeviceConfiguration(memory_limit=1024*LIMIT)])
        logical_gpus = tf.config.experimental.list_logical_devices('GPU')
        #print(len(gpus), "Physical GPUs,", len(logical_gpus), "Logical GPUs")
    except RuntimeError as e:
        # NOTE(review): presumably raised when the GPU was already initialised before this cell ran — confirm
        print(e)
# These messages are printed unconditionally, even if no GPU was found or the cap failed.
print('We will restrict TensorFlow to max %iGB GPU RAM'%LIMIT)
# NOTE(review): the hard-coded 16 assumes a 16GB GPU — confirm for the target hardware
print('then RAPIDS can use %iGB GPU RAM'%(16-LIMIT))

In [None]:
# Choose the run mode from the size of the test file:
#   - more than 3 rows  -> submission run on test data (COMPUTE_CV is False)
#   - 3 rows or fewer   -> cross-validation run on train data (COMPUTE_CV is True)
test = pd.read_csv('../input/shopee-product-matching/test.csv')
COMPUTE_CV = len(test) <= 3

# In submission mode the frame is re-read later, so drop this copy
if not COMPUTE_CV:
    del test

In [None]:
# Load the working dataframe for the selected mode
if not COMPUTE_CV:
    # Submission run: predictions are made on the test set
    df = pd.read_csv('../input/shopee-product-matching/test.csv')

else:
    # CV run: use the labelled train set
    df = pd.read_csv('../input/shopee-product-matching/train.csv')

    # Map every label group to the array of posting IDs that belong to it
    label_dict = (
        df.groupby('label_group')['posting_id']
        .unique()
        .to_dict()
    )

    # Ground truth per row: all posting IDs sharing the row's label group
    df['matches'] = df['label_group'].map(label_dict)

In [None]:
# Image folder matching the dataframe loaded above (train for CV, test for submission)
image_dir = (
    '../input/shopee-product-matching/train_images'
    if COMPUTE_CV
    else '../input/shopee-product-matching/test_images'
)

## Image Embeddings

In [None]:
# Import re-trained EfficientNetB4
# The saved model is loaded from the attached input directory; its predict()
# output is used directly as the image embedding further down.
model = keras.models.load_model('../input/efficientnetb4model8/model_8')

In [None]:
# Set image size for EfficientNetB4 input
im_size = 380

# Set batch size
batch = 8

# As the dataset is large, we will run the modelling in chunks
chunk_size = 5000

# Chunk indices 0..n-1. The ceil result is cast to int first: np.arange over a
# float produces float indices (0.0, 1.0, ...), which then show up in the
# progress messages as "Chunk 0.0 completed".
chunks = np.arange(int(np.ceil(len(df) / chunk_size)))

# Set image paths of all images (one path per dataframe row)
image_paths = image_dir + '/' + df['image']

In [None]:
# Create function to pre-process images
def process_image(image_file_path):
    """Load one JPEG file and return it as a model-ready float tensor.

    The file is decoded to 3 channels, resized to (im_size, im_size) and
    converted to float32 with values divided by 255.
    """
    raw_bytes = tf.io.read_file(image_file_path)
    decoded = tf.image.decode_jpeg(raw_bytes, channels = 3)
    resized = tf.image.resize(decoded, (im_size, im_size))
    return tf.cast(resized, tf.float32) / 255.0

In [None]:
# Create tensorflow dataset from image paths
def get_data(image_paths):
    """Build a batched, prefetched tf.data pipeline over the given image paths.

    Each path is mapped through process_image (parallelism autotuned), grouped
    into batches of `batch` images, and prefetched with an autotuned buffer.
    """
    return (
        tf.data.Dataset.from_tensor_slices(image_paths)
        .map(process_image, num_parallel_calls = tf.data.AUTOTUNE)
        .batch(batch_size = batch)
        .prefetch(tf.data.AUTOTUNE)
    )

In [None]:
# Generate image embeddings from EfficientNetB4 model in chunks
# Initialize embeddings list
embeddings = []

# Iterate through chunks
for i in chunks:
    # Start and end index
    # int() turns the chunk-index arithmetic into plain Python ints usable as slice bounds
    start = int(i * chunk_size)
    end = int((i + 1) * chunk_size)

    # Get image dataset
    # The last chunk may be shorter than chunk_size; the slice simply stops at the end
    image_dataset = get_data(image_paths[start:end])

    # Generate embeddings
    chunk_embeddings = model.predict(image_dataset)

    # Append to embeddings list
    embeddings.append(chunk_embeddings)

    # Print status
    print(f'Chunk {i} completed')

# Stitch the per-chunk results back together in chunk order
image_embeddings = np.concatenate(embeddings)

In [None]:
# Delete the image-stage objects that are no longer needed so their memory
# can be reclaimed before the text models are loaded.
# (The original line read `del image paths`, which is a SyntaxError and
# prevented this whole cell from running.)
del model
del image_paths
del embeddings
del image_dataset
del chunk_embeddings
gc.collect()

In [None]:
# Sanity check: display the shape of the concatenated image embeddings
image_embeddings.shape

## TF-IDF Embeddings

In [None]:
# Create stop words: NLTK's English and Indonesian lists plus a hand-picked
# list of marketplace boilerplate that does not help distinguish products.
# NOTE: every entry must end with a comma. Adjacent string literals are
# silently concatenated by Python (a missing comma after 'sachets' used to
# produce the bogus entry 'sachetsml' and dropped both 'sachets' and 'ml').
stop_words = nltk.corpus.stopwords.words('english') + \
             nltk.corpus.stopwords.words('indonesian') + \
             [# Sales words:
                'free', 'gift', 'give', 'get', 'ready', 'stock', 'stocks', 'stok',
                'ori', 'original', 'official', 'new', 'latest',
                'import', 'low', 'price', 'cheap', 'vip', 'discount', 'warranty',
                'promo', 'promotion', 'buy', 'buyer', 'shop', 'shopper', 'shopping',
                'bigsale', 'sale', 'sell', 'seller', 'resell', 'reseller',
                'all', 'any', 'full', 'include', 'includes', 'inclusive', 'tax',

                # Units:
                'pieces', 'piece', 'pcs', 'pc', 'box', 'boxes', 'pack', 'packs', 'packet', 'packets', 'paket', 'package',
                'set', 'sets', 'size', 'roll', 'rolls', 'sachet', 'sachets',

                # Dimensions:
                'ml', 'l', 'litre', 'liter', 'g', 'gr', 'gram', 'kg', 'kilo', 'kilogram',
                'mm', 'cm', 'm', 'meter', 'metre', 'yard', 'inch', 'x',

                # Miscellaneous alphabets:
                'c', 'xe', 'f', 'b', 'v', 'xa',

                # Location words:
                'shopee', 'indonesia', 'indonesian', 'indo', 'id', 'jakarta', 'local', 'lokal',

                # English descriptors:
                'fashion', 'colour', 'color', 'design',
                'plus', 'pro', 'mini', 'premium', 'pro', 'super', 'extra', 'big', 'small',

                # Indonesian descriptors:
                'bpom', 'muat', 'cod', 'murah', 'isi', 'warna', 'pajak', 'garansi', 'beli', 'gratis',
                'terbaru', 'harga', 'resmi',
]

# Drop duplicates (the sources overlap and some words are listed twice)
stop_words = list(set(stop_words))

In [None]:
# Create function for generating tokens from titles
def process_tokens(title, stop_words, tokenizer):
    """Return the lower-cased title with stop words removed.

    The title is lower-cased, split with `tokenizer.tokenize`, filtered
    against `stop_words`, and the surviving tokens are re-joined with
    single spaces in their original order.
    """
    kept = []
    for token in tokenizer.tokenize(title.lower()):
        if token in stop_words:
            continue
        kept.append(token)
    return ' '.join(kept)

In [None]:
# Tokenizer that keeps runs of ASCII letters/digits and drops everything else
tokenizer = nltk.tokenize.RegexpTokenizer('[a-zA-Z0-9]+')

# One cleaned, space-joined token string per title, as a NumPy array
title_tokens = (
    df['title']
    .map(lambda title: process_tokens(title, stop_words, tokenizer))
    .to_numpy()
)

In [None]:
# Fit a TF-IDF vectorizer (default settings) on the cleaned titles and
# transform them in the same step.
tvec = TfidfVectorizer()
tfidf_embeddings = tvec.fit_transform(title_tokens)

In [None]:
# Sanity check: display the shape of the TF-IDF matrix
tfidf_embeddings.shape

## LaBSE Embeddings

In [None]:
def get_model(model_url, max_seq_length):
    """Wrap a LaBSE hub layer in a Keras model that emits L2-normalized embeddings.

    Args:
        model_url: location of the LaBSE model passed to hub.KerasLayer.
        max_seq_length: fixed token length of each of the three inputs.

    Returns:
        A tuple (model, labse_layer): the Keras model taking
        [input_word_ids, input_mask, segment_ids], and the underlying hub layer.
    """
    labse_layer = hub.KerasLayer(model_url, trainable=True)

    # Three int32 inputs of identical shape, built in the order they are fed to the layer.
    model_inputs = [
        tf.keras.layers.Input(shape=(max_seq_length,), dtype=tf.int32, name=input_name)
        for input_name in ("input_word_ids", "input_mask", "segment_ids")
    ]

    # Only the pooled output is used; the second output is discarded.
    pooled_output, _ = labse_layer(model_inputs)

    # Normalize every embedding to unit L2 norm along axis 1.
    normalize = tf.keras.layers.Lambda(lambda x: tf.nn.l2_normalize(x, axis=1))
    model = tf.keras.Model(inputs=model_inputs, outputs=normalize(pooled_output))

    return model, labse_layer

In [None]:
# Fixed token length for LaBSE inputs; longer titles are truncated and shorter ones padded
max_seq_length = 64

In [None]:
# Build the LaBSE model from the local copy in the input directory; the raw
# hub layer is kept as well because the tokenizer settings are read from it.
labse_model, labse_layer = get_model(model_url="../input/labse-1", max_seq_length=max_seq_length)

In [None]:
# Read the vocabulary path and casing flag stored with the hub layer and
# build the matching BERT tokenizer from them.
# Note: this rebinds `tokenizer`, replacing the regex tokenizer used for the TF-IDF titles.
vocab_file = labse_layer.resolved_object.vocab_file.asset_path.numpy()
do_lower_case = labse_layer.resolved_object.do_lower_case.numpy()
tokenizer = bert.bert_tokenization.FullTokenizer(vocab_file, do_lower_case)

def create_input(input_strings, tokenizer, max_seq_length):
    """Convert strings into fixed-length BERT-style model inputs.

    Every string is tokenized, wrapped in "[CLS]" / "[SEP]", converted to ids
    and then cut or zero-padded to exactly `max_seq_length` entries.

    Returns:
        Three NumPy arrays (ids, mask, segment ids). The mask holds 1 for real
        token positions and 0 for padding; segment ids are all zeros.
    """
    ids_rows, mask_rows, segment_rows = [], [], []

    for text in input_strings:
        token_ids = tokenizer.convert_tokens_to_ids(
            ["[CLS]", *tokenizer.tokenize(text), "[SEP]"])

        # Keep at most max_seq_length ids, then pad whatever is missing with zeros.
        kept = token_ids[:max_seq_length]
        pad_length = max_seq_length - len(kept)

        ids_rows.append(kept + [0] * pad_length)
        mask_rows.append([1] * len(kept) + [0] * pad_length)
        segment_rows.append([0] * max_seq_length)

    return np.array(ids_rows), np.array(mask_rows), np.array(segment_rows)

In [None]:
def encode(input_text):
    """Embed a collection of strings with the LaBSE model.

    The strings are converted to ids / mask / segment arrays with the
    notebook-level tokenizer and max_seq_length, then passed to labse_model.
    """
    model_inputs = list(create_input(input_text, tokenizer, max_seq_length))
    return labse_model(model_inputs)

In [None]:
# As the dataset is large, we will run the embedding in chunks
# (this rebinds chunk_size / chunks used earlier for the image model)
chunk_size = 2000

# Chunk indices 0..n-1. The ceil result is cast to int first: np.arange over a
# float produces float indices (0.0, 1.0, ...), which then show up in the
# progress messages as "Chunk 0.0 completed".
chunks = np.arange(int(np.ceil(len(df) / chunk_size)))

In [None]:
# Generate text embeddings from the LaBSE model in chunks of cleaned titles
# Initialize embeddings list
embeddings = []

# Iterate through chunks
for i in chunks:
    # Start and end index
    # int() turns the chunk-index arithmetic into plain Python ints usable as slice bounds
    start = int(i * chunk_size)
    end = int((i + 1) * chunk_size)

    # Get tokens
    # The last chunk may be shorter than chunk_size; the slice simply stops at the end
    tokens = title_tokens[start:end]

    # Generate embeddings
    # The whole chunk is passed to the model in a single call
    text_embeddings = encode(tokens)

    # Append to embeddings list
    embeddings.append(text_embeddings)

    # Print status
    print(f'Chunk {i} completed')

# Stitch the per-chunk results back together in chunk order
text_labse_embeddings = np.concatenate(embeddings)

In [None]:
# Drop the text-stage objects that are no longer needed, then force a
# garbage-collection pass so their memory is released.
del (labse_model, labse_layer, embeddings, tokens, title_tokens,
     text_embeddings, tokenizer, vocab_file)
gc.collect()

In [None]:
# Sanity check: display the shape of the concatenated LaBSE embeddings
text_labse_embeddings.shape

## Make Predictions

In [None]:
# Join image and LaBSE text embeddings along axis 1, so each row carries both feature sets.
combined_embeddings = np.concatenate((image_embeddings,text_labse_embeddings), axis=1)
# Scale the features without centring them (with_mean=False).
ss = StandardScaler(with_mean=False)
combined_embeddings_scaled = ss.fit_transform(combined_embeddings)
# The unscaled copy is not used again.
del combined_embeddings

In [None]:
# Cosine nearest-neighbour search over the image embeddings, querying the same
# points that were fitted.
# NOTE(review): 51 presumably means the item itself plus up to 50 candidates — confirm
image_model = NearestNeighbors(n_neighbors=51, metric = 'cosine')
image_model.fit(image_embeddings)
image_distances, image_indices = image_model.kneighbors(image_embeddings)

In [None]:
# Cosine nearest-neighbour search over the TF-IDF title vectors, querying the
# same points that were fitted.
# NOTE(review): tfidf_embeddings comes straight from TfidfVectorizer; verify this
# cuML version accepts that input type.
tfidf_model = NearestNeighbors(n_neighbors=51, metric = 'cosine')
tfidf_model.fit(tfidf_embeddings)
tfidf_distances, tfidf_indices = tfidf_model.kneighbors(tfidf_embeddings)

In [None]:
# Cosine nearest-neighbour search over the scaled image + LaBSE features,
# querying the same points that were fitted.
combined_model = NearestNeighbors(n_neighbors=51, metric = 'cosine')
combined_model.fit(combined_embeddings_scaled)
combined_distances, combined_indices = combined_model.kneighbors(combined_embeddings_scaled)

In [None]:
# Only the distance/index arrays are needed from here on; drop the fitted models.
del image_model, tfidf_model, combined_model

In [None]:
# Create function to predict based on ratio of distances
def _ids_within_ratio(posting_ids, distances, indices, ratio):
    """Return the posting IDs of one row's neighbours that pass its adaptive threshold.

    The threshold is `ratio` times the mean of the row's neighbour distances;
    neighbours whose distance is at or below it are kept.
    """
    threshold = ratio * np.mean(distances)
    return posting_ids[indices[distances <= threshold]]


def predict(df, image_distances, image_indices, tfidf_distances, tfidf_indices,
            combined_distances, combined_indices, image_ratio, tfidf_ratio, combined_ratio):
    """Predict the matching posting IDs for every row of `df`.

    For each row and each of the three neighbour searches (image, TF-IDF,
    combined), neighbours are kept when their distance is at most
    ratio * mean(distances of that row). The row's prediction is the union of
    the posting IDs kept by the three searches.

    Args:
        df: dataframe with a 'posting_id' column; neighbour indices are
            positional row numbers into this frame.
        image_distances, image_indices: per-row neighbour distances / indices
            from the image search.
        tfidf_distances, tfidf_indices: same for the TF-IDF search.
        combined_distances, combined_indices: same for the combined search.
        image_ratio, tfidf_ratio, combined_ratio: multipliers applied to each
            row's mean distance to obtain the threshold of the respective search.

    Returns:
        A list with one array per row holding the sorted, de-duplicated
        posting IDs predicted as matches.
    """
    # Pull the ID column out once; positional indexing into this array is
    # equivalent to the per-row .iloc lookups and avoids repeating them in the loop.
    posting_ids = df['posting_id'].to_numpy()

    preds = []

    for i in range(df.shape[0]):
        image_ids = _ids_within_ratio(
            posting_ids, image_distances[i], image_indices[i], image_ratio)
        tfidf_ids = _ids_within_ratio(
            posting_ids, tfidf_distances[i], tfidf_indices[i], tfidf_ratio)
        combined_ids = _ids_within_ratio(
            posting_ids, combined_distances[i], combined_indices[i], combined_ratio)

        preds.append(np.union1d(combined_ids, np.union1d(image_ids, tfidf_ids)))

    return preds

In [None]:
# Threshold multipliers passed to predict(): a neighbour is kept when its
# distance is at most ratio * (mean neighbour distance of that row).
image_ratio = 0.5
tfidf_ratio = 0.5
combined_ratio = 0.7

In [None]:
# One array of predicted matching posting IDs per dataframe row
preds = predict(df, image_distances, image_indices, tfidf_distances, tfidf_indices,
                combined_distances, combined_indices, image_ratio, tfidf_ratio, combined_ratio)

## Score / Submit

In [None]:
# Create function to score predictions based on actual matches
def scores(matches, preds):
    """Compute the per-row F1 score between true and predicted match sets.

    For each position the score is 2 * |intersection| / (|matches| + |preds|).
    Returns a list with one score per entry of `matches`.
    """
    def _row_f1(actual, predicted):
        overlap = len(np.intersect1d(actual, predicted))
        return 2*overlap / (len(actual) + len(predicted))

    return [_row_f1(matches[k], preds[k]) for k in range(len(matches))]

In [None]:
if not COMPUTE_CV:
    # Submission run: write the predictions as space-separated posting IDs
    df['matches'] = preds
    df['matches'] = df['matches'].map(' '.join)
    df[['posting_id','matches']].to_csv('submission.csv',index=False)

else:
    # CV run: report the mean per-row score against the true label groups
    matches = list(df['matches'].to_numpy())
    print(f'Average combined score on train data: {np.mean(scores(matches, preds))}')
    print('')

    # A submission file is still written, with every test row matched to itself
    print('Saving dummy submission file')
    dummy = pd.read_csv('../input/shopee-product-matching/test.csv')
    dummy['matches'] = dummy['posting_id']
    dummy[['posting_id','matches']].to_csv('submission.csv',index=False)

### Notebook references

- https://www.kaggle.com/cdeotte/part-2-rapids-tfidfvectorizer-cv-0-700