Hello Fellow Kagglers,

This competition introduced me to the techniques of matching points-of-interest and I will publish my work in the coming days.

This first notebook shows how to make sentence embeddings from the names, which are in multiple languages and can consist of multiple words.

With a multilingual sentence encoder, each name can be transformed into a fixed-size vector, which makes it possible to compare names.

In the feature engineering notebook these vectors are used to quantify the similarity between names by computing the cosine similarity between the vectors.

Two multilingual sentence encoders are used:

1) [universal-sentence-encoder-multilingual](https://tfhub.dev/google/universal-sentence-encoder-multilingual/3)

2) [sentence-transformers/paraphrase-multilingual-mpnet-base-v2](https://huggingface.co/sentence-transformers/paraphrase-multilingual-mpnet-base-v2)

The dataset generation and training/inference notebook will be (hopefully) published in the coming days

In [None]:
# Install dependencies: tensorflow-text (pinned to 2.6.0) and
# sentence_transformers (unpinned).
# NOTE(review): tensorflow_text is imported below but never referenced directly;
# presumably it is needed for the TF-Hub encoder to load — confirm.
!pip install tensorflow-text==2.6.0 sentence_transformers

In [None]:
import numpy as np
import pandas as pd
import tensorflow_hub as hub
import numpy as np
import tensorflow as tf
import tensorflow_text

from transformers import AutoTokenizer, TFXLMRobertaModel
from sentence_transformers import SentenceTransformer
from tqdm.notebook import tqdm

import numba
import os
import pickle

# Let TensorFlow grow GPU memory on demand instead of reserving all of it up
# front, to prevent OOM after the first model is loaded
gpu_devices = tf.config.experimental.list_physical_devices('GPU')
for gpu_device in gpu_devices:
    tf.config.experimental.set_memory_growth(gpu_device, True)

# Train Dataset

In [None]:
# Cast the text columns to "category" and the coordinates to float32 to
# greatly reduce memory usage
train_dtype = {
    'city': 'category',
    'state': 'category',
    'zip': 'category',
    'country': 'category',
    'url': 'category',
    'phone': 'category',
    'latitude': np.float32,
    'longitude': np.float32,
}
# Read the complete training set. The former `skiprows` callable always
# evaluated to False (so it never skipped a row) yet was still invoked for
# every row of the file; it has been dropped.
train = pd.read_csv('/kaggle/input/foursquare-location-matching/train.csv', dtype=train_dtype)
# Display Train Dataset Stats/Sample
# DataFrame.info() prints its report and returns None, so it is called directly
# instead of being wrapped in display(), which would render a stray "None"
train.info(memory_usage=True)
display(train.head())
# Average number of bytes per row, per column
display(train.memory_usage(deep=True) / len(train))

In [None]:
# Normalise names: missing values become '' and everything is lowercased.
# Missing values are filled *before* casting to str: casting first turns NaN
# into the literal string 'nan', and blanking that string afterwards would also
# erase genuine names that lowercase to "nan" (e.g. "Nan" or "NAN").
train['name'] = train['name'].fillna('').astype(str).str.lower()

# Models

In [None]:
# Multilingual Universal Sentence Encoder, loaded from TF-Hub
USE_MODEL_URL = "https://tfhub.dev/google/universal-sentence-encoder-multilingual/3"
embed_use = hub.load(USE_MODEL_URL)

# Multilingual paraphrase MPNet model, loaded through sentence-transformers
MPNET_MODEL_NAME = 'sentence-transformers/paraphrase-multilingual-mpnet-base-v2'
embedder_mpnet = SentenceTransformer(MPNET_MODEL_NAME)

# Embedding Generation

In [None]:
# Number of batches the unique names are divided into; batching keeps each
# encoder call small enough to prevent OOM errors
N_SPLITS = 1000
# Distinct names (as strings), in order of first appearance in the train set
NAMES_UNIQUE = train['name'].astype(str).unique()
# N_SPLITS consecutive slices of NAMES_UNIQUE, as equal in size as possible
NAMES_CHUNKS = np.array_split(NAMES_UNIQUE, N_SPLITS)

In [None]:
# Pre-allocate one row per unique name: 512 columns for the Universal Sentence
# Encoder output and 768 for the MPNET output
n_names = len(NAMES_UNIQUE)
NAMES_EMBEDDINGS_USE = np.zeros(shape=(n_names, 512), dtype=np.float32)
NAMES_EMBEDDINGS_MPNET = np.zeros(shape=(n_names, 768), dtype=np.float32)

# Encode batch by batch (takes about 10 minutes). OFFSET is the first row of
# the current batch, so row i of each matrix ends up holding NAMES_UNIQUE[i]
OFFSET = 0
for chunk in tqdm(NAMES_CHUNKS):
    chunk_end = OFFSET + len(chunk)
    NAMES_EMBEDDINGS_USE[OFFSET:chunk_end] = embed_use(chunk)
    NAMES_EMBEDDINGS_MPNET[OFFSET:chunk_end] = embedder_mpnet.encode(chunk, show_progress_bar=False)
    OFFSET = chunk_end

In [None]:
# Persist both embedding matrices as .npy files in the working directory
for npy_path, embedding_matrix in (
    ('NAMES_EMBEDDINGS_USE.npy', NAMES_EMBEDDINGS_USE),
    ('NAMES_EMBEDDINGS_MPNET.npy', NAMES_EMBEDDINGS_MPNET),
):
    np.save(npy_path, embedding_matrix)

# Name to Embedding Index Dictionary

In [None]:
# Map every unique name to the row index of its embedding
embedding_rows = np.arange(len(NAMES_UNIQUE))
name2names_embedding_idx_dict = dict(zip(NAMES_UNIQUE, embedding_rows))

# Pickle the mapping so it can be reloaded alongside the saved embeddings
with open('name2names_embedding_idx_dict.pkl', 'wb') as pkl_file:
    pickle.dump(name2names_embedding_idx_dict, pkl_file)

In [None]:
# Reverse mapping: embedding row index -> the name stored at that row
embedding_rows = np.arange(len(NAMES_UNIQUE))
names_embedding_idx2name_dict = dict(zip(embedding_rows, NAMES_UNIQUE))

# Pickle the mapping so it can be reloaded alongside the saved embeddings
with open('names_embedding_idx2name_dict.pkl', 'wb') as pkl_file:
    pickle.dump(names_embedding_idx2name_dict, pkl_file)

# Similarity Statistics

The goal of these statistics is to show that similar names can be identified using the cosine similarity between their embeddings.

In [None]:
# Cosine similarity, compiled by numba in nopython mode
@numba.jit(nopython=True)
def cosine_similarity(a, b):
    """Return the cosine similarity of the vectors `a` and `b`.

    Computed as dot(a, b) / (||a|| * ||b||). There is no special handling
    for zero-norm inputs.
    """
    norm_product = np.linalg.norm(a) * np.linalg.norm(b)
    return np.dot(a, b) / norm_product

In [None]:
# Sample N = 1 million random pairs of names to get some statistics on the
# cosine similarities between names
N = int(1e6)
# Seed the generator before drawing the pairs. Both members of a pair are
# drawn independently, so a name can be paired with itself; the number of
# such self-pairs is printed below
np.random.seed(11)
idxs = np.random.randint(low=0, high=len(NAMES_UNIQUE), size=[N, 2])
n_self_pairs = (idxs[:, 0] == idxs[:, 1]).sum()
print(f'Number of Duplicates: {n_self_pairs}')
# One similarity per sampled pair, per encoder
similarities_use = np.empty(shape=N, dtype=np.float32)
similarities_mpnet = np.empty(shape=N, dtype=np.float32)

# Compute the cosine similarity of every sampled pair with both embeddings
for idx in tqdm(range(N)):
    a_idx, b_idx = idxs[idx]
    similarities_use[idx] = cosine_similarity(
        NAMES_EMBEDDINGS_USE[a_idx],
        NAMES_EMBEDDINGS_USE[b_idx],
    )
    similarities_mpnet[idx] = cosine_similarity(
        NAMES_EMBEDDINGS_MPNET[a_idx],
        NAMES_EMBEDDINGS_MPNET[b_idx],
    )

In [None]:
# Summary statistics of the sampled cosine similarities (Universal Sentence Encoder)
use_similarity_stats = pd.Series(similarities_use).describe()
display(use_similarity_stats.to_frame(name='Value'))

In [None]:
# Summary statistics of the sampled cosine similarities (MPNET)
mpnet_similarity_stats = pd.Series(similarities_mpnet).describe()
display(mpnet_similarity_stats.to_frame(name='Value'))

# Similar/Dissimilar Names

Get an impression of the similar and dissimilar names according to the sentence embedders.

Verification is quite hard, as many names are non-English.

In [None]:
# Similar names according to Universal Sentence Encoder: show up to 10 sampled
# pairs whose cosine similarity exceeds 0.80.
# np.flatnonzero always yields a 1-D index array; argwhere(...).squeeze()
# collapses to a 0-d array when exactly one pair matches, which cannot be sliced.
use_similar_rows = []
for idx in np.flatnonzero(similarities_use > 0.80)[:10]:
    use_similar_rows.append({
        'name_1': names_embedding_idx2name_dict[idxs[idx][0]],
        'name_2': names_embedding_idx2name_dict[idxs[idx][1]],
        'cosine_similarity': similarities_use[idx],
    })

display(pd.DataFrame(use_similar_rows))

In [None]:
# Dissimilar names according to Universal Sentence Encoder: show up to 10
# sampled pairs whose cosine similarity is negative.
# np.flatnonzero always yields a 1-D index array; argwhere(...).squeeze()
# collapses to a 0-d array when exactly one pair matches, which cannot be sliced.
use_dissimilar_rows = []
for idx in np.flatnonzero(similarities_use < 0.00)[:10]:
    use_dissimilar_rows.append({
        'name_1': names_embedding_idx2name_dict[idxs[idx][0]],
        'name_2': names_embedding_idx2name_dict[idxs[idx][1]],
        'cosine_similarity_use': similarities_use[idx],
    })

display(pd.DataFrame(use_dissimilar_rows))

In [None]:
# Similar names according to MPNET: show up to 10 sampled pairs whose cosine
# similarity exceeds 0.90.
# np.flatnonzero always yields a 1-D index array; argwhere(...).squeeze()
# collapses to a 0-d array when exactly one pair matches, which cannot be sliced.
mpnet_similar_rows = []
for idx in np.flatnonzero(similarities_mpnet > 0.90)[:10]:
    mpnet_similar_rows.append({
        'name_1': names_embedding_idx2name_dict[idxs[idx][0]],
        'name_2': names_embedding_idx2name_dict[idxs[idx][1]],
        'cosine_similarity_mpnet': similarities_mpnet[idx],
    })

display(pd.DataFrame(mpnet_similar_rows))

In [None]:
# Dissimilar names according to MPNET: show up to 10 sampled pairs whose cosine
# similarity is negative.
# np.flatnonzero always yields a 1-D index array; argwhere(...).squeeze()
# collapses to a 0-d array when exactly one pair matches, which cannot be sliced.
mpnet_dissimilar_rows = []
for idx in np.flatnonzero(similarities_mpnet < 0.00)[:10]:
    mpnet_dissimilar_rows.append({
        'name_1': names_embedding_idx2name_dict[idxs[idx][0]],
        'name_2': names_embedding_idx2name_dict[idxs[idx][1]],
        'cosine_similarity_mpnet': similarities_mpnet[idx],
    })

display(pd.DataFrame(mpnet_dissimilar_rows))