Hello Fellow Kagglers,

This notebook demonstrates the generation of 16 million training pairs, using 15 neighbours.

For each point of interest, all combinations of pairs are added to the training data, as are the 15 nearest neighbours.

This training set is much larger than the 600K pairs provided in the competition dataset.

Moreover, the provided pairs dataset is heavily skewed towards positive samples (68% matches), whereas the generated pairs dataset, containing all pairs and all 15 nearest neighbours, consists of just 11% positive samples. This makes the training dataset more representative of the pairs expected when performing inference on the 15 nearest neighbours.

15 neighbours are chosen to make the training dataset as large as possible, while still fitting in the 16GB notebooks when computing features and training.

A quick analysis is provided to demonstrate the marginal value of additional nearest neighbours.

As expected, the match ratio decreases as the number of neighbours increases.

Other notebooks:

[Foursquare USE/MPNET Name Embeddings](https://www.kaggle.com/code/markwijkhuizen/foursquare-use-mpnet-name-embeddings)

Training/Inference notebook coming soon!

In [None]:
# Used to deduce city/country from coordinates
!pip install /kaggle/input/reversegeocode/reverse_geocode-1.4.1-py3-none-any.whl

In [None]:
import numpy as np
import pandas as pd
import lightgbm as lgb
import matplotlib.pyplot as plt
import seaborn as sns
import tensorflow_hub as hub
import tensorflow as tf
import tensorflow_addons as tfa

from tensorflow.keras import backend as K
from Levenshtein import distance as lev
from tqdm.notebook import tqdm
from sklearn.model_selection import train_test_split
from scipy.spatial import distance
from sklearn import metrics
from multiprocessing import cpu_count
from sklearn.neighbors import BallTree
from difflib import SequenceMatcher

import geopy.distance
import reverse_geocode
import math
import scipy
import numba
import warnings
import Levenshtein
import itertools
import gc
import psutil
import sys

# Enable tqdm's progress-bar integration for pandas apply calls
tqdm.pandas()

# Render matplotlib figures at 150 DPI
import matplotlib as mpl
mpl.rcParams['figure.dpi'] = 150

# Log the Tensorflow version in use
print(f'Tensorflow version {tf.__version__}')

# Silence all Python warnings from here on (note: this also hides deprecation notices)
warnings.filterwarnings('ignore')

In [None]:
# Notebook-wide settings: random seed and debug switch
SEED, DEBUG = 42, False
# Earth's radius in kilometres; used to convert between kilometres and
# radians on the unit sphere for haversine computations
EARTH_RADIUS = 6_371

# Load Train/Test Data

In [None]:
%%time
# Train
train_dtype = {
    'id': 'category',
    'name': 'category',
    'address': 'category',
    'city': 'category',
    'state': 'category',
    'zip': 'category',
    'country': 'category',
    'url': 'category',
    'phone': 'category',
    'categories': 'category',
    'latitude': np.float32,
    'longitude': np.float32,
}
train = pd.read_csv('/kaggle/input/foursquare-location-matching/train.csv', index_col='id', dtype=train_dtype)
train['id'] = train.index.values
display(train.info(memory_usage=True))
display(train.head())
display(train.memory_usage(deep=True) / len(train))

# Test
# NOTE: this list is not passed to `read_csv` below, so every column of test.csv is loaded.
# The comma after 'url' was missing, which silently produced the single entry 'urlcategories'.
test_usecols = [
    'id',
    'name',
    'latitude',
    'longitude',
    'address',
    'city',
    'state',
    'zip',
    'country',
    'url',
    'categories',
]

test = pd.read_csv('/kaggle/input/foursquare-location-matching/test.csv', dtype=train_dtype)
# `DataFrame.info` prints its report and returns None, so it is not wrapped in `display`
test.info()
display(test.head())

# Load Pairs

In [None]:
# Pairs: dtypes for the competition-provided pairs file
# (text columns as `category`, coordinates as float32)
pairs_dtype = {
    'id_1': 'category',
    'id_2': 'category',
    'name_1': 'category',
    'name_2': 'category',
    'address_1': 'category',
    # was a second 'address_1' key, which left 'address_2' without a dtype
    'address_2': 'category',
    'city_1': 'category',
    'city_2': 'category',
    'state_1': 'category',
    'state_2': 'category',
    'zip_1': 'category',
    'zip_2': 'category',
    'country_1': 'category',
    'country_2': 'category',
    'url_1': 'category',
    'url_2': 'category',
    'phone_1': 'category',
    'phone_2': 'category',
    'categories_1': 'category',
    'categories_2': 'category',
    'latitude_1': np.float32,
    'longitude_1': np.float32,
    'latitude_2': np.float32,
    'longitude_2': np.float32,
}
pd.options.display.max_rows = 99
pd.options.display.max_columns = 99

# Only the first 5 data rows are needed as a sample. `nrows=5` stops reading after them,
# whereas a `skiprows` callable is evaluated for every line of the file.
pairs_sample = pd.read_csv('/kaggle/input/foursquare-location-matching/pairs.csv', dtype=pairs_dtype, nrows=5)
# `DataFrame.info` prints its report and returns None, so it is not wrapped in `display`
pairs_sample.info()
display(pairs_sample.head())

# Load Sample Submission

In [None]:
# Sample Submission
sample_submission = pd.read_csv('/kaggle/input/foursquare-location-matching/sample_submission.csv')
# `DataFrame.info` prints its report and returns None, so it is not wrapped in `display`
sample_submission.info()
display(sample_submission.head())

# To Lower

In [None]:
# Cast columns to lower case to be case agnostic
to_lower_columns = [
    'name',
    'state',
    'country',
    'city',
    'address',
    'zip',
    'phone',
    'url',
    'categories',
]

def _lower_series(series):
    """Return `series` as a lower-cased categorical with 'nan' values blanked.

    Values are cast to `str` first, lower-cased, and any value that is then
    exactly 'nan' is replaced by the empty string.
    """
    return series.astype(str, copy=False).str.lower().replace('nan', '').astype('category')

def to_lower(df):
    """Lower-case the text columns of `df` in place.

    For every name in `to_lower_columns`: when both `<name>_1` and `<name>_2`
    exist in `df` (pairs layout) those two columns are normalised; otherwise
    the plain `<name>` column is normalised, so it must exist in `df`.

    Returns None; `df` is modified in place.
    """
    for col in to_lower_columns:
        col_1, col_2 = f'{col}_1', f'{col}_2'
        if col_1 in df and col_2 in df:
            df[col_1] = _lower_series(df[col_1])
            df[col_2] = _lower_series(df[col_2])
        else:
            df[col] = _lower_series(df[col])
            
# Normalise all three loaded frames; `to_lower` modifies each frame in place
to_lower(train)
to_lower(pairs_sample)
to_lower(test)

In [None]:
# Spot-check the training rows after lower-casing
display(train.head())

In [None]:
# Spot-check the sample pairs after lower-casing
display(pairs_sample.head())

In [None]:
# Spot-check the test rows after lower-casing
display(test.head())

# Number of Neighbours Analysis

In [None]:
def plot_n_matches_by_n_neighbours():
    """Show how many matches are found among the n nearest neighbours, for n = 1..30.

    For every entry in `train` the nearest neighbours (by haversine distance) are
    looked up, and a neighbour counts as a match when it shares the entry's
    `point_of_interest`. A summary table is displayed and the cumulative number
    of matches is plotted against the number of neighbours.

    Side effect: sets `pd.options.display.max_rows` to 999.
    """
    # Upper bound (exclusive) of the neighbour counts analysed, i.e. n = 1..30
    N_NEIGHBOURS_CHECK = 31
    n_values = np.arange(1, N_NEIGHBOURS_CHECK)

    # The famous nearest neighbours graph used in all notebooks
    coords = np.deg2rad(train[['latitude', 'longitude']].values)
    tree = BallTree(coords, metric='haversine')
    # Query all entries in one call; each row of `indices` holds the positions of
    # that entry's neighbours ordered from nearest to farthest
    _, indices = tree.query(coords, k=N_NEIGHBOURS_CHECK)

    # Integer code per point of interest, so matching is a cheap integer comparison
    poi_codes = train['point_of_interest'].astype('category').cat.codes.values
    # is_match[i, r] is True when the r-th nearest neighbour of entry i shares its point of interest
    is_match = poi_codes[indices[:, :len(n_values)]] == poi_codes[:, None]
    # Matches per neighbour rank, accumulated so entry n-1 covers the first n neighbours
    matches_sum = is_match.sum(axis=0).cumsum()
    # Every entry contributes exactly n pairs when n neighbours are used
    pairs_count = n_values * len(train)

    matches_count_dict_df = pd.DataFrame(
        {
            'sum': matches_sum.astype(np.int32),
            'count': pairs_count.astype(np.int32),
            'total_match_ratio (%)': matches_sum / pairs_count * 100,
        },
        index=n_values,
    )
    # Add Marginal Sum
    matches_count_dict_df['sum_marginal'] = (
            matches_count_dict_df['sum'] - matches_count_dict_df['sum'].shift(1)
        ).fillna(matches_count_dict_df.loc[1, 'sum']).astype(np.int32)

    # Add Non Self Sum
    matches_count_dict_df['sum_non_self'] = (matches_count_dict_df['sum'] - len(train)).astype(np.int32)

    # Add Percentage Ratio
    matches_count_dict_df['match_ratio (%)'] = ((matches_count_dict_df['sum_marginal'] / len(train)) * 100 ).astype(np.float32)

    pd.options.display.max_rows = 999
    display(matches_count_dict_df)

    plt.figure(figsize=(12,5))
    plt.title('Number of Matches by Number of Neighbours', size=18)
    plt.plot(matches_count_dict_df['sum'])
    plt.xlabel('Number of Neighbours', size=16)
    # This was a second `xlabel` call, which overwrote the x-axis label and left the y axis unlabelled
    plt.ylabel('Number of Matches', size=16)
    plt.xticks(size=12)
    plt.yticks(size=12)
    plt.ylim(0)
    plt.grid()
    plt.show()

In [None]:
"""
Columns of the displayed table (one row per number of neighbours n):

sum:                   total matches found within the first n neighbours
count:                 total pairs considered (n per training entry)
total_match_ratio (%): global percentage of matches in pairs (positive ratio), sum / count
sum_marginal:          matches added by the n-th nearest neighbour alone
sum_non_self:          sum minus the number of training entries, i.e. matches excluding
                       each entry's match with itself (assuming it is its own nearest neighbour)
match_ratio (%):       sum_marginal as a percentage of the number of training entries
"""

# As can be observed, the match ratio quickly decreases
plot_n_matches_by_n_neighbours()

# Construct Pairs

In [None]:
# Source Columns
columns = ['id']

# Target Column 1
columns_1 = ['id_1']

# Target Column 2
columns_2 = ['id_2']

In [None]:
# Add a pair for 2 given ids in the training set
def add_train_sample(pairs_dict, id_1, id_2, match, count):
    """Write the pair (id_1, id_2) and its mirror (id_2, id_1) into `pairs_dict`.

    Parameters
    ----------
    pairs_dict : dict of arrays keyed by 'match' and by the names in
        `columns_1` / `columns_2`; modified in place.
    id_1, id_2 : index labels of the two `train` rows forming the pair.
    match : label stored for both rows.
    count : position of the first row to fill; the mirrored pair goes to `count + 1`.

    Reads the module-level `train`, `columns`, `columns_1` and `columns_2`.
    """
    pairs_dict['match'][count] = match
    pairs_dict['match'][count + 1] = match
    row_1 = train.loc[id_1, columns]
    row_2 = train.loc[id_2, columns]
    # The original branched on the column position but both branches were identical,
    # so a single code path is used. Values are looked up by label rather than by
    # integer position, which is deprecated on a label-indexed Series.
    for col, col_1, col_2 in zip(columns, columns_1, columns_2):
        value_1, value_2 = row_1[col], row_2[col]
        pairs_dict[col_1][count] = value_1
        pairs_dict[col_2][count] = value_2
        pairs_dict[col_1][count + 1] = value_2
        pairs_dict[col_2][count + 1] = value_1

In [None]:
# Generate the pairs DataFrame
def get_pairs(n_neighbours=15, max_distance_km=10, max_train_samples=int(20e6)):
    """Build the training pairs as a DataFrame of ids plus a boolean `match` column.

    For every point of interest:
      * each combination of two of its ids is added as a positive pair;
      * the `n_neighbours` nearest entries of its first id are queried, and those
        within `max_distance_km` that do not belong to the same point of interest
        are added as negative pairs. The query point itself is normally among the
        returned neighbours, so fewer than `n_neighbours` other entries are considered.

    Every pair is stored in both orders (see `add_train_sample`).

    Parameters
    ----------
    n_neighbours : number of nearest neighbours requested per point of interest.
    max_distance_km : neighbours at this distance or farther are ignored.
    max_train_samples : size of the pre-allocated result arrays; generating more
        rows than this raises an IndexError.

    Returns
    -------
    pandas.DataFrame with the columns 'match', `columns_1` and `columns_2`.

    Reads the module-level `train`, `pairs_sample`, `columns_1`, `columns_2`
    and `EARTH_RADIUS`.
    """
    # point of interest to corresponding ids mapping
    poi2ids = train.groupby('point_of_interest')['id'].apply(set).apply(list).apply(np.sort).to_dict()
    # nearest neighbours graph
    tree = BallTree(np.deg2rad(train[['latitude', 'longitude']].values), metric='haversine')
    # point of interest cluster size count
    display(pd.Series([len(v) for v in poi2ids.values()]).value_counts().head(10))
    # The tree reports haversine distances in radians, so convert the cut-off once
    max_distance_rad = max_distance_km / EARTH_RADIUS
    # Set with pairs id hashes to prevent duplicate training samples
    seen_hashes = set()
    # Dictionary with dataframe columns. As it is unknown how many pairs will be generated,
    # the arrays are pre-allocated and trimmed at the end. The builtin `bool` is used
    # because the `np.bool` alias was removed from NumPy.
    pairs_dict = {
        'match': np.zeros(max_train_samples, dtype=bool)
    }

    # Create empty array to fill up when generating pairs
    for col in columns_1 + columns_2:
        col_dtype = pairs_sample[col].dtype
        dtype = object if isinstance(col_dtype, pd.CategoricalDtype) else col_dtype
        pairs_dict[col] = np.full(shape=max_train_samples, fill_value=np.nan, dtype=dtype)

    # Counter to keep track of the row to be filled
    count = 0
    # Iterate over all points of interest
    for ids in tqdm(poi2ids.values()):
        # check if there are any pairs, thus more than 1 id in the point of interest
        if len(ids) > 1:
            # Generate training pair for all combinations
            for id_1, id_2 in itertools.combinations(ids, 2):
                id_hash = hash(id_1 + id_2)
                if id_hash not in seen_hashes:
                    add_train_sample(pairs_dict, id_1, id_2, True, count)
                    count += 2
                    # Add hash of id1 and id2 to prevent duplicate training pairs
                    seen_hashes.update([id_hash, hash(id_2 + id_1)])

        # Get the nearest neighbours of the first id of this point of interest
        anchor_id = ids[0]
        distances, indices = tree.query(
                np.deg2rad([train.loc[anchor_id, 'latitude'], train.loc[anchor_id, 'longitude']]).reshape(1, -1),
                k=n_neighbours,
            )
        # Only add neighbours within the distance cut-off and neighbours which are not matches
        indices = indices[0][distances[0] < max_distance_rad]
        # A set gives constant-time membership checks instead of scanning the id array
        poi_ids = set(ids)
        query_ids = [e for e in train.iloc[indices]['id'].tolist() if e not in poi_ids]
        # Add all negative examples
        for id_2 in query_ids:
            id_hash = hash(anchor_id + id_2)
            if id_hash not in seen_hashes:
                add_train_sample(pairs_dict, anchor_id, id_2, False, count)
                count += 2
                seen_hashes.update([id_hash, hash(id_2 + anchor_id)])

    print(f'Generated {count} Training Samples!')
    # Trim the pre-allocated arrays to the rows that were actually filled
    return pd.DataFrame({k: v[:count] for k, v in pairs_dict.items()})

In [None]:
# This is purely a dataframe with pairs of ids!
pairs = get_pairs()

# Columns to fill
# NOTE: this rebinds the module-level `columns`, `columns_1` and `columns_2` that
# `add_train_sample` and `get_pairs` read, so calling `get_pairs` again after this
# cell would make it track all of these columns instead of only the ids.
columns = [
    'name', 'latitude', 'longitude', 'address', 'city',
    'state', 'zip', 'country', 'url', 'phone', 'categories',
]

columns_1 = [
    'name_1', 'latitude_1', 'longitude_1', 'address_1', 'city_1',
    'state_1', 'zip_1', 'country_1', 'url_1', 'phone_1', 'categories_1',
]

columns_2 = [
    'name_2', 'latitude_2', 'longitude_2', 'address_2', 'city_2',
    'state_2', 'zip_2', 'country_2', 'url_2', 'phone_2', 'categories_2'
]

# Fill pairs columns based on id
for col, col_1, col_2 in zip(tqdm(columns), columns_1, columns_2):
    pairs[col_1] = train.loc[pairs['id_1'], col].values
    pairs[col_2] = train.loc[pairs['id_2'], col].values

# Add hash column of id_1 and id_2 to validate unique training samples.
# The ids are concatenated column-wise and hashed, which yields the same values as a
# row-wise `apply` but avoids a Python-level call per row with positional Series access.
pairs['id_hash'] = (pairs['id_1'] + pairs['id_2']).map(hash)

# Cast ID to category
pairs['id_1'] = pairs['id_1'].astype('category')
pairs['id_2'] = pairs['id_2'].astype('category')

# Display Pairs Data
display(pairs.head(25))
# `DataFrame.info` prints its report and returns None, so it is not wrapped in `display`
pairs.info()

# Display Positive/Negative Sample Ratio's
display(pairs['match'].value_counts(normalize=True).to_frame())

# Unique Names
display(pairs[['name_1', 'name_2']].nunique())

In [None]:
# Validate unique pairs: shows, as a percentage, how many id hashes occur once, twice, ...
# A single row with value 100 for count 1 means every (id_1, id_2) hash is unique.
display(pairs['id_hash'].value_counts().value_counts(normalize=True) * 100)

In [None]:
# Memory Usage Analysis, category data type is highly efficient!
# Shows the average number of bytes per row for each column of `pairs`.
display(pairs.memory_usage(deep=True) / len(pairs))

In [None]:
# Drop id_hash, which was only needed for the uniqueness check.
# `errors='ignore'` keeps this cell re-runnable once the column has been removed.
pairs = pairs.drop(columns=['id_hash'], errors='ignore')

# Save Training Pairs
pairs.to_pickle('pairs.pkl')

# Haversine Distance

To finish with, a simple visualisation of random pairs and matching pairs. Matching pairs are almost always within 10KM of each other, whereas random non-matching pairs are not. The distance between pairs is thus an important indicator of pairs being a match.

In [None]:
@numba.jit(nopython=True)
def haversine_np(args):
    """
    Great-circle (haversine) distance in kilometres between two points on the earth.

    `args` is a single sequence of exactly four values in decimal degrees,
    ordered as (lon1, lat1, lon2, lat2).
    """
    lon1, lat1, lon2, lat2 = args

    # Work in radians from here on
    lon1_rad = np.radians(lon1)
    lat1_rad = np.radians(lat1)
    lon2_rad = np.radians(lon2)
    lat2_rad = np.radians(lat2)

    delta_lon = lon2_rad - lon1_rad
    delta_lat = lat2_rad - lat1_rad

    # Haversine of the central angle between the two points
    hav = np.sin(delta_lat/2.0)**2 + np.cos(lat1_rad) * np.cos(lat2_rad) * np.sin(delta_lon/2.0)**2

    # Central angle in radians, scaled by the earth's radius to get kilometres
    central_angle = 2 * np.arcsin(np.sqrt(hav))
    return EARTH_RADIUS * central_angle

In [None]:
def plot_distances(n_samples=int(1e6)):
    """Compare distances between random `train` entries with distances between matching pairs.

    Draws `n_samples` random pairs of coordinates from `train` (with replacement),
    plots a histogram of their haversine distances, plots the same histogram for
    the rows of `pairs` where `match` is True, and displays summary statistics of
    both distributions side by side.

    The random draws are seeded from the module-level `SEED` so the figures and
    statistics are reproducible. Two different seeds are used because identical
    seeds would select the same rows twice and make every distance zero.
    """
    # Target array for train distances
    TRAIN_DISTANCES = np.zeros(n_samples, dtype=np.float32)
    # get random train coordinates and compute distances
    coords_a = train[['longitude', 'latitude']].sample(n_samples, replace=True, random_state=SEED).values
    coords_b = train[['longitude', 'latitude']].sample(n_samples, replace=True, random_state=SEED + 1).values
    for idx, (a, b) in enumerate(tqdm(zip(coords_a, coords_b), total=n_samples)):
        TRAIN_DISTANCES[idx] = haversine_np((*a, *b))

    # plot distance distribution
    plt.figure(figsize=(10,5))
    plt.title('Train Haversine Distance in KM')
    pd.Series(TRAIN_DISTANCES).plot(kind='hist', bins=10)
    plt.show()

    # Compute Distances for matching pairs only
    pairs_distance = pd.Series(
            np.apply_along_axis(
                haversine_np,
                1,
                pairs.loc[pairs['match'], ['longitude_1', 'latitude_1', 'longitude_2', 'latitude_2']].values.astype(np.float32)
            )
        )

    # Plot Distances between pairs
    plt.figure(figsize=(10,5))
    plt.title('Pairs Haversine Distance in KM')
    pairs_distance.plot(kind='hist', bins=10)
    plt.show()

    # Show train/pairs distance statistics side by side
    percentiles = [0.01, 0.05, 0.10, 0.25, 0.90, 0.95, 0.99]
    display(
        pd.concat([
            pd.Series(TRAIN_DISTANCES).describe(percentiles=percentiles).to_frame(name='Train').T,
            pairs_distance.describe(percentiles=percentiles).to_frame(name='Pairs').T,
        ]).T
    )

In [None]:
# Render the distance histograms and the summary statistics table
plot_distances()