Based on: https://www.kaggle.com/code/pjmathematician/matching-based-on-nearest-location

# library

In [None]:
import warnings
warnings.simplefilter('ignore')

import numpy as np
import pandas as pd
# Use the fully qualified option names: the short forms 'max_columns' and
# 'max_rows' also match the 'styler.render.*' options on recent pandas
# releases, which makes set_option raise OptionError (ambiguous pattern).
pd.set_option('display.max_columns', None)  # None = show every column
pd.set_option('display.max_rows', 100)
from tqdm.notebook import tqdm

from collections import Counter
from math import sin, cos, sqrt, atan2, radians
from scipy import spatial

# loading data

In [None]:
# Read the training table and order it by coordinate (latitude first, then
# longitude), renumbering the rows from 0 afterwards.
train = (
    pd.read_csv('../input/foursquare-location-matching/train.csv')
    .sort_values(by=['latitude', 'longitude'])
    .reset_index(drop=True)
)
# The test table is kept in file order.
test = pd.read_csv('../input/foursquare-location-matching/test.csv')

# Quick look at both tables: shape followed by the first rows.
for frame in (train, test):
    print(frame.shape)
    display(frame.head())

In [None]:
# Load the pairs table and preview it. Its `match`, `categories_1` and
# `categories_2` columns are the ones read further down in this notebook.
pairs = pd.read_csv('../input/foursquare-location-matching/pairs.csv')
print(pairs.shape)
display(pairs.head())

In [None]:
# Load the sample submission and preview it (shape, then first rows).
# NOTE(review): `submission` is not referenced again in the visible code;
# it appears to be loaded for inspection only.
submission = pd.read_csv('../input/foursquare-location-matching/sample_submission.csv')
print(submission.shape)
display(submission.head())

In [None]:
# Keep only the pairs flagged as a match and only their two category columns;
# missing categories become the '__NaN__' sentinel so every cell is a string.
matched_pairs = pairs.loc[pairs['match'] == True, ['categories_1', 'categories_2']]
cates = matched_pairs.fillna('__NaN__')

# Turn each ', '-separated category string into a list of category names.
for col in ('categories_1', 'categories_2'):
    cates[col] = cates[col].map(lambda text: text.split(', '))

# categories match mapping

In [None]:
# cate_map[name] collects, for every matched pair, all category names seen on
# the *other* side of the pair (with repetitions, in encounter order).
cate_map = dict()

for _, pair in tqdm(cates.iterrows()):
    left = pair['categories_1']
    right = pair['categories_2']
    # Register co-occurrences in both directions: left -> right, then
    # right -> left. The '__NaN__' sentinel is never used as a key, but it
    # can still appear among the collected partner names.
    for sources, targets in ((left, right), (right, left)):
        for name in sources:
            if name != '__NaN__':
                for other in targets:
                    cate_map.setdefault(name, []).append(other)

# counters[name][other] = how many times `other` was seen opposite `name`.
counters = {
    name: Counter(partners) for name, partners in tqdm(cate_map.items())
}

# find nearest location

In [None]:
def distance(lat1, lon1, lat2, lon2):
    """Return the great-circle (haversine) distance between two points.

    Args:
        lat1, lon1: Latitude and longitude of the first point, in signed
            decimal degrees.
        lat2, lon2: Latitude and longitude of the second point, in signed
            decimal degrees.

    Returns:
        The distance along the sphere's surface, in the same unit as the
        radius constant below (kilometres).
    """
    R = 6373.0  # approximate Earth radius in km

    # The coordinates must keep their sign: taking abs() would fold all four
    # hemispheres onto one, e.g. longitudes -1 and +1 would look identical.
    phi1 = radians(lat1)
    lam1 = radians(lon1)
    phi2 = radians(lat2)
    lam2 = radians(lon2)
    dlon = lam2 - lam1
    dlat = phi2 - phi1

    a = sin(dlat / 2)**2 + cos(phi1) * cos(phi2) * sin(dlon / 2)**2
    # Guard against rounding pushing `a` marginally outside [0, 1], which
    # would make sqrt(1 - a) raise ValueError for near-antipodal points.
    a = min(1.0, max(0.0, a))
    c = 2 * atan2(sqrt(a), sqrt(1 - a))

    return R * c

In [None]:
# Build a KD-tree over the test coordinates and look up each row's nearest
# rows. NOTE(review): the tree compares raw (latitude, longitude) values, so
# "nearest" here is in degree space rather than true distance on the globe.
lats = test['latitude'].to_list()
lons = test['longitude'].to_list()
Z = tuple(zip(lats, lons))
tree = spatial.KDTree(Z)
# Ask for the 1st..3rd nearest neighbours, but never more than there are
# rows: when k exceeds the number of points, query() pads the result with an
# out-of-range index, which would break positional lookups later on.
# Passing k as a list keeps both returned arrays two-dimensional
# (rows x neighbours) even when only one neighbour is requested.
loc_data = tree.query(Z, k=list(range(1, min(3, len(Z)) + 1)))

In [None]:
# Replace missing values with a sentinel string so that every `categories`
# cell can be split below.
test.fillna('__NaN__', inplace=True)

# Build the submission columns: for every test row, a space-separated list of
# matching ids that always starts with the row's own id.
sol = {'id': [], 'matches': []}
n_rows = test.shape[0]
# enumerate() gives the row *position*, which is what loc_data is indexed by;
# the index label from iterrows() is deliberately ignored.
for i, (_, r) in enumerate(tqdm(test.iterrows(), total=n_rows)):
    indx = loc_data[1][i]
    c1 = r['categories'].split(', ')
    lat1 = r['latitude']
    lon1 = r['longitude']
    sol['id'].append(r['id'])
    # Every row matches itself. Do not rely on the first KD-tree hit being
    # the row itself: with duplicated coordinates it can be another row.
    matched = [r['id']]
    for j in indx:
        # Skip the row itself and any out-of-range padding index that the
        # KD-tree returns when fewer neighbours exist than were requested.
        if j == i or j >= n_rows:
            continue
        r2 = test.iloc[j, :]
        lat2 = r2['latitude']
        lon2 = r2['longitude']
        # Neighbours 10 or more distance units away are never matched.
        if distance(lat1, lon1, lat2, lon2) >= 10:
            continue
        c2 = r2['categories'].split(', ')
        # Accept the neighbour if any category pair co-occurred more than 50
        # times in the matched training pairs. any() adds the id once only,
        # even when several category pairs clear the threshold.
        if any(
            c in counters and counters[c].get(c_, 0) > 50
            for c in c1
            for c_ in c2
        ):
            matched.append(r2['id'])
    sol['matches'].append(' '.join(matched))

In [None]:
# Turn the collected `id` / `matches` lists into a table, write it to
# 'submission.csv' without the index column, and preview the first rows.
solution = pd.DataFrame(sol)
solution.to_csv('submission.csv',index=False)
solution.head()