In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input tree and print the full path of every file found.
for root, _, names in os.walk('/kaggle/input'):
    for path in (os.path.join(root, name) for name in names):
        print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
from tqdm import tqdm
from sklearn.model_selection import GroupKFold
from sklearn.neighbors import NearestNeighbors

# Hparams

In [None]:
# Tunable settings for the notebook.
hparams = dict(
    n_splits=5,      # number of folds handed to GroupKFold
    n_neighbors=10,  # NOTE(review): not referenced by any cell visible here
    radius=0.01,     # NOTE(review): only appears in a commented-out line; the live cells pass their own radii
)

# Data

In [None]:
# Read the four competition CSVs, in the same order as they are bound below.
submit, pairs, train, test = (
    pd.read_csv(csv_path)
    for csv_path in (
        '/kaggle/input/foursquare-location-matching/sample_submission.csv',
        '/kaggle/input/foursquare-location-matching/pairs.csv',
        '/kaggle/input/foursquare-location-matching/train.csv',
        '/kaggle/input/foursquare-location-matching/test.csv',
    )
)

# Metric

In [None]:
# https://www.kaggle.com/code/columbia2131/foursquare-iou-metrics
def get_id2poi(input_df: pd.DataFrame) -> dict:
    """Map each value of the ``id`` column to the ``point_of_interest`` on the same row.

    When an id occurs on several rows, the last row wins.
    """
    ids = input_df['id']
    pois = input_df['point_of_interest']
    return {row_id: poi for row_id, poi in zip(ids, pois)}

def get_poi2ids(input_df: pd.DataFrame) -> dict:
    """Map each ``point_of_interest`` value to the set of ``id`` values that share it."""
    ids_by_poi = input_df.groupby('point_of_interest')['id']
    return {poi: set(group_ids) for poi, group_ids in ids_by_poi}

def get_score(input_df: pd.DataFrame, id_to_poi: dict = None, poi_to_ids: dict = None) -> float:
    """Return the mean per-row intersection-over-union of predicted vs. true matches.

    For each row the target set is every id that shares the row's point of
    interest, and the predicted set is the whitespace-separated ids found in
    the row's ``matches`` value.

    Parameters
    ----------
    input_df : pd.DataFrame
        Must contain an ``id`` column and a ``matches`` column.
    id_to_poi : dict, optional
        Mapping id -> point of interest. Defaults to the module-level ``id2poi``.
    poi_to_ids : dict, optional
        Mapping point of interest -> set of ids. Defaults to the module-level
        ``poi2ids``.

    Returns
    -------
    float
        Mean IoU over all rows (NaN when ``input_df`` has no rows, as returned
        by numpy for the mean of an empty array).

    Raises
    ------
    KeyError
        If an id (or its point of interest) is absent from the mappings.
    """
    # Fall back to the notebook-level lookups so existing one-argument calls keep working.
    if id_to_poi is None:
        id_to_poi = id2poi
    if poi_to_ids is None:
        poi_to_ids = poi2ids

    scores = []
    for id_str, matches in zip(input_df['id'].to_numpy(), input_df['matches'].to_numpy()):
        targets = poi_to_ids[id_to_poi[id_str]]
        # A missing prediction (NaN / None) counts as an empty prediction set.
        # Previously this case either raised NameError or silently reused the
        # previous row's predictions.
        preds = set(matches.split()) if isinstance(matches, str) else set()
        union = targets | preds
        scores.append(len(targets & preds) / len(union) if union else 0.0)
    return np.array(scores).mean()

# Search Number of Neighbors

In [None]:
# Assign every training row a fold number in the "set" column. Grouping by
# point_of_interest keeps all rows of one POI inside the same fold.
kf = GroupKFold(n_splits=hparams["n_splits"])
poi_groups = train["point_of_interest"]
for fold_no, (_, holdout_idx) in enumerate(kf.split(train, poi_groups, poi_groups)):
    # The split yields positional indices; .loc takes labels, which coincide
    # here as long as train still has the default index from read_csv.
    train.loc[holdout_idx, "set"] = fold_no
train["set"].value_counts()

In [None]:
def add_mathes(df, radius):
    """Return a copy of ``df`` with a ``matches`` column of nearby ids.

    Rows are processed one country at a time. Within a country a
    radius-neighbour query is run on the raw ``latitude``/``longitude``
    values, and each row's ``matches`` becomes the space-separated ``id``
    values of every row of that country found within ``radius`` of it.

    Parameters
    ----------
    df : pd.DataFrame
        Needs ``id``, ``latitude``, ``longitude`` and ``country`` columns.
    radius : float
        Search radius passed to ``NearestNeighbors``. It is compared against
        distances computed directly on the coordinate values with
        scikit-learn's default metric.
        NOTE(review): that makes it a radius in coordinate units, not metres — confirm intended.

    Returns
    -------
    pd.DataFrame
        All input rows, regrouped by country, with a fresh 0..n-1 index and
        the ``matches`` column filled in. The input frame is not modified.
    """
    if df.empty:
        # pd.concat([]) would raise; hand back an empty frame with the expected column.
        return df.reset_index(drop=True).assign(matches="")

    dfs = []

    # dropna=False keeps rows whose country is missing: they form their own
    # group instead of being silently dropped from the result.
    for _, country_df in tqdm(df.groupby("country", dropna=False)):
        country_df = country_df.reset_index(drop=True)
        coords = country_df[['latitude', 'longitude']]

        knn = NearestNeighbors(radius=radius, n_jobs=-1)
        knn.fit(coords)
        nears = knn.radius_neighbors(coords, return_distance=False)

        # Build the whole column in one pass; `nears` holds positional
        # neighbour indices, which line up with the reset index above.
        ids = country_df["id"].to_numpy()
        country_df["matches"] = [" ".join(ids[n]) for n in nears]

        dfs.append(country_df)

    return pd.concat(dfs).reset_index(drop=True)

# Start train with an empty "matches" column; add_mathes fills this column
# on the frames it returns, not on train itself.
train["matches"] = ""
# Disabled: would compute matches fold by fold at hparams["radius"] and
# stitch the folds back together into train.
# train = pd.concat([add_mathes(train[train["set"]==i], hparams["radius"]) for i in range(hparams["n_splits"])])

In [None]:
# Ground-truth lookups restricted to fold 0; get_score reads these two names.
fold0 = train[train["set"] == 0]
id2poi, poi2ids = get_id2poi(fold0), get_poi2ids(fold0)

In [None]:
# Try five evenly spaced radii from 0.002 down to 0.001 on fold 0, printing
# and collecting the score for each. train_0 is left holding the matches for
# the last radius tried.
candidate_radii = np.linspace(0.002, 0.001, num=5)
fold0_rows = train[train["set"] == 0]
scores = []
for radius in candidate_radii:
    train_0 = add_mathes(fold0_rows, radius)
    fold_score = get_score(train_0)
    print(fold_score)
    scores.append(fold_score)

In [None]:
# Self-match baseline: predict that every row matches only its own id.
# get_score reads the "matches" column, so the baseline has to be written
# there; a column named "mathes" is never read by anything.
train_0['matches'] = train_0['id']

In [None]:
# Mean IoU of train_0's current "matches" column against the fold-0 ground
# truth held in id2poi / poi2ids.
get_score(train_0)

# Inference

In [None]:
# Reload the test set and attach radius-based matches using a fixed radius of 0.001.
test = add_mathes(
    pd.read_csv('/kaggle/input/foursquare-location-matching/test.csv'),
    0.001,
)

In [None]:
# Display the test frame returned by add_mathes, including its "matches" column.
test

In [None]:
# Build the submission: keep the ids (and row order) of the sample submission
# and attach the predicted matches for each id.
submit = pd.read_csv("../input/foursquare-location-matching/sample_submission.csv")
submit = submit.drop(columns="matches")
# Left join so that no sample-submission row is lost: the default inner join
# silently discards any id that is missing from `test`.
submit = submit.merge(test[["id", "matches"]], on="id", how="left")
# Ids without a prediction fall back to matching only themselves, rather than
# being written out as an empty cell.
submit["matches"] = submit["matches"].fillna(submit["id"])
submit.to_csv("submission.csv", index=False)

submit.head()