## Intro

This notebook is based on code previously posted by others, but it calculates the scoring function explicitly. I am new to matching, so I try to keep everything simple. I am not sure how to credit/cite people here; let me know in the comments.

In [None]:
import numpy as np
import pandas as pd 
from sklearn.neighbors import BallTree
from tqdm import tqdm

In [None]:
# Load the competition files from the Kaggle input directory.
# `train` is later grouped by `point_of_interest` to build the ground truth;
# `test` is the frame the final submission is generated from.
train = pd.read_csv('../input/foursquare-location-matching/train.csv')
test = pd.read_csv('../input/foursquare-location-matching/test.csv')
sample_submission = pd.read_csv('../input/foursquare-location-matching/sample_submission.csv')

In [None]:
# Preview the first rows of the sample submission to see the expected output layout.
sample_submission.head()

## True Matches from Training

In [None]:
# Ground truth: map every training id to the set of ids that share its
# point_of_interest (the set always contains the id itself).
true_matches = {}

for _, poi_rows in train.groupby('point_of_interest'):
    poi_ids = set(poi_rows["id"].values)
    # All members of one POI deliberately share the same set object.
    true_matches.update(dict.fromkeys(poi_ids, poi_ids))

## Scoring

In [None]:
def get_score(true_matches, predicted_matches):
    """Return the summed Jaccard similarity as a percentage of ``len(true_matches)``.

    For every id in ``predicted_matches`` the Jaccard index
    ``|A ∩ B| / |A ∪ B|`` is computed between its set in ``true_matches``
    (an empty set when the id is absent there) and its set in
    ``predicted_matches``. The similarities are summed, divided by
    ``len(true_matches)``, multiplied by 100 and rounded to two decimals.

    Note that the loop runs over the keys of ``predicted_matches`` while the
    divisor is the size of ``true_matches``; the result is therefore a true
    mean only when every id that can score above zero is a key of
    ``true_matches``.

    Parameters
    ----------
    true_matches : dict
        Maps an id to a set of ids. Its length is the normalisation constant.
    predicted_matches : dict
        Maps an id to a set of ids. Its keys are the ids that get scored.

    Returns
    -------
    float
        Score in percent, rounded to 2 decimals. ``0.0`` when
        ``true_matches`` is empty.
    """
    # Nothing to normalise by: avoid a ZeroDivisionError.
    if not true_matches:
        return 0.0

    score_sum = 0.0

    for id_, predicted in predicted_matches.items():
        actual = true_matches.get(id_, set())
        union_size = len(actual.union(predicted))

        # Both sets empty: the Jaccard index is undefined, count it as 0
        # instead of dividing by zero.
        if union_size == 0:
            continue

        score_sum += len(actual.intersection(predicted)) / union_size

    return round(100.0 * score_sum / len(true_matches), 2)

## Matching

In [None]:
# Placeholder substituted for missing names / categories before comparison.
_MISSING_TEXT = "__nan__"

# Divisor converting kilometres to radians on the unit sphere, which is the
# unit the haversine metric works in.
_EARTH_RADIUS_KM = 6371.0


def _texts_overlap(left, right):
    """Return True when both texts are present and one contains the other."""
    # A missing value never matches anything; without this guard two missing
    # values would match each other and short strings such as "a" or "nan"
    # would match the placeholder itself.
    if left == _MISSING_TEXT or right == _MISSING_TEXT:
        return False
    return (left in right) or (right in left)


def get_matches(df, n_ = 2, cat_dist = 1.5, name_dist = 1.5):
    """Group rows of ``df`` that are close together and have overlapping text.

    Each row is compared with its ``n_`` nearest neighbours (haversine
    distance on latitude/longitude, the row itself included). A neighbour is
    added to the row's group when either

    * it is closer than ``cat_dist`` km and one ``categories`` string contains
      the other, or
    * it is closer than ``name_dist`` km and one ``name`` string contains the
      other.

    Missing names / categories never match. Afterwards every id in a group
    receives all ids of that group, so a match found from one side is also
    recorded for the other side.

    Parameters
    ----------
    df : pandas.DataFrame
        Must provide the columns ``id``, ``latitude``, ``longitude``,
        ``name`` and ``categories``.
    n_ : int
        Number of nearest neighbours inspected per row (capped at ``len(df)``).
    cat_dist : float
        Maximum distance in kilometres for a category-based match.
    name_dist : float
        Maximum distance in kilometres for a name-based match.

    Returns
    -------
    dict
        Maps every id in ``df`` to the set of ids matched with it; the set
        always contains the id itself. Empty dict for an empty ``df``.
    """
    # BallTree cannot be built from zero points.
    if len(df) == 0:
        return {}

    df = df.sort_values("longitude")

    rads = np.deg2rad(df[['latitude', 'longitude']].values)
    tree = BallTree(rads, metric='haversine')

    n = min(n_, len(df))

    cat_radius = cat_dist / _EARTH_RADIUS_KM
    name_radius = name_dist / _EARTH_RADIUS_KM
    max_radius = max(cat_radius, name_radius)

    ids = df["id"].values
    cats = df["categories"].fillna(_MISSING_TEXT).values
    names = df["name"].fillna(_MISSING_TEXT).values

    all_groups = []

    for i in tqdm(range(len(df))):
        group = [ids[i]]

        dist, ind = tree.query(rads[i].reshape(1, -1), k = n)

        for d, j in zip(dist[0], ind[0]):
            # Neighbours come back nearest-first, so nothing further along
            # can be within range either.
            if d > max_radius:
                break

            if d < cat_radius and _texts_overlap(cats[i], cats[j]):
                group.append(ids[j])

            # The distance limit must apply to both substring directions;
            # `a and b or c` would let `c` bypass it.
            elif d < name_radius and _texts_overlap(names[i], names[j]):
                group.append(ids[j])

        all_groups.append(group)

    predicted_matches = {id_: {id_} for id_ in ids}

    # Make matches symmetric: every member of a group gets the whole group.
    for group in all_groups:
        for id_ in group:
            predicted_matches[id_].update(group)

    return predicted_matches



## Hyperparameter Tuning (Grid search)

In [None]:
# Tune on the 100,000 training rows with the smallest longitude.
# `true_matches` was built from the full training set, so a sampled row whose
# true partners fall outside the sample cannot reach a perfect score.
sample_df = train.sort_values("longitude").iloc[:100_000]

# Candidate values; the commented lists are alternatives tried before.
n_range = [5] # [10, 20]
name_range = [20.0] #[20.0, 10.0]
cat_range = [0.1] #[0.1, 0.75, 0.05, 0.04]

for n_ in n_range:
    for name_dist in name_range:
        for cat_dist in cat_range:

            predicted_matches = get_matches(sample_df, n_ = n_, cat_dist = cat_dist, name_dist = name_dist)

            # The predictions are passed first on purpose: get_score divides
            # by the length of its first argument and the Jaccard index is
            # symmetric, so this yields the mean over the sampled ids.
            score = get_score(predicted_matches, true_matches)

            # Report n_ as well, otherwise runs that differ only in n_ are
            # indistinguishable in the log.
            print(f"n_: {n_} cat_dist: {cat_dist} name_dist: {name_dist} score: {score}")

In [None]:
# Tabulate the last grid-search prediction: one row per id, with its matched
# ids joined into a single space-separated string.
train_output = pd.DataFrame(
    {
        "id": list(predicted_matches),
        "matches": list(map(" ".join, predicted_matches.values())),
    }
)

In [None]:
# Inspect the first 20 predicted match strings for the training sample.
train_output.head(20)

In [None]:
# Recompute the score of the last grid-search prediction. The predictions are
# passed as the first argument, as in the grid search above; get_score divides
# by the length of its first argument. The value is stored but not displayed.
score = get_score(predicted_matches, true_matches)

In [None]:
# Preview the test frame that the submission is generated from.
test.head()

In [None]:
# Match the test rows with fixed settings: 20 neighbours, 0.05 km for the
# category rule and 50 km for the name rule.
test_matches = get_matches(test, n_=20, cat_dist=0.05, name_dist=50.0)

# Submission layout: one row per id, matched ids joined by single spaces.
test_submission = pd.DataFrame(
    {
        "id": list(test_matches),
        "matches": list(map(" ".join, test_matches.values())),
    }
)

In [None]:
# Check the first rows of the submission before writing it out.
test_submission.head()

In [None]:
# Write the submission file without the DataFrame index column.
test_submission.to_csv('submission.csv', index=False)