In [None]:
import numpy as np
import pandas as pd
from geopy.distance import geodesic
import difflib
import Levenshtein
from tqdm.notebook import tqdm
import gc
from sklearn.neighbors import NearestNeighbors

# Show up to 50 columns when a DataFrame is displayed, so wide frames are not truncated.
pd.options.display.max_columns = 50

In [None]:
# Read the training table from its parquet file and report its (rows, columns) shape.
train = pd.read_parquet(
    '../input/foursquare-convert-train-data-to-parquet/train.parquet'
)
print(train.shape)

In [None]:
def IoU(truth, pred):
    """Return the intersection-over-union (Jaccard index) of two collections.

    Both inputs are converted to sets first, so duplicates and ordering are
    ignored.

    Parameters
    ----------
    truth : iterable of hashable
        The reference (ground-truth) elements.
    pred : iterable of hashable
        The predicted elements.

    Returns
    -------
    float
        ``len(truth & pred) / len(truth | pred)``. When both inputs are empty
        the union is empty as well; 1.0 is returned in that case (two empty
        sets are identical) instead of raising ``ZeroDivisionError``.
    """
    truth = set(truth)
    pred = set(pred)

    union = truth | pred
    if not union:
        # Both collections are empty: treat as perfect agreement and avoid 0/0.
        return 1.0
    return len(truth & pred) / len(union)

In [None]:
# Sanity-check IoU: the two lists share 1 of 5 distinct elements, so this should display 0.2.
IoU(truth=['A'], pred=['A', 'B', 'C', 'D', 'E'])

In [None]:
# Preview the first five rows of the loaded training frame.
train.head(n=5)

In [None]:
# Number of distinct point_of_interest labels in train (missing values are not counted).
train['point_of_interest'].nunique(dropna=True)

In [None]:
# Distinct point_of_interest labels, in order of first appearance in train.
poi_list = train['point_of_interest'].unique()

# Ground truth: point_of_interest label -> list of every entry id carrying that label.
truth_dict = (
    train
    .groupby('point_of_interest')['id']
    .apply(list)
    .to_dict()
)

In [None]:
# Reverse lookup: entry id -> the point_of_interest label of that entry.
id_poi_dict = (
    train[['id', 'point_of_interest']]
    .set_index('id')['point_of_interest']
    .to_dict()
)

In [None]:
# Columns holding each entry's coordinates; they are the only features given to the matcher.
coo_cols = ["latitude", "longitude"]

# Fit a 5-nearest-neighbour index on the coordinates, using all available cores (n_jobs=-1).
matcher = NearestNeighbors(n_neighbors=5, n_jobs=-1).fit(train[coo_cols])

# Query the index with the training rows themselves.
# `indices[r, k]` is the row position of the k-th neighbour of row r; `distances` has the same layout.
distances, indices = matcher.kneighbors(train[coo_cols], return_distance=True)

In [None]:
def join_neighbors(df, indices, n_neighbors):
    """Build a long table pairing every row of ``df`` with its neighbour rows.

    For each neighbour rank ``k`` in ``range(n_neighbors)``, row ``r`` of ``df``
    (by position) is joined with the row at position ``indices[r, k]``. The
    per-rank tables are stacked vertically, so the result has
    ``n_neighbors * len(df)`` rows when every index refers to an existing row.

    Unlike the previous implementation, the caller's ``df`` is NOT modified:
    its index is left as is and no helper columns are added to it.

    Parameters
    ----------
    df : pandas.DataFrame
        Source rows. Columns named ``index`` or ``_join_key`` are reserved for
        internal bookkeeping and do not appear in the output.
    indices : array-like of int, shape (len(df), >= n_neighbors)
        Positional neighbour indices, one row per row of ``df``.
    n_neighbors : int
        How many leading columns of ``indices`` to use.

    Returns
    -------
    pandas.DataFrame
        Every column of ``df`` twice: suffixed ``_1`` for the query row and
        ``_2`` for the neighbour row. Left-join semantics are kept, so an index
        with no matching row yields missing values in the ``_2`` columns.
    """
    neighbor_idx = np.asarray(indices)

    # Re-indexed working copy: positions 0..len(df)-1 become the join target,
    # and the caller's frame stays untouched.
    base = df.reset_index(drop=True)
    base['index'] = base.index

    # The left side additionally carries the neighbour position to join on.
    # A single reusable key column avoids accumulating one key per rank.
    left = base.copy()

    pairs = []
    for rank in range(n_neighbors):
        left['_join_key'] = neighbor_idx[:, rank]
        merged = pd.merge(
            left,
            base,
            left_on='_join_key',
            right_on='index',
            how='left',
            suffixes=('_1', '_2'),
        )
        # Remove bookkeeping columns so only the suffixed data columns remain.
        pairs.append(merged.drop(columns=['index_1', 'index_2', '_join_key']))

    return pd.concat(pairs, ignore_index=True)

In [None]:
# Pair every training entry with its 5 neighbours from `indices`, then preview the pair table.
train_df = join_neighbors(df=train, indices=indices, n_neighbors=5)
train_df.head(n=5)

In [None]:
from sklearn.model_selection import KFold

In [None]:
# Discard pairs in which an entry was matched with itself, then renumber the rows.
train_df = (
    train_df
    .loc[train_df['id_1'].ne(train_df['id_2'])]
    .reset_index(drop=True)
)

In [None]:
def make_match_list(row):
    """Return the row's candidate ids followed by the row's own id.

    ``row['id_2']`` is iterated element by element and ``row['id_1']`` is
    appended as the final element of the new list.
    """
    return [*row['id_2'], row['id_1']]

In [None]:
def calc_IoU(row):
    """Score one row's predicted matches against its ground-truth group.

    The row's ``id_1`` is mapped to its point of interest through the
    module-level ``id_poi_dict``; that point of interest's ids are fetched from
    the module-level ``truth_dict`` and compared with ``row['matches']`` via
    ``IoU``. Both dictionaries are only read here, never reassigned.
    """
    poi = id_poi_dict[row['id_1']]
    true_ids = truth_dict[poi]
    return IoU(true_ids, row['matches'])

In [None]:
# Evaluate the nearest-neighbour candidates with 5-fold splits made over point-of-interest
# labels, so every entry of a given label lands on the same side of a split.
kf = KFold(n_splits=5)
poi_array = np.array(poi_list)
fold = 1
for train_index, test_index in kf.split(poi_array):
    poi_train, poi_test = poi_array[train_index], poi_array[test_index]

    # Train side: keep pairs whose query entry belongs to a train label,
    # gather each query's neighbour ids, add the query id itself, then score.
    fold_train = train_df.loc[train_df['point_of_interest_1'].isin(poi_train)]
    match_train = fold_train.groupby('id_1')['id_2'].apply(list).reset_index()
    match_train['matches'] = match_train.apply(make_match_list, axis=1)
    match_train['IoU'] = match_train.apply(calc_IoU, axis=1)

    # Test side: identical steps on the held-out labels.
    fold_test = train_df.loc[train_df['point_of_interest_1'].isin(poi_test)]
    match_test = fold_test.groupby('id_1')['id_2'].apply(list).reset_index()
    match_test['matches'] = match_test.apply(make_match_list, axis=1)
    match_test['IoU'] = match_test.apply(calc_IoU, axis=1)

    # Report the mean per-entry IoU for both sides of this fold.
    print(f'fold {fold} Train IOU: {match_train["IoU"].mean()}')
    print(f'fold {fold} Test IOU: {match_test["IoU"].mean()}')
    fold += 1