In [None]:
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report
from scipy.spatial import KDTree
import matplotlib.pyplot as plt
from tqdm.notebook import tqdm
import seaborn as sns
import pandas as pd
import numpy as np
import warnings
import time
import os

# Register tqdm's progress_* methods on pandas objects.
tqdm.pandas()

# pandas display: three decimals for floats, up to 500 rows, no column limit.
pd.set_option('display.float_format', '{:.3f}'.format)
pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', None)

# NOTE(review): this silences every warning for the whole kernel session.
warnings.filterwarnings("ignore")

# Plot look: ggplot theme first, then override the default figure size.
plt.style.use("ggplot")
plt.rcParams.update({"figure.figsize": (20, 10)})

In [None]:
def convert_str_columns_to_lower_case(df):
    """Lower-case the string values of every text column except ``"id"``.

    Columns with object dtype are processed element by element so that
    non-string values (numbers, ``None``, NaN, ...) are kept unchanged instead
    of being turned into NaN or raising; columns with a pandas string dtype
    use the vectorised ``.str.lower()``. All other columns are left alone.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to normalise. It is modified in place.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, returned so the function can be used with
        ``DataFrame.pipe``.
    """
    for column in df.columns:
        # The column literally named "id" is deliberately left untouched.
        if column == "id":
            continue

        series = df[column]
        if isinstance(series.dtype, pd.StringDtype):
            df[column] = series.str.lower()
        elif pd.api.types.is_object_dtype(series.dtype):
            # Object columns may mix strings with other values; only strings
            # are lower-cased, and the object dtype is kept explicitly.
            lowered = [
                value.lower() if isinstance(value, str) else value
                for value in series
            ]
            df[column] = pd.Series(lowered, index=series.index, dtype=object)
    return df

def haversine_np(lon1, lat1, lon2, lat2, earth_radius_km=6367):
    """Haversine great-circle distance between two sets of points.

    All four coordinate arguments are given in degrees and may be scalars,
    NumPy arrays or pandas Series; they are combined element-wise.

    Parameters
    ----------
    lon1, lat1 : float or array-like
        Longitude and latitude of the first point(s), in degrees.
    lon2, lat2 : float or array-like
        Longitude and latitude of the second point(s), in degrees.
    earth_radius_km : float, default 6367
        Sphere radius the central angle is multiplied by.

    Returns
    -------
    float or array-like
        Distance(s) in the unit of ``earth_radius_km`` (kilometres by
        default).
    """
    lon1, lat1, lon2, lat2 = (np.radians(v) for v in (lon1, lat1, lon2, lat2))

    dlon = lon2 - lon1
    dlat = lat2 - lat1

    a = np.sin(dlat / 2.0) ** 2 + np.cos(lat1) * np.cos(lat2) * np.sin(dlon / 2.0) ** 2
    # Floating-point rounding can push `a` marginally outside [0, 1], which
    # would make sqrt/arcsin return NaN; clamp it back into the valid range.
    a = np.minimum(1.0, np.maximum(0.0, a))

    central_angle = 2 * np.arcsin(np.sqrt(a))
    return earth_radius_km * central_angle

In [None]:
# Read the pairs file first, then the test file.
raw_pair_df, raw_test_df = (
    pd.read_csv(csv_path)
    for csv_path in (
        "/kaggle/input/foursquare-location-matching/pairs.csv",
        "/kaggle/input/foursquare-location-matching/test.csv",
    )
)

In [None]:
# Normalise the text columns of both frames to lower case.
raw_pair_df = convert_str_columns_to_lower_case(raw_pair_df)
raw_test_df = convert_str_columns_to_lower_case(raw_test_df)

In [None]:
%%time
tree = KDTree(raw_test_df[["latitude","longitude"]].values)

In [None]:
%%time
X = haversine_np(raw_pair_df['longitude_1'],
                  raw_pair_df['latitude_1'],
                  raw_pair_df['longitude_2'],
                  raw_pair_df['latitude_2'])
y = raw_pair_df.match.values

In [None]:
# Hold out 20% of the pairs for evaluation (seeded split).
X_train, X_test, y_train, y_test = train_test_split(pd.DataFrame(X), y, test_size=0.2, random_state=42)

# Seed the forest as well: without `random_state` the fitted model (and the
# report below) changes on every run. `n_jobs=-1` only parallelises fitting
# and prediction; it does not alter the result.
model = RandomForestClassifier(random_state=42, n_jobs=-1)
model.fit(X_train, y_train)

print(classification_report(y_test, model.predict(X_test).astype(int)))

In [None]:
# Candidate generation + prediction for the test set.
#
# For every test row, the nearest rows in `tree` (at most NEIGHBOR_COUNT,
# queried by latitude/longitude) are candidate matches. Each (row, candidate)
# pair gets the haversine feature, the model labels it, and the ids of the
# candidates predicted as matches are space-joined per row.
#
# The work is done in vectorised chunks instead of one DataFrame/merge/predict
# per row; CHUNK_SIZE bounds how many pairs are held in memory at once.
NEIGHBOR_COUNT = 40
CHUNK_SIZE = 20_000

submission_df_list = []
# Explicit copy so this frame is independent of `raw_test_df`.
base_raw_df = raw_test_df[["id","latitude","longitude"]].copy()

test_ids = base_raw_df["id"].to_numpy()
test_lat = base_raw_df["latitude"].to_numpy(dtype=float)
test_lon = base_raw_df["longitude"].to_numpy(dtype=float)

n_rows = base_raw_df.shape[0]
n_neighbors = min(n_rows, NEIGHBOR_COUNT)

for start in tqdm(range(0, n_rows, CHUNK_SIZE)):
    stop = min(start + CHUNK_SIZE, n_rows)

    # Same (latitude, longitude) order as the points the tree was built from.
    query_points = np.column_stack([test_lat[start:stop], test_lon[start:stop]])
    neighbor_idx = tree.query(query_points, n_neighbors)[1]
    # With a single neighbour the query returns a 1-D array; force 2-D.
    neighbor_idx = np.asarray(neighbor_idx).reshape(stop - start, -1)

    # Flatten to one entry per (query row, candidate row) pair; candidates
    # stay in the order returned by the tree.
    source_idx = np.repeat(np.arange(start, stop), neighbor_idx.shape[1])
    target_idx = neighbor_idx.ravel()

    features = haversine_np(test_lon[source_idx],
                            test_lat[source_idx],
                            test_lon[target_idx],
                            test_lat[target_idx])
    prediction = model.predict(pd.DataFrame(features)).astype(int)
    is_match = prediction == 1

    # Group by row position (not by id value) so each test row yields at most
    # one output row, exactly as when rows are processed one at a time.
    matched_pairs = pd.DataFrame({
        "source": source_idx[is_match],
        "matches": test_ids[target_idx[is_match]],
    })
    joined = matched_pairs.groupby("source", sort=False)["matches"].agg(" ".join)

    submission_ = pd.DataFrame({
        "id": test_ids[joined.index.to_numpy()],
        "matches": joined.to_numpy(),
    })
    submission_df_list.append(submission_)

In [None]:
# Stack the per-batch prediction frames. `ignore_index=True` gives the result
# a fresh 0..n-1 index instead of repeating each batch's own index labels.
submission = pd.concat(submission_df_list, ignore_index=True)

In [None]:
# Use the sample submission as the list of ids that must appear in the output.
submission_base_file = pd.read_csv("/kaggle/input/foursquare-location-matching/sample_submission.csv")

# Left-join the predictions onto it and keep only the predicted `matches`
# (the sample file's own `matches` column comes back as `matches_x`).
submission_data = (
    submission_base_file
    .merge(submission, how="left", on=["id"])
    .drop(columns="matches_x")
    .rename(columns={"matches_y":"matches"})
)

# Ids without any predicted match fall back to matching themselves.
submission_data["matches"] = submission_data["matches"].fillna(submission_data["id"])

In [None]:
# Write the frame aligned with the sample submission (`submission_data`), not
# the raw predictions in `submission`: only `submission_data` has one row per
# expected id, with ids lacking a predicted match filled in.
submission_data.to_csv("submission.csv", index=False)