In [None]:
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report
from sklearn.preprocessing import MinMaxScaler
import matplotlib.pyplot as plt
from haversine import haversine
from tqdm.notebook import tqdm
from sklearn.svm import SVC
import seaborn as sns
import pandas as pd
import Levenshtein
import numpy as np
import warnings
import nltk
import time
import os

# Hook tqdm into pandas so progress-aware DataFrame methods are available.
tqdm.pandas()

# Display options: fixed three-decimal floats, generous row limit, no column limit.
pd.set_option('display.float_format', '{:.3f}'.format)
pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', None)

# Silence every warning for the rest of the session (blanket filter).
warnings.filterwarnings(action="ignore")

# Plot defaults shared by all figures in this notebook.
plt.style.use("ggplot")
plt.rc("figure", figsize=(20, 10))

In [None]:
# Print the full path of every file found under the Kaggle input directory.
input_file_paths = (
    os.path.join(dirname, filename)
    for dirname, _, filenames in os.walk('/kaggle/input')
    for filename in filenames
)
for input_file_path in input_file_paths:
    print(input_file_path)

# Preprocessing Functions

In [None]:
def convert_str_columns_to_lower_case(df):
    """Lower-case every ``object``-dtype column of ``df``.

    The frame is modified in place (each matching column is reassigned with
    its ``.str.lower()`` result) and the same frame object is returned so the
    function can be used with ``DataFrame.pipe``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose ``object`` columns should be lower-cased.

    Returns
    -------
    pandas.DataFrame
        The very same ``df`` object that was passed in.
    """
    # Snapshot the matching column labels first so the loop does not iterate
    # over dtypes while columns are being reassigned.
    object_columns = [
        label for label, dtype in df.dtypes.items() if str(dtype) == "object"
    ]
    for label in object_columns:
        df[label] = df[label].str.lower()
    return df


def get_features(column_value_lists, verbose=False):
    """Turn one flat row of paired values into a list of similarity features.

    The row is cut into ``len(row) // 2`` consecutive chunks. Every chunk
    except the last two is treated as a pair of strings and contributes three
    numbers (Levenshtein distance, Jaro, Jaro-Winkler). The last two chunks
    are each cast to float and passed to ``haversine`` as the two points; the
    resulting distance is appended as the final feature.

    Parameters
    ----------
    column_value_lists : sequence
        Flat sequence laid out as ``[a_1, a_2, b_1, b_2, ..., p1_x, p1_y,
        p2_x, p2_y]``.
    verbose : bool, optional
        Accepted for interface compatibility; not used by the implementation.

    Returns
    -------
    list
        String-similarity triples for each text pair, in input order, followed
        by the haversine distance.
    """
    pair_count = int(len(column_value_lists) // 2)
    chunks = np.array_split(column_value_lists, pair_count)
    text_chunks = chunks[:-2]
    point_chunks = chunks[-2:]

    features = []
    for chunk in text_chunks:
        first, second = chunk[0], chunk[1]
        features.extend([
            Levenshtein.distance(first, second),
            Levenshtein.jaro(first, second),
            Levenshtein.jaro_winkler(first, second),
        ])

    point_a = tuple(point_chunks[0].astype(float))
    point_b = tuple(point_chunks[1].astype(float))
    features.append(haversine(point_a, point_b))

    return features

In [None]:
# Load the labelled pairs used for training and the records to be matched.
raw_pair_df = pd.read_csv("/kaggle/input/foursquare-location-matching/pairs.csv")
raw_test_df =  pd.read_csv("/kaggle/input/foursquare-location-matching/test.csv")

# Local (non-Kaggle) alternative kept for development runs:
# raw_pair_df = pd.read_csv("../data/pairs.csv")
# raw_test_df =  pd.read_csv("../data/train.csv", nrows=int(0.25*60000))

raw_pair_df = raw_pair_df.pipe(convert_str_columns_to_lower_case)

# The lower-casing helper rewrites every object column, which includes "id".
# The test ids are written to the submission as record identifiers, so they
# must stay exactly as read; keep a copy and put it back after normalising
# the text columns.
original_test_ids = raw_test_df["id"].copy()
raw_test_df = raw_test_df.pipe(convert_str_columns_to_lower_case)
raw_test_df["id"] = original_test_ids

In [None]:
# Column order matters: get_features reads the row positionally, treating the
# leading entries as (left, right) text pairs and the final four values as two
# (latitude, longitude) points.
pairs_columns = [
    "name_1", "name_2",
    "categories_1", "categories_2",
    "address_1", "address_2",
    "latitude_1", "longitude_1",
    "latitude_2", "longitude_2",
]

# Columns kept from the test file.
test_columns = [
    "id",
    "name",
    "categories",
    "address",
    "latitude",
    "longitude",
]

In [None]:
# Replace missing values in the feature columns with empty strings, then keep
# only the feature columns plus the target.
fillna_values_train = dict.fromkeys(pairs_columns, "")
raw_pair_df = raw_pair_df.fillna(value=fillna_values_train)[pairs_columns + ["match"]]

# Same treatment for the test records (no target column there).
fillna_values_test = dict.fromkeys(test_columns, "")
raw_test_df = raw_test_df.fillna(value=fillna_values_test)[test_columns]

In [None]:
# Build the feature matrix row by row and report the elapsed wall-clock seconds.
tic = time.time()
pair_rows = raw_pair_df[pairs_columns].values
X = np.array([get_features(pair_row) for pair_row in tqdm(pair_rows)])
y = raw_pair_df.match.values
print(time.time()-tic)

In [None]:
# Hold out 20% of the labelled pairs for evaluation; the split is seeded.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Seed the forest too: without random_state the bootstrap samples and feature
# subsets differ on every run, so the report below (and the final submission)
# would not be reproducible under Restart & Run All.
model = RandomForestClassifier(n_jobs=-1, random_state=42)
model.fit(X_train, y_train)

print(classification_report(y_test, model.predict(X_test).astype(int)))

In [None]:
# Names follow the order in which get_features emits values: three string
# similarities for each text field (name, categories, address), then the
# haversine distance between the two coordinate pairs.
feature_names = [
    f"{field}_{metric}"
    for field in ("name", "categories", "address")
    for metric in ("levenshtein", "jaro", "jaro_winkler")
] + ["haversine"]

plt.subplot(2,1,1)
plt.plot(model.feature_importances_, marker="o")
# Label each position so the rotated ticks actually identify the feature.
plt.xticks(range(len(feature_names)), feature_names, rotation=75)
plt.title("Random forest feature importances")
plt.ylabel("importance")
plt.show()

In [None]:
# Compare every test record with every other record (and with itself) and
# collect one feature row per ordered pair.
# NOTE: this is quadratic in the number of test records, both in time and in
# the size of data_row.
base_data_list = raw_test_df.copy()[["id","name","categories","address","latitude","longitude"]].values
data_row = []

for i in tqdm(range(len(base_data_list))):
    i_ = base_data_list[i]  # loop-invariant for the inner loop
    for j in range(i, len(base_data_list)):
        j_ = base_data_list[j]

        feature = get_features([i_[1], j_[1], i_[2], j_[2], i_[3], j_[3], i_[4], i_[5],
                                j_[4], j_[5] ])
        data_row.append([i_[0], j_[0]] + feature)
        # The mirrored row is only a different pair when i != j; emitting it
        # for the self-pair would list each id twice among its own matches.
        if i != j:
            data_row.append([j_[0], i_[0]] + feature)

In [None]:
# Each entry of data_row is [id_left, id_right, *features]. Split the ids from
# the numeric part before building arrays: putting both into one ndarray
# coerces every number to a wide unicode string, which inflates memory for the
# quadratic pair list and leaves the string -> float conversion to the model.
ids_ = [row[:2] for row in data_row]
pair_features = np.array([row[2:] for row in data_row], dtype=float)

prediction = model.predict(pair_features).astype(int)
df = pd.DataFrame(ids_)
df["prediction"] = prediction

In [None]:
# Keep the pairs predicted as matches and collapse them to one row per id with
# a space-separated list of matched ids.
submission = (
    df[[0, 1, "prediction"]]
    .query("prediction == 1")
    # Guard against the same (id, match) pair appearing more than once so an
    # id is never repeated inside its own match list.
    .drop_duplicates(subset=[0, 1])
    .groupby(0, as_index=False)
    .agg({1: " ".join})
    # After the aggregation column 0 holds the id and column 1 the joined
    # matches; name them accordingly.
    .rename(columns={0: "id", 1: "matches"})
)

In [None]:
# Write the submission frame to submission.csv in the working directory,
# without the DataFrame index.
submission.to_csv("submission.csv", index=False)