In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Print the full path of every file found beneath the Kaggle input directory.
for root, _subdirs, entries in os.walk('/kaggle/input'):
    for entry in entries:
        print(os.path.join(root, entry))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [2]:
# Read test.csv from the foursquare-location-matching input dataset into a DataFrame.
# Later cells sort this frame by latitude and take its columns as arrays.
test = pd.read_csv('../input/foursquare-location-matching/test.csv')

In [3]:
import joblib
import pickle
import lightgbm as lgb
import Levenshtein as lev
from math import radians, cos, sin, asin, sqrt

In [4]:
# Load a LightGBM booster from its saved text model file.
# The matching loop further down calls model.predict() on the pair features it builds.
model = lgb.Booster(model_file='../input/foursquarelgb/lgb_model.txt')

In [5]:
%load_ext Cython

In [6]:
%%cython
def LCS(str S, str T):
    """Return the length of the longest common subsequence of S and T.

    Classic dynamic programme: table[r][c] holds the LCS length of the
    first r characters of S and the first c characters of T.
    """
    cdef int row, col
    cdef int n_s = len(S)
    cdef int n_t = len(T)
    cdef list table = [[0] * (n_t + 1) for _ in range(n_s + 1)]
    for row in range(1, n_s + 1):
        for col in range(1, n_t + 1):
            # Extend the diagonal by one when the current characters agree ...
            best = table[row - 1][col - 1] + (S[row - 1] == T[col - 1])
            # ... otherwise fall back to the better of dropping a character
            # from either string.
            if table[row][col - 1] > best:
                best = table[row][col - 1]
            if table[row - 1][col] > best:
                best = table[row - 1][col]
            table[row][col] = best
    return table[n_s][n_t]

In [7]:
def distance_compute(lat1, lat2, lon1, lon2):
    """Haversine (great-circle) distance between two points on a sphere.

    Note the argument order: both latitudes come first, then both longitudes.

    Args:
        lat1, lat2: latitudes of the two points, in degrees.
        lon1, lon2: longitudes of the two points, in degrees.

    Returns:
        The distance as a float, in the units of the radius used (6371,
        i.e. kilometres for a mean Earth radius).
    """
    lon1, lat1, lon2, lat2 = map(radians, [lon1, lat1, lon2, lat2])

    dlon = lon2 - lon1
    dlat = lat2 - lat1
    a = sin(dlat / 2) ** 2 + cos(lat1) * cos(lat2) * sin(dlon / 2) ** 2
    # Floating-point rounding can push `a` marginally outside [0, 1] (e.g. for
    # near-antipodal points), which would make sqrt()/asin() raise ValueError.
    # Clamping leaves every in-range result untouched.
    a = min(1.0, max(0.0, a))
    c = 2 * asin(sqrt(a))
    r = 6371
    return c * r

def category_match_ratio(c1, c2):
    """Overlap score between two comma-separated category strings.

    Each string is split on ",". Entries are compared after removing spaces
    and lower-casing. Every matching (left, right) pair is counted, and the
    result is ``2 * matches / (number of left entries + number of right entries)``.
    """
    categories1 = c1.split(",")
    categories2 = c2.split(",")
    normalized1 = [entry.replace(" ", "").lower() for entry in categories1]
    normalized2 = [entry.replace(" ", "").lower() for entry in categories2]
    count = sum(1 for left in normalized1 for right in normalized2 if left == right)
    return count * 2 / (len(categories1) + len(categories2))

def lev_ratio(s1, s2):
    """Case-insensitive ``lev.ratio`` of two strings.

    Returns ``np.nan`` when either argument is the literal string "nan".
    """
    if s1 == "nan" or s2 == "nan":
        return np.nan
    return lev.ratio(s1.lower(), s2.lower())

def is_match(s1, s2):
    """Case-insensitive equality flag for two strings.

    Returns 1 when the lower-cased strings are equal, 0 when they differ,
    and ``np.nan`` when either argument is the literal string "nan".
    """
    if s1 == "nan" or s2 == "nan":
        return np.nan
    return 1 if s1.lower() == s2.lower() else 0


def lcs(s1, s2):
    """``LCS(s1, s2)``, or ``np.nan`` when either argument is the literal string "nan"."""
    either_missing = s1 == "nan" or s2 == "nan"
    return np.nan if either_missing else LCS(s1, s2)

In [8]:
# Tunable constants
lat_len = 111                       # divisor that converts a distance into degrees (presumably km per degree; confirm)
dist_endure = 1.5                   # largest distance_compute() value accepted for a candidate pair
lat_endure = dist_endure / lat_len  # dist_endure expressed in degrees

# Order the records by latitude so neighbouring rows are close in latitude.
df_sorted = test.sort_values(by=["latitude"], ignore_index=True)
lat = df_sorted["latitude"].values
lon = df_sorted["longitude"].values

# Text columns as NumPy string arrays; the feature helpers treat the literal
# string "nan" as a missing value.
# NOTE: `id` shadows the Python builtin; the name is kept because later cells read it.
(phone, url, address, city, state, country, categories, name, id) = (
    df_sorted[column].values.astype("str")
    for column in (
        "phone", "url", "address", "city", "state",
        "country", "categories", "name", "id",
    )
)
match = id.copy()


In [9]:

# Rule-based candidate search followed by model scoring.
# poi_label[k] is the group label of row k (rows are in latitude order);
# every row starts out alone in its own group.
n_rows = len(test)
poi_label = list(range(n_rows))

# Loop-invariant cut-offs, hoisted out of the scan conditions.
lat_cutoff = lat_endure * 0.8   # the forward scan stops once the latitude gap reaches this
lon_cutoff = 50 / lat_len       # candidates further apart than this in longitude are skipped

for i in range(n_rows):
    input_list = []  # feature rows for the candidates kept for row i
    idx = []         # row index of each kept candidate, parallel to input_list

    for now in range(i + 1, n_rows):
        # Comparisons are written as `not (x < y)` / `not (x > y)` on purpose:
        # they keep the original outcome when a value is NaN.
        if not (lat[now] - lat[i] < lat_cutoff):
            break

        if not (abs(lon[i] - lon[now]) > lon_cutoff):
            dis = distance_compute(lat[i], lat[now], lon[i], lon[now])
            if not (dis > dist_endure):
                # Feature order presumably has to match the column order the
                # model was trained on -- do not reorder without confirming.
                input_row = [
                    dis / 12,
                    lev.ratio(name[i], name[now]),
                    lcs(str(name[i]), str(name[now])),
                    lev_ratio(address[i], address[now]),
                    lcs(str(address[i]), str(address[now])),
                    is_match(city[i], city[now]),
                    is_match(state[i], state[now]),
                    is_match(country[i], country[now]),
                    is_match(url[i], url[now]),
                    is_match(phone[i], phone[now]),
                    category_match_ratio(categories[i], categories[now]),
                ]
                # Keep the pair only when the name ratio, address ratio,
                # city flag or state flag exceeds 0.5.
                if max(input_row[1], input_row[3], input_row[5], input_row[6]) > 0.5:
                    input_list.append(input_row)
                    idx.append(now)

        # Bound the look-ahead window regardless of how dense the rows are.
        if now - i > 500:
            break

    if not input_list:
        continue

    # Score all kept candidates at once, then rank them by descending score.
    pred = model.predict(np.array(input_list))
    pred_idx = np.stack((np.array(pred), np.array(idx).astype("int")), axis=-1)
    pred_idx = np.flip(pred_idx[pred_idx[:, 0].argsort()], 0)

    # At most the two best candidates inherit row i's label, and only while
    # their score is above 0.95.
    for score, candidate in pred_idx[:2]:
        if not (score > 0.95):
            break
        poi_label[int(candidate)] = poi_label[i]

In [10]:
# Assemble one row per record: [id, match, poi_label].
#
# The array is built with dtype=object on purpose. Stacking the string id
# arrays together with the integer labels would otherwise produce a
# fixed-width NumPy unicode array (e.g. <U21), and the cell that later writes
# the space-separated "id id id ..." strings into this array would have them
# silently truncated to that width.
# Labels are stored as their decimal strings, as the mixed-type stack did.
id_match_label = np.stack(
    (
        np.array(id, dtype=object),
        np.array(match, dtype=object),
        np.array([str(label) for label in poi_label], dtype=object),
    ),
    axis=-1,
)
# Sort rows by label so members of one group are contiguous, then reverse the
# column order to [poi_label, match, id].
id_match_label = np.flip(id_match_label[id_match_label[:, 2].argsort()], 1)

In [11]:
# Build the space-separated "matches" string for every record.
# Column layout here: 0 = group label, 1 = record id, 2 = matches (output).
# Rows with the same label are contiguous, so groups are consecutive runs.

# Work on an object-dtype array: a fixed-width NumPy unicode array silently
# truncates any assigned string longer than its item width, which would cut
# the joined id lists short.
id_match_label = id_match_label.astype(object)

n_rows = len(id_match_label)
start = 0
while start < n_rows:
    # Find the end (exclusive) of the run of rows sharing this label.
    now = start + 1
    while now < n_rows and id_match_label[now][0] == id_match_label[start][0]:
        now += 1

    group_ids = [id_match_label[k][1] for k in range(start, now)]
    for offset, own_id in enumerate(group_ids):
        others = group_ids[:offset] + group_ids[offset + 1:]
        # Own id first, then the other members in row order. The string is
        # rebuilt from column 1 every time, so re-running this cell does not
        # append the ids a second time.
        id_match_label[start + offset][2] = " ".join([own_id] + others)
    start = now

In [12]:
# Write the submission file: column 1 holds the record id, column 2 its matches string.
res = dict(id=id_match_label[:, 1], matches=id_match_label[:, 2])
out_df = pd.DataFrame(data=res)
out_df.to_csv('submission.csv', index=False)