In [None]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Lazily build the full path of every file below the Kaggle input directory,
# then print the paths one per line.
input_paths = (
    os.path.join(folder, name)
    for folder, _subdirs, names in os.walk('/kaggle/input')
    for name in names
)
for path in input_paths:
    print(path)


In [None]:
!pip install geopandas

In [None]:
import geopandas as gpd

The given train dataset is converted to a geospatial dataframe to facilitate visualization of the POI distribution on maps. A 1:3 test-train split (the first 25% of rows held out for testing) is applied to the train dataset.

In [None]:
# Load the competition training data.
locs = pd.read_csv("../input/foursquare-location-matching/train.csv")

# Hold out the first 25% of rows for testing and keep the remaining 75% for
# training. The cut point is derived from the frame's actual length instead of
# a hard-coded row count, so the ratio holds whatever the size of train.csv.
split_at = int(0.25 * len(locs))

# .copy() makes both parts independent frames, so later column edits on the
# splits never alias `locs`.
locstest = locs.iloc[:split_at].copy()
locstrain = locs.iloc[split_at:].copy()
locstrain.head()

In [None]:
# Display summary statistics of the training split.
locstrain.describe()

To facilitate plotting the point distribution on the world map, the dataframe is merged with the country names and their corresponding alpha-2 codes.

In [None]:
# Read the 'naturalearth_lowres' dataset shipped with GeoPandas and expose its
# 'name' column as 'country' so it can be joined on that key.
# NOTE(review): `gpd.datasets` is no longer available in GeoPandas 1.0+ —
# confirm the version installed by the unpinned `pip install geopandas`.
naturalearth_path = gpd.datasets.get_path('naturalearth_lowres')
world = gpd.read_file(naturalearth_path).rename(columns={'name': 'country'})

In [None]:
# Country-abbreviation lookup table; its 'Name' column is exposed as 'country'
# to match the join key used by the world frame.
rel_abb = (
    pd.read_csv("../input/abbreviations/archive/data.csv")
    .rename(columns={'Name': 'country'})
)

In [None]:
# Join the world frame with the abbreviation table on the shared 'country'
# column (merge's default inner join: only names present in both are kept).
mapworld=world.merge(rel_abb,on="country")

In [None]:
# Expose the POI 'country' column as 'Code' so it lines up with the join key
# used against `mapworld`. Rebinding keeps the cell free of in-place mutation.
locstrain = locstrain.rename(columns={'country': 'Code'})

In [None]:
# Attach the world/abbreviation columns to every training POI via its 'Code'
# (default inner join: POIs whose code has no match in `mapworld` are dropped).
maplocs=locstrain.merge(mapworld,on="Code")

In [None]:
# Build one point geometry per POI (x = longitude, y = latitude) and wrap the
# merged table in a GeoDataFrame that uses those points as its geometry.
poi_points = gpd.points_from_xy(maplocs.longitude, maplocs.latitude)
gmaplocs = gpd.GeoDataFrame(maplocs, geometry=poi_points)

In [None]:
# Plot the geometry of the training POIs.
gmaplocs.plot()

The number of points considered in each country is shown in a bar graph.

In [None]:
import folium
from folium import Choropleth, Circle, Marker
from folium.plugins import HeatMap, MarkerCluster


In [None]:
# Count the training POIs per 'Code' value and print how many distinct codes occur.
count_stats=gmaplocs['Code'].value_counts()
print(len(count_stats))

In [None]:
# Convert the per-code counts into a plain {code: count} dict for plotting.
# Note: this rebinds `count_stats` from a Series to a dict.
count_stats=count_stats.to_dict()

In [None]:
import matplotlib.pyplot as plt
from matplotlib.axis import Axis

In [None]:
# Horizontal bar chart of the POI count per country code (training split).
labels = list(count_stats.keys())
country_count = list(count_stats.values())

f, ax = plt.subplots(figsize=(25, 25))
ax.barh(labels, country_count)
ax.set_ylabel("Country", fontweight="bold", size=16)
ax.set_xlabel("PoI count", fontweight="bold", size=16)
ax.margins(0.005, 0.005)
# Flip the y axis so the first entry of `labels` is drawn at the top.
ax.invert_yaxis()
plt.xticks(size=11.5, fontweight="bold")
plt.yticks(size=11.5, fontweight="bold")
ax.set_title("Country Stats", fontweight="bold", size=18)
plt.show()

In [None]:
# Display the training POIs ordered by 'Code'. The sorted frame is not
# assigned, so `gmaplocs` itself keeps its original row order.
gmaplocs.sort_values('Code')

In [None]:
# Positional row numbers of the Canadian ('CA') POIs in `gmaplocs`.
loc_check = np.where(gmaplocs['Code'] == 'CA')
n = count_stats['CA']
ca_rows = loc_check[0]

# rt = [position of the first CA row, position of the n-th CA row].
rt = [ca_rows[0], ca_rows[n - 1]]

# Positional distance between those two rows.
diff = rt[1] - rt[0]
diff

In [None]:
# Select the Canadian POIs with a boolean mask. The previous positional slice
# between the first and last 'CA' row assumed those rows were contiguous, but
# `gmaplocs` is never actually sorted by 'Code', so the slice could pull in
# other countries' rows; it also excluded the last 'CA' row.
cagmap = gmaplocs[gmaplocs['Code'] == 'CA']

The distribution of POIs in Canada is visualized on a Folium map with marker clusters. This highlights the number of different POIs present.

In [None]:
import math

# Folium map with the Canadian POIs grouped into a marker cluster.
m_3 = folium.Map(location=[62.2270, -74.6191], tiles='cartodbpositron', zoom_start=3)
mc = MarkerCluster()
for _, poi in cagmap.iterrows():
    lat, lon = poi['latitude'], poi['longitude']
    # A marker needs both coordinates; skip rows where either is NaN.
    if math.isnan(lon) or math.isnan(lat):
        continue
    mc.add_child(Marker([lat, lon]))
m_3.add_child(mc)



Levenshtein distance and Haversine distance play a key role in judging how close two different POIs are: the first measures the similarity of their names, and the second is compared against a geographic-distance threshold to decide whether both POIs refer to the same landmark.

In [None]:
from leven import levenshtein
from sklearn.cluster import dbscan

In [None]:
!pip install haversine

In [None]:
from haversine import *

In [None]:
# Count the occurrences of each 'categories' value and print how many
# distinct values there are.
ct=gmaplocs['categories'].value_counts()
print(len(ct))

Special characters and articles in names are omitted to obtain a standardized list of words, which facilitates the similarity check.

In [None]:
def stand_category(dc):
    """Standardize a Series of text into lists of lower-case tokens.

    Each value is lower-cased, stripped of commas, periods and apostrophes,
    has every literal "the " removed and every literal "and" replaced by "&",
    and is finally split on single spaces.

    Every replacement is done with ``regex=False``. On pandas versions where
    ``str.replace`` treats the pattern as a regular expression by default,
    the pattern "." matches *any* character and would erase the whole string.

    Parameters
    ----------
    dc : pandas.Series
        Series of strings to standardize.

    Returns
    -------
    pandas.Series
        Series of lists of string tokens, aligned with ``dc``.
    """
    return (
        dc.str.lower()
        .str.replace(",", "", regex=False)
        .str.replace(".", "", regex=False)
        .str.replace("'", "", regex=False)
        .str.replace("the ", "", regex=False)
        # Plain substring replacement: "and" is also replaced inside longer words.
        .str.replace("and", "&", regex=False)
        .str.split(" ")
    )

In [None]:
from scipy.spatial import KDTree

K-dimensional (K-D) trees support the k-nearest-neighbour approach: the tree is organised in levels, each of which halves the plane of points alternately along the horizontal and vertical axes, so that the points are partitioned into the desired number of segments and nearby points can be looked up efficiently.
A tuple with the latitude and longitude of each POI is built to search for points that could be matched. Querying the K-D tree returns the distances to and the indices of the neighbours. The thresholds set for the Levenshtein distance and the Haversine distance determine proximity, and the appropriate POI matches are then obtained by id.

In [None]:
def poi_generator(ddf, k=3, max_km=1.0, max_name_dist=0.83):
    """Group each POI with nearby POIs whose standardized categories match.

    For every row, the ``k`` nearest rows (including the row itself) are
    looked up in a K-D tree built on (latitude, longitude). A neighbour is
    merged into the row's match set when the haversine distance between the
    two points is below ``max_km`` and the Levenshtein distance between their
    standardized category strings is below ``max_name_dist``.

    Rows are addressed by position throughout. The K-D tree returns positions,
    so using DataFrame index labels would pick the wrong neighbours whenever
    the index is not a clean 0..n-1 range, e.g. after ``pd.concat``.

    Parameters
    ----------
    ddf : pandas.DataFrame
        Must provide 'id', 'latitude', 'longitude' and 'categories' columns.
        A 'standardized_cat' column is written onto this frame, as before.
    k : int, default 3
        Number of nearest neighbours to examine per POI; capped at the number
        of rows.
    max_km : float, default 1.0
        Exclusive upper bound on the haversine distance between two POIs.
        NOTE(review): assumes `haversine` returns kilometres — confirm.
    max_name_dist : float, default 0.83
        Exclusive upper bound on the Levenshtein distance.
        NOTE(review): if `levenshtein` returns an integer edit count, the
        default only accepts identical strings — confirm this is intended.

    Returns
    -------
    pandas.DataFrame
        Columns 'id' and 'similar', where 'similar' is a space-separated,
        sorted list of matching ids that always includes the POI's own id.
    """
    n_rows = len(ddf)
    if n_rows == 0:
        # Nothing to index or match.
        return pd.DataFrame({"id": [], "similar": []})

    lat_arr = ddf['latitude'].to_list()
    long_arr = ddf['longitude'].to_list()
    lat_lon_map = tuple(zip(lat_arr, long_arr))
    kd_tree = KDTree(lat_lon_map)

    # Cap k at the number of points; asking for more yields out-of-range
    # placeholder indices. p=1 selects the Manhattan metric for the query.
    match_check = kd_tree.query(lat_lon_map, k=min(k, n_rows), p=1, workers=5)
    # query() returns a 1-D index array when k == 1; normalise to (n_rows, k).
    neighbour_pos = match_check[1].reshape(n_rows, -1)

    std_cats = stand_category(ddf.categories.astype(str))
    # Kept for backward compatibility: callers see this column on their frame.
    ddf.loc[:, "standardized_cat"] = std_cats

    # Pre-extract plain lists so the loops avoid per-row DataFrame indexing.
    joined_cats = ["".join(tokens) for tokens in std_cats]
    ids = ddf["id"].to_list()

    poi_pairs = {p: {p} for p in ids}
    for pos in range(n_rows):
        p1 = (lat_arr[pos], long_arr[pos])
        for j in neighbour_pos[pos]:
            p2 = (lat_arr[j], long_arr[j])
            if haversine(p1, p2) < max_km:
                if levenshtein(joined_cats[pos], joined_cats[j]) < max_name_dist:
                    # Share the match sets in both directions.
                    poi_pairs[ids[pos]].update(poi_pairs[ids[j]])
                    poi_pairs[ids[j]].update(poi_pairs[ids[pos]])

    # Sort the members so the output is reproducible across runs.
    return pd.DataFrame({
        "id": list(poi_pairs.keys()),
        "similar": [" ".join(sorted(members)) for members in poi_pairs.values()],
    })

In [None]:
# Load the competition test set.
testdata=pd.read_csv('../input/foursquare-location-matching/test.csv')

In [None]:
import pandas as pd

# Stack the competition test rows on top of the held-out slice of train.csv.
# ignore_index=True gives the result a fresh 0..n-1 index; without it the two
# parts keep their own overlapping row labels, and any code that treats a
# label as a row position would address the wrong rows.
completetest = pd.concat([testdata, locstest], ignore_index=True)

In [None]:
# Compute the id -> matching ids table for the combined evaluation set.
validation=poi_generator(completetest)

In [None]:
# True if any row of the 'similar' column is missing.
validation['similar'].isnull().values.any()

In [None]:
# Show the last rows of the match table.
validation.tail()

In [None]:
# Read the 'naturalearth_lowres' dataset again for the evaluation set and
# expose its 'name' column as 'country' to use as the join key.
# NOTE(review): `gpd.datasets` is no longer available in GeoPandas 1.0+ —
# confirm the installed version.
lowres_path = gpd.datasets.get_path('naturalearth_lowres')
worldfinal = gpd.read_file(lowres_path).rename(columns={'name': 'country'})

In [None]:
# Country-abbreviation lookup table for the evaluation set; 'Name' is exposed
# as 'country' to match the world frame's join key.
rel_abbtest = (
    pd.read_csv("../input/abbreviations/archive/data.csv")
    .rename(columns={'Name': 'country'})
)

In [None]:
# Join the world frame with the abbreviation table on 'country'
# (default inner join).
mapworld1=worldfinal.merge(rel_abbtest,on="country")

In [None]:
# Expose the POI 'country' column as 'Code' so it lines up with the join key
# used against `mapworld1`.
completetest = completetest.rename(columns={'country': 'Code'})

In [None]:
# Attach the world/abbreviation columns to every evaluation POI via 'Code'
# (default inner join: POIs whose code has no match are dropped).
maplocs1=completetest.merge(mapworld1,on="Code")

In [None]:
# Build one point geometry per evaluation POI (x = longitude, y = latitude)
# and wrap the merged table in a GeoDataFrame using those points.
eval_points = gpd.points_from_xy(maplocs1.longitude, maplocs1.latitude)
gmaplocstest = gpd.GeoDataFrame(maplocs1, geometry=eval_points)

In [None]:
# Add the 'similar' column from the match table to each evaluation POI by 'id'.
gmaplocsfinal=gmaplocstest.merge(validation,on="id")

In [None]:
# Plot the geometry of the evaluation POIs.
gmaplocsfinal.plot()

Visualization of points on the map is obtained to verify the presence of distinct POIs with matching ids.

In [None]:
# Keep only rows that have a 'similar' value.
# NOTE(review): poi_generator seeds every id's match set with the id itself,
# so 'similar' should never be missing and this is likely a no-op — confirm.
fmapset = gmaplocsfinal.dropna(subset=['similar'])


In [None]:
# Count the evaluation POIs per 'Code' value and print how many distinct codes occur.
count_stats1=fmapset['Code'].value_counts()
print(len(count_stats1))

In [None]:
# Convert the per-code counts into a plain {code: count} dict for plotting.
# Note: this rebinds `count_stats1` from a Series to a dict.
count_stats1=count_stats1.to_dict()

In [None]:
# Horizontal bar chart of the POI count per country code (evaluation set).
labels = list(count_stats1.keys())
country_count = list(count_stats1.values())

f, ax = plt.subplots(figsize=(25, 25))
ax.barh(labels, country_count)
ax.set_ylabel("Country", fontweight="bold", size=16)
ax.set_xlabel("PoI count", fontweight="bold", size=16)
ax.margins(0.005, 0.005)
# Flip the y axis so the first entry of `labels` is drawn at the top.
ax.invert_yaxis()
plt.xticks(size=11.5, fontweight="bold")
plt.yticks(size=11.5, fontweight="bold")
ax.set_title("Country Stats", fontweight="bold", size=18)
plt.show()

In [None]:
# Display the evaluation POIs ordered by 'Code'. The sorted frame is not
# assigned, so `fmapset` itself keeps its original row order.
fmapset.sort_values('Code')

In [None]:
# Positional row numbers of the Canadian ('CA') POIs in `fmapset`.
loc_check1 = np.where(fmapset['Code'] == 'CA')
n1 = count_stats1['CA']

# rt1 = [position of the first CA row, position of the n1-th CA row].
rt1 = [loc_check1[0][0], loc_check1[0][n1 - 1]]

# Positional distance for the evaluation set. This previously used `rt`, the
# bounds computed for the training frame, by copy-paste mistake.
diff1 = rt1[1] - rt1[0]
diff1

In [None]:
# Show the first/last positional bounds of the 'CA' rows in the evaluation set.
rt1

In [None]:
# Select the Canadian POIs of the evaluation set. This previously sliced the
# *training* frame `gmaplocs` with the training bounds `rt`, so the second map
# just repeated the first one. A boolean mask on `fmapset` picks exactly the
# 'CA' rows without assuming they are contiguous.
cagmapf = fmapset[fmapset['Code'] == 'CA']

In [None]:
import math

# Folium map with the Canadian POIs of the evaluation set in a marker cluster.
m_4 = folium.Map(location=[62.2270, -74.6191], tiles='cartodbpositron', zoom_start=3)
mc1 = MarkerCluster()
for idx, row in cagmapf.iterrows():
    # A marker needs both coordinates; skip rows where either is NaN.
    if not math.isnan(row['longitude']) and not math.isnan(row['latitude']):
        mc1.add_child(Marker([row['latitude'], row['longitude']]))
# Attach the cluster built in this cell. This previously added `mc`, the
# cluster from the earlier training-set map, so `mc1` was never displayed.
m_4.add_child(mc1)