# Purpose of the notebook

* gain insight into how large each POI is
* get a rough idea of per-POI distance thresholds
   - How much noise is there in the geo-location data?

In [None]:
!pip install nb-black

The step below is required to handle the error `geopandas The Shapely GEOS version (3.9.1-CAPI-1.14.2) is incompatible with the GEOS version PyGEOS was compiled with (3.10.0-CAPI-1.16.0)`.

source: https://www.codestudyblog.com/cs2112pyc/1221122348.html

In [None]:
!pip uninstall pygeo
!pip install rtree

In [None]:
import geopandas
import folium
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

import plotly.express as px
import plotly.offline as py
import plotly.graph_objects as go
import pyarrow.parquet as pq
import pyarrow as pa

from numpy import sin, cos, deg2rad, rad2deg
from plotly.offline import init_notebook_mode, iplot
from sklearn.metrics.pairwise import haversine_distances
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsRegressor
from tqdm.auto import tqdm

pd.set_option("max_colwidth", 900)
plt.style.use("ggplot")
init_notebook_mode(connected=True)

%load_ext lab_black

In [None]:
def compose(df, fns):
    """Run ``df`` through each callable in ``fns``, in order.

    The input is copied first, so the steps operate on a copy rather than
    on the caller's object. Each step receives the previous step's return
    value; the last return value is the result.
    """
    current = df.copy()
    for step in fns:
        current = step(current)
    return current

In [None]:
def convert_radians(df):
    """Return a copy of ``df`` with ``lon_rad`` and ``lat_rad`` columns added.

    The new columns are the ``longitude`` and ``latitude`` columns converted
    from degrees to radians. The input frame is not modified.
    """
    return df.assign(
        lon_rad=np.deg2rad(df["longitude"]),
        lat_rad=np.deg2rad(df["latitude"]),
    )

In [None]:
def fill_blank(df):
    """Return a copy of ``df`` with missing text attributes set to "unknown".

    Only the descriptive columns listed below are touched; every other
    column is carried over unchanged. The input frame is not modified.
    """
    text_columns = (
        "name",
        "address",
        "city",
        "state",
        "zip",
        "country",
        "url",
        "phone",
        "categories",
    )
    filled = {column: df[column].fillna("unknown") for column in text_columns}
    return df.assign(**filled)

In [None]:
def country_filter(countries):
    """Build a DataFrame filter that keeps rows whose ``country`` is allowed.

    Parameters
    ----------
    countries : iterable
        Country values to keep. It is materialised once, when the filter is
        built, so a one-shot iterable (e.g. a generator) still works when the
        returned filter is called more than once.

    Returns
    -------
    callable
        ``filter_country(df)``, which returns a new DataFrame holding only the
        rows of ``df`` whose ``country`` value is in ``countries``. The input
        frame is not modified.
    """
    # Freeze the allowed values up front: rebuilding the set on each call
    # would yield an empty set once an iterator had been consumed.
    allowed = frozenset(countries)

    def filter_country(df):
        return df[df["country"].isin(allowed)].copy()

    return filter_country

In [None]:
def create_poi2distance_df(train_df):
    """Compute, per POI, each member's distance from the POI's median point.

    ``train_df`` must provide ``lat_rad``, ``lon_rad`` and
    ``point_of_interest`` columns. POIs with a single row are skipped.

    Returns a DataFrame with one row per remaining POI:

    * ``point_of_interest`` - the group key
    * ``distances`` - array of haversine distances of every member from the
      group's median (lat, lon), multiplied by 6,371,000
    * ``center_lat`` / ``center_lon`` - the median point, converted with
      ``rad2deg``
    """
    grouped = train_df[["lat_rad", "lon_rad", "point_of_interest"]].groupby(
        "point_of_interest"
    )
    n_groups = train_df["point_of_interest"].nunique()

    poi_ids = []
    distance_arrays = []
    center_lats = []
    center_lons = []
    for poi_id, members in tqdm(grouped, total=n_groups):
        if len(members) != 1:
            # distance of every member from the group's median point
            coords = members[["lat_rad", "lon_rad"]].to_numpy()
            median_point = np.median(coords, axis=0).reshape(1, -1)
            angular = haversine_distances(coords, median_point).reshape(-1)

            # scale to geodesic distance in meters
            poi_ids.append(poi_id)
            distance_arrays.append(angular * 6_371_000)
            center_lats.append(median_point[0, 0])
            center_lons.append(median_point[0, 1])

    return pd.DataFrame(
        {
            "point_of_interest": poi_ids,
            "distances": distance_arrays,
            "center_lat": rad2deg(center_lats),
            "center_lon": rad2deg(center_lons),
        }
    )

In [None]:
train = pd.read_csv("../input/foursquare-location-matching/train.csv")
train.head()

In [None]:
train_ext = compose(train, [fill_blank, convert_radians])
del train

In [None]:
poi2distances_df = create_poi2distance_df(train_ext)
poi2distances_df

In [None]:
# Number of rows contributing to each POI (length of its distance array).
poi2distances_df["n_points"] = poi2distances_df["distances"].map(len)

In [None]:
%%time
# Largest distance inside each POI (same value as quantile 1.0 of its array).
# This is an element-wise operation, so use `map`: `Series.agg` with a
# per-element callable only works through a deprecated fallback to `apply`.
poi2distances_df["dist_max"] = poi2distances_df["distances"].map(np.max)

In [None]:
poi2distances_df.query("n_points >= 2")

In [None]:
%%time
location_df = (
    train_ext.groupby("point_of_interest")
    .agg(name=("name", "max"), country=("country", "max"))
    .reset_index()
)

In [None]:
location_df

In [None]:
%%time
geo_df = pd.merge(
    location_df,
    poi2distances_df.drop("distances", axis=1),
    on="point_of_interest",
)

In [None]:
geo_df = geo_df.sort_values("dist_max", ascending=False)

In [None]:
geo_df.head(20)

In [None]:
train_ext.query("point_of_interest == 'P_f82d146eaf21da'")

In [None]:
train_ext.query("point_of_interest == 'P_667592b7b1e199'")

In [None]:
train_ext.query("point_of_interest == 'P_531442319393fc'")

In [None]:
train_ext.query("point_of_interest == 'P_0d4c210bcd7972'")

# Plot Bubble Map

## Definition of radius of POI

1. calculate center point of all points in the same POIs
2. set threshold as p95 of distance from center in a POI
3. drop distance >= threshold as outliers
4. re-calculate center points and distances
5. plot circle with radius of max distance from center

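Note: the code in this notebook computes `dist_max` as the plain maximum distance from the median center; steps 2–4 (p95 outlier removal) are not applied to it.

In the visualizations below, the radius is scaled by 1/1000–1/100 (the `scale` argument) because the raw radii are too large and make it hard to locate the center points.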

In [None]:
def plot_buble_map(data, name_key, radius_key, zoom=15, scale=1, center=(0, 0)):
    """Draw one circle per row of ``data`` on a folium map.

    Parameters
    ----------
    data : DataFrame
        Must contain ``center_lat`` and ``center_lon`` columns plus the
        columns named by ``name_key`` and ``radius_key``.
    name_key : str
        Column whose value is used as the circle's popup.
    radius_key : str
        Column whose value, converted to float and multiplied by ``scale``,
        is used as the circle's radius.
    zoom : int
        Initial zoom level of the map.
    scale : float
        Multiplier applied to every radius.
    center : sequence of two numbers
        Initial map location, passed to ``folium.Map`` as ``location``.

    Returns
    -------
    folium.Map
        The map with all circles added.
    """
    this_map = folium.Map(location=center, tiles="OpenStreetMap", zoom_start=zoom)
    # Walk the four needed columns in lockstep instead of building a row
    # Series with ``data.iloc[i]`` several times per row.
    rows = zip(
        data["center_lat"], data["center_lon"], data[name_key], data[radius_key]
    )
    for lat, lon, label, radius in rows:
        folium.Circle(
            location=[lat, lon],
            popup=label,
            radius=float(radius) * scale,
            fill_color="crimson",
        ).add_to(this_map)
    return this_map

# Visualize the size of POIs

## Top 50

In [None]:
%%time
plot_buble_map(geo_df.head(50).tail(50), "name", "dist_max", zoom=2, scale=0.001)

## Top 51-100

In [None]:
%%time
plot_buble_map(geo_df.head(100).tail(50), "name", "dist_max", zoom=2, scale=0.01)

## Top 101-200

In [None]:
%%time
plot_buble_map(geo_df.head(200).tail(100), "name", "dist_max", zoom=2, scale=0.01)

## Top 201-1000

In [None]:
%%time
plot_buble_map(geo_df.head(1000).tail(800), "name", "dist_max", zoom=2, scale=0.01)

## Top 1001-2000

In [None]:
%%time
plot_buble_map(geo_df.head(2000).tail(1000), "name", "dist_max", zoom=2, scale=0.01)

## Summary

* POIs which show correct location area
* POIs with too large area
    * The Museum Of Modern Art
    * Narita Airport
* POIs with too large area & also in far-away place  
    * London Heathrow Airport (should be in London)
    * Abu Dhabi International Airport (should be in UAE)

# Appendix

## Dubai International Airport located in "India"

In [None]:
geo_df.head(100).tail(50).query("name == 'Dubai International Airport (DXB)'")

In [None]:
train_ext.query("point_of_interest == 'P_8852e17b0feb24'")

## London Heathrow Airport in "Saudi Arabia"

In [None]:
train_ext.query("point_of_interest == 'P_350bed9dd5a213'")

## Dixie convenience stores in an abnormal straight line

Is this artificial noise?

In [None]:
%%time
plot_buble_map(geo_df.head(1000).tail(800), "name", "dist_max", zoom=6, scale=0.01, center=[4.672491e+01, 4.672471e+01])

In [None]:
train_ext.query("name == 'Дикси'")

## Unique countries per POI

In [None]:
counries_per_poi = train_ext.groupby("point_of_interest").agg(
    countries_per_poi=("country", "nunique")
)

In [None]:
duplicated = counries_per_poi.query("countries_per_poi > 1")

In [None]:
fig, ax = plt.subplots()
sns.histplot(counries_per_poi, ax=ax)
ax.set(yscale="log", title="number of countries per POI")
plt.show()

In [None]:
len(duplicated) / len(counries_per_poi)

Only 0.12% of POIs have an ambiguous country.