## Preprocessing of POI data
This script takes POI data with the columns `amenity, leisure, shop, tourism, geometry` and transforms it into clusters.  

In [1]:
import geopandas as gpd
import pandas as pd
import usecase
import pathlib
import usecase_helpers
import utility

Set file names and import options.

In [3]:
# Directory that holds every input file read by this notebook.
data_dir = pathlib.Path(".", "data")

# POI weights: semicolon-separated CSV that uses a decimal comma.
# NOTE(review): the 'mbcs' codec is only available on Windows, so this read
# raises LookupError on other platforms -- confirm the file's actual encoding
# (and pin it explicitly) before running the notebook elsewhere.
weight_path = pathlib.Path(data_dir, "poi_weights_combined.csv")
weights = pd.read_csv(weight_path, sep=';', decimal=',', encoding='mbcs')

# POI data (GeoPackage).
data_path = pathlib.Path(data_dir, "osm_poi.gpkg")
data = gpd.read_file(data_path)

# Boundaries (GeoPackage); filtered by their "ags_0" key in the next step.
boundaries_path = pathlib.Path(data_dir, "boundaries.gpkg")
boundaries = gpd.read_file(boundaries_path)

# Convert the weights table with the project helper for use in preprocessing.
weights_dict = utility.weights_to_dict(weights)

Filter out unwanted boundaries, i.e. those that are not well defined or that are too large. The latter includes entire Bundesländer (federal states), except for the city states Bremen, Berlin and Hamburg.

In [4]:
# Two-digit prefixes whose state-level key (prefix + six zeros) is discarded.
bad_bl = ["00", "01", "03", "05", "06", "07", "08", "09", "10", "12", "13", "14", "15", "16"]

# "--" is excluded as well, followed by one state-level key per prefix above.
bad_ags = ["--"] + [prefix + "000000" for prefix in bad_bl]

# Boolean mask of the rows to drop; keep everything that is not flagged.
b0 = boundaries["ags_0"].isin(bad_ags)
boundaries = boundaries[~b0]

Set clustering options.

In [5]:
# Clustering parameters handed to the preprocessing call below.
(max_radius, max_weight, increment_radius) = (200, 50, 20)

Run preprocessing.

In [None]:
# Cluster the POI data within the filtered boundaries using the options set above.
poi_cluster = usecase_helpers.preprocess_poi(
    data,
    boundaries,
    weights_dict,
    max_radius,
    max_weight,
    increment_radius,
)

In [7]:
# Inspect the raw clustering result (columns: geometry, potential, radius).
# Note that the displayed index labels repeat (several rows are labelled 0),
# i.e. the index of this frame is not unique.
poi_cluster

Unnamed: 0,geometry,potential,radius
0,POINT (4226601.471 3378146.784),60.0,60
1,POINT (4242106.413 3333273.481),100.0,20
2,POINT (4242267.333 3333185.375),80.0,20
3,POINT (4242011.674 3333325.381),80.0,20
4,POINT (4242043.058 3333308.155),80.0,20
...,...,...,...
0,POINT (4050702.503 3136461.616),7.0,200
0,POINT (4046663.597 3182216.625),8.5,200
0,POINT (4280005.961 3523447.665),8.0,200
0,POINT (4227191.979 3502270.189),4.5,200


In [8]:
# Same clusters, ordered by ascending radius, to see the smallest and largest radii.
poi_cluster.sort_values(by="radius")

Unnamed: 0,geometry,potential,radius
1258,POINT (4292547.218 2822228.951),72.0,20
7215,POINT (4556406.052 3269037.058),60.0,20
125,POINT (4225391.035 2865199.849),137.5,20
2065,POINT (4209768.626 3126665.453),51.0,20
7224,POINT (4554827.271 3270345.494),60.0,20
...,...,...,...
966,POINT (4354188.388 3144414.205),16.0,200
965,POINT (4345933.018 3146101.165),22.5,200
964,POINT (4355344.113 3144424.059),16.0,200
978,POINT (4343332.939 3134304.032),8.0,200


In [9]:
# Order the clusters by descending potential and give them a fresh 0..n-1 index.
# drop=True discards the old (non-unique) index instead of turning it into a
# leftover "index" column.
final_cluster = poi_cluster.sort_values("potential", ascending=False).reset_index(drop=True)

In [10]:
# Keep only the columns that are exported and discard rows with missing values.
output_columns = ["geometry", "potential", "radius"]
final_cluster = final_cluster[output_columns].dropna()
final_cluster

Unnamed: 0,geometry,potential,radius
0,POINT (4209935.675 2908889.905),1260.0,20
1,POINT (4375249.274 3258639.081),1260.0,20
2,POINT (4331769.430 3480372.272),1220.0,20
3,POINT (4202480.561 2947098.270),1220.0,20
4,POINT (4383198.427 2978397.280),1200.0,20
...,...,...,...
749392,POINT (4378035.942 3151998.282),1.5,200
749393,POINT (4509431.648 2858466.631),1.5,200
749394,POINT (4074002.151 3115557.584),1.5,200
749395,POINT (4178000.547 3134067.719),1.5,200


In [11]:
# Check the column dtypes of the clustering result.
# NOTE(review): the output reports `radius` as dtype object although the
# displayed values look like integers -- confirm whether it should be cast to
# a numeric dtype before the result is written to disk.
poi_cluster.dtypes

geometry     geometry
potential     float64
radius         object
dtype: object

In [12]:
# Write the final clusters to a GeoPackage inside the data directory.
result_path = pathlib.Path("data") / "poi_cluster.gpkg"
final_cluster.to_file(result_path, driver="GPKG")