### installs

In [1]:
# installs
# pip3 install numpy
# pip3 install pandas
# pip3 install shapely
# brew install gdal # fiona dependency
# pip3 install fiona # geopandas dependency
# pip3 install pyproj # geopandas dependency
# pip3 install pygeos # geopandas dependency
# pip3 install geopandas
# pip3 install folium
# pip3 install matplotlib
# pip3 install seaborn
# python -m pip install "dask[complete]"
# IT DOES NOT WORK
# pip3 install git+git://github.com/jsignell/dask-geopandas.git


# TRY IT WITH A CONDA ENVIRONMENT
# conda create --name adsquare_conda
# conda activate adsquare_conda
# conda install python=3.9
# conda install -c anaconda jupyter
# conda install -c anaconda numpy
# conda install -c anaconda pandas
# conda install -c conda-forge shapely
# conda install -c conda-forge geopandas
# conda install -c conda-forge folium
# conda install -c conda-forge matplotlib
# conda install -c anaconda seaborn -> this didn't work -> pip3 install seaborn
# conda install -c conda-forge dask
# pip install git+git://github.com/jsignell/dask-geopandas.git

### imports

In [65]:
# imports
import pandas as pd
import numpy as np
import glob
import os
from pathlib import Path
import re
import time
import pickle
import shapely.wkt
from shapely.geometry import Point, Polygon
import geopandas # as gpd
from geopandas.tools import sjoin

import dask.dataframe # as ddf
import dask_geopandas # as dgpd

In [3]:
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)

# 1. Analyze the store visitation by date and affinity profile of store visitors.

## 1.a Resolve the user visits per store, i.e. filter the GPS signals through polygons.

### abbreviations
**df** = Pandas DataFrame \
**gdf** = Geopandas GeoDataFrame \
**ddf** = Dask DataFrame \
**dgdf** = Dask GeoDataFrame

### stores

In [36]:
stores = pd.read_csv("../../assignment_data/stores.csv")

In [37]:
# transform strings to polygons in column "wkt"
stores["wkt"] = stores["wkt"].apply(lambda x: shapely.wkt.loads(x))

In [38]:
# rename "wkt" to "geometry"
stores = stores.rename(columns={"wkt": "geometry"}) # must be geometry for the geopandas join

In [39]:
# transform stores into a GeoDataFrame
stores_gdf = geopandas.GeoDataFrame(stores)

In [40]:
stores_gdf.head()

Unnamed: 0,store_id,store_name,geometry
0,place_1,McDonald's,"POLYGON ((13.46119 52.47099, 13.46103 52.47066..."
1,place_2,McDonald's,"POLYGON ((13.46835 52.54716, 13.46841 52.54716..."
2,place_3,McDonald's,"POLYGON ((13.31288 52.41979, 13.31303 52.41966..."
3,place_4,McDonald's,"POLYGON ((13.36358 52.56060, 13.36358 52.56061..."
4,place_5,McDonald's,"POLYGON ((13.32309 52.56135, 13.32307 52.56128..."


In [41]:
type(stores_gdf) # geopandas.geodataframe.GeoDataFrame

geopandas.geodataframe.GeoDataFrame

In [44]:
# ddf = dask_geopandas.from_dask_dataframe(ddf)
# stores_gdf = dg.from_dask_dataframe(stores_gdf)
stores_dgdf = dask.dataframe.from_pandas(stores_gdf, npartitions=3)

In [45]:
type(stores_dgdf) # dask_geopandas.core.GeoDataFrame

dask_geopandas.core.GeoDataFrame

In [47]:
# save as pickle
with open("../out_data/stores_dgdf.pickle", "wb") as pickle_file:
    pickle.dump(stores_dgdf, pickle_file, protocol=pickle.HIGHEST_PROTOCOL)

In [48]:
# check
with open("../out_data/stores_dgdf.pickle", "rb") as pickle_file:
    check_stores_dgdf = pickle.load(pickle_file)

In [49]:
type(check_stores_dgdf) # dask_geopandas.core.GeoDataFrame

dask_geopandas.core.GeoDataFrame

### gps_signals

In [50]:
# read all gps_signal csv batches with Dask
signals_ddf = dask.dataframe.read_csv("../../assignment_data/full_data/*.csv")

In [12]:
# shape of signals_ddf
a = signals_ddf.shape
%time a[0].compute(), a[1]

CPU times: user 43.8 s, sys: 9.17 s, total: 53 s
Wall time: 25.1 s


(56572824, 4)

In [51]:
signals_ddf.head()

Unnamed: 0,device_id,lat,lon,utc_timestamp
0,1,52.67649,13.31543,1609459727000
1,1,52.67649,13.31542,1609460036000
2,2,52.57837,13.58217,1609459241000
3,3,52.58373,13.33309,1609459502000
4,1,52.6765,13.31548,1609460387000


In [52]:
# sort by "utc_timestamp" ascending
# %time signals["utc_timestamp"].nsmallest(5).compute()

In [53]:
# transform utc_timestamp into yyyy-mm-dd
%time signals_ddf["utc_timestamp"] = signals_ddf["utc_timestamp"].astype("datetime64[ms]").dt.to_period("D")

CPU times: user 15 ms, sys: 623 µs, total: 15.6 ms
Wall time: 17.7 ms


In [54]:
# rename "utc_timestamp" to "date"
%time signals_ddf = signals_ddf.rename(columns={"utc_timestamp": "date"})

CPU times: user 5.88 ms, sys: 240 µs, total: 6.12 ms
Wall time: 5.96 ms


In [55]:
signals_ddf.head()

Unnamed: 0,device_id,lat,lon,date
0,1,52.67649,13.31543,2021-01-01
1,1,52.67649,13.31542,2021-01-01
2,2,52.57837,13.58217,2021-01-01
3,3,52.58373,13.33309,2021-01-01
4,1,52.6765,13.31548,2021-01-01


In [56]:
type(signals_ddf) # dask.dataframe.core.DataFrame

dask.dataframe.core.DataFrame

In [57]:
# from Dask DataFrame create Dask GeoPandas DataFrame
signals_dgdf = dask_geopandas.from_dask_dataframe(signals_ddf) # npartitions=4 ???

In [58]:
type(signals_dgdf) # dask_geopandas.core.GeoDataFrame

dask_geopandas.core.GeoDataFrame

https://blog.dask.org/2017/09/21/accelerating-geopandas-1 \
https://github.com/jsignell/dask-geopandas

In [59]:
# create POINT from lat lon and transform df into gdf
start = time.time()
print("Creating POINT from lat lon in signals_dgdf...")

signals_dgdf = signals_dgdf.set_geometry(dask_geopandas.points_from_xy(signals_dgdf, 'lat', 'lon'))
                          
end = time.time()
dt = end - start
print(f"The above task took {round(dt/60, 2)} minute(s).")

Creating POINT from lat lon in gps_signals and transforming the df into a gdf...
Creating POINT from lat lon in gps_signals and transforming the df into a gdf took 0.0 minute(s).


  a = np.array(


In [60]:
# %time signals_dgdf.head()

### join (merge) signals_dgdf and stores_dgdf

In [67]:
type(signals_dgdf)

dask_geopandas.core.GeoDataFrame

In [68]:
type(stores_dgdf)

dask_geopandas.core.GeoDataFrame

# HERE I STUCKED

In [72]:
# join signals_dgdf and stores_dgdf
start = time.time()
print("Joining signals_dgdf and stores_dgdf...")

signals_stores_dgdf = dask_geopandas.sjoin(signals_dgdf, stores_dgdf, how="inner")
# gf = dg.sjoin(gf, zones[['zone', 'borough', 'geometry']])

end = time.time()
dt = end - start
print(f"The above task took {round(dt/60, 2)} minute(s).")

Joining gps_signals_gdf and stores_gdf...


AttributeError: module 'dask_geopandas' has no attribute 'sjoin'

In [None]:
# shape of signals_stores_dgdf
a = signals_stores_dgdf.shape
%time a[0].compute(), a[1]

In [None]:
signals_stores_dgdf.head()