### installs

In [1]:
# installs
# pip3 install numpy
# pip3 install pandas
# pip3 install shapely
# brew install gdal # fiona dependency
# pip3 install fiona # geopandas dependency
# pip3 install pyproj # geopandas dependency
# pip3 install pygeos # geopandas dependency
# pip3 install geopandas
# pip3 install folium
# pip3 install matplotlib
# pip3 install seaborn

# python -m pip install "dask[complete]"
# pip3 install git+git://github.com/jsignell/dask-geopandas.git

# conda create --name adsquare_conda
# conda activate adsquare_conda
# conda install python=3.9
# conda install -c anaconda jupyter
# conda install -c anaconda numpy
# conda install -c anaconda pandas
# conda install -c conda-forge shapely
# conda install -c conda-forge geopandas
# conda install -c conda-forge folium
# conda install -c conda-forge matplotlib
# conda install -c anaconda seaborn -> this didn't work -> pip3 install seaborn
# conda install -c conda-forge dask
# pip install git+git://github.com/jsignell/dask-geopandas.git

### imports

In [65]:
# imports
import pandas as pd
import numpy as np
import glob
import os
from pathlib import Path
import re
import time
import pickle
import shapely.wkt
from shapely.geometry import Point, Polygon
import geopandas as gpd
from geopandas.tools import sjoin

import dask.dataframe as dd
import dask_geopandas as dg

In [3]:
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)

# 1. Analyze the store visitation by date and affinity profile of store visitors.

## 1.a Resolve the user visits per store, i.e. filter the GPS signals through polygons.

### stores

In [36]:
stores = pd.read_csv("../../assignment_data/stores.csv")

In [37]:
# transform strings to polygons in column "wkt"
stores["wkt"] = stores["wkt"].apply(lambda x: shapely.wkt.loads(x))

In [38]:
# rename "wkt" to "geometry"
stores = stores.rename(columns={"wkt": "geometry"}) # must be geometry for the geopandas join

In [39]:
# transform stores into a GeoDataFrame
stores_gdf = gpd.GeoDataFrame(stores)

In [40]:
stores_gdf.head()

Unnamed: 0,store_id,store_name,geometry
0,place_1,McDonald's,"POLYGON ((13.46119 52.47099, 13.46103 52.47066..."
1,place_2,McDonald's,"POLYGON ((13.46835 52.54716, 13.46841 52.54716..."
2,place_3,McDonald's,"POLYGON ((13.31288 52.41979, 13.31303 52.41966..."
3,place_4,McDonald's,"POLYGON ((13.36358 52.56060, 13.36358 52.56061..."
4,place_5,McDonald's,"POLYGON ((13.32309 52.56135, 13.32307 52.56128..."


In [41]:
type(stores_gdf) # geopandas.geodataframe.GeoDataFrame

geopandas.geodataframe.GeoDataFrame

In [44]:
# ddf = dask_geopandas.from_dask_dataframe(ddf)
# stores_gdf = dg.from_dask_dataframe(stores_gdf)
stores_gdf = dd.from_pandas(stores_gdf, npartitions=3)

In [45]:
type(stores_gdf) # dask_geopandas.core.GeoDataFrame

dask_geopandas.core.GeoDataFrame

In [47]:
# save as pickle
with open("../out_data/analysis.pickle", "wb") as pickle_file:
    pickle.dump(stores_gdf, pickle_file, protocol=pickle.HIGHEST_PROTOCOL)

In [48]:
# check
with open("../out_data/analysis.pickle", "rb") as pickle_file:
    check_stores_gdf = pickle.load(pickle_file)

In [49]:
type(check_stores_gdf) # dask_geopandas.core.GeoDataFrame

dask_geopandas.core.GeoDataFrame

### gps_signals

In [50]:
# read all gps_signal csv batches with Dask
gps_signals = dd.read_csv("../../assignment_data/full_data/*.csv")

In [12]:
# shape of gps signals
a = gps_signals.shape
%time a[0].compute(), a[1]

CPU times: user 43.8 s, sys: 9.17 s, total: 53 s
Wall time: 25.1 s


(56572824, 4)

In [51]:
gps_signals.head()

Unnamed: 0,device_id,lat,lon,utc_timestamp
0,1,52.67649,13.31543,1609459727000
1,1,52.67649,13.31542,1609460036000
2,2,52.57837,13.58217,1609459241000
3,3,52.58373,13.33309,1609459502000
4,1,52.6765,13.31548,1609460387000


In [52]:
# sort by "utc_timestamp" ascending
# %time gps_signals["utc_timestamp"].nsmallest(5).compute()

In [53]:
# transform utc_timestamp into yyyy-mm-dd
%time gps_signals["utc_timestamp"] = gps_signals["utc_timestamp"].astype("datetime64[ms]").dt.to_period("D")

CPU times: user 15 ms, sys: 623 µs, total: 15.6 ms
Wall time: 17.7 ms


In [54]:
# rename "utc_timestamp" to "date"
%time gps_signals = gps_signals.rename(columns={"utc_timestamp": "date"})

CPU times: user 5.88 ms, sys: 240 µs, total: 6.12 ms
Wall time: 5.96 ms


In [55]:
gps_signals.head()

Unnamed: 0,device_id,lat,lon,date
0,1,52.67649,13.31543,2021-01-01
1,1,52.67649,13.31542,2021-01-01
2,2,52.57837,13.58217,2021-01-01
3,3,52.58373,13.33309,2021-01-01
4,1,52.6765,13.31548,2021-01-01


In [56]:
type(gps_signals) # dask.dataframe.core.DataFrame

dask.dataframe.core.DataFrame

In [57]:
# from Dask DataFrame create Dask GeoPandas DataFrame
gps_signals = dg.from_dask_dataframe(gps_signals)

In [58]:
type(gps_signals) # dask_geopandas.core.GeoDataFrame

dask_geopandas.core.GeoDataFrame

https://blog.dask.org/2017/09/21/accelerating-geopandas-1 \
https://github.com/jsignell/dask-geopandas

In [59]:
# create POINT from lat lon and transform df into gdf
start = time.time()
print("Creating POINT from lat lon in gps_signals and transforming the df into a gdf...")

gps_signals = gps_signals.set_geometry(dg.points_from_xy(gps_signals, 'lat', 'lon'))
                          
end = time.time()
dt = end - start
print(f"Creating POINT from lat lon in gps_signals and transforming the df into a gdf took {round(dt/60, 2)} minute(s).")

Creating POINT from lat lon in gps_signals and transforming the df into a gdf...
Creating POINT from lat lon in gps_signals and transforming the df into a gdf took 0.0 minute(s).


  a = np.array(


In [60]:
# %time gps_signals.head()

### join (merge) gps_signals and stores_gdf

In [61]:
# stores_gdf.shape

In [67]:
type(gps_signals)

dask_geopandas.core.GeoDataFrame

In [68]:
type(stores_gdf)

dask_geopandas.core.GeoDataFrame

# HERE I STUCKED

In [72]:
# join gps_signals_gdf and stores_gdf
start = time.time()
print("Joining gps_signals_gdf and stores_gdf...")

gps_sig_and_stores = dg.sjoin(gps_signals, stores_gdf, how="inner")
# gf = dg.sjoin(gf, zones[['zone', 'borough', 'geometry']])

end = time.time()
dt = end - start
print(f"Joining gps_signals_gdf and stores_gdf took {round(dt/60, 2)} minute(s).")

Joining gps_signals_gdf and stores_gdf...


AttributeError: module 'dask_geopandas' has no attribute 'sjoin'

In [None]:
# gps_sig_and_stores.shape

In [None]:
gps_sig_and_stores.head()

In [None]:
# gps_sig_and_stores_gdf[gps_sig_and_stores_gdf["store_id"] == "place_1"].head()

### users and user affinities

**users**

In [71]:
# create a list from the unique device ids
unique_device_ids = list(gps_signals["device_id"].unique())

KeyboardInterrupt: 

In [None]:
len(unique_device_ids)

In [None]:
# create a dataframe "users"
data = {
    "device_id": unique_device_ids
}

users = pd.DataFrame(data)

In [None]:
# users.shape

In [None]:
users = users.sort_values(by=["device_id"]).reset_index(drop=True)

In [None]:
# users.head()

In [None]:
# users.tail()

**user affinities**

In [None]:
# list the files in the user_affinities folder
path = os.getcwd()
# path = "/Users/robertbozsik/techtest/adsquare/assignment_data/affinities"
path = f"{str(Path(path).parents[1])}/assignment_data/affinities"
file_names = os.listdir(path)
file_names = sorted(file_names)
# file_names

In [None]:
# add .csv after the file names (it should be run only once)
if ".csv" not in file_names[0]:
    for index, file_name in enumerate(file_names):
        os.rename(os.path.join(path, file_name), os.path.join(path, "".join([file_name, ".csv"])))

In [None]:
# create a dictionary "user_affinities"
# the keys should be the names of the affinities
# the values should be lists of the affinity csvs
def list_from_affinities(aff_name: str) -> list:
    """Return a list created from the given user_affinity csv file"""
    aff_name = pd.read_csv(f"../../assignment_data/user_affinities/{aff_name}.csv", header=None, names=[aff_name])
    aff_name = list(aff_name.iloc[:, 0]) # all rows, first column
    return aff_name

user_affinities = {}

for file_name in file_names:
    name = file_name.split(".")[0]
    user_affinities[name] = list_from_affinities(name)

In [None]:
user_affinities.keys()

In [None]:
# len(user_affinities["addidas"])

In [None]:
# len(user_affinities["low_income"])

In [None]:
# len(user_affinities["retired"])

**add user_affinities to users**

In [None]:
# try it out with the affinity "addidas"
users["addidas"] = np.where(users["device_id"].isin(user_affinities["addidas"]), 1, 0)

In [None]:
# users["addidas"].value_counts()

In [None]:
for index, key in enumerate(user_affinities.keys()):
    if index > 0: # the column "addidas" has already been created
        users[key] = np.where(users["device_id"].isin(user_affinities[key]), 1, 0)

In [None]:
# users.shape

In [None]:
users.head()

In [None]:
# save users as csv
users.to_csv("../out_data/users.csv", index=False)

### merge gps_sig_and_stores and users (affinities)

In [None]:
# merge gps_sig_and_stores and users (affinities)
gps_sig_and_stores.shape

In [None]:
users.shape

In [None]:
# merge gps_sig_and_stores and users (affinities)
start = time.time()
print("Merging gps_sig_and_stores and users (affinities)...")

gpssig_stores_useraff = gps_sig_and_stores.merge(users, how="inner", on="device_id")

end = time.time()
dt = end - start
print(f"Merging gps_sig_and_stores and users (affinities) took {round(dt/60, 2)} minute(s).")

In [None]:
gpssig_stores_useraff.shape

In [None]:
gpssig_stores_useraff.head()

## 1.b Group the resolved visits by date (yyyy-mm-dd), store_name, and store_id.

## 1.c For each store_id/store_name/date provide the following metric.

### 1.c.i A total number of GPS signals per place_id/date.

### 1.c.ii A total number of unique visitors (i.e. device ids).

In [None]:
# create total number of GPS signals per place_id/date
# and create total number of unique visitors
gsu_total_and_unique = (gpssig_stores_useraff
                        .groupby(by=["date", "store_name", "store_id"])
                        .agg({"lat": "count", "device_id": "nunique"})
                        .rename(columns={"lat": "total_signals", "device_id": "unique_visits"})
                        .reset_index())

In [None]:
gsu_total_and_unique.shape

In [None]:
gsu_total_and_unique.head()

### 1.c.iii A total number of unique visitors belonging to each affinity group.

In [None]:
# drop duplicated date, store_name, store_id and device_id 
# in order to get unique visitors belonging to each affinity group
# and group by data, store_name and store_id
start = time.time()
print("Creating total number of unique visitors belonging to each affinity group...")

gsu_unique_aff = (gpssig_stores_useraff
                  .drop(["geometry", "index_right"], axis=1)
                  .drop_duplicates(subset=["date", "store_name", "store_id", "device_id"])
                  .groupby(by=["date", "store_name", "store_id"])
                  .agg(sum)
                  .reset_index())

end = time.time()
dt = end - start
print(f"Create total number of unique visitors belonging to each affinity group took {round(dt/60, 2)} minutes.")

In [None]:
gsu_unique_aff.shape

In [None]:
# delete unnecessary columns
gsu_unique_aff = gsu_unique_aff.drop(["date", "store_name", "store_id"], axis=1)

In [None]:
gsu_unique_aff.head()

### concat the gsu_total_and_unique and gsu_unique_aff

In [None]:
# concat the gsu_total_and_unique and gsu_unique_aff
final_df = pd.concat([gsu_total_and_unique, gsu_unique_aff], axis=1)

In [None]:
final_df.shape

In [None]:
final_df.head()

In [None]:
# save final df as csv
final_df.to_csv("../out_data/analysis.csv", index=False)
print("final_df saved as csv")