### installs

In [1]:
# installs
# python3 -m pip install --upgrade pip
# python3 -m pip install numpy
# python3 -m pip install pandas
# python3 -m pip install shapely
# brew install gdal # fiona dependency
# python3 -m pip install fiona # geopandas dependency
# python3 -m pip install pyproj # geopandas dependency
# python3 -m pip install pygeos # geopandas dependency
# python3 -m pip install geopandas
# python3 -m pip install jupyter
# python3 -m pip install folium
# python3 -m pip install matplotlib
# python3 -m pip install seaborn

### imports

In [2]:
# imports
import pandas as pd
import numpy as np
import glob
import os
from pathlib import Path
import re
import time
import pickle
import shapely.wkt
from shapely.geometry import Point, Polygon
import geopandas as gpd
from geopandas.tools import sjoin



In [3]:
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)

# 1. Analyze the store visitation by date and affinity profile of store visitors.

## 1.a Resolve the user visits per store, i.e. filter the GPS signals through polygons.

### stores

In [4]:
stores = pd.read_csv("../../assignment_data/stores.csv")

In [5]:
stores.shape

(247, 3)

In [6]:
# check for duplicated rows
stores.duplicated().value_counts()
# there are no duplicates

False    247
dtype: int64

In [7]:
stores.head()

Unnamed: 0,store_id,store_name,wkt
0,place_1,McDonald's,POLYGON ((13.4611920000000005 52.4709870000000...
1,place_2,McDonald's,POLYGON ((13.4683480000000007 52.5471599999999...
2,place_3,McDonald's,POLYGON ((13.3128810000000009 52.4197929999999...
3,place_4,McDonald's,POLYGON ((13.3635780000000004 52.5606039999999...
4,place_5,McDonald's,POLYGON ((13.3230909999999998 52.5613460000000...


In [8]:
# parse every WKT string in column "wkt" into a shapely geometry object
stores["wkt"] = stores["wkt"].apply(shapely.wkt.loads)

In [9]:
# rename "wkt" to "geometry"
stores = stores.rename(columns={"wkt": "geometry"}) # must be geometry for the geopandas join

In [10]:
# type(stores["geometry"][0])

In [11]:
# transform stores into a GeoDataFrame
stores_gdf = gpd.GeoDataFrame(stores)

In [12]:
stores_gdf.head()

Unnamed: 0,store_id,store_name,geometry
0,place_1,McDonald's,"POLYGON ((13.46119 52.47099, 13.46103 52.47066..."
1,place_2,McDonald's,"POLYGON ((13.46835 52.54716, 13.46841 52.54716..."
2,place_3,McDonald's,"POLYGON ((13.31288 52.41979, 13.31303 52.41966..."
3,place_4,McDonald's,"POLYGON ((13.36358 52.56060, 13.36358 52.56061..."
4,place_5,McDonald's,"POLYGON ((13.32309 52.56135, 13.32307 52.56128..."


In [13]:
stores_gdf.shape

(247, 3)

In [14]:
type(stores_gdf)

geopandas.geodataframe.GeoDataFrame

In [15]:
# save stores_gdf as geojson for visualization in Tableau
stores_gdf.to_file("../out_data_sample/stores_gdf.geojson", driver="GeoJSON")

In [16]:
# save stores_gdf as pickle
stores_gdf.to_pickle("../out_data_sample/stores_gdf.pkl")

In [17]:
# save stores_gdf as pickle
# with open("../out_data_sample/stores_gdf.pickle", "wb") as pickle_file:
#     pickle.dump(stores_gdf, pickle_file, protocol=pickle.HIGHEST_PROTOCOL)

In [18]:
# check
# with open("../../out_data_sample/stores_gdf.pickle", "rb") as picke_file:
#     check_stores_gdf = pickle.load(picke_file)

In [19]:
# type(check_stores_gdf)

In [20]:
# check_stores_gdf.head()

### gps_signals

In [21]:
# read all gps_signal csv batches into one DataFrame
start = time.time()
print("Reading all gps_signal csv batches...")

# glob returns matches in arbitrary, filesystem-dependent order; sorting makes the
# row order of the combined frame reproducible between runs and machines
batch_paths = sorted(glob.glob("../../assignment_data/sample_data/*.csv"))
batches = [pd.read_csv(batch_path) for batch_path in batch_paths]  # , low_memory=False

# Concatenate once instead of growing the frame inside the loop: a concat per file
# copies all previously read rows again, which is quadratic in the number of rows.
# pd.concat raises on an empty list, so keep the empty-frame result when no file matches.
gps_signals = pd.concat(batches, ignore_index=True) if batches else pd.DataFrame()

end = time.time()
dt = end - start
print(f"Reading all gps_signal csv batches took {round(dt/60, 2)} minute(s).")

Reading all gps_signal csv batches...
Reading all gps_signal csv batches took 0.13 minute(s).


In [22]:
gps_signals.shape

(2262913, 4)

In [23]:
# check for duplicated rows
gps_signals.duplicated().value_counts()
# there are no duplicates

False    2262913
dtype: int64

In [24]:
# sort by "utc_timestamp" ascending
gps_signals = gps_signals.sort_values(by=["utc_timestamp"]).reset_index(drop=True)

In [25]:
# transform utc_timestamp into yyyy-mm-dd
# The column is cast to millisecond-resolution datetimes and then reduced to a daily
# pandas Period, so each signal keeps only its calendar day.
# NOTE(review): presumably the raw values are epoch milliseconds in UTC — confirm against the input files.
gps_signals["utc_timestamp"] = gps_signals["utc_timestamp"].astype("datetime64[ms]").dt.to_period("D")

In [26]:
# rename "utc_timestamp" to "date"
gps_signals = gps_signals.rename(columns={"utc_timestamp": "date"})

In [27]:
gps_signals.head()

Unnamed: 0,device_id,lat,lon,date
0,49129,52.55457,13.57401,2021-01-01
1,46744,52.419566,13.208267,2021-01-01
2,22268,52.568431,13.523719,2021-01-01
3,41158,52.49306,13.38074,2021-01-01
4,31401,52.517286,13.317811,2021-01-01


In [28]:
gps_signals.shape

(2262913, 4)

In [29]:
# build one POINT per signal (x = lon, y = lat) and wrap gps_signals in a GeoDataFrame
start = time.time()
print("Creating POINT from lat lon in gps_signals and transforming the df into a gdf...")

# points_from_xy expects x (longitude) first and y (latitude) second
signal_points = gpd.points_from_xy(gps_signals["lon"], gps_signals["lat"])
gps_signals_gdf = gpd.GeoDataFrame(gps_signals, geometry=signal_points)

end = time.time()
dt = end - start
print(f"Creating POINT from lat lon in gps_signals and transforming the df into a gdf took {round(dt/60, 2)} minute(s).")

Creating POINT from lat lon in gps_signals and transforming the df into a gdf...
Creating POINT from lat lon in gps_signals and transforming the df into a gdf took 0.02 minute(s).


In [30]:
gps_signals_gdf.head()

Unnamed: 0,device_id,lat,lon,date,geometry
0,49129,52.55457,13.57401,2021-01-01,POINT (13.57401 52.55457)
1,46744,52.419566,13.208267,2021-01-01,POINT (13.20827 52.41957)
2,22268,52.568431,13.523719,2021-01-01,POINT (13.52372 52.56843)
3,41158,52.49306,13.38074,2021-01-01,POINT (13.38074 52.49306)
4,31401,52.517286,13.317811,2021-01-01,POINT (13.31781 52.51729)


### spatial join gps_signals_gdf and stores_gdf

In [31]:
gps_signals_gdf.shape

(2262913, 5)

In [32]:
stores_gdf.shape

(247, 3)

In [33]:
# resolve store visits: keep only the GPS points that match a store polygon
start = time.time()
print("Joining gps_signals_gdf and stores_gdf...")

# inner join: signals without a matching store polygon are dropped from the result
gps_sig_and_stores = sjoin(
    gps_signals_gdf,
    stores_gdf,
    how="inner",
)

end = time.time()
dt = end - start
# report the same duration twice: once in minutes, once in seconds
print(f"Joining gps_signals_gdf and stores_gdf took {round(dt/60, 2)} minute(s).")
print(f"Joining gps_signals_gdf and stores_gdf took {round(dt, 2)} seconds.")

Joining gps_signals_gdf and stores_gdf...
Joining gps_signals_gdf and stores_gdf took 0.05 minute(s).
Joining gps_signals_gdf and stores_gdf took 2.78 seconds.


In [34]:
# reset index after the inner spatial join
gps_sig_and_stores = gps_sig_and_stores.reset_index(drop=True)

In [35]:
gps_sig_and_stores.shape

(2747, 8)

In [36]:
gps_sig_and_stores.head()

Unnamed: 0,device_id,lat,lon,date,geometry,index_right,store_id,store_name
0,41282,52.48534,13.36821,2021-01-01,POINT (13.36821 52.48534),237,place_238,Renault
1,78319,52.48539,13.368412,2021-01-05,POINT (13.36841 52.48539),237,place_238,Renault
2,7065,52.48532,13.36843,2021-01-07,POINT (13.36843 52.48532),237,place_238,Renault
3,78319,52.485324,13.368606,2021-01-11,POINT (13.36861 52.48532),237,place_238,Renault
4,38190,52.485382,13.368311,2021-01-13,POINT (13.36831 52.48538),237,place_238,Renault


In [37]:
gps_sig_and_stores.tail()

Unnamed: 0,device_id,lat,lon,date,geometry,index_right,store_id,store_name
2742,47868,52.44219,13.585334,2021-01-19,POINT (13.58533 52.44219),70,place_71,Aldi
2743,21441,52.442287,13.58551,2021-01-21,POINT (13.58551 52.44229),70,place_71,Aldi
2744,21441,52.442227,13.585457,2021-01-21,POINT (13.58546 52.44223),70,place_71,Aldi
2745,160071,52.51029,13.43308,2021-01-20,POINT (13.43308 52.51029),51,place_52,Subway
2746,25285,52.465107,13.49607,2021-01-21,POINT (13.49607 52.46511),77,place_78,Aldi


### users and user affinities

**users**

In [38]:
# create unique users by dropping each duplicated device_id
users = gps_signals[["device_id"]].drop_duplicates(subset=["device_id"]).reset_index(drop=True)

In [39]:
users.tail()

Unnamed: 0,device_id
94999,158319
95000,172360
95001,172344
95002,174085
95003,171878


In [40]:
# sort by device_id ascending
users = users.sort_values(by=["device_id"]).reset_index(drop=True)

In [41]:
users.shape

(95004, 1)

In [42]:
users.head()

Unnamed: 0,device_id
0,1
1,2
2,3
3,5
4,6


In [43]:
users.tail()

Unnamed: 0,device_id
94999,174401
95000,174402
95001,174405
95002,174406
95003,174410


**user affinities**

In [44]:
# locate the affinities folder relative to the current working directory
# (two levels above it) and list its files in alphabetical order
two_levels_up = Path(os.getcwd()).parents[1]
path = f"{two_levels_up}/assignment_data/affinities"
file_names = sorted(os.listdir(path))
# file_names

In [45]:
# add .csv after the file names (safe to re-run: files that already end in .csv are skipped)
# Every entry is checked on its own instead of looking only at file_names[0], so a
# partially renamed folder is completed without producing ".csv.csv" names, and an
# empty folder no longer raises an IndexError.
for file_name in file_names:
    # hidden entries (e.g. ".DS_Store") are not affinity files and are left alone
    if file_name.endswith(".csv") or file_name.startswith("."):
        continue
    os.rename(os.path.join(path, file_name), os.path.join(path, f"{file_name}.csv"))

In [46]:
# create a dictionary "user_affinities", key: value -> names-of-the-affinity: [lists-of-the-affinity-csv]
def list_from_affinities(aff_name: str, directory: str = "../../assignment_data/affinities") -> list:
    """Return the values of one user-affinity csv file as a list.

    Parameters
    ----------
    aff_name : str
        Affinity name, i.e. the csv file name without the ".csv" extension.
    directory : str, optional
        Folder that contains the affinity csv files. Defaults to the assignment's
        affinities folder, so existing one-argument calls behave as before.

    Returns
    -------
    list
        The values of the first column of "<directory>/<aff_name>.csv", in file order.
    """
    csv_path = os.path.join(directory, f"{aff_name}.csv")
    # the file is read without a header row; its single column is named after the affinity
    affinity_df = pd.read_csv(csv_path, header=None, names=[aff_name])
    return list(affinity_df.iloc[:, 0])  # all rows, first column

# map every affinity name to the list read from its csv file
user_affinities = {}

for file_name in file_names:
    # hidden entries (e.g. ".DS_Store") are not affinity files; splitting them at the
    # first dot would produce an empty affinity name
    if file_name.startswith("."):
        continue
    # strip only a trailing ".csv" so that names which themselves contain a dot are
    # kept intact (splitting at the first dot would truncate them)
    name = file_name[:-len(".csv")] if file_name.endswith(".csv") else file_name
    user_affinities[name] = list_from_affinities(name)

In [47]:
# user_affinities = {
#     "addidas": list_from_affinities("addidas"),
#     "bmw": list_from_affinities("bmw"),
#     "employed": list_from_affinities("employed"),
#     "honda": list_from_affinities("honda"),
#     "female": list_from_affinities("female"),
#     "apple": list_from_affinities("apple"),
#     "student": list_from_affinities("student"),
#     "low_income": list_from_affinities("low_income"),
#     "h_&_m": list_from_affinities("h_&_m"),
#     "middle_income": list_from_affinities("middle_income"),
#     "tommy_helfinger": list_from_affinities("tommy_helfinger"),
#     "job_seeking": list_from_affinities("job_seeking"),
#     "male": list_from_affinities("male"),
#     "high_income": list_from_affinities("high_income"),
#     "mercedes-benz": list_from_affinities("mercedes-benz"),
#     "retired": list_from_affinities("retired"), 
# }

In [48]:
user_affinities.keys()

dict_keys(['addidas', 'apple', 'bmw', 'employed', 'female', 'h_&_m', 'high_income', 'honda', 'job_seeking', 'low_income', 'male', 'mercedes-benz', 'middle_income', 'retired', 'student', 'tommy_helfinger'])

In [49]:
# len(user_affinities["addidas"])

In [50]:
# len(user_affinities["low_income"])

In [51]:
# len(user_affinities["retired"])

**add user_affinities to users**

In [52]:
# first trial with a single affinity ("addidas"): 1 if the device id is listed, else 0
addidas_mask = users["device_id"].isin(user_affinities["addidas"])
users["addidas"] = np.where(addidas_mask, 1, 0)

In [53]:
# users["addidas"].value_counts()

In [54]:
# add one 0/1 column per affinity: 1 if the device id is listed for that affinity
# Every key is processed, without skipping the first one by position: the skip only
# worked while "addidas" happened to be the first dictionary key. Recomputing an
# already existing column yields the same values and keeps its position.
for affinity_name, device_ids in user_affinities.items():
    users[affinity_name] = np.where(users["device_id"].isin(device_ids), 1, 0)

In [55]:
users.shape

(95004, 17)

In [56]:
users.head()

Unnamed: 0,device_id,addidas,apple,bmw,employed,female,h_&_m,high_income,honda,job_seeking,low_income,male,mercedes-benz,middle_income,retired,student,tommy_helfinger
0,1,0,0,0,0,0,0,0,0,0,0,1,0,1,0,0,0
1,2,0,0,0,0,0,0,0,1,0,0,1,0,1,0,0,0
2,3,0,0,0,0,0,0,0,0,0,0,1,0,1,0,0,0
3,5,0,0,0,0,0,0,0,1,0,1,1,0,0,0,0,0
4,6,0,0,0,0,0,0,0,0,1,0,1,0,1,0,0,0


In [57]:
# save users as csv
users.to_csv("../out_data_sample/users.csv", index=False)

### merge gps_sig_and_stores and users (affinities)

In [58]:
# merge gps_sig_and_stores and users (affinities)
gps_sig_and_stores.shape

(2747, 8)

In [59]:
gps_sig_and_stores.head()

Unnamed: 0,device_id,lat,lon,date,geometry,index_right,store_id,store_name
0,41282,52.48534,13.36821,2021-01-01,POINT (13.36821 52.48534),237,place_238,Renault
1,78319,52.48539,13.368412,2021-01-05,POINT (13.36841 52.48539),237,place_238,Renault
2,7065,52.48532,13.36843,2021-01-07,POINT (13.36843 52.48532),237,place_238,Renault
3,78319,52.485324,13.368606,2021-01-11,POINT (13.36861 52.48532),237,place_238,Renault
4,38190,52.485382,13.368311,2021-01-13,POINT (13.36831 52.48538),237,place_238,Renault


In [60]:
users.shape

(95004, 17)

In [61]:
users.head()

Unnamed: 0,device_id,addidas,apple,bmw,employed,female,h_&_m,high_income,honda,job_seeking,low_income,male,mercedes-benz,middle_income,retired,student,tommy_helfinger
0,1,0,0,0,0,0,0,0,0,0,0,1,0,1,0,0,0
1,2,0,0,0,0,0,0,0,1,0,0,1,0,1,0,0,0
2,3,0,0,0,0,0,0,0,0,0,0,1,0,1,0,0,0
3,5,0,0,0,0,0,0,0,1,0,1,1,0,0,0,0,0
4,6,0,0,0,0,0,0,0,0,1,0,1,0,1,0,0,0


In [62]:
# merge gps_sig_and_stores and users (affinities)
start = time.time()
print("Merging gps_sig_and_stores and users (affinities)...")

gpssig_stores_useraff = gps_sig_and_stores.merge(users, how="inner", on="device_id")

end = time.time()
dt = end - start
print(f"Merging gps_sig_and_stores and users (affinities) took {round(dt/60, 2)} minute(s).")

Merging gps_sig_and_stores and users (affinities)...
Merging gps_sig_and_stores and users (affinities) took 0.0 minute(s).


In [63]:
gpssig_stores_useraff.shape

(2747, 24)

In [64]:
gpssig_stores_useraff.head()

Unnamed: 0,device_id,lat,lon,date,geometry,index_right,store_id,store_name,addidas,apple,bmw,employed,female,h_&_m,high_income,honda,job_seeking,low_income,male,mercedes-benz,middle_income,retired,student,tommy_helfinger
0,41282,52.48534,13.36821,2021-01-01,POINT (13.36821 52.48534),237,place_238,Renault,0,0,0,0,0,0,0,0,0,0,1,0,1,0,0,0
1,78319,52.48539,13.368412,2021-01-05,POINT (13.36841 52.48539),237,place_238,Renault,0,0,0,0,0,0,1,1,0,0,1,0,0,0,0,0
2,78319,52.485324,13.368606,2021-01-11,POINT (13.36861 52.48532),237,place_238,Renault,0,0,0,0,0,0,1,1,0,0,1,0,0,0,0,0
3,78319,52.485392,13.368405,2021-01-18,POINT (13.36840 52.48539),237,place_238,Renault,0,0,0,0,0,0,1,1,0,0,1,0,0,0,0,0
4,78319,52.485362,13.368547,2021-01-19,POINT (13.36855 52.48536),237,place_238,Renault,0,0,0,0,0,0,1,1,0,0,1,0,0,0,0,0


### create users_in_stores for visualizing on a map

In [65]:
users_in_stores = gpssig_stores_useraff[["device_id", "lat", "lon", "date", "store_id", "store_name"]]

In [66]:
users_in_stores.head()

Unnamed: 0,device_id,lat,lon,date,store_id,store_name
0,41282,52.48534,13.36821,2021-01-01,place_238,Renault
1,78319,52.48539,13.368412,2021-01-05,place_238,Renault
2,78319,52.485324,13.368606,2021-01-11,place_238,Renault
3,78319,52.485392,13.368405,2021-01-18,place_238,Renault
4,78319,52.485362,13.368547,2021-01-19,place_238,Renault


In [67]:
users_in_stores.to_csv("../out_data_sample/users_in_stores.csv", index=False)

## 1.b Group the resolved visits by date (yyyy-mm-dd), store_name, and store_id.

## 1.c For each store_id/store_name/date provide the following metrics.

### 1.c.i A total number of GPS signals per place_id/date.

### 1.c.ii A total number of unique visitors (i.e. device ids).

In [68]:
# per date / store_name / store_id:
#   total_signals -> count of non-null "lat" values, i.e. the GPS signals in the group
#   unique_visits -> number of distinct device ids in the group
visit_keys = ["date", "store_name", "store_id"]
gsu_total_and_unique = (
    gpssig_stores_useraff
    .groupby(visit_keys)
    .agg(total_signals=("lat", "count"), unique_visits=("device_id", "nunique"))
    .reset_index()
)

In [69]:
gsu_total_and_unique.shape

(1199, 5)

In [70]:
gsu_total_and_unique.head()

Unnamed: 0,date,store_name,store_id,total_signals,unique_visits
0,2021-01-01,Aldi,place_64,1,1
1,2021-01-01,Burger King,place_42,4,1
2,2021-01-01,Burger King,place_43,1,1
3,2021-01-01,Burger King,place_50,1,1
4,2021-01-01,McDonald's,place_17,1,1


In [71]:
gsu_total_and_unique.tail()

Unnamed: 0,date,store_name,store_id,total_signals,unique_visits
1194,2021-01-21,Rewe,place_199,1,1
1195,2021-01-21,Rewe,place_200,1,1
1196,2021-01-21,Rewe,place_203,2,2
1197,2021-01-21,Rewe,place_209,3,2
1198,2021-01-21,Rewe,place_210,4,2


### 1.c.iii A total number of unique visitors belonging to each affinity group.

In [72]:
# total number of unique visitors belonging to each affinity group
start = time.time()
print("Creating total number of unique visitors belonging to each affinity group...")

gsu_unique_aff = (gpssig_stores_useraff
                  .drop(["lat", "lon", "geometry", "index_right"], axis=1)
                  # keep one row per device and date/store so every visitor is counted once
                  .drop_duplicates(subset=["date", "store_name", "store_id", "device_id"])
                  # device_id was only needed for the de-duplication; summing ids is meaningless
                  .drop(["device_id"], axis=1)
                  .groupby(by=["date", "store_name", "store_id"])
                  # summing the 0/1 affinity flags gives the number of unique visitors per affinity;
                  # .sum() replaces .agg(sum), because passing the Python builtin is deprecated in recent pandas
                  .sum()
                  .reset_index())

end = time.time()
dt = end - start
print(f"Create total number of unique visitors belonging to each affinity group took {round(dt/60, 2)} minutes.")

Creating total number of unique visitors belonging to each affinity group...
Create total number of unique visitors belonging to each affinity group took 0.0 minutes.


In [73]:
gsu_unique_aff.shape

(1199, 19)

In [74]:
gsu_unique_aff.head() # gsu_unique_aff

Unnamed: 0,date,store_name,store_id,addidas,apple,bmw,employed,female,h_&_m,high_income,honda,job_seeking,low_income,male,mercedes-benz,middle_income,retired,student,tommy_helfinger
0,2021-01-01,Aldi,place_64,0,0,0,0,0,0,0,0,0,1,1,0,0,0,0,0
1,2021-01-01,Burger King,place_42,0,0,0,0,0,0,1,0,0,0,1,0,0,0,1,0
2,2021-01-01,Burger King,place_43,0,0,0,0,0,0,0,0,0,0,1,0,1,0,0,0
3,2021-01-01,Burger King,place_50,0,0,0,0,0,0,0,1,0,0,1,0,1,0,0,0
4,2021-01-01,McDonald's,place_17,0,0,0,0,0,0,0,0,0,0,1,0,1,0,0,0


In [75]:
# delete unnecessary columns
gsu_unique_aff = gsu_unique_aff.drop(["date", "store_name", "store_id"], axis=1)

In [76]:
gsu_unique_aff.head()

Unnamed: 0,addidas,apple,bmw,employed,female,h_&_m,high_income,honda,job_seeking,low_income,male,mercedes-benz,middle_income,retired,student,tommy_helfinger
0,0,0,0,0,0,0,0,0,0,1,1,0,0,0,0,0
1,0,0,0,0,0,0,1,0,0,0,1,0,0,0,1,0
2,0,0,0,0,0,0,0,0,0,0,1,0,1,0,0,0
3,0,0,0,0,0,0,0,1,0,0,1,0,1,0,0,0
4,0,0,0,0,0,0,0,0,0,0,1,0,1,0,0,0


### concat the gsu_total_and_unique and gsu_unique_aff

In [77]:
# concat the gsu_total_and_unique and gsu_unique_aff
# pd.concat(axis=1) aligns rows by index label, not by date/store keys. Both frames must
# therefore carry the same index; otherwise metrics would be attached to the wrong
# store/date or padded with NaN without any error. Fail loudly instead.
assert gsu_total_and_unique.index.equals(gsu_unique_aff.index), (
    "gsu_total_and_unique and gsu_unique_aff are not row-aligned"
)
final_df = pd.concat([gsu_total_and_unique, gsu_unique_aff], axis=1)

In [78]:
final_df.shape

(1199, 21)

In [79]:
final_df.head()

Unnamed: 0,date,store_name,store_id,total_signals,unique_visits,addidas,apple,bmw,employed,female,h_&_m,high_income,honda,job_seeking,low_income,male,mercedes-benz,middle_income,retired,student,tommy_helfinger
0,2021-01-01,Aldi,place_64,1,1,0,0,0,0,0,0,0,0,0,1,1,0,0,0,0,0
1,2021-01-01,Burger King,place_42,4,1,0,0,0,0,0,0,1,0,0,0,1,0,0,0,1,0
2,2021-01-01,Burger King,place_43,1,1,0,0,0,0,0,0,0,0,0,0,1,0,1,0,0,0
3,2021-01-01,Burger King,place_50,1,1,0,0,0,0,0,0,0,1,0,0,1,0,1,0,0,0
4,2021-01-01,McDonald's,place_17,1,1,0,0,0,0,0,0,0,0,0,0,1,0,1,0,0,0


In [80]:
final_df.tail()

Unnamed: 0,date,store_name,store_id,total_signals,unique_visits,addidas,apple,bmw,employed,female,h_&_m,high_income,honda,job_seeking,low_income,male,mercedes-benz,middle_income,retired,student,tommy_helfinger
1194,2021-01-21,Rewe,place_199,1,1,0,0,0,0,0,0,0,0,0,1,1,0,0,0,0,0
1195,2021-01-21,Rewe,place_200,1,1,0,0,0,0,0,0,0,1,1,0,1,0,1,0,0,0
1196,2021-01-21,Rewe,place_203,2,2,1,0,0,0,0,1,0,0,0,1,2,0,1,0,0,0
1197,2021-01-21,Rewe,place_209,3,2,0,0,0,0,0,0,0,0,0,1,2,0,1,0,0,0
1198,2021-01-21,Rewe,place_210,4,2,0,0,1,0,0,0,0,0,0,0,2,0,2,0,0,0


In [81]:
# save final df as csv
final_df.to_csv("../out_data_sample/analysis.csv", index=False)
print("final_df saved as csv")

final_df saved as csv


In [82]:
# Took about 20 seconds