### installs

In [1]:
# installs
# pip3 install numpy
# pip3 install pandas
# pip3 install shapely
# brew install gdal # fiona dependency
# pip3 install fiona # geopandas dependency
# pip3 install pyproj # geopandas dependency
# pip3 install pygeos # geopandas dependency
# pip3 install geopandas
# pip3 install folium
# pip3 install matplotlib
# pip3 install seaborn

### imports

In [1]:
# imports
import pandas as pd
import numpy as np
import glob
import os
from pathlib import Path
import re
import time
import pickle
import shapely.wkt
from shapely.geometry import Point, Polygon
import geopandas as gpd
from geopandas.tools import sjoin



In [2]:
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)

# 1. Analyze the store visitation by date and affinity profile of store visitors.

## 1.a Resolve the user visits per store, i.e. filter the GPS signals through polygons.

### stores

In [3]:
stores = pd.read_csv("../../assignment_data/stores.csv")

In [4]:
# stores.shape

In [5]:
# stores.head()

In [6]:
# transform strings to polygons in column "wkt"
stores["wkt"] = stores["wkt"].apply(lambda x: shapely.wkt.loads(x))

In [7]:
# rename "wkt" to "geometry"
stores = stores.rename(columns={"wkt": "geometry"}) # must be geometry for the geopandas join

In [8]:
# type(stores["geometry"][0])

In [9]:
# transform stores into a GeoDataFrame
stores_gdf = gpd.GeoDataFrame(stores)

In [10]:
stores_gdf.head()

Unnamed: 0,store_id,store_name,geometry
0,place_1,McDonald's,"POLYGON ((13.46119 52.47099, 13.46103 52.47066..."
1,place_2,McDonald's,"POLYGON ((13.46835 52.54716, 13.46841 52.54716..."
2,place_3,McDonald's,"POLYGON ((13.31288 52.41979, 13.31303 52.41966..."
3,place_4,McDonald's,"POLYGON ((13.36358 52.56060, 13.36358 52.56061..."
4,place_5,McDonald's,"POLYGON ((13.32309 52.56135, 13.32307 52.56128..."


In [11]:
# type(stores_gdf)

In [13]:
# save stores_gdf as geojson for visualization in Tableau
stores_gdf.to_file("../out_data/stores_gdf.geojson", driver="GeoJSON")

In [14]:
# save stores_gdf as pickle
with open("../out_data/stores_gdf.pickle", "wb") as pickle_file:
    pickle.dump(stores_gdf, pickle_file, protocol=pickle.HIGHEST_PROTOCOL)

In [15]:
# check
# with open("../../out_data/stores_gdf.pickle", "rb") as picke_file:
#     check_stores_gdf = pickle.load(picke_file)

In [16]:
# type(check_stores_gdf)

In [17]:
# check_stores_gdf.head()

### gps_signals

In [18]:
# read all gps_signal csv batches
gps_signals = pd.DataFrame()

start = time.time()
print("Reading all gps_signal csv batches...")

for file_name in glob.glob("../../assignment_data/sample_data/*.csv"):
    batch = pd.read_csv(file_name) # , low_memory=False
    gps_signals = pd.concat([gps_signals, batch], ignore_index=True)
    
end = time.time()
dt = end - start
print(f"Reading all gps_signal csv batches took {round(dt/60, 2)} minute(s).")

Reading all gps_signal csv batches...
Reading all gps_signal csv batches took 0.1 minute(s).


In [19]:
# gps_signals.shape

In [20]:
# sort by "utc_timestamp" ascending
gps_signals = gps_signals.sort_values(by=["utc_timestamp"]).reset_index(drop=True)

In [21]:
# transform utc_timestamp into yyyy-mm-dd
gps_signals["utc_timestamp"] = gps_signals["utc_timestamp"].astype("datetime64[ms]").dt.to_period("D")

In [22]:
# rename "utc_timestamp" to "date"
gps_signals = gps_signals.rename(columns={"utc_timestamp": "date"})

In [23]:
# gps_signals.head()

In [24]:
# create POINT from lat lon and transform df into gdf
start = time.time()
print("Creating POINT from lat lon in gps_signals and transforming the df into a gdf...")

gps_signals_gdf = gpd.GeoDataFrame(gps_signals, 
                                   geometry=gpd.points_from_xy(gps_signals["lon"], gps_signals["lat"]))

end = time.time()
dt = end - start
print(f"Creating POINT from lat lon in gps_signals and transforming the df into a gdf took {round(dt/60, 2)} minute(s).")

Creating POINT from lat lon in gps_signals and transforming the df into a gdf...
Creating POINT from lat lon in gps_signals and transforming the df into a gdf took 0.58 minute(s).


In [25]:
gps_signals_gdf.head()

Unnamed: 0,device_id,lat,lon,date,geometry
0,49129,52.55457,13.57401,2021-01-01,POINT (13.57401 52.55457)
1,46744,52.419566,13.208267,2021-01-01,POINT (13.20827 52.41957)
2,22268,52.568431,13.523719,2021-01-01,POINT (13.52372 52.56843)
3,41158,52.49306,13.38074,2021-01-01,POINT (13.38074 52.49306)
4,31401,52.517286,13.317811,2021-01-01,POINT (13.31781 52.51729)


### join (merge) gps_signals_gdf and stores_gdf

In [26]:
# gps_signals_gdf.shape

In [27]:
# stores_gdf.shape

In [28]:
# join gps_signals_gdf and stores_gdf
start = time.time()
print("Joining gps_signals_gdf and stores_gdf...")

gps_sig_and_stores = sjoin(gps_signals_gdf, stores_gdf, how="left")

end = time.time()
dt = end - start
print(f"Joining gps_signals_gdf and stores_gdf took {round(dt/60, 2)} minute(s).")

Joining gps_signals_gdf and stores_gdf...
Joining gps_signals_gdf and stores_gdf took 4.89 minute(s).


In [29]:
# gps_sig_and_stores.shape

In [30]:
gps_sig_and_stores.head()

Unnamed: 0,device_id,lat,lon,date,geometry,index_right,store_id,store_name
0,49129,52.55457,13.57401,2021-01-01,POINT (13.57401 52.55457),,,
1,46744,52.419566,13.208267,2021-01-01,POINT (13.20827 52.41957),,,
2,22268,52.568431,13.523719,2021-01-01,POINT (13.52372 52.56843),,,
3,41158,52.49306,13.38074,2021-01-01,POINT (13.38074 52.49306),,,
4,31401,52.517286,13.317811,2021-01-01,POINT (13.31781 52.51729),,,


In [31]:
# gps_sig_and_stores_gdf[gps_sig_and_stores_gdf["store_id"] == "place_1"].head()

### users and user affinities

**users**

In [32]:
# create a list from the unique device ids
unique_device_ids = list(gps_signals["device_id"].unique())

In [33]:
len(unique_device_ids)

95004

In [34]:
# create a dataframe "users"
data = {
    "device_id": unique_device_ids
}

users = pd.DataFrame(data)

In [35]:
# users.shape

In [36]:
users = users.sort_values(by=["device_id"]).reset_index(drop=True)

In [37]:
# users.head()

In [38]:
# users.tail()

**user affinities**

In [6]:
# list the files in the user_affinities folder
path = os.getcwd()
# path = "/Users/robertbozsik/techtest/adsquare/assignment_data/affinities"
path = f"{str(Path(path).parents[1])}/assignment_data/affinities"
file_names = os.listdir(path)
file_names = sorted(file_names)
# file_names

In [40]:
# add .csv after the file names (it should be run only once)
if ".csv" not in file_names[0]:
    for index, file_name in enumerate(file_names):
        os.rename(os.path.join(path, file_name), os.path.join(path, "".join([file_name, ".csv"])))

In [41]:
# create a dictionary "user_affinities"
# the keys should be the names of the affinities
# the values should be lists of the affinity csvs
def list_from_affinities(aff_name: str) -> list:
    """Return a list created from the given user_affinity csv file"""
    aff_name = pd.read_csv(f"../../assignment_data/user_affinities/{aff_name}.csv", header=None, names=[aff_name])
    aff_name = list(aff_name.iloc[:, 0]) # all rows, first column
    return aff_name

user_affinities = {}

for file_name in file_names:
    name = file_name.split(".")[0]
    user_affinities[name] = list_from_affinities(name)

In [42]:
user_affinities.keys()

dict_keys(['addidas', 'apple', 'bmw', 'employed', 'female', 'h_&_m', 'high_income', 'honda', 'job_seeking', 'low_income', 'male', 'mercedes-benz', 'middle_income', 'retired', 'student', 'tommy_helfinger'])

In [43]:
# len(user_affinities["addidas"])

In [44]:
# len(user_affinities["low_income"])

In [45]:
# len(user_affinities["retired"])

**add user_affinities to users**

In [46]:
# try it out with the affinity "addidas"
users["addidas"] = np.where(users["device_id"].isin(user_affinities["addidas"]), 1, 0)

In [47]:
# users["addidas"].value_counts()

In [48]:
for index, key in enumerate(user_affinities.keys()):
    if index > 0: # the column "addidas" has already been created
        users[key] = np.where(users["device_id"].isin(user_affinities[key]), 1, 0)

In [49]:
# users.shape

In [50]:
users.head()

Unnamed: 0,device_id,addidas,apple,bmw,employed,female,h_&_m,high_income,honda,job_seeking,low_income,male,mercedes-benz,middle_income,retired,student,tommy_helfinger
0,1,0,0,0,0,0,0,0,0,0,0,1,0,1,0,0,0
1,2,0,0,0,0,0,0,0,1,0,0,1,0,1,0,0,0
2,3,0,0,0,0,0,0,0,0,0,0,1,0,1,0,0,0
3,5,0,0,0,0,0,0,0,1,0,1,1,0,0,0,0,0
4,6,0,0,0,0,0,0,0,0,1,0,1,0,1,0,0,0


In [51]:
# save users as csv
users.to_csv("../out_data/users.csv", index=False)

### merge gps_sig_and_stores and users (affinities)

In [52]:
# merge gps_sig_and_stores and users (affinities)
gps_sig_and_stores.shape

(2262916, 8)

In [53]:
users.shape

(95004, 17)

In [54]:
# merge gps_sig_and_stores and users (affinities)
start = time.time()
print("Merging gps_sig_and_stores and users (affinities)...")

gpssig_stores_useraff = gps_sig_and_stores.merge(users, how="inner", on="device_id")

end = time.time()
dt = end - start
print(f"Merging gps_sig_and_stores and users (affinities) took {round(dt/60, 2)} minute(s).")

Merging gps_sig_and_stores and users (affinities)...
Merging gps_sig_and_stores and users (affinities) took 0.05 minute(s).


In [55]:
gpssig_stores_useraff.shape

(2262916, 24)

In [56]:
gpssig_stores_useraff.head()

Unnamed: 0,device_id,lat,lon,date,geometry,index_right,store_id,store_name,addidas,apple,bmw,employed,female,h_&_m,high_income,honda,job_seeking,low_income,male,mercedes-benz,middle_income,retired,student,tommy_helfinger
0,49129,52.55457,13.57401,2021-01-01,POINT (13.57401 52.55457),,,,0,0,0,0,0,0,1,0,0,0,1,0,0,0,0,0
1,49129,52.55457,13.57401,2021-01-01,POINT (13.57401 52.55457),,,,0,0,0,0,0,0,1,0,0,0,1,0,0,0,0,0
2,49129,52.55457,13.57401,2021-01-01,POINT (13.57401 52.55457),,,,0,0,0,0,0,0,1,0,0,0,1,0,0,0,0,0
3,49129,52.5526,13.57996,2021-01-01,POINT (13.57996 52.55260),,,,0,0,0,0,0,0,1,0,0,0,1,0,0,0,0,0
4,49129,52.5526,13.57996,2021-01-01,POINT (13.57996 52.55260),,,,0,0,0,0,0,0,1,0,0,0,1,0,0,0,0,0


## 1.b Group the resolved visits by date (yyyy-mm-dd), store_name, and store_id.

## 1.c For each store_id/store_name/date provide the following metric.

### 1.c.i A total number of GPS signals per place_id/date.

### 1.c.ii A total number of unique visitors (i.e. device ids).

In [57]:
# create total number of GPS signals per place_id/date
# and create total number of unique visitors
gsu_total_and_unique = (gpssig_stores_useraff
                        .groupby(by=["date", "store_name", "store_id"])
                        .agg({"lat": "count", "device_id": "nunique"})
                        .rename(columns={"lat": "total_signals", "device_id": "unique_visits"})
                        .reset_index())

In [58]:
gsu_total_and_unique.shape

(1199, 5)

In [59]:
gsu_total_and_unique.head()

Unnamed: 0,date,store_name,store_id,total_signals,unique_visits
0,2021-01-01,Aldi,place_64,1,1
1,2021-01-01,Burger King,place_42,4,1
2,2021-01-01,Burger King,place_43,1,1
3,2021-01-01,Burger King,place_50,1,1
4,2021-01-01,McDonald's,place_17,1,1


### 1.c.iii A total number of unique visitors belonging to each affinity group.

In [60]:
# drop duplicated date, store_name, store_id and device_id 
# in order to get unique visitors belonging to each affinity group
# and group by data, store_name and store_id
start = time.time()
print("Creating total number of unique visitors belonging to each affinity group...")

gsu_unique_aff = (gpssig_stores_useraff
                  .drop(["geometry", "index_right"], axis=1)
                  .drop_duplicates(subset=["date", "store_name", "store_id", "device_id"])
                  .groupby(by=["date", "store_name", "store_id"])
                  .agg(sum)
                  .reset_index())

end = time.time()
dt = end - start
print(f"Create total number of unique visitors belonging to each affinity group took {round(dt/60, 2)} minutes.")

Creating total number of unique visitors belonging to each affinity group...
Create total number of unique visitors belonging to each affinity group took 3.51 minutes.


In [61]:
gsu_unique_aff.shape

(1199, 22)

In [62]:
# delete unnecessary columns
gsu_unique_aff = gsu_unique_aff.drop(["date", "store_name", "store_id"], axis=1)

In [63]:
gsu_unique_aff.head()

Unnamed: 0,device_id,lat,lon,addidas,apple,bmw,employed,female,h_&_m,high_income,honda,job_seeking,low_income,male,mercedes-benz,middle_income,retired,student,tommy_helfinger
0,35745,52.541398,13.436914,0,0,0,0,0,0,0,0,0,1,1,0,0,0,0,0
1,40894,52.422292,13.311893,0,0,0,0,0,0,1,0,0,0,1,0,0,0,1,0
2,8753,52.527226,13.2322,0,0,0,0,0,0,0,0,0,0,1,0,1,0,0,0
3,47558,52.408845,13.370406,0,0,0,0,0,0,0,1,0,0,1,0,1,0,0,0
4,23806,52.504506,13.623608,0,0,0,0,0,0,0,0,0,0,1,0,1,0,0,0


### concat the gsu_total_and_unique and gsu_unique_aff

In [64]:
# concat the gsu_total_and_unique and gsu_unique_aff
final_df = pd.concat([gsu_total_and_unique, gsu_unique_aff], axis=1)

In [65]:
final_df.shape

(1199, 24)

In [66]:
final_df.head()

Unnamed: 0,date,store_name,store_id,total_signals,unique_visits,device_id,lat,lon,addidas,apple,bmw,employed,female,h_&_m,high_income,honda,job_seeking,low_income,male,mercedes-benz,middle_income,retired,student,tommy_helfinger
0,2021-01-01,Aldi,place_64,1,1,35745,52.541398,13.436914,0,0,0,0,0,0,0,0,0,1,1,0,0,0,0,0
1,2021-01-01,Burger King,place_42,4,1,40894,52.422292,13.311893,0,0,0,0,0,0,1,0,0,0,1,0,0,0,1,0
2,2021-01-01,Burger King,place_43,1,1,8753,52.527226,13.2322,0,0,0,0,0,0,0,0,0,0,1,0,1,0,0,0
3,2021-01-01,Burger King,place_50,1,1,47558,52.408845,13.370406,0,0,0,0,0,0,0,1,0,0,1,0,1,0,0,0
4,2021-01-01,McDonald's,place_17,1,1,23806,52.504506,13.623608,0,0,0,0,0,0,0,0,0,0,1,0,1,0,0,0


In [67]:
# save final df as csv
final_df.to_csv("../out_data/analysis.csv", index=False)
print("final_df saved as csv")

final_df saved as csv
