# Introduction 

Author: Yuzhu Zhang (yuzhu.zhang@deliveryhero.com)

In [None]:
%%capture
# install packages
#!pip install autoplotter

In [None]:
# load packages
from datetime import datetime, timedelta, time, date
from scipy import stats
import json
import os
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from pandas.io import gbq
from google.cloud import bigquery
import plotly.offline as pyo
import plotly.graph_objs as go
import plotly.express as px
from plotly.subplots import make_subplots
import plotly.graph_objects as go
import scipy.stats
#import geopandas
from shapely import wkt
import scipy
from scipy import signal
%matplotlib inline
from google.colab import drive # lets Colab read/write files on Google Drive
# Mount Google Drive under /content/drive.
drive.mount('/content/drive')
from google.colab import auth
# Authenticate the Colab user; presumably these credentials are what the BigQuery client relies on — TODO confirm.
auth.authenticate_user()
print('Authenticated')
# Remember the working directory the kernel started in.
cwd=os.getcwd()
# NOTE(review): a bare expression that is not the last line of a cell is not displayed.
cwd
# Unload the google.colab.data_table extension (presumably so DataFrames render as plain tables — TODO confirm).
%unload_ext google.colab.data_table

Mounted at /content/drive
Authenticated
The google.colab.data_table extension is not loaded.


In [None]:
# Notebook-wide configuration: BigQuery project/client, entity, test window and zone.
# All queries below interpolate these values directly into their SQL text.
project_id = "fulfillment-dwh-production"
client = bigquery.Client(project = project_id)
entity_id = 'FP_SG'
country_code = 'sg'
# Test window; the queries compare these (inclusive) against local dates.
start_date = '2020-09-04'
end_date = '2020-09-24'
# NOTE(review): pre_date is not used in the part of the notebook visible here; presumably the start of the pre-test period — confirm.
pre_date = '2020-08-14'
time_zone = "Asia/Singapore" # time zone used to convert timestamps of the delay data to local time
zone_id = "3" # 3 = Far East. For several zones, e.g. zone 9 and zone 35, use a comma-separated string: "9,35"
# Switch to the shared-drive folder of this test and list its contents.
# NOTE(review): if the folder does not exist, %cd fails and the notebook stays in the previous directory.
%cd "/content/drive/Shared drives/Global Pricing/2 - Entities/APAC/Singapore/7. DPS testing/2. DPS_AB_Testing_20200902"
%pwd
%ls

[Errno 2] No such file or directory: '/content/drive/Shared drives/Global Pricing/2 - Entities/APAC/Singapore/7. DPS testing/2. DPS_AB_Testing_20200902'
/content
adc.json  [0m[01;34mdrive[0m/  [01;34msample_data[0m/


In [None]:
# round a timestamp up to the next multiple of a time step
def ceil_dt(dt, delta):
    """Round ``dt`` up to the nearest multiple of ``delta``.

    Multiples are counted from ``datetime.min``. A value that already sits
    exactly on a multiple is returned unchanged.

    Args:
        dt: the datetime to round; anything that supports ``-``, ``%`` and ``+``
            with datetime/timedelta operands works.
        delta: the step size as a ``timedelta`` (e.g. ``timedelta(minutes=30)``).

    Returns:
        ``dt`` moved forward to the next multiple of ``delta``.
    """
    # Distance from dt to the next boundary; Python's % keeps it in [0, delta).
    gap_to_next_boundary = (datetime.min - dt) % delta
    return dt + gap_to_next_boundary

# 2. Methodology

## A/B testing

We check both the statistical and the practical (substantive) significance of the key metrics.
To test statistical significance, we aggregate the data into hourly blocks per day and compare control and variation as paired samples.

What is the Wilcoxon signed-rank test?
The Wilcoxon signed-rank test is a non-parametric statistical hypothesis test used to compare two related samples, matched samples, or repeated measurements on a single sample to assess whether their population mean ranks differ (i.e. it is a paired difference test). It can be used as an alternative to the paired Student's t-test (also known as "t-test for matched pairs" or "t-test for dependent samples") when the distribution of the difference between two samples' means cannot be assumed to be normally distributed.[1] A Wilcoxon signed-rank test is a nonparametric test that can be used to determine whether two dependent samples were selected from populations having the same distribution.

Because we cannot assume that the paired differences are normally distributed, we use the Wilcoxon signed-rank test to check significance.


Resources:

*   https://medium.com/@kangeugine/hypothesis-test-21795f788f7d
*   https://stackoverflow.com/questions/15984221/how-to-perform-two-sample-one-tailed-t-test-with-numpy-scipy






## Pre/Post Analysis

# 3. A/B Test Analysis

## Sanity Check

### a. Invariant: number of users in control and variation

In [None]:
# Pre-condition check: daily distinct users per variant inside the selected zone(s).
# The local-date conversion uses the configured `time_zone` instead of a hard-coded
# zone name, so the query follows the configuration cell like the delay query does.
# NOTE(review): `created_date >= start_date` is applied in addition to the local-date
# filter; if created_date is a UTC date, sessions from the first local hours of
# start_date would be dropped — confirm against the table definition.
query_user = f"""
WITH city_data AS (
  SELECT p.entity_id
    , country_code
    , ci.name AS city_name
    , ci.id AS city_id
    , zo.shape AS zone_shape
    , zo.name AS zone_name
    , zo.id AS zone_id
  FROM cl.countries co
  LEFT JOIN UNNEST(co.platforms) p
  LEFT JOIN UNNEST(co.cities) ci
  LEFT JOIN UNNEST(ci.zones) zo
  WHERE entity_id = "{entity_id}"
  and zo.id in ({zone_id})
)

SELECT
cast(DATETIME(created_at, "{time_zone}") as date) as local_date
--, extract(hour from created_at) as hour
, customer.variant
, count(distinct customer.id) as user_count
FROM `fulfillment-dwh-production.cl.dynamic_pricing_user_sessions` s
left join city_data cd ON s.entity_id = cd.entity_id
WHERE cast(DATETIME(created_at, "{time_zone}") as date) between "{start_date}" and "{end_date}"
and created_date >= "{start_date}"
and s.entity_id = "{entity_id}"
and customer.variant in ("Variation1", "Control")
and ST_CONTAINS(cd.zone_shape, customer.location) IS TRUE
group by 1,2
"""

# One row per (local_date, variant) with the distinct user count.
user = client.query(query_user).to_dataframe()
user.head()

Unnamed: 0,local_date,variant,user_count
0,2020-09-05,Control,13847
1,2020-09-11,Control,11983
2,2020-09-15,Variation1,10848
3,2020-09-13,Control,14120
4,2020-09-08,Variation1,12006


In [None]:
# Daily user counts with one column per variant.
user_group = user.pivot_table(
    values="user_count",
    index=["local_date"],
    columns="variant",
    aggfunc="sum",
).reset_index()
# Relative gap in daily users: Variation1 / Control - 1.
user_group["delta"] = user_group["Variation1"].div(user_group["Control"]).sub(1)

In [None]:
# Show the daily relative difference in user counts (Variation1 / Control - 1).
user_group["delta"]

0     0.029622
1     0.001228
2    -0.007930
3    -0.028486
4    -0.002244
5    -0.001099
6    -0.015773
7     0.008929
8     0.005755
9    -0.002691
10    0.000466
11    0.000369
12    0.003732
13   -0.001617
14   -0.004071
15    0.010132
16    0.004970
17    0.009012
18   -0.011158
19   -0.002888
20    0.006485
Name: delta, dtype: float64

In [None]:
# Daily users per variant (left axis) with the relative gap of Variation1 vs Control (right axis).
# make_subplots creates the figure itself, so no separate go.Figure() is built first.
fig = make_subplots(specs=[[{"secondary_y": True}]])
fig.add_trace(go.Scatter(x =user_group.local_date, y=user_group.Control, mode="lines", name ="control"), secondary_y=False)
fig.add_trace(go.Scatter(x =user_group.local_date, y=user_group.Variation1, mode="lines", name ="variation1"), secondary_y=False)
fig.add_trace(go.Scatter(x =user_group.local_date, y=user_group.delta, mode="lines", name ="delta", line = dict(color='royalblue', width=4, dash='dash')), secondary_y=True)
# Add figure title
fig.update_layout(title_text="Daily users in Control and Test")
# Set x-axis title
fig.update_xaxes(title_text="Date")
# Set y-axes titles
fig.update_yaxes(title_text="Daily user amount", showgrid=False, secondary_y=False)
fig.update_yaxes(title_text="Delta % difference of Variation1 vs Control", showgrid=False,secondary_y=True)
fig.show()

In [None]:
# Paired Wilcoxon signed-rank test on the daily user counts of both groups.
# scipy's default alternative is two-sided, so a rejection only says the two
# groups differ, not which one is larger.
results = stats.wilcoxon(user_group['Variation1'], user_group['Control'])

alpha = 0.05
# Decide on the p-value alone. A test statistic of 0 means every daily
# difference has the same sign, which is the strongest evidence against the
# null hypothesis and must not be excluded from rejection.
if results[1] <= alpha:
    print ("reject null hypothesis, distribution of {} differs from {}".format('Variation1','Control'))
else:
    print ("accept null hypothesis")

accept null hypothesis


### b. Invariant: user location in control and variation

In [None]:
# Pre-condition check: distinct users per exact location and variant inside the selected zone(s).
# The local-date conversion uses the configured `time_zone` instead of a hard-coded zone name.
# NOTE(review): `created_date >= start_date` is applied in addition to the local-date
# filter; if created_date is a UTC date, sessions from the first local hours of
# start_date would be dropped — confirm against the table definition.
query_location = f"""
WITH city_data AS (
  SELECT p.entity_id
    , country_code
    , ci.name AS city_name
    , ci.id AS city_id
    , zo.shape AS zone_shape
    , zo.name AS zone_name
    , zo.id AS zone_id
  FROM cl.countries co
  LEFT JOIN UNNEST(co.platforms) p
  LEFT JOIN UNNEST(co.cities) ci
  LEFT JOIN UNNEST(ci.zones) zo
  WHERE entity_id = "{entity_id}"
  and zo.id in ({zone_id})
)

SELECT
st_x(customer.location) as lon
,st_y(customer.location) as lat
-- st_x(ST_GEOGPOINTFROMGEOHASH(st_geohash(customer.location, 7))) as lon
--, st_y(ST_GEOGPOINTFROMGEOHASH(st_geohash(customer.location, 7))) as lat
, customer.variant
, count(distinct customer.id) as user_count
FROM `fulfillment-dwh-production.cl.dynamic_pricing_user_sessions` s
left join city_data cd ON s.entity_id = cd.entity_id
WHERE cast(DATETIME(created_at, "{time_zone}") as date) between "{start_date}" and "{end_date}"
and created_date >= "{start_date}"
and s.entity_id = "{entity_id}"
and customer.variant in ("Variation1", "Control")
and ST_CONTAINS(cd.zone_shape, customer.location) IS TRUE
group by 1,2,3
"""

# One row per (lon, lat, variant) with the distinct user count.
location = client.query(query_location).to_dataframe()
location.head()

Unnamed: 0,lon,lat,variant,user_count
0,103.953526,1.341785,Control,25
1,103.953462,1.363027,Control,3
2,103.935981,1.360708,Control,20
3,103.93196,1.356503,Control,43
4,103.928939,1.324307,Control,1


In [None]:
# Preview only the Control rows of the per-location user counts.
location.loc[location['variant'] == "Control"]

Unnamed: 0,lon,lat,variant,user_count
0,103.953526,1.341785,Control,25
1,103.953462,1.363027,Control,3
2,103.935981,1.360708,Control,20
3,103.931960,1.356503,Control,43
4,103.928939,1.324307,Control,1
...,...,...,...,...
75846,103.960401,1.358157,Control,1
75847,103.938930,1.335680,Control,1
75848,103.937320,1.323486,Control,1
75849,103.945109,1.354176,Control,1


In [None]:
# Rows of the test group. The label must match the variant name used in the
# query ("Variation1"); the previous spelling matched nothing and left this empty.
# NOTE(review): `subset` is not used by the map below, which plots every row of `location`.
subset = location.loc[location['variant'] == "Variation1"]

# Bubble map of user locations, sized by the number of distinct users at each point.
# px.scatter_mapbox returns its own figure, so no subplot figure is created beforehand.
fig = px.scatter_mapbox(location, lat="lat", lon="lon", hover_name="variant", hover_data=["user_count"], size="user_count",opacity=0.5,
                        color_discrete_sequence=["red"],
                        zoom=12, height=200
                        )
fig.update_layout(mapbox_style="open-street-map")
fig.update_layout(margin={"r":0,"t":0,"l":0,"b":0})
fig.show()

## Deep Dive
We look at different metrics:

Economics Metrics
*   Orders
*   Delivery fee
*   Basket value

Logistics Metrics 

*   Travel Time


In [None]:
# Order-level data for the deep dive: one row per order with variant, fees, commission,
# travel/delay metrics and delivery costs, restricted to the configured entity, zone(s)
# and local-date window.
# - `costs` keeps the latest utr_timings row per (entity_id, order_id) and sums the
#   delivery costs per platform order code.
# - The final GROUP BY over every selected column removes duplicate rows.
# NOTE(review): `timezone` inside the SQL is not interpolated from Python; presumably it is
# a column of cl._dps_sessions_mapped_to_orders — confirm.
query1 = """
WITH costs as (
  select
    p.entity_id,
    l.platform_order_code platform_order_code,
    sum(p.delivery_costs) delivery_costs,
    sum(p.delivery_costs_eur) delivery_costs_eur
  from 
    (select
      entity_id,
      country_code,
      created_date,
      order_id,
      delivery_costs,
      delivery_costs_eur,
      row_number() over(partition by entity_id, order_id order by created_date desc) as rank
    from cl.utr_timings) p
  left join cl.orders l on p.order_id = l.order_id and p.country_code = l.country_code
  where p.entity_id = \"""" + entity_id + """\"
    and rank = 1
  group by 1,2)

select
zone_name
, zone_id
, vertical_type
, operating_system
, cast(DATETIME(created_at, timezone) as date) as local_date
, DATETIME(created_at, timezone) as local_time
, vendor_code
, platform_order_code_ga
, variant
, dps_delivery_fee_local
, dps_surge_fee_local
, dps_travel_time_local
, cast(c.commission_local as float64) commission_local
, cast(c.commission_eur as float64) commission_eur
, gmv_eur
, gfv_eur
, delivery_fee_eur
, delivery_fee_local
, delivery_fee_local_accounting
, travel_time_distance_km
, mean_delay
, travel_time
, to_customer_time
, to_vendor_time
, delivery_distance
, delivery_costs
, delivery_costs_eur
from cl._dps_sessions_mapped_to_orders o
left join pandata_raw_il_backend_latest.fct_order_commissions c on o.entity_id = c.global_entity_id and o.platform_order_code_ga = c.order_code
left join costs cos on cos.entity_id =o.entity_id and cos.platform_order_code = o.platform_order_code
where o.entity_id = \"""" + entity_id + """\"
and zone_id in (""" + zone_id + """)
and variant in ("Control","Variation1")
and cast(DATETIME(created_at, timezone) as date) between \"""" + start_date + """\"  and \"""" + end_date + """\"
group by 1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27
    """

# Run the query and load the result into the main order-level frame used by the following cells.
df = client.query(query1).to_dataframe()

In [None]:
# define stage of the setup: bucket every order by its mean_delay value
conditions = [
    (df['mean_delay'] <= 3.5),
    (df['mean_delay'] > 3.5) & (df['mean_delay'] <= 9),
    (df['mean_delay'] > 9) & (df['mean_delay'] < 15),
    (df['mean_delay'] >= 15) & (df['mean_delay'] < 17),
    (df['mean_delay'] >= 17)
    ]
values = ['price_easing', 'default', 'surge1',"surge_shrink1","shrink2"]
# Rows that match no bucket (a missing mean_delay fails every comparison) get the
# string "0". Passing it explicitly keeps the label that the implicit integer
# default used to be coerced into, and avoids the str/int dtype error that newer
# NumPy versions raise for that implicit default.
df["stage"] = np.select(conditions, values, default="0")
# different stage for control: price_easing and surge1 are treated as default there
df["stage_new"] = df["stage"]
df.loc[(df.variant == "Control") & (df.stage == "price_easing") , 'stage_new'] = "default"
df.loc[(df.variant == "Control") & (df.stage == "surge1") , 'stage_new'] = "default"

# define time block: hours up to 9 are pooled into block 9, hours from 21 into block 21
df["local_hour"] = df.local_time.dt.hour
df["hour_block"] = df["local_hour"]
df.loc[df.local_hour <= 9, 'hour_block'] = 9
df.loc[df.local_hour >=21, 'hour_block'] = 21
# revenue = commission + delivery fee divided by 1.07
# NOTE(review): 1.07 presumably strips 7% tax from the delivery fee — confirm.
df["rev"] = df["commission_local"] + df["delivery_fee_local"]/1.07
df["profit"] = df["rev"]-df["delivery_costs"]
df["profit_eur"] = df["commission_eur"] + df["delivery_fee_eur"]/1.07 - df["delivery_costs_eur"]

# round travel time to 1 digit
df["tt"] = df["travel_time"].round(1)

# time of day, rounded up to the next half hour; the vectorised .dt.ceil replaces the
# deprecated Series.dt.to_pydatetime() round trip and yields the same values
df["new_time"] = df["local_time"].dt.ceil("30min").dt.time

# short aliases used by the pivots below
df["order"] = df["platform_order_code_ga"]
df["df"] = df["delivery_fee_local"]
# drop duplicates on the columns needed for the significance tables
df_copy = df[['zone_name', 'zone_id',
       'local_date', 'local_time', 'vendor_code', 'platform_order_code_ga',
       'variant', 'commission_local', 'gmv_eur', 'gfv_eur',
       'delivery_fee_eur', 'delivery_fee_local',
       'delivery_fee_local_accounting', 'travel_time_distance_km',
       'mean_delay', 'travel_time',"rev","profit","order","df", "hour_block","tt","profit_eur",
       'delivery_distance', 'delivery_costs']].drop_duplicates()

In [None]:
# Per (local_date, hour_block) totals for every metric, one column per variant.
# Blocks in which a variant has no orders are filled with 0.
df_sig = (
    df_copy.pivot_table(
        values=["order", "df", "rev", "delivery_distance", "profit"],
        index=["local_date", "hour_block"],
        columns="variant",
        aggfunc={
            "order": "count",
            "df": "sum",
            "rev": "sum",
            "delivery_distance": "sum",
            "profit": "sum",
        },
    )
    .reset_index()
    .fillna(0)
)

# Test groups that get compared against "Control".
Variants = ["Variation1"]

In [None]:
# Quick overall comparison per variant: order count, mean delivery fee (EUR), mean profit (EUR).
check = (
    df_copy.pivot_table(
        values=["order", "delivery_fee_eur", "profit_eur"],
        columns="variant",
        aggfunc={"order": "count", "delivery_fee_eur": "mean", "profit_eur": "mean"},
    )
    .reset_index()
)
check

variant,index,Control,Variation1
0,delivery_fee_eur,1.702132,1.727833
1,order,66395.0,65021.0
2,profit_eur,-0.356689,-0.309834


In [None]:
# One summary row per (metric, variant): relative change of the totals, relative change
# of the per-order averages, and whether each passes the 5% threshold.
# NOTE(review): the p-value is the two-sided Wilcoxon p-value divided by 2, i.e. a
# one-sided value for whichever direction was observed.
rows_list = []
metrics = ["order", "df","delivery_distance","rev","profit"]

for metric in metrics:
  for variant in Variants:
    variant_totals = df_sig[(metric, variant)]
    control_totals = df_sig[(metric, "Control")]

    # change in the overall total
    sum_delta = variant_totals.sum() / control_totals.sum() - 1
    sum_sig = stats.wilcoxon(variant_totals, control_totals)[1] / 2
    sum_sig_tf = sum_sig <= 0.05

    if metric == "order":
      # a per-order average of the order count itself carries no information
      avg_delta = 0
      avg_sig = "NA"
      avg_sig_tf = "NA"
    else:
      variant_orders = df_sig[("order", variant)]
      control_orders = df_sig[("order", "Control")]
      # change in the per-order average
      avg_delta = (variant_totals.sum() / variant_orders.sum()) / (control_totals.sum() / control_orders.sum()) - 1
      avg_sig = stats.wilcoxon(variant_totals / variant_orders, control_totals / control_orders)[1] / 2
      avg_sig_tf = avg_sig <= 0.05

    rows_list.append([metric, variant, sum_delta, sum_sig_tf, avg_delta, avg_sig_tf])

result = pd.DataFrame(rows_list, columns=['metrics','variant','sum_delta','sum_sig',"avg_delta","avg_sig"])

def color_negative_red(val):
    """Return the CSS text colour for one table cell.

    Cells holding a boolean ``True`` (Python or NumPy bool) are coloured green,
    everything else black. Numbers are never treated as flags, so a value of
    exactly ``1`` / ``1.0`` is not highlighted just because ``1 == True``.

    Args:
        val: the raw cell value.

    Returns:
        A CSS declaration string, ``'color: green'`` or ``'color: black'``.
    """
    is_true_flag = isinstance(val, (bool, np.bool_)) and bool(val)
    color = 'green' if is_true_flag else 'black'
    return f'color: {color}'
# Render the summary table: deltas as percentages, coloured flags, no index column,
# bar backgrounds and centred text.
styled_result = result.style.format({"sum_delta": "{:.2%}",
                                     "avg_delta": "{:.2%}"})
# Styler.applymap was renamed to Styler.map in newer pandas; use whichever this version offers.
apply_cellwise = getattr(styled_result, "map", None) or styled_result.applymap
styled_result = apply_cellwise(color_negative_red)
# Styler.hide_index was replaced by Styler.hide(axis="index") in newer pandas.
if hasattr(styled_result, "hide"):
    styled_result = styled_result.hide(axis="index")
else:
    styled_result = styled_result.hide_index()
styled_result.bar(align='mid', color=['#d65f5f', '#5fba7d'])\
             .set_properties(**{'text-align': 'center'})

metrics,variant,sum_delta,sum_sig,avg_delta,avg_sig
order,Variation1,-2.07%,True,0.00%,
df,Variation1,-0.60%,False,1.50%,True
delivery_distance,Variation1,-2.95%,True,-0.90%,True
rev,Variation1,-0.46%,False,1.64%,True
profit,Variation1,-96.60%,True,-96.53%,True


In [None]:
# Per-stage summary by variant: mean DPS delivery fee, order count, mean GFV, mean travel time.
df_sum = df[df.platform_order_code_ga.notnull()]
# NOTE(review): missing values are replaced by 0 before averaging, so they count as zeros in the means.
df_sum = df_sum.fillna(0)

# Use the columns the order query actually returns: "dps_delivery_fee_local" and
# "platform_order_code_ga". The previous names "dps_delivery_fee" and
# "platform_order_code" are not in `df` and raise a KeyError on a fresh run.
# NOTE(review): assumes dps_delivery_fee_local is the fee the earlier
# "dps_delivery_fee" column referred to — confirm.
df_sum_pv = pd.pivot_table(df_sum,values = ["dps_delivery_fee_local","platform_order_code_ga", "gfv_eur","travel_time"], index = ["stage"], columns = "variant",
                     aggfunc = {'dps_delivery_fee_local' : 'mean', 'platform_order_code_ga' : 'count', 'gfv_eur' : 'mean', 'travel_time':'mean'}).reset_index()
# Output names are kept as in the earlier version of this table:
# avg_df_*, avg_gfv_*, platform_order_code_*, travel_time_*.
df_sum_pv = df_sum_pv.rename(columns={'dps_delivery_fee_local': 'avg_df', 'platform_order_code_ga': 'platform_order_code', 'gfv_eur':'avg_gfv'})

# Flatten the (metric, variant) column pairs into "metric_variant" names.
df_sum_pv.columns = ['_'.join(col).strip() for col in df_sum_pv.columns.values]
df_sum_pv

Unnamed: 0,stage_,avg_df_Control,avg_df_Variation1,avg_gfv_Control,avg_gfv_Variation1,platform_order_code_Control,platform_order_code_Variation1,travel_time_Control,travel_time_Variation1
0,default,3.377075,3.400725,12.222221,12.219709,40057,40354,6.862553,6.829685
1,price_easing,3.273607,2.645453,11.487091,11.748976,2709,2904,7.020967,6.903364
2,shrink2,3.138458,3.669671,13.346157,13.882314,810,708,4.897432,5.025749
3,surge1,3.385357,3.550342,12.603858,12.804641,18732,17504,6.736322,6.554418
4,surge_shrink1,3.172871,3.776908,13.059233,13.422709,3905,3406,5.703493,5.818696


In [None]:
# Summary: per zone group, compare average delivery fee, order count, average GFV and
# average travel time between the periods, plus their relative changes.
# NOTE(review): `pp` is not defined anywhere in the part of the notebook visible here;
# on a fresh kernel this cell needs whatever cell creates it to run first — confirm.
# NOTE(review): the result is stored in a variable named `sum`, which shadows Python's
# built-in sum() for the rest of the session.
pp = pp.fillna(0)
sum = pd.pivot_table(pp,values = ["dps_delivery_fee","platform_order_code_ga", "gfv_eur","tt"], index = ["zone_group"], columns = "period", 
                     aggfunc = {'dps_delivery_fee' : 'mean', 'platform_order_code_ga' : 'count', 'gfv_eur' : 'mean', 'tt':'mean'}).reset_index()
# Rename the metric level before flattening so the final columns read avg_df_*, order_*, avg_gfv_*, avg_tt_*.
sum.rename(columns={'dps_delivery_fee': 'avg_df', 'platform_order_code_ga': 'order', 'gfv_eur':'avg_gfv', 'tt':'avg_tt'}, inplace=True)
#sum.columns.get_level_values(1)
# Flatten the (metric, period) column pairs into "metric_period" names.
sum.columns = ['_'.join(col).strip() for col in sum.columns.values]
sum.rename(columns={'zone_group_': 'zone_group'}, inplace=True)
# Relative change during the test vs before the test, rounded to 2 decimals.
# The column names imply the period values are "during_test" and "before_test".
sum["df_delta"] = round(sum["avg_df_during_test"]/sum["avg_df_before_test"]-1,2)
sum["gfv_delta"] = round(sum["avg_gfv_during_test"]/sum["avg_gfv_before_test"]-1,2)
sum["order_delta"] = round(sum["order_during_test"]/sum["order_before_test"]-1,2)
sum["tt_delta"] = round(sum["avg_tt_during_test"]/sum["avg_tt_before_test"]-1,2)
sum

In [None]:
# Zone-level mean delay readings over the test window, converted to local time.
# The zone filter uses IN (...) like the other queries, so a multi-zone setting such as
# zone_id = "9,35" produces valid SQL (with "=" it would become `zone_id = 9,35`).
query2 = f"""
SELECT
DATETIME(s.created_at, "{time_zone}") as local_time
, s.mean_delay
FROM `fulfillment-dwh-production.cl._zone_stats`
left join unnest(stats) s
WHERE created_date between "{start_date}" and "{end_date}"
and country_code = "{country_code}"
and zone_id in ({zone_id})
"""

delay = client.query(query2).to_dataframe()

In [None]:
# Time of day of each delay reading, rounded up to the next half hour
# (e.g. 11:57:21 -> 12:00:00). The vectorised .dt.ceil replaces the deprecated
# Series.dt.to_pydatetime() round trip and yields the same values.
delay["new_time"] = delay["local_time"].dt.ceil("30min").dt.time
delay.head()

Unnamed: 0,local_time,mean_delay,new_time
0,2020-09-08 11:57:21.072024,10.197225,12:00:00
1,2020-09-08 11:58:22.520322,9.83985,12:00:00
2,2020-09-09 05:03:24.113467,9.0,05:30:00
3,2020-09-09 05:04:19.711465,9.0,05:30:00
4,2020-09-08 18:20:23.454071,4.718923,18:30:00


If you want to explore the data yourself, remove the leading "#" in the next cell and run it. You will then see an interactive dashboard.

In [None]:
#run_app(df,mode='inline') 

### 1. Orders

#### Aggregated level

In [None]:
# Daily order counts per variant and the relative gap Variation1 / Control - 1.
order_sum = df.pivot_table(
    values="platform_order_code_ga",
    index=["local_date"],
    columns="variant",
    aggfunc="count",
).reset_index()
order_sum["delta"] = order_sum["Variation1"].div(order_sum["Control"]).sub(1)

In [None]:
# Daily order counts of both groups.
# make_subplots creates the figure itself, so no separate go.Figure() is built first.
fig = make_subplots(specs=[[{"secondary_y": True}]])
fig.add_trace(go.Scatter(x =order_sum.local_date, y=order_sum.Control, mode="lines", name ="control"), secondary_y=False)
fig.add_trace(go.Scatter(x =order_sum.local_date, y=order_sum.Variation1, mode="lines", name ="variation1"), secondary_y=False)
#fig.add_trace(go.Scatter(x =order_sum.local_date, y=order_sum.delta, mode="lines", name ="delta", line = dict(color='royalblue', width=4, dash='dash')), secondary_y=True)
# Add figure title
fig.update_layout(title_text="Order amount in control and test group")
# Set x-axis title
fig.update_xaxes(title_text="Date")
# Set y-axes titles
fig.update_yaxes(title_text="Order amount", showgrid=False, secondary_y=False)
fig.update_yaxes(title_text="Delta % difference of Variation1 vs Control", showgrid=False,secondary_y=True)
fig.show()

In [None]:
# Orders per half-hour slot and variant; slots without orders for a variant count as 0.
order_hour = (
    df.pivot_table(
        values="platform_order_code_ga",
        index=["new_time"],
        columns="variant",
        aggfunc="count",
    )
    .reset_index()
    .fillna(0)
)
# Share of each variant's orders that falls into the slot.
order_hour["control_pp"] = order_hour["Control"].div(order_hour["Control"].sum())
order_hour["variation_pp"] = order_hour["Variation1"].div(order_hour["Variation1"].sum())
# Relative gap in order count per slot.
order_hour["delta"] = order_hour["Variation1"].div(order_hour["Control"]).sub(1)

In [None]:
# Average mean delay per half-hour slot of the day.
delay_agg = delay.pivot_table(
    values="mean_delay",
    index=["new_time"],
    aggfunc="mean",
).reset_index()

In [None]:
# Order share per half-hour slot for both groups (bars, left axis) against the mean delay (line, right axis).
# make_subplots creates the figure itself, so no separate go.Figure() is built first.
fig = make_subplots(specs=[[{"secondary_y": True}]])
fig.add_trace(go.Bar(x =order_hour.new_time, y=order_hour.control_pp, name ="control"), secondary_y=False)
fig.add_trace(go.Bar(x =order_hour.new_time, y=order_hour.variation_pp,name ="variation1"), secondary_y=False)
#fig.add_trace(go.Scatter(x =order_hour.new_time, y=order_hour.delta, mode="lines", name ="delta", line = dict(color='gold', width=4, dash='dash')), secondary_y=True)
fig.add_trace(go.Scatter(x =delay_agg.new_time, y=delay_agg.mean_delay, mode="lines", name ="Mean delay", line = dict(color='gold', width=4, dash='dash')), secondary_y=True)
# Set x-axis title
fig.update_xaxes(title_text="Time")
# Set y-axes titles
fig.update_yaxes(title_text="Order share", showgrid=False, secondary_y=False)
fig.update_yaxes(title_text="Mean delay (in minute)", showgrid=False,secondary_y=True)
fig.show()

In [None]:
# Order count per half-hour slot for both groups (bars, left axis) against the mean delay (line, right axis).
# make_subplots creates the figure itself, so no separate go.Figure() is built first.
fig = make_subplots(specs=[[{"secondary_y": True}]])
fig.add_trace(go.Bar(x =order_hour.new_time, y=order_hour.Control, name ="control"), secondary_y=False)
fig.add_trace(go.Bar(x =order_hour.new_time, y=order_hour.Variation1,name ="variation1"), secondary_y=False)
#fig.add_trace(go.Scatter(x =order_hour.new_time, y=order_hour.delta, mode="lines", name ="delta", line = dict(color='gold', width=4, dash='dash')), secondary_y=True)
fig.add_trace(go.Scatter(x =delay_agg.new_time, y=delay_agg.mean_delay, mode="lines", name ="Mean delay", line = dict(color='gold', width=4, dash='dash')), secondary_y=True)
# Set x-axis title
fig.update_xaxes(title_text="Time")
# Set y-axes titles
fig.update_yaxes(title_text="Order amount", showgrid=False, secondary_y=False)
fig.update_yaxes(title_text="Mean delay (in minute)", showgrid=False,secondary_y=True)
fig.show()

In [None]:
# Daily Order Significance Check
# Paired two-sided Wilcoxon signed-rank test on the daily order counts of both groups.
# (The earlier describe() call was dropped: its result was neither displayed nor stored.)
stats.wilcoxon(order_sum['Variation1'], order_sum['Control'])

WilcoxonResult(statistic=64.0, pvalue=0.07340740770021606)

In [None]:
# Daily Hourly Order Significance Check
# Order counts per (local_date, hour_block), one column per variant.
order_sig = df.pivot_table("platform_order_code_ga", index=["local_date", "hour_block"], columns ="variant", aggfunc = "count").reset_index()
# Paired Wilcoxon signed-rank test with scipy's default two-sided alternative: the
# p-value shown is two-sided. For the one-tailed reading "test is less than control",
# halve it only if the observed difference points in that direction.
stats.wilcoxon(order_sig['Variation1'], order_sig['Control'])

WilcoxonResult(statistic=13753.5, pvalue=0.0007736955478681122)

#### Stage level

In [None]:
# Daily order counts per stage, shown as stacked bars in the logical stage order.
order = (
    df.pivot_table(
        values="platform_order_code_ga",
        index=["local_date", "stage"],
        aggfunc="count",
    )
    .reset_index()
    .fillna(0)
)
fig = px.bar(
    order,
    x="local_date",
    y="platform_order_code_ga",
    color="stage",
    category_orders={"stage": ["price_easing", "default", "surge1", "surge_shrink1", "shrink2"]},
)
# Axis titles
fig.update_xaxes(title_text="Date")
fig.update_yaxes(title_text="Order amount", showgrid=False)

fig.show()

In [None]:
# prepare the dataset: daily order counts per stage and variant
pivot = df.pivot_table("platform_order_code_ga", index=["stage","local_date"], columns ="variant", aggfunc = "count").reset_index()
# Keep the delta numeric so the y-axis is a real numeric scale; the formatted text goes
# into a separate label column. Plotting the formatted strings as y would make the axis
# categorical and order the points by first appearance.
pivot["delta"] = pivot["Variation1"]/pivot["Control"]-1
pivot["delta_label"] = pivot["delta"].astype(float).map(lambda n: '{:.2%}'.format(n))
#graph
fig = px.line(pivot, x="local_date", y="delta", color="stage", text = "delta_label",category_orders={"stage": ["price_easing", "default", "surge1", "surge_shrink1", "shrink2"]})
# Set x-axis title
fig.update_xaxes(title_text="Date")
# Set y-axes titles and show the ticks as percentages
fig.update_yaxes(title_text="delta (variation1 vs control)", showgrid=False, tickformat=".0%")
fig.show()

In [None]:
# Orders of the test group per half-hour slot, stacked by stage.
pivot = df.pivot_table(
    values="platform_order_code_ga",
    index=["stage", "new_time"],
    columns="variant",
    aggfunc="count",
).reset_index()
fig = px.bar(
    pivot,
    x="new_time",
    y="Variation1",
    color="stage",
    title="Order distribution in Variation",
)
fig.show()

In [None]:
# Orders per half-hour slot stacked by the control-adjusted stage, one panel per variant.
pivot = df.pivot_table(
    values="platform_order_code_ga",
    index=["stage_new", "new_time", "variant"],
    aggfunc="count",
).reset_index()
fig = px.bar(
    pivot,
    x="new_time",
    y="platform_order_code_ga",
    color="stage_new",
    facet_col="variant",
)
fig.show()

Significance level by stage

In [None]:
# Zones and stages the significance loops iterate over.
# The zone names must match the zone_name values in the order data.
zone_names = ["Far_east"]
# Every distinct stage label present in the order data.
stages = df["stage"].unique().tolist()

In [None]:
# Prepare for the data: order counts per date, hour block, stage, variant and zone
order_data = df.pivot_table("platform_order_code_ga", index=["local_date", "hour_block", "stage", "variant","zone_name"], aggfunc = "count").reset_index()
order_data = order_data.fillna(0)

# create empty array list
rows_list = []

# loop the data for each zone and each stage
for zone_name in zone_names:
  for stage in stages:
    zone_data = order_data[(order_data["zone_name"] == zone_name) & (order_data["stage"] == stage)] #filter relevant zone and stage for each loop
    zone_agg = zone_data.pivot_table("platform_order_code_ga", index=["local_date", "hour_block"], columns = "variant",aggfunc = "sum").reset_index() #group the data by date and hourly block
    zone_agg = zone_agg.fillna(0)
    mean_control = round(zone_agg["Control"].mean(),2)
    mean_variation1 = round(zone_agg["Variation1"].mean(),2)
    std_control = round(zone_agg["Control"].std(),2)
    std_variation1 = round(zone_agg["Variation1"].std(),2)
    cor_control_variation1 = round(zone_agg["Control"].corr(zone_agg["Variation1"]),2)
    sample_size_control = zone_agg["Control"].sum()
    sample_size_variation1 = zone_agg["Variation1"].sum()
    sample_size_total = sample_size_control + sample_size_variation1
    # Two-sided Wilcoxon p-value halved, i.e. one-sided for the observed direction.
    p_one_sided = stats.wilcoxon(zone_agg['Variation1'], zone_agg['Control'])[1] / 2
    # Rounded value is for display only.
    sig_control_variation1_oneside = round(p_one_sided, 2)
    # Compare the unrounded p-value with the threshold: after rounding, a value such as
    # 0.054 would become 0.05 and be flagged as significant.
    is_significant = p_one_sided <= 0.05
    rows_list.append([zone_name, stage, mean_control, mean_variation1, std_control, std_variation1, cor_control_variation1, sample_size_control, sample_size_variation1, sample_size_total, sig_control_variation1_oneside, is_significant])

# load data into dataframe
result = pd.DataFrame(rows_list, columns=['zone_name','stage','mean_control','mean_variation','std_control', 'std_variation', 'correlation', 'size_control', 'size_variation', 'size_total', 'p-value', 'significance'])
# relative difference of the mean order counts per block
result["delta"] = round(result["mean_variation"]/result["mean_control"]-1,2)
result

#https://stackoverflow.com/questions/10715965/add-one-row-to-pandas-dataframe

Unnamed: 0,zone_name,stage,mean_control,mean_variation,std_control,std_variation,correlation,size_control,size_variation,size_total,p-value,significance,delta
0,Far_east,price_easing,21.16,22.69,28.1,27.93,0.98,2709.0,2904.0,5613.0,0.0,True,0.07
1,Far_east,default,154.66,155.81,90.05,90.7,0.98,40057.0,40354.0,80411.0,0.26,False,0.01
2,Far_east,surge1,89.2,83.35,103.74,97.83,0.99,18732.0,17504.0,36236.0,0.0,True,-0.07
3,Far_east,surge_shrink1,55.0,47.97,67.34,59.05,0.99,3905.0,3406.0,7311.0,0.0,True,-0.13
4,Far_east,shrink2,27.93,24.41,34.58,29.6,0.97,810.0,708.0,1518.0,0.02,True,-0.13


### 2. Delivery Fee

In [None]:
# Daily average DPS delivery fee per variant and the relative gap.
# Uses "dps_delivery_fee_local", the column the order query returns; "dps_delivery_fee"
# is not in `df` and raises a KeyError on a fresh run.
# NOTE(review): assumes dps_delivery_fee_local is the intended fee column — confirm.
df_avg = df.pivot_table("dps_delivery_fee_local", index=["local_date"], columns ="variant", aggfunc = "mean")
df_avg = df_avg.reset_index()
df_avg["delta"]=df_avg["Variation1"]/df_avg["Control"] -1

# make_subplots creates the figure itself, so no separate go.Figure() is built first.
fig = make_subplots(specs=[[{"secondary_y": True}]])
fig.add_trace(go.Scatter(x =df_avg.local_date, y=df_avg.Control, mode="lines", name ="control"), secondary_y=False)
fig.add_trace(go.Scatter(x =df_avg.local_date, y=df_avg.Variation1, mode="lines", name ="variation1"), secondary_y=False)
#fig.add_trace(go.Scatter(x =df_avg.local_date, y=df_avg.delta, mode="lines", name ="delta", line = dict(color='gray', width=4, dash='dash')), secondary_y=True)
# Add figure title
fig.update_layout(title_text="Daily avg. DF of Control and Test")
# Set x-axis title
fig.update_xaxes(title_text="Date")
# Set y-axes titles
fig.update_yaxes(title_text="Avg. DF", showgrid=False, secondary_y=False)
fig.update_yaxes(title_text="Delta % difference of Variation1 vs Control", showgrid=False,secondary_y=True)
fig.show()

In [None]:
# Distribution of the DPS delivery fee per stage, split by variant.
# Uses "dps_delivery_fee_local", the column the order query returns; "dps_delivery_fee" is not in `df`.
# NOTE(review): assumes dps_delivery_fee_local is the intended fee column — confirm.
fig = px.box(df, x="stage", y="dps_delivery_fee_local", color = "variant")
fig.show()

#### Aggregated level

In [None]:
# Daily delivery fee significance check:
# paired Wilcoxon signed-rank test on the daily average fee of both groups.
# No `alternative` is passed, so scipy's default applies.
#df_avg.describe()
stats.wilcoxon(df_avg['Variation1'], df_avg['Control'])

WilcoxonResult(statistic=59.0, pvalue=0.04955245211417102)

In [None]:
# Daily hourly delivery fee significance check
# Average DPS delivery fee per (local_date, hour_block), one column per variant.
# Uses "dps_delivery_fee_local", the column the order query returns; "dps_delivery_fee" is not in `df`.
# NOTE(review): assumes dps_delivery_fee_local is the intended fee column — confirm.
# NOTE(review): this reuses the name `df_sig`, replacing the multi-metric table built earlier under that name.
df_sig = df.pivot_table("dps_delivery_fee_local", index=["local_date", "hour_block"], columns ="variant", aggfunc = "mean").reset_index()
# Paired Wilcoxon signed-rank test with scipy's default two-sided alternative: the
# p-value shown is two-sided, not the one-tailed "test is less than control" value.
stats.wilcoxon(df_sig['Variation1'], df_sig['Control'])

WilcoxonResult(statistic=12948.0, pvalue=1.0545732748802742e-05)

#### Stage level

In [None]:
# Daily average DPS delivery fee per stage and variant, and the relative gap.
# Uses "dps_delivery_fee_local", the column the order query returns; "dps_delivery_fee" is not in `df`.
# NOTE(review): assumes dps_delivery_fee_local is the intended fee column — confirm.
pivot = df.pivot_table("dps_delivery_fee_local", index=["stage","local_date"], columns ="variant", aggfunc = "mean")
# Keep the delta numeric so the y-axis is a real numeric scale; the formatted text goes
# into a separate label column. Plotting the formatted strings as y would make the axis categorical.
pivot["delta"] = pivot["Variation1"]/pivot["Control"]-1
pivot["delta_label"] = pivot["delta"].astype(float).map(lambda n: '{:.2%}'.format(n))
pivot = pivot.reset_index()

fig = px.line(pivot, x="local_date", y="delta", color="stage", text = "delta_label",category_orders={"stage": ["price_easing", "default", "surge1", "surge_shrink1", "shrink2"]})
# Set x-axis title
fig.update_xaxes(title_text="Date")
# Set y-axes titles and show the ticks as percentages
fig.update_yaxes(title_text="delta (variation1 vs control)", showgrid=False, tickformat=".0%")
fig.show()

In [None]:
# Prepare for the data: per (date, hour block, stage, zone) totals by variant, then per-order averages.
# Uses "dps_delivery_fee_local", the column the order query returns; "dps_delivery_fee" is not in `df`.
# NOTE(review): assumes dps_delivery_fee_local is the intended fee column — confirm.
df_data = pd.pivot_table(df, values=["dps_delivery_fee_local","platform_order_code_ga", "gfv_eur","tt"], index=["local_date", "hour_block", "stage", "zone_name"],columns= "variant",
                         aggfunc = {'dps_delivery_fee_local' : 'sum', 'platform_order_code_ga' : 'count', 'gfv_eur' : 'sum', 'tt':'sum'}).reset_index()
df_data = df_data.rename(columns={'dps_delivery_fee_local': 'sum_df', 'platform_order_code_ga': 'order'})
# Flatten the (metric, variant) column pairs into lower-case "metric_variant" names;
# the index columns end with "_" after the join and are renamed back.
df_data.columns = ['_'.join(col).strip() for col in df_data.columns.values]
df_data = df_data.rename(columns={'local_date_': 'local_date', 'hour_block_': 'hour_block','stage_': 'stage','zone_name_':'zone_name'})
df_data.columns = [column_name.lower() for column_name in df_data.columns]
# Per-order averages for each variant.
df_data["avg_df_control"] = df_data["sum_df_control"]/df_data["order_control"]
df_data["avg_df_variation1"] = df_data["sum_df_variation1"]/df_data["order_variation1"]
df_data["avg_gfv_control"] = df_data["gfv_eur_control"]/df_data["order_control"]
df_data["avg_gfv_variation1"] = df_data["gfv_eur_variation1"]/df_data["order_variation1"]
df_data["avg_tt_control"] = df_data["tt_control"]/df_data["order_control"]
df_data["avg_tt_variation1"] = df_data["tt_variation1"]/df_data["order_variation1"]
# Blocks where a variant has no orders produce missing values; treat them as 0.
df_data = df_data.fillna(0)

In [None]:
# Delivery fee summary per zone and stage: order-weighted means, spread and correlation
# of the per-row averages, order counts and a halved Wilcoxon p-value.
rows_list = []

for zone_name in zone_names:
  for stage in stages:
    # rows of df_data that belong to the current zone/stage combination
    zone_data = df_data.loc[(df_data["zone_name"] == zone_name) & (df_data["stage"] == stage)].fillna(0)

    # order counts per variant (also the weights of the means below)
    sample_size_control = zone_data["order_control"].sum()
    sample_size_variation1 = zone_data["order_variation1"].sum()
    sample_size_total = sample_size_control + sample_size_variation1

    # order-weighted mean delivery fee = total fee / total orders
    mean_control = round(zone_data["sum_df_control"].sum() / zone_data["order_control"].sum(), 2)
    mean_variation1 = round(zone_data["sum_df_variation1"].sum() / zone_data["order_variation1"].sum(), 2)

    # spread and correlation of the per-row average fees
    std_control = round(zone_data["avg_df_control"].std(), 2)
    std_variation1 = round(zone_data["avg_df_variation1"].std(), 2)
    cor_control_variation1 = round(zone_data["avg_df_control"].corr(zone_data["avg_df_variation1"]), 2)

    # two-sided Wilcoxon p-value, halved to serve as a one-sided p-value
    sig_control_variation1_oneside = round(
        stats.wilcoxon(zone_data['avg_df_variation1'], zone_data['avg_df_control']).pvalue / 2, 2)

    rows_list.append([
        zone_name, stage,
        mean_control, mean_variation1,
        std_control, std_variation1,
        cor_control_variation1,
        sample_size_control, sample_size_variation1, sample_size_total,
        sig_control_variation1_oneside,
    ])

# one row per (zone, stage)
result = pd.DataFrame(
    rows_list,
    columns=['zone_name','stage','mean_control','mean_variation','std_control', 'std_variation',
             'correlation', 'size_control', 'size_variation', 'size_total', 'p-value'],
)
# significance flag at the 5% level and relative difference of the means
result["significance"] = result["p-value"] <= 0.05
result["delta"] = round(result["mean_variation"] / result["mean_control"] - 1, 2)
result

#https://stackoverflow.com/questions/10715965/add-one-row-to-pandas-dataframe

Unnamed: 0,zone_name,stage,mean_control,mean_variation,std_control,std_variation,correlation,size_control,size_variation,size_total,p-value,significance,delta
0,Far_east,price_easing,3.27,2.65,1.28,1.0,0.1,2709.0,2904.0,5613.0,0.0,True,-0.19
1,Far_east,default,3.38,3.4,0.71,0.72,0.32,40057.0,40354.0,80411.0,0.01,True,0.01
2,Far_east,surge1,3.39,3.55,0.95,1.06,-0.23,18732.0,17504.0,36236.0,0.0,True,0.05
3,Far_east,surge_shrink1,3.17,3.78,1.13,1.29,0.1,3905.0,3406.0,7311.0,0.0,True,0.19
4,Far_east,shrink2,3.14,3.67,0.9,1.48,-0.23,810.0,708.0,1518.0,0.06,False,0.17


### 3. Basket Value

In [None]:
# Daily mean basket value (gfv_eur) per variant and the relative difference
# of Variation1 vs Control.
food_avg = (
    df.pivot_table("gfv_eur", index=["local_date"], columns="variant", aggfunc="mean")
      .reset_index()
)
food_avg["delta"] = food_avg["Variation1"] / food_avg["Control"] - 1

# Both variants are drawn on the primary y-axis of a figure that also has a secondary axis.
fig = make_subplots(specs=[[{"secondary_y": True}]])
fig.add_trace(
    go.Scatter(x=food_avg["local_date"], y=food_avg["Control"], mode="lines", name="control"),
    secondary_y=False,
)
fig.add_trace(
    go.Scatter(x=food_avg["local_date"], y=food_avg["Variation1"], mode="lines", name="variation"),
    secondary_y=False,
)
# Figure and axis titles
fig.update_layout(title_text="Daily avg. Basket Value of Control and Test")
fig.update_xaxes(title_text="Date")
fig.update_yaxes(title_text="Avg. Basket Value", showgrid=False, secondary_y=False)
fig.update_yaxes(title_text="Delta % difference of Variation1 vs Control", showgrid=False, secondary_y=True)
fig.show()

#### Aggregated level

In [None]:
# Daily basket value significance check: paired Wilcoxon signed-rank test on the
# daily means in food_avg (two-sided, scipy's default alternative).
stats.wilcoxon(x=food_avg['Variation1'], y=food_avg['Control'])

WilcoxonResult(statistic=52.0, pvalue=0.02730632132788956)

In [None]:
# Basket value significance check at (local_date, hour_block) granularity.
food_sig = (
    df.pivot_table("gfv_eur", index=["local_date", "hour_block"], columns="variant", aggfunc="mean")
      .reset_index()
)
# paired, two-sided (scipy default) Wilcoxon signed-rank test
stats.wilcoxon(x=food_sig['Variation1'], y=food_sig['Control'])

WilcoxonResult(statistic=17383.0, pvalue=0.3129581320459671)

#### Stage level

In [None]:
# Mean basket value (gfv_eur) per (stage, local_date) and variant, plus the
# relative difference of Variation1 vs Control.
pivot = df.pivot_table("gfv_eur", index=["stage", "local_date"], columns="variant", aggfunc="mean")
# Keep `delta` numeric: a column of pre-formatted strings such as "12.34%" makes
# plotly use a categorical y-axis (values ordered by appearance, not magnitude).
pivot["delta"] = (pivot["Variation1"] / pivot["Control"] - 1).astype(float)
# Formatted percentage, used only as the text label next to each point.
pivot["delta_label"] = pivot["delta"].map('{:.2%}'.format)
pivot = pivot.reset_index()

fig = px.line(
    pivot,
    x="local_date",
    y="delta",
    color="stage",
    text="delta_label",
    category_orders={"stage": ["price_easing", "default", "surge1", "surge_shrink1", "shrink2"]},
)
# Set x-axis title
fig.update_xaxes(title_text="Date")
# Set y-axis title; ticks are rendered as percentages since delta is a ratio
fig.update_yaxes(title_text="delta (variation1 vs control)", showgrid=False, tickformat=".0%")
fig.show()

In [None]:
# Basket value distribution per stage and variant, restricted to orders with
# gfv_eur below 50 (presumably to keep large baskets from stretching the axis).
subset = df.loc[df["gfv_eur"] < 50]
fig = px.box(
    subset,
    x="stage",
    y="gfv_eur",
    color="variant",
)
fig.show()

In [None]:
# Basket value summary per zone and stage: order-weighted means, spread and correlation
# of the per-row averages, order counts and a halved Wilcoxon p-value.
rows_list = []

for zone_name in zone_names:
  for stage in stages:
    # rows of df_data that belong to the current zone/stage combination
    zone_data = df_data.loc[(df_data["zone_name"] == zone_name) & (df_data["stage"] == stage)].fillna(0)

    # order counts per variant (also the weights of the means below)
    sample_size_control = zone_data["order_control"].sum()
    sample_size_variation1 = zone_data["order_variation1"].sum()
    sample_size_total = sample_size_control + sample_size_variation1

    # order-weighted mean basket value = total gfv / total orders
    mean_control = round(zone_data["gfv_eur_control"].sum() / zone_data["order_control"].sum(), 2)
    mean_variation1 = round(zone_data["gfv_eur_variation1"].sum() / zone_data["order_variation1"].sum(), 2)

    # spread and correlation of the per-row average basket values
    std_control = round(zone_data["avg_gfv_control"].std(), 2)
    std_variation1 = round(zone_data["avg_gfv_variation1"].std(), 2)
    cor_control_variation1 = round(zone_data["avg_gfv_control"].corr(zone_data["avg_gfv_variation1"]), 2)

    # two-sided Wilcoxon p-value, halved to serve as a one-sided p-value
    sig_control_variation1_oneside = round(
        stats.wilcoxon(zone_data['avg_gfv_variation1'], zone_data['avg_gfv_control']).pvalue / 2, 2)

    rows_list.append([
        zone_name, stage,
        mean_control, mean_variation1,
        std_control, std_variation1,
        cor_control_variation1,
        sample_size_control, sample_size_variation1, sample_size_total,
        sig_control_variation1_oneside,
    ])

# one row per (zone, stage)
result = pd.DataFrame(
    rows_list,
    columns=['zone_name','stage','mean_control','mean_variation','std_control', 'std_variation',
             'correlation', 'size_control', 'size_variation', 'size_total', 'p-value'],
)
# significance flag at the 5% level and relative difference of the means
result["significance"] = result["p-value"] <= 0.05
result["delta"] = round(result["mean_variation"] / result["mean_control"] - 1, 2)
result

#https://stackoverflow.com/questions/10715965/add-one-row-to-pandas-dataframe

Unnamed: 0,zone_name,stage,mean_control,mean_variation,std_control,std_variation,correlation,size_control,size_variation,size_total,p-value,significance,delta
0,Far_east,price_easing,11.49,11.75,7.06,6.15,-0.06,2709.0,2904.0,5613.0,0.16,False,0.02
1,Far_east,default,12.22,12.22,3.45,5.74,0.13,40057.0,40354.0,80411.0,0.27,False,0.0
2,Far_east,surge1,12.6,12.8,5.89,7.95,-0.01,18732.0,17504.0,36236.0,0.09,False,0.02
3,Far_east,surge_shrink1,13.06,13.42,6.62,5.49,0.44,3905.0,3406.0,7311.0,0.1,False,0.03
4,Far_east,shrink2,13.35,13.88,6.92,5.87,-0.1,810.0,708.0,1518.0,0.18,False,0.04


### 4. Travel Time

In [None]:
# Number of orders per travel-time value (`tt`) and variant.
tt_order = df.pivot_table("platform_order_code_ga", index=["tt"], columns="variant", aggfunc="count")
# A tt value with no orders for one variant comes out as NaN. Count it as 0 orders so the
# Savitzky-Golay smoothing applied to these columns later does not propagate NaN
# through its whole window.
tt_order = tt_order.fillna(0)
# Share of each variant's orders that falls on each tt value.
tt_order["Control_pp"] = tt_order["Control"] / tt_order["Control"].sum()
tt_order["Variation1_pp"] = tt_order["Variation1"] / tt_order["Variation1"].sum()
tt_order = tt_order.reset_index()
tt_order.head()

variant,tt,Control,Variation1,Control_pp,Variation1_pp
0,0.0,15.0,23.0,0.000227,0.000355
1,0.1,16.0,18.0,0.000242,0.000277
2,0.2,58.0,59.0,0.000876,0.000909
3,0.3,88.0,65.0,0.001329,0.001002
4,0.4,217.0,175.0,0.003277,0.002697


In [None]:
# Raw order counts per travel-time value for Control and Variation1.
fig = make_subplots(specs=[[{"secondary_y": True}]])
fig.add_trace(go.Scatter(x=tt_order.tt, y=tt_order.Control, mode="lines", name="control"), secondary_y=False)
fig.add_trace(go.Scatter(x=tt_order.tt, y=tt_order.Variation1, mode="lines", name="variation1"), secondary_y=False)
# Add figure title
fig.update_layout(title_text="Order distribution across travel time")
# Set x-axis title: the x values are travel times (tt), not dates
fig.update_xaxes(title_text="Travel Time")
# Set y-axes titles
fig.update_yaxes(title_text="Order amount", showgrid=False, secondary_y=False)
fig.update_yaxes(title_text="Delta % difference of Variation1 vs Control", showgrid=False, secondary_y=True)
fig.show()

In [None]:
# Smoothed order counts per travel-time value for Control and Variation1.
# `signal` (scipy.signal) is imported once in the notebook's first cell, so it is not
# re-imported here. savgol_filter(values, 53, 3): window of 53 points, cubic polynomial.
fig = make_subplots(specs=[[{"secondary_y": True}]])
fig.add_trace(go.Scatter(x=tt_order.tt, y=signal.savgol_filter(tt_order.Control, 53, 3), mode="lines", name="control"), secondary_y=False)
fig.add_trace(go.Scatter(x=tt_order.tt, y=signal.savgol_filter(tt_order.Variation1, 53, 3), mode="lines", name="variation1"), secondary_y=False)
# Add figure title
fig.update_layout(title_text="Order across tt of Control and Test")
# Set x-axis title
fig.update_xaxes(title_text="Travel Time")
# Set y-axes titles
fig.update_yaxes(title_text="Order amount", showgrid=False, secondary_y=False)
fig.update_yaxes(title_text="Delta % difference of Variation1 vs Control", showgrid=False, secondary_y=True)
fig.show()

In [None]:
# Order share per travel-time value (`tt`) during the surge stages
# (surge1, surge_shrink1, shrink2), per variant.
tt_surge = df[df.stage.isin(["surge_shrink1", "surge1", "shrink2"])]
surge_pv = tt_surge.pivot_table("platform_order_code_ga", index=["tt"], columns="variant", aggfunc="count")
# tt values with no orders for one variant come out as NaN; count them as 0 so
# savgol_filter below does not return NaN across its whole window.
surge_pv = surge_pv.fillna(0)
surge_pv["Control_pp"] = surge_pv["Control"] / surge_pv["Control"].sum()
surge_pv["Variation1_pp"] = surge_pv["Variation1"] / surge_pv["Variation1"].sum()
surge_pv = surge_pv.reset_index()

# Smoothed shares (Savitzky-Golay, window 53, cubic) on the primary y-axis.
fig = make_subplots(specs=[[{"secondary_y": True}]])
fig.add_trace(go.Scatter(x=surge_pv.tt, y=signal.savgol_filter(surge_pv.Control_pp, 53, 3), mode="lines", name="control"), secondary_y=False)
fig.add_trace(go.Scatter(x=surge_pv.tt, y=signal.savgol_filter(surge_pv.Variation1_pp, 53, 3), mode="lines", name="variation1"), secondary_y=False)
# Add figure title
fig.update_layout(title_text="Order share across travel time of Control and Test in surge stages")
# Set x-axis title
fig.update_xaxes(title_text="Travel Time")
# Set y-axes titles
fig.update_yaxes(title_text="Orders%", showgrid=False, secondary_y=False)
fig.update_yaxes(title_text="Delta % difference of Variation1 vs Control", showgrid=False, secondary_y=True)
fig.show()

In [None]:
# Order share per travel-time value (`tt`) during the default stage, per variant.
tt_default = df[df.stage == "default"]
default_pv = tt_default.pivot_table("platform_order_code_ga", index=["tt"], columns="variant", aggfunc="count")
# tt values with no orders for one variant come out as NaN; count them as 0 so
# savgol_filter below does not return NaN across its whole window.
default_pv = default_pv.fillna(0)
default_pv["Control_pp"] = default_pv["Control"] / default_pv["Control"].sum()
default_pv["Variation1_pp"] = default_pv["Variation1"] / default_pv["Variation1"].sum()
default_pv = default_pv.reset_index()

# Smoothed shares (Savitzky-Golay, window 53, cubic) on the primary y-axis.
fig = make_subplots(specs=[[{"secondary_y": True}]])
fig.add_trace(go.Scatter(x=default_pv.tt, y=signal.savgol_filter(default_pv.Control_pp, 53, 3), mode="lines", name="control"), secondary_y=False)
fig.add_trace(go.Scatter(x=default_pv.tt, y=signal.savgol_filter(default_pv.Variation1_pp, 53, 3), mode="lines", name="variation1"), secondary_y=False)
# Add figure title
fig.update_layout(title_text="Order share across travel time of Control and Test in default stage")
# Set x-axis title
fig.update_xaxes(title_text="Travel Time")
# Set y-axes titles
fig.update_yaxes(title_text="Orders%", showgrid=False, secondary_y=False)
fig.update_yaxes(title_text="Delta % difference of Variation1 vs Control", showgrid=False, secondary_y=True)
fig.show()

In [None]:
# Daily mean travel time per variant and the relative difference of Variation1 vs Control.
tt_avg = df.pivot_table("travel_time", index=["local_date"], columns="variant", aggfunc="mean").reset_index()
tt_avg["delta"] = tt_avg["Variation1"] / tt_avg["Control"] - 1

fig = make_subplots(specs=[[{"secondary_y": True}]])
fig.add_trace(go.Scatter(x=tt_avg.local_date, y=tt_avg.Control, mode="lines", name="control"), secondary_y=False)
fig.add_trace(go.Scatter(x=tt_avg.local_date, y=tt_avg.Variation1, mode="lines", name="variation"), secondary_y=False)
# Add figure title
fig.update_layout(title_text="Daily Average Travel Time")
# Set x-axis title
fig.update_xaxes(title_text="Date")
# Set y-axes titles: the plotted metric is travel time, not basket value
fig.update_yaxes(title_text="Avg. Travel Time", showgrid=False, secondary_y=False)
fig.update_yaxes(title_text="Delta % difference of Variation1 vs Control", showgrid=False, secondary_y=True)
fig.show()

In [None]:
# Order count per (tt, stage, variant) in long format, with the count column
# renamed to `order_amount`.
tt_order_hour = (
    df.pivot_table("platform_order_code_ga", index=["tt", "stage", "variant"], aggfunc="count")
      .reset_index()
      .rename(columns={'platform_order_code_ga': 'order_amount'})
)
tt_order_hour.head()

Unnamed: 0,tt,stage,variant,order_amount
0,0.0,default,Control,13
1,0.0,default,Variation1,16
2,0.0,price_easing,Variation1,1
3,0.0,surge1,Control,2
4,0.0,surge1,Variation1,5


In [None]:
def smoothTriangle(data, degree):
    """Smooth `data` with a triangular-weighted moving average.

    The weights rise from 0 to `degree` and fall back to 0, giving a window of
    2 * degree + 1 points. Windows are evaluated for start positions
    degree .. len(data) - 2 * degree - 1; the result is then padded at the front
    with copies of the first smoothed value and at the back with copies of the
    last one until it is as long as `data`.

    Parameters:
        data: sliceable numeric sequence (e.g. numpy array or pandas Series).
        degree: half-width of the triangular window.

    Returns:
        A list with len(data) smoothed values.

    Raises:
        IndexError: if `data` is too short for a single window to be evaluated.
    """
    weights = np.concatenate((np.arange(degree + 1), np.arange(degree)[::-1]))  # up then down
    width = len(weights)
    weight_total = np.sum(weights)

    # weighted mean of every full window
    core = [
        np.sum(data[start:start + width] * weights) / weight_total
        for start in range(degree, len(data) - degree * 2)
    ]

    # Handle boundaries: repeat the first value in front, the last value behind
    result = [core[0]] * int(degree + degree / 2) + core
    shortfall = len(data) - len(result)
    if shortfall > 0:
        result.extend([result[-1]] * shortfall)
    return result

# Smoothed order amount across travel time, one facet per stage and one line per variant.
# NOTE(review): smoothTriangle is applied to the whole order_amount column at once, so each
# smoothing window mixes rows of different stages and variants - confirm this is intended.
fig = px.line(
    x=tt_order_hour.tt,
    y=smoothTriangle(tt_order_hour.order_amount, 10),
    color=tt_order_hour.variant,
    facet_col=tt_order_hour.stage,
    facet_col_wrap=5,
)
fig.show()

#fig.add_trace(go.Scatter(x =tt_order.tt, y=signal.savgol_filter(tt_order.Control_pp,53, 3), mode="lines", name ="control"), secondary_y=False)

#### Aggregated level

In [None]:
# Travel time distribution per variant.
fig = px.box(
    df,
    x="variant",
    y="travel_time",
)
fig.show()

In [None]:
# Daily travel time significance check: paired Wilcoxon signed-rank test on the
# daily mean travel time of Variation1 vs Control (two-sided, scipy's default).
stats.wilcoxon(tt_avg['Variation1'], tt_avg['Control'])

WilcoxonResult(statistic=38.0, pvalue=0.007065994730893274)

In [None]:
# Travel time significance check at (local_date, hour_block) granularity:
# one mean travel time per date/hour block and variant, paired across variants.
tt_sig = df.pivot_table("travel_time", index=["local_date", "hour_block"], columns ="variant", aggfunc = "mean").reset_index()
# NOTE: no `alternative` is passed, so this is scipy's default two-sided test;
# the p-value shown below is NOT one-tailed.
stats.wilcoxon(tt_sig['Variation1'], tt_sig['Control']) 

WilcoxonResult(statistic=15644.0, pvalue=0.019238182346056468)

#### Stage level

In [None]:
# Mean travel time (`tt`) per (stage, local_date) and variant, plus the
# relative difference of Variation1 vs Control.
pivot = df.pivot_table("tt", index=["stage", "local_date"], columns="variant", aggfunc="mean")
# Keep `delta` numeric: a column of pre-formatted strings such as "12.34%" makes
# plotly use a categorical y-axis (values ordered by appearance, not magnitude).
pivot["delta"] = (pivot["Variation1"] / pivot["Control"] - 1).astype(float)
# Formatted percentage, used only as the text label next to each point.
pivot["delta_label"] = pivot["delta"].map('{:.2%}'.format)
pivot = pivot.reset_index()

fig = px.line(
    pivot,
    x="local_date",
    y="delta",
    color="stage",
    text="delta_label",
    category_orders={"stage": ["price_easing", "default", "surge1", "surge_shrink1", "shrink2"]},
)
# Set x-axis title
fig.update_xaxes(title_text="Date")
# Set y-axis title; ticks are rendered as percentages since delta is a ratio
fig.update_yaxes(title_text="delta (variation1 vs control)", showgrid=False, tickformat=".0%")
fig.show()

In [None]:
# Travel time distribution per stage and variant (all orders, no basket-size filter).
fig = px.box(
    df,
    x="stage",
    y="travel_time",
    color="variant",
)
fig.show()

In [None]:
# Travel time summary per zone and stage: order-weighted means, spread and correlation
# of the per-row averages, order counts and a halved Wilcoxon p-value.
rows_list = []

# loop the data for each zone and each stage
for zone_name in zone_names:
  for stage in stages:
    zone_data = df_data[(df_data["zone_name"] == zone_name) & (df_data["stage"] == stage)] #filter relevant zone and stage for each loop
    zone_data = zone_data.fillna(0)
    # order-weighted mean travel time = total tt / total orders
    mean_control = round(zone_data["tt_control"].sum()/zone_data["order_control"].sum(),2)
    mean_variation1 = round(zone_data["tt_variation1"].sum()/zone_data["order_variation1"].sum(),2)
    std_control = round(zone_data["avg_tt_control"].std(),2)
    std_variation1 = round(zone_data["avg_tt_variation1"].std(),2)
    # Correlate travel time with travel time. The previous version paired avg_tt_control
    # with avg_gfv_variation1 (basket value), a leftover from the basket value cell.
    cor_control_variation1 = round(zone_data["avg_tt_control"].corr(zone_data["avg_tt_variation1"]),2)
    sample_size_control = zone_data["order_control"].sum()
    sample_size_variation1 = zone_data["order_variation1"].sum()
    sample_size_total = sample_size_control + sample_size_variation1
    # Test the travel time columns. The previous version tested the avg_gfv_* columns, so
    # this table repeated the basket value p-values.
    # The two-sided p-value is halved to serve as a one-sided p-value; that is only
    # meaningful when the observed difference points in the hypothesised direction.
    sig_control_variation1_oneside = round(stats.wilcoxon(zone_data['avg_tt_variation1'], zone_data['avg_tt_control'])[1]/2,2)
    rows_list.append([zone_name, stage, mean_control, mean_variation1, std_control, std_variation1, cor_control_variation1, sample_size_control, sample_size_variation1, sample_size_total, sig_control_variation1_oneside])

# load data into dataframe
result = pd.DataFrame(rows_list, columns=['zone_name','stage','mean_control','mean_variation','std_control', 'std_variation', 'correlation', 'size_control', 'size_variation', 'size_total', 'p-value'])
# add two columns to data frame: significance at the 5% level and relative difference of the means
result["significance"] = result["p-value"]<=0.05
result["delta"] = round(result["mean_variation"]/result["mean_control"]-1,2)
result

#https://stackoverflow.com/questions/10715965/add-one-row-to-pandas-dataframe

Unnamed: 0,zone_name,stage,mean_control,mean_variation,std_control,std_variation,correlation,size_control,size_variation,size_total,p-value,significance,delta
0,Far_east,price_easing,7.02,6.9,2.98,3.05,0.01,2709.0,2904.0,5613.0,0.16,False,-0.02
1,Far_east,default,6.86,6.83,1.81,1.83,-0.23,40057.0,40354.0,80411.0,0.27,False,-0.0
2,Far_east,surge1,6.74,6.55,3.08,2.33,-0.02,18732.0,17504.0,36236.0,0.09,False,-0.03
3,Far_east,surge_shrink1,5.7,5.82,2.66,3.88,0.14,3905.0,3406.0,7311.0,0.1,False,0.02
4,Far_east,shrink2,4.9,5.03,4.05,2.46,-0.35,810.0,708.0,1518.0,0.18,False,0.03


# Pre/Post Analysis

In [None]:
# Disabled earlier version of the pre/post query. The whole cell body is a single
# triple-quoted string literal, so nothing in it is executed: `query3` is never
# defined and no BigQuery call is made; the notebook only echoes the string.
'''
query3 = """

SELECT 
DATETIME(o.created_at, o.timezone) as local_time
, o.platform_order_code
, t.variant
, vendor.vendor_code
, o.zone_id as zone_id
, st_distance(vendor.location,customer.location)*4.51650006498/1000 as travel_time
, CASE 
    WHEN cast(DATETIME(o.created_at, o.timezone) as date) < \"""" + start_date + """\" THEN 'before_test'
    ELSE 'during_test'
  END
  AS period
, pa.gfv_eur
, voucher_type
, voucher_value_local
, discount_type
, discount_value_local
, pa.delivery_fee_original_local df_origin
, pa.delivery_fee_local df_final
FROM fulfillment-dwh-production.cl.orders o
left join unnest(deliveries) d
left join unnest(porygon) p
left join unnest(d.timings) t
left join `dhh---analytics-apac.pandata.fct_orders` pa on o.platform_order_code = pa.order_code_google and lower(o.country_code) = lower(pa.country_iso_code)
left join fulfillment-dwh-production.cl._dps_sessions_mapped_to_orders t on o.platform_order_code = t.platform_order_code and t.entity_id = o.entity.id
WHERE created_date_local between \"""" + pre_date + """\"  and \"""" + end_date + """\"
and o.entity.id = \"""" + entity_id + """\"
and lower(pa.country_iso_code) = \"""" + country_code + """\"
and is_valid_order
and o.order_status = 'completed'
and d.delivery_status = 'completed'
and p.vehicle_profile = 'default'
and o.zone_id in (""" + zone_id + """)
group by 1,2,3,4,5,6,7,8,9,10,11,12,13,14

    """

pp = client.query(query3).to_dataframe()

'''

'\nquery3 = """\n\nSELECT \nDATETIME(o.created_at, o.timezone) as local_time\n, o.platform_order_code\n, t.variant\n, vendor.vendor_code\n, o.zone_id as zone_id\n, st_distance(vendor.location,customer.location)*4.51650006498/1000 as travel_time\n, CASE \n    WHEN cast(DATETIME(o.created_at, o.timezone) as date) < """" + start_date + """" THEN \'before_test\'\n    ELSE \'during_test\'\n  END\n  AS period\n, pa.gfv_eur\n, voucher_type\n, voucher_value_local\n, discount_type\n, discount_value_local\n, pa.delivery_fee_original_local df_origin\n, pa.delivery_fee_local df_final\nFROM fulfillment-dwh-production.cl.orders o\nleft join unnest(deliveries) d\nleft join unnest(porygon) p\nleft join unnest(d.timings) t\nleft join `dhh---analytics-apac.pandata.fct_orders` pa on o.platform_order_code = pa.order_code_google and lower(o.country_code) = lower(pa.country_iso_code)\nleft join fulfillment-dwh-production.cl._dps_sessions_mapped_to_orders t on o.platform_order_code = t.platform_order_code an

In [None]:
# Pre/post data pull: order rows from cl._dps_sessions_mapped_to_orders for the configured
# entity, with local dates between pre_date and end_date (i.e. the pre-test window
# plus the test window). The zone filter lines start with '#' inside the SQL text, so
# all zones of the entity are returned.
# NOTE(review): the GROUP BY over all 25 selected columns presumably serves to drop
# duplicate rows - confirm.
# NOTE(review): entity_id, pre_date and end_date are concatenated into the SQL text; they
# are notebook constants here, but query parameters would be safer if they ever come from input.
query = """

select
zone_name
, zone_id
, vertical_type
, operating_system
, DATETIME(created_at, timezone) as local_time
, vendor_code
, platform_order_code
, platform_order_code_ga
, variant
, dps_delivery_fee
, dps_surge_fee
, dps_travel_time
, gmv_eur
, gfv_eur
, delivery_fee_eur
, delivery_fee_local_accounting
, travel_time_distance_km
, delay
, mean_delay
, travel_time
, to_customer_time
, to_vendor_time
, delivery_distance
, actual_delivery_time
, order_delay_mins
from cl._dps_sessions_mapped_to_orders
where entity_id = \"""" + entity_id + """\"
#and zone_id in (""" + zone_id + """)
#and zone_id is not null
and cast(DATETIME(created_at, timezone) as date) between \"""" + pre_date + """\"  and \"""" + end_date + """\"
group by 1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25
    """

# Run the query with the BigQuery client and load the result into the DataFrame `pp`.
pp = client.query(query).to_dataframe()

In [None]:
# Clean the pre/post data and derive the helper columns used by the analysis below.

# First day of the test, taken from the notebook config (`start_date`, "YYYY-MM-DD")
# instead of a hard-coded date, so the period split follows the configured test window.
test_start = datetime.strptime(start_date, "%Y-%m-%d").date()
# Zones under test, taken from the config string `zone_id` (e.g. "3" or "9,35").
test_zone_ids = [int(z) for z in zone_id.split(",")]

# 1. local date and the 30-minute slot (rounded up by ceil_dt) of each order
pp["local_date"] = pp.local_time.dt.date
pp["new_time"] = ceil_dt((pp['local_time'].dt.to_pydatetime()), timedelta(minutes=30))
pp["new_time"] = pp['new_time'].dt.time

is_before_test = pp.local_date < test_start
is_during_test = pp.local_date >= test_start

# 2. pre/post label combined with the variant; variants other than
#    Control / Variation1 keep their original variant value during the test
pp["period_variant"] = pp["variant"]
pp.loc[is_before_test, 'period_variant'] = "before_test"
pp.loc[(pp.variant == "Control") & is_during_test, 'period_variant'] = "during_test_control"
pp.loc[(pp.variant == "Variation1") & is_during_test, 'period_variant'] = "during_test_variation"

# 3. pre/post label on its own
pp["period"] = pp["local_date"]
pp.loc[is_before_test, 'period'] = "before_test"
pp.loc[is_during_test, 'period'] = "during_test"

# 4. zone group: Far_east and Bedok keep their name, every other zone becomes "other_zones"
pp["zone_group"] = pp["zone_name"].where(pp["zone_name"].isin(["Far_east", "Bedok"]), "other_zones")

# 5. travel time rounded to one decimal
pp["tt"] = pp["travel_time"].round(1)

# only the configured test zone(s), without rows whose variant is "Original"
pp3 = pp[pp.zone_id.isin(test_zone_ids)]
pp3 = pp3[pp3.period_variant != "Original"]

# neither the test zone(s) nor zone 36 (presumably Bedok, per the original comment - confirm)
pp_other = pp[~pp.zone_id.isin(test_zone_ids + [36])]

## Deep Analysis

In [None]:
# Summary: per zone group, compare average delivery fee, basket value, travel time
# and order count before vs during the test.
# NOTE(review): the name `sum` shadows Python's builtin for the rest of the notebook;
# it is kept here because later cells may refer to it.
pp = pp.fillna(0)
pp = pp[pp.zone_group != "Bedok"]
sum = pd.pivot_table(
    pp,
    values=["dps_delivery_fee", "platform_order_code_ga", "gfv_eur", "tt"],
    index=["zone_group"],
    columns="period",
    aggfunc={'dps_delivery_fee': 'mean', 'platform_order_code_ga': 'count', 'gfv_eur': 'mean', 'tt': 'mean'},
).reset_index()
sum = sum.rename(columns={'dps_delivery_fee': 'avg_df', 'platform_order_code_ga': 'order', 'gfv_eur': 'avg_gfv', 'tt': 'avg_tt'})
# flatten the (metric, period) column tuples into "<metric>_<period>" names
sum.columns = ['_'.join(col).strip() for col in sum.columns.values]
sum = sum.rename(columns={'zone_group_': 'zone_group'})

# relative change during the test vs before the test, formatted as a percentage string
for _delta_col, _prefix in (("df_delta", "avg_df"), ("gfv_delta", "avg_gfv"), ("order_delta", "order"), ("tt_delta", "avg_tt")):
    _ratio = sum[_prefix + "_during_test"] / sum[_prefix + "_before_test"] - 1
    sum[_delta_col] = _ratio.astype(float).map('{:.2%}'.format)
sum

Unnamed: 0,zone_group,avg_df_before_test,avg_df_during_test,avg_gfv_before_test,avg_gfv_during_test,order_before_test,order_during_test,avg_tt_before_test,avg_tt_during_test,df_delta,gfv_delta,order_delta,tt_delta
0,Far_east,2.710258,2.947733,10.558456,10.9136,161117,155265,6.948094,6.716036,8.76%,3.36%,-3.63%,-3.34%
1,other_zones,2.631776,2.697803,10.349022,10.815406,1358517,1345616,5.879755,5.889682,2.51%,4.51%,-0.95%,0.17%


In [None]:
# Summary restricted to restaurant orders whose platform_order_code_ga is not 0:
# summed delivery fee, basket value and travel time plus order count per zone group and period.
pp_gaorder = pp[(pp.platform_order_code_ga != 0) & (pp.vertical_type == "restaurants")]
sum_ga = pd.pivot_table(
    pp_gaorder,
    values=["dps_delivery_fee", "platform_order_code_ga", "gfv_eur", "tt"],
    index=["zone_group"],
    columns="period",
    aggfunc={'dps_delivery_fee': 'sum', 'platform_order_code_ga': 'count', 'gfv_eur': 'sum', 'tt': 'sum'},
).reset_index()
sum_ga


Unnamed: 0_level_0,zone_group,dps_delivery_fee,dps_delivery_fee,gfv_eur,gfv_eur,platform_order_code_ga,platform_order_code_ga,tt,tt
period,Unnamed: 1_level_1,before_test,during_test,before_test,during_test,before_test,during_test,before_test,during_test
0,Bedok,151942.4,158581.7,523660.2,535523.4,44938,45329,221613.6,222489.0
1,Far_east,410929.5,421719.2,1439368.0,1409266.0,116238,114566,742564.2,689008.0
2,other_zones,3384131.0,3309139.0,11984120.0,12213950.0,987181,1010255,5460334.1,5531981.3


### 1. Order change

In [None]:
# Share of distinct orders per 30-minute slot (slots after 08:00), before vs during the test.
subset = pp3[pp3.new_time > time(8, 0, 0)]
order_fe = (
    subset.pivot_table("platform_order_code", index=["new_time"], columns="period",
                       aggfunc=lambda codes: len(codes.unique()))
          .reset_index()
          .fillna(0)
)
# normalise each period's counts to shares of that period's total
order_fe["before_pp"] = order_fe["before_test"] / order_fe["before_test"].sum()
order_fe["during_pp"] = order_fe["during_test"] / order_fe["during_test"].sum()

fig = make_subplots(specs=[[{"secondary_y": True}]])
fig.add_trace(go.Scatter(x=order_fe["new_time"], y=order_fe["before_pp"], name="Beforetesting"), secondary_y=False)
fig.add_trace(go.Scatter(x=order_fe["new_time"], y=order_fe["during_pp"], name="During Test"), secondary_y=False)
# axis titles
fig.update_xaxes(title_text="time")
fig.update_yaxes(title_text="Share of orders", showgrid=False, secondary_y=False)
fig.update_yaxes(title_text="Delta % difference of Variation1 vs Control", showgrid=False, secondary_y=True)
fig.show()

In [None]:
# Share of distinct orders per 30-minute slot (slots after 08:00) for the three groups:
# before the test, during the test (Control) and during the test (Variation).
subset = pp3[pp3.new_time > time(8, 0, 0)]
order_fe = (
    subset.pivot_table("platform_order_code", index=["new_time"], columns="period_variant",
                       aggfunc=lambda codes: len(codes.unique()))
          .reset_index()
          .fillna(0)
)
# normalise each group's counts to shares of that group's total
order_fe["before_pp"] = order_fe["before_test"] / order_fe["before_test"].sum()
order_fe["control_pp"] = order_fe["during_test_control"] / order_fe["during_test_control"].sum()
order_fe["variation_pp"] = order_fe["during_test_variation"] / order_fe["during_test_variation"].sum()

fig = make_subplots(specs=[[{"secondary_y": True}]])
fig.add_trace(go.Scatter(x=order_fe["new_time"], y=order_fe["before_pp"], name="Before testing"), secondary_y=False)
fig.add_trace(go.Scatter(x=order_fe["new_time"], y=order_fe["control_pp"], name="During Test: Control"), secondary_y=False)
fig.add_trace(go.Scatter(x=order_fe["new_time"], y=order_fe["variation_pp"], name="During Test: Variation"), secondary_y=False)
# axis titles
fig.update_xaxes(title_text="time")
fig.update_yaxes(title_text="Share of orders", showgrid=False, secondary_y=False)
fig.update_yaxes(title_text="Delta % difference of Variation1 vs Control", showgrid=False, secondary_y=True)
fig.show()

### 3. Delivery Fee change/distribution

### 4. Basket value change/distribution

### 5. Travel time distribution

In [None]:
# Number of orders per rounded travel time (`tt`), one column per period/variant.
tt_order = pp3.pivot_table(
    values="platform_order_code",
    index=["tt"],
    columns="period_variant",
    aggfunc="count",
).fillna(0)

# Express each group's counts as a share of that group's total orders.
for share_col, count_col in (
    ("before_pp", "before_test"),
    ("control_pp", "during_test_control"),
    ("variation_pp", "during_test_variation"),
):
    group_total = tt_order[count_col].sum()
    tt_order[share_col] = tt_order[count_col] / group_total

tt_order = tt_order.reset_index()
tt_order.head()

period_variant,tt,0,before_test,during_test_control,during_test_variation,before_pp,control_pp,variation_pp
0,0.0,8.0,31.0,15.0,23.0,0.000192,0.000227,0.000355
1,0.1,3.0,42.0,16.0,18.0,0.000261,0.000242,0.000277
2,0.2,25.0,103.0,58.0,59.0,0.000639,0.000876,0.000909
3,0.3,30.0,165.0,88.0,65.0,0.001024,0.001329,0.001002
4,0.4,58.0,442.0,217.0,175.0,0.002743,0.003277,0.002697


In [None]:
# Smoothed order-share curves across travel time, one line per period/variant.
fig = make_subplots(specs=[[{"secondary_y": True}]])

for share_col, trace_name in (
    ("before_pp", "before test"),
    ("control_pp", "during test control"),
    ("variation_pp", "during test variation"),
):
    # Savitzky-Golay smoothing: window of 53 points, cubic polynomial.
    smoothed_share = signal.savgol_filter(tt_order[share_col], 53, 3)
    fig.add_trace(
        go.Scatter(x=tt_order["tt"], y=smoothed_share, mode="lines", name=trace_name),
        secondary_y=False,
    )

fig.update_layout(title_text="Order across travel time (Before test, Test-Control, Test-Variation)")
fig.update_xaxes(title_text="Travel Time (in minutes)")
fig.update_yaxes(title_text="Share of orders", showgrid=False, secondary_y=False)
fig.update_yaxes(title_text="Delta % difference of Variation1 vs Control", showgrid=False, secondary_y=True)

fig.show()

### 2. Sessions converted to orders

In [None]:
# Daily inputs for the session-to-order conversion rate.
#   city_data: shape of zone id 3 in country "sg".
#   orders:    per entity and created_date, the distinct platform_order_code
#              count for FP_SG orders that are completed, have a completed
#              delivery, use the 'default' vehicle profile and are in zone 3.
#   final SELECT: per session date, the distinct customer ids, user ids and
#              session ids from cl.dynamic_pricing_user_sessions whose
#              customer.location lies inside the zone shape, next to that
#              day's order count.
# NOTE(review): entity ('FP_SG'), country ("sg"), zone (3) and the date window
# (2020-08-13 to 2020-09-21) are hard-coded here rather than taken from the
# config cell (entity_id, country_code, zone_id, pre_date = 2020-08-14,
# end_date = 2020-09-24) -- confirm the differing dates are intentional.
query5 =  """
WITH city_data AS (
  SELECT p.entity_id
    , country_code
    , ci.name AS city_name
    , ci.id AS city_id
    , zo.shape AS zone_shape 
    , zo.name AS zone_name
    , zo.id AS zone_id
  FROM cl.countries co
  LEFT JOIN UNNEST(co.platforms) p
  LEFT JOIN UNNEST(co.cities) ci
  LEFT JOIN UNNEST(ci.zones) zo
  WHERE country_code = "sg" and zo.id = 3
),

orders as (
  select entity.id as entity_id
  , created_date
  , count(distinct platform_order_code) as order_amount
  from cl.orders o
  left join unnest(deliveries) d
  left join unnest(porygon) p
  where
  o.entity.id = 'FP_SG'
  and o.order_status = 'completed'
  and d.delivery_status = 'completed'
  and o.created_date between '2020-08-13' and '2020-09-21'
  and p.vehicle_profile = 'default'
  and zone_id in (3) 
  group by 1,2
  )
  
  
SELECT 
se.created_date as date
, o.order_amount
, count(distinct customer.id) customer_amount
, count(distinct customer.user_id) user_amount
, count(distinct customer.session.id) session_amount
FROM cl.dynamic_pricing_user_sessions se
LEFT JOIN city_data cd ON se.entity_id = cd.entity_id
LEFT JOIN orders o on se.entity_id = o.entity_id and o.created_date = se.created_date
where ST_CONTAINS(cd.zone_shape, customer.location) IS TRUE
and se.entity_id = "FP_SG"
and se.created_date between '2020-08-13' and '2020-09-21'
GROUP BY 1,2

    """

# Run the query on BigQuery and load the result into a DataFrame.
session_order = client.query(query5).to_dataframe()

In [None]:
# Conversion rates: daily orders divided by daily distinct customers / sessions.
daily_orders = session_order["order_amount"]
session_order["cvr_customer"] = daily_orders.div(session_order["customer_amount"])
session_order["cvr_session"] = daily_orders.div(session_order["session_amount"])

# Plot the customer-level conversion rate in chronological order.
session_order_by_date = session_order.sort_values("date")
fig = px.line(session_order_by_date, x="date", y="cvr_customer", title="")
fig.show()

### 6. Delay comparison

In [None]:
# Zone-level mean delay per stats timestamp (converted to local time), labelled
# as before/during the test by comparing the local date with `start_date`.
# The config values are interpolated straight into the SQL text; the zone
# filter stays commented out inside the query, so all non-null zones come back.
query4 = f"""

SELECT  
DATETIME(s.created_at,  "{time_zone}") as local_time
,zone_id
,CASE 
    WHEN cast(DATETIME(s.created_at, "{time_zone}") as date) < "{start_date}" THEN 'before_test'
    ELSE 'during_test'
    end as period
, s.mean_delay
FROM `fulfillment-dwh-production.cl._zone_stats` 
left join unnest(stats) s
WHERE cast(DATETIME(s.created_at,  "{time_zone}") as date) between "{pre_date}"  and "{end_date}"
and country_code =  "{country_code}"
--and zone_id = {zone_id}
and zone_id is not null

    """

# Execute on BigQuery and materialise as a DataFrame.
pp_delay = client.query(query4).to_dataframe()

In [None]:
# Round every delay observation up to its 10-minute slot and keep only the
# time-of-day part, so observations from different days share the same slot.
pp_delay["new_time"] = ceil_dt(pp_delay["local_time"].dt.to_pydatetime(), timedelta(minutes=10))
pp_delay["new_time"] = pp_delay["new_time"].dt.time

# Everything at or before 08:00 is collapsed into a single 08:00 slot.
day_start = time(hour=8, minute=0, second=0)
pp_delay.loc[pp_delay["new_time"] <= day_start, "new_time"] = day_start

# The delay query only returns local_time, zone_id, period and mean_delay, so
# the `zone_group` column used in the pivot below would raise a KeyError.
# Derive it from zone_id with the same split used for the delivery-area
# events (zone 3 = Far_east, everything else = other_zones) unless an earlier
# cell has already provided it.
if "zone_group" not in pp_delay.columns:
    pp_delay["zone_group"] = (pp_delay["zone_id"] == 3).map({True: "Far_east", False: "other_zones"})

# Mean delay per time slot, period and zone group.
delay_agg = pp_delay.pivot_table(
    "mean_delay",
    index=["new_time", "period", "zone_group"],
    aggfunc="mean",
).reset_index()

In [None]:
def _mean_delay_by_slot(delay_frame):
    """Return the mean of `mean_delay` per (new_time, period) as a flat DataFrame."""
    return delay_frame.pivot_table(
        "mean_delay", index=["new_time", "period"], aggfunc="mean"
    ).reset_index()


# Split the delay observations into the two zones analysed separately
# (3 and 36) and everything else.
delay3 = pp_delay[pp_delay.zone_id == 3]
delay36 = pp_delay[pp_delay.zone_id == 36]
# "Other" must exclude BOTH zones above; the previous filter tested
# `zone_id != 36` twice, which left zone 3 inside the "other zones" group.
delay_other = pp_delay[~pp_delay.zone_id.isin([3, 36])]

delay3_agg = _mean_delay_by_slot(delay3)
delay36_agg = _mean_delay_by_slot(delay36)
delay_other_agg = _mean_delay_by_slot(delay_other)

In [None]:
# Mean delay by time slot for zone 3, one line per period.
fig = px.line(
    data_frame=delay3_agg,
    x="new_time",
    y="mean_delay",
    color="period",
    title='Mean Delay before-test vs during-test (Far East)',
)
fig.show()

In [None]:
# Mean delay by time slot for the remaining zones, one line per period.
fig = px.line(
    data_frame=delay_other_agg,
    x="new_time",
    y="mean_delay",
    color="period",
    title='Mean Delay before-test vs during-test (Other Zones)',
)
fig.show()

In [None]:
# Order delay rounded to 0.1 minute, used to bucket orders by delay.
# Read the source column from pp_new itself: taking it from `pp` only gives
# the right values while both frames happen to share the same index, and
# pp_new already carries `order_delay_mins` (it is aggregated from pp_new in
# the following cell).
pp_new["real_delay"] = pp_new["order_delay_mins"].round(1)
pp_new.head()

Unnamed: 0,zone_name,zone_id,vertical_type,operating_system,local_time,vendor_code,platform_order_code,platform_order_code_ga,variant,dps_delivery_fee,dps_surge_fee,dps_travel_time,gmv_eur,gfv_eur,delivery_fee_eur,delivery_fee_local_accounting,travel_time_distance_km,delay,mean_delay,travel_time,to_customer_time,to_vendor_time,delivery_distance,actual_delivery_time,order_delay_mins,local_date,new_time,period_variant,tt,real_delay
0,Far_east,3,restaurants,Web,2020-09-22 20:21:44.331155,v6tv,v6tv-xmfp,v6tv-xmfp,Variation1,1.99,-1.0,2.99,6.702253,5.456923,1.24533,1.99,0.411408,4.844058,3.199177,1.85,3.2,0.4,0.49,20.33,3.43,2020-09-22,20:30:00,during_test_variation,1.8,3.4
1,Far_east,3,restaurants,Android,2020-09-23 06:57:36.330984,x1qw,x1qw-pa5b,x1qw-pa5b,Variation1,2.115,-0.875,2.99,10.882557,9.637227,1.24533,1.99,0.57414,4.614669,3.385426,2.58,6.73,0.0,0.73,12.65,-1.25,2020-09-23,07:00:00,during_test_variation,2.6,-1.2
2,Far_east,3,restaurants,Android,2020-09-22 20:16:24.110821,g3sm,g3sm-fj40,g3sm-fj40,Variation1,1.99,-1.0,2.99,7.44069,6.19536,1.24533,1.99,0.482642,3.737256,2.305163,2.17,5.12,0.0,0.59,12.12,-1.78,2020-09-22,20:30:00,during_test_variation,2.2,-1.8
3,Far_east,3,restaurants,Android,2020-09-22 19:22:28.706929,y3ph,y3ph-dqoo,y3ph-dqoo,Variation1,1.99,-1.0,2.99,22.603678,21.358348,1.163977,1.99,0.626494,5.001153,3.303494,2.82,4.23,6.77,0.79,45.47,19.55,2020-09-22,19:30:00,during_test_variation,2.8,19.6
4,Far_east,3,restaurants,iOS,2020-09-22 20:17:50.925290,v7fw,v7fw-4pry,v7fw-4pry,Variation1,1.99,-1.0,2.99,4.3743,2.941232,1.24533,1.99,0.412424,4.607689,3.021993,1.86,2.48,0.0,0.55,8.9,-6.0,2020-09-22,20:30:00,during_test_variation,1.9,-6.0


In [None]:
# Average order delay (minutes) per time slot and period/variant.
real_delay = (
    pp_new
    .pivot_table(
        values="order_delay_mins",
        index=["new_time", "period_variant"],
        aggfunc="mean",
    )
    .reset_index()
)
real_delay


Unnamed: 0,new_time,period_variant,order_delay_mins
0,00:00:00,before_test,0.274807
1,00:00:00,during_test_control,-1.086524
2,00:00:00,during_test_variation,-0.623434
3,00:30:00,before_test,0.958032
4,00:30:00,during_test_control,0.793299
...,...,...,...
139,23:00:00,during_test_control,1.061189
140,23:00:00,during_test_variation,0.606564
141,23:30:00,before_test,-0.541464
142,23:30:00,during_test_control,-0.425826


In [None]:
# Number of orders per rounded delay value, one column per period/variant.
delay_order = pp_new.pivot_table(
    values="platform_order_code",
    index=["real_delay"],
    columns="period_variant",
    aggfunc="count",
).fillna(0)

# Convert each group's counts into a share of that group's total orders.
for share_col, count_col in (
    ("before_pp", "before_test"),
    ("control_pp", "during_test_control"),
    ("variation_pp", "during_test_variation"),
):
    group_total = delay_order[count_col].sum()
    delay_order[share_col] = delay_order[count_col] / group_total

delay_order = delay_order.reset_index()
delay_order.head()

period_variant,real_delay,before_test,during_test_control,during_test_variation,before_pp,control_pp,variation_pp
0,-147.3,1.0,0.0,0.0,6e-06,0.0,0.0
1,-44.4,1.0,0.0,0.0,6e-06,0.0,0.0
2,-42.9,1.0,0.0,0.0,6e-06,0.0,0.0
3,-39.4,0.0,1.0,0.0,0.0,1.6e-05,0.0
4,-38.4,1.0,0.0,0.0,6e-06,0.0,0.0


In [None]:
# Smoothed order-share curves across order delay, one line per period/variant.
# The title and x-axis label previously still read "travel time" (copied from
# the travel-time chart) although the x values are `real_delay`; the title also
# had a typo and an unbalanced parenthesis.
fig = make_subplots(specs=[[{"secondary_y": True}]])

for share_col, trace_name in (
    ("before_pp", "before test"),
    ("control_pp", "during test control"),
    ("variation_pp", "during test variation"),
):
    # Savitzky-Golay smoothing: window of 53 points, cubic polynomial.
    smoothed_share = signal.savgol_filter(delay_order[share_col], 53, 3)
    fig.add_trace(
        go.Scatter(x=delay_order["real_delay"], y=smoothed_share, mode="lines", name=trace_name),
        secondary_y=False,
    )

# Add figure title
fig.update_layout(title_text="Orders across order delay (Before test, Test-Control, Test-Variation)")
# Set x-axis title
fig.update_xaxes(title_text="Order delay (in minutes)")
# Set y-axes titles
fig.update_yaxes(title_text="Share of orders", showgrid=False, secondary_y=False)
fig.update_yaxes(title_text="Delta % difference of Variation1 vs Control", showgrid=False, secondary_y=True)

fig.show()

### 7. Shrinking/closure

In [None]:
# Delivery-area events (action and duration in minutes) per zone, with the
# start time converted to local time, for created_date between `pre_date` and
# `end_date`. The country filter now uses `country_code` from the config cell
# instead of a hard-coded "sg", matching how the zone-delay query is built.
query6 = f"""

SELECT 
datetime(start_at,"{time_zone}") as start_time 
, zone_id
, action
, duration/60 as duration_min
FROM cl.delivery_areas_events
WHERE created_date between "{pre_date}" and "{end_date}"
and zone_id is not null
and country_code = "{country_code}"

    """

# Execute on BigQuery and materialise as a DataFrame.
event = client.query(query6).to_dataframe()

In [None]:
# 1. define local date
event["local_date"] = event.start_time.dt.date

# 2. define period
# Use the configured `start_date` instead of a second hard-coded copy of the
# test start, and build the label column directly as strings rather than
# seeding it with datetimes and overwriting them.
test_start = date.fromisoformat(start_date)
is_before_test = event["local_date"] < test_start
event["period"] = (
    is_before_test
    .map({True: "before_test", False: "during_test"})
    # Rows without a start date stay unlabelled instead of defaulting to "during_test".
    .where(event["local_date"].notna())
)

# 3. define zone
# Zone 3 is the test zone (Far East); every other zone is grouped together.
event["zone_group"] = (event["zone_id"] == 3).map({True: "Far_east", False: "other_zones"})

event.head()

In [None]:
# Closure events: count and total duration per zone group, before vs during test.
close = event.loc[event["action"] == "close"]
close_pv = close.pivot_table(
    values=["action", "duration_min"],
    index=["zone_group"],
    columns="period",
    aggfunc={"action": "count", "duration_min": "sum"},
).reset_index()

# Flatten the two-level (metric, period) column labels into "metric_period".
close_pv.columns = ["_".join(levels).strip() for levels in close_pv.columns.values]

# Relative change of total closure duration, rendered as a percentage string.
close_duration_ratio = close_pv["duration_min_during_test"] / close_pv["duration_min_before_test"]
close_pv["duration_delta (during vs before)"] = (
    (close_duration_ratio - 1).astype(float).map("{:.2%}".format)
)

close_pv = close_pv.rename(
    columns={
        "zone_group_": "zone_group",
        "action_before_test": "close_count_before_test",
        "action_during_test": "close_count_during_test",
    }
).drop(columns=["duration_min_before_test", "duration_min_during_test"])
close_pv

Unnamed: 0,zone_group,close_count_before_test,close_count_during_test,duration_delta (during vs before)
0,Far_east,40,32,-0.73%
1,other_zones,806,1630,-0.20%


In [None]:
# Shrink events: count and total duration per zone group, before vs during test.
shrink = event.loc[event["action"] == "shrink"]
shrink_pv = shrink.pivot_table(
    values=["action", "duration_min"],
    index=["zone_group"],
    columns="period",
    aggfunc={"action": "count", "duration_min": "sum"},
).reset_index()

# Flatten the two-level (metric, period) column labels into "metric_period".
shrink_pv.columns = ["_".join(levels).strip() for levels in shrink_pv.columns.values]

# Relative change of total shrink duration, rendered as a percentage string.
shrink_duration_ratio = shrink_pv["duration_min_during_test"] / shrink_pv["duration_min_before_test"]
shrink_pv["duration_delta (during vs before)"] = (
    (shrink_duration_ratio - 1).astype(float).map("{:.2%}".format)
)

shrink_pv = shrink_pv.rename(
    columns={
        "zone_group_": "zone_group",
        "action_before_test": "shrink_count_before_test",
        "action_during_test": "shrink_count_during_test",
    }
).drop(columns=["duration_min_before_test", "duration_min_during_test"])
shrink_pv

Unnamed: 0,zone_group,shrink_count_before_test,shrink_count_during_test,duration_delta (during vs before)
0,Far_east,1282,510,-32.37%
1,other_zones,13148,15424,9.28%


In [None]:
# Time slot: round each order's local time up to the next 30-minute boundary
# and keep only the time of day.
half_hour = timedelta(minutes=30)
pp["new_time"] = ceil_dt(pp["local_time"].dt.to_pydatetime(), half_hour)
pp["new_time"] = pp["new_time"].dt.time

# Rows are split at the first day of the test.
test_start = date(2020, 9, 4)
in_pre_period = pp["local_date"] < test_start
in_test_period = pp["local_date"] >= test_start

# Combined period/variant label; rows matched by none of the masks keep their
# original `variant` value.
pp["period_variant"] = pp["variant"]
pp.loc[in_pre_period, "period_variant"] = "before_test"
pp.loc[in_test_period & (pp["variant"] == "Control"), "period_variant"] = "during_test_control"
pp.loc[in_test_period & (pp["variant"] == "Variation1"), "period_variant"] = "during_test_variation"

# Period label only (before vs during the test).
pp["period"] = pp["local_date"]
pp.loc[in_pre_period, "period"] = "before_test"
pp.loc[in_test_period, "period"] = "during_test"

# Zone group: Far_east and Bedok keep their own name, all other zones are pooled.
pp["zone_group"] = pp["zone_name"]
pp.loc[~pp["zone_name"].isin(["Bedok", "Far_east"]), "zone_group"] = "other_zones"

# Travel time rounded to 0.1 for bucketing.
pp["tt"] = pp["travel_time"].round(1)