In [429]:
import numpy as np
import pandas as pd
from google.cloud import bigquery
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import stats
import plotly.express as px
import db_dtypes
import bigframes.pandas as bpd
from IPython.display import display, HTML
import plotly.express as px
import seaborn as sns
import matplotlib.pyplot as plt
import math
import statsmodels.api as sm
from datetime import datetime
from typing import Union
import logging
import sys

# Configure logging
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s %(levelname)s:%(message)s',
    handlers=[
        logging.StreamHandler(sys.stdout)
    ]
)


growth_data = """

WITH holdout_entities AS (
  SELECT
        entity_id
       ,`Release Date` as release_date
  FROM `logistics-data-storage-staging.long_term_pricing.global_holdout_rollout`
  WHERE `Release Date` < DATE_SUB(CURRENT_DATE(), INTERVAL EXTRACT(DAYOFWEEK FROM CURRENT_DATE())- 2 DAY)
),
customer_information AS (
  SELECT
         e.entity_id
        ,e.release_date
        ,IF(d.created_date <= e.release_date, FALSE, COALESCE(is_customer_holdout, FALSE)) AS is_customer_holdout
        ,customer_id
  FROM `fulfillment-dwh-production.cl.dps_holdout_users` AS d
  JOIN holdout_entities AS e
        ON d.entity_id = e.entity_id
  WHERE d.created_date >= date('2025-01-01')
    AND created_date < CURRENT_DATE()  -- data is not full/not present for current date
    AND customer_id IS NOT NULL
    AND customer_id NOT IN UNNEST(ARRAY(SELECT id FROM `fulfillment-dwh-production.cl._bad_dps_logs_ids`)) -- get rid of ids like 'null', 'test' etc
    --AND d.entity_id in ('FP_MM','GV_CI', 'PY_UY','GV_BA','PY_NI','TB_IQ','TB_OM','PY_HN','TB_KW','GV_ME','PO_FI','FP_SG','FP_TH','HS_SA','FY_CY','TB_JO','GV_RS','GV_UG','GV_NG','NP_HU')
  GROUP BY 1, 2, 3, 4
)
  SELECT 
     e.entity_id
    ,e.customer_id customer_id
    ,e.is_customer_holdout
    ,COUNT(DISTINCT case when dps.created_date <= e.release_date then dps.platform_order_code end) AS orders_pre
    ,COUNT(DISTINCT case when dps.created_date > e.release_date then dps.platform_order_code end) AS orders_post
    ,SUM(case when dps.created_date <= e.release_date then dps.fully_loaded_gross_profit_eur end) AS flgp_pre
    ,SUM(case when dps.created_date > e.release_date then dps.fully_loaded_gross_profit_eur end) AS flgp_post
  FROM customer_information e
  LEFT JOIN `fulfillment-dwh-production.cl.dps_sessions_mapped_to_orders` dps
    ON dps.entity_id = e.entity_id
    AND dps.dps_customer_id = e.customer_id
  WHERE created_date >= DATE_SUB(release_date, INTERVAL 8 WEEK)
  AND created_date < CURRENT_DATE
  AND platform_order_code IS NOT NULL
  AND is_own_delivery
  AND is_sent
  --AND dps.entity_id in ('FP_MM','GV_CI', 'PY_UY','GV_BA','PY_NI','TB_IQ','TB_OM','PY_HN','TB_KW','GV_ME','PO_FI','FP_SG','FP_TH','HS_SA','FY_CY','TB_JO','GV_RS','GV_UG','GV_NG','NP_HU')
  AND vendor_vertical_parent in ('Restaurant','restaurant','restaurants')
  GROUP BY 1,2,3

  """


mkt_data = """
  
  WITH holdout_entities AS (
  SELECT
        entity_id
       ,`Release Date` as release_date
  FROM `logistics-data-storage-staging.long_term_pricing.global_holdout_rollout`
  WHERE `Release Date` < DATE_SUB(CURRENT_DATE(), INTERVAL EXTRACT(DAYOFWEEK FROM CURRENT_DATE())- 2 DAY)
),
customer_information AS (
  SELECT
         e.entity_id
        ,e.release_date
        ,IF(d.created_date <= e.release_date, FALSE, COALESCE(is_customer_holdout, FALSE)) AS is_customer_holdout
        ,customer_id
  FROM `fulfillment-dwh-production.cl.dps_holdout_users` AS d
  JOIN holdout_entities AS e
        ON d.entity_id = e.entity_id
  WHERE d.created_date >= date('2025-01-01')
    AND created_date < CURRENT_DATE()  -- data is not full/not present for current date
    AND customer_id IS NOT NULL
    AND customer_id NOT IN UNNEST(ARRAY(SELECT id FROM `fulfillment-dwh-production.cl._bad_dps_logs_ids`)) -- get rid of ids like 'null', 'test' etc
  GROUP BY 1, 2, 3, 4
)
  SELECT 
     e.entity_id
    ,e.customer_id customer_id
    ,e.is_customer_holdout
    ,COUNT(DISTINCT case when mkt.order_date <= e.release_date then mkt.order_id end) AS orders_pre
    ,COUNT(DISTINCT case when mkt.order_date > e.release_date then mkt.order_id end) AS orders_post
    ,SUM(case when mkt.order_date <= e.release_date then mkt.analytical_profit end) AS analytical_profit_pre
    ,SUM(case when mkt.order_date > e.release_date then mkt.analytical_profit end) AS analytical_profit_post
    ,SUM(case when mkt.order_date <= e.release_date then mkt.local_analytical_profit end) AS local_analytical_profit_pre
    ,SUM(case when mkt.order_date > e.release_date then mkt.local_analytical_profit end) AS local_analytical_profit_post
    ,SUM(case when mkt.order_date <= e.release_date then mkt.revenue_net end) AS revenue_pre
    ,SUM(case when mkt.order_date > e.release_date then mkt.revenue_net end) AS revenue_post
    ,SUM(case when mkt.order_date <= e.release_date then mkt.cost_of_sales end) AS cost_of_sales_pre
    ,SUM(case when mkt.order_date > e.release_date then mkt.cost_of_sales end) AS cost_of_sales_post
  FROM customer_information e
  LEFT JOIN `fulfillment-dwh-production.cl.dps_sessions_mapped_to_orders` dps
    ON dps.entity_id = e.entity_id
    AND dps.dps_customer_id = e.customer_id 
  LEFT JOIN fulfillment-dwh-production.curated_data_shared_mkt.bima_order_profitability mkt
    ON mkt.global_entity_id = dps.entity_id
    AND mkt.order_id = dps.platform_order_code
  WHERE order_date >= DATE_SUB(release_date, INTERVAL 8 WEEK)
  AND order_date < CURRENT_DATE
  and created_date >= DATE_SUB(release_date, INTERVAL 8 WEEK)
  and created_date < CURRENT_DATE
  AND platform_order_code IS NOT NULL
  AND is_own_delivery
  AND is_sent
  AND e.entity_id is not null
  AND vendor_vertical_parent in ('Restaurant','restaurant','restaurants')
  GROUP BY 1,2,3
  
  """


In [430]:
project_id = "logistics-customer-staging"
logging.info(f"Initializing BigQuery client for project: {project_id}")

try:
    # Initialize BigQuery client
    client = bigquery.Client(project=project_id)
except Exception as e:
    logging.error(f"Failed to initialize BigQuery client: {e}")
    sys.exit(1)

growth_df = client.query(growth_data).to_dataframe()

#mkt_df =  client.query(mkt_data).to_dataframe()

2025-02-11 22:46:45,977 INFO:Initializing BigQuery client for project: logistics-customer-staging






In [432]:
mkt_df['flgp_pre'] = mkt_df['analytical_profit_pre']
mkt_df['flgp_post'] = mkt_df['analytical_profit_post']

# mkt_df['flgp_pre'] = mkt_df['local_analytical_profit_pre']
# mkt_df['flgp_post'] = mkt_df['local_analytical_profit_post']

# mkt_df['flgp_pre'] = mkt_df['revenue_pre'] + mkt_df['cost_of_sales_pre']
# mkt_df['flgp_post'] = mkt_df['revenue_post'] + mkt_df['cost_of_sales_post']


In [433]:
import numpy as np
import pandas as pd

def calculate_sustainable_growth(data):
    
    data = data.copy()

    entity = data['entity_id'].iloc[0]

    # Apply CUPED to FLGP
    data_flgp = data.dropna(subset=['flgp_pre', 'flgp_post'])
    theta_flgp = np.cov(data_flgp['flgp_pre'], data_flgp['flgp_post'])[0, 1] / np.var(data_flgp['flgp_pre'])
    data_flgp['flgp_post_cuped'] = data_flgp['flgp_post'] - theta_flgp * (data_flgp['flgp_pre'] - data_flgp['flgp_pre'].mean())

    # Apply CUPED to Orders
    data_orders = data.dropna(subset=['orders_pre', 'orders_post'])
    theta_orders = np.cov(data_orders['orders_pre'], data_orders['orders_post'])[0, 1] / np.var(data_orders['orders_pre'])
    data_orders['orders_post_cuped'] = data_orders['orders_post'] - theta_orders * (data_orders['orders_pre'] - data_orders['orders_pre'].mean())

    # Total FLGP and Orders (CUPED)
    holdout_total_flgp_cuped = data_flgp.loc[data_flgp['is_customer_holdout'], 'flgp_post_cuped'].sum()
    non_holdout_total_flgp_cuped = data_flgp.loc[~data_flgp['is_customer_holdout'], 'flgp_post_cuped'].sum()

    holdout_total_orders_cuped = data_orders.loc[data_orders['is_customer_holdout'], 'orders_post_cuped'].sum()
    non_holdout_total_orders_cuped = data_orders.loc[~data_orders['is_customer_holdout'], 'orders_post_cuped'].sum()

    # Normalize for Population Differences
    holdout_user_count = data['is_customer_holdout'].sum()
    non_holdout_user_count = (~data['is_customer_holdout']).sum()

    scaled_holdout_total_flgp_cuped = (holdout_total_flgp_cuped / holdout_user_count) * non_holdout_user_count if holdout_user_count != 0 else np.nan
    scaled_holdout_total_orders_cuped = (holdout_total_orders_cuped / holdout_user_count) * non_holdout_user_count if holdout_user_count != 0 else np.nan

    # FLGP per Order and Orders per User (CUPED)
    holdout_flgp_per_order_cuped = holdout_total_flgp_cuped / holdout_total_orders_cuped if holdout_total_orders_cuped != 0 else np.nan
    non_holdout_flgp_per_order_cuped = non_holdout_total_flgp_cuped / non_holdout_total_orders_cuped if non_holdout_total_orders_cuped != 0 else np.nan

    holdout_orders_per_user_cuped = holdout_total_orders_cuped / holdout_user_count if holdout_user_count != 0 else np.nan
    non_holdout_orders_per_user_cuped = non_holdout_total_orders_cuped / non_holdout_user_count if non_holdout_user_count != 0 else np.nan

    # Incremental Differences (CUPED)
    incremental_orders_cuped = non_holdout_total_orders_cuped - scaled_holdout_total_orders_cuped
    incremental_flgp_cuped = non_holdout_total_flgp_cuped - scaled_holdout_total_flgp_cuped

    # Percentage Changes (CUPED)
    percentage_change_orders_cuped = ((incremental_orders_cuped) / abs(scaled_holdout_total_orders_cuped)) * 100 if scaled_holdout_total_orders_cuped != 0 else np.nan
    percentage_change_flgp_cuped = ((incremental_flgp_cuped) / abs(scaled_holdout_total_flgp_cuped)) * 100 if scaled_holdout_total_flgp_cuped != 0 else np.nan

    # Sustainable Growth Calculation
    sustainable_growth = ((incremental_orders_cuped + (incremental_flgp_cuped / non_holdout_flgp_per_order_cuped)) / scaled_holdout_total_orders_cuped) * 100 if scaled_holdout_total_orders_cuped != 0 else np.nan

    return {
        'entity': entity,
        'sustainable_growth': sustainable_growth,
        'incremental_orders_cuped': incremental_orders_cuped,
        'incremental_flgp_cuped': incremental_flgp_cuped,
        'percentage_change_orders_cuped': percentage_change_orders_cuped,
        'percentage_change_flgp_cuped': percentage_change_flgp_cuped,
        'holdout_flgp_per_order_cuped': holdout_flgp_per_order_cuped,
        'non_holdout_flgp_per_order_cuped': non_holdout_flgp_per_order_cuped,
        'holdout_orders_per_user_cuped': holdout_orders_per_user_cuped,
        'non_holdout_orders_per_user_cuped': non_holdout_orders_per_user_cuped,
        'scaled_holdout_total_orders_cuped': scaled_holdout_total_orders_cuped,
        'non_holdout_total_orders_cuped': non_holdout_total_orders_cuped,
        'non_holdout_total_flgp_cuped': non_holdout_total_flgp_cuped,
        'scaled_holdout_total_flgp_cuped': scaled_holdout_total_flgp_cuped
    }


#calculate_sustainable_growth(growth_df)


results = []
for entity in mkt_df['entity_id'].unique():
    entity_data = mkt_df[mkt_df['entity_id'] == entity]
    result = calculate_sustainable_growth(entity_data)
    results.append(result)

pd.DataFrame(results)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data_flgp['flgp_post_cuped'] = data_flgp['flgp_post'] - theta_flgp * (data_flgp['flgp_pre'] - data_flgp['flgp_pre'].mean())
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data_flgp['flgp_post_cuped'] = data_flgp['flgp_post'] - theta_flgp * (data_flgp['flgp_pre'] - data_flgp['flgp_pre'].mean())
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/inde

Unnamed: 0,entity,sustainable_growth,incremental_orders_cuped,incremental_flgp_cuped,percentage_change_orders_cuped,percentage_change_flgp_cuped,holdout_flgp_per_order_cuped,non_holdout_flgp_per_order_cuped,holdout_orders_per_user_cuped,non_holdout_orders_per_user_cuped,scaled_holdout_total_orders_cuped,non_holdout_total_orders_cuped,non_holdout_total_flgp_cuped,scaled_holdout_total_flgp_cuped
0,EF_GR,7.729787,138111.699112,33514.267634,6.11063,1.549559,0.956923,0.915791,2.178339,2.311449,2260188.0,2398299.0,2196340.0,2162826.0
1,FP_TW,1.809193,78427.237009,93392.018397,0.844197,0.966163,1.040486,1.041744,3.269494,3.297095,9290161.0,9368589.0,9759671.0,9666278.0
2,YS_TR,5.855582,143208.367893,708256.32491,1.723559,4.233997,2.01325,2.062935,1.985008,2.019221,8308874.0,8452083.0,17436100.0,16727840.0
3,PY_AR,1.42821,61342.206187,97936.653707,0.787178,0.640097,1.96342,1.960555,2.380435,2.399173,7792675.0,7854017.0,15398230.0,15300290.0
4,MJM_AT,3.401393,-707.678628,118588.565262,-0.075154,3.604571,3.493878,3.62254,2.288688,2.286968,941632.7,940925.0,3408538.0,3289950.0
5,FP_MY,1.814275,-24918.010045,97704.348202,-0.468499,2.347357,0.782583,0.804723,1.9025,1.893587,5318687.0,5293769.0,4260018.0,4162314.0
6,PY_EC,-1.462751,18538.614435,-27035.783598,1.862626,-3.161365,0.859236,0.816858,2.125594,2.165186,995294.7,1013833.0,828157.6,855193.4
7,DJ_CZ,-0.925941,1662.130284,-13051.833049,0.144272,-1.057371,1.071422,1.058566,2.250641,2.253888,1152083.0,1153745.0,1221315.0,1234367.0
8,NP_HU,-0.439964,-306.15843,-3609.561069,-0.027398,-0.410984,0.785951,0.782936,2.397249,2.396592,1117465.0,1117159.0,874663.9,878273.5
9,FP_MM,1.063984,10888.645986,-2532.584785,1.48277,-0.41097,0.839176,0.823517,2.016146,2.046041,734345.1,745233.7,613712.4,616245.0


In [434]:
import numpy as np
import pandas as pd

def calculate_sustainable_growth(data):
    
    data = data.copy()

    entity = data['entity_id'].iloc[0]

    # Apply CUPED to FLGP
    data_flgp = data.dropna(subset=['flgp_pre', 'flgp_post'])
    theta_flgp = np.cov(data_flgp['flgp_pre'], data_flgp['flgp_post'])[0, 1] / np.var(data_flgp['flgp_pre'])
    data_flgp['flgp_post_cuped'] = data_flgp['flgp_post'] - theta_flgp * (data_flgp['flgp_pre'] - data_flgp['flgp_pre'].mean())

    # Apply CUPED to Orders
    data_orders = data.dropna(subset=['orders_pre', 'orders_post'])
    theta_orders = np.cov(data_orders['orders_pre'], data_orders['orders_post'])[0, 1] / np.var(data_orders['orders_pre'])
    data_orders['orders_post_cuped'] = data_orders['orders_post'] - theta_orders * (data_orders['orders_pre'] - data_orders['orders_pre'].mean())

    # Total FLGP and Orders (CUPED)
    holdout_total_flgp_cuped = data_flgp.loc[data_flgp['is_customer_holdout'], 'flgp_post_cuped'].sum()
    non_holdout_total_flgp_cuped = data_flgp.loc[~data_flgp['is_customer_holdout'], 'flgp_post_cuped'].sum()

    holdout_total_orders_cuped = data_orders.loc[data_orders['is_customer_holdout'], 'orders_post_cuped'].sum()
    non_holdout_total_orders_cuped = data_orders.loc[~data_orders['is_customer_holdout'], 'orders_post_cuped'].sum()

    # Normalize for Population Differences
    holdout_user_count = data['is_customer_holdout'].sum()
    non_holdout_user_count = (~data['is_customer_holdout']).sum()

    scaled_holdout_total_flgp_cuped = (holdout_total_flgp_cuped / holdout_user_count) * non_holdout_user_count if holdout_user_count != 0 else np.nan
    scaled_holdout_total_orders_cuped = (holdout_total_orders_cuped / holdout_user_count) * non_holdout_user_count if holdout_user_count != 0 else np.nan

    # FLGP per Order and Orders per User (CUPED)
    holdout_flgp_per_order_cuped = holdout_total_flgp_cuped / holdout_total_orders_cuped if holdout_total_orders_cuped != 0 else np.nan
    non_holdout_flgp_per_order_cuped = non_holdout_total_flgp_cuped / non_holdout_total_orders_cuped if non_holdout_total_orders_cuped != 0 else np.nan

    holdout_orders_per_user_cuped = holdout_total_orders_cuped / holdout_user_count if holdout_user_count != 0 else np.nan
    non_holdout_orders_per_user_cuped = non_holdout_total_orders_cuped / non_holdout_user_count if non_holdout_user_count != 0 else np.nan

    # Incremental Differences (CUPED)
    incremental_orders_cuped = non_holdout_total_orders_cuped - scaled_holdout_total_orders_cuped
    incremental_flgp_cuped = non_holdout_total_flgp_cuped - scaled_holdout_total_flgp_cuped

    # Percentage Changes (CUPED)
    percentage_change_orders_cuped = ((incremental_orders_cuped) / abs(scaled_holdout_total_orders_cuped)) * 100 if scaled_holdout_total_orders_cuped != 0 else np.nan
    percentage_change_flgp_cuped = ((incremental_flgp_cuped) / abs(scaled_holdout_total_flgp_cuped)) * 100 if scaled_holdout_total_flgp_cuped != 0 else np.nan

    # Sustainable Growth Calculation
    sustainable_growth = ((incremental_orders_cuped + (incremental_flgp_cuped / non_holdout_flgp_per_order_cuped)) / scaled_holdout_total_orders_cuped) * 100 if scaled_holdout_total_orders_cuped != 0 else np.nan

    return {
        'entity': entity,
        'sustainable_growth': sustainable_growth,
        'incremental_orders_cuped': incremental_orders_cuped,
        'incremental_flgp_cuped': incremental_flgp_cuped,
        'percentage_change_orders_cuped': percentage_change_orders_cuped,
        'percentage_change_flgp_cuped': percentage_change_flgp_cuped,
        'holdout_flgp_per_order_cuped': holdout_flgp_per_order_cuped,
        'non_holdout_flgp_per_order_cuped': non_holdout_flgp_per_order_cuped,
        'holdout_orders_per_user_cuped': holdout_orders_per_user_cuped,
        'non_holdout_orders_per_user_cuped': non_holdout_orders_per_user_cuped,
        'scaled_holdout_total_orders_cuped': scaled_holdout_total_orders_cuped,
        'non_holdout_total_orders_cuped': non_holdout_total_orders_cuped,
        'non_holdout_total_flgp_cuped': non_holdout_total_flgp_cuped,
        'scaled_holdout_total_flgp_cuped': scaled_holdout_total_flgp_cuped
    }


#calculate_sustainable_growth(growth_df)


results = []
for entity in growth_df['entity_id'].unique():
    entity_data = growth_df[growth_df['entity_id'] == entity]
    result = calculate_sustainable_growth(entity_data)
    results.append(result)

pd.DataFrame(results)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data_flgp['flgp_post_cuped'] = data_flgp['flgp_post'] - theta_flgp * (data_flgp['flgp_pre'] - data_flgp['flgp_pre'].mean())
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data_flgp['flgp_post_cuped'] = data_flgp['flgp_post'] - theta_flgp * (data_flgp['flgp_pre'] - data_flgp['flgp_pre'].mean())
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/inde

Unnamed: 0,entity,sustainable_growth,incremental_orders_cuped,incremental_flgp_cuped,percentage_change_orders_cuped,percentage_change_flgp_cuped,holdout_flgp_per_order_cuped,non_holdout_flgp_per_order_cuped,holdout_orders_per_user_cuped,non_holdout_orders_per_user_cuped,scaled_holdout_total_orders_cuped,non_holdout_total_orders_cuped,non_holdout_total_flgp_cuped,scaled_holdout_total_flgp_cuped
0,TB_KW,5.20573,62630.521326,278085.989648,1.369434,3.933325,1.545875,1.584974,4.942683,5.010369,4573459.0,4636090.0,7348083.0,7069997.0
1,NP_HU,-6.087691,-451.545453,-22097.966183,-0.040403,-5.704617,0.346611,0.32697,2.397477,2.396508,1117593.0,1117142.0,365271.9,387369.8
2,PY_AR,1.959808,61035.332413,131408.977231,0.774959,1.189726,1.402412,1.408184,2.403514,2.422141,7875942.0,7936977.0,11176730.0,11045320.0
3,AP_PA,-1.117159,18120.753392,-42890.356423,1.96536,-2.934394,1.585284,1.509106,2.538482,2.588372,922007.0,940127.8,1418752.0,1461643.0
4,EF_GR,7.729787,138111.699112,33514.267634,6.11063,1.549559,0.956923,0.915791,2.178339,2.311449,2260188.0,2398299.0,2196340.0,2162826.0
5,FP_TH,3.69653,12711.235313,21956.00623,0.858698,2.895131,0.512316,0.52266,2.086541,2.104458,1480292.0,1493003.0,780333.0,758377.0
6,HF_EG,10.187228,306731.794673,174457.213054,4.395453,5.873791,0.425613,0.43164,3.575357,3.73251,6978388.0,7285120.0,3144553.0,2970096.0
7,PY_PE,-1.055107,22710.521546,-13998.599588,1.404214,-2.367838,0.365543,0.351946,2.114728,2.144424,1617312.0,1640023.0,577198.9,591197.5
8,TB_AE,2.196914,111908.709401,222609.221847,1.202605,0.992242,2.41093,2.405919,5.003414,5.063585,9305524.0,9417433.0,22657580.0,22434970.0
9,TB_JO,-0.896654,-36440.282263,3775.976886,-2.378375,1.541213,0.159906,0.166327,2.34172,2.286025,1532150.0,1495710.0,248776.2,245000.3


In [377]:
import numpy as np
import pandas as pd
from scipy.stats import ttest_ind

def calculate_sustainable_growth(data):
    
    data = data.copy()

    # Apply CUPED to FLGP
    data_flgp = data.dropna(subset=['flgp_pre', 'flgp_post'])
    theta_flgp = np.cov(data_flgp['flgp_pre'], data_flgp['flgp_post'])[0, 1] / np.var(data_flgp['flgp_pre'])
    data_flgp['flgp_post_cuped'] = data_flgp['flgp_post'] - theta_flgp * (data_flgp['flgp_pre'] - data_flgp['flgp_pre'].mean())

    # Apply CUPED to Orders
    data_orders = data.dropna(subset=['orders_pre', 'orders_post'])
    theta_orders = np.cov(data_orders['orders_pre'], data_orders['orders_post'])[0, 1] / np.var(data_orders['orders_pre'])
    data_orders['orders_post_cuped'] = data_orders['orders_post'] - theta_orders * (data_orders['orders_pre'] - data_orders['orders_pre'].mean())

    # T-tests for significance
    t_stat_orders, p_value_orders = ttest_ind(
        data_orders.loc[data_orders['is_customer_holdout'], 'orders_post_cuped'],
        data_orders.loc[~data_orders['is_customer_holdout'], 'orders_post_cuped'],
        equal_var=False
    )

    t_stat_flgp, p_value_flgp = ttest_ind(
        data_flgp.loc[data_flgp['is_customer_holdout'], 'flgp_post_cuped'],
        data_flgp.loc[~data_flgp['is_customer_holdout'], 'flgp_post_cuped'],
        equal_var=False
    )

    # Total FLGP and Orders (CUPED)
    holdout_total_flgp_cuped = data_flgp.loc[data_flgp['is_customer_holdout'], 'flgp_post_cuped'].sum()
    non_holdout_total_flgp_cuped = data_flgp.loc[~data_flgp['is_customer_holdout'], 'flgp_post_cuped'].sum()

    holdout_total_orders_cuped = data_orders.loc[data_orders['is_customer_holdout'], 'orders_post_cuped'].sum()
    non_holdout_total_orders_cuped = data_orders.loc[~data_orders['is_customer_holdout'], 'orders_post_cuped'].sum()

    # Normalize for Population Differences
    holdout_user_count = data['is_customer_holdout'].sum()
    non_holdout_user_count = (~data['is_customer_holdout']).sum()

    scaled_holdout_total_flgp_cuped = (holdout_total_flgp_cuped / holdout_user_count) * non_holdout_user_count if holdout_user_count != 0 else np.nan
    scaled_holdout_total_orders_cuped = (holdout_total_orders_cuped / holdout_user_count) * non_holdout_user_count if holdout_user_count != 0 else np.nan

    # FLGP per Order and Orders per User (CUPED)
    holdout_flgp_per_order_cuped = holdout_total_flgp_cuped / holdout_total_orders_cuped if holdout_total_orders_cuped != 0 else np.nan
    non_holdout_flgp_per_order_cuped = non_holdout_total_flgp_cuped / non_holdout_total_orders_cuped if non_holdout_total_orders_cuped != 0 else np.nan

    holdout_orders_per_user_cuped = holdout_total_orders_cuped / holdout_user_count if holdout_user_count != 0 else np.nan
    non_holdout_orders_per_user_cuped = non_holdout_total_orders_cuped / non_holdout_user_count if non_holdout_user_count != 0 else np.nan

    # Incremental Differences (CUPED)
    incremental_orders_cuped = non_holdout_total_orders_cuped - scaled_holdout_total_orders_cuped
    incremental_flgp_cuped = non_holdout_total_flgp_cuped - scaled_holdout_total_flgp_cuped

    # Percentage Changes (CUPED)
    percentage_change_orders_cuped = ((incremental_orders_cuped) / abs(scaled_holdout_total_orders_cuped)) * 100 if scaled_holdout_total_orders_cuped != 0 else np.nan
    percentage_change_flgp_cuped = ((incremental_flgp_cuped) / abs(scaled_holdout_total_flgp_cuped)) * 100 if scaled_holdout_total_flgp_cuped != 0 else np.nan

    # Sustainable Growth Calculation with significance filter
    if p_value_orders < 0.05 and p_value_flgp < 0.05:
        sustainable_growth = ((incremental_orders_cuped + (incremental_flgp_cuped / non_holdout_flgp_per_order_cuped)) / scaled_holdout_total_orders_cuped) * 100 if scaled_holdout_total_orders_cuped != 0 else np.nan
    elif p_value_orders < 0.05:
        sustainable_growth = (incremental_orders_cuped / scaled_holdout_total_orders_cuped) * 100 if scaled_holdout_total_orders_cuped != 0 else np.nan
    elif p_value_flgp < 0.05:
        sustainable_growth = ((incremental_flgp_cuped / non_holdout_flgp_per_order_cuped) / scaled_holdout_total_orders_cuped) * 100 if scaled_holdout_total_orders_cuped != 0 else np.nan
    else:
        sustainable_growth = 0

    return {
        'sustainable_growth': sustainable_growth,
        'incremental_orders_cuped': incremental_orders_cuped if p_value_orders < 0.05 else 0,
        'incremental_flgp_cuped': incremental_flgp_cuped if p_value_flgp < 0.05 else 0,
        'percentage_change_orders_cuped': percentage_change_orders_cuped if p_value_orders < 0.05 else 0,
        'percentage_change_flgp_cuped': percentage_change_flgp_cuped if p_value_flgp < 0.05 else 0,
        'holdout_flgp_per_order_cuped': holdout_flgp_per_order_cuped,
        'non_holdout_flgp_per_order_cuped': non_holdout_flgp_per_order_cuped,
        'holdout_orders_per_user_cuped': holdout_orders_per_user_cuped,
        'non_holdout_orders_per_user_cuped': non_holdout_orders_per_user_cuped,
        'scaled_holdout_total_orders_cuped': scaled_holdout_total_orders_cuped,
        'non_holdout_total_orders_cuped': non_holdout_total_orders_cuped,
        'non_holdout_total_flgp_cuped': non_holdout_total_flgp_cuped,
        'scaled_holdout_total_flgp_cuped': scaled_holdout_total_flgp_cuped,
        't_stat_orders': t_stat_orders,
        'p_value_orders': p_value_orders,
        't_stat_flgp': t_stat_flgp,
        'p_value_flgp': p_value_flgp
    }


results = []
for entity in growth_df['entity_id'].unique():
    entity_data = growth_df[growth_df['entity_id'] == entity]
    result = calculate_sustainable_growth(entity_data)
    result['entity_id'] = entity
    results.append(result)

pd.DataFrame(results)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data_flgp['flgp_post_cuped'] = data_flgp['flgp_post'] - theta_flgp * (data_flgp['flgp_pre'] - data_flgp['flgp_pre'].mean())
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data_flgp['flgp_post_cuped'] = data_flgp['flgp_post'] - theta_flgp * (data_flgp['flgp_pre'] - data_flgp['flgp_pre'].mean())
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/inde

Unnamed: 0,sustainable_growth,incremental_orders_cuped,incremental_flgp_cuped,percentage_change_orders_cuped,percentage_change_flgp_cuped,holdout_flgp_per_order_cuped,non_holdout_flgp_per_order_cuped,holdout_orders_per_user_cuped,non_holdout_orders_per_user_cuped,scaled_holdout_total_orders_cuped,non_holdout_total_orders_cuped,non_holdout_total_flgp_cuped,scaled_holdout_total_flgp_cuped,t_stat_orders,p_value_orders,t_stat_flgp,p_value_flgp,entity_id
0,5.20573,62630.521326,278085.989648,1.369434,3.933325,1.545875,1.584974,4.942683,5.010369,4573459.0,4636090.0,7348083.0,7069997.0,-2.40529,0.016169,-3.102915,0.001920087,TB_KW
1,0.0,0.0,0.0,0.0,0.0,0.512316,0.52266,2.086541,2.104458,1480292.0,1493003.0,780333.0,758377.0,-0.811431,0.417131,-0.841388,0.4001645,FP_TH
2,0.0,0.0,0.0,0.0,0.0,0.346611,0.32697,2.397477,2.396508,1117593.0,1117142.0,365271.9,387369.8,0.039099,0.968813,0.653789,0.5132763,NP_HU
3,2.093355,106787.858908,0.0,2.093355,0.0,1.157772,1.116935,1.547378,1.57977,5101277.0,5208065.0,5817068.0,5906114.0,-4.366475,1.3e-05,1.429839,0.1527738,HS_SA
4,152.711307,-50934.931689,567724.803717,-2.303958,270.442022,-0.094956,0.165662,3.900626,3.810757,2210758.0,2159823.0,357800.0,-209924.8,2.495507,0.012591,-8.735763,3.025296e-18,FP_SG
5,0.0,0.0,0.0,0.0,0.0,1.322854,1.256943,3.030671,2.996799,483970.9,478561.9,601524.9,640222.8,0.645188,0.518851,0.96297,0.3356803,FY_CY
6,0.0,0.0,0.0,0.0,0.0,0.696914,0.696624,2.695149,2.735235,1036619.0,1052037.0,732873.7,722434.4,-1.074723,0.282531,-0.711069,0.4770838,FP_MM
7,-2.378375,-36440.282263,0.0,-2.378375,0.0,0.159906,0.166327,2.34172,2.286025,1532150.0,1495710.0,248776.2,245000.3,2.561887,0.010421,-0.277358,0.7815132,TB_JO
8,0.0,0.0,0.0,0.0,0.0,0.630914,0.649315,3.748408,3.74849,1781791.0,1781830.0,1156968.0,1124157.0,-0.002414,0.998074,-0.746264,0.4555359,TB_OM
9,2.215725,43126.673303,0.0,2.215725,0.0,0.648727,0.657167,2.909673,2.974143,1946391.0,1989518.0,1307445.0,1262677.0,-2.534445,0.011273,-1.556027,0.1197412,TB_IQ


In [374]:
import numpy as np
import pandas as pd
from scipy.stats import ttest_ind

def calculate_sustainable_growth(data):
    
    data = data.copy()

    # Apply CUPED to FLGP
    data_flgp = data.dropna(subset=['flgp_pre', 'flgp_post'])
    theta_flgp = np.cov(data_flgp['flgp_pre'], data_flgp['flgp_post'])[0, 1] / np.var(data_flgp['flgp_pre'])
    data_flgp['flgp_post_cuped'] = data_flgp['flgp_post'] - theta_flgp * (data_flgp['flgp_pre'] - data_flgp['flgp_pre'].mean())

    # Apply CUPED to Orders
    data_orders = data.dropna(subset=['orders_pre', 'orders_post'])
    theta_orders = np.cov(data_orders['orders_pre'], data_orders['orders_post'])[0, 1] / np.var(data_orders['orders_pre'])
    data_orders['orders_post_cuped'] = data_orders['orders_post'] - theta_orders * (data_orders['orders_pre'] - data_orders['orders_pre'].mean())

    # Total FLGP and Orders (CUPED)
    holdout_total_flgp_cuped = data_flgp.loc[data_flgp['is_customer_holdout'], 'flgp_post_cuped'].sum()
    non_holdout_total_flgp_cuped = data_flgp.loc[~data_flgp['is_customer_holdout'], 'flgp_post_cuped'].sum()

    holdout_total_orders_cuped = data_orders.loc[data_orders['is_customer_holdout'], 'orders_post_cuped'].sum()
    non_holdout_total_orders_cuped = data_orders.loc[~data_orders['is_customer_holdout'], 'orders_post_cuped'].sum()

    # Normalize for Population Differences
    holdout_user_count = data['is_customer_holdout'].sum()
    non_holdout_user_count = (~data['is_customer_holdout']).sum()

    scaled_holdout_total_flgp_cuped = (holdout_total_flgp_cuped / holdout_user_count) * non_holdout_user_count if holdout_user_count != 0 else np.nan
    scaled_holdout_total_orders_cuped = (holdout_total_orders_cuped / holdout_user_count) * non_holdout_user_count if holdout_user_count != 0 else np.nan

    # FLGP per Order and Orders per User (CUPED)
    holdout_flgp_per_order_cuped = holdout_total_flgp_cuped / holdout_total_orders_cuped if holdout_total_orders_cuped != 0 else np.nan
    non_holdout_flgp_per_order_cuped = non_holdout_total_flgp_cuped / non_holdout_total_orders_cuped if non_holdout_total_orders_cuped != 0 else np.nan

    holdout_orders_per_user_cuped = holdout_total_orders_cuped / holdout_user_count if holdout_user_count != 0 else np.nan
    non_holdout_orders_per_user_cuped = non_holdout_total_orders_cuped / non_holdout_user_count if non_holdout_user_count != 0 else np.nan

    # Incremental Differences (CUPED)
    incremental_orders_cuped = non_holdout_total_orders_cuped - scaled_holdout_total_orders_cuped
    incremental_flgp_cuped = non_holdout_total_flgp_cuped - scaled_holdout_total_flgp_cuped

    # Percentage Changes (CUPED)
    percentage_change_orders_cuped = ((incremental_orders_cuped) / abs(scaled_holdout_total_orders_cuped)) * 100 if scaled_holdout_total_orders_cuped != 0 else np.nan
    percentage_change_flgp_cuped = ((incremental_flgp_cuped) / abs(scaled_holdout_total_flgp_cuped)) * 100 if scaled_holdout_total_flgp_cuped != 0 else np.nan

    # Sustainable Growth Calculation
    sustainable_growth = ((incremental_orders_cuped + (incremental_flgp_cuped / non_holdout_flgp_per_order_cuped)) / scaled_holdout_total_orders_cuped) * 100 if scaled_holdout_total_orders_cuped != 0 else np.nan

    # T-tests for significance
    t_stat_orders, p_value_orders = ttest_ind(
        data_orders.loc[data_orders['is_customer_holdout'], 'orders_post_cuped'],
        data_orders.loc[~data_orders['is_customer_holdout'], 'orders_post_cuped'],
        equal_var=False
    )

    t_stat_flgp, p_value_flgp = ttest_ind(
        data_flgp.loc[data_flgp['is_customer_holdout'], 'flgp_post_cuped'],
        data_flgp.loc[~data_flgp['is_customer_holdout'], 'flgp_post_cuped'],
        equal_var=False
    )

    return {
        'sustainable_growth': sustainable_growth,
        'incremental_orders_cuped': incremental_orders_cuped,
        'incremental_flgp_cuped': incremental_flgp_cuped,
        'percentage_change_orders_cuped': percentage_change_orders_cuped,
        'percentage_change_flgp_cuped': percentage_change_flgp_cuped,
        'holdout_flgp_per_order_cuped': holdout_flgp_per_order_cuped,
        'non_holdout_flgp_per_order_cuped': non_holdout_flgp_per_order_cuped,
        'holdout_orders_per_user_cuped': holdout_orders_per_user_cuped,
        'non_holdout_orders_per_user_cuped': non_holdout_orders_per_user_cuped,
        'scaled_holdout_total_orders_cuped': scaled_holdout_total_orders_cuped,
        'non_holdout_total_orders_cuped': non_holdout_total_orders_cuped,
        'non_holdout_total_flgp_cuped': non_holdout_total_flgp_cuped,
        'scaled_holdout_total_flgp_cuped': scaled_holdout_total_flgp_cuped,
        't_stat_orders': t_stat_orders,
        'p_value_orders': p_value_orders,
        't_stat_flgp': t_stat_flgp,
        'p_value_flgp': p_value_flgp
    }


results = []
for entity in growth_df['entity_id'].unique():
    entity_data = growth_df[growth_df['entity_id'] == entity]
    result = calculate_sustainable_growth(entity_data)
    result['entity_id'] = entity
    results.append(result)

pd.DataFrame(results)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data_flgp['flgp_post_cuped'] = data_flgp['flgp_post'] - theta_flgp * (data_flgp['flgp_pre'] - data_flgp['flgp_pre'].mean())
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data_flgp['flgp_post_cuped'] = data_flgp['flgp_post'] - theta_flgp * (data_flgp['flgp_pre'] - data_flgp['flgp_pre'].mean())
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/inde

Unnamed: 0,sustainable_growth,incremental_orders_cuped,incremental_flgp_cuped,percentage_change_orders_cuped,percentage_change_flgp_cuped,holdout_flgp_per_order_cuped,non_holdout_flgp_per_order_cuped,holdout_orders_per_user_cuped,non_holdout_orders_per_user_cuped,scaled_holdout_total_orders_cuped,non_holdout_total_orders_cuped,non_holdout_total_flgp_cuped,scaled_holdout_total_flgp_cuped,t_stat_orders,p_value_orders,t_stat_flgp,p_value_flgp,entity_id
0,5.20573,62630.521326,278085.989648,1.369434,3.933325,1.545875,1.584974,4.942683,5.010369,4573459.0,4636090.0,7348083.0,7069997.0,-2.40529,0.016169,-3.102915,0.001920087,TB_KW
1,3.69653,12711.235313,21956.00623,0.858698,2.895131,0.512316,0.52266,2.086541,2.104458,1480292.0,1493003.0,780333.0,758377.0,-0.811431,0.417131,-0.841388,0.4001645,FP_TH
2,-6.087691,-451.545453,-22097.966183,-0.040403,-5.704617,0.346611,0.32697,2.397477,2.396508,1117593.0,1117142.0,365271.9,387369.8,0.039099,0.968813,0.653789,0.5132763,NP_HU
3,0.530547,106787.858908,-89045.569885,2.093355,-1.507685,1.157772,1.116935,1.547378,1.57977,5101277.0,5208065.0,5817068.0,5906114.0,-4.366475,1.3e-05,1.429839,0.1527738,HS_SA
4,152.711307,-50934.931689,567724.803717,-2.303958,270.442022,-0.094956,0.165662,3.900626,3.810757,2210758.0,2159823.0,357800.0,-209924.8,2.495507,0.012591,-8.735763,3.025296e-18,FP_SG
5,-7.479041,-5409.04049,-38697.924111,-1.117637,-6.044446,1.322854,1.256943,3.030671,2.996799,483970.9,478561.9,601524.9,640222.8,0.645188,0.518851,0.96297,0.3356803,FY_CY
6,2.932932,15417.786582,10439.286924,1.487315,1.445015,0.696914,0.696624,2.695149,2.735235,1036619.0,1052037.0,732873.7,722434.4,-1.074723,0.282531,-0.711069,0.4770838,FP_MM
7,-0.896654,-36440.282263,3775.976886,-2.378375,1.541213,0.159906,0.166327,2.34172,2.286025,1532150.0,1495710.0,248776.2,245000.3,2.561887,0.010421,-0.277358,0.7815132,TB_JO
8,2.838187,39.229546,32810.726563,0.002202,2.918695,0.630914,0.649315,3.748408,3.74849,1781791.0,1781830.0,1156968.0,1124157.0,-0.002414,0.998074,-0.746264,0.4555359,TB_OM
9,5.715644,43126.673303,44767.575477,2.215725,3.545449,0.648727,0.657167,2.909673,2.974143,1946391.0,1989518.0,1307445.0,1262677.0,-2.534445,0.011273,-1.556027,0.1197412,TB_IQ
