In [70]:
import numpy as np
import pandas as pd
from google.cloud import bigquery
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import stats
from scipy.stats import ttest_ind
import plotly.express as px
import db_dtypes
import bigframes.pandas as bpd
from IPython.display import display, HTML
import plotly.express as px
import seaborn as sns
import matplotlib.pyplot as plt
import math
import statsmodels.api as sm
from datetime import datetime
from typing import Union
import logging
import sys

# Configure logging
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s %(levelname)s:%(message)s',
    handlers=[
        logging.StreamHandler(sys.stdout)
    ]
)


mkt_data = """

WITH holdout_entities AS (
  SELECT
         entity_id,
        `Release Date` AS release_date
  FROM `logistics-data-storage-staging.long_term_pricing.global_holdout_rollout`
  WHERE `Release Date` < DATE_TRUNC(CURRENT_DATE(), WEEK)
  AND entity_id in ('PY_DO','PY_BO', 'FP_TW', 'PY_PY', 'DJ_CZ', 'PY_EC','MJM_AT' ,'PY_PE', 'PY_AR' ,'PY_GT','PY_SV' ,'FP_PH','PY_NI' ,'NP_HU' ,'FP_MM','EF_GR' ,'AP_PA'
  ,'YS_TR', 'PY_UY' ,'OP_SE' ,'PY_CL' ,'FP_BD' ,'FP_SG' ,'FO_NO' ,'PY_CR', 'FP_LA' ,'PO_FI','PY_HN', 'FP_MY' ,'FP_TH', 'FY_CY', 'PY_VE')
),
orders as (
  SELECT
      dps.entity_id entity_id
      ,dps.dps_customer_id customer_id
      ,COUNT(case when mkt.order_date <= e.release_date then mkt.order_id end) AS orders_pre
      ,COUNT(case when mkt.order_date > e.release_date then mkt.order_id end) AS orders_post
      ,SUM(case when mkt.order_date <= e.release_date then mkt.analytical_profit end) AS analytical_profit_pre
      ,SUM(case when mkt.order_date > e.release_date then mkt.analytical_profit end) AS analytical_profit_post
  FROM `fulfillment-dwh-production.cl.dps_sessions_mapped_to_orders` dps
  JOIN holdout_entities AS e
    ON dps.entity_id = e.entity_id
  left join fulfillment-dwh-production.curated_data_shared_mkt.bima_order_profitability mkt
    ON mkt.global_entity_id = dps.entity_id
    AND mkt.order_id = dps.platform_order_code
    AND order_date >= DATE_SUB(release_date, INTERVAL 8 WEEK)
    AND order_date < CURRENT_DATE
    AND global_entity_id in ('PY_DO','PY_BO', 'FP_TW', 'PY_PY', 'DJ_CZ', 'PY_EC','MJM_AT' ,'PY_PE', 'PY_AR' ,'PY_GT','PY_SV' ,'FP_PH','PY_NI' ,'NP_HU' ,'FP_MM','EF_GR' ,'AP_PA','YS_TR', 'PY_UY' ,'OP_SE' ,'PY_CL' ,'FP_BD' ,'FP_SG' ,'FO_NO' ,'PY_CR', 'FP_LA' ,'PO_FI','PY_HN', 'FP_MY' ,'FP_TH', 'FY_CY', 'PY_VE')
  WHERE dps.created_date BETWEEN DATE_SUB(e.release_date, INTERVAL 8 WEEK) AND CURRENT_DATE
    AND dps.platform_order_code IS NOT NULL
    AND dps.is_own_delivery
    AND dps.is_sent
    AND vendor_vertical_parent IN ('Restaurant','restaurant','restaurants')
    AND dps.entity_id in ('PY_DO','PY_BO', 'FP_TW', 'PY_PY', 'DJ_CZ', 'PY_EC','MJM_AT' ,'PY_PE', 'PY_AR' ,'PY_GT','PY_SV' ,'FP_PH','PY_NI' ,'NP_HU' ,'FP_MM','EF_GR' ,'AP_PA','YS_TR', 'PY_UY' ,'OP_SE' ,'PY_CL' ,'FP_BD' ,'FP_SG' ,'FO_NO' ,'PY_CR', 'FP_LA' ,'PO_FI','PY_HN', 'FP_MY' ,'FP_TH', 'FY_CY', 'PY_VE')
  GROUP BY 1, 2
), 
customer_information AS (
  SELECT
         e.entity_id,
         e.release_date,
         CASE WHEN d.created_date <= e.release_date THEN FALSE ELSE COALESCE(is_customer_holdout, FALSE) END AS is_customer_holdout,
         d.customer_id
  FROM `fulfillment-dwh-production.cl.dps_holdout_users` AS d
  JOIN holdout_entities AS e
        ON d.entity_id = e.entity_id
  LEFT JOIN `fulfillment-dwh-production.cl._bad_dps_logs_ids` bad_ids
    ON d.customer_id = bad_ids.id
  WHERE d.created_date BETWEEN '2025-01-01' AND CURRENT_DATE()
    AND d.customer_id IS NOT NULL
    AND bad_ids.id IS NULL
  GROUP BY 1, 2, 3, 4
)
SELECT 
   e.entity_id,
   e.customer_id,
   e.is_customer_holdout,
   o.orders_pre,
   o.orders_post,
   o.analytical_profit_pre,
   o.analytical_profit_post
FROM customer_information e
LEFT JOIN orders o
  ON o.customer_id = e.customer_id
  AND o.entity_id = e.entity_id

"""

dps_data = """

WITH holdout_entities AS (
  SELECT
         entity_id,
        `Release Date` AS release_date
  FROM `logistics-data-storage-staging.long_term_pricing.global_holdout_rollout`
  WHERE `Release Date` < DATE_TRUNC(CURRENT_DATE(), WEEK)
  AND entity_id not in ('PY_DO','PY_BO', 'FP_TW', 'PY_PY', 'DJ_CZ', 'PY_EC','MJM_AT' ,'PY_PE', 'PY_AR' ,'PY_GT','PY_SV' ,'FP_PH','PY_NI' ,'NP_HU' ,'FP_MM','EF_GR' ,'AP_PA'
  ,'YS_TR', 'PY_UY' ,'OP_SE' ,'PY_CL' ,'FP_BD' ,'FP_SG' ,'FO_NO' ,'PY_CR', 'FP_LA' ,'PO_FI','PY_HN', 'FP_MY' ,'FP_TH', 'FY_CY', 'PY_VE')
),
orders as (
  SELECT
      dps.entity_id entity_id
      ,dps.dps_customer_id customer_id
      ,COUNT(case when dps.created_date <= e.release_date then dps.platform_order_code end) AS orders_pre
      ,COUNT(case when dps.created_date > e.release_date then dps.platform_order_code end) AS orders_post
      ,SUM(case when dps.created_date <= e.release_date then dps.fully_loaded_gross_profit_eur end) AS analytical_profit_pre
      ,SUM(case when dps.created_date > e.release_date then dps.fully_loaded_gross_profit_eur end) AS analytical_profit_post
  FROM `fulfillment-dwh-production.cl.dps_sessions_mapped_to_orders` dps
  JOIN holdout_entities AS e
    ON dps.entity_id = e.entity_id
  WHERE dps.created_date BETWEEN DATE_SUB(e.release_date, INTERVAL 8 WEEK) AND CURRENT_DATE
    AND dps.platform_order_code IS NOT NULL
    AND dps.is_own_delivery
    AND dps.is_sent
    AND vendor_vertical_parent IN ('Restaurant','restaurant','restaurants')
    AND dps.entity_id not in ('PY_DO','PY_BO', 'FP_TW', 'PY_PY', 'DJ_CZ', 'PY_EC','MJM_AT' ,'PY_PE', 'PY_AR' ,'PY_GT','PY_SV' ,'FP_PH','PY_NI' ,'NP_HU' ,'FP_MM','EF_GR' ,'AP_PA','YS_TR', 'PY_UY' ,'OP_SE' ,'PY_CL' ,'FP_BD' ,'FP_SG' ,'FO_NO' ,'PY_CR', 'FP_LA' ,'PO_FI','PY_HN', 'FP_MY' ,'FP_TH', 'FY_CY', 'PY_VE')
  GROUP BY 1, 2
), 
customer_information AS (
  SELECT
         e.entity_id,
         e.release_date,
         CASE WHEN d.created_date <= e.release_date THEN FALSE ELSE COALESCE(is_customer_holdout, FALSE) END AS is_customer_holdout,
         d.customer_id
  FROM `fulfillment-dwh-production.cl.dps_holdout_users` AS d
  JOIN holdout_entities AS e
        ON d.entity_id = e.entity_id
  LEFT JOIN `fulfillment-dwh-production.cl._bad_dps_logs_ids` bad_ids
    ON d.customer_id = bad_ids.id
  WHERE d.created_date BETWEEN '2025-01-01' AND CURRENT_DATE()
    AND d.customer_id IS NOT NULL
    AND bad_ids.id IS NULL
  GROUP BY 1, 2, 3, 4
)
SELECT 
   e.entity_id,
   e.customer_id,
   e.is_customer_holdout,
   o.orders_pre,
   o.orders_post,
   o.analytical_profit_pre,
   o.analytical_profit_post
FROM customer_information e
LEFT JOIN orders o
  ON o.customer_id = e.customer_id
  AND o.entity_id = e.entity_id


"""

def combined_data(mkt, dps):

  mkt_df = client.query(mkt).to_dataframe()
  dps_df = client.query(dps).to_dataframe()

  # Append DataFrames
  combined = pd.concat([mkt_df, dps_df], ignore_index=True)

  return combined
  

In [71]:
project_id = "logistics-customer-staging"
logging.info(f"Initializing BigQuery client for project: {project_id}")

try:
    # Initialize BigQuery client
    client = bigquery.Client(project=project_id)
except Exception as e:
    logging.error(f"Failed to initialize BigQuery client: {e}")
    sys.exit(1)
    
combined_df = combined_data(mkt_data, dps_data)

2025-02-17 13:44:09,916 INFO:Initializing BigQuery client for project: logistics-customer-staging






In [None]:
client = bigquery.Client()
    
# Configure the load job
job_config = bigquery.LoadJobConfig(
    write_disposition=bigquery.WriteDisposition.WRITE_TRUNCATE,  
    # ^ Overwrite table data each run; can also use WRITE_APPEND if you want to append
)

load_job = client.load_table_from_dataframe(
    df,
    table_id,
    job_config=job_config
)
load_job.result()  # Wait for the job to complete.

print(f"Loaded {df.shape[0]} rows into {table_id}")

In [72]:
def calculate_sustainable_growth(data, pre_profit, post_profit):
    
    data = data.copy()
    entity = data['entity_id'].iloc[0]


    # Apply CUPED to FLGP
    data_flgp = data.dropna(subset=[pre_profit, post_profit])
    theta_flgp = np.cov(data_flgp[pre_profit], data_flgp[post_profit])[0, 1] / np.var(data_flgp[pre_profit])
    data_flgp['flgp_post_cuped'] = data_flgp[post_profit] - theta_flgp * (data_flgp[pre_profit] - data_flgp[pre_profit].mean())

    # Apply CUPED to Orders
    data_orders = data.dropna(subset=['orders_pre', 'orders_post'])
    theta_orders = np.cov(data_orders['orders_pre'], data_orders['orders_post'])[0, 1] / np.var(data_orders['orders_pre'])
    data_orders['orders_post_cuped'] = data_orders['orders_post'] - theta_orders * (data_orders['orders_pre'] - data_orders['orders_pre'].mean())

    # Per User Metrics
    holdout_flgpu_post = data_flgp.loc[data_flgp['is_customer_holdout'], 'flgp_post_cuped'].mean()
    holdout_flgpu_pre = data_flgp.loc[data_flgp['is_customer_holdout'], pre_profit].mean()
    
    non_holdout_flgpu_post = data_flgp.loc[~data_flgp['is_customer_holdout'], 'flgp_post_cuped'].mean()
    non_holdout_flgpu_pre = data_flgp.loc[~data_flgp['is_customer_holdout'], pre_profit].mean()

    holdout_orders_per_user_post = data_orders.loc[data_orders['is_customer_holdout'], 'orders_post_cuped'].mean()
    holdout_orders_per_user_pre = data_orders.loc[data_orders['is_customer_holdout'], 'orders_pre'].mean()
    
    non_holdout_orders_per_user_post = data_orders.loc[~data_orders['is_customer_holdout'], 'orders_post_cuped'].mean()
    non_holdout_orders_per_user_pre = data_orders.loc[~data_orders['is_customer_holdout'], 'orders_pre'].mean()

    # Apply DID
    # DID to flgpu
    d_flgpu_pre = non_holdout_flgpu_pre - holdout_flgpu_pre
    d_flgpu_post = non_holdout_flgpu_post - holdout_flgpu_post

    non_holdout_flgpu_adjusted_post = non_holdout_flgpu_post - d_flgpu_pre
    holdout_flgpu_adjusted_post = holdout_flgpu_post 

    did_flgpu  = non_holdout_flgpu_adjusted_post - holdout_flgpu_adjusted_post

    # DID to orders_per_user
    d_orders_per_user_pre = non_holdout_orders_per_user_pre - holdout_orders_per_user_pre
    d_orders_per_user_post = non_holdout_orders_per_user_post - holdout_orders_per_user_post

    non_holdout_orders_per_user_adjusted_post = non_holdout_orders_per_user_post - d_orders_per_user_pre
    holdout_orders_per_user_adjusted_post = holdout_orders_per_user_post

    did_orders_per_user  = non_holdout_orders_per_user_adjusted_post - holdout_orders_per_user_adjusted_post

    # Total FLGP and Orders
    holdout_user_count = data['is_customer_holdout'].sum()
    non_holdout_user_count = (~data['is_customer_holdout']).sum()

    holdout_total_flgp_cuped = holdout_flgpu_adjusted_post * holdout_user_count
    non_holdout_total_flgp_cuped = non_holdout_flgpu_adjusted_post * non_holdout_user_count

    holdout_total_orders_cuped = holdout_orders_per_user_adjusted_post * holdout_user_count
    non_holdout_total_orders_cuped = non_holdout_orders_per_user_adjusted_post * non_holdout_user_count

    # Normalize for Population Differences
    holdout_user_count = data['is_customer_holdout'].sum()
    non_holdout_user_count = (~data['is_customer_holdout']).sum()

    scaled_holdout_total_flgp_cuped = (holdout_total_flgp_cuped / holdout_user_count) * non_holdout_user_count if holdout_user_count != 0 else np.nan
    scaled_holdout_total_orders_cuped = (holdout_total_orders_cuped / holdout_user_count) * non_holdout_user_count if holdout_user_count != 0 else np.nan

    #Calculate Per order Metrics
    holdout_flgp_per_order_cuped = (
    np.nan if pd.isna(holdout_total_orders_cuped) or holdout_total_orders_cuped == 0
    else holdout_total_flgp_cuped / holdout_total_orders_cuped
    )
       
    non_holdout_flgp_per_order_cuped = (
    np.nan if pd.isna(non_holdout_total_orders_cuped) or non_holdout_total_orders_cuped == 0
    else non_holdout_total_flgp_cuped / non_holdout_total_orders_cuped
    )

    holdout_orders_per_user_cuped = holdout_orders_per_user_adjusted_post
    non_holdout_orders_per_user_cuped = non_holdout_orders_per_user_adjusted_post

    # Incremental Differences (CUPED)
    incremental_orders_cuped = non_holdout_total_orders_cuped - scaled_holdout_total_orders_cuped
    incremental_flgp_cuped = non_holdout_total_flgp_cuped - scaled_holdout_total_flgp_cuped

    # Percentage Changes (CUPED)
    percentage_change_orders_cuped = ((incremental_orders_cuped) / abs(scaled_holdout_total_orders_cuped)) * 100 if scaled_holdout_total_orders_cuped != 0 else np.nan
    percentage_change_flgp_cuped = ((incremental_flgp_cuped) / abs(scaled_holdout_total_flgp_cuped)) * 100 if scaled_holdout_total_flgp_cuped != 0 else np.nan

    # Sustainable Growth Calculation
    sustainable_growth = ((incremental_orders_cuped + (incremental_flgp_cuped / non_holdout_flgp_per_order_cuped)) / scaled_holdout_total_orders_cuped) * 100 if scaled_holdout_total_orders_cuped != 0 else np.nan

    # T-tests for significance
    t_stat_orders, p_value_orders = ttest_ind(
        data_orders.loc[data_orders['is_customer_holdout'], 'orders_post_cuped'],
        data_orders.loc[~data_orders['is_customer_holdout'], 'orders_post_cuped'],
        equal_var=False
    )

    t_stat_flgp, p_value_flgp = ttest_ind(
        data_flgp.loc[data_flgp['is_customer_holdout'], 'flgp_post_cuped'],
        data_flgp.loc[~data_flgp['is_customer_holdout'], 'flgp_post_cuped'],
        equal_var=False
    )

    return {
        'entity': entity,
        'sustainable_growth': sustainable_growth,
        'percentage_change_orders_cuped': percentage_change_orders_cuped,
        'incremental_orders_cuped': incremental_orders_cuped,
        'non_holdout_total_orders_cuped':non_holdout_total_orders_cuped,
        'holdout_total_orders_cuped':scaled_holdout_total_orders_cuped,
        't_stat_orders': t_stat_orders,
        'p_value_orders': p_value_orders,
        'percentage_change_flgp_cuped': percentage_change_flgp_cuped,
        'incremental_flgp_cuped': incremental_flgp_cuped,
        'non_holdout_total_flgp_cuped': non_holdout_total_flgp_cuped,
        'holdout_total_flgp_cuped':scaled_holdout_total_flgp_cuped,
        't_stat_flgp': t_stat_flgp,
        'p_value_flgp': p_value_flgp,
        'holdout_flgp_per_order_cuped': holdout_flgp_per_order_cuped,
        'non_holdout_flgp_per_order_cuped': non_holdout_flgp_per_order_cuped,
        'holdout_orders_per_user_cuped': holdout_orders_per_user_cuped,
        'non_holdout_orders_per_user_cuped': non_holdout_orders_per_user_cuped
    }


metric_pairs = [
    ('analytical_profit_pre', 'analytical_profit_post'),
]

# Initialize results list
results = []

# Loop through each metric pair
for pre_metric, post_metric in metric_pairs:
    for entity in combined_df['entity_id'].unique():
        entity_data = combined_df[combined_df['entity_id'] == entity]
        
        try:
            result = calculate_sustainable_growth(entity_data, pre_metric, post_metric)
            result['metric_used'] = f"{pre_metric}_vs_{post_metric}"
            results.append(result)
        
        except Exception as e:
            print(f"Skipping entity {entity} due to error: {e}")
            continue

final_results_df = pd.DataFrame(results)

# Drop rows where sustainable_growth is NaN
final_results_df = final_results_df.dropna(subset=['sustainable_growth'])

# Save to CSV
final_results_df.to_csv('profitable_growth.csv', index=False)

print("Processing completed. Check profitable_growth.csv for results.")


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data_flgp['flgp_post_cuped'] = data_flgp[post_profit] - theta_flgp * (data_flgp[pre_profit] - data_flgp[pre_profit].mean())
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data_orders['orders_post_cuped'] = data_orders['orders_post'] - theta_orders * (data_orders['orders_pre'] - data_orders['orders_pre'].mean())
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stab

Skipping entity GV_MA due to error: boolean value of NA is ambiguous
Skipping entity GV_TN due to error: boolean value of NA is ambiguous
Processing completed. Check profitable_growth.csv for results.


  avg = a.mean(axis, **keepdims_kw)
  ret = um.true_divide(
  theta_flgp = np.cov(data_flgp[pre_profit], data_flgp[post_profit])[0, 1] / np.var(data_flgp[pre_profit])
  c *= np.true_divide(1, fact)
  c *= np.true_divide(1, fact)
  avg = a.mean(axis, **keepdims_kw)
  ret = um.true_divide(
  theta_orders = np.cov(data_orders['orders_pre'], data_orders['orders_post'])[0, 1] / np.var(data_orders['orders_pre'])
  c *= np.true_divide(1, fact)
  c *= np.true_divide(1, fact)


In [74]:
final_results_df

Unnamed: 0,entity,sustainable_growth,percentage_change_orders_cuped,incremental_orders_cuped,non_holdout_total_orders_cuped,holdout_total_orders_cuped,t_stat_orders,p_value_orders,percentage_change_flgp_cuped,incremental_flgp_cuped,non_holdout_total_flgp_cuped,holdout_total_flgp_cuped,t_stat_flgp,p_value_flgp,holdout_flgp_per_order_cuped,non_holdout_flgp_per_order_cuped,holdout_orders_per_user_cuped,non_holdout_orders_per_user_cuped,metric_used
0,YS_TR,12.771521,2.633387,553535.0,21573420.0,21019880.0,-3.727135,0.0001937833,10.960707,2769818.0,28040250.0,25270430.0,-11.736699,9.165552000000001e-32,1.202215,1.299759,2.22532,2.283921,analytical_profit_pre_vs_analytical_profit_post
1,FP_MY,-0.261487,-1.387622,-178557.6,12689330.0,12867890.0,1.061159,0.2886215,1.155173,270792.2,23712490.0,23441700.0,-3.511781,0.0004458184,1.821721,1.868695,1.731084,1.707063,analytical_profit_pre_vs_analytical_profit_post
2,FP_PH,0.312526,1.145506,119092.1,10515560.0,10396460.0,-3.671358,0.0002413936,-0.81682,-138342.9,16798430.0,16936770.0,-1.739,0.08204245,1.62909,1.597484,1.443116,1.459647,analytical_profit_pre_vs_analytical_profit_post
3,PY_CL,3.871605,2.241663,118898.8,5422942.0,5304043.0,-3.183219,0.001458038,1.620033,294533.5,18475250.0,18180720.0,-3.912302,9.18859e-05,3.427709,3.406868,2.128848,2.176569,analytical_profit_pre_vs_analytical_profit_post
4,FP_TW,5.745379,3.301851,631497.8,19757070.0,19125570.0,-2.325466,0.02005012,2.422733,925764.4,39137330.0,38211570.0,-1.992341,0.04634157,1.997931,1.980928,2.994528,3.093403,analytical_profit_pre_vs_analytical_profit_post
5,PY_DO,17.423253,7.744268,194825.4,2710562.0,2515737.0,-2.958558,0.003098263,9.86994,731599.6,8144001.0,7412402.0,-2.277766,0.02278003,2.946414,3.004543,3.012736,3.24605,analytical_profit_pre_vs_analytical_profit_post
6,PY_AR,2.313859,0.738103,107816.0,14715000.0,14607180.0,-2.941767,0.003264492,1.589067,810036.1,51785620.0,50975590.0,-3.167645,0.001537968,3.489762,3.519241,2.644067,2.663583,analytical_profit_pre_vs_analytical_profit_post
7,NP_HU,8.016673,0.47079,10411.16,2221833.0,2211422.0,0.17118,0.8640855,8.120409,264124.9,3516731.0,3252606.0,0.216438,0.8286543,1.470821,1.582806,2.605102,2.617367,analytical_profit_pre_vs_analytical_profit_post
8,PY_UY,-9.434178,0.11319,1896.438,1677336.0,1675440.0,-2.375812,0.01753679,-8.706292,-412067.8,4320921.0,4732988.0,1.51222,0.130505,2.824923,2.576061,2.860958,2.864196,analytical_profit_pre_vs_analytical_profit_post
9,PY_PE,-3.918162,-0.397136,-13550.47,3398493.0,3412044.0,-2.072159,0.03826547,-3.414364,-118357.5,3348100.0,3466458.0,-1.022713,0.3064745,1.015948,0.985172,2.284952,2.275878,analytical_profit_pre_vs_analytical_profit_post


In [82]:
from google.cloud import bigquery

project_id_new = 'logistics-data-storage-staging'
dataset_id = 'shazeb'
table_id = 'abc_performance_test'

client = bigquery.Client(project=project_id_new)

# 1. Create the dataset if it doesn't exist
dataset_ref = bigquery.Dataset(f"{project_id_new}.{dataset_id}")
dataset = bigquery.Dataset(dataset_ref)
try:
    client.get_dataset(dataset_ref)  # Check if dataset exists
    print(f"Dataset {dataset_id} already exists.")
except:
    dataset = client.create_dataset(dataset)
    print(f"Created dataset {dataset_id}.")

# # 2. Create the table if it doesn't exist
# table_ref = bigquery.Table(f"{project_id}.{dataset_id}.{table_id}")
# table = bigquery.Table(table_ref)

# # Define the schema if you want to specify exact column names & types up front.
# # Example schema:
# table.schema = [
#     bigquery.SchemaField("name", "STRING", mode="REQUIRED"),
#     bigquery.SchemaField("age", "INTEGER", mode="NULLABLE"),
#     bigquery.SchemaField("country", "STRING", mode="NULLABLE")
# ]

# try:
#     client.get_table(table_ref)  # Check if table exists
#     print(f"Table {table_id} already exists.")
# except:
#     table = client.create_table(table)
#     print(f"Created table {table_id} with schema {table.schema}.")





KeyError: 'projectId'

In [None]:
# import numpy as np
# import scipy.stats as stats

# def ratio_test(holdout_flgp, holdout_orders, non_holdout_flgp, non_holdout_orders, holdout_n, non_holdout_n):
    
#     # Compute means
#     holdout_mean_flgp = np.mean(holdout_flgp)
#     holdout_mean_orders = np.mean(holdout_orders)
#     non_holdout_mean_flgp = np.mean(non_holdout_flgp)
#     non_holdout_mean_orders = np.mean(non_holdout_orders)

#     # Compute FLGP per Order
#     holdout_flgp_per_order = holdout_mean_flgp / holdout_mean_orders
#     non_holdout_flgp_per_order = non_holdout_mean_flgp / non_holdout_mean_orders

#     # Compute variances
#     var_flgp_holdout = np.var(holdout_flgp, ddof=1)
#     var_flgp_non_holdout = np.var(non_holdout_flgp, ddof=1)
#     var_orders_holdout = np.var(holdout_orders, ddof=1)
#     var_orders_non_holdout = np.var(non_holdout_orders, ddof=1)

#     # Compute covariance
#     cov_flgp_orders_holdout = np.cov(holdout_flgp, holdout_orders, ddof=1)[0, 1]
#     cov_flgp_orders_non_holdout = np.cov(non_holdout_flgp, non_holdout_orders, ddof=1)[0, 1]

#     # Delta Method Standard Error for Holdout
#     se_holdout = (1 / holdout_mean_orders) * np.sqrt(
#         (var_flgp_holdout / holdout_n)
#         - 2 * (holdout_flgp_per_order) * (cov_flgp_orders_holdout / holdout_n)
#         + (holdout_flgp_per_order ** 2) * (var_orders_holdout / holdout_n)
#     )

#     # Delta Method Standard Error for Non-Holdout
#     se_non_holdout = (1 / non_holdout_mean_orders) * np.sqrt(
#         (var_flgp_non_holdout / non_holdout_n)
#         - 2 * (non_holdout_flgp_per_order) * (cov_flgp_orders_non_holdout / non_holdout_n)
#         + (non_holdout_flgp_per_order ** 2) * (var_orders_non_holdout / non_holdout_n)
#     )

#     # Compute difference in FLGP per Order
#     diff_flgp_per_order = non_holdout_flgp_per_order - holdout_flgp_per_order

#     # Compute Standard Error of the Difference
#     se_diff = np.sqrt(se_holdout**2 + se_non_holdout**2)

#     # Compute Z-score
#     z_score = diff_flgp_per_order / se_diff

#     # Compute p-value (two-tailed test)
#     p_value = 2 * (1 - stats.norm.cdf(abs(z_score)))

#     return {
#         "diff_flgp_per_order": diff_flgp_per_order,
#         "z_score": z_score,
#         "p_value": p_value,
#         "holdout_flgp_per_order": holdout_flgp_per_order,
#         "non_holdout_flgp_per_order": non_holdout_flgp_per_order
#     }



# # holdout_flgp = data_flgp.loc[tt['is_customer_holdout'], 'analytical_profit_post']
# # non_holdout_flgp = data_flgp.loc[~tt['is_customer_holdout'], 'analytical_profit_post']

# # holdout_orders = data_orders.loc[tt['is_customer_holdout'], 'orders_post']
# # non_holdout_orders = data_orders.loc[~tt['is_customer_holdout'], 'orders_post']

# # holdout_n = len(holdout_flgp)
# # non_holdout_n = len(non_holdout_flgp)

# # # Perform Ratio-Based Hypothesis Test
# # ratio_test_result = ratio_test(holdout_flgp, holdout_orders, non_holdout_flgp, non_holdout_orders, holdout_n, non_holdout_n)

# # print(ratio_test_result)

# # Drop NaNs from both relevant columns at the same time to maintain alignment
# tt_clean = tt.dropna(subset=['analytical_profit_post', 'orders_post'])

# # Extract holdout & non-holdout FLGP and Orders
# holdout_flgp = tt_clean.loc[tt_clean['is_customer_holdout'], 'analytical_profit_post']
# non_holdout_flgp = tt_clean.loc[~tt_clean['is_customer_holdout'], 'analytical_profit_post']

# holdout_orders = tt_clean.loc[tt_clean['is_customer_holdout'], 'orders_post']
# non_holdout_orders = tt_clean.loc[~tt_clean['is_customer_holdout'], 'orders_post']

# # Compute sample sizes
# holdout_n = len(holdout_flgp)
# non_holdout_n = len(non_holdout_flgp)

# # Perform Ratio-Based Hypothesis Test
# ratio_test_result = ratio_test(holdout_flgp, holdout_orders, non_holdout_flgp, non_holdout_orders, holdout_n, non_holdout_n)

# print(ratio_test_result)
