In [70]:
import numpy as np
import pandas as pd
from google.cloud import bigquery
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import stats
from scipy.stats import ttest_ind
import plotly.express as px
import db_dtypes
import bigframes.pandas as bpd
from IPython.display import display, HTML
import math
import statsmodels.api as sm
from typing import Union
import logging
import sys
from google.cloud.exceptions import NotFound
from datetime import datetime, timedelta
import time


# Configure logging
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s %(levelname)s:%(message)s',
    handlers=[
        logging.StreamHandler(sys.stdout)
    ]
)


mkt_data = """

WITH holdout_entities AS (
  SELECT
         entity_id,
        `Release Date` AS release_date
  FROM `logistics-data-storage-staging.long_term_pricing.global_holdout_rollout`
  WHERE `Release Date` < DATE_TRUNC(CURRENT_DATE(), WEEK)
  AND entity_id in ('PY_DO','PY_BO', 'FP_TW', 'PY_PY', 'DJ_CZ', 'PY_EC','MJM_AT' ,'PY_PE', 'PY_AR' ,'PY_GT','PY_SV' ,'FP_PH','PY_NI' ,'NP_HU' ,'FP_MM','EF_GR' ,'AP_PA'
  ,'YS_TR', 'PY_UY' ,'OP_SE' ,'PY_CL' ,'FP_BD' ,'FP_SG' ,'FO_NO' ,'PY_CR', 'FP_LA','PY_HN', 'FP_MY' ,'FP_TH', 'FY_CY', 'PY_VE','PO_FI')
),
orders as (
  SELECT
       dps.entity_id entity_id
      ,dps.dps_customer_id customer_id
      ,COUNT(case when mkt.order_date <= e.release_date then mkt.order_id end) AS orders_pre
      ,COUNT(case when mkt.order_date > e.release_date then mkt.order_id end) AS orders_post
      ,SUM(case when mkt.order_date <= e.release_date then mkt.analytical_profit end) AS analytical_profit_pre
      ,SUM(case when mkt.order_date > e.release_date then mkt.analytical_profit end) AS analytical_profit_post
  FROM `fulfillment-dwh-production.cl.dps_sessions_mapped_to_orders` dps
  JOIN holdout_entities AS e
    ON dps.entity_id = e.entity_id
  left join fulfillment-dwh-production.curated_data_shared_mkt.bima_order_profitability mkt
    ON mkt.global_entity_id = dps.entity_id
    AND mkt.order_id = dps.platform_order_code
    AND order_date >= DATE_SUB(release_date, INTERVAL 8 WEEK)
    AND order_date < CURRENT_DATE
    AND global_entity_id in ('PO_FI','PY_DO','PY_BO', 'FP_TW', 'PY_PY', 'DJ_CZ', 'PY_EC','MJM_AT' ,'PY_PE', 'PY_AR' ,'PY_GT','PY_SV' ,'FP_PH','PY_NI' ,'NP_HU' ,'FP_MM','EF_GR' ,'AP_PA','YS_TR', 'PY_UY' ,'OP_SE' ,'PY_CL' ,'FP_BD' ,'FP_SG' ,'FO_NO' ,'PY_CR', 'FP_LA','PY_HN', 'FP_MY' ,'FP_TH', 'FY_CY', 'PY_VE')
  WHERE dps.created_date >= DATE_SUB(e.release_date, INTERVAL 8 WEEK) 
    AND dps.created_date < CURRENT_DATE
    AND dps.platform_order_code IS NOT NULL
    AND dps.is_own_delivery
    AND dps.is_sent
    AND vendor_vertical_parent IN ('Restaurant','restaurant','restaurants')
    AND dps.entity_id in ('PO_FI','PY_DO','PY_BO', 'FP_TW', 'PY_PY', 'DJ_CZ', 'PY_EC','MJM_AT' ,'PY_PE', 'PY_AR' ,'PY_GT','PY_SV' ,'FP_PH','PY_NI' ,'NP_HU' ,'FP_MM','EF_GR' ,'AP_PA','YS_TR', 'PY_UY' ,'OP_SE' ,'PY_CL' ,'FP_BD' ,'FP_SG' ,'FO_NO' ,'PY_CR', 'FP_LA' ,'PY_HN', 'FP_MY' ,'FP_TH', 'FY_CY', 'PY_VE')
  GROUP BY 1, 2
), 
customer_information AS (
  SELECT
         e.entity_id,
         e.release_date,
         CASE WHEN d.created_date <= e.release_date THEN FALSE ELSE COALESCE(is_customer_holdout, FALSE) END AS is_customer_holdout,
         d.customer_id
  FROM `fulfillment-dwh-production.cl.dps_holdout_users` AS d
  JOIN holdout_entities AS e
        ON d.entity_id = e.entity_id
  LEFT JOIN `fulfillment-dwh-production.cl._bad_dps_logs_ids` bad_ids
    ON d.customer_id = bad_ids.id
  WHERE d.created_date BETWEEN '2025-01-01' AND CURRENT_DATE()
    AND d.customer_id IS NOT NULL
    AND bad_ids.id IS NULL
  GROUP BY 1, 2, 3, 4
)
SELECT 
   e.entity_id,
   e.customer_id,
   e.is_customer_holdout,
   o.orders_pre,
   o.orders_post,
   o.analytical_profit_pre,
   o.analytical_profit_post
FROM customer_information e
LEFT JOIN orders o
  ON o.customer_id = e.customer_id
  AND o.entity_id = e.entity_id

"""

dps_data = """

WITH holdout_entities AS (
  SELECT
         entity_id,
        `Release Date` AS release_date
  FROM `logistics-data-storage-staging.long_term_pricing.global_holdout_rollout`
  WHERE `Release Date` < DATE_TRUNC(CURRENT_DATE(), WEEK)
  AND entity_id not in ('PY_DO','PY_BO', 'FP_TW', 'PY_PY', 'DJ_CZ', 'PY_EC','MJM_AT' ,'PY_PE', 'PY_AR' ,'PY_GT','PY_SV' ,'FP_PH','PY_NI' ,'NP_HU' ,'FP_MM','EF_GR' ,'AP_PA'
  ,'YS_TR', 'PY_UY' ,'OP_SE' ,'PY_CL' ,'FP_BD' ,'FP_SG' ,'FO_NO' ,'PY_CR', 'FP_LA','PY_HN', 'FP_MY' ,'FP_TH', 'FY_CY', 'PY_VE','PO_FI')
),
orders as (
  SELECT
       dps.entity_id entity_id
      ,dps.dps_customer_id customer_id
      ,COUNT(case when dps.created_date <= e.release_date then dps.platform_order_code end) AS orders_pre
      ,COUNT(case when dps.created_date > e.release_date then dps.platform_order_code end) AS orders_post
      ,SUM(case when dps.created_date <= e.release_date then dps.fully_loaded_gross_profit_eur end) AS analytical_profit_pre
      ,SUM(case when dps.created_date > e.release_date then dps.fully_loaded_gross_profit_eur end) AS analytical_profit_post
  FROM `fulfillment-dwh-production.cl.dps_sessions_mapped_to_orders` dps
  JOIN holdout_entities AS e
    ON dps.entity_id = e.entity_id
  WHERE dps.created_date >= DATE_SUB(e.release_date, INTERVAL 8 WEEK) 
    AND dps.created_date < CURRENT_DATE
    AND dps.platform_order_code IS NOT NULL
    AND dps.is_own_delivery
    AND dps.is_sent
    AND vendor_vertical_parent IN ('Restaurant','restaurant','restaurants')
    AND dps.entity_id not in ('PO_FI','PY_DO','PY_BO', 'FP_TW', 'PY_PY', 'DJ_CZ', 'PY_EC','MJM_AT' ,'PY_PE', 'PY_AR' ,'PY_GT','PY_SV' ,'FP_PH','PY_NI' ,'NP_HU' ,'FP_MM','EF_GR' ,'AP_PA','YS_TR', 'PY_UY' ,'OP_SE' ,'PY_CL' ,'FP_BD' ,'FP_SG' ,'FO_NO' ,'PY_CR', 'FP_LA','PY_HN', 'FP_MY' ,'FP_TH', 'FY_CY', 'PY_VE')
  GROUP BY 1, 2
), 
customer_information AS (
  SELECT
         e.entity_id,
         e.release_date,
         CASE WHEN d.created_date <= e.release_date THEN FALSE ELSE COALESCE(is_customer_holdout, FALSE) END AS is_customer_holdout,
         d.customer_id
  FROM `fulfillment-dwh-production.cl.dps_holdout_users` AS d
  JOIN holdout_entities AS e
        ON d.entity_id = e.entity_id
  LEFT JOIN `fulfillment-dwh-production.cl._bad_dps_logs_ids` bad_ids
    ON d.customer_id = bad_ids.id
  WHERE d.created_date BETWEEN '2025-01-01' AND CURRENT_DATE()
    AND d.customer_id IS NOT NULL
    AND bad_ids.id IS NULL
  GROUP BY 1, 2, 3, 4
)
SELECT 
   e.entity_id,
   e.customer_id,
   e.is_customer_holdout,
   o.orders_pre,
   o.orders_post,
   o.analytical_profit_pre,
   o.analytical_profit_post
FROM customer_information e
LEFT JOIN orders o
  ON o.customer_id = e.customer_id
  AND o.entity_id = e.entity_id


"""

def combined_data(client,mkt, dps):

  mkt_df = client.query(mkt).to_dataframe()
  dps_df = client.query(dps).to_dataframe()

  # Append DataFrames
  combined = pd.concat([mkt_df, dps_df], ignore_index=True)

  return combined


#Function to create a dataset in Bigquery
def bq_create_dataset(client, dataset):
    dataset_ref = bigquery_client.dataset(dataset)

    try:
        dataset = bigquery_client.get_dataset(dataset_ref)
        print('Dataset {} already exists.'.format(dataset))
    except NotFound:
        dataset = bigquery.Dataset(dataset_ref)
        dataset.location = 'US'
        dataset = bigquery_client.create_dataset(dataset)
        print('Dataset {} created.'.format(dataset.dataset_id))
    return dataset

#Function to create a Table
def bq_create_table(client, dataset, table_name, schema):
    dataset_ref = client.dataset(dataset)
    table_ref = dataset_ref.table(table_name)

    try:
        table = client.get_table(table_ref)
        print('Table {} already exists.'.format(table.table_id))
    except NotFound:
        table = bigquery.Table(table_ref, schema=schema)
        table = client.create_table(table)
        print('Table {} created.'.format(table.table_id))
    return table

# Function to drop a table 
def drop_table(client, dataset_id, table_id):
    
    table_ref = client.dataset(dataset_id).table(table_id)
    client.delete_table(table_ref, not_found_ok=True)  # not_found_ok=True prevents errors if the table doesn't exist.
    print(f"Table {dataset_id}.{table_id} deleted successfully.")

# Function to insert rows to a table
def insert_df_rows_bigquery(client, dataset_id, table_id, df):
    
    # 1. Reference the table
    dataset_ref = client.dataset(dataset_id)
    table_ref = dataset_ref.table(table_id)

    # 2. Fetch the table object to get its schema/order
    table = client.get_table(table_ref)

    # 3. Convert the DataFrame to a list of dictionaries
    # Each dict corresponds to a row with column names as keys
    # These keys must match your table’s column names
    rows_to_insert = df.to_dict(orient='records')

    # 4. Insert the rows
    errors = client.insert_rows(table, rows_to_insert)

    # 5. Check for errors
    if errors:
        print("Encountered errors while inserting rows: ", errors)
    else:
        print(f"Successfully inserted {len(rows_to_insert)} rows into {dataset_id}.{table_id}.")


def apply_nan_to_orders(df):
    
    mask_post = df['analytical_profit_post'].isna() & (df['orders_post'] == 0)
    df.loc[mask_post, 'orders_post'] = np.nan
    df['orders_post'] = df['orders_post'].astype(float)
    
    mask_pre = df['analytical_profit_pre'].isna() & (df['orders_pre'] == 0)
    df.loc[mask_pre, 'orders_pre'] = np.nan
    df['orders_pre'] = df['orders_pre'].astype(float)

    return df

def drop_missing_data(df, columns):
    
    return df.dropna(subset=columns)

  
def check_missing_users_data(df, groupby_col='entity_id'):
   
    result = (
        df
        .groupby(groupby_col)
        .apply(lambda g: pd.Series({
            'total_customers': g['customer_id'].nunique(),
            'missing_pre': g.loc[g['orders_pre'].isna(), 'customer_id'].nunique(),
            'missing_post': g.loc[g['orders_post'].isna(), 'customer_id'].nunique(),
            'missing_pre_or_post': g.loc[
                g['orders_pre'].isna() | g['orders_post'].isna(), 
                'customer_id'
            ].nunique(),
            'missing_pre_and_post': g.loc[
                g['orders_pre'].isna() & g['orders_post'].isna(), 
                'customer_id'
            ].nunique(),
        }))
    ).reset_index()

    result['missing_pre_percentage'] = result['missing_pre'] / result['total_customers']
    result['missing_post_percentage'] = result['missing_post'] / result['total_customers']
    result['missing_pre_or_post_percentage'] = result['missing_pre_or_post'] / result['total_customers']
    result['missing_pre_and_post_percentage'] = result['missing_pre_and_post'] / result['total_customers']
    
    return result

def create_csv(df, name):
    
    today = datetime.today().date()
    week_start = today - timedelta(days=today.weekday())
    df['updated_date'] = week_start

    csv_filename = f"{name}_{week_start}.csv"
    df.to_csv(csv_filename, index=False)
    logging.info(f"CSV saved as {csv_filename}")

  
def create_holdout_table(project_id, dataset, table_name):

    df = pd.read_csv('global_holdout_rollout_dates - rollout.csv')
        
    df = df.dropna(subset= 'entity_id')

    df_final = df[['Region','Country','entity_id','Release Date','Release Status']]
    df_final['Release Date'] = pd.to_datetime(df_final['Release Date'])
    df_final['Release Date'] = df_final['Release Date'].dt.date


    schema = [
                bigquery.SchemaField('Region', 'STRING', mode='REQUIRED'),
                bigquery.SchemaField('Country', 'STRING', mode='REQUIRED'),
                bigquery.SchemaField('entity_id', 'STRING', mode='REQUIRED'),
                bigquery.SchemaField('Release Date', 'DATE', mode='REQUIRED'),
                bigquery.SchemaField('Release Status', 'STRING', mode='REQUIRED'),
            ]

    project_id = project_id
    client = bigquery.Client(project=project_id)

    #drop_table(client, dataset, table_name)
    
    bq_create_table(client, dataset, table_name, schema)
    insert_df_rows_bigquery(client, dataset, table_name, df_final)

In [73]:
def calculate_sustainable_growth(data, pre_profit, post_profit):
    
    data = data.copy()
    entity = data['entity_id'].iloc[0]

    # Apply CUPED to FLGP
    data_flgp = data.dropna(subset=[pre_profit, post_profit])
    theta_flgp = np.cov(data_flgp[pre_profit], data_flgp[post_profit])[0, 1] / np.var(data_flgp[pre_profit])
    data_flgp['flgp_post_cuped'] = data_flgp[post_profit] - theta_flgp * (data_flgp[pre_profit] - data_flgp[pre_profit].mean())

    # Apply CUPED to Orders
    data_orders = data.dropna(subset=['orders_pre', 'orders_post'])
    theta_orders = np.cov(data_orders['orders_pre'], data_orders['orders_post'])[0, 1] / np.var(data_orders['orders_pre'])
    data_orders['orders_post_cuped'] = data_orders['orders_post'] - theta_orders * (data_orders['orders_pre'] - data_orders['orders_pre'].mean())

    # Per User Metrics
    holdout_flgpu_post = data_flgp.loc[data_flgp['is_customer_holdout'], 'flgp_post_cuped'].mean()
    holdout_flgpu_pre = data_flgp.loc[data_flgp['is_customer_holdout'], pre_profit].mean()
    
    non_holdout_flgpu_post = data_flgp.loc[~data_flgp['is_customer_holdout'], 'flgp_post_cuped'].mean()
    non_holdout_flgpu_pre = data_flgp.loc[~data_flgp['is_customer_holdout'], pre_profit].mean()

    holdout_orders_per_user_post = data_orders.loc[data_orders['is_customer_holdout'], 'orders_post_cuped'].mean()
    holdout_orders_per_user_pre = data_orders.loc[data_orders['is_customer_holdout'], 'orders_pre'].mean()
    
    non_holdout_orders_per_user_post = data_orders.loc[~data_orders['is_customer_holdout'], 'orders_post_cuped'].mean()
    non_holdout_orders_per_user_pre = data_orders.loc[~data_orders['is_customer_holdout'], 'orders_pre'].mean()

    # Apply DID
    # DID to flgpu
    d_flgpu_pre = non_holdout_flgpu_pre - holdout_flgpu_pre
    d_flgpu_post = non_holdout_flgpu_post - holdout_flgpu_post

    non_holdout_flgpu_adjusted_post = non_holdout_flgpu_post - d_flgpu_pre
    holdout_flgpu_adjusted_post = holdout_flgpu_post 

    did_flgpu  = non_holdout_flgpu_adjusted_post - holdout_flgpu_adjusted_post

    # DID to orders_per_user
    d_orders_per_user_pre = non_holdout_orders_per_user_pre - holdout_orders_per_user_pre
    d_orders_per_user_post = non_holdout_orders_per_user_post - holdout_orders_per_user_post

    non_holdout_orders_per_user_adjusted_post = non_holdout_orders_per_user_post - d_orders_per_user_pre
    holdout_orders_per_user_adjusted_post = holdout_orders_per_user_post

    did_orders_per_user  = non_holdout_orders_per_user_adjusted_post - holdout_orders_per_user_adjusted_post

    # user count
    holdout_user_count_flgp = data_flgp['is_customer_holdout'].sum()
    non_holdout_user_count_flgp = (~data_flgp['is_customer_holdout']).sum()

    holdout_user_count_orders = data_orders['is_customer_holdout'].sum()
    non_holdout_user_count_orders = (~data_orders['is_customer_holdout']).sum()

    # Total FLGP and Orders
    holdout_total_flgp_cuped = holdout_flgpu_adjusted_post * holdout_user_count_flgp
    non_holdout_total_flgp_cuped = non_holdout_flgpu_adjusted_post * non_holdout_user_count_flgp

    holdout_total_orders_cuped = holdout_orders_per_user_adjusted_post * holdout_user_count_orders
    non_holdout_total_orders_cuped = non_holdout_orders_per_user_adjusted_post * non_holdout_user_count_orders

    # Normalize for Population Differences
    scaled_holdout_total_flgp_cuped = (holdout_total_flgp_cuped / holdout_user_count_flgp) * non_holdout_user_count_flgp if holdout_user_count_flgp != 0 else np.nan
    scaled_holdout_total_orders_cuped = (holdout_total_orders_cuped / holdout_user_count_orders) * non_holdout_user_count_orders if holdout_user_count_orders != 0 else np.nan

    #Calculate Per order Metrics
    holdout_flgp_per_order_cuped = (
    np.nan if pd.isna(holdout_total_orders_cuped) or holdout_total_orders_cuped == 0
    else holdout_total_flgp_cuped / holdout_total_orders_cuped
    )
       
    non_holdout_flgp_per_order_cuped = (
    np.nan if pd.isna(non_holdout_total_orders_cuped) or non_holdout_total_orders_cuped == 0
    else non_holdout_total_flgp_cuped / non_holdout_total_orders_cuped
    )

    holdout_orders_per_user_cuped = holdout_orders_per_user_adjusted_post
    non_holdout_orders_per_user_cuped = non_holdout_orders_per_user_adjusted_post

    # Incremental Differences (CUPED)
    incremental_orders_cuped = non_holdout_total_orders_cuped - scaled_holdout_total_orders_cuped
    incremental_flgp_cuped = non_holdout_total_flgp_cuped - scaled_holdout_total_flgp_cuped

    # Percentage Changes (CUPED)
    percentage_change_orders_cuped = ((incremental_orders_cuped) / abs(scaled_holdout_total_orders_cuped)) * 100 if scaled_holdout_total_orders_cuped != 0 else np.nan
    percentage_change_flgp_cuped = ((incremental_flgp_cuped) / abs(scaled_holdout_total_flgp_cuped)) * 100 if scaled_holdout_total_flgp_cuped != 0 else np.nan

    # Sustainable Growth Calculation
    sustainable_growth = ((incremental_orders_cuped + (incremental_flgp_cuped / abs(non_holdout_flgp_per_order_cuped))) / scaled_holdout_total_orders_cuped) * 100 if scaled_holdout_total_orders_cuped != 0 else np.nan

    # T-tests for significance
    t_stat_orders, p_value_orders = ttest_ind(
        data_orders.loc[data_orders['is_customer_holdout'], 'orders_post_cuped'],
        data_orders.loc[~data_orders['is_customer_holdout'], 'orders_post_cuped'],
        equal_var=False
    )

    t_stat_flgp, p_value_flgp = ttest_ind(
        data_flgp.loc[data_flgp['is_customer_holdout'], 'flgp_post_cuped'],
        data_flgp.loc[~data_flgp['is_customer_holdout'], 'flgp_post_cuped'],
        equal_var=False
    )

    return {
        'entity': entity,
        'sustainable_growth': sustainable_growth,
        'percentage_change_orders_cuped': percentage_change_orders_cuped,
        'incremental_orders_cuped': incremental_orders_cuped,
        'non_holdout_total_orders_cuped':non_holdout_total_orders_cuped,
        'holdout_total_orders_cuped':scaled_holdout_total_orders_cuped,
        't_stat_orders': t_stat_orders,
        'p_value_orders': p_value_orders,
        'percentage_change_flgp_cuped': percentage_change_flgp_cuped,
        'incremental_flgp_cuped': incremental_flgp_cuped,
        'non_holdout_total_flgp_cuped': non_holdout_total_flgp_cuped,
        'holdout_total_flgp_cuped':scaled_holdout_total_flgp_cuped,
        't_stat_flgp': t_stat_flgp,
        'p_value_flgp': p_value_flgp,
        'holdout_flgp_per_order_cuped': holdout_flgp_per_order_cuped,
        'non_holdout_flgp_per_order_cuped': non_holdout_flgp_per_order_cuped,
        'holdout_orders_per_user_cuped': holdout_orders_per_user_cuped,
        'non_holdout_orders_per_user_cuped': non_holdout_orders_per_user_cuped
    }



In [74]:
def extract_data(project , mkt_data, dps_data):
    
    project_id = project
    logging.info(f"Initializing BigQuery client for project: {project_id}")
    try:
        client = bigquery.Client(project=project_id)
    except Exception as e:
        logging.error(f"Failed to initialize BigQuery client: {e}")
        raise e
    
    start_time = time.time()
    combined_df = combined_data(client,mkt_data, dps_data)
    elapsed_time = time.time() - start_time
    logging.info(f"Time to extract and combine data from DB: {elapsed_time:.2f} seconds")
    
    return combined_df

def create_growth_dataframe(df):
    
    metric_pairs = [
        ('analytical_profit_pre', 'analytical_profit_post'),
    ]
    
    results = []
    start_time = time.time()
    for pre_metric, post_metric in metric_pairs:
        for entity in df['entity_id'].unique():
            entity_data = df[df['entity_id'] == entity]
            try:
                result = calculate_sustainable_growth(entity_data, pre_metric, post_metric)
                result['metric_used'] = f"{pre_metric}_vs_{post_metric}"
                results.append(result)
            except Exception as e:
                logging.error(f"Skipping entity {entity} due to error: {e}")
                continue
    elapsed_time = time.time() - start_time
    logging.info(f"Time to calculate sustainable growth: {elapsed_time:.2f} seconds")
    
    final_results_df = pd.DataFrame(results)
    final_results_df = final_results_df.dropna(subset=['sustainable_growth'])
    
    # Determine the current week's start (Monday)
    today = datetime.today().date()
    week_start = today - timedelta(days=today.weekday())
    final_results_df['updated_date'] = week_start
    
    csv_filename = f"profitable_growth_{week_start}.csv"
    final_results_df.to_csv(csv_filename, index=False)
    logging.info(f"CSV saved as {csv_filename}")
    
    return final_results_df

def push_data_to_bigquery(project,df, dataset, table_name, schema):
    
    project_id = project
    try:
        bigquery_client = bigquery.Client(project=project_id_new)
    except Exception as e:
        logging.error(f"Failed to initialize BigQuery client for project {project_id_new}: {e}")
        raise e

    try:
        bq_create_dataset(bigquery_client, dataset)
        bq_create_table(bigquery_client, dataset, table_name, schema)
    except Exception as e:
        logging.error(f"Error creating dataset/table: {e}")
        raise e
    
    try:
        insert_df_rows_bigquery(
            client=bigquery_client,
            dataset_id=dataset,
            table_id=table_name,
            df=df
        )
        logging.info("Data inserted successfully into BigQuery table.")
    except Exception as e:
        logging.error(f"Error inserting data into BigQuery: {e}")
        raise e

In [75]:
if __name__ == "__main__":

    # ------------------------------------------------------------------------------
    # GET DATA FROM DB
    # ------------------------------------------------------------------------------

    raw_data = extract_data("logistics-customer-staging",mkt_data, dps_data)

    # ------------------------------------------------------------------------------
    # CLEAN DATA
    # ------------------------------------------------------------------------------

    #make sure 0's are converted to NaN's where applicable
    raw_data_cleaned = apply_nan_to_orders(raw_data)
    
    #remove customers with no data in the pre period or the post period or in both periods
    needed_cols = ['analytical_profit_pre', 'analytical_profit_post','orders_pre','orders_post']
    raw_data_final = drop_missing_data(raw_data_cleaned, needed_cols)

    # create csv with data
    create_csv(raw_data_final, "profitable_growth_raw")
    
    # ------------------------------------------------------------------------------
    # Calculate Sustainable Growth
    # ------------------------------------------------------------------------------
    
    sustainable_df = create_growth_dataframe(raw_data_final)
    
    # ------------------------------------------------------------------------------
    # Push Data to DB
    # ------------------------------------------------------------------------------
    
    # schema = [
    #         bigquery.SchemaField('entity', 'STRING', mode='REQUIRED'),
    #         bigquery.SchemaField('sustainable_growth', 'FLOAT', mode='REQUIRED'),
    #         bigquery.SchemaField('percentage_change_orders_cuped', 'FLOAT', mode='REQUIRED'),
    #         bigquery.SchemaField('incremental_orders_cuped', 'FLOAT', mode='REQUIRED'),
    #         bigquery.SchemaField('non_holdout_total_orders_cuped', 'FLOAT', mode='REQUIRED'),
    #         bigquery.SchemaField('holdout_total_orders_cuped', 'FLOAT', mode='REQUIRED'),
    #         bigquery.SchemaField('t_stat_orders', 'FLOAT', mode='REQUIRED'),
    #         bigquery.SchemaField('p_value_orders', 'FLOAT', mode='REQUIRED'),
    #         bigquery.SchemaField('percentage_change_flgp_cuped', 'FLOAT', mode='REQUIRED'),
    #         bigquery.SchemaField('incremental_flgp_cuped', 'FLOAT', mode='REQUIRED'),
    #         bigquery.SchemaField('non_holdout_total_flgp_cuped', 'FLOAT', mode='REQUIRED'),
    #         bigquery.SchemaField('holdout_total_flgp_cuped', 'FLOAT', mode='REQUIRED'),
    #         bigquery.SchemaField('t_stat_flgp', 'FLOAT', mode='REQUIRED'),
    #         bigquery.SchemaField('p_value_flgp', 'FLOAT', mode='REQUIRED'),
    #         bigquery.SchemaField('holdout_flgp_per_order_cuped', 'FLOAT', mode='REQUIRED'),
    #         bigquery.SchemaField('non_holdout_flgp_per_order_cuped', 'FLOAT', mode='REQUIRED'),
    #         bigquery.SchemaField('holdout_orders_per_user_cuped', 'FLOAT', mode='REQUIRED'),
    #         bigquery.SchemaField('non_holdout_orders_per_user_cuped', 'FLOAT', mode='REQUIRED'),
    #         bigquery.SchemaField('metric_used', 'STRING', mode='REQUIRED'),
    #         bigquery.SchemaField('updated_date', 'DATE', mode='REQUIRED'),
    #     ]

    # folder = "shazeb"
    # table_name = "abc_performance_backup"
    # push_data_to_bigquery('logistics-data-storage-staging', sustainable_df, folder, table_name)


2025-02-20 16:01:59,134 INFO:Initializing BigQuery client for project: logistics-customer-staging




2025-02-20 16:21:16,978 INFO:Time to extract and combine data from DB: 1157.13 seconds


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['updated_date'] = week_start


2025-02-20 16:22:45,595 INFO:CSV saved as profitable_growth_raw_2025-02-17.csv
2025-02-20 16:23:43,821 INFO:Time to calculate sustainable growth: 58.22 seconds
2025-02-20 16:23:43,826 INFO:CSV saved as profitable_growth_2025-02-17.csv


In [76]:
sustainable_df

Unnamed: 0,entity,sustainable_growth,percentage_change_orders_cuped,incremental_orders_cuped,non_holdout_total_orders_cuped,holdout_total_orders_cuped,t_stat_orders,p_value_orders,percentage_change_flgp_cuped,incremental_flgp_cuped,non_holdout_total_flgp_cuped,holdout_total_flgp_cuped,t_stat_flgp,p_value_flgp,holdout_flgp_per_order_cuped,non_holdout_flgp_per_order_cuped,holdout_orders_per_user_cuped,non_holdout_orders_per_user_cuped,metric_used,updated_date
0,FP_MY,-1.379557,-2.494951,-115883.394829,4528832.0,4644716.0,0.246618,0.8052054,1.157172,48162.275003,4210228.0,4162066.0,-3.516869,0.0004373631,0.896086,0.92965,3.518812,3.431019,analytical_profit_pre_vs_analytical_profit_post,2025-02-17
1,YS_TR,8.835194,1.059014,99261.554036,9472278.0,9373016.0,-3.579482,0.0003446137,8.336132,823587.772139,10703320.0,9879735.0,-11.206387,4.120751e-29,1.054061,1.129963,4.110758,4.154291,analytical_profit_pre_vs_analytical_profit_post,2025-02-17
2,FP_TW,5.909035,3.46316,301847.335355,9017800.0,8715952.0,-3.059845,0.002216234,2.421244,234876.283797,9935520.0,9700644.0,-1.99998,0.04551033,1.112976,1.101768,5.374514,5.560642,analytical_profit_pre_vs_analytical_profit_post,2025-02-17
3,PY_AR,1.513357,-0.124366,-10649.338448,8552266.0,8562915.0,-2.882629,0.003945806,1.667098,310774.390816,18952410.0,18641630.0,-3.272123,0.001068329,2.17702,2.216069,4.509885,4.504277,analytical_profit_pre_vs_analytical_profit_post,2025-02-17
4,AP_PA,-3.681039,0.386872,4082.315083,1059294.0,1055211.0,-1.722941,0.08496129,-3.894423,-72969.785237,1800730.0,1873700.0,1.605205,0.1085126,1.775663,1.699934,4.490663,4.508036,analytical_profit_pre_vs_analytical_profit_post,2025-02-17
5,EF_GR,4.76075,5.492715,137429.605067,2639464.0,2502034.0,-7.074097,1.586001e-12,-0.689072,-18401.29898,2652044.0,2670445.0,-0.883278,0.3771035,1.06731,1.004766,4.181549,4.41123,analytical_profit_pre_vs_analytical_profit_post,2025-02-17
6,FP_TH,-12.788265,-1.074721,-8514.289473,783718.4,792232.7,-0.974354,0.3299284,-10.587192,-65876.96423,556355.7,622232.7,0.432142,0.6656568,0.785417,0.709892,3.3278,3.292035,analytical_profit_pre_vs_analytical_profit_post,2025-02-17
7,FP_PH,-0.458405,0.376407,21347.973693,5692858.0,5671510.0,-5.133458,2.857939e-07,-0.824822,-36984.206596,4446918.0,4483902.0,-1.740765,0.08173252,0.790601,0.78114,2.973454,2.984646,analytical_profit_pre_vs_analytical_profit_post,2025-02-17
8,FP_MM,-1.978963,-0.887284,-5719.235127,638858.7,644578.0,-1.227823,0.2195958,-1.089452,-6662.48398,604881.9,611544.3,-0.264501,0.7914092,0.948752,0.946816,3.852551,3.818368,analytical_profit_pre_vs_analytical_profit_post,2025-02-17
9,FY_CY,6.615057,1.69806,9151.466278,548088.0,538936.5,-0.19025,0.8491312,5.080536,39728.610844,821705.3,781976.7,1.201497,0.2296948,1.450963,1.499222,5.324302,5.414712,analytical_profit_pre_vs_analytical_profit_post,2025-02-17


In [287]:
# project_id = "logistics-data-storage-staging"
# dataset_id = "shazeb"
# table_name = "abc_performance_backup"
# push_data_to_bigquery('logistics-data-storage-staging', sustainable_df, folder, table_name)

Dataset Dataset(DatasetReference('logistics-data-storage-staging', 'shazeb')) already exists.
table abc_performance_backup created.
Successfully inserted 52 rows into shazeb.abc_performance_backup.
[[34m2025-02-20T01:28:51.264+0100[0m] {[34m4238489762.py:[0m76} INFO[0m - Data inserted successfully into BigQuery table.[0m


In [285]:
# project_id = 'logistics-data-storage-staging'
# try:
#     bigquery_client = bigquery.Client(project=project_id)
# except Exception as e:
#     logging.error(f"Failed to initialize BigQuery client for project {project_id_new}: {e}")
#     raise e

# drop_table(bigquery_client, "shazeb", "abc_performance_backup")

Table shazeb.abc_performance_backup deleted successfully.


In [158]:
# project_id = 'logistics-data-storage-staging'
# try:
#     bigquery_client = bigquery.Client(project=project_id)
# except Exception as e:
#     logging.error(f"Failed to initialize BigQuery client for project {project_id_new}: {e}")
#     raise e

# drop_table(bigquery_client, "shazeb", "abc_performance")

Table shazeb.abc_performance deleted successfully.
