In [1]:
import numpy as np
import pandas as pd
from google.cloud import bigquery
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import stats
import plotly.express as px
import db_dtypes
import bigframes.pandas as bpd
from IPython.display import display, HTML
import plotly.express as px
import seaborn as sns
import matplotlib.pyplot as plt
import math
import statsmodels.api as sm


# Fully-qualified BigQuery tables read by the funnel queries.
_PERSEUS_EVENTS_TABLE = "`fulfillment-dwh-production.curated_data_shared_coredata_tracking.perseus_events`"
_COUNTRIES_TABLE = "`fulfillment-dwh-production.cl.countries`"
_FX_RATES_TABLE = "`fulfillment-dwh-production.curated_data_shared_coredata.fx_rates`"

# Raw delivery-fee string from the event payload (vendor fee first, shop fee as fallback).
_DELIVERY_FEE_RAW_SQL = (
    'COALESCE(JSON_VALUE(eventVariables_json, "$.vendorDeliveryFee"), '
    'JSON_VALUE(eventVariables_json, "$.shopDeliveryFee"))'
)

# First number found in the raw fee string as FLOAT64; 0 when nothing numeric is found.
# The regex part is a raw Python string: the backslash belongs to the BigQuery
# pattern and must not be treated as a Python escape sequence.
_DELIVERY_FEE_SQL = (
    "COALESCE(CAST(NULLIF(REGEXP_EXTRACT("
    + _DELIVERY_FEE_RAW_SQL
    + r", r'([0-9]+\.?[0-9]*)'), '') AS FLOAT64), 0)"
)

# (CTE name, eventAction predicate) for the funnel steps that follow the listing.
_FUNNEL_STAGES = (
    ("shop_details", "eventAction = 'shop_details.loaded'"),
    ("checkout", "eventAction = 'checkout.loaded'"),
    ("orders", "eventAction IN ('transaction')"),
)

# Latest fx rate per currency, attached to every country of the countries table.
_RATES_CTE_SQL = f"""rates AS (
        SELECT cu.country_iso
            ,cu.currency_code
            ,tmp.fx_rate_eur
        FROM {_COUNTRIES_TABLE} cu
        JOIN (
            WITH latest_fx_rate AS (
            SELECT
                currency_code,
                fx_rate_eur,
                calculated_at,
                ROW_NUMBER() OVER (PARTITION BY currency_code ORDER BY calculated_at DESC) AS rn
            FROM {_FX_RATES_TABLE}
            )
            SELECT
                currency_code,
                fx_rate_eur,
                calculated_at AS max_calculated_at
            FROM latest_fx_rate
            WHERE rn = 1
        ) tmp ON tmp.currency_code = cu.currency_code
        GROUP BY 1, 2, 3
    )"""

# Listing rows LEFT JOINed to each later funnel step on entity, session and shop.
_FUNNEL_JOINS_SQL = """FROM listing l
    LEFT JOIN shop_details sd ON sd.global_entity_id = l.global_entity_id AND sd.session_key = l.session_key AND sd.shopId = l.shopId
    LEFT JOIN checkout co ON co.global_entity_id = l.global_entity_id AND co.session_key = l.session_key AND co.shopId = l.shopId
    LEFT JOIN orders o ON o.global_entity_id = l.global_entity_id AND o.session_key = l.session_key AND o.shopId = l.shopId
    LEFT JOIN rates r ON l.country = r.country_iso"""


def _listing_cte(start_date, end_date, entity_d):
    """Render the `listing` CTE: first shop-list impression per session and shop."""
    return f"""listing AS (
        SELECT   global_entity_id
                ,country
                ,CAST(DATE_TRUNC(injestion_time, MONTH) AS DATE) session_month
                ,CAST(DATE_TRUNC(injestion_time, ISOWEEK) AS DATE) session_week
                ,CAST(DATE_TRUNC(injestion_time, DAY) AS DATE) session_day
                ,EXTRACT(HOUR FROM injestion_time) AS session_hour
                ,session_key
                ,perseus_session_id
                ,chainId
                ,shopId
                ,userId
                ,df_impressions
        FROM (
        SELECT  session_key
                ,COALESCE(pe.global_entity_id, JSON_VALUE(eventVariables_json, "$.globalEntityId") ) AS global_entity_id
                ,platform AS platform
                ,sessionId AS perseus_session_id
                ,userId
                ,ingestion_timestamp injestion_time
                ,country
                ,COALESCE(chainId, JSON_VALUE(eventVariables_json, "$.chainId") ) AS chainId
                ,ROW_NUMBER() OVER (PARTITION BY session_key, shopId ORDER BY ingestion_timestamp) row_num
                ,COALESCE(shopId, JSON_VALUE(eventVariables_json, "$.shopId") ) AS shopId
                ,{_DELIVERY_FEE_RAW_SQL} df_raw
                ,{_DELIVERY_FEE_SQL} df_impressions
        FROM {_PERSEUS_EVENTS_TABLE} pe
        WHERE partition_date BETWEEN "{start_date}" and "{end_date}"
                AND eventAction = 'shop_impressions.loaded'
                AND screenType = 'shop_list'
                AND pe.global_entity_id IN ({entity_d})
                AND locationCity IS NOT NULL
                AND shopType = 'restaurants'
        )
        WHERE row_num = 1
        AND df_raw IS NOT NULL
    )"""


def _funnel_stage_cte(name, event_filter, start_date, end_date, entity_d):
    """Render one downstream funnel CTE: first matching event per session and shop."""
    # ORDER BY uses the ingestion_timestamp column, like the listing CTE. The
    # previous `ORDER BY "timestamp"` was a double-quoted *string literal* in
    # BigQuery, so the "first" row per partition was picked arbitrarily.
    return f"""{name} AS (
        SELECT  global_entity_id
                ,session_key
                ,perseus_session_id
                ,shopId
                ,df_impressions
        FROM (
        SELECT  session_key
                ,COALESCE(global_entity_id, JSON_VALUE(eventVariables_json, "$.globalEntityId") ) AS global_entity_id
                ,platform AS platform
                ,sessionId AS perseus_session_id
                ,ROW_NUMBER() OVER (PARTITION BY session_key, shopId ORDER BY ingestion_timestamp) row_num
                ,COALESCE(shopId, JSON_VALUE(eventVariables_json, "$.shopId") ) AS shopId
                ,{_DELIVERY_FEE_SQL} df_impressions
        FROM {_PERSEUS_EVENTS_TABLE}
        WHERE partition_date BETWEEN "{start_date}" and "{end_date}"
                AND {event_filter}
                AND global_entity_id IN ({entity_d})
        )
        WHERE row_num = 1
    )"""


def _funnel_ctes(start_date, end_date, entity_d):
    """Render the WITH clause (listing, shop_details, checkout, orders, rates) shared by both queries."""
    ctes = [_listing_cte(start_date, end_date, entity_d)]
    ctes.extend(
        _funnel_stage_cte(name, event_filter, start_date, end_date, entity_d)
        for name, event_filter in _FUNNEL_STAGES
    )
    ctes.append(_RATES_CTE_SQL)
    return "WITH " + ",\n    ".join(ctes)


def user_details(start_date, end_date, entity_d):
    """Build the BigQuery SQL for per-user delivery-fee statistics along the funnel.

    The query returns one row per (global_entity_id, userId) with mean, median,
    min, max and standard deviation of the delivery fee (fee divided by
    ``fx_rate_eur``) seen at the listing, details, checkout and order steps,
    plus the number of listed vendors and how many of them reached each later
    step. This function only builds the SQL text; it does not run it.

    Parameters
    ----------
    start_date, end_date : str
        Bounds of the ``partition_date BETWEEN`` filter, e.g. ``"2024-01-01"``.
        They are wrapped in double quotes inside the query.
    entity_d : str
        Ready-made SQL list placed inside ``IN (...)``, e.g. ``"'FP_SG','FP_MY'"``.

    Returns
    -------
    str
        The SQL statement.

    Notes
    -----
    * All three arguments are interpolated into the SQL without escaping; pass
      trusted values only.
    * The order-stage maximum is aliased ``delivery_fee_order_eur_ma`` (sic).
      The misspelling is kept because ``descriptive_stats`` in this file reads
      that exact column name.
    """
    return f"""
    {_funnel_ctes(start_date, end_date, entity_d)}
    SELECT      l.global_entity_id
                ,l.userId
                -- Delivery fee listing calculations
                ,AVG(l.df_impressions / r.fx_rate_eur) AS delivery_fee_listing_eur_mean
                ,APPROX_QUANTILES(l.df_impressions / r.fx_rate_eur, 100)[OFFSET(50)] AS delivery_fee_listing_eur_median
                ,MIN(l.df_impressions / r.fx_rate_eur) AS delivery_fee_listing_eur_min
                ,MAX(l.df_impressions / r.fx_rate_eur) AS delivery_fee_listing_eur_max
                ,STDDEV(l.df_impressions / r.fx_rate_eur) AS delivery_fee_listing_eur_stddev

                -- Delivery fee details page calculations
                ,AVG(sd.df_impressions / r.fx_rate_eur) AS delivery_fee_details_eur_mean
                ,APPROX_QUANTILES(sd.df_impressions / r.fx_rate_eur, 100)[OFFSET(50)] AS delivery_fee_details_eur_median
                ,MIN(sd.df_impressions / r.fx_rate_eur) AS delivery_fee_details_eur_min
                ,MAX(sd.df_impressions / r.fx_rate_eur) AS delivery_fee_details_eur_max
                ,STDDEV(sd.df_impressions / r.fx_rate_eur) AS delivery_fee_details_eur_stddev

                -- Delivery fee checkout calculations
                ,AVG(co.df_impressions / r.fx_rate_eur) AS delivery_fee_checkout_eur_mean
                ,APPROX_QUANTILES(co.df_impressions / r.fx_rate_eur, 100)[OFFSET(50)] AS delivery_fee_checkout_eur_median
                ,MIN(co.df_impressions / r.fx_rate_eur) AS delivery_fee_checkout_eur_min
                ,MAX(co.df_impressions / r.fx_rate_eur) AS delivery_fee_checkout_eur_max
                ,STDDEV(co.df_impressions / r.fx_rate_eur) AS delivery_fee_checkout_eur_stddev

                -- Delivery fee order calculations
                ,AVG(o.df_impressions / r.fx_rate_eur) AS delivery_fee_order_eur_mean
                ,APPROX_QUANTILES(o.df_impressions / r.fx_rate_eur, 100)[OFFSET(50)] AS delivery_fee_order_eur_median
                ,MIN(o.df_impressions / r.fx_rate_eur) AS delivery_fee_order_eur_min
                ,MAX(o.df_impressions / r.fx_rate_eur) AS delivery_fee_order_eur_ma
                ,STDDEV(o.df_impressions / r.fx_rate_eur) AS delivery_fee_order_eur_stddev
                ,COUNT(l.shopId) AS total_vendor
                ,SUM(CASE WHEN sd.shopId IS NOT NULL THEN 1 ELSE 0 END) AS converted_details_page
                ,SUM(CASE WHEN co.shopId IS NOT NULL THEN 1 ELSE 0 END) AS converted_checkout
                ,SUM(CASE WHEN o.shopId IS NOT NULL THEN 1 ELSE 0 END) AS converted_order
    {_FUNNEL_JOINS_SQL}
    GROUP BY 1, 2
    """


def user_conversion(start_date, end_date, entity_d):
    """Build the BigQuery SQL for conversion per entity and delivery-fee bucket.

    The listing delivery fee (divided by ``fx_rate_eur``) is bucketed with
    ``ROUND(x * 5) / 5``, i.e. to the nearest 0.2. Conversion from listing to
    details page, checkout and order is first computed per user and bucket,
    then averaged per (global_entity_id, delivery_fee_listing). Distinct user
    counts per funnel step are returned as well. Buckets with 100 or fewer
    listed vendors are dropped (``vendors > 100``) and the result is sorted by
    entity and bucket. This function only builds the SQL text; it does not run it.

    Parameters
    ----------
    start_date, end_date : str
        Bounds of the ``partition_date BETWEEN`` filter, e.g. ``"2024-01-01"``.
        They are wrapped in double quotes inside the query.
    entity_d : str
        Ready-made SQL list placed inside ``IN (...)``, e.g. ``"'FP_SG','FP_MY'"``.

    Returns
    -------
    str
        The SQL statement.

    Notes
    -----
    All three arguments are interpolated into the SQL without escaping; pass
    trusted values only.
    """
    # SAFE_DIVIDE: COUNT(l.shopId) is 0 for a group whose listing rows all lack
    # a shop id; a plain `/` would abort the whole query with a division error.
    return f"""
    {_funnel_ctes(start_date, end_date, entity_d)},
    impression as (
    SELECT      l.global_entity_id
                ,l.country
                ,l.userId
                ,ROUND((l.df_impressions / r.fx_rate_eur) * 5) / 5 AS delivery_fee_listing
                ,COUNT(l.shopId) AS total_vendor
                ,SUM(CASE WHEN sd.shopId IS NOT NULL THEN 1 ELSE 0 END) AS converted_details_page
                ,SUM(CASE WHEN co.shopId IS NOT NULL THEN 1 ELSE 0 END) AS converted_checkout
                ,SUM(CASE WHEN o.shopId IS NOT NULL THEN 1 ELSE 0 END) AS converted_order
                ,SAFE_DIVIDE(SUM(CASE WHEN sd.shopId IS NOT NULL THEN 1 ELSE 0 END), COUNT(l.shopId)) conversion_details
                ,SAFE_DIVIDE(SUM(CASE WHEN co.shopId IS NOT NULL THEN 1 ELSE 0 END), COUNT(l.shopId)) conversion_checkout
                ,SAFE_DIVIDE(SUM(CASE WHEN o.shopId IS NOT NULL THEN 1 ELSE 0 END), COUNT(l.shopId)) conversion_order
    {_FUNNEL_JOINS_SQL}
    GROUP BY 1, 2, 3, 4
    )
    select *
    from (
    select  global_entity_id
        ,delivery_fee_listing
        ,avg(conversion_details) avg_conversion_details
        ,avg(conversion_checkout) avg_conversion_checkout
        ,avg(conversion_order) avg_conversion_order
        ,sum(total_vendor) vendors
        ,count(distinct userId) total_users
        ,count(distinct case when converted_details_page > 0 then userId end) converted_user_count_details
        ,count(distinct case when converted_checkout > 0 then userId end) converted_user_count_checkout
        ,count(distinct case when converted_order > 0 then userId end) converted_user_count_order
    FROM impression
    group by 1,2
    )
    where vendors > 100
    order by 1, 2
    """


# def user_conversion_funnel(df):
    
#     for i in df['global_entity_id'].unique():

#         df_tmp = df[df['global_entity_id'] == i]

#         # Groupby and aggregate data for conversion counts
#         user_conversion = df_tmp.groupby(['global_entity_id']).agg(
#             converted_user_count_order=('userId', lambda x: x[df_tmp['converted_order'] > 0].nunique()),
#             converted_user_count_details=('userId', lambda x: x[df_tmp['converted_details_page'] > 0].nunique()),
#             converted_user_count_checkout=('userId', lambda x: x[df_tmp['converted_checkout'] > 0].nunique()),
#             total_user_count=('userId', 'nunique')
#         ).reset_index()

#         # Calculate funnel metrics
#         user_conversion['user_listing'] = user_conversion['total_user_count'] / user_conversion['total_user_count']
#         user_conversion['user_listing_to_details'] = user_conversion['converted_user_count_details'] / user_conversion['total_user_count']
#         user_conversion['user_listing_to_checkout'] = user_conversion['converted_user_count_checkout'] / user_conversion['total_user_count']
#         user_conversion['user_listing_to_transaction'] = user_conversion['converted_user_count_order'] / user_conversion['total_user_count']

    
#         # List of conversion stages
#         lst = [
#         round(user_conversion['user_listing'][0] * 100, 0),  # Convert to percentage and round
#         round(user_conversion['user_listing_to_details'][0] * 100, 0),
#         round(user_conversion['user_listing_to_checkout'][0] * 100, 0),
#         round(user_conversion['user_listing_to_transaction'][0] * 100, 0)
#         ]

#         # Data for funnel plot
#         funnel_data = dict(
#             number=lst,
#             stage=["Listing", "Listing_to_details", "Listing_to_checkout", "Listing_to_transaction"]
#         )

#         # Create funnel plot using plotly
#         fig = px.funnel(funnel_data, x='number', y='stage')
#         fig.update_layout(title='User conversion ' + i, width=750,height=400)
#         fig.show()

def user_conversion_funnel(df):
    """Plot a single funnel chart comparing user-level conversion across entities.

    For every ``global_entity_id`` the number of distinct ``userId`` values
    that have at least one row with a positive ``converted_details_page`` /
    ``converted_checkout`` / ``converted_order`` is divided by the number of
    distinct users of that entity, expressed in percent and rounded to a whole
    number. The result is drawn with ``plotly.express.funnel``, one colour per
    entity.

    Parameters
    ----------
    df : pd.DataFrame
        Must contain the columns ``global_entity_id``, ``userId``,
        ``converted_details_page``, ``converted_checkout`` and
        ``converted_order``.

    Returns
    -------
    None
        The figure is shown, not returned. When ``df`` holds no rows a message
        is printed and nothing is plotted.
    """
    # (stage label shown in the chart, flag column; None means "every listed user")
    stages = (
        ('Listing', None),
        ('Listing_to_details', 'converted_details_page'),
        ('Listing_to_checkout', 'converted_checkout'),
        ('Listing_to_transaction', 'converted_order'),
    )

    records = []
    # sort=False keeps entities in order of first appearance.
    for entity_id, entity_rows in df.groupby('global_entity_id', sort=False):
        total_users = entity_rows['userId'].nunique()
        for stage, flag_column in stages:
            if flag_column is None:
                converted_users = total_users
            else:
                converted_users = entity_rows.loc[entity_rows[flag_column] > 0, 'userId'].nunique()
            # nunique() ignores missing ids, so an entity can end up with 0 users.
            if total_users:
                rate = round(converted_users / total_users * 100, 0)
            else:
                rate = float('nan')
            records.append({
                'global_entity_id': entity_id,
                'stage': stage,
                'conversion_rate': rate,
            })

    if not records:
        # px.funnel cannot plot a frame without the expected columns.
        print("user_conversion_funnel: no rows to plot.")
        return

    funnel_data_df = pd.DataFrame(records)

    fig = px.funnel(funnel_data_df, x='conversion_rate', y='stage', color='global_entity_id')
    fig.update_layout(title='User Conversion Funnel Across Entities', width=800, height=600)
    fig.show()


def plot_delivery_fee_boxplot(df, column_list):
    """Draw one boxplot figure per entity comparing the given delivery-fee columns.

    Each column in ``column_list`` becomes one box. Its x-axis label is the
    third underscore-separated part of the column name, capitalised
    (``delivery_fee_listing_eur_mean`` -> ``Listing``); names with fewer than
    three parts are used unchanged.

    Parameters
    ----------
    df : pd.DataFrame
        Must contain ``global_entity_id`` and every column of ``column_list``.
    column_list : list of str
        Columns to compare.

    Raises
    ------
    ValueError
        If a column of ``column_list`` is not present in ``df``.
    """
    def _stage_label(column_name):
        parts = column_name.split('_')
        return parts[2].capitalize() if len(parts) > 2 else column_name

    for entity_id in df['global_entity_id'].unique():
        entity_rows = df[df['global_entity_id'] == entity_id]

        missing_columns = [name for name in column_list if name not in entity_rows.columns]
        if missing_columns:
            raise ValueError(f"Some columns in {column_list} do not exist in the DataFrame for entity {entity_id}")

        labels_by_column = {name: _stage_label(name) for name in column_list}

        # Long format: one row per (stage, fee) pair, as seaborn expects.
        long_form = entity_rows[column_list].melt(var_name='Stage', value_name='Delivery Fee (EUR)')
        long_form['Stage'] = long_form['Stage'].replace(labels_by_column)

        plt.figure(figsize=(10, 6))
        sns.boxplot(data=long_form, x='Stage', y='Delivery Fee (EUR)')
        plt.title(f'{entity_id} Delivery Fees Across Different Stages', fontsize=16)
        plt.show()

def descriptive_stats(df):
    """Display a styled ``describe()`` table for every entity in ``df``.

    The frame is grouped by ``global_entity_id``. For each group the transposed
    ``describe()`` output is reordered so that rows follow a fixed sequence
    (all means, then medians, mins, maxes and standard deviations, each in the
    order listing, details, checkout, order, followed by the vendor and
    conversion counters). Values are formatted with two decimals and thousands
    separators, and the statistic names in the header are upper-cased.

    Parameters
    ----------
    df : pd.DataFrame
        Must contain ``global_entity_id``. Rows of the fixed sequence that have
        no matching column in ``df`` show up as empty (NaN) rows.

    Returns
    -------
    None
        Tables are rendered with ``print`` and ``display``.
    """
    # The query in this file aliases the order-stage maximum as
    # `delivery_fee_order_eur_ma` (sic). Use the correctly spelled column when
    # the frame has it, so fixing the alias upstream does not silently turn
    # this row into NaN.
    if 'delivery_fee_order_eur_max' in df.columns:
        order_max_column = 'delivery_fee_order_eur_max'
    else:
        order_max_column = 'delivery_fee_order_eur_ma'

    stages = ('listing', 'details', 'checkout', 'order')
    statistics = ('mean', 'median', 'min', 'max', 'stddev')

    row_order = [
        f'delivery_fee_{stage}_eur_{statistic}'
        for statistic in statistics
        for stage in stages
    ]
    row_order[row_order.index('delivery_fee_order_eur_max')] = order_max_column
    row_order += ['total_vendor', 'converted_details_page', 'converted_checkout', 'converted_order']

    for name, group in df.groupby('global_entity_id'):
        summary = group.describe().transpose().reindex(row_order)

        styled = (
            summary.style
            .format(precision=2, thousands=",", decimal=".")
            .format_index(str.upper, axis=1)
        )

        print(f"Group: {name}")
        display(styled)


def plot_entity_kde(df, delivery_fee_column, threshold=10, num_cols=4):
    """
    Plots a grid of KDE plots, one per entity, for delivery fees below a threshold.

    Parameters:
    df (pd.DataFrame): The DataFrame containing the data. Must have a
        'global_entity_id' column and the column named by delivery_fee_column.
    delivery_fee_column (str): The column name for delivery fee to plot the KDE.
    threshold (float, optional): Only rows with a fee strictly below this value are plotted. Default is 10.
    num_cols (int, optional): Number of columns in the grid. Default is 4.

    Returns:
    None: Displays the KDE plots for each entity. When no row is below the
        threshold a message is printed and nothing is plotted.
    """
    # Keep only rows whose delivery fee is below the threshold
    filtered_df = df[df[delivery_fee_column] < threshold]

    entities = filtered_df['global_entity_id'].unique()
    num_entities = len(entities)

    if num_entities == 0:
        # A grid with zero rows cannot be created; there is nothing to draw.
        print(f"plot_entity_kde: no rows with {delivery_fee_column} < {threshold}.")
        return

    num_rows = math.ceil(num_entities / num_cols)

    # squeeze=False always yields a 2-D array of axes. With the default, a 1x1
    # grid returns a single Axes object, which has no .flatten().
    fig, axes = plt.subplots(num_rows, num_cols, figsize=(15, num_rows * 5), squeeze=False)
    axes = axes.flatten()

    for ax, entity in zip(axes, entities):
        subset = filtered_df[filtered_df['global_entity_id'] == entity]

        sns.kdeplot(subset[delivery_fee_column], label=f'Entity {entity}', fill=True, ax=ax)

        ax.set_title(f'Entity {entity}')
        ax.legend()

    # Drop the grid cells that did not receive an entity
    for unused_ax in axes[num_entities:]:
        fig.delaxes(unused_ax)

    plt.tight_layout()
    plt.show()


def plot_conversion(df):
    """Show, per entity, a 2x3 grid of conversion rate against delivery fee.

    The top row plots the ``user_conversion_*`` columns and the bottom row the
    ``avg_conversion_*`` columns, each for the details, checkout and order
    step. Every panel holds a scatter plot of the points plus a regression
    line (blue in the top row, orange in the bottom row).

    Parameters
    ----------
    df : pd.DataFrame
        Must contain ``global_entity_id``, ``delivery_fee`` and the six
        conversion columns named above.

    Returns
    -------
    None
        One figure per entity is shown.
    """
    fee_column = 'delivery_fee'

    # (grid position, y column, legend label, regression line colour)
    panels = [
        ((0, 0), 'user_conversion_details', 'Details Conversion', 'blue'),
        ((0, 1), 'user_conversion_checkout', 'Checkout Conversion', 'blue'),
        ((0, 2), 'user_conversion_order', 'Order Conversion', 'blue'),
        ((1, 0), 'avg_conversion_details', 'Avg Details Conversion', 'orange'),
        ((1, 1), 'avg_conversion_checkout', 'Avg Checkout Conversion', 'orange'),
        ((1, 2), 'avg_conversion_order', 'Avg Order Conversion', 'orange'),
    ]

    for entity_id in df['global_entity_id'].unique():
        entity_rows = df[df['global_entity_id'] == entity_id]

        fig, axes = plt.subplots(2, 3, figsize=(14, 10))

        for (row, col), metric, label, line_color in panels:
            panel_ax = axes[row, col]

            # Points first, then the fitted line on the same axes.
            sns.scatterplot(data=entity_rows, x=fee_column, y=metric, ax=panel_ax, label=label)
            sns.regplot(data=entity_rows, x=fee_column, y=metric, scatter=False, ax=panel_ax, color=line_color)

            panel_ax.set_title(f'{label} vs. Delivery Fee')
            panel_ax.set_xlabel('Delivery Fee (€)')
            panel_ax.set_ylabel(f'{label} Rate')

        plt.tight_layout()
        plt.show()

def plot_ch(df, column_list, num_cols=4):
    """
    Plots a grid of boxplots for delivery fees across different stages for each entity,
    keeping only data strictly below the 95th percentile for each column.

    One subplot is drawn per unique value of ``global_entity_id``. Within each
    subplot there is one box per column in ``column_list``; the box label is the
    third underscore-separated token of the column name, capitalized
    (e.g. ``delivery_fee_listing`` -> ``Listing``), or the column name itself
    when it has fewer than three tokens.

    Parameters:
    df (pd.DataFrame): The DataFrame containing the data. Must have a
        ``global_entity_id`` column plus every column in ``column_list``.
    column_list (list): The list of columns representing delivery fees at different stages.
    num_cols (int, optional): Number of columns in the grid. Default is 4.

    Returns:
    None: Displays the boxplots for each entity. Returns without plotting when
        ``df`` contains no entities.

    Raises:
    ValueError: If any column in ``column_list`` is missing from ``df``.
    """
    # Every per-entity slice shares df's columns, so validate once up front
    # instead of once per entity (and before any figure is created).
    missing = [col for col in column_list if col not in df.columns]
    if missing:
        raise ValueError(
            f"Some columns in {column_list} do not exist in the DataFrame: {missing}"
        )

    entities = df['global_entity_id'].unique()
    num_entities = len(entities)
    if num_entities == 0:
        # A grid with zero rows cannot be created; there is nothing to plot.
        return

    num_rows = math.ceil(num_entities / num_cols)

    # squeeze=False guarantees a 2-D array of axes even for a 1x1 grid, where
    # plt.subplots would otherwise return a bare Axes that has no .flatten().
    fig, axes = plt.subplots(
        num_rows, num_cols, figsize=(15, num_rows * 5), squeeze=False
    )
    axes = axes.flatten()

    # Column name -> display label for the x axis; independent of the entity.
    stage_mapping = {
        col: col.split('_')[2].capitalize() if len(col.split('_')) > 2 else col
        for col in column_list
    }

    for ax, entity in zip(axes, entities):
        df_tmp = df[df['global_entity_id'] == entity]

        # Per column, keep only values strictly below that column's own 95th
        # percentile; rows dropped for one column become NaN after alignment.
        df_filtered = df_tmp[column_list].apply(lambda x: x[x < x.quantile(0.95)])

        # Long format: one row per (stage, fee) pair, as seaborn expects.
        df_melted = df_filtered.melt(var_name='Stage', value_name='Delivery Fee (EUR)')
        df_melted['Stage'] = df_melted['Stage'].replace(stage_mapping)

        sns.boxplot(data=df_melted, x='Stage', y='Delivery Fee (EUR)', ax=ax)
        ax.set_title(f'Entity {entity}')

    # Remove the trailing axes that the grid has but no entity occupies.
    for unused_ax in axes[num_entities:]:
        fig.delaxes(unused_ax)

    # Adjust layout to avoid overlap
    plt.tight_layout()
    plt.show()


def plot_conversion(df):
    """
    Plots conversion metrics against the listing delivery fee, one figure per entity.

    For every unique ``global_entity_id`` a 2x3 grid is drawn. Each panel shows a
    scatter plot of one conversion column against ``delivery_fee_listing`` with a
    regression line overlaid. The top row holds the ``user_conversion_*`` columns
    (blue regression line), the bottom row the ``avg_conversion_*`` columns
    (orange regression line), each for the details, checkout and order stages.

    Parameters:
    df (pd.DataFrame): Must contain ``global_entity_id``, ``delivery_fee_listing``,
        ``user_conversion_{details,checkout,order}`` and
        ``avg_conversion_{details,checkout,order}``.

    Returns:
    None: Displays one figure per entity.
    """
    # (y column, label, regression line colour), listed row-major for the 2x3 grid.
    # The label doubles as the legend entry and as the stem of title and y-label.
    panels = [
        ('user_conversion_details', 'Details Conversion', 'blue'),
        ('user_conversion_checkout', 'Checkout Conversion', 'blue'),
        ('user_conversion_order', 'Order Conversion', 'blue'),
        ('avg_conversion_details', 'Avg Details Conversion', 'orange'),
        ('avg_conversion_checkout', 'Avg Checkout Conversion', 'orange'),
        ('avg_conversion_order', 'Avg Order Conversion', 'orange'),
    ]

    for entity in df['global_entity_id'].unique():
        df_tmp = df[df['global_entity_id'] == entity]

        # Create a figure with six subplots (2x3 layout)
        fig, axes = plt.subplots(2, 3, figsize=(14, 10))

        for ax, (metric, label, line_color) in zip(axes.flat, panels):
            sns.scatterplot(data=df_tmp, x='delivery_fee_listing', y=metric, ax=ax, label=label)
            sns.regplot(data=df_tmp, x='delivery_fee_listing', y=metric, scatter=False, ax=ax, color=line_color)
            # f-string formatting so non-string entity ids do not raise TypeError.
            ax.set_title(f'{label} vs. Delivery Fee {entity}')
            ax.set_xlabel('Delivery Fee (€)')
            ax.set_ylabel(f'{label} Rate')

        # Adjust layout and show plot
        plt.tight_layout()
        plt.show()
        # Release the figure so looping over many entities does not accumulate open figures.
        plt.close(fig)


  WHERE partition_date BETWEEN \"""" + start_date + """\" and \"""" + end_date + """\"
  AND pe.global_entity_id IN (""" + entity_d + """)
  AND global_entity_id IN (""" + entity_d + """)
  AND global_entity_id IN (""" + entity_d + """)
  WHERE partition_date BETWEEN \"""" + start_date + """\" and \"""" + end_date + """\"
  AND pe.global_entity_id IN (""" + entity_d + """)
  AND global_entity_id IN (""" + entity_d + """)
  AND global_entity_id IN (""" + entity_d + """)


In [3]:
# Analysis window (inclusive, ISO dates), substituted into the query's
# `partition_date BETWEEN start AND end` filter.
start_date = '2025-01-15'
# Was '2024-02-15', i.e. before start_date, which makes the BETWEEN filter
# match no partitions. NOTE(review): assumed the year was the typo — confirm
# the intended window.
end_date = '2025-02-15'
# ISO-formatted date strings order correctly under plain string comparison.
assert start_date <= end_date, "start_date must not be after end_date"

# entity_id = 'DJ_CZ','FO_NO','MJM_AT','NP_HU','OP_SE','PO_FI','YS_TR','EF_GR','FY_CY','FP_BD','FP_HK','FP_KH','FP_LA','FP_MM','FP_MY','FP_PH','FP_PK','FP_SG','FP_TH','FP_TW','HS_SA','AP_PA','PY_AR','PY_BO','PY_CL','PY_CR','PY_DO','PY_EC','PY_GT','PY_HN','PY_NI','PY_PE','PY_PY','PY_SV','PY_UY','PY_VE','HF_EG','TB_AE',
# 'TB_BH','TB_IQ','TB_JO','TB_KW','TB_OM','TB_QA'

# Entities to analyse, rendered as a quoted, comma-separated list for a SQL IN (...) clause.
entity_id = ('TB_OM','DJ_CZ')
entity_id_str = ",".join([f"'{entity}'" for entity in entity_id])

# PROJECT_ID = "logistics-customer-staging"
# bpd.options.bigquery.project = PROJECT_ID

# define a few things (project id, start date etc.)
project_id = "logistics-customer-staging"
client = bigquery.Client(project = project_id)

# Run the conversion query and pull the full result into a pandas DataFrame.
user_conversion_df = client.query(user_conversion(start_date, end_date,entity_id_str)).to_dataframe()
#user_summary_df = client.query(user_details(start_date, end_date,entity_id_str)).to_dataframe()

# user_conversion_df = bpd.read_gbq(user_conversion(start_date, end_date,entity_id_str))
# user_summary_df = bpd.read_gbq(user_details(start_date, end_date,entity_id_str))

# Conversion rate per funnel stage = converted users at that stage / total users.
for stage in ('details', 'checkout', 'order'):
    user_conversion_df[f'user_conversion_{stage}'] = (
        user_conversion_df[f'converted_user_count_{stage}'] / user_conversion_df['total_users']
    )



BadRequest: 400 GET https://bigquery.googleapis.com/bigquery/v2/projects/logistics-customer-staging/queries/0ba426ae-cd03-4694-ab2d-604d30f5ec2c?maxResults=0&location=US&prettyPrint=false: Resources exceeded during query execution: The query could not be executed in the allotted memory. Peak usage: 129% of limit.
Top memory consumer(s):
  query parsing and optimization: 93%
  other/unattributed: 7%


Location: US
Job ID: 0ba426ae-cd03-4694-ab2d-604d30f5ec2c
