In [16]:
import numpy as np
import pandas as pd
from google.cloud import bigquery
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import stats
import plotly.express as px
import db_dtypes
import bigframes.pandas as bpd
from IPython.display import display, HTML
import plotly.express as px
import seaborn as sns
import matplotlib.pyplot as plt
import math
import statsmodels.api as sm


def user_details(start_date, end_date,entity_d):

    query = f"""
    WITH listing AS (    
        SELECT   global_entity_id
                ,country
                ,CAST(DATE_TRUNC(injestion_time, MONTH) AS DATE) session_month
                ,CAST(DATE_TRUNC(injestion_time, ISOWEEK) AS DATE) session_week
                ,CAST(DATE_TRUNC(injestion_time, DAY) AS DATE) session_day
                ,EXTRACT(HOUR FROM injestion_time) AS session_hour
                ,session_key 
                ,perseus_session_id
                ,chainId 
                ,shopId
                ,userId
                ,df_impressions 
        FROM (
        SELECT  session_key
                ,COALESCE(pe.global_entity_id, JSON_VALUE(eventVariables_json, "$.globalEntityId") ) AS global_entity_id
                ,platform AS platform
                ,sessionId AS perseus_session_id
                ,userId
                ,ingestion_timestamp injestion_time
                ,country
                ,COALESCE(chainId, JSON_VALUE(eventVariables_json, "$.chainId") ) AS chainId
                ,ROW_NUMBER() OVER (PARTITION BY session_key, shopId ORDER BY ingestion_timestamp) row_num
                ,COALESCE(shopId, JSON_VALUE(eventVariables_json, "$.shopId") ) AS shopId
                ,COALESCE(JSON_VALUE(eventVariables_json, "$.vendorDeliveryFee"), JSON_VALUE(eventVariables_json, "$.shopDeliveryFee")) df_raw
                ,COALESCE(CAST(NULLIF(REGEXP_EXTRACT(COALESCE(JSON_VALUE(eventVariables_json, "$.vendorDeliveryFee"), JSON_VALUE(eventVariables_json, "$.shopDeliveryFee")), r'([0-9]+\.?[0-9]*)'), '') AS FLOAT64), 0) df_impressions
        FROM `fulfillment-dwh-production.curated_data_shared_coredata_tracking.perseus_events` pe
        WHERE partition_date BETWEEN \"""" + start_date + """\" and \"""" + end_date + """\"
                AND eventAction = 'shop_impressions.loaded'
                AND screenType = 'shop_list'
                AND pe.global_entity_id IN (""" + entity_d + """)
                AND locationCity IS NOT NULL
                AND shopType = 'restaurants'
        )
        WHERE row_num = 1
        AND df_raw IS NOT NULL
        ORDER BY global_entity_id, session_key
    ), shop_details AS (
        SELECT  global_entity_id
                ,session_key
                ,perseus_session_id
                ,shopId
                ,df_impressions
        FROM (
        SELECT  session_key
                ,COALESCE(global_entity_id, JSON_VALUE(eventVariables_json, "$.globalEntityId") ) AS global_entity_id
                ,platform AS platform
                ,sessionId AS perseus_session_id
                ,ROW_NUMBER() OVER (PARTITION BY session_key, shopId ORDER BY "timestamp") row_num
                ,COALESCE(shopId, JSON_VALUE(eventVariables_json, "$.shopId") ) AS shopId
                ,COALESCE(CAST(NULLIF(REGEXP_EXTRACT(COALESCE(JSON_VALUE(eventVariables_json, "$.vendorDeliveryFee"), JSON_VALUE(eventVariables_json, "$.shopDeliveryFee")), r'([0-9]+\.?[0-9]*)'), '') AS FLOAT64), 0) df_impressions
        FROM `fulfillment-dwh-production.curated_data_shared_coredata_tracking.perseus_events`
        WHERE partition_date BETWEEN \"""" + start_date + """\" and \"""" + end_date + """\"
                AND eventAction = 'shop_details.loaded'
                AND global_entity_id IN (""" + entity_d + """)
        )
        WHERE row_num = 1
        ORDER BY global_entity_id, session_key
    ), checkout AS (
        SELECT  global_entity_id
                ,session_key
                ,perseus_session_id
                ,shopId
                ,df_impressions
        FROM (
        SELECT  session_key
                ,COALESCE(global_entity_id, JSON_VALUE(eventVariables_json, "$.globalEntityId") ) AS global_entity_id
                ,platform AS platform
                ,sessionId AS perseus_session_id
                ,ROW_NUMBER() OVER (PARTITION BY session_key, shopId ORDER BY "timestamp") row_num
                ,COALESCE(shopId, JSON_VALUE(eventVariables_json, "$.shopId") ) AS shopId
                ,COALESCE(CAST(NULLIF(REGEXP_EXTRACT(COALESCE(JSON_VALUE(eventVariables_json, "$.vendorDeliveryFee"), JSON_VALUE(eventVariables_json, "$.shopDeliveryFee")), r'([0-9]+\.?[0-9]*)'), '') AS FLOAT64), 0) df_impressions
        FROM `fulfillment-dwh-production.curated_data_shared_coredata_tracking.perseus_events`
        WHERE partition_date BETWEEN \"""" + start_date + """\" and \"""" + end_date + """\"
                AND eventAction = 'checkout.loaded'
                AND global_entity_id IN (""" + entity_d + """)
        )
        WHERE row_num = 1
        ORDER BY global_entity_id, session_key
    ), orders AS (
        SELECT  global_entity_id
                ,session_key
                ,perseus_session_id
                ,shopId
                ,df_impressions
        FROM (
        SELECT  session_key
                ,COALESCE(global_entity_id, JSON_VALUE(eventVariables_json, "$.globalEntityId") ) AS global_entity_id
                ,platform AS platform
                ,sessionId AS perseus_session_id
                ,ROW_NUMBER() OVER (PARTITION BY session_key, shopId ORDER BY "timestamp") row_num
                ,COALESCE(shopId, JSON_VALUE(eventVariables_json, "$.shopId") ) AS shopId
                ,COALESCE(CAST(NULLIF(REGEXP_EXTRACT(COALESCE(JSON_VALUE(eventVariables_json, "$.vendorDeliveryFee"), JSON_VALUE(eventVariables_json, "$.shopDeliveryFee")), r'([0-9]+\.?[0-9]*)'), '') AS FLOAT64), 0) df_impressions
        FROM `fulfillment-dwh-production.curated_data_shared_coredata_tracking.perseus_events`
        WHERE partition_date BETWEEN \"""" + start_date + """\" and \"""" + end_date + """\"
                AND eventAction IN ('transaction')
                AND global_entity_id IN (""" + entity_d + """)
        )
        WHERE row_num = 1
        ORDER BY global_entity_id, session_key
    ), rates AS (
        SELECT cu.country_iso
            ,cu.currency_code
            ,tmp.fx_rate_eur
        FROM `fulfillment-dwh-production.cl.countries` cu
        JOIN ( 
            WITH latest_fx_rate AS (
            SELECT 
                currency_code,
                fx_rate_eur,
                calculated_at,
                ROW_NUMBER() OVER (PARTITION BY currency_code ORDER BY calculated_at DESC) AS rn
            FROM `fulfillment-dwh-production.curated_data_shared_coredata.fx_rates`
            )
            SELECT 
                currency_code,
                fx_rate_eur,
                calculated_at AS max_calculated_at
            FROM latest_fx_rate
            WHERE rn = 1
        ) tmp ON tmp.currency_code = cu.currency_code
        GROUP BY 1, 2, 3
        ORDER BY 1
    )
    SELECT      l.global_entity_id
                ,l.userId
                -- Delivery fee listing calculations
                ,AVG(l.df_impressions / r.fx_rate_eur) AS delivery_fee_listing_eur_mean
                ,APPROX_QUANTILES(l.df_impressions / r.fx_rate_eur, 100)[OFFSET(50)] AS delivery_fee_listing_eur_median
                ,MIN(l.df_impressions / r.fx_rate_eur) AS delivery_fee_listing_eur_min
                ,MAX(l.df_impressions / r.fx_rate_eur) AS delivery_fee_listing_eur_max
                ,STDDEV(l.df_impressions / r.fx_rate_eur) AS delivery_fee_listing_eur_stddev
                
                -- Delivery fee details page calculations
                ,AVG(sd.df_impressions / r.fx_rate_eur) AS delivery_fee_details_eur_mean
                ,APPROX_QUANTILES(sd.df_impressions / r.fx_rate_eur, 100)[OFFSET(50)] AS delivery_fee_details_eur_median
                ,MIN(sd.df_impressions / r.fx_rate_eur) AS delivery_fee_details_eur_min
                ,MAX(sd.df_impressions / r.fx_rate_eur) AS delivery_fee_details_eur_max
                ,STDDEV(sd.df_impressions / r.fx_rate_eur) AS delivery_fee_details_eur_stddev
                
                -- Delivery fee checkout calculations
                ,AVG(co.df_impressions / r.fx_rate_eur) AS delivery_fee_checkout_eur_mean
                ,APPROX_QUANTILES(co.df_impressions / r.fx_rate_eur, 100)[OFFSET(50)] AS delivery_fee_checkout_eur_median
                ,MIN(co.df_impressions / r.fx_rate_eur) AS delivery_fee_checkout_eur_min
                ,MAX(co.df_impressions / r.fx_rate_eur) AS delivery_fee_checkout_eur_max
                ,STDDEV(co.df_impressions / r.fx_rate_eur) AS delivery_fee_checkout_eur_stddev
                
                -- Delivery fee order calculations
                ,AVG(o.df_impressions / r.fx_rate_eur) AS delivery_fee_order_eur_mean
                ,APPROX_QUANTILES(o.df_impressions / r.fx_rate_eur, 100)[OFFSET(50)] AS delivery_fee_order_eur_median
                ,MIN(o.df_impressions / r.fx_rate_eur) AS delivery_fee_order_eur_min
                ,MAX(o.df_impressions / r.fx_rate_eur) AS delivery_fee_order_eur_ma
                ,STDDEV(o.df_impressions / r.fx_rate_eur) AS delivery_fee_order_eur_stddev
                ,COUNT(l.shopId) AS total_vendor
                ,SUM(CASE WHEN sd.shopId IS NOT NULL THEN 1 ELSE 0 END) AS converted_details_page
                ,SUM(CASE WHEN co.shopId IS NOT NULL THEN 1 ELSE 0 END) AS converted_checkout
                ,SUM(CASE WHEN o.shopId IS NOT NULL THEN 1 ELSE 0 END) AS converted_order
    FROM listing l
    LEFT JOIN shop_details sd ON sd.global_entity_id = l.global_entity_id AND sd.session_key = l.session_key AND sd.shopId = l.shopId
    LEFT JOIN checkout co ON co.global_entity_id = l.global_entity_id AND co.session_key = l.session_key AND co.shopId = l.shopId
    LEFT JOIN orders o ON o.global_entity_id = l.global_entity_id AND o.session_key = l.session_key AND o.shopId = l.shopId
    LEFT JOIN rates r ON l.country = r.country_iso
    GROUP BY 1, 2
    """

    return query



def user_conversion(start_date, end_date,entity_d):

    query = f"""
    WITH listing AS (    
        SELECT   global_entity_id
                ,country
                ,CAST(DATE_TRUNC(injestion_time, MONTH) AS DATE) session_month
                ,CAST(DATE_TRUNC(injestion_time, ISOWEEK) AS DATE) session_week
                ,CAST(DATE_TRUNC(injestion_time, DAY) AS DATE) session_day
                ,EXTRACT(HOUR FROM injestion_time) AS session_hour
                ,session_key 
                ,perseus_session_id
                ,chainId 
                ,shopId
                ,userId
                ,df_impressions 
        FROM (
        SELECT  session_key
                ,COALESCE(pe.global_entity_id, JSON_VALUE(eventVariables_json, "$.globalEntityId") ) AS global_entity_id
                ,platform AS platform
                ,sessionId AS perseus_session_id
                ,userId
                ,ingestion_timestamp injestion_time
                ,country
                ,COALESCE(chainId, JSON_VALUE(eventVariables_json, "$.chainId") ) AS chainId
                ,ROW_NUMBER() OVER (PARTITION BY session_key, shopId ORDER BY ingestion_timestamp) row_num
                ,COALESCE(shopId, JSON_VALUE(eventVariables_json, "$.shopId") ) AS shopId
                ,COALESCE(JSON_VALUE(eventVariables_json, "$.vendorDeliveryFee"), JSON_VALUE(eventVariables_json, "$.shopDeliveryFee")) df_raw
                ,COALESCE(CAST(NULLIF(REGEXP_EXTRACT(COALESCE(JSON_VALUE(eventVariables_json, "$.vendorDeliveryFee"), JSON_VALUE(eventVariables_json, "$.shopDeliveryFee")), r'([0-9]+\.?[0-9]*)'), '') AS FLOAT64), 0) df_impressions
        FROM `fulfillment-dwh-production.curated_data_shared_coredata_tracking.perseus_events` pe
        WHERE partition_date BETWEEN \"""" + start_date + """\" and \"""" + end_date + """\"
                AND eventAction = 'shop_impressions.loaded'
                AND screenType = 'shop_list'
                AND pe.global_entity_id IN (""" + entity_d + """)
                AND locationCity IS NOT NULL
                AND shopType = 'restaurants'
        )
        WHERE row_num = 1
        AND df_raw IS NOT NULL
        ORDER BY global_entity_id, session_key
    ), shop_details AS (
        SELECT  global_entity_id
                ,session_key
                ,perseus_session_id
                ,shopId
                ,df_impressions
        FROM (
        SELECT  session_key
                ,COALESCE(global_entity_id, JSON_VALUE(eventVariables_json, "$.globalEntityId") ) AS global_entity_id
                ,platform AS platform
                ,sessionId AS perseus_session_id
                ,ROW_NUMBER() OVER (PARTITION BY session_key, shopId ORDER BY "timestamp") row_num
                ,COALESCE(shopId, JSON_VALUE(eventVariables_json, "$.shopId") ) AS shopId
                ,COALESCE(CAST(NULLIF(REGEXP_EXTRACT(COALESCE(JSON_VALUE(eventVariables_json, "$.vendorDeliveryFee"), JSON_VALUE(eventVariables_json, "$.shopDeliveryFee")), r'([0-9]+\.?[0-9]*)'), '') AS FLOAT64), 0) df_impressions
        FROM `fulfillment-dwh-production.curated_data_shared_coredata_tracking.perseus_events`
        WHERE partition_date BETWEEN \"""" + start_date + """\" and \"""" + end_date + """\"
                AND eventAction = 'shop_details.loaded'
                AND global_entity_id IN (""" + entity_d + """)
        )
        WHERE row_num = 1
        ORDER BY global_entity_id, session_key
    ), checkout AS (
        SELECT  global_entity_id
                ,session_key
                ,perseus_session_id
                ,shopId
                ,df_impressions
        FROM (
        SELECT  session_key
                ,COALESCE(global_entity_id, JSON_VALUE(eventVariables_json, "$.globalEntityId") ) AS global_entity_id
                ,platform AS platform
                ,sessionId AS perseus_session_id
                ,ROW_NUMBER() OVER (PARTITION BY session_key, shopId ORDER BY "timestamp") row_num
                ,COALESCE(shopId, JSON_VALUE(eventVariables_json, "$.shopId") ) AS shopId
                ,COALESCE(CAST(NULLIF(REGEXP_EXTRACT(COALESCE(JSON_VALUE(eventVariables_json, "$.vendorDeliveryFee"), JSON_VALUE(eventVariables_json, "$.shopDeliveryFee")), r'([0-9]+\.?[0-9]*)'), '') AS FLOAT64), 0) df_impressions
        FROM `fulfillment-dwh-production.curated_data_shared_coredata_tracking.perseus_events`
        WHERE partition_date BETWEEN \"""" + start_date + """\" and \"""" + end_date + """\"
                AND eventAction = 'checkout.loaded'
                AND global_entity_id IN (""" + entity_d + """)
        )
        WHERE row_num = 1
        ORDER BY global_entity_id, session_key
    ), orders AS (
        SELECT  global_entity_id
                ,session_key
                ,perseus_session_id
                ,shopId
                ,df_impressions
        FROM (
        SELECT  session_key
                ,COALESCE(global_entity_id, JSON_VALUE(eventVariables_json, "$.globalEntityId") ) AS global_entity_id
                ,platform AS platform
                ,sessionId AS perseus_session_id
                ,ROW_NUMBER() OVER (PARTITION BY session_key, shopId ORDER BY "timestamp") row_num
                ,COALESCE(shopId, JSON_VALUE(eventVariables_json, "$.shopId") ) AS shopId
                ,COALESCE(CAST(NULLIF(REGEXP_EXTRACT(COALESCE(JSON_VALUE(eventVariables_json, "$.vendorDeliveryFee"), JSON_VALUE(eventVariables_json, "$.shopDeliveryFee")), r'([0-9]+\.?[0-9]*)'), '') AS FLOAT64), 0) df_impressions
        FROM `fulfillment-dwh-production.curated_data_shared_coredata_tracking.perseus_events`
        WHERE partition_date BETWEEN \"""" + start_date + """\" and \"""" + end_date + """\"
                AND eventAction IN ('transaction')
                AND global_entity_id IN (""" + entity_d + """)
        )
        WHERE row_num = 1
        ORDER BY global_entity_id, session_key
    ), rates AS (
        SELECT cu.country_iso
            ,cu.currency_code
            ,tmp.fx_rate_eur
        FROM `fulfillment-dwh-production.cl.countries` cu
        JOIN ( 
            WITH latest_fx_rate AS (
            SELECT 
                currency_code,
                fx_rate_eur,
                calculated_at,
                ROW_NUMBER() OVER (PARTITION BY currency_code ORDER BY calculated_at DESC) AS rn
            FROM `fulfillment-dwh-production.curated_data_shared_coredata.fx_rates`
            )
            SELECT 
                currency_code,
                fx_rate_eur,
                calculated_at AS max_calculated_at
            FROM latest_fx_rate
            WHERE rn = 1
        ) tmp ON tmp.currency_code = cu.currency_code
        GROUP BY 1, 2, 3
        ORDER BY 1
    ), impression as (
    SELECT       l.global_entity_id
                ,l.country
                ,l.userId
                ,ROUND((l.df_impressions / r.fx_rate_eur) * 5) / 5 AS delivery_fee_listing
                ,ROUND((sd.df_impressions / r.fx_rate_eur) * 5) / 5 AS delivery_fee_details
                ,ROUND((co.df_impressions / r.fx_rate_eur) * 5) / 5 AS delivery_fee_checkout
                ,COUNT(l.shopId) AS total_vendor
                ,SUM(CASE WHEN sd.shopId IS NOT NULL THEN 1 ELSE 0 END) AS converted_details_page
                ,SUM(CASE WHEN co.shopId IS NOT NULL THEN 1 ELSE 0 END) AS converted_checkout
                ,SUM(CASE WHEN o.shopId IS NOT NULL THEN 1 ELSE 0 END) AS converted_order
                ,SUM(CASE WHEN sd.shopId IS NOT NULL THEN 1 ELSE 0 END) / COUNT(l.shopId) conversion_details
                ,SUM(CASE WHEN co.shopId IS NOT NULL THEN 1 ELSE 0 END) / COUNT(l.shopId) conversion_checkout
                ,SUM(CASE WHEN o.shopId IS NOT NULL THEN 1 ELSE 0 END) / COUNT(l.shopId) conversion_order
    FROM listing l
    LEFT JOIN shop_details sd ON sd.global_entity_id = l.global_entity_id AND sd.session_key = l.session_key AND sd.shopId = l.shopId
    LEFT JOIN checkout co ON co.global_entity_id = l.global_entity_id AND co.session_key = l.session_key AND co.shopId = l.shopId
    LEFT JOIN orders o ON o.global_entity_id = l.global_entity_id AND o.session_key = l.session_key AND o.shopId = l.shopId
    LEFT JOIN rates r ON l.country = r.country_iso
    GROUP BY 1, 2, 3, 4, 5, 6
    )
    select * 
    from (
    select  global_entity_id
        ,delivery_fee_listing
        ,delivery_fee_details
        ,delivery_fee_checkout
        ,avg(conversion_details) avg_conversion_details
        ,avg(conversion_checkout) avg_conversion_checkout
        ,avg(conversion_order) avg_conversion_order
        ,sum(total_vendor) vendors
        ,count(distinct userId) total_users
        ,count(distinct case when converted_details_page > 0 then userId end) converted_user_count_details
        ,count(distinct case when converted_checkout > 0 then userId end) converted_user_count_checkout
        ,count(distinct case when converted_order > 0 then userId end) converted_user_count_order
    FROM impression
    group by 1,2,3,4
    order by 1,2
    )
    
    where vendors > 100
    """

    return query


# def user_conversion_funnel(df):
    
#     for i in df['global_entity_id'].unique():

#         df_tmp = df[df['global_entity_id'] == i]

#         # Groupby and aggregate data for conversion counts
#         user_conversion = df_tmp.groupby(['global_entity_id']).agg(
#             converted_user_count_order=('userId', lambda x: x[df_tmp['converted_order'] > 0].nunique()),
#             converted_user_count_details=('userId', lambda x: x[df_tmp['converted_details_page'] > 0].nunique()),
#             converted_user_count_checkout=('userId', lambda x: x[df_tmp['converted_checkout'] > 0].nunique()),
#             total_user_count=('userId', 'nunique')
#         ).reset_index()

#         # Calculate funnel metrics
#         user_conversion['user_listing'] = user_conversion['total_user_count'] / user_conversion['total_user_count']
#         user_conversion['user_listing_to_details'] = user_conversion['converted_user_count_details'] / user_conversion['total_user_count']
#         user_conversion['user_listing_to_checkout'] = user_conversion['converted_user_count_checkout'] / user_conversion['total_user_count']
#         user_conversion['user_listing_to_transaction'] = user_conversion['converted_user_count_order'] / user_conversion['total_user_count']

    
#         # List of conversion stages
#         lst = [
#         round(user_conversion['user_listing'][0] * 100, 0),  # Convert to percentage and round
#         round(user_conversion['user_listing_to_details'][0] * 100, 0),
#         round(user_conversion['user_listing_to_checkout'][0] * 100, 0),
#         round(user_conversion['user_listing_to_transaction'][0] * 100, 0)
#         ]

#         # Data for funnel plot
#         funnel_data = dict(
#             number=lst,
#             stage=["Listing", "Listing_to_details", "Listing_to_checkout", "Listing_to_transaction"]
#         )

#         # Create funnel plot using plotly
#         fig = px.funnel(funnel_data, x='number', y='stage')
#         fig.update_layout(title='User conversion ' + i, width=750,height=400)
#         fig.show()

def user_conversion_funnel(df):
    
    # Create a list to store data for all entities
    funnel_data_list = []
    
    for i in df['global_entity_id'].unique():

        df_tmp = df[df['global_entity_id'] == i]

        # Groupby and aggregate data for conversion counts
        user_conversion = df_tmp.groupby(['global_entity_id']).agg(
            converted_user_count_order=('userId', lambda x: x[df_tmp['converted_order'] > 0].nunique()),
            converted_user_count_details=('userId', lambda x: x[df_tmp['converted_details_page'] > 0].nunique()),
            converted_user_count_checkout=('userId', lambda x: x[df_tmp['converted_checkout'] > 0].nunique()),
            total_user_count=('userId', 'nunique')
        ).reset_index()

        # Calculate funnel metrics
        user_conversion['user_listing'] = user_conversion['total_user_count'] / user_conversion['total_user_count']
        user_conversion['user_listing_to_details'] = user_conversion['converted_user_count_details'] / user_conversion['total_user_count']
        user_conversion['user_listing_to_checkout'] = user_conversion['converted_user_count_checkout'] / user_conversion['total_user_count']
        user_conversion['user_listing_to_transaction'] = user_conversion['converted_user_count_order'] / user_conversion['total_user_count']

        # Append the conversion stages for the current entity to the list
        funnel_data_list.append({
            'global_entity_id': i,
            'stage': 'Listing',
            'conversion_rate': round(user_conversion['user_listing'][0] * 100, 0)
        })
        funnel_data_list.append({
            'global_entity_id': i,
            'stage': 'Listing_to_details',
            'conversion_rate': round(user_conversion['user_listing_to_details'][0] * 100, 0)
        })
        funnel_data_list.append({
            'global_entity_id': i,
            'stage': 'Listing_to_checkout',
            'conversion_rate': round(user_conversion['user_listing_to_checkout'][0] * 100, 0)
        })
        funnel_data_list.append({
            'global_entity_id': i,
            'stage': 'Listing_to_transaction',
            'conversion_rate': round(user_conversion['user_listing_to_transaction'][0] * 100, 0)
        })

    # Create a DataFrame from the list
    funnel_data_df = pd.DataFrame(funnel_data_list)

    # Create a funnel plot for all entities
    fig = px.funnel(funnel_data_df, x='conversion_rate', y='stage', color='global_entity_id')
    fig.update_layout(title='User Conversion Funnel Across Entities', width=800, height=600)
    fig.show()


def plot_delivery_fee_boxplot(df, column_list):
    
    for i in df['global_entity_id'].unique():
        
        df_tmp = df[df['global_entity_id'] == i]
        
        if not all(col in df_tmp.columns for col in column_list):
            raise ValueError(f"Some columns in {column_list} do not exist in the DataFrame for entity {i}")
        
        stage_mapping = {col: col.split('_')[2].capitalize() if len(col.split('_')) > 2 else col for col in column_list}
        
        delivery_fee_mean = df_tmp[column_list]
        
        df_melted = delivery_fee_mean.melt(var_name='Stage', value_name='Delivery Fee (EUR)')
        
        df_melted['Stage'] = df_melted['Stage'].replace(stage_mapping)
        
        plt.figure(figsize=(10, 6))
        sns.boxplot(data=df_melted, x='Stage', y='Delivery Fee (EUR)')
        
        plt.title(f'{i} Delivery Fees Across Different Stages', fontsize=16)
        
        plt.show()

def descriptive_stats(df):
    # Group the data by 'global_entity_id'
    grouped = df.groupby('global_entity_id') 

    # Create an empty dictionary to store the descriptive statistics
    grouped_descriptive_stats = {}

    # Define the manual order for sorting
    manual_order = [
        'delivery_fee_listing_eur_mean', 'delivery_fee_details_eur_mean', 'delivery_fee_checkout_eur_mean', 'delivery_fee_order_eur_mean',
        'delivery_fee_listing_eur_median', 'delivery_fee_details_eur_median', 'delivery_fee_checkout_eur_median', 'delivery_fee_order_eur_median',
        'delivery_fee_listing_eur_min', 'delivery_fee_details_eur_min', 'delivery_fee_checkout_eur_min', 'delivery_fee_order_eur_min',
        'delivery_fee_listing_eur_max', 'delivery_fee_details_eur_max', 'delivery_fee_checkout_eur_max', 'delivery_fee_order_eur_ma',
        'delivery_fee_listing_eur_stddev', 'delivery_fee_details_eur_stddev', 'delivery_fee_checkout_eur_stddev', 'delivery_fee_order_eur_stddev',
        'total_vendor', 'converted_details_page', 'converted_checkout', 'converted_order'
    ]

    # Loop through each group and calculate descriptive statistics
    for name, group in grouped:
        descriptive_stats = group.describe().transpose()

        # Reorder the DataFrame based on the manual order
        descriptive_stats = descriptive_stats.reindex(manual_order)

        # Style the descriptive statistics for better readability
        styled = descriptive_stats.style \
                        .format(precision=2, thousands=",", decimal=".") \
                        .format_index(str.upper, axis=1)
        
        grouped_descriptive_stats[name] = styled

    # Display each sorted group's styled descriptive statistics
    for name, styled_df in grouped_descriptive_stats.items():
        print(f"Group: {name}")
        display(styled_df)


def plot_entity_kde(df, delivery_fee_column, threshold=10, num_cols=4):
    """
    Plots a grid of KDE plots for each unique entity in the dataset filtered by a delivery fee threshold.
    
    Parameters:
    user_summary_df (pd.DataFrame): The DataFrame containing the data.
    delivery_fee_column (str): The column name for delivery fee to plot the KDE.
    threshold (float, optional): The threshold for filtering the delivery fees. Default is 10.
    num_cols (int, optional): Number of columns in the grid. Default is 4.
    
    Returns:
    None: Displays the KDE plots for each entity.
    """
    # Filter the dataset where the delivery fee is less than the specified threshold
    filtered_df = df[df[delivery_fee_column] < threshold]

    # Get the unique entity IDs
    entities = filtered_df['global_entity_id'].unique()

    # Define the number of rows for the grid based on the number of entities and columns
    num_entities = len(entities)
    num_rows = math.ceil(num_entities / num_cols)

    # Create a grid of subplots
    fig, axes = plt.subplots(num_rows, num_cols, figsize=(15, num_rows * 5))

    # Flatten the axes array for easy iteration
    axes = axes.flatten()

    # Iterate over each entity and plot a KDE for its values
    for i, entity in enumerate(entities):
        subset = filtered_df[filtered_df['global_entity_id'] == entity]  # Filter data for each entity
        
        # Plot KDE for the entity on the corresponding subplot
        sns.kdeplot(subset[delivery_fee_column], label=f'Entity {entity}', fill=True, ax=axes[i])
        
        # Set title and legend
        axes[i].set_title(f'Entity {entity}')
        axes[i].legend()

    # Remove any unused subplots if the number of entities is less than the grid size
    for j in range(i + 1, len(axes)):
        fig.delaxes(axes[j])

    # Adjust layout to avoid overlap
    plt.tight_layout()
    plt.show()


def plot_conversion(df):
        
    for i in df['global_entity_id'].unique():

            df_tmp = df[df['global_entity_id'] == i]
    
            # Create a figure with four subplots (2x2 layout)
            fig, axes = plt.subplots(2, 3, figsize=(14, 10))

            # Plot user_conversion_details as scatter plot with regression
            sns.scatterplot(data=df_tmp, x='delivery_fee', y='user_conversion_details', ax=axes[0, 0], label='Details Conversion')
            sns.regplot(data=df_tmp, x='delivery_fee', y='user_conversion_details', scatter=False, ax=axes[0, 0], color='blue')
            axes[0, 0].set_title('Details Conversion vs. Delivery Fee')
            axes[0, 0].set_xlabel('Delivery Fee (€)')
            axes[0, 0].set_ylabel('Details Conversion Rate')

            # Plot user_conversion_checkout as scatter plot with regression
            sns.scatterplot(data=df_tmp, x='delivery_fee', y='user_conversion_checkout', ax=axes[0, 1], label='Checkout Conversion')
            sns.regplot(data=df_tmp, x='delivery_fee', y='user_conversion_checkout', scatter=False, ax=axes[0, 1], color='blue')
            axes[0, 1].set_title('Checkout Conversion vs. Delivery Fee')
            axes[0, 1].set_xlabel('Delivery Fee (€)')
            axes[0, 1].set_ylabel('Checkout Conversion Rate')

            # Plot user_conversion_order as scatter plot with regression
            sns.scatterplot(data=df_tmp, x='delivery_fee', y='user_conversion_order', ax=axes[0, 2], label='Order Conversion')
            sns.regplot(data=df_tmp, x='delivery_fee', y='user_conversion_order', scatter=False, ax=axes[0, 2], color='blue')
            axes[0, 2].set_title('Order Conversion vs. Delivery Fee')
            axes[0, 2].set_xlabel('Delivery Fee (€)')
            axes[0, 2].set_ylabel('Order Conversion Rate')

            # Plot avg_conversion_details as scatter plot with regression
            sns.scatterplot(data=df_tmp, x='delivery_fee', y='avg_conversion_details', ax=axes[1, 0], label='Avg Details Conversion')
            sns.regplot(data=df_tmp, x='delivery_fee', y='avg_conversion_details', scatter=False, ax=axes[1, 0], color='orange')
            axes[1, 0].set_title('Avg Details Conversion vs. Delivery Fee')
            axes[1, 0].set_xlabel('Delivery Fee (€)')
            axes[1, 0].set_ylabel('Avg Details Conversion Rate')

            # Plot avg_conversion_checkout as scatter plot with regression
            sns.scatterplot(data=df_tmp, x='delivery_fee', y='avg_conversion_checkout', ax=axes[1, 1], label='Avg Checkout Conversion')
            sns.regplot(data=df_tmp, x='delivery_fee', y='avg_conversion_checkout', scatter=False, ax=axes[1, 1], color='orange')
            axes[1, 1].set_title('Avg Checkout Conversion vs. Delivery Fee')
            axes[1, 1].set_xlabel('Delivery Fee (€)')
            axes[1, 1].set_ylabel('Avg Checkout Conversion Rate')

            # Plot avg_conversion_order as scatter plot with regression
            sns.scatterplot(data=df_tmp, x='delivery_fee', y='avg_conversion_order', ax=axes[1, 2], label='Avg Order Conversion')
            sns.regplot(data=df_tmp, x='delivery_fee', y='avg_conversion_order', scatter=False, ax=axes[1, 2], color='orange')
            axes[1, 2].set_title('Avg Order Conversion vs. Delivery Fee')
            axes[1, 2].set_xlabel('Delivery Fee (€)')
            axes[1, 2].set_ylabel('Avg Order Conversion Rate')

            # Adjust layout and show plot
            plt.tight_layout()
            plt.show()

def plot_ch(df, column_list, num_cols=4):
    """
    Plots a grid of boxplots for delivery fees across different stages for each entity,
    keeping only data below the 95th percentile for each column.
    
    Parameters:
    df (pd.DataFrame): The DataFrame containing the data.
    column_list (list): The list of columns representing delivery fees at different stages.
    num_cols (int, optional): Number of columns in the grid. Default is 4.
    
    Returns:
    None: Displays the boxplots for each entity.
    """
    # Get unique entity IDs
    entities = df['global_entity_id'].unique()

    # Define the number of rows for the grid based on the number of entities and columns
    num_entities = len(entities)
    num_rows = math.ceil(num_entities / num_cols)

    # Create a grid of subplots
    fig, axes = plt.subplots(num_rows, num_cols, figsize=(15, num_rows * 5))

    # Flatten the axes array for easy iteration
    axes = axes.flatten()

    for i, entity in enumerate(entities):
        df_tmp = df[df['global_entity_id'] == entity]
        
        # Check if all columns exist
        if not all(col in df_tmp.columns for col in column_list):
            raise ValueError(f"Some columns in {column_list} do not exist in the DataFrame for entity {entity}")

        # Filter values below the 95th percentile for each column
        df_filtered = df_tmp[column_list].apply(lambda x: x[x < x.quantile(0.95)])

        # Create a mapping for the stages
        stage_mapping = {col: col.split('_')[2].capitalize() if len(col.split('_')) > 2 else col for col in column_list}
        
        # Melt the filtered data for plotting
        df_melted = df_filtered.melt(var_name='Stage', value_name='Delivery Fee (EUR)')
        
        # Replace the stage names with the mapped values
        df_melted['Stage'] = df_melted['Stage'].replace(stage_mapping)

        # Plot the boxplot for the entity on the corresponding subplot
        sns.boxplot(data=df_melted, x='Stage', y='Delivery Fee (EUR)', ax=axes[i])
        
        # Set title
        axes[i].set_title(f'Entity {entity}')
    
    # Remove any unused subplots if the number of entities is less than the grid size
    for j in range(i + 1, len(axes)):
        fig.delaxes(axes[j])

    # Adjust layout to avoid overlap
    plt.tight_layout()
    plt.show()


def plot_conversion(df):
        
    for i in df['global_entity_id'].unique():

            df_tmp = df[df['global_entity_id'] == i]

            # Create a figure with four subplots (2x2 layout)
            fig, axes = plt.subplots(2, 3, figsize=(14, 10))

            # Plot user_conversion_details as scatter plot with regression
            sns.scatterplot(data=df_tmp, x='delivery_fee_listing', y='user_conversion_details', ax=axes[0, 0], label='Details Conversion')
            sns.regplot(data=df_tmp, x='delivery_fee_listing', y='user_conversion_details', scatter=False, ax=axes[0, 0], color='blue')
            axes[0, 0].set_title('Details Conversion vs. Delivery Fee ' + i )
            axes[0, 0].set_xlabel('Delivery Fee (€)')
            axes[0, 0].set_ylabel('Details Conversion Rate')

            # Plot user_conversion_checkout as scatter plot with regression
            sns.scatterplot(data=df_tmp, x='delivery_fee_listing', y='user_conversion_checkout', ax=axes[0, 1], label='Checkout Conversion')
            sns.regplot(data=df_tmp, x='delivery_fee_listing', y='user_conversion_checkout', scatter=False, ax=axes[0, 1], color='blue')
            axes[0, 1].set_title('Checkout Conversion vs. Delivery Fee ' + i )
            axes[0, 1].set_xlabel('Delivery Fee (€)')
            axes[0, 1].set_ylabel('Checkout Conversion Rate')

            # Plot user_conversion_order as scatter plot with regression
            sns.scatterplot(data=df_tmp, x='delivery_fee_listing', y='user_conversion_order', ax=axes[0, 2], label='Order Conversion')
            sns.regplot(data=df_tmp, x='delivery_fee_listing', y='user_conversion_order', scatter=False, ax=axes[0, 2], color='blue')
            axes[0, 2].set_title('Order Conversion vs. Delivery Fee ' + i )
            axes[0, 2].set_xlabel('Delivery Fee (€)')
            axes[0, 2].set_ylabel('Order Conversion Rate')

            # Plot avg_conversion_details as scatter plot with regression
            sns.scatterplot(data=df_tmp, x='delivery_fee_listing', y='avg_conversion_details', ax=axes[1, 0], label='Avg Details Conversion')
            sns.regplot(data=df_tmp, x='delivery_fee_listing', y='avg_conversion_details', scatter=False, ax=axes[1, 0], color='orange')
            axes[1, 0].set_title('Avg Details Conversion vs. Delivery Fee ' + i )
            axes[1, 0].set_xlabel('Delivery Fee (€)')
            axes[1, 0].set_ylabel('Avg Details Conversion Rate')

            # Plot avg_conversion_checkout as scatter plot with regression
            sns.scatterplot(data=df_tmp, x='delivery_fee_listing', y='avg_conversion_checkout', ax=axes[1, 1], label='Avg Checkout Conversion')
            sns.regplot(data=df_tmp, x='delivery_fee_listing', y='avg_conversion_checkout', scatter=False, ax=axes[1, 1], color='orange')
            axes[1, 1].set_title('Avg Checkout Conversion vs. Delivery Fee ' + i )
            axes[1, 1].set_xlabel('Delivery Fee (€)')
            axes[1, 1].set_ylabel('Avg Checkout Conversion Rate')

            # Plot avg_conversion_order as scatter plot with regression
            sns.scatterplot(data=df_tmp, x='delivery_fee_listing', y='avg_conversion_order', ax=axes[1, 2], label='Avg Order Conversion')
            sns.regplot(data=df_tmp, x='delivery_fee_listing', y='avg_conversion_order', scatter=False, ax=axes[1, 2], color='orange')
            axes[1, 2].set_title('Avg Order Conversion vs. Delivery Fee ' + i )
            axes[1, 2].set_xlabel('Delivery Fee (€)')
            axes[1, 2].set_ylabel('Avg Order Conversion Rate')

            # Adjust layout and show plot
            plt.tight_layout()
            plt.show()


  WHERE partition_date BETWEEN \"""" + start_date + """\" and \"""" + end_date + """\"
  AND pe.global_entity_id IN (""" + entity_d + """)
  AND global_entity_id IN (""" + entity_d + """)
  AND global_entity_id IN (""" + entity_d + """)
  WHERE partition_date BETWEEN \"""" + start_date + """\" and \"""" + end_date + """\"
  AND pe.global_entity_id IN (""" + entity_d + """)
  AND global_entity_id IN (""" + entity_d + """)
  AND global_entity_id IN (""" + entity_d + """)


In [41]:
start_date = '2025-02-15'
end_date = '2025-02-28'

#entity_id = 'DJ_CZ','FO_NO','MJM_AT','NP_HU','OP_SE','PO_FI','YS_TR','EF_GR','FY_CY','FP_BD','FP_HK','FP_KH','FP_LA','FP_MM','FP_MY','FP_PH','FP_PK','FP_SG','FP_TH','FP_TW','HS_SA','AP_PA','PY_AR','PY_BO','PY_CL','PY_CR','PY_DO','PY_EC','PY_GT','PY_HN','PY_NI','PY_PE','PY_PY','PY_SV','PY_UY','PY_VE','HF_EG','TB_AE','TB_BH','TB_IQ','TB_JO','TB_KW','TB_OM','TB_QA'

entity_id = "FP_DE","FP_PK","FP_KH","FP_HK","HS_SA","AP_PA", "PY_AR", "PY_BO", "PY_CL", "PY_CR", "PY_DO", "PY_EC", "PY_GT", "PY_HN", "PY_NI", "PY_PE", "PY_PY", "PY_SV", "PY_UY", "PY_VE", "EF_GR", "FY_CY", "GV_MD", "GV_RO", "GV_BG", "GV_RS", "GV_HR", "GV_KE", "GV_NG", "GV_BA", "GV_CI", "GV_ME", "GV_UG", "YS_TR", "DJ_CZ", "FO_NO", "MJM_AT", "NP_HU", "OP_SE", "PO_FI", "FP_BD", "FP_LA", "FP_MM", "FP_MY", "FP_PH", "FP_SG", "FP_TH", "FP_TW", "HF_EG", "TB_AE", "TB_BH", "TB_IQ", "TB_JO", "TB_KW", "TB_OM", "TB_QA"

#entity_id = ('TB_OM','DJ_CZ')
entity_id_str = ",".join([f"'{entity}'" for entity in entity_id])

# PROJECT_ID = "logistics-customer-staging"
# bpd.options.bigquery.project = PROJECT_ID

# define a few things (project id, start date etc.)
project_id = "logistics-customer-staging"
client = bigquery.Client(project = project_id)

user_conversion_df = client.query(user_conversion(start_date, end_date,entity_id_str)).to_dataframe()
#user_summary_df = client.query(user_details(start_date, end_date,entity_id_str)).to_dataframe()

# user_conversion_df = bpd.read_gbq(user_conversion(start_date, end_date,entity_id_str))
# user_summary_df = bpd.read_gbq(user_details(start_date, end_date,entity_id_str))

user_conversion_df['user_conversion_details'] = user_conversion_df['converted_user_count_details'] / user_conversion_df['total_users']
user_conversion_df['user_conversion_checkout'] = user_conversion_df['converted_user_count_checkout'] / user_conversion_df['total_users']
user_conversion_df['user_conversion_order'] = user_conversion_df['converted_user_count_order'] / user_conversion_df['total_users']



In [42]:
import pandas as pd
import numpy as np
import statsmodels.api as sm

# Assuming user_conversion_df is already defined in your environment
# and contains the columns: 'global_entity_id', 
# 'delivery_fee_details', 'delivery_fee_checkout', 'delivery_fee_order',
# 'avg_conversion_details', 'avg_conversion_checkout', 'avg_conversion_order'

# Initialize a DataFrame to store the elasticity coefficients for each y variable
elasticity_df = pd.DataFrame(columns=[
    'global_entity_id', 
    'avg_conversion_details', 
    'avg_conversion_checkout', 
    'avg_conversion_order'
])

# List of dependent variables (y variables)
dependent_vars = ['avg_conversion_details', 'avg_conversion_checkout', 'avg_conversion_order']

# Mapping from dependent variable to its corresponding stage-specific delivery fee column
fee_column_mapping = {
    'avg_conversion_details': 'delivery_fee_listing',
    'avg_conversion_checkout': 'delivery_fee_details',
    'avg_conversion_order': 'delivery_fee_checkout'
}

# Loop through each unique global_entity_id
for i in user_conversion_df['global_entity_id'].unique():
    
    # Filter data for the current entity and make a copy to avoid warnings
    user_df_clean = user_conversion_df[user_conversion_df['global_entity_id'] == i].copy()
    
    # Initialize a dictionary to store the entity ID and elasticity coefficients
    elasticity_row = {'global_entity_id': i}
    
    # Loop through each dependent variable
    for dep_var in dependent_vars:
        # Determine the corresponding fee column for the current stage
        fee_col = fee_column_mapping[dep_var]
        
        # Compute the stage-specific log delivery fee (avoiding log(0) by adding 1)
        user_df_clean['log_delivery_fee'] = np.log(user_df_clean[fee_col] + 1)
        
        # Apply log transformation to the dependent variable (avoid log(0) by adding 1)
        user_df_clean[dep_var] = np.log(user_df_clean[dep_var] + 1)
        
        # Add a constant column (intercept) for the regression
        user_df_clean['constant'] = 1.0

        # Ensure that the relevant columns are numeric
        user_df_clean[['log_delivery_fee', dep_var]] = user_df_clean[['log_delivery_fee', dep_var]].astype(float)
        
        # Replace infinite values with NaN and drop rows with NaNs in the key columns
        cleaned_data = user_df_clean.replace([np.inf, -np.inf], np.nan).dropna(subset=['log_delivery_fee', dep_var])
        
        # Check if there is enough data to run the regression
        if cleaned_data.shape[0] == 0:
            elasticity_row[dep_var] = np.nan
            continue
        
        # Set up the regression model with the constant and log_delivery_fee as predictors
        X_log = cleaned_data[['constant', 'log_delivery_fee']]
        y = cleaned_data[dep_var]
        
        # Fit the log-linear regression model
        model_log = sm.OLS(y, X_log).fit()
        
        # Store the elasticity coefficient (slope corresponding to log_delivery_fee)
        elasticity_row[dep_var] = model_log.params['log_delivery_fee']
    
    # Add the current entity's results to the elasticity DataFrame
    elasticity_df = pd.concat([elasticity_df, pd.DataFrame([elasticity_row])], ignore_index=True)

# Display the resulting DataFrame with elasticity coefficients
print(elasticity_df)


   global_entity_id  avg_conversion_details  avg_conversion_checkout  \
0             DJ_CZ               -0.164986                -0.067360   
1             EF_GR               -0.357137                 0.000000   
2             FO_NO               -0.089611                -0.109567   
3             FP_BD               -0.412253                -0.315583   
4             FP_DE               -0.200806                -0.569124   
5             FP_HK               -0.098844                -0.043877   
6             FP_KH               -0.244750                -0.296380   
7             FP_LA               -0.242189                -0.207426   
8             FP_MM               -0.253920                 0.050165   
9             FP_MY               -0.156446                -0.185381   
10            FP_PH               -0.195444                -0.238479   
11            FP_PK               -0.271608                -0.266019   
12            FP_SG               -0.197738                -0.18

  elasticity_df = pd.concat([elasticity_df, pd.DataFrame([elasticity_row])], ignore_index=True)


In [43]:
elasticity_df

Unnamed: 0,global_entity_id,avg_conversion_details,avg_conversion_checkout,avg_conversion_order
0,DJ_CZ,-0.164986,-0.06736,-0.040942
1,EF_GR,-0.357137,0.0,
2,FO_NO,-0.089611,-0.109567,-0.032204
3,FP_BD,-0.412253,-0.315583,-0.006774
4,FP_DE,-0.200806,-0.569124,0.0
5,FP_HK,-0.098844,-0.043877,-0.044798
6,FP_KH,-0.24475,-0.29638,-0.016292
7,FP_LA,-0.242189,-0.207426,0.04709
8,FP_MM,-0.25392,0.050165,0.207312
9,FP_MY,-0.156446,-0.185381,-0.049438


In [45]:
elasticity_df.to_csv('elasticity_df.csv', index=False)

In [None]:
listings_data = """

WITH listings_agg AS (
  SELECT
    dh_platform,
    global_entity_id, 
    platform,
    session_key,
    shopId,
    eventTimestamp,
    ARRAY_AGG(CASE WHEN ev.name = 'promisedDeliveryTimeRangeUpper' THEN ev.value END IGNORE NULLS) AS promised_upper_array,
    ARRAY_AGG(CASE WHEN ev.name = 'promisedDeliveryTimeRangeLower' THEN ev.value END IGNORE NULLS) AS promised_lower_array,
    ARRAY_AGG(CASE WHEN ev.name = 'locationCountry' THEN ev.value END IGNORE NULLS) AS location_country,
    ARRAY_AGG(CASE WHEN ev.name = 'locationCity' THEN ev.value END IGNORE NULLS) AS location_city,
    ARRAY_AGG(CASE WHEN ev.name = 'shopDeliveryFee' THEN ev.value END IGNORE NULLS) AS delivery_fee_array,
    ARRAY_AGG(CASE WHEN ev.name = 'serviceFee' THEN ev.value END IGNORE NULLS) AS service_fee_array,
    ARRAY_AGG(CASE WHEN ev.name = 'shopMinimumOrderValue' THEN ev.value END IGNORE NULLS) AS mov_array,
    ARRAY_AGG(CASE WHEN ev.name = 'verticalType' THEN ev.value END IGNORE NULLS) AS verticaltype_array,
    ARRAY_AGG(CASE WHEN ev.name = 'shopSponsoring' THEN ev.value END IGNORE NULLS) AS sponsor_array,
    ARRAY_AGG(CASE WHEN ev.name = 'shopListType' THEN ev.value END IGNORE NULLS) AS shoplisttype_array,
    ARRAY_AGG(CASE WHEN ev.name = 'shopType' THEN ev.value END IGNORE NULLS) AS shoptype_array,
    ARRAY_AGG(CASE WHEN ev.name = 'shopName' THEN ev.value END IGNORE NULLS) AS shopname_array,
    ARRAY_AGG(CASE WHEN ev.name = 'shopCuisine' THEN ev.value END IGNORE NULLS) AS cuisine_array,
    ARRAY_AGG(CASE WHEN ev.name = 'shopRank' THEN ev.value END IGNORE NULLS) AS rank_array,
    ARRAY_AGG(CASE WHEN ev.name = 'shopPosition' THEN ev.value END IGNORE NULLS) AS position_array,
    ARRAY_AGG(CASE WHEN ev.name = 'perseusClientIdNew' THEN ev.value END IGNORE NULLS) AS id_array,
    ARRAY_AGG(CASE WHEN ev.name = 'shopCategorySelected' THEN ev.value END IGNORE NULLS) AS category_array,
    ARRAY_AGG(CASE WHEN ev.name = 'pageType' THEN ev.value END IGNORE NULLS) AS pageType_array,
    ARRAY_AGG(CASE WHEN ev.name = 'userId' THEN ev.value END IGNORE NULLS) AS user_array,
    ARRAY_AGG(CASE WHEN ev.name = 'eventAction' THEN ev.value END IGNORE NULLS) AS event_array,
    ARRAY_AGG(CASE WHEN ev.name = 'hour-of-the-day' THEN ev.value END IGNORE NULLS) AS hotd_array,
    ARRAY_AGG(CASE WHEN ev.name = 'shopRatingQuality' THEN ev.value END IGNORE NULLS) AS shopRatingQuality_array
  FROM `fulfillment-dwh-production.curated_data_shared_coredata_tracking.perseus_events` pe, 
    UNNEST(pe.eventVariables) AS ev
  WHERE partition_date > '2025-02-02' and partition_date < '2025-02-15'
    AND eventAction = 'shop_impressions.loaded'
    AND pe.global_entity_id = 'EF_GR'
  GROUP BY dh_platform,global_entity_id,platform, session_key, shopId, eventTimestamp
), 
listings_raw as (
SELECT 
  dh_platform,
  global_entity_id,
  platform,
  session_key,
  shopId,
  eventTimestamp,
  promised_value AS promisedDeliveryTimeRangeUpper,
  IF(pos < ARRAY_LENGTH(promised_lower_array), promised_lower_array[OFFSET(pos)], NULL) AS promisedDeliveryTimeRangeLower,
  IF(pos < ARRAY_LENGTH(location_country), location_country[OFFSET(pos)], NULL) AS locationCountry,
  IF(pos < ARRAY_LENGTH(location_city), location_city[OFFSET(pos)], NULL) AS locationCity,
  IF(pos < ARRAY_LENGTH(delivery_fee_array), delivery_fee_array[OFFSET(pos)], NULL) AS shopDeliveryFee,
  IF(pos < ARRAY_LENGTH(service_fee_array), service_fee_array[OFFSET(pos)], NULL) AS serviceFee,
  IF(pos < ARRAY_LENGTH(mov_array), mov_array[OFFSET(pos)], NULL) AS shopMinimumOrderValue,
  IF(pos < ARRAY_LENGTH(verticaltype_array), verticaltype_array[OFFSET(pos)], NULL) AS verticalType,
  IF(pos < ARRAY_LENGTH(shoplisttype_array), shoplisttype_array[OFFSET(pos)], NULL) AS shopListType,
  IF(pos < ARRAY_LENGTH(sponsor_array), sponsor_array[OFFSET(pos)], NULL) AS shopSponsoring,
  IF(pos < ARRAY_LENGTH(shoptype_array), shoptype_array[OFFSET(pos)], NULL) AS shopType,
  IF(pos < ARRAY_LENGTH(shopname_array), shopname_array[OFFSET(pos)], NULL) AS shopName,
  IF(pos < ARRAY_LENGTH(cuisine_array), cuisine_array[OFFSET(pos)], NULL) AS shopCuisine,
  IF(pos < ARRAY_LENGTH(rank_array), rank_array[OFFSET(pos)], NULL) AS shopRank,
  IF(pos < ARRAY_LENGTH(position_array), position_array[OFFSET(pos)], NULL) AS shopPosition,
  IF(pos < ARRAY_LENGTH(id_array), id_array[OFFSET(pos)], NULL) AS perseusClientId,
  IF(pos < ARRAY_LENGTH(category_array), category_array[OFFSET(pos)], NULL) AS shopCategorySelected,
  IF(pos < ARRAY_LENGTH(pageType_array), pageType_array[OFFSET(pos)], NULL) AS pageType,
  IF(pos < ARRAY_LENGTH(user_array), user_array[OFFSET(pos)], NULL) AS userId,
  IF(pos < ARRAY_LENGTH(event_array), event_array[OFFSET(pos)], NULL) AS eventAction,
  IF(pos < ARRAY_LENGTH(hotd_array), hotd_array[OFFSET(pos)], NULL) AS hour_of_the_day,
  IF(pos < ARRAY_LENGTH(shopRatingQuality_array), shopRatingQuality_array[OFFSET(pos)], NULL) AS shopRatingQuality,
  row_number() over (partition by session_key, shopId order by eventTimestamp asc) as rn
FROM listings_agg,
UNNEST(promised_upper_array) AS promised_value WITH OFFSET AS pos
ORDER BY dh_platform, global_entity_id, platform, session_key, shopId,eventTimestamp, pos
), listings_final as (
select *
from listings_raw
where rn = 1
),
details_agg AS (
  SELECT 
    session_key session_key_details,
    shopId shopId_details,
    eventTimestamp timestamp_details,
    row_number() over (partition by session_key, shopId order by eventTimestamp) as rn_details
  FROM `fulfillment-dwh-production.curated_data_shared_coredata_tracking.perseus_events` pe, 
    UNNEST(pe.eventVariables) AS ev
  WHERE partition_date > '2025-02-02' and partition_date < '2025-02-15'
    AND eventAction = 'shop_details.loaded'
    AND pe.global_entity_id = 'EF_GR'
  GROUP BY session_key, shopId, eventTimestamp
),
details_final as (
select *
from details_agg
where rn_details = 1
)
select *
from listings_final lf
left join details_final df on df.session_key_details = lf.session_key and df.shopId_details = lf.shopId

""" 

In [None]:
project_id = "logistics-customer-staging"
client = bigquery.Client(project = project_id)

listings_df = client.query(listings_data).to_dataframe()

In [None]:
import pandas as pd
import numpy as np

# Assume listings_df is your DataFrame.

# Convert the relevant columns to numeric, coercing non-numeric values to NaN.
listings_df['promisedDeliveryTimeRangeUpper'] = pd.to_numeric(
    listings_df['promisedDeliveryTimeRangeUpper'], errors='coerce'
)
listings_df['promisedDeliveryTimeRangeLower'] = pd.to_numeric(
    listings_df['promisedDeliveryTimeRangeLower'], errors='coerce'
)
listings_df['shopDeliveryFee'] = pd.to_numeric(
    listings_df['shopDeliveryFee'], errors='coerce'
)

# Optional: Check if conversion worked as expected.
print(listings_df[['promisedDeliveryTimeRangeUpper', 'promisedDeliveryTimeRangeLower', 'shopDeliveryFee']].dtypes)

# Compute the average delivery time.
# (Here I'm taking the midpoint; if you want the sum, remove the division by 2.)
listings_df['average_delivery_time'] = (
    listings_df['promisedDeliveryTimeRangeUpper'] + listings_df['promisedDeliveryTimeRangeLower']
) / 2

epsilon = 1e-6  # small constant to avoid log(0)
listings_df['log_delivery_fee'] = np.log(listings_df['shopDeliveryFee'] + epsilon)
listings_df['log_promised_time'] = np.log(listings_df['average_delivery_time'] + epsilon)

# Add a new column 'click': 1 if session_key_details is not null, else 0
listings_df['click'] = listings_df['session_key_details'].notnull().astype(int)

# Check the resulting DataFrame
print(listings_df[['promisedDeliveryTimeRangeUpper', 
                   'promisedDeliveryTimeRangeLower', 
                   'average_delivery_time', 
                   'log_delivery_fee', 
                   'log_promised_time', 
                   'click']].head())


# # Aggregate the data at the session level
# session_df = listings_df.groupby(['session_key','locationCity']).agg(
#     avg_delivery_fee = ('shopDeliveryFee', 'mean'),
#     avg_delivery_time = ('average_delivery_time', 'mean'),
#     click_rate = ('click', 'mean'),    # proportion of shops clicked per session
#     num_exposures = ('session_key', 'count')  # number of shops viewed in the session
# ).reset_index()

# # For log-transformation, add a small constant to avoid log(0)
# epsilon = 1e-6
# session_df['log_avg_delivery_fee'] = np.log(session_df['avg_delivery_fee'] + epsilon)
# session_df['log_avg_delivery_time'] = np.log(session_df['avg_delivery_time'] + epsilon)

# print(session_df.head())

In [None]:
import pandas as pd
import statsmodels.api as sm

# Create dummy variables for locationCity, dropping the first category
location_dummies = pd.get_dummies(session_df['locationCity'], prefix='locationCity', drop_first=True)

# Combine numeric predictors with the dummy variables
X = pd.concat([session_df[['log_avg_delivery_fee', 'log_avg_delivery_time']], location_dummies], axis=1)
X = sm.add_constant(X)  # Adds the intercept term

# Define the dependent variable
y = session_df['click_rate']

# Debug: Print data types before conversion
print("Before conversion, X dtypes:")
print(X.dtypes)
print("\nBefore conversion, y dtype:")
print(y.dtype)

# Convert all columns to numeric, coercing errors to NaN
X = X.apply(pd.to_numeric, errors='coerce')
y = pd.to_numeric(y, errors='coerce')

# Debug: Check data types again
print("\nAfter pd.to_numeric, X dtypes:")
print(X.dtypes)
print("\nAfter pd.to_numeric, y dtype:")
print(y.dtype)

# Force conversion to float if needed
X = X.astype(float)
y = y.astype(float)

# Optionally, drop rows with missing values
X = X.dropna()
y = y.loc[X.index]

# Debug: Check shapes and dtypes one more time
print("\nFinal X dtypes:")
print(X.dtypes)
print("\nFinal X shape:", X.shape)
print("Final y shape:", y.shape)

# Fit the OLS model
model = sm.OLS(y, X)
results = model.fit()
print(results.summary())


In [None]:
import statsmodels.api as sm

# Define the formula: add additional covariates as needed.
formula = "click ~ log_delivery_fee + log_promised_time"

# Fit a GEE model clustering on session_id
gee_model = sm.GEE.from_formula(formula,
                                groups="session_key",
                                data=listings_df,
                                family=sm.families.Binomial())
gee_result = gee_model.fit()

print(gee_result.summary())

In [None]:
# Create design matrices for fixed effects using patsy.
formula = "click ~ log_delivery_fee + log_promised_time"
y, X = patsy.dmatrices(formula, data, return_type='dataframe')

# Create a variance components design matrix for the random intercept by session.
vc_formula = "0 + C(session_key)"
vc_matrix = patsy.dmatrix(vc_formula, data, return_type='dataframe')
vc = {"session": vc_matrix}

# Fit the BinomialBayesMixedGLM model using variational Bayes
model = BinomialBayesMixedGLM(endog=y, exog=X, exog_vc=vc)
fit_result = model.fit_vb()

print(fit_result.summary())