In [3]:
import numpy as np
import pandas as pd
from google.cloud import bigquery
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import stats
from scipy.stats import ttest_ind
import plotly.express as px
import db_dtypes
import bigframes.pandas as bpd
from IPython.display import display, HTML
import math
import statsmodels.api as sm
from typing import Union
import logging
import sys
from google.cloud.exceptions import NotFound
from datetime import datetime, timedelta
import time
import os
import json
from google.auth.transport.requests import Request
from google.oauth2.credentials import Credentials
from google.auth import default  # <-- Make sure to import this
import pandas as pd


daily_session_data = """


WITH test_setups AS(
    SELECT country_code, entity_id, test_name, test_id
    FROM `fulfillment-dwh-production.cl.dps_experiment_setups`
    GROUP BY country_code, entity_id, test_name, test_id
),
tmp as (

SELECT  
       sessions.dh_platform
      ,sessions.country_code
      ,sessions.created_at
      ,sessions.created_date
      ,sessions.test_id
      ,sessions.test_name
      ,sessions.test_type
      ,sessions.test_user_id
      ,sessions.test_variant
      ,sessions.test_vertical_parents
      ,sessions.session_id
      ,max(sessions.has_transaction) transacted
FROM `fulfillment-dwh-production.cl.dps_test_cvrs` sessions
LEFT JOIN `fulfillment-dwh-production.cl.dps_sessions_mapped_to_orders` map
    ON sessions.entity_id = map.entity_id
    AND sessions.session_id = map.fe_session_id
    AND sessions.test_user_id = map.perseus_client_id
LEFT JOIN test_setups
    ON REGEXP_REPLACE(sessions.test_name, ' ', '') = REGEXP_REPLACE(test_setups.test_name, ' ', '')
    AND sessions.country_code = test_setups.country_code
    AND sessions.entity_id = test_setups.entity_id
WHERE
    ----------------------------------------------------------------
    -- take into account the lag and the time of the traffic split
    -- skip the first day of the test
    ----------------------------------------------------------------
    created_at BETWEEN
        TIMESTAMP_SUB(var_split_created_at_start, INTERVAL var_days_lagged + 1 DAY)
        AND
        TIMESTAMP_SUB(var_split_created_at_end, INTERVAL var_days_lagged DAY)
    AND sessions.test_name = var_test_name
    AND (sessions.test_variant ='Control' OR STARTS_WITH(sessions.test_variant, 'Variation'))
    AND sessions.country_code = var_country_code
    AND sessions.entity_id = var_entity_id
    AND test_setups.test_id IS NOT NULL
GROUP BY 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11
QUALIFY COUNT(DISTINCT test_user_id) OVER (PARTITION BY session_id) = 1
)
SELECT 
  created_date,
  test_variant,
  COUNT(DISTINCT session_id) AS distinct_sessions,
  COUNT(*) AS total_rows,
  SUM(CASE WHEN transacted = TRUE THEN 1 ELSE 0 END) AS transacted_sessions,
  ROUND(
    COUNT(DISTINCT session_id) * 100.0 / SUM(COUNT(DISTINCT session_id)) OVER (),
    2
  ) AS pct_of_total_sessions
FROM tmp
GROUP BY 1, 2







"""









# Example observed session counts for a day
observed = [2500, 2600, 2400, 2500, 2300]  # for A, B, C, D, E
n_total = sum(observed)

# Expected counts assuming even split
expected = [n_total / 5] * 5

# Chi-squared test
chi2_stat, p_value = stats.chisquare(f_obs=observed, f_exp=expected)

print(f"Chi2 Statistic: {chi2_stat:.2f}, p-value: {p_value:.5f}")

if p_value < 0.05:
    print("⚠️ SRM detected!")
else:
    print("✅ No SRM.")

Chi2 Statistic: 21.14, p-value: 0.00030
⚠️ SRM detected!


In [5]:
def detect_srm(df, day_col='day', variant_col='variant', count_col='count', alpha=0.05):
    """
    Detects SRM (Sample Ratio Mismatch) on a daily level from a dataframe.

    Parameters:
    - df: pandas DataFrame with columns for day, variant, and count
    - day_col: name of the column representing the day
    - variant_col: name of the column representing the variant
    - count_col: name of the column representing the session count
    - alpha: significance level to declare SRM (default 0.05)

    Returns:
    - A DataFrame with day, chi2_statistic, p_value, and srm_detected (boolean)
    """
    results = []

    for day, group in df.groupby(day_col):
        observed = group[count_col].values
        n_total = observed.sum()
        k_variants = observed.size
        expected = [n_total / k_variants] * k_variants

        chi2_stat, p_value = stats.chisquare(f_obs=observed, f_exp=expected)
        srm_detected = p_value < alpha

        results.append({
            day_col: day,
            'chi2_statistic': chi2_stat,
            'p_value': p_value,
            'srm_detected': srm_detected
        })

    return pd.DataFrame(results)


data = {
    'day': [1, 1, 1, 1, 1, 2, 2, 2, 2, 2],
    'variant': ['A', 'B', 'C', 'D', 'E', 'A', 'B', 'C', 'D', 'E'],
    'count': [2500, 2600, 2400, 2500, 2300, 2000, 2000, 2100, 1900, 2000]
}
df = pd.DataFrame(data)

In [6]:
srm_results = detect_srm(df)
print(srm_results)

   day  chi2_statistic   p_value  srm_detected
0    1       21.138211  0.000297          True
1    2       10.000000  0.040428          True


In [4]:
import matplotlib.pyplot as plt

# Suppose you have collected daily p-values
days = list(range(1, len(p_values) + 1))
plt.plot(days, p_values, marker='o')
plt.axhline(y=0.05, color='red', linestyle='--')
plt.xlabel('Day')
plt.ylabel('SRM p-value')
plt.title('Daily SRM Test p-values')
plt.show()

NameError: name 'p_values' is not defined