In [1]:
%load_ext autoreload
%autoreload 2

In [12]:
# The star import stays first so that the explicit imports below win on any name clash.
from src.utils import *

import os
import warnings

import numpy as np
import pandas as pd
import pingouin as pg
import tqdm
from scipy import stats
warnings.filterwarnings("ignore")

# Intro

In [None]:
# Resolve the gcloud Application Default Credentials file relative to the current
# user's home directory instead of hardcoding one developer's absolute path, so the
# notebook can be run from any account.
connector = Connector(
    json_credentials=os.path.expanduser(
        "~/.config/gcloud/application_default_credentials.json"
    )
)

In [None]:
# Runner used for the exploratory cells below; results are written to a scratch table.
significance_runner = SignificanceRunner(
    connector,
    input_table_id="_tb_dps_ab_test_significance_orders",
    output_table_id="test",
)

# Entity whose data is loaded as a pandas frame for the experiments in this notebook.
test_entity = "OP_SE"
test_pandas = significance_runner.import_df(test_entity, dataframe_type="Pandas")


In [None]:
class ContinuousMetrics:
    """Names of the continuous metrics, exposed as string constants."""

    DELIVERY_FEE = "delivery_fee_local"
    GFV = "gfv_local"
    TRAVEL_TIME = "travel_time"
    DELIVERY_DISTANCE = "delivery_distance"
    FLEET_DELAY = "fleet_delay"
    DELIVERY_COSTS = "delivery_costs_local"
    REVENUE = "revenue_local"
    PROFIT = "profit_local"
    COMMISSION = "commission_local"

    @classmethod
    def return_continuous_metrics(cls):
        """Return the value of every metric constant as a list, in declaration order."""
        attribute_names = (
            "DELIVERY_FEE",
            "GFV",
            "TRAVEL_TIME",
            "DELIVERY_DISTANCE",
            "FLEET_DELAY",
            "DELIVERY_COSTS",
            "REVENUE",
            "PROFIT",
            "COMMISSION",
        )
        # Look the constants up on `cls` so subclass overrides are honoured.
        return [getattr(cls, attribute_name) for attribute_name in attribute_names]

class GroupingColumns:
    """Names of the columns used to group rows, exposed as string constants."""

    TEST_NAME: str = "test_name"
    VARIANT: str = "variant"
    TREATMENT: str = "treatment"
    TARGET_GROUP: str = "target_group"

    @classmethod
    def return_grouping_columns(cls):
        """Return the test, variant and treatment column names, in that order."""
        attribute_names = ("TEST_NAME", "VARIANT", "TREATMENT")
        return [getattr(cls, attribute_name) for attribute_name in attribute_names]

    @classmethod
    def return_test_and_treatment_level_columns(cls):
        """Return the test and treatment column names, in that order."""
        attribute_names = ("TEST_NAME", "TREATMENT")
        return [getattr(cls, attribute_name) for attribute_name in attribute_names]
    
class Variant:
    """Variant labels: one control plus eight numbered variations."""

    CONTROL = "Control"
    # VARIATION<n> holds the label "Variation<n>" for n = 1..8.
    (
        VARIATION1,
        VARIATION2,
        VARIATION3,
        VARIATION4,
        VARIATION5,
        VARIATION6,
        VARIATION7,
        VARIATION8,
    ) = (f"Variation{number}" for number in range(1, 9))


# Significance t-test
- load a country
- filter by test with 2 variants
- filter a test and level of analysis
- for each column, unnest values
- check that there are more than 30 values and that two variants still remain
- run significance


In [None]:
# Distinct names of the tests that have exactly two variants.
test_with_two_variants = test_pandas.loc[
    test_pandas["n_variants_in_test"] == 2, "test_name"
].unique()


class TTestSignificance:
    """Welch's t-test pipeline for A/B tests that compare two variants.

    None of the helpers reads instance state, so they are static methods: they can
    be called on the class or on an instance, and can be handed directly to
    ``DataFrame.pipe``.
    """

    def __init__(self):
        self.logger = logging.getLogger(__name__)

    @staticmethod
    def t_test(c_array: np.ndarray, v_array: np.ndarray) -> float:
        """Run Welch's t-test on the control and variation arrays.

        Args:
            c_array: Values observed for the control variant.
            v_array: Values observed for the variation.

        Returns:
            The p-value of the test. NaN values are omitted
            (``nan_policy="omit"``) and variances are not assumed to be equal.
        """
        _, p_value = stats.ttest_ind(
            c_array, v_array, equal_var=False, nan_policy="omit"
        )
        return p_value

    @staticmethod
    def split_dataset(
        df: pd.DataFrame, column: str, variant_a: str, variant_b: str
    ) -> tuple[np.ndarray, np.ndarray]:
        """Split ``column`` into two arrays, one per variant.

        Args:
            df: Frame with a ``variant`` column and the ``column`` to split.
            column: Name of the column whose values are returned.
            variant_a: Label of the first variant.
            variant_b: Label of the second variant.

        Returns:
            A tuple ``(values_a, values_b)`` of numpy arrays.
        """
        values_a = df.loc[df["variant"] == variant_a, column].to_numpy()
        values_b = df.loc[df["variant"] == variant_b, column].to_numpy()
        return values_a, values_b

    @staticmethod
    def detect_outliers(data: pd.Series, q=0.01):
        """Flag the values that fall outside the ``[q, 1 - q]`` quantile band.

        Args:
            data: Series to inspect.
            q: Quantile cut on each tail; must lie between 0 and 0.1.

        Returns:
            A boolean Series that is True where the value is out of bounds.
            Missing values are flagged as well, because ``Series.between``
            evaluates to False for them.

        Raises:
            TypeError: If ``q`` is outside ``[0, 0.1]``. The exception type is
                kept for backward compatibility with existing callers.
        """
        if q > 0.1 or q < 0:
            raise TypeError("q must be a quantile between 0 and 0.1")
        lower_limit = data.quantile(q)
        upper_limit = data.quantile(1 - q)
        is_outlier = ~data.between(lower_limit, upper_limit)
        return is_outlier

    @staticmethod
    def filter_outliers(df: pd.DataFrame, column: str, q=0.01):
        """Return ``df`` without the rows whose ``column`` value is an outlier.

        Outliers are the rows flagged by ``detect_outliers`` for the same ``q``.
        """
        outliers = TTestSignificance.detect_outliers(df[column], q)
        return df[~outliers]

    @staticmethod
    def filter_data_to_test(df: pd.DataFrame, test_name: str) -> pd.DataFrame:
        """Keep only the rows that belong to ``test_name``."""
        return df.loc[df["test_name"] == test_name]

    @staticmethod
    def get_target_group_list_from_test(df: pd.DataFrame) -> list[str]:
        """Return the distinct ``target_group`` values present in ``df``."""
        return df["target_group"].unique().tolist()

    @staticmethod
    def filter_data_to_treatment_group(df: pd.DataFrame, treatment: str):
        """Filter the frame according to the requested treatment level.

        Args:
            df: Frame with ``treatment`` and ``target_group`` columns.
            treatment: ``"All"`` returns the frame unchanged, ``"True"`` keeps
                the rows where ``treatment`` is True, and any other value is
                taken as a target group name and matched against
                ``target_group``.

        Raises:
            TypeError: If ``treatment`` names a target group that is not
                present in ``df``.
        """
        if treatment == "All":
            return df
        if treatment == "True":
            # Explicit comparison keeps the original semantics for non-bool dtypes.
            return df.loc[df["treatment"] == True]  # noqa: E712

        target_groups = TTestSignificance.get_target_group_list_from_test(df)
        if treatment not in target_groups:
            raise TypeError("The treatment group does not exist in the test")
        return df.loc[df["target_group"] == treatment]

    @staticmethod
    def explode_test_values(df: pd.DataFrame, current_metric) -> pd.DataFrame:
        """Explode the list-like ``current_metric`` column into one row per value.

        Only ``test_name``, ``variant`` and the metric are kept; the metric is
        cast to ``float64``. The exploded rows keep the index of the row they
        came from.
        """
        return (
            df[["test_name", "variant", current_metric]]
            .explode(current_metric)
            .astype({current_metric: "float64"})
        )

    @staticmethod
    def get_mean_and_count_by_variant(df: pd.DataFrame, metric: str) -> pd.DataFrame:
        """Return the mean and the non-null count of ``metric`` per test and variant.

        The result has the columns ``test_name``, ``variant``, ``mean`` and
        ``count``.
        """
        return df.groupby(["test_name", "variant"], as_index=False).agg(
            mean=(metric, "mean"),
            count=(metric, "count"),
        )

    @staticmethod
    def _variant_stat(summary: pd.DataFrame, variant: str, stat: str):
        """Read one statistic (``mean`` or ``count``) of ``variant`` from the summary."""
        return summary.loc[summary["variant"] == variant, stat].values[0]

    @staticmethod
    def run_ttest_on_single_metric(
        raw_df: pd.DataFrame,
        test_name: str,
        treament: str,
        metric: str,
        variant_a: str = "Control",
        variant_b: str = "Variation1",
    ) -> dict:
        """Run Welch's t-test on a single metric of a single test.

        The raw frame is filtered to ``test_name`` and to the treatment level,
        the metric values are exploded into long format, outliers outside the
        1%-99% quantile band are dropped, and the two variants are compared.

        Taking a single metric and starting from the raw frame every time is
        intentional: only one long-format frame exists at a time, which keeps
        memory usage low.

        Args:
            raw_df: Raw frame loaded from BigQuery. It needs the columns
                ``test_name``, ``variant``, ``treatment``, ``target_group`` and
                ``metric``, the latter holding list-like values.
            test_name: Test to analyse.
            treament: Treatment level: ``"All"``, ``"True"`` or a target group
                name (see ``filter_data_to_treatment_group``).
            metric: Name of the metric column.
            variant_a: Label of the baseline variant.
            variant_b: Label of the variant compared against the baseline.

        Returns:
            A dictionary with the test name, the variant labels, the metric
            label, the p-value, and the mean and value count of each variant.

        Raises:
            TypeError: If ``treament`` is an unknown target group.
            IndexError: If one of the variants has no rows left after filtering.
        """
        current_test_data = (
            raw_df
            .pipe(TTestSignificance.filter_data_to_test, test_name)
            .pipe(TTestSignificance.filter_data_to_treatment_group, treament)
            .pipe(TTestSignificance.explode_test_values, metric)
            .pipe(TTestSignificance.filter_outliers, metric, q=0.01)
        )

        summary = TTestSignificance.get_mean_and_count_by_variant(
            current_test_data, metric
        )
        values_a, values_b = TTestSignificance.split_dataset(
            current_test_data, metric, variant_a, variant_b
        )
        p_value = TTestSignificance.t_test(values_a, values_b)

        return {
            "test_name": test_name,
            "variant_a": variant_a,
            "variant_b": variant_b,
            "kpi_label": metric,
            # Only populated by the ANOVA flow; kept so both flows share one schema.
            "anova_p_value": None,
            "p_value": p_value,
            "mean_a": TTestSignificance._variant_stat(summary, variant_a, "mean"),
            "mean_b": TTestSignificance._variant_stat(summary, variant_b, "mean"),
            "values_count_a": TTestSignificance._variant_stat(summary, variant_a, "count"),
            "values_count_b": TTestSignificance._variant_stat(summary, variant_b, "count"),
            "statistical_method": "Welch's t-test",
            "treatment": treament,
        }


In [None]:

# Example inputs for the t-test flow.
test_name_filter = "AE_20230124_L_BC_P_PharmacyIncreaseMOV"
current_metric = "dps_delivery_fee_local"
treament = "All"

results = significance_runner.run_ttest_on_test_at_all_levels(test_pandas, test_name_filter)

# ANOVA

In [None]:
# Print the column names, dtypes and non-null counts of the loaded frame.
test_pandas.info()

In [None]:
# Distinct names of the tests that have more than two variants.
anova_tests = (
    test_pandas
    .loc[test_pandas["n_variants_in_test"] > 2, "test_name"]
    .unique()
    .tolist()
)

anova_tests

In [None]:
# Number of variants recorded on the first row of the selected test.
significance_runner.filter_data_to_test(test_pandas, test_name_filter)[
    "n_variants_in_test"
].iloc[0]

In [None]:
current_metric = "dps_delivery_fee_local"
treament = "True"

current_test_data = significance_runner.filter_dataframe_for_metric(
    test_pandas, test_name_filter, treament, current_metric
)

mean_and_counts = significance_runner.get_mean_and_count_by_variant(
    current_test_data, current_metric
)

# Omnibus test across all variants.
anova_result = pg.welch_anova(data=current_test_data, dv=current_metric, between="variant")

# Pairwise Games-Howell comparisons; a Bonferroni correction is applied on top of
# the p-values and only the pair labels plus the corrected p-value are kept.
post_hoc_result = (
    pg.pairwise_gameshowell(
        data=current_test_data,
        dv=current_metric,
        between="variant",
        effsize="hedges",
    )
    .assign(
        p_val_bonferroni=lambda pairs: pg.multicomp(
            pairs["pval"].to_numpy(), alpha=0.05, method="bonf"
        )[1]
    )
    .loc[:, ["A", "B", "p_val_bonferroni"]]
)

# Attach mean/count of both sides of every pair. The columns are selected BEFORE
# renaming because the merged frame already contains `variant_a` / `variant_b`
# columns (suffixes of the overlapping `variant` column).
result_dict = (
    post_hoc_result
    .merge(mean_and_counts, left_on="A", right_on="variant")
    .merge(mean_and_counts, left_on="B", right_on="variant", suffixes=("_a", "_b"))
    .loc[:, ["A", "B", "mean_a", "mean_b", "p_val_bonferroni", "count_a", "count_b"]]
    .rename(
        columns={
            "A": "variant_a",
            "B": "variant_b",
            "count_a": "values_count_a",
            "count_b": "values_count_b",
            "p_val_bonferroni": "p_value",
        }
    )
    .to_dict(orient="records")
)

# Add the fields shared by every pairwise record.
for single_dict in result_dict:
    single_dict.update(
        {
            "test_name": test_name_filter,
            "kpi_label": current_metric,
            "anova_p_value": anova_result["p-unc"].iloc[0],
            "statistical_method": "OneWay ANOVA",
            "treatment": treament,
        }
    )


In [None]:



# Show the pairwise records as a table (one row per compared pair).
pd.DataFrame(result_dict)

In [None]:
# `post_hoc_result` only keeps the "A", "B" and "p_val_bonferroni" columns, so the raw
# "pval" column is no longer available (accessing it raised AttributeError). The
# Bonferroni-corrected p-values were already computed with the same settings when
# `post_hoc_result` was built, so show those instead of correcting a second time.
post_hoc_result["p_val_bonferroni"].to_numpy()

In [None]:
# significance_runner.run_ttest_on_single_metric(
#     test_pandas,
#     test_name_filter,
#     treament,
#     current_metric
# )

# current_test_data = significance_runner.filter_dataframe_for_metric(
#     test_pandas,
#     test_name_filter,
#     treament,
#     current_metric
# )

# mean_and_count_vals = significance_runner.get_mean_and_count_by_variant(current_test_data, current_metric)

In [None]:
# mean_and_count_vals

In [None]:
# pg.welch_anova(data=df_anova, dv="delivery_fee_local", between="variant")
# pg.pairwise_gameshowell(data=df_anova, dv="delivery_fee_local", between="variant")

In [None]:
# Switch to another test and run the significance flow on it.
test_name_filter = "AE_20230217_R_DZ_O_UAQ -SBF"
results = significance_runner.run_significance_on_test_at_all_levels(test_pandas, test_name_filter)

# Test all methods

In [None]:
def flatten_list(list_of_lists):
    """Concatenate the items of every sub-iterable into a single flat list."""
    flat_items = []
    for sublist in list_of_lists:
        flat_items.extend(sublist)
    return flat_items

# Flatten the nested results and show them as one table.
pd.DataFrame(flatten_list(results))

# Full Run

In [13]:
# test_entity = "TB_OM"


# test_pandas = significance_runner.import_df(dataframe_type="Pandas")


In [40]:
# Show the value of the first KEY=VALUE line of the local .env file.
# The context manager closes the file handle (the bare open() call leaked it);
# splitting once keeps values that contain "=" intact, and strip() drops the
# trailing newline.
with open(".env") as env_file:
    env_value = env_file.readline().split("=", 1)[1].strip()
env_value

'/Users/s.lafaurie/.config/gcloud/application_default_credentials.json'

In [36]:

def run_significance_on_talabat(
    runner: SignificanceRunner,
    entities_list: list[str] = MENAEntities.return_talabat_entities()
) -> pd.DataFrame:
    """Run the significance flow on every entity and stack the results.

    Entities are processed one by one on a best-effort basis: when an entity
    fails, the error is logged with its traceback and the loop moves on.

    Args:
        runner: Runner used to import each entity's data and compute its
            significance results; its ``logger`` receives progress and errors.
        entities_list: Entity ids to process. The default is evaluated once,
            when the function is defined, and is never mutated here.

    Returns:
        The per-entity result frames concatenated with a fresh index, or an
        empty DataFrame when no entity produced a result.
    """
    results_dataframe_list = []
    for entity_i in tqdm.tqdm(entities_list, desc="Running All Talabat entities", position=0):
        try:
            current_entity_df = runner.import_df(entity_i)
            result_i = runner.run_significance_on_entity(current_entity_df, entity_i)
        except Exception:
            # Keep going with the remaining entities, but record the failure at
            # ERROR level with the traceback so it is not lost among INFO logs.
            runner.logger.exception(f"Error in {entity_i}")
            continue
        results_dataframe_list.append(result_i)
    runner.logger.info("Finished running significance on Talabat")

    if not results_dataframe_list:
        # pd.concat raises ValueError on an empty list; return an empty frame instead.
        runner.logger.warning("No significance results were produced for any entity")
        return pd.DataFrame()
    return pd.concat(results_dataframe_list, ignore_index=True)


def load_significance_results_to_biquery(
    runner: SignificanceRunner,
    results: pd.DataFrame
):
    """Upload ``results`` to the runner's output table through its connector.

    Args:
        runner: Provides the ``connector``, the ``output_table_id`` and the
            ``logger`` used to report progress.
        results: Frame to upload.
    """
    runner.logger.info("Loading results to BigQuery")
    runner.connector.load_table_to_bq_from_dataframe(
        table_id=runner.output_table_id, df=results
    )
    runner.logger.info("Finished loading results to BigQuery")

In [33]:
# connector.load_table_to_bq_from_dataframe(
#     table_id="_sl_dps_ab_test_significance_orders_results",
#     df=results
# )

In [37]:
# Resolve the gcloud Application Default Credentials file relative to the current
# user's home directory instead of hardcoding one developer's absolute path, so the
# notebook can be run from any account.
connector = Connector(
    json_credentials=os.path.expanduser(
        "~/.config/gcloud/application_default_credentials.json"
    )
)

# Runner for the full run: reads the orders table and writes to the results table.
significance_runner = SignificanceRunner(
    connector,
    input_table_id="_dps_ab_test_significance_orders",
    output_table_id="_sl_dps_ab_test_significance_orders_results",
)

# Compute the results for a single entity, then upload them.
results = run_significance_on_talabat(significance_runner, ["TB_QA"])

load_significance_results_to_biquery(significance_runner, results)



Running All Talabat entities:   0%|          | 0/1 [00:00<?, ?it/s]2023-04-26 08:44:37,343 - INFO - Query ran successfully
2023-04-26 08:44:37,343 - INFO - Query ran successfully
2023-04-26 08:44:37,343 - INFO - Query ran successfully
2023-04-26 08:44:37,343 - INFO - Query ran successfully
2023-04-26 08:44:37,343 - INFO - Query ran successfully
2023-04-26 08:44:37,343 - INFO - Query ran successfully


Job ID 6c4ecb72-5cfa-46ce-a55c-b300324ce124 successfully executed: 100%|[32m██████████[0m|
Downloading: 100%|[32m██████████[0m|


2023-04-26 08:44:39,842 - INFO - Max number of 45 iterations for QA_20230323_R_BJ_R_RamadanTOD
2023-04-26 08:44:39,842 - INFO - Max number of 45 iterations for QA_20230323_R_BJ_R_RamadanTOD
2023-04-26 08:44:39,842 - INFO - Max number of 45 iterations for QA_20230323_R_BJ_R_RamadanTOD
2023-04-26 08:44:39,842 - INFO - Max number of 45 iterations for QA_20230323_R_BJ_R_RamadanTOD
2023-04-26 08:44:39,842 - INFO - Max number of 45 iterations for QA_20230323_R_BJ_R_RamadanTOD
2023-04-26 08:44:39,842 - INFO - Max number of 45 iterations for QA_20230323_R_BJ_R_RamadanTOD

[A
[A
[A
[A
[A
[A
[A2023-04-26 08:44:44,748 - INFO - Max number of 81 iterations for QA_20230217_R_AB_O_PromoDFCampaign
2023-04-26 08:44:44,748 - INFO - Max number of 81 iterations for QA_20230217_R_AB_O_PromoDFCampaign
2023-04-26 08:44:44,748 - INFO - Max number of 81 iterations for QA_20230217_R_AB_O_PromoDFCampaign
2023-04-26 08:44:44,748 - INFO - Max number of 81 iterations for QA_20230217_R_AB_O_PromoDFCampaign
20

In [41]:
# Load one entity and run the significance flow on it; the result is displayed.
entity_id = "TB_KW"
test_pandas = significance_runner.import_df(entity_id=entity_id, dataframe_type="Pandas")
significance_runner.run_significance_on_entity(test_pandas, entity_id)

2023-04-26 09:52:50,792 - INFO - Query ran successfully
2023-04-26 09:52:50,792 - INFO - Query ran successfully
2023-04-26 09:52:50,792 - INFO - Query ran successfully
2023-04-26 09:52:50,792 - INFO - Query ran successfully
2023-04-26 09:52:50,792 - INFO - Query ran successfully
2023-04-26 09:52:50,792 - INFO - Query ran successfully


Job ID 98d9831a-eae0-4ac2-97b6-af89283efbf1 successfully executed: 100%|[32m██████████[0m|
Downloading: 100%|[32m██████████[0m|


2023-04-26 09:52:56,474 - INFO - Max number of 288 iterations for KW_20230217_R_BJ_O_DF_B/A _Test
2023-04-26 09:52:56,474 - INFO - Max number of 288 iterations for KW_20230217_R_BJ_O_DF_B/A _Test
2023-04-26 09:52:56,474 - INFO - Max number of 288 iterations for KW_20230217_R_BJ_O_DF_B/A _Test
2023-04-26 09:52:56,474 - INFO - Max number of 288 iterations for KW_20230217_R_BJ_O_DF_B/A _Test
2023-04-26 09:52:56,474 - INFO - Max number of 288 iterations for KW_20230217_R_BJ_O_DF_B/A _Test
2023-04-26 09:52:56,474 - INFO - Max number of 288 iterations for KW_20230217_R_BJ_O_DF_B/A _Test

[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A2023-04-26 09:53:12,761 - INFO - Max number of 162 iterations for KW_20230301_D_G_R_TmartBBDF_SalmiyaEgaila
2023-04-26 09:53:12,761 - INFO - Max number of 162 iterations for KW_20230301_D_G_R_TmartBBDF_SalmiyaEgaila
2023-04-26 09:53:12,761 - INFO - Max number of 162 iteration

Unnamed: 0,variant_a,variant_b,kpi_label,anova_p_value,p_value,mean_a,mean_b,values_count_a,values_count_b,test_name,treatment,statistical_method,status,country_code
0,Control,Variation1,delivery_fee_local,,3.473653e-35,0.452033,0.461719,628868,623753,KW_20230217_R_BJ_O_DF_B/A _Test,All,Welch's T-Test,running,kw
1,Control,Variation1,gfv_local,,9.350504e-02,5.664775,5.676207,616383,611294,KW_20230217_R_BJ_O_DF_B/A _Test,All,Welch's T-Test,running,kw
2,Control,Variation1,travel_time,,3.750834e-07,8.488101,8.556044,361770,357012,KW_20230217_R_BJ_O_DF_B/A _Test,All,Welch's T-Test,running,kw
3,Control,Variation1,delivery_distance,,4.406217e-08,4.385129,4.423350,361571,356752,KW_20230217_R_BJ_O_DF_B/A _Test,All,Welch's T-Test,running,kw
4,Control,Variation1,fleet_delay,,4.850912e-01,9.346675,9.349819,582273,576752,KW_20230217_R_BJ_O_DF_B/A _Test,All,Welch's T-Test,running,kw
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1930,Control,Variation5,commission_local,0.347521,1.000000e+00,0.265039,0.266725,2310,2061,KW_20230217_R_BJ_O_DF_Reduction_Area_Pricing,Target Group 9,OneWay ANOVA,ended,kw
1931,Variation1,Variation5,commission_local,0.347521,1.000000e+00,0.278291,0.266725,1966,2061,KW_20230217_R_BJ_O_DF_Reduction_Area_Pricing,Target Group 9,OneWay ANOVA,ended,kw
1932,Variation2,Variation5,commission_local,0.347521,1.000000e+00,0.271754,0.266725,2047,2061,KW_20230217_R_BJ_O_DF_Reduction_Area_Pricing,Target Group 9,OneWay ANOVA,ended,kw
1933,Variation3,Variation5,commission_local,0.347521,1.000000e+00,0.251083,0.266725,2411,2061,KW_20230217_R_BJ_O_DF_Reduction_Area_Pricing,Target Group 9,OneWay ANOVA,ended,kw


In [None]:
test_name = "OM_20230322_L_JB_P_RamadanNFV"
treatment = "True"
metric = "delivery_costs_local"

# Same preparation steps as the significance flow, applied one at a time.
test_data = significance_runner.filter_data_to_test(test_pandas, test_name)
test_data = significance_runner.filter_data_to_treatment_group(test_data, treatment)
test_data = significance_runner.explode_test_values(test_data, metric)
# Outlier filtering is skipped here (kept for reference):
# test_data = significance_runner.filter_outliers(test_data, metric, q=0.01)

In [None]:
# Sum of the outlier flags for the metric, i.e. the number of flagged values
# (presumably a boolean mask, as in TTestSignificance.detect_outliers; confirm).
significance_runner.detect_outliers(test_data[metric]).sum()

In [None]:
# NOTE(review): `.sum()` is applied to the whole filtered result, not to a mask. If
# this returns a DataFrame (as TTestSignificance.filter_outliers does), every column
# is summed, including the text ones. If the goal is to count the rows that remain,
# `len(...)` is probably what is wanted; confirm.
significance_runner.filter_outliers(test_data, metric).sum()

In [None]:
# Export the results with the p-value rendered as a 3-decimal string.
# Optional filter used while debugging:
# .query('test_name=="OM_20230323_R_BJ_R_RamadanTODFood"')
# NOTE(review): `test_entity` was set to "OP_SE" near the top of the notebook, while
# `results` was last produced by the "TB_QA" run; confirm the file name is the
# intended one before sharing the export.
results.assign(
    p_value=results["p_value"].map("{:.3f}".format)
).to_excel(f"{test_entity}_significance_results_20230425.xlsx", index=False)

# checks
- % of errors / total significances run
- where do the errors come from
- ~~expected total number of significances makes sense~~
- ~~run anova/t-tests using calculator to confirm if values makes sense~~
- Research the differences between Significance and DPS AB Test dash (values in significance are higher, why?)