In [6]:
import os
import numpy as np
import pandas as pd

# Adjust working directory so that Python sees 'src' as a package
if not os.path.exists('src'):
    os.chdir("..")
    
from src.ab_testing import ABTest

In [20]:
cleaned_df = pd.read_csv('data/user_agg_cleaned.csv')

  cleaned_df = pd.read_csv('data/user_agg_cleaned.csv')


In [21]:
cleaned_df.head()

Unnamed: 0,fullVisitorId,visitId,visitNumber,date,pageviews,timeOnSite,transactions,totalTransactionRevenue,trafficSource,trafficMedium,trafficCampaign,country,city
0,6241439595091955523,1472141622,1,2016-08-25,1.0,0.0,0.0,0.0,google,organic,NotSet,Canada,NotSet
1,9613025776425213347,1472118423,1,2016-08-25,1.0,0.0,0.0,0.0,Others,organic,NotSet,China,NotSet
2,633905821044866910,1472114673,1,2016-08-25,1.0,0.0,0.0,0.0,google,organic,NotSet,Spain,NotSet
3,479739850734519152,1472193794,1,2016-08-25,1.0,0.0,0.0,0.0,google,organic,NotSet,Hungary,NotSet
4,5518305200549853272,1472122087,1,2016-08-25,1.0,0.0,0.0,0.0,google,organic,NotSet,Ukraine,NotSet


In [22]:
cleaned_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 903653 entries, 0 to 903652
Data columns (total 13 columns):
 #   Column                   Non-Null Count   Dtype  
---  ------                   --------------   -----  
 0   fullVisitorId            903653 non-null  object 
 1   visitId                  903653 non-null  int64  
 2   visitNumber              903653 non-null  int64  
 3   date                     903653 non-null  object 
 4   pageviews                903653 non-null  float64
 5   timeOnSite               903653 non-null  float64
 6   transactions             903653 non-null  float64
 7   totalTransactionRevenue  903653 non-null  float64
 8   trafficSource            903653 non-null  object 
 9   trafficMedium            903653 non-null  object 
 10  trafficCampaign          903653 non-null  object 
 11  country                  903653 non-null  object 
 12  city                     903653 non-null  object 
dtypes: float64(4), int64(2), object(7)
memory usage: 89.6+ MB


In [16]:
ab = ABTest(
    df=cleaned_df,
    control_filter={"trafficSource": "google"},
    test_filter={"trafficSource": "Others"}
)

In [17]:
result = ab.run_test(
    column="totalTransactionRevenue",
    test_type="t_test",
    transform="winsor",
    zero_inflation=True
)
result

{'test_type': 't_test',
 'column': 'totalTransactionRevenue',
 'transform': 'winsor',
 'zero_inflation': True,
 'alpha': 0.05,
 'zero_test': {'control_zero_rate': 0.9903265915119364,
  'test_zero_rate': 0.99020101052079,
  'chi2_statistic': np.float64(0.05074048615993102),
  'p_value': np.float64(0.8217797460069721),
  'table': [[238946, 2334], [45271, 448]]},
 'main_test': {'sample_sizes': (2334, 448),
  'test_statistic': np.float64(-9.950384748810551),
  'p_value': np.float64(6.047905473131331e-23),
  'control_mean': 89.28247879177377,
  'test_mean': 159.34348883928573}}

In [19]:
back = pd.read_csv('data/cleaned_sessions.csv')
print("Number of unique fullVisitorId:", back["fullVisitorId"].nunique())
print("Number of rows:", len(back))

  back = pd.read_csv('data/cleaned_sessions.csv')


Number of unique fullVisitorId: 735539
Number of rows: 903653


In [24]:
from src.hypothesis_recommendation import run_recommendation_test
from src.ab_test_reporting import interpret_ab_results

user_df = pd.read_csv("data/user_agg_cleaned.csv")  # Already aggregated at user level
result = run_recommendation_test(
    user_df,
    user_id_col="fullVisitorId",
    metric_col="totalTransactionRevenue",
    test_type="t_test",
    transform="log",          # e.g., log transform if skewed
    zero_inflation=True
)

print(interpret_ab_results(result))


  user_df = pd.read_csv("data/user_agg_cleaned.csv")  # Already aggregated at user level


A/B Test on 'totalTransactionRevenue' using 't_test' test.
Applied transform='log', zero_inflation=True. (alpha=0.05)
Zero-proportion test: control_zero_rate=98.71%, test_zero_rate=98.74%, p-value=0.242
Control Mean=4.21, Test Mean=4.20, p-value=0.6106
No statistically significant difference detected.


In [25]:
from src.hypothesis_pricing import run_pricing_test
from src.ab_test_reporting import interpret_ab_results

user_df = pd.read_csv("data/user_agg_cleaned.csv")
res = run_pricing_test(
    user_df,
    threshold=200.0,
    metric_col="transactions",
    test_type="mannwhitney",  # robust for skew
    transform="none",
    zero_inflation=True
)

print(interpret_ab_results(res))


  user_df = pd.read_csv("data/user_agg_cleaned.csv")


A/B Test on 'transactions' using 'mannwhitney' test.
Applied transform='none', zero_inflation=True. (alpha=0.05)
Zero-proportion test: control_zero_rate=98.91%, test_zero_rate=0.00%, p-value=0.000
Control Median=1.00, Test Median=1.00, p-value=0.0000
Statistically significant difference.


In [27]:
import pandas as pd
from src.hypothesis_recommendation import run_recommendation_test
from src.hypothesis_pricing import run_pricing_test
from src.hypothesis_cross_selling import run_cross_sell_test
from src.ab_test_reporting import interpret_ab_results

# 1. Load user-level data
user_df = pd.read_csv("data/user_agg_cleaned.csv")

# 2. Product Recommendation Hypothesis
res_reco = run_recommendation_test(
    user_df,
    metric_col="totalTransactionRevenue",
    test_type="t_test",
    transform="log",
    zero_inflation=True
)
print("== Product Recommendation Hypothesis ==")
print(interpret_ab_results(res_reco))

# 3. Dynamic Pricing Hypothesis
res_price = run_pricing_test(
    user_df,
    threshold=300.0,     # maybe define a different threshold
    metric_col="transactions",
    test_type="mannwhitney",
    transform="none",
    zero_inflation=True
)
print("\n== Dynamic Pricing Hypothesis ==")
print(interpret_ab_results(res_price))



  user_df = pd.read_csv("data/user_agg_cleaned.csv")


== Product Recommendation Hypothesis ==
A/B Test on 'totalTransactionRevenue' using 't_test' test.
Applied transform='log', zero_inflation=True. (alpha=0.05)
Zero-proportion test: control_zero_rate=98.71%, test_zero_rate=98.74%, p-value=0.242
Control Mean=4.21, Test Mean=4.20, p-value=0.6106
No statistically significant difference detected.

== Dynamic Pricing Hypothesis ==
A/B Test on 'transactions' using 'mannwhitney' test.
Applied transform='none', zero_inflation=True. (alpha=0.05)
Zero-proportion test: control_zero_rate=98.85%, test_zero_rate=0.00%, p-value=0.000
Control Median=1.00, Test Median=1.00, p-value=0.0000
Statistically significant difference.
