# README

##### Where to download data: https://github.com/patiegm/Udacity_Data_Analysis_Nanodegree/tree/master/Analyze%20AB%20Test%20Results

# Context

In [1]:
from scipy import stats
from scipy.stats import t
from scipy.stats import norm
import pandas as pd
import statsmodels.api as sm
from statsmodels.formula.api import ols
from statsmodels.stats.anova import anova_lm
import numpy as np

# Load data

In [2]:
# Raw experiment log; the displayed columns are user_id, timestamp, group,
# landing_page and converted.
data = pd.read_csv('ab_data.csv')
data

Unnamed: 0,user_id,timestamp,group,landing_page,converted
0,851104,2017-01-21 22:11:48.556739,control,old_page,0
1,804228,2017-01-12 08:01:45.159739,control,old_page,0
2,661590,2017-01-11 16:55:06.154213,treatment,new_page,0
3,853541,2017-01-08 18:28:03.143765,treatment,new_page,0
4,864975,2017-01-21 01:52:26.210827,control,old_page,1
...,...,...,...,...,...
294473,751197,2017-01-03 22:28:38.630509,control,old_page,0
294474,945152,2017-01-12 00:51:57.078372,control,old_page,0
294475,734608,2017-01-22 11:45:03.439544,control,old_page,0
294476,697314,2017-01-15 01:20:28.957438,control,old_page,0


In [3]:
# Lookup table mapping user_id to country.
country = pd.read_csv('countries.csv')
country

Unnamed: 0,user_id,country
0,834778,UK
1,928468,US
2,822059,UK
3,711597,UK
4,710616,UK
...,...,...
290579,653118,US
290580,878226,UK
290581,799368,UK
290582,655535,CA


In [4]:
# Attach each user's country to the event log. The inner join keeps only users
# that appear in both files. validate='many_to_one' makes pandas raise a
# MergeError if countries.csv ever lists a user_id more than once, instead of
# silently duplicating event rows.
# NOTE: this cell rebinds `data`; re-running it without reloading the CSVs
# would join the country column a second time.
data = pd.merge(data, country, on='user_id', how='inner', validate='many_to_one')
data

Unnamed: 0,user_id,timestamp,group,landing_page,converted,country
0,851104,2017-01-21 22:11:48.556739,control,old_page,0,US
1,804228,2017-01-12 08:01:45.159739,control,old_page,0,US
2,661590,2017-01-11 16:55:06.154213,treatment,new_page,0,US
3,853541,2017-01-08 18:28:03.143765,treatment,new_page,0,US
4,864975,2017-01-21 01:52:26.210827,control,old_page,1,US
...,...,...,...,...,...,...
294473,751197,2017-01-03 22:28:38.630509,control,old_page,0,US
294474,945152,2017-01-12 00:51:57.078372,control,old_page,0,US
294475,734608,2017-01-22 11:45:03.439544,control,old_page,0,US
294476,697314,2017-01-15 01:20:28.957438,control,old_page,0,US


# Clean data

### remove users with mixed group

In [5]:
def remove_mixed_assignment(df):
    """Drop every row of users that appear under more than one experiment group.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns user_id, country, timestamp, converted,
        landing_page and group.

    Returns
    -------
    pandas.DataFrame
        Rows of users with exactly one distinct ``group`` value, restricted to
        the columns user_id, country, timestamp, converted, landing_page, group
        (in that order). The index holds each row's position in ``df``.
    """
    output_columns = ['user_id', 'country', 'timestamp', 'converted', 'landing_page', 'group']

    # Positional index, so the labels of the result do not depend on df's index.
    rows = df.reset_index(drop=True)

    # Number of distinct groups each row's user was seen in.
    groups_per_user = rows.groupby('user_id')['group'].transform('nunique')

    return rows.loc[groups_per_user == 1, output_columns]

In [6]:
# Keep only users that were seen in a single experiment group.
data1 = remove_mixed_assignment(data)
data1

Unnamed: 0,user_id,country,timestamp,converted,landing_page,group
0,851104,US,2017-01-21 22:11:48.556739,0,old_page,control
1,804228,US,2017-01-12 08:01:45.159739,0,old_page,control
2,661590,US,2017-01-11 16:55:06.154213,0,new_page,treatment
3,853541,US,2017-01-08 18:28:03.143765,0,new_page,treatment
4,864975,US,2017-01-21 01:52:26.210827,1,old_page,control
...,...,...,...,...,...,...
294473,751197,US,2017-01-03 22:28:38.630509,0,old_page,control
294474,945152,US,2017-01-12 00:51:57.078372,0,old_page,control
294475,734608,US,2017-01-22 11:45:03.439544,0,old_page,control
294476,697314,US,2017-01-15 01:20:28.957438,0,old_page,control


In [7]:
def remove_exposure_bugs(df):
    """Drop every row of users who were ever shown the wrong page for their group.

    A row is a mismatch when a control user saw ``new_page`` or a treatment
    user saw ``old_page``. All rows of such a user are removed, not only the
    mismatching one.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns user_id, country, timestamp, converted,
        landing_page and group.

    Returns
    -------
    pandas.DataFrame
        The remaining rows with the columns user_id, country, timestamp,
        converted, landing_page, group (in that order).
    """
    in_control = df['group'] == 'control'
    in_treatment = df['group'] == 'treatment'
    saw_new_page = df['landing_page'] == 'new_page'
    saw_old_page = df['landing_page'] == 'old_page'

    # One row per mismatching event: control users on the new page first,
    # then treatment users on the old page.
    mismatches = pd.concat([
        df.loc[in_control & saw_new_page, ['user_id', 'group']],
        df.loc[in_treatment & saw_old_page, ['user_id', 'group']],
    ])

    # Left join: rows of unaffected users get no partner, so their group_y is NaN.
    flagged = df.merge(mismatches, on='user_id', how='left')
    unaffected = flagged['group_y'].isna()

    kept = flagged.loc[
        unaffected,
        ['user_id', 'country', 'timestamp', 'converted', 'landing_page', 'group_x'],
    ]
    return kept.rename(columns={'group_x': 'group'})
    

In [8]:
# Drop users who were ever served the page that does not match their group.
data2 = remove_exposure_bugs(data1)
data2

Unnamed: 0,user_id,country,timestamp,converted,landing_page,group
0,851104,US,2017-01-21 22:11:48.556739,0,old_page,control
1,804228,US,2017-01-12 08:01:45.159739,0,old_page,control
2,661590,US,2017-01-11 16:55:06.154213,0,new_page,treatment
3,853541,US,2017-01-08 18:28:03.143765,0,new_page,treatment
4,864975,US,2017-01-21 01:52:26.210827,1,old_page,control
...,...,...,...,...,...,...
290683,751197,US,2017-01-03 22:28:38.630509,0,old_page,control
290684,945152,US,2017-01-12 00:51:57.078372,0,old_page,control
290685,734608,US,2017-01-22 11:45:03.439544,0,old_page,control
290686,697314,US,2017-01-15 01:20:28.957438,0,old_page,control


In [9]:
def consoliadte_exposure_bugs(df):
    """Collapse the event log to one row per (user_id, country, group, landing_page).

    The (misspelled) function name is kept so existing callers keep working.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns user_id, country, group, landing_page,
        timestamp and converted.

    Returns
    -------
    pandas.DataFrame
        One row per key combination with the columns

        * ``min`` / ``max``   - earliest / latest ``timestamp`` in the group
        * ``count``           - number of non-null ``converted`` values
        * ``sum``             - sum of ``converted``
        * ``converted``       - 1 if ``sum`` > 0, else 0
        * ``conversion_rate`` - ``sum / count``
    """
    keys = ['user_id', 'country', 'group', 'landing_page']

    # Named aggregation yields flat column names directly, so no MultiIndex
    # level has to be dropped afterwards.
    summary = (
        df.groupby(keys)
        .agg(
            min=('timestamp', 'min'),
            max=('timestamp', 'max'),
            count=('converted', 'count'),
            sum=('converted', 'sum'),
        )
        .reset_index()
    )

    # Vectorized flag instead of a Python-level call per row.
    summary['converted'] = (summary['sum'] > 0).astype(int)
    summary['conversion_rate'] = 1.0 * summary['sum'] / summary['count']
    return summary
    

In [10]:
# Aggregate to one row per (user_id, country, group, landing_page).
data3 = consoliadte_exposure_bugs(data2)

In [11]:
# Inspect the aggregated frame that the tests and regressions below consume.
data3

Unnamed: 0,user_id,country,group,landing_page,min,max,count,sum,converted,conversion_rate
0,630000,US,treatment,new_page,2017-01-19 06:26:06.548941,2017-01-19 06:26:06.548941,1,0,0,0.0
1,630001,US,treatment,new_page,2017-01-16 03:16:42.560309,2017-01-16 03:16:42.560309,1,1,1,1.0
2,630002,US,control,old_page,2017-01-19 19:20:56.438330,2017-01-19 19:20:56.438330,1,0,0,0.0
3,630003,US,treatment,new_page,2017-01-12 10:09:31.510471,2017-01-12 10:09:31.510471,1,0,0,0.0
4,630004,US,treatment,new_page,2017-01-18 20:23:58.824994,2017-01-18 20:23:58.824994,1,0,0,0.0
...,...,...,...,...,...,...,...,...,...,...
286686,945994,UK,control,old_page,2017-01-03 14:41:21.565258,2017-01-03 14:41:21.565258,1,0,0,0.0
286687,945996,US,treatment,new_page,2017-01-09 18:58:19.952277,2017-01-09 18:58:19.952277,1,0,0,0.0
286688,945997,US,control,old_page,2017-01-04 06:56:24.658147,2017-01-04 06:56:24.658147,1,0,0,0.0
286689,945998,CA,control,old_page,2017-01-16 07:08:02.207969,2017-01-16 07:08:02.207969,1,0,0,0.0


# T test

In [12]:
def perform_t_test(df, alpha=0.05):
    """Welch's two-sample t-test on the conversion rate of treatment vs. control.

    Prints four numbers, one per line: the difference in conversion rate
    (treatment - control), the two-sided p-value, and the lower and upper
    bound of the ``1 - alpha`` confidence interval for the difference.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``group`` (rows labelled 'treatment' / 'control' are
        used), ``user_id`` and a 0/1 ``converted`` column.
    alpha : float, default 0.05
        Significance level of the confidence interval; the default gives the
        95% interval (0.975 quantile).

    Raises
    ------
    ValueError
        If ``alpha`` is not strictly between 0 and 1, if either group has
        fewer than two rows (the Welch degrees of freedom are undefined), or
        if the standard error is zero (no variation in either group).
    """
    if not 0 < alpha < 1:
        raise ValueError('alpha must be strictly between 0 and 1')

    treatment = df[df['group'] == 'treatment']
    control = df[df['group'] == 'control']

    n_treatment = treatment['user_id'].count()  # number of users in treatment
    n_control = control['user_id'].count()  # number of users in control
    if n_treatment < 2 or n_control < 2:
        raise ValueError('each group needs at least two users for a Welch t-test')

    # conversion rate of each group
    p_treatment = 1.0*treatment['converted'].sum()/n_treatment
    p_control = 1.0*control['converted'].sum()/n_control

    # Bernoulli variance p(1-p) of each group
    var_treatment = p_treatment*(1-p_treatment)
    var_control = p_control*(1-p_control)

    # difference in conversion rate
    p_delta = p_treatment - p_control

    # standard error of the difference; the variances are NOT pooled
    se = np.sqrt(var_treatment/n_treatment + var_control/n_control)
    if se == 0:
        raise ValueError('standard error is zero: no variation in either group')

    # t statistic
    t_stat = p_delta/se

    # Welch-Satterthwaite degrees of freedom: equal variances are not assumed,
    # so the pooled-variance value n1+n2-2 is not used.
    dof = (var_treatment/n_treatment+var_control/n_control)**2/(var_treatment**2/(n_treatment**2*(n_treatment-1))+var_control**2/(n_control**2*(n_control-1)))

    # two-tailed p value
    pvalue = 2*t.cdf(-abs(t_stat),dof)

    # bounds of the (1 - alpha) confidence interval
    margin = t.ppf(1 - alpha/2,dof)*se
    lob = p_delta - margin
    hib = p_delta + margin

    print(p_delta)
    print(pvalue)
    print(lob)
    print(hib)
    

### Key formulas used in Welch's t-test for proportions

1. **Conversion rate (sample proportion)**  
   $$
   \hat{p}_{treatment} = \frac{\text{conversions in treatment}}{n_{treatment}}, 
   \quad 
   \hat{p}_{control} = \frac{\text{conversions in control}}{n_{control}}
   $$

2. **Sample variance (Bernoulli variance)**  
   $$
   \hat{Var}_{treatment} = \hat{p}_{treatment} \left(1 - \hat{p}_{treatment}\right), 
   \quad
   \hat{Var}_{control} = \hat{p}_{control} \left(1 - \hat{p}_{control}\right)
   $$

3. **Difference in proportions**  
   $$
   \Delta p = \hat{p}_{treatment} - \hat{p}_{control}
   $$

4. **Standard error of the difference (unpooled)**  
   $$
   SE = \sqrt{\frac{\hat{Var}_{treatment}}{n_{treatment}} + \frac{\hat{Var}_{control}}{n_{control}}}
   $$

5. **t-statistic**  
   $$
   t = \frac{\Delta p}{SE}
   $$

6. **Welch–Satterthwaite degrees of freedom**  
   $$
   \nu = 
   \frac{\left(\frac{\hat{Var}_{treatment}}{n_{treatment}} + \frac{\hat{Var}_{control}}{n_{control}}\right)^2}
   {\frac{\left(\frac{\hat{Var}_{treatment}}{n_{treatment}}\right)^2}{n_{treatment}-1} + \frac{\left(\frac{\hat{Var}_{control}}{n_{control}}\right)^2}{n_{control}-1}}
   $$

7. **Two-tailed p-value**  
   $$
   p = 2 \cdot P\big(T \leq -|t|\big), 
   \quad T \sim t(\nu)
   $$

8. **95% Confidence Interval (CI)**  
   $$
   CI = \Delta p \; \pm \; t_{0.975, \nu} \cdot SE
   $$


In [13]:
# Prints, in order: difference in conversion rate, p-value, CI lower bound, CI upper bound.
perform_t_test(data3)

-0.0014478458686042056
0.23202039224946788
-0.0038221778568021604
0.0009264861195937492


##### The 95% confidence interval contains 0, so the difference is not statistically significant

# Z test

In [14]:
def perform_z_test(df, alpha=0.05):
    """Two-proportion z-test (pooled variance) of treatment vs. control conversion.

    Prints four numbers, one per line: the difference in conversion rate
    (treatment - control), the two-sided p-value, and the lower and upper
    bound of the ``1 - alpha`` confidence interval for the difference.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``group`` (rows labelled 'treatment' / 'control' are
        used), ``user_id`` and a 0/1 ``converted`` column.
    alpha : float, default 0.05
        Significance level of the confidence interval; the default gives the
        95% interval (0.975 quantile).

    Raises
    ------
    ValueError
        If ``alpha`` is not strictly between 0 and 1, if either group is
        empty, or if the pooled standard error is zero (nobody or everybody
        converted).
    """
    if not 0 < alpha < 1:
        raise ValueError('alpha must be strictly between 0 and 1')

    treatment = df[df['group'] == 'treatment']
    control = df[df['group'] == 'control']

    # sample size
    n_treatment = treatment['user_id'].count()
    n_control = control['user_id'].count()
    if n_treatment < 1 or n_control < 1:
        raise ValueError('both groups need at least one user')

    # number of conversions per group
    x_treatment = treatment['converted'].sum()
    x_control = control['converted'].sum()

    # per-group proportions and their difference
    p_tre = 1.0*x_treatment/n_treatment
    p_ctr = 1.0*x_control/n_control
    p_delta = p_tre - p_ctr

    # pooled proportion over both groups (this is not the p-value)
    p = 1.0*(x_treatment+x_control)/(n_treatment+n_control)

    # a common variance is assumed for both groups
    var = p*(1-p)
    se = np.sqrt(var*(1/n_treatment+1/n_control))
    if se == 0:
        raise ValueError('standard error is zero: no variation in the pooled sample')
    z = p_delta/se

    # two-tailed p value from the standard normal distribution
    p_value = 2*norm.cdf(-abs(z))

    # NOTE: the interval reuses the pooled standard error of the test statistic.
    margin = norm.ppf(1 - alpha/2)*se
    lob = p_delta - margin
    hib = p_delta + margin

    print(p_delta)
    print(p_value)
    print(lob)
    print(hib)
    
    

### Standard Error formulas: Z-test vs T-test

1. **Welch’s T-test (no equal variance assumption)**  

The standard error is computed **separately** for each group:

$$
SE_{T} = \sqrt{\frac{\hat{p}_{treatment}(1 - \hat{p}_{treatment})}{n_{treatment}} \;+\; \frac{\hat{p}_{control}(1 - \hat{p}_{control})}{n_{control}}}
$$

- Does **not assume equal variance**.  
- Uses group-specific variances (Bernoulli variance for each proportion).  
- More conservative, especially for unequal sample sizes or variances.  

---

2. **Two-proportion Z-test (pooled variance assumption)**  

The standard error is based on the **pooled proportion** $\hat{p}$:

$$
\hat{p} = \frac{x_{treatment} + x_{control}}{n_{treatment} + n_{control}}
$$

$$
SE_{Z} = \sqrt{\hat{p}(1 - \hat{p}) \left(\frac{1}{n_{treatment}} + \frac{1}{n_{control}}\right)}
$$

- Assumes both groups have the **same underlying variance**.  
- Common in large-sample settings.  

---


✅ **Summary**:  
- Z-test → pooled estimate of variance, valid when variances are equal and $n$ is large.  
- T-test (Welch) → separate variance estimates, safer when variances differ or $n$ is small.  


In [15]:
# Prints, in order: difference in conversion rate, p-value, CI lower bound, CI upper bound.
perform_z_test(data3)

-0.0014478458686042056
0.232019670931933
-0.003822169192095711
0.0009264774548872999


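##### Z-test (perform_z_test)

- Method: two-proportion test for large samples.
- Difference from the t-test: uses the normal distribution instead of the t distribution.
- Result: essentially identical to the t-test, which indicates that the sample is large enough.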



# Linear regression

In [20]:
# Linear probability model: OLS of the 0/1 `converted` flag on experiment group
# and country, both entered as categorical factors via C(...).
formula = 'converted ~ C(group)+C(country)'
lm1 = ols(formula,data3).fit()
print(lm1.summary())

                            OLS Regression Results                            
Dep. Variable:              converted   R-squared:                       0.000
Model:                            OLS   Adj. R-squared:                  0.000
Method:                 Least Squares   F-statistic:                     1.350
Date:                Wed, 10 Sep 2025   Prob (F-statistic):              0.256
Time:                        22:30:35   Log-Likelihood:                -83970.
No. Observations:              286691   AIC:                         1.679e+05
Df Residuals:                  286687   BIC:                         1.680e+05
Df Model:                           3                                         
Covariance Type:            nonrobust                                         
                            coef    std err          t      P>|t|      [0.025      0.975]
-----------------------------------------------------------------------------------------
Intercept                 0.11

# Heterogeneous Treatment Effect

In [18]:
# Same model plus the group x country interaction.
# In the formula language `a*b` already expands to `a + b + a:b`, so the main
# effects listed explicitly here are repeated by the `*` term; the interaction
# itself is the `C(group):C(country)` part.
formula = 'converted ~ C(group)+C(country)+C(group)*C(country)'
lm2 = ols(formula,data3).fit()
print(lm2.summary())

                            OLS Regression Results                            
Dep. Variable:              converted   R-squared:                       0.000
Model:                            OLS   Adj. R-squared:                  0.000
Method:                 Least Squares   F-statistic:                     1.323
Date:                Wed, 10 Sep 2025   Prob (F-statistic):              0.251
Time:                        20:57:41   Log-Likelihood:                -83969.
No. Observations:              286691   AIC:                         1.680e+05
Df Residuals:                  286685   BIC:                         1.680e+05
Df Model:                           5                                         
Covariance Type:            nonrobust                                         
                                             coef    std err          t      P>|t|      [0.025      0.975]
-------------------------------------------------------------------------------------------------------

Heterogeneous Treatment Effect Analysis
Objective: To determine if the new page's effect differs across user countries.

Model: converted ~ C(group) + C(country) + C(group)*C(country)

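Meaning of Interaction Term: In the formula, C(group)*C(country) expands to the two main effects plus the interaction C(group):C(country). The interaction coefficients test whether the impact of the new page (the treatment) varies depending on the user's country.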

Practical Value:

Identifying Benefiting Segments: This helps identify which user groups respond most positively to the new page.

Informing Personalization Strategy: The results can guide tailored strategies for different user segments.

Result: The interaction terms were not statistically significant, so there is no evidence that the treatment effect differs across countries.

In [21]:
# F-test of the nested models: lm1 (main effects only) against lm2 (adds the
# group x country interaction terms).
anovaResults = anova_lm(lm1,lm2)
print(anovaResults)

   df_resid           ssr  df_diff   ss_diff        F    Pr(>F)
0  286687.0  30154.037295      0.0       NaN      NaN       NaN
1  286685.0  30153.767849      2.0  0.269446  1.28087  0.277797


Hypothesis Testing: Comparing Nested Linear Models

Objective: To determine if the interaction terms are necessary.

Comparison: We compare a complex model with interaction terms against a simple model without interaction terms.

Result: The F-test p-value is 0.278, which is greater than the significance level of 0.05. This indicates that the interaction terms are not statistically significant.

Conclusion: We fail to reject the null hypothesis. Therefore, the simple model is preferred as it is not significantly outperformed by the more complex model.

# Overall Conclusion


Key Findings:

The new page showed a 0.14 percentage-point decrease in conversion rate compared to the old page (absolute difference of about -0.0014).

The difference was not statistically significant (p-value = 0.232 > 0.05).

Recommendation: There is not enough evidence to support the new page being better. It is advised to either keep the old page or continue with further optimization.

Key Considerations
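Sample Size: With over 280,000 users, the sample is large enough to estimate the difference precisely: the 95% confidence interval spans only about -0.38 to +0.09 percentage points.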

Experimental Design: The use of random assignment helped to control for confounding variables.

Data Quality: Data reliability was ensured through a thorough cleaning process.

Multi-faceted Validation: The results were consistent across various tests, including T-tests, Z-tests, and regression analysis.

Practical Significance: Although the difference was not statistically significant, an effect of -0.14 percentage points could still be practically significant from a business perspective.