## This file runs over dataset (PBI input as pandas dataframe )

### Below cells for standalone testing

In [130]:
import pandas as pd
import numpy as np

from scipy import stats

In [131]:
# script to download file from PBI
# df.to_csv(r"C:\Users\v-suljai\Desktop\Python Scripts\t.csv", index=False)

In [132]:
data = pd.read_csv("AdobeData6.csv")
data.head()

Unnamed: 0,Control_ExpID,Cart_Adds,Cart_Removal,Unique_Visitors,Market (v27),Orders,Checkouts,Treatment_ExpID
0,EX:sfwaaa,270187,78374,9349007,en-us,215055,156015,EX:sfwaaa
1,EX:sfwaab,270060,77973,9348316,en-us,215730,156562,EX:sfwaab
2,EX:20199642t1,259739,72622,9122288,en-us,213711,154251,EX:20199642t1
3,EX:20199642C,259385,71999,9124103,en-us,213264,154598,EX:20199642C
4,EX:19844185c,249609,70166,8734133,en-us,206181,148817,EX:19844185c


In [133]:
## main function to run Ttest

def run_ttest(uu_control, success_control, uu_treatment, success_treatment, alpha):
    
    print ("Control Data:", uu_control, success_control)
    print ("Treatment Data:", uu_treatment, success_treatment)
    
    # Calculate conversion rate
    cvr_control =0
    cvr_treatment =0
    if uu_control != 0:
        cvr_control = success_control/uu_control
    if uu_treatment != 0:
        cvr_treatment = success_treatment/uu_treatment
    
    # standard deviation
    std_dev_control = np.sqrt(cvr_control*(1-cvr_control))
    std_dev_treatment = np.sqrt(cvr_treatment*(1-cvr_treatment))

    print ("Standard deviation for Control %.2f & Treatment %.2f" % (std_dev_control, std_dev_treatment))
    
    # Standard Error
    std_error_control = np.sqrt( np.power(std_dev_control,2)/uu_control)
    std_error_treatment = np.sqrt( np.power(std_dev_treatment,2)/uu_treatment)
    
    print ("Standard error for Control %.2f & Treatment %.2f" % (std_error_control, std_error_treatment))
    
    #signal
    signal_control = cvr_control/std_error_control
    signal_treatment = cvr_treatment/std_error_treatment
    
    # cutoff
    cf_control = 1.96*(std_dev_control/np.sqrt(uu_control))
    cf_treatment = 1.96*(std_dev_treatment/np.sqrt(uu_treatment))
    
    # Calculate confidence interval for Control's conversion rate
    cvr_conf_interval_control_low = cvr_control - cf_control
    cvr_conf_interval_control_high = cvr_control + cf_control
    
    print ("Confidence Interval for Control low: {:.2%} high: {:.2%}".format(cvr_conf_interval_control_low, cvr_conf_interval_control_high))
    
    cvr_conf_interval_treatment_low = cvr_treatment - cf_treatment
    cvr_conf_interval_treatment_high = cvr_treatment + cf_treatment
    print ("Confidence Interval for treatment low: {:.2%} high {:.2%}".format(cvr_conf_interval_treatment_low, cvr_conf_interval_treatment_high))
        
    lift=0
    if cvr_control!=0:
        lift = (cvr_treatment-cvr_control)/cvr_control
    print ("lift: {:.2%}".format(lift))
    
    # confidence interval for lift
    lift_conf_interval_low = (cvr_conf_interval_treatment_low - cvr_conf_interval_control_high)/cvr_conf_interval_control_high
    lift_conf_interval_high = (cvr_conf_interval_treatment_high- cvr_conf_interval_control_low)/cvr_conf_interval_control_low
    print ("Confidence Interval for lift low {:.2%} high {:.2%}".format(lift_conf_interval_low, lift_conf_interval_high))
        
    # calculate standard error
    sed = np.sqrt(std_error_control**2 + std_error_treatment**2)
    
    # calculate t-stat
    t_stat = (cvr_control - cvr_treatment) / sed
    print ("t-stat is %.2f" %(t_stat))
    
    alpha = 0.05
    df = uu_treatment+uu_control - 2

    # calculate critical value
    cv = stats.t.ppf(1.0 - alpha, df)
    print ("Critical Value %.2f " % (cv))
    
    # calculate p-value
    p = (1.0 - stats.t.cdf(abs(t_stat), df)) * 2.0
    print ("THE P-VALUE: %.5f" % (p))
    
    # interpret via critical value
    if abs(t_stat) <= cv:
        print('Accept null hypothesis that the means are equal.')
    else:
        print('Reject the null hypothesis that the means are equal.')
        
    # interpret via p-value
    if p > alpha:
        print('Accept null hypothesis that the means are equal.')
    else:
        print('Reject the null hypothesis that the means are equal.')
        
    return ('%.3f'%(lift),'%.3f'%(p))

### Steps to extract Experiment IDs

In [136]:
# Steps to extract experiment ID

lst = data.loc[:,"Control_ExpID"].unique().tolist()

lst_control = []
for i in range(len(lst)):
    if 'c' in lst[i][-1].lower():
        lst_control.append(lst[i])
print (sorted(lst_control))   

lst_treatment = []
for i in range(len(lst)):
    if ('t' in lst[i][-1].lower()) or ('t' in lst[i][-2].lower()):
        lst_treatment.append(lst[i])
print (sorted(lst_treatment))  

lst_control = sorted (lst_control)
lst_treatment = sorted (lst_treatment)

for c in range(len(lst_control)):
    for t in range(len(lst_treatment)):
        if lst_control[c][3:11] in lst_treatment[t]:
            control = lst_control[c]
            treatment = lst_treatment[t]
            
            control_df = data[(data['Control_ExpID']==control) & (data['Market (v27)']=='en-us')]
            treatment_df = data[(data['Control_ExpID']==treatment) & (data['Market (v27)']=='en-us')]
                

['EX:19637668c', 'EX:19844185c', 'EX:19975527c', 'EX:19994289c', 'EX:20199642C', 'EX:20285521c']
['EX:19338270T1', 'EX:19338270T2', 'EX:19338270T3', 'EX:19975527t1', 'EX:19994289t1', 'EX:20199642t1', 'EX:20285521t1', 'EX:surcomm-001t1', 'ex:19844185t1']


In [138]:
def start_ttest(control, treatment):
    uu_control = control['Unique_Visitors'].sum()
    ATC_control = control['Cart_Adds'].sum()
    Checkouts_control = control['Checkouts'].sum()
    Orders_control = control['Order'].sum()

    uu_treatment = treatment['Unique_Visitors'].sum()
    ATC_treatment = treatment['Cart_Adds'].sum()
    Checkouts_treatment = treatment['Checkouts'].sum()
    Orders_treatment = treatment['Order'].sum()
    
    lift, p=run_ttest(uu_control, ATC_control, uu_treatment, ATC_treatment, 0.95)
    lift, p=run_ttest(uu_control, Checkouts_control, uu_treatment, Checkouts_treatment, 0.95)
    lift, p=run_ttest(uu_control, Orders_control, uu_treatment, Orders_treatment, 0.95)

In [140]:
#stats_exp_data = pd.DataFrame()
data1 = ['EX:20199642c']
cols = ['ExpID_Control']
stats_data = pd.DataFrame(data = data1, columns=cols)

stats_data['ExpID_Treatment'] = 'EX:20199642t1'
stats_data['KPI'] = 'Orders'

stats_data['UU_control'] = uu_control
stats_data['UU_treatment'] = uu_treatment
stats_data['ATC_control'] = ATC_control
stats_data['ATC_treatment'] = ATC_treatment
stats_data['CVR Lift'] = lift

stats_data['p_val'] = p
stats_data

NameError: name 'uu_control' is not defined

https://machinelearningmastery.com/how-to-code-the-students-t-test-from-scratch-in-python/

## Steps for testing script in PBI 

In [10]:
dataset = data

In [63]:
dataset.rename(index=str,columns={"Order":"Orders"}, inplace=True)
dataset.head()

Unnamed: 0,Date Granularity.Level 1: Year,Date Granularity.Level 2: Month,Date Granularity.Level 3: Day,Control_ExpID,Cart_Adds,Cart_Removal,Checkouts,Orders,Unique_Visitors,Market (v27),Treatment_ExpID
0,2019,1,27,EX:sfwaaa,18070,5513,16882,18339,591901,en-us,EX:sfwaaa
1,2019,1,27,EX:sfwaab,17872,5627,17105,18618,591782,en-us,EX:sfwaab
2,2019,1,27,EX:20199642t1,17488,5079,16327,17879,576690,en-us,EX:20199642t1
3,2019,1,27,EX:19844185c,17421,5263,16386,17993,577820,en-us,EX:19844185c
4,2019,1,27,EX:19975527t1,17355,5163,16185,17822,578066,en-us,EX:19975527t1


In [55]:
d =dataset['Market (v27)'].apply(lambda x: x.lower())
d.head()

0    en-us
1    en-us
2    en-us
3    en-us
4    en-us
Name: Market (v27), dtype: object

In [129]:
# 'dataset' holds the input data for this script

import pandas as pd
import numpy as np

from scipy import stats

                
#######################################################################################
# Functions
#######################################################################################

def run_ttest1(uu_control, success_control, uu_treatment, success_treatment, alpha):
    
    # Calculate conversion rate
    cvr_control =0
    cvr_treatment =0
    if uu_control != 0:
        cvr_control = success_control/uu_control
    if uu_treatment != 0:
        cvr_treatment = success_treatment/uu_treatment
    
    # standard deviation
    std_dev_control = np.sqrt(cvr_control*(1-cvr_control))
    std_dev_treatment = np.sqrt(cvr_treatment*(1-cvr_treatment))

    ### print ("Standard deviation for Control %.2f & Treatment %.2f" % (std_dev_control, std_dev_treatment))
    
    # Standard Error
    std_error_control = np.sqrt( np.power(std_dev_control,2)/uu_control)
    std_error_treatment = np.sqrt( np.power(std_dev_treatment,2)/uu_treatment)
    
    ### print ("Standard error for Control %.2f & Treatment %.2f" % (std_error_control, std_error_treatment))
    
    #signal
    signal_control = cvr_control/std_error_control
    signal_treatment = cvr_treatment/std_error_treatment
    
    # cutoff
    cf_control = 1.96*(std_dev_control/np.sqrt(uu_control))
    cf_treatment = 1.96*(std_dev_treatment/np.sqrt(uu_treatment))
    
    # Calculate confidence interval for Control's conversion rate
    cvr_conf_interval_control_low = cvr_control - cf_control
    cvr_conf_interval_control_high = cvr_control + cf_control
    
    #### print ("Confidence Interval for Control low: {:.2%} high: {:.2%}".format(cvr_conf_interval_control_low, cvr_conf_interval_control_high))
    
    cvr_conf_interval_treatment_low = cvr_treatment - cf_treatment
    cvr_conf_interval_treatment_high = cvr_treatment + cf_treatment
    #### print ("Confidence Interval for treatment low: {:.2%} high {:.2%}".format(cvr_conf_interval_treatment_low, cvr_conf_interval_treatment_high))
        
    lift=0
    if cvr_control!=0:
        lift = (cvr_treatment-cvr_control)/cvr_control
    ### print ("lift: {:.2%}".format(lift))
    
    # confidence interval for lift
    lift_conf_interval_low = (cvr_conf_interval_treatment_low - cvr_conf_interval_control_high)/cvr_conf_interval_control_high
    lift_conf_interval_high = (cvr_conf_interval_treatment_high- cvr_conf_interval_control_low)/cvr_conf_interval_control_low
    ### print ("Confidence Interval for lift low {:.2%} high {:.2%}".format(lift_conf_interval_low, lift_conf_interval_high))
        
    # calculate standard error
    sed = np.sqrt(std_error_control**2 + std_error_treatment**2)
    
    # calculate t-stat
    t_stat = (cvr_control - cvr_treatment) / sed
    ### print ("t-stat is %.2f" %(t_stat))
    
    alpha = 0.05
    df = uu_treatment+uu_control - 2

    # calculate critical value
    cv = stats.t.ppf(1.0 - alpha, df)
    ### print ("Critical Value %.2f " % (cv))
    
    # calculate p-value
    p = (1.0 - stats.t.cdf(abs(t_stat), df)) * 2.0
    ### print ("THE P-VALUE: %.5f" % (p))
    
    # interpret via critical value
    ### if abs(t_stat) <= cv:
        ### print('Accept null hypothesis that the means are equal.')
    ### else:
        ### print('Reject the null hypothesis that the means are equal.')
        
    # interpret via p-value
    ### if p > alpha:
        ### print('Accept null hypothesis that the means are equal.')
    ### else:
        ### print('Reject the null hypothesis that the means are equal.')
        
    return (lift, p)


# function for calling per KPI

def get_ttest1(control, treatment):
    # get all KPIs for control
    #print (control['Control_ExpID'].tolist())
    uu_control = control['Unique_Visitors'].sum()
    ATC_control = control['Cart_Adds'].sum()
    Checkouts_control = control['Checkouts'].sum()
    Orders_control = control['Orders'].sum()

    # get all KPIs from Treatment
    uu_treatment = treatment['Unique_Visitors'].sum()
    ATC_treatment = treatment['Cart_Adds'].sum()
    Checkouts_treatment = treatment['Checkouts'].sum()
    Orders_treatment = treatment['Orders'].sum()

    # run statistical test for ATC
    lift, p=run_ttest1(uu_control, ATC_control, uu_treatment, ATC_treatment, 0.95)

    # write to dataframe
    data1 = control['Control_ExpID'].tolist() #['EX:20199642c']
    cols = ['ExpID_Control']
    stats_data = pd.DataFrame(data = data1, columns=cols)

    stats_data['ExpID_Treatment'] = treatment['Treatment_ExpID'].tolist() #'EX:20199642t1'
    stats_data['KPI'] = 'Add-to-Carts'

    stats_data['UU_control'] = uu_control
    stats_data['UU_treatment'] = uu_treatment
    stats_data['KPI_control'] = ATC_control
    stats_data['KPI_treatment'] = ATC_treatment
    stats_data['CVR Lift'] = lift

    stats_data['p_val'] = p
    #print  (stats_data)
    
    # checkout
    lift, p=run_ttest1(uu_control, Checkouts_control, uu_treatment, Checkouts_treatment, 0.95)

    # write to dataframe
    data1 = control['Control_ExpID'].tolist()  #['EX:20199642c']
    cols = ['ExpID_Control']
    stats_data_ck = pd.DataFrame(data = data1, columns=cols)
    
    stats_data_ck['ExpID_Treatment'] = treatment['Treatment_ExpID'].tolist() #'EX:20199642t1'
    stats_data_ck['KPI'] = 'Checkouts'

    stats_data_ck['UU_control'] = uu_control
    stats_data_ck['UU_treatment'] = uu_treatment
    stats_data_ck['KPI_control'] = Checkouts_control
    stats_data_ck['KPI_treatment'] = Checkouts_treatment
    stats_data_ck['CVR Lift'] = lift

    stats_data_ck['p_val'] = p

    #Order
    lift, p=run_ttest1(uu_control, Orders_control, uu_treatment, Orders_treatment, 0.95)

    # write to dataframe
    data1 = control['Control_ExpID'].tolist()  #['EX:20199642c']
    cols = ['ExpID_Control']
    stats_data_or = pd.DataFrame(data = data1, columns=cols)

    stats_data_or['ExpID_Treatment'] = treatment['Treatment_ExpID'].tolist() #'EX:20199642t1'    
    stats_data_or['KPI'] = 'Orders'

    stats_data_or['UU_control'] = uu_control
    stats_data_or['UU_treatment'] = uu_treatment
    stats_data_or['KPI_control'] = Orders_control
    stats_data_or['KPI_treatment'] = Orders_treatment
    stats_data_or['CVR Lift'] = lift

    stats_data_or['p_val'] = p

    #print (stats_data)
    stats_data=stats_data.append(stats_data_ck, ignore_index=True)
    #print ( stats_data)
    
    stats_data =stats_data.append(stats_data_or, ignore_index=True)
    #print ( stats_data)
    
    return stats_data

############################################################################################
#control = dataset[dataset['Control_ExpID']=='EX:20199642c']
#treatment = dataset[dataset['Treatment_ExpID']=='EX:20199642t1']
############################################################################################

lst = dataset.loc[:,"Control_ExpID"].unique().tolist()

lst_control = []
for i in range(len(lst)):
    if 'c' in lst[i][-1].lower():
        lst_control.append(lst[i])
### print (sorted(lst_control))   

lst_treatment = []
for i in range(len(lst)):
    if ('t' in lst[i][-1].lower()) or ('t' in lst[i][-2].lower()):
        lst_treatment.append(lst[i])
### print (sorted(lst_treatment))  

lst_control = sorted (lst_control)
lst_treatment = sorted (lst_treatment)

control_exp=pd.DataFrame(lst_control)
treatment_exp=pd.DataFrame(lst_treatment)

dataset['Market (v27)'] = dataset['Market (v27)'].apply(lambda x: x.lower())
stats_data1 = pd.DataFrame()

for c in range(len(lst_control)):
    for t in range(len(lst_treatment)):
        if lst_control[c][3:8] in lst_treatment[t]:
            control = lst_control[c]
            treatment = lst_treatment[t]
             
            #control_df= dataset[(dataset['Control_ExpID']==control)]
            control_df = dataset[(dataset['Control_ExpID']==control) & (dataset['Market (v27)']=='en-us')]
            treatment_df = dataset[(dataset['Treatment_ExpID']==treatment) & (dataset['Market (v27)']=='en-us')]
            
            if control_df.shape[0]>0 and treatment_df.shape[0]>0:
                control_df.drop(control_df.index[1:], inplace=True)
                treatment_df.drop(treatment_df.index[1:], inplace=True)
                #print (control_df.shape, treatment_df.shape)
                if stats_data1.shape[0]==0:
                    stats_data1 = get_ttest1(control_df, treatment_df)
                else:    
                    stats_data1 = stats_data1.append(get_ttest1(control_df, treatment_df), ignore_index=True)
                break
    #break
                #print (stats_data.head())
print (stats_data1)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  errors=errors)


      ExpID_Control   ExpID_Treatment           KPI  UU_control  UU_treatment  \
0      EX:18255570c     EX:18255570T1  Add-to-Carts      465315        471671   
1      EX:18255570c     EX:18255570T1     Checkouts      465315        471671   
2      EX:18255570c     EX:18255570T1        Orders      465315        471671   
3      EX:19844185c     EX:19844185t1  Add-to-Carts      577820        707813   
4      EX:19844185c     EX:19844185t1     Checkouts      577820        707813   
5      EX:19844185c     EX:19844185t1        Orders      577820        707813   
6      EX:19975527c     EX:19975527t1  Add-to-Carts      576648        578066   
7      EX:19975527c     EX:19975527t1     Checkouts      576648        578066   
8      EX:19975527c     EX:19975527t1        Orders      576648        578066   
9      EX:19976714c     EX:19976714T3  Add-to-Carts       72166         85427   
10     EX:19976714c     EX:19976714T3     Checkouts       72166         85427   
11     EX:19976714c     EX:1

In [None]:
stats_data_or

In [None]:
dataset.to_csv