In [67]:
from tests.get_test_data import get_data_input_to_stats
import statsmodels.api as sm
import statsmodels.stats.proportion as prop
import datetime
import pandas as pd
import scipy.stats as stats

In [2]:
# df and metrics are the 2 parameters that will be input into the stats step
df, metrics = get_data_input_to_stats()

In [3]:
# df is the raw event-level table output from the SQL query
df.columns

Index(['TEST_CELL', 'OID', 'DT', 'MATCH_TYPE_ID', 'MATCH_TYPE', 'SESSION_ID',
       'ACCEPTS', 'CLOSED_LEADS', 'WON_LEADS', 'CALL_TRACKING_LEADS',
       'CALLED_LEADS', 'CONNECTIONS', 'LEADS_W_CREDIT_REQ', 'LEADS_W_APPR_CR',
       'NET_REV', 'GROSS_REV', 'DT2'],
      dtype='object')

In [4]:
# metrics is a dictionary, with each key being a metric that we want to track
# Each metric is mapped to a dictionary with the following keys:
#   - numerator_column: the column name in the dataframe for the numerator of the metric
#   - denominator_column: the column name of the denom, or COUNT, which refers to the number of records (basically, means it's an average)
#   - type: either ratio or average. ratios are always binary metrics; averages could be either binary or continuous
#   - function: the actual function you can apply to the dataframe to return the metric column
#     example: df['accept_rate'] = df.apply(metrics['accept_rate']['function'], axis=1)
metrics

{'accept_rate': {'denominator_column': 'COUNT',
  'function': <function ab_testing_import.test_setup.ABTest._get_metric_function.<locals>.<lambda>>,
  'numerator_column': 'ACCEPTS',
  'type': 'average'},
 'call_rate': {'denominator_column': 'CALL_TRACKING_LEADS',
  'function': <function ab_testing_import.test_setup.ABTest._get_metric_function.<locals>.<lambda>>,
  'numerator_column': 'CALLED_LEADS',
  'type': 'ratio'},
 'connection_rate': {'denominator_column': 'CALL_TRACKING_LEADS',
  'function': <function ab_testing_import.test_setup.ABTest._get_metric_function.<locals>.<lambda>>,
  'numerator_column': 'CONNECTIONS',
  'type': 'ratio'},
 'contact_on_call_rate': {'denominator_column': 'CALLED_LEADS',
  'function': <function ab_testing_import.test_setup.ABTest._get_metric_function.<locals>.<lambda>>,
  'numerator_column': 'CONNECTIONS',
  'type': 'ratio'},
 'credit_approval_rate': {'denominator_column': 'LEADS_W_CREDIT_REQ',
  'function': <function ab_testing_import.test_setup.ABTest._

In [7]:
df['DT'] = df['DT'].dt.floor('d')
####

start_date = df['DT'].min()
end_date = df['DT'].max()
####

num_of_days = (end_date - start_date).days
####

end_dates = []
for i in range(0,num_of_days + 1):
    end_dates.append(start_date + (datetime.timedelta(days = i)))
####

df_list = []
for date in end_dates:
    df_list.append((df[df['DT'] <= date]))
####

df1 = df_list[0]

#####
for k, v in metrics.items():
       df1[k] = df1.apply(v['function'], axis=1)

[    TEST_CELL        OID         DT  MATCH_TYPE_ID  \
0        Test  118665639 2018-06-29             23   
1        Test  118665582 2018-06-29             39   
2        Test  118666305 2018-06-29             38   
3        Ctrl  118665578 2018-06-29             39   
4        Ctrl  118666357 2018-06-29             25   
5        Ctrl  118666728 2018-06-29             38   
6        Test  118666349 2018-06-29             25   
7        Test  118665614 2018-06-29             10   
8        Ctrl  118666925 2018-06-29              9   
9        Test  118666486 2018-06-29             25   
10       Test  118666680 2018-06-29             39   
11       Test  118667686 2018-06-29             23   
12       Test  118667716 2018-06-29             10   
13       Test  118668051 2018-06-29              9   
14       Test  118668764 2018-06-29             27   
15       Ctrl  118669000 2018-06-29             23   
16       Ctrl  118668992 2018-06-29             23   
17       Test  118668905 20

KeyError: ('COUNT', 'occurred at index 0')

In [None]:
binary_metrics = []
cont_metrics = []

for k, v in metrics.items():
    if (v['type']) == 'ratio':
        binary_metrics.append(k)
    else:
         if len(df1[k].unique()) == 2:
            binary_metrics.append(k)
         else:
            cont_metrics.append(k)
        
print(binary_metrics)
print(cont_metrics)

In [None]:
def prop_test(df, metric, metric_definition):
    test_cells = df['TEST_CELL'].unique()
    num = metric_definition['numerator_column']
    denom = metric_definition['denominator_column']
    df = df.groupby('TEST_CELL').sum()
    df[metric] = df.apply(metric_definition['function'], axis=1)
    z_score, p_value = sm.stats.proportions_ztest([df.loc[test_cells[0], num], df.loc[test_cells[1], num]],
                                                  [df.loc[test_cells[0], denom], df.loc[test_cells[1], denom]])
    rows = []
    
    for test_cell in test_cells:
        data = {'TEST_CELL': test_cell, 'METRIC_NAME' : metric, 'METRIC_VALUE' : df.loc[test_cell, metric],
               'P_VALUE': p_value, 'UPPER_CI': None, 'LOWER_CI' : None}
        rows.append(data)
    return pd.DataFrame(rows)
    
dfs = []

for metric in binary_metrics:
    dfs.append(prop_test(df1, metric, metrics[metric]))

pd.concat(dfs)
    


In [None]:
def do_stats(df, metrics):
    binary_metrics = []
    cont_metrics = []

    for k, v in metrics.items():
        if (v['type']) == 'ratio':
            binary_metrics.append(k)
        else:
            if len(df1[k].unique()) == 2:
                binary_metrics.append(k)
            else:
                cont_metrics.append(k)
    dfs = []

    for metric in binary_metrics:
        dfs.append(prop_test(df, metric, metrics[metric]))
        
    dfs = pd.concat(dfs).reset_index(drop = True)
    dfs['DT'] = df['DT'].max()
    return dfs
    
dfs = []

for df in df_list:
    dfs.append(do_stats(df,metrics))

dfs = pd.concat(dfs).reset_index(drop = True) 
print(dfs)  

In [None]:
df1['COUNT'] = 1

In [64]:
prop.proportion_confint(680, 1000)

(0.6510880398557441, 0.708911960144256)

In [100]:
import numpy as np

p1 = .68
n1 = 1000
p2 = .67
n2 = 100

ub = (p1 - p2) + 1.96 * np.sqrt( ((p1 * (1- p1)) / n1) + ((p2 * (1- p2)) / n2) )
avg = (p1 - p2)
lb = (p1 - p2) - 1.96 * np.sqrt( ((p1 * (1- p1)) / n1) + ((p2 * (1- p2)) / n2) )

print(lb, avg, ub)

-0.08659042271364174 0.010000000000000009 0.10659042271364176


In [103]:
stats.norm.ppf(.95)
stats.norm.cdf(1.666)

0.9521432927825391

In [99]:
z_score, p_value = sm.stats.proportions_ztest([680, 67], [1000, 100])
p_value
z_score

0.2042437092171444

In [102]:
sm.stats.proportions_ztest(10,100, value = .05)
# sm.stats.proportion_confint(10, 100, alpha = .05)
# sm.stats.proportion_effectsize(.6,.5)

(1.6666666666666665, 0.0955807045456294)