In [1]:
import pandas as pd
import numpy as np
from scipy import stats
import os
from numpy import std, mean, sqrt

In [2]:
def load_data(ps, data_dir='data'):
    """Load the test-split experiment data for one problem set.

    Reads ``<data_dir>/<ps>_test_exp.csv`` (no header row), names column 2
    ``condition`` and column 3 ``completion``, and prints the mean of
    ``completion`` for rows with ``condition == 1`` (reported as treatment)
    and ``condition == 0`` (reported as control).

    Parameters
    ----------
    ps : int or str
        Problem-set identifier; used as the file-name prefix.
    data_dir : str, optional
        Directory that holds the CSV. Defaults to ``'data'``.

    Returns
    -------
    pandas.DataFrame
        The loaded frame with the two renamed columns; the remaining columns
        keep their integer positions as names.
    """
    csv_path = os.path.join(data_dir, str(ps) + '_test_exp.csv')
    # rename() returns a new frame, so no in-place mutation is needed.
    test_df = pd.read_csv(csv_path, header=None).rename(
        columns={2: 'condition', 3: 'completion'})

    treatment_rate = test_df.loc[test_df['condition'] == 1, 'completion'].mean()
    control_rate = test_df.loc[test_df['condition'] == 0, 'completion'].mean()
    # Single-argument print(...) behaves the same on Python 2 and Python 3.
    print('The avg completion rate in treatment {}'.format(treatment_rate))
    print('The avg completion rate in control {}'.format(control_rate))
    return test_df

### Post-model analysis

In [3]:
def post_analysis(file_name, test_df):
    """Join model predictions onto the test data and derive per-row effects.

    The prediction CSV is read without a header; its column 0 is named ``f``
    and column 1 ``cf``. For rows with ``condition`` 0 or 1, ``f`` is used as
    the predicted outcome under the condition the row actually received and
    ``cf`` as the predicted outcome under the other condition.

    Added columns:

    * ``treatment_effect`` -- ``f - cf`` where ``condition == 1``,
      otherwise ``cf - f``.
    * ``potential_treatment_outcome`` -- ``f`` where ``condition == 1``,
      otherwise ``cf``.
    * ``potential_control_outcome`` -- ``f`` where ``condition == 0``,
      otherwise ``cf``.
    * ``recommended_condition`` -- 1 where ``treatment_effect > 0``, else 0.

    Parameters
    ----------
    file_name : str or file-like
        Anything ``pandas.read_csv`` accepts; one prediction row per test row.
    test_df : pandas.DataFrame
        Test data containing at least a ``condition`` column.

    Returns
    -------
    pandas.DataFrame
        ``test_df`` columns followed by the prediction and derived columns.

    Raises
    ------
    ValueError
        If the prediction file and ``test_df`` have different row counts.
    """
    res_df = pd.read_csv(file_name, header=None).rename(columns={0: 'f', 1: 'cf'})

    # concat(axis=1) aligns on index labels; a row-count mismatch would be
    # padded with NaN silently and skew every downstream statistic.
    if len(res_df) != len(test_df):
        raise ValueError(
            'prediction file has {} rows but test data has {} rows'.format(
                len(res_df), len(test_df)))
    # Rows are paired by position, so reuse the test frame's index labels in
    # case the caller passed a frame without the default 0..n-1 index.
    res_df.index = test_df.index

    concated_test_df = pd.concat([test_df, res_df], axis=1)

    is_treated = concated_test_df['condition'] == 1
    is_control = concated_test_df['condition'] == 0
    factual = concated_test_df['f']
    counterfactual = concated_test_df['cf']

    concated_test_df['treatment_effect'] = np.where(
        is_treated, factual - counterfactual, counterfactual - factual)
    concated_test_df['potential_treatment_outcome'] = np.where(
        is_treated, factual, counterfactual)
    concated_test_df['potential_control_outcome'] = np.where(
        is_control, factual, counterfactual)
    # recommended condition
    concated_test_df['recommended_condition'] = np.where(
        concated_test_df['treatment_effect'] > 0, 1, 0)
    return concated_test_df

In [4]:
def cohen_d(x,y):
    """Return Cohen's d for two samples using the pooled standard deviation.

    Computes ``(mean(x) - mean(y)) / s_pooled`` where ``s_pooled`` combines
    the two sample variances (``ddof=1``) weighted by ``n - 1`` and divided by
    ``nx + ny - 2``. Also prints the size, mean and sample standard deviation
    of each group.

    Parameters
    ----------
    x, y : sequence of numbers
        The two samples to compare.

    Returns
    -------
    float
        The effect size; positive when ``x`` has the larger mean.
    """
    nx, ny = len(x), len(y)
    dof = nx + ny - 2
    # Compute each statistic once and reuse it for both the report and result.
    mean_x, mean_y = mean(x), mean(y)
    std_x, std_y = std(x, ddof=1), std(y, ddof=1)
    # Single-argument print(...) behaves the same on Python 2 and Python 3.
    print('x group: # {} \t mean {} \t std {}'.format(nx, mean_x, std_x))
    print('y group: # {} \t mean {} \t std {}'.format(ny, mean_y, std_y))
    pooled_std = sqrt(((nx - 1) * std_x ** 2 + (ny - 1) * std_y ** 2) / dof)
    return (mean_x - mean_y) / pooled_std

In [11]:
def calculate_completion(file_name, test_df):
    """Compare completion between rows whose assigned condition matches the
    model's recommendation and other groups, printing the statistics.

    Runs ``post_analysis`` and then prints, for each comparison, the result of
    ``scipy.stats.ttest_ind``, both group sizes and Cohen's d:

    1. matched vs. unmatched rows,
    2. matched rows vs. all rows with ``condition == 1``,
    3. matched rows with ``condition == 1`` vs. all rows with ``condition == 1``.

    A row is "matched" when ``condition`` and ``recommended_condition`` are
    both 1 or both 0, and "unmatched" when one is 1 and the other is 0.

    Parameters
    ----------
    file_name : str or file-like
        Prediction CSV, forwarded to ``post_analysis``.
    test_df : pandas.DataFrame
        Test data with ``condition`` and ``completion`` columns.

    Returns
    -------
    pandas.DataFrame
        The ``condition``, ``recommended_condition``, ``completion``,
        ``potential_treatment_outcome``, ``potential_control_outcome`` and
        ``treatment_effect`` columns of the joined frame.
    """
    concated_df = post_analysis(file_name, test_df)

    is_treated = concated_df['condition'] == 1
    is_control = concated_df['condition'] == 0
    recommends_treatment = concated_df['recommended_condition'] == 1
    recommends_control = concated_df['recommended_condition'] == 0

    matched_df = concated_df[(is_treated & recommends_treatment) | (is_control & recommends_control)]
    unmatched_df = concated_df[(is_treated & recommends_control) | (is_control & recommends_treatment)]
    # Used as the reference group in two of the comparisons below.
    actual_treatment_completion = concated_df[is_treated]['completion']

    def print_out(x, y, x_name, y_name):
        # Single-argument print(...) behaves the same on Python 2 and Python 3.
        print(stats.ttest_ind(x, y))
        print(x_name)
        print(len(x))
        print(y_name)
        print(len(y))
        print('Effect size: ')
        print(cohen_d(x.tolist(), y.tolist()))
        print('*'*10)

    print('Comparison between matched and unmatched')
    print_out(matched_df['completion'], unmatched_df['completion'],
              'Matched group: ', 'Unmatched group: ')

    print('Comparison between matched and actual treatment')
    print_out(matched_df['completion'], actual_treatment_completion,
              'Matched group', 'Actual treatment group')

    print('Comparison between matched treatment and actual treatment')
    print_out(matched_df[matched_df['condition'] == 1]['completion'],
              actual_treatment_completion,
              'Matched treatment group', 'Actual treatment group')

    return concated_df[['condition', 'recommended_condition', 'completion',
                        'potential_treatment_outcome', 'potential_control_outcome',
                        'treatment_effect']]

In [12]:
test_ps = 263115
test_folder_path = 'results/archived/263115/'
name = 'results_20180220_143942-509542'
def generate_final_table(test_ps, test_folder_path, folder_name):
    """Run the post-model analysis for one results folder and save the table.

    Builds the path ``<test_folder_path>/<folder_name>/y_pred.test_1.csv``,
    prints it, loads the test data for ``test_ps``, runs
    ``calculate_completion`` and writes the returned frame to
    ``<test_ps>-final-table.csv`` in the working directory (no index column).

    Parameters
    ----------
    test_ps : int or str
        Problem-set identifier passed to ``load_data`` and used in the output
        file name.
    test_folder_path : str
        Directory containing the results folders.
    folder_name : str
        Name of the results folder holding ``y_pred.test_1.csv``.
    """
    # os.path.join works whether or not test_folder_path ends with a separator.
    file_name = os.path.join(test_folder_path, folder_name, 'y_pred.test_1.csv')
    # Single-argument print(...) behaves the same on Python 2 and Python 3.
    print(file_name)
    test_df = load_data(test_ps)
    post_df = calculate_completion(file_name, test_df)
    post_df.to_csv(str(test_ps)+'-final-table.csv', index=False)
generate_final_table(test_ps, test_folder_path, name)

results/archived/263115/results_20180220_143942-509542/y_pred.test_1.csv
The avg completion rate in treatment 0.886075949367
The avg completion rate in control 0.885714285714
Comparison between matched and unmatched
Ttest_indResult(statistic=1.8417487106980484, pvalue=0.0675271817243208)
Matched group: 
75
Unmatched group: 
74
Effect size: 
x group: # 75 	 mean 0.933333333333 	 std 0.251123601167
y group: # 74 	 mean 0.837837837838 	 std 0.371115572112
0.30177066308267425
**********
Comparison between matched and actual treatment
Ttest_indResult(statistic=1.01643253057439, pvalue=0.31103862126915677)
Matched group
75
Actual treatment group
79
Effect size: 
x group: # 75 	 mean 0.933333333333 	 std 0.251123601167
y group: # 79 	 mean 0.886075949367 	 std 0.319749171413
0.1638682210798815
**********
Comparison between matched treatment and actual treatment
Ttest_indResult(statistic=1.4148101794404826, pvalue=0.15994874017655095)
Matched treatment group
33
Actual treatment group
79
Effect

In [13]:
# Second problem set: same pipeline, different id and results folder.
name = 'results_20180220_105747-063679'
test_folder_path = 'results/archived/263052/'
test_ps = 263052
generate_final_table(
    test_ps=test_ps,
    test_folder_path=test_folder_path,
    folder_name=name,
)

results/archived/263052/results_20180220_105747-063679/y_pred.test_1.csv
The avg completion rate in treatment 0.657142857143
The avg completion rate in control 0.633928571429
Comparison between matched and unmatched
Ttest_indResult(statistic=2.157769164128804, pvalue=0.03205432159553306)
Matched group: 
103
Unmatched group: 
114
Effect size: 
x group: # 103 	 mean 0.718446601942 	 std 0.451956004456
y group: # 114 	 mean 0.578947368421 	 std 0.495907799775
0.2933349502638108
**********
Comparison between matched and actual treatment
Ttest_indResult(statistic=0.9511757831233874, pvalue=0.3426297980370282)
Matched group
103
Actual treatment group
105
Effect size: 
x group: # 103 	 mean 0.718446601942 	 std 0.451956004456
y group: # 105 	 mean 0.657142857143 	 std 0.476940800805
0.13191044645790248
**********
Comparison between matched treatment and actual treatment
Ttest_indResult(statistic=3.899506422829131, pvalue=0.00014220551792667189)
Matched treatment group
55
Actual treatment grou

In [None]:
# compute all results under a folder
ps = 263115
folder_path = 'results/archived/263115/'
test_df = load_data(ps)
for root, dirs, files in os.walk(folder_path):
    # Sorted so the folders are reported in a reproducible order.
    for name in sorted(dirs):
        if 'results' in name:
            print('folder name: {}'.format(name))
            # os.walk also yields nested directories; build the path from
            # `root` so a match below the top level points at a real location.
            file_name = os.path.join(root, name, 'y_pred.test_1.csv')
            try:
                calculate_completion(file_name, test_df)
            except Exception as err:
                # Keep going on a bad folder, but report which one failed and
                # why; KeyboardInterrupt is deliberately not caught.
                print('Skipping {}: {}: {}'.format(file_name, type(err).__name__, err))
            print('*'*10)