In [65]:
import pandas as pd
import numpy as np
from scipy import stats
import os
from numpy import std, mean, sqrt
import cPickle as pickle

In [2]:
def load_data(ps, ftype='test'):
    # load either test data or train data
    test_df = pd.read_csv('data/'+str(ps)+'_'+ftype+'_exp.csv', header=None)
    test_df.rename(columns={2: 'condition', 3: 'completion'}, inplace=True)
    print 'The avg completion rate in treatment {}'.format(test_df[test_df['condition'] == 1]['completion'].mean())
    print 'The avg completion rate in control {}'.format(test_df[test_df['condition'] == 0]['completion'].mean())
    return test_df

### Post-model analysis

In [3]:
def post_analysis(res_df, test_df):
    #test_df = load_data(ps)
    #res_df = pd.read_csv(file_name, header=None)
    res_df = res_df.rename(columns={0: 'f', 1: 'cf'})
    concated_test_df = pd.concat([test_df, res_df], axis=1)
    concated_test_df['treatment_effect'] = np.where(concated_test_df['condition']==1, concated_test_df['f']-concated_test_df['cf'], \
                                           concated_test_df['cf']-concated_test_df['f'])
    concated_test_df['potential_treatment_outcome'] = np.where(concated_test_df['condition']==1, concated_test_df['f'], \
                                           concated_test_df['cf'])
    concated_test_df['potential_control_outcome'] = np.where(concated_test_df['condition']==0, concated_test_df['f'], \
                                           concated_test_df['cf'])
    # recommended condition
    concated_test_df['recommended_condition'] = np.where(concated_test_df['treatment_effect']>0, 1, 0)
    return concated_test_df

In [4]:
def cohen_d(x,y):
    nx = len(x)
    ny = len(y)
    dof = nx + ny - 2
    print 'x group: # {} \t mean {} \t std {}'.format(nx, mean(x), std(x, ddof=1))
    print 'y group: # {} \t mean {} \t std {}'.format(ny, mean(y), std(y, ddof=1))
    return (mean(x) - mean(y)) / sqrt(((nx-1)*std(x, ddof=1) ** 2 + (ny-1)*std(y, ddof=1) ** 2) / dof)

In [45]:
def calculate_completion(res_df, data_df, verbose=1, i_subset=None):
    concated_df = post_analysis(res_df, data_df)
    if i_subset is not None:
        concated_df = concated_df.iloc[concated_df.index.get_indexer(i_subset)]
    matched_df = concated_df[((concated_df['condition'] == 1) & (concated_df['recommended_condition'] == 1)) | ((concated_df['condition'] == 0) & (concated_df['recommended_condition'] == 0))]
    unmatched_df = concated_df[((concated_df['condition'] == 1) & (concated_df['recommended_condition'] == 0)) | ((concated_df['condition'] == 0) & (concated_df['recommended_condition'] == 1))]
    def print_out(x, y, x_name, y_name):
        print stats.ttest_ind(x,y)
        print x_name
        print len(x)
        print y_name
        print len(y)
        print 'Effect size: '
        print cohen_d(x.tolist(), y.tolist())
        print '*'*10
        nx = len(x)
        ny = len(y)
    if verbose:
        print 'Comparison between treatment and control'
        print_out(concated_df[concated_df['condition']==1]['completion'],\
                  concated_df[concated_df['condition']==0]['completion'],\
                  'Treatment group', 'Control group')
        
        print 'Comparison between matched and unmatched'
        print_out(matched_df['completion'], unmatched_df['completion'], 'Matched group: ', 'Unmatched group: ')

        print 'Comparison between matched and actual treatment'
        print_out(matched_df['completion'], concated_df[concated_df['condition']==1]['completion'],\
                  'Matched group', 'Actual treatment group')

        print 'Comparison between matched and actual control'
        print_out(matched_df['completion'],\
                  concated_df[concated_df['condition']==0]['completion'], 'Matched group', 'Actual control group')
    
    cr = matched_df['completion'].mean()
    return concated_df[['condition', 'recommended_condition', 'completion',\
                        'potential_treatment_outcome', 'potential_control_outcome',\
                        'treatment_effect']], cr

In [46]:
def generate_final_table(test_ps, test_folder_path, folder_name, i_out):
    file_name = test_folder_path + folder_name + '/result.test.npz'
    # load test data
    data_df = load_data(test_ps)
    # load predictions on test
    result = load_result_file(file_name)
    preds = pd.DataFrame(result['pred'][:,:,0,i_out])
    post_df, _ = calculate_completion(preds, data_df)
    post_df.to_csv(test_folder_path + '/'+str(test_ps)+'-final-table.csv', index=False)

In [7]:
def load_result_file(file):
    arr = np.load(file)

    D = dict([(k, arr[k]) for k in arr.keys()])

    return D

In [8]:
def is_number(s):
    try:
        float(s)
        return True
    except ValueError:
        return False

In [9]:
def find_best_config(folder_path):
    sorted_config_file = folder_path + '/configs_sorted.txt'
    config_dict = {}
    with open(sorted_config_file) as f:
        for line in f:
            for ite in line.split(','):
                ite = ite.strip()
                pair = ite.split('=')
                config_dict[pair[0]] = float(pair[1])
            break
    return config_dict

In [10]:
def load_evaluation(eval_file):
    eval_results, configs = pickle.load(open(eval_file, "rb"))
    i_sel = np.argmin(eval_results['valid']['policy_risk'], 2)
    

In [107]:
ps = 303899
folder_path = 'results/sea/'+str(ps)+'/'
config_dict = find_best_config(folder_path)
data_df = load_data(ps, 'train')
cr_max = 0
result_name = ''
max_i_out = 0
for root, dirs, files in os.walk(folder_path):
    for name in dirs:
        if 'results_2' in name:
            config_file = folder_path + name + '/config.txt'
            res_config = {}
            with open(config_file) as f:
                for line in f:
                    line = line.strip()
                    pair = line.split(':')
                    if is_number(pair[1]):
                        res_config[pair[0]] = float(pair[1])
            found = True
            # check if matched the best config
            for key in config_dict.keys():
                if res_config[key] != config_dict[key]:
                    found = False
                    break
            if found:
                result_name = name
                result_file = folder_path + name + '/result.npz'
                result = load_result_file(result_file)
                preds = result['pred']
                n_units, _, n_rep, n_outputs = preds.shape
                i_subset = result['val'][0].tolist()
                for i_out in range(n_outputs):
                    try:
                        _, cr = calculate_completion(pd.DataFrame(preds[:,:,0,i_out]), data_df, 0, i_subset)
                        if cr > cr_max:
                            cr_max = cr
                            max_i_out = i_out
                    except Exception as e: 
                        print(e)
                        break

# once the best config is found then compute the results on testing data
print cr_max
print max_i_out
# print result_name
# print max_i_out
generate_final_table(ps, folder_path, result_name, max_i_out)

The avg completion rate in treatment 0.85652173913
The avg completion rate in control 0.900432900433
0.8372093023255814
13
The avg completion rate in treatment 0.903765690377
The avg completion rate in control 0.864035087719
Comparison between treatment and control
Ttest_indResult(statistic=1.341792109212716, pvalue=0.18031826042837046)
Treatment group
239
Control group
228
Effect size: 
x group: # 239 	 mean 0.903765690377 	 std 0.295531218606
y group: # 228 	 mean 0.864035087719 	 std 0.343505431023
0.12421593131805926
**********
Comparison between matched and unmatched
Ttest_indResult(statistic=0.8168869816015373, pvalue=0.4144113304332846)
Matched group: 
232
Unmatched group: 
235
Effect size: 
x group: # 232 	 mean 0.896551724138 	 std 0.305201951303
y group: # 235 	 mean 0.872340425532 	 std 0.334422660856
0.07560360277524651
**********
Comparison between matched and actual treatment
Ttest_indResult(statistic=-0.2606171899770463, pvalue=0.7945021194891249)
Matched group
232
Actua

### using rebar to calculate standard error

In [84]:
def calculate_confidence_interval(x, y):
    nx = len(x)
    ny = len(y)
    dof = nx + ny - 2
    print 'x group: # {} \t mean {} \t std {}'.format(nx, mean(x), std(x, ddof=1))
    print 'y group: # {} \t mean {} \t std {}'.format(ny, mean(y), std(y, ddof=1))
    # standard error
    se = np.sqrt(np.var(x) / nx + np.var(y) / ny)
    # confidence interval
    print stats.t.interval(0.95, dof, loc=mean(y)-mean(x), scale=se)

In [108]:
print folder_path
print result_name
file_name = folder_path + result_name + '/result.test.npz'
# load test data
data_df = load_data(ps)
# load predictions on test
result = load_result_file(file_name)
preds = pd.DataFrame(result['pred'][:,:,0,i_out])
concated_df = post_analysis(preds, data_df)

results/sea/303899/
results_20180404_223710-765816
The avg completion rate in treatment 0.903765690377
The avg completion rate in control 0.864035087719


In [109]:
x = concated_df[concated_df['condition'] == 0]['completion']
y = concated_df[concated_df['condition'] == 1]['completion']
calculate_confidence_interval(x, y)

x group: # 228 	 mean 0.864035087719 	 std 0.343505431023
y group: # 239 	 mean 0.903765690377 	 std 0.295531218606
(-0.018535266035555754, 0.09799647135009734)


In [110]:
# rebar
concated_df['rebar'] = concated_df['completion'] - \
    ((concated_df['potential_control_outcome']+concated_df['potential_treatment_outcome'])/2)
#concated_df['rebar'] = concated_df['completion'] - concated_df['potential_control_outcome']
x = concated_df[concated_df['condition'] == 0]['rebar']
y = concated_df[concated_df['condition'] == 1]['rebar']
calculate_confidence_interval(x, y)
print stats.t.interval(0.95, len(concated_df)-1, loc=mean(concated_df['treatment_effect']), scale=stats.sem(concated_df['treatment_effect']))

x group: # 228 	 mean -0.0198218512692 	 std 0.341787613751
y group: # 239 	 mean 0.0170406060114 	 std 0.314552417479
(-0.022818120552162827, 0.09654303511336602)
(-0.06119618920356391, -0.03290342184036615)


In [106]:
print mean(concated_df['treatment_effect'])
print concated_df['treatment_effect'].mean()

-0.06510247
-0.06510247


In [104]:
concated_df.head()

Unnamed: 0,0,1,condition,completion,4,5,6,7,8,9,...,18,19,20,f,cf,treatment_effect,potential_treatment_outcome,potential_control_outcome,recommended_condition,rebar
0,2280,119747,1,1,0,0.653171,0.977011,0.0,0.0,0.0,...,0.524691,0.0,0.0,0.019668,0.922546,-0.902878,0.019668,0.922546,0,0.528893
1,2282,125427,1,1,0,0.713937,0.875,-0.631573,0.875,-0.631573,...,0.04321,0.098592,0.033333,0.00381,0.163985,-0.160174,0.00381,0.163985,0,0.916102
2,2285,125676,0,1,0,0.69949,0.888889,-0.020667,0.923077,-0.028615,...,0.098765,0.169014,0.033333,0.164135,0.003812,-0.160324,0.003812,0.164135,0,0.916026
3,2286,125816,0,1,0,0.702575,0.9,-0.180614,0.857143,-0.196454,...,0.166667,0.253521,0.133333,0.98726,0.997491,0.010231,0.997491,0.98726,1,0.007624
4,2287,125820,1,1,0,0.699217,1.0,0.065645,1.0,0.144419,...,0.339506,0.352113,0.0,0.997447,0.986313,0.011134,0.997447,0.986313,1,0.00812


In [None]:
# test on evaluation.npz
result_file = 'results/sea/263052/results_20180331_081922-987867/result.test.npz'
data_df = load_data(263052, 'test')
result = load_result_file(result_file)
preds = result['pred']
n_units, _, n_rep, n_outputs = preds.shape
#i_subset = result['val'][0].tolist()
i_subset = None
for i_out in range(n_outputs):
    _, cr = calculate_completion(pd.DataFrame(preds[:,:,0,i_out]), data_df, 0, i_subset)
    if cr > cr_max:
        cr_max = cr
        result_name = name
        max_i_out = i_out
    print cr

In [51]:
generate_final_table(263052, 'results/saved/263052/', 'SEA-results_20180220_105747-063679', -1)

The avg completion rate in treatment 0.609523809524
The avg completion rate in control 0.652173913043
Comparison between treatment and control
Ttest_indResult(statistic=-0.615850962007453, pvalue=0.5387110562047797)
Treatment group
105
Control group
92
Effect size: 
x group: # 105 	 mean 0.609523809524 	 std 0.490196940314
y group: # 92 	 mean 0.652173913043 	 std 0.478890260404
-0.08794682673762517
**********
Comparison between matched and unmatched
Ttest_indResult(statistic=0.9113742153493164, pvalue=0.3632234923111105)
Matched group: 
92
Unmatched group: 
105
Effect size: 
x group: # 92 	 mean 0.663043478261 	 std 0.475259880626
y group: # 105 	 mean 0.6 	 std 0.492247592485
0.13014913535118489
**********
Comparison between matched and actual treatment
Ttest_indResult(statistic=0.7754729588346174, pvalue=0.4389996567455109)
Matched group
92
Actual treatment group
105
Effect size: 
x group: # 92 	 mean 0.663043478261 	 std 0.475259880626
y group: # 105 	 mean 0.609523809524 	 std 0.4