In [None]:
import pandas as pd  
import numpy as np
import matplotlib.pyplot as plt
from tqdm import tqdm
from scipy.stats import norm
from scipy.stats import t as t_dist
import numpy as np
from scipy.stats import ttest_ind
from copy import deepcopy
import statsmodels.api as sm
import numpy as np

In [None]:
# Load the full CSV into memory; the path is relative to the notebook's
# working directory.
raw_data = pd.read_csv(filepath_or_buffer='criteo-uplift-v2.1.csv')

In [None]:
# Column names of the 12 covariates: f0 ... f11.
features = [f'f{i}' for i in range(12)]

# Copy the covariate columns out of the frame and convert them to a plain
# 2-D NumPy array (rows = observations, columns follow `features`).
feature_data = np.array(raw_data[features].copy())

In [None]:
# Median split on the first feature only: list_1 holds the row indices strictly
# above the median, list_2 the row indices at or below it.
median = np.median(feature_data[:, 0])
list_1 = list(np.flatnonzero(feature_data[:, 0] > median))
list_2 = list(np.flatnonzero(feature_data[:, 0] <= median))

In [None]:
# Binarise every feature with a balanced median split.
# binary_groups[i, j] is 1 when row i lands in the "high" half of feature j
# and 0 when it lands in the "low" half.
n_rows, n_features = feature_data.shape
binary_groups = np.zeros((n_rows, n_features))

for j in range(n_features):
    column = feature_data[:, j]
    median = np.median(column)

    list_1 = list(np.flatnonzero(column > median))   # strictly above the median
    list_2 = list(np.flatnonzero(column < median))   # strictly below the median
    list_3 = list(np.flatnonzero(column == median))  # ties, distributed below

    if len(list_3) != 0:
        sum_1 = len(list_1)
        sum_2 = len(list_2)

        # The side that is currently smaller receives the first (larger)
        # chunk of the shuffled ties; on equal sizes list_1 plays that role.
        if sum_1 > sum_2:
            smaller_side, larger_side = list_2, list_1
        else:
            smaller_side, larger_side = list_1, list_2
        a = abs(sum_1 - sum_2)

        # Shuffle the tied rows, then cut so that the size gap `a` is made up.
        perm = np.random.permutation(list_3)
        split_idx = int((len(list_3) + a) / 2)

        smaller_side.extend(perm[:split_idx])
        larger_side.extend(perm[split_idx:])

    binary_groups[list_1, j] = 1
    binary_groups[list_2, j] = 0


In [None]:
# One label per row: the row's entries are converted to strings and
# concatenated, so rows with the same binary pattern share the same label.
subgroup_labels = np.apply_along_axis(
    func1d=lambda flags: ''.join(flags.astype(str)),
    axis=1,
    arr=binary_groups,
)

In [None]:
# Attach the labels to the frame as a categorical column. The Series carries a
# default integer index, so the assignment aligns on raw_data's index.
raw_data['subgroup_labels'] = pd.Series(data=subgroup_labels, dtype="category")

In [None]:
# Build the finite "population" used by the simulation: for every subgroup,
# the observed outcomes of its treated rows and of its control rows.
# pop_data_list has one entry per outcome; each entry is a list of
# [treated_outcomes, control_outcomes] pairs, one pair per retained subgroup.
pop_data_list = []

for outcome in ["visit"]:
    pop_data = []
    for subgroup, group_df in raw_data.groupby("subgroup_labels"):
        treated = np.array(group_df[group_df["treatment"] == 1][outcome].values)
        control = np.array(group_df[group_df["treatment"] == 0][outcome].values)

        # Keep a subgroup only if it has more than 1000 rows in total AND both
        # arms are non-empty. An empty arm would make the mean difference below
        # NaN and cannot be resampled from later on.
        has_both_arms = len(treated) > 0 and len(control) > 0
        if has_both_arms and len(treated) + len(control) > 1000:
            pop_data.append([treated, control])
    pop_data_list.append(pop_data)


pop_data_visit = pop_data_list[0]

# "True" effect of each retained subgroup: difference between the mean outcome
# of its treated rows and the mean outcome of its control rows.
true_tao_visit = [
    np.mean(treated_outcomes) - np.mean(control_outcomes)
    for treated_outcomes, control_outcomes in pop_data_visit
]
true_tao1_visit = np.array(true_tao_visit)

# Weight of each retained subgroup: its share of all retained rows
# (treated + control), so the weights sum to 1.
weight_visit = np.array([
    len(treated_outcomes) + len(control_outcomes)
    for treated_outcomes, control_outcomes in pop_data_visit
])
weight_visit = weight_visit/np.sum(weight_visit)

In [None]:
# Histogram of the per-subgroup true effects, written to a PNG next to the
# notebook (300 dpi, tight bounding box).
plt.hist(true_tao1_visit, bins=20, color='#8CA5EA')
plt.xlabel('True HTEs')
plt.ylabel("Number")
plt.savefig('true_ate_in_criteo.png', bbox_inches='tight', dpi=300)

In [None]:
def _welch_p_value(sample_a, sample_b):
    """Return the two-sided p-value of Welch's unequal-variance t-test.

    When both samples have zero standard deviation, the test is skipped and
    the p-value is fixed to 0; the callers below always pair the p-value with
    a sign check on the difference in means.
    """
    if np.std(sample_a) == 0 and np.std(sample_b) == 0:
        return 0
    _, p_value = ttest_ind(sample_a, sample_b, equal_var = False)
    return p_value


# ---- Simulation configuration ------------------------------------------------
T = 1000                      # number of Monte-Carlo replications per sample size
N_list = [10,15,20,25,30]     # observations drawn per subgroup (both arms together)

# cost[idx, m, t]: value captured by method m, relative to the optimum.
# Only m = 0, 1, 2 are written below; row 3 stays zero.
cost = np.zeros((len(N_list),4,T))
# decision_right_rate[idx, m, 0, t]: share of truly positive subgroups selected by m.
# decision_right_rate[idx, m, 1, t]: share of truly negative subgroups NOT selected by m.
decision_right_rate = np.zeros((len(N_list),3,2,T))

# Work on copies so the population objects built earlier are not modified.
true_tao1 = deepcopy(true_tao1_visit)
pop_data = deepcopy(pop_data_visit)
weight = deepcopy(weight_visit)

K = true_tao1.shape[0]        # number of subgroups
tau_min = 0                   # minimum effect required to select a subgroup
true_tao = true_tao1 - tau_min

# Loop-invariant masks and counts of truly positive / negative subgroups.
is_positive = true_tao > 0
is_negative = true_tao < 0
n_positive = int(np.sum(is_positive))
n_negative = int(np.sum(is_negative))

# Value obtained by selecting exactly the truly positive subgroups.
# It is 0 when there is none, in which case the cost ratios below are undefined.
optimal_cost = np.sum(true_tao[is_positive]*weight[is_positive])

# Allocated here but not written by this cell.
estimated_tau = np.zeros((len(N_list),T,K))
calculte_zeros = np.zeros((2,T))

alpha = 0.05
Z = norm.ppf( 1 - alpha/2)    # two-sided normal critical value, shared by all rules

for idx, N in enumerate(N_list):
    n_treated = int(N/2)
    n_control = N - n_treated
    for t in tqdm(range(T)):
        tao_hat = np.zeros(K)         # per-subgroup difference in sample means
        variance = np.zeros(K)        # N * (s1^2/n1 + s0^2/n0) per subgroup
        p_value_list = np.ones(K)

        # Draw one small experiment per subgroup (sampling with replacement).
        group_hist = []
        for k in range(K):
            group_1 = np.random.choice(pop_data[k][0], n_treated, replace=True)
            group_0 = np.random.choice(pop_data[k][1], n_control, replace=True)
            group_hist.append([group_1,group_0])

            tao_hat[k] = group_1.mean() - group_0.mean()
            p_value_list[k] = _welch_p_value(group_1, group_0)
            variance[k] = N*(group_1.var(ddof=1) / len(group_1) + group_0.var(ddof=1) / len(group_0))

        # Rule 1: separate Welch test per subgroup, keeping significant estimates above tau_min.
        decision1 = np.flatnonzero((p_value_list < alpha) & (tao_hat > tau_min))

        # Quantities pooled across subgroups.
        tao_0 = np.mean(tao_hat)
        numerator = np.mean(variance)
        denumerator = np.mean((tao_hat - tao_0)**2) - numerator/N

        # Rule 3: per-subgroup shrinkage of tao_hat towards the pooled mean tao_0.
        # When `denumerator` is not positive, no shrinkage is applied (theta = 1).
        # NOTE(review): confirm that "no shrinkage" is the intended fallback here.
        bayesian_beta = np.zeros(K)
        if denumerator <= 0:
            theta_k = np.ones(K)
        else:
            bayesian_beta = np.maximum(variance/denumerator, 0)
            theta_k = N/(N+bayesian_beta)
        posteri_mean = tao_hat*theta_k + (1 - theta_k)*tao_0
        posteri_var = theta_k*variance/N
        # Uses the same critical value Z as the other rules, so it follows alpha.
        decision3 = np.flatnonzero(posteri_mean > Z*np.sqrt(posteri_var)).tolist()

        # Rule 2: a single shrinkage weight theta shared by every subgroup.
        if tao_0 == 0 or denumerator == 0:
            beta = 0
        else:
            beta = numerator/denumerator + Z*np.sqrt(N*numerator)/tao_0
        beta = max(0,beta)
        theta = N/(N+beta)

        tao_shunken_hat = np.zeros(K)
        p_value_list_shrunken = np.ones(K)
        for k in range(K):
            # Treated observations are scaled by theta and shifted by
            # (1 - theta) * tao_0; control observations are only scaled.
            group_11 = theta*group_hist[k][0] + (1-theta)*tao_0
            group_00 = theta*group_hist[k][1]

            tao_shunken_hat[k] = group_11.mean() - group_00.mean()
            p_value_list_shrunken[k] = _welch_p_value(group_11, group_00)
        decision2 = np.flatnonzero((p_value_list_shrunken < alpha) & (tao_shunken_hat > tau_min))

        # Score the three rules: captured value and per-sign hit rates.
        for m, decision in enumerate((decision1, decision2, decision3)):
            cost[idx,m,t] = np.sum(true_tao[decision]*weight[decision])/optimal_cost

            selected = np.zeros(K, dtype=bool)
            selected[decision] = True
            positive_right = int(np.sum(selected & is_positive))
            negative_right = int(np.sum(~selected & is_negative))

            # A rate is undefined (NaN) when no subgroup has that sign.
            decision_right_rate[idx,m,0,t] = positive_right/n_positive if n_positive else np.nan
            decision_right_rate[idx,m,1,t] = negative_right/n_negative if n_negative else np.nan