In [None]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
from tqdm import tqdm
from scipy.stats import norm
from scipy.optimize import minimize
from copy import deepcopy
from sklearn.linear_model import LinearRegression
import torch
import torch.nn as nn
#import scipy linear regression
import statsmodels.api as sm
from scipy.stats import t as t_dist

## 1 Boxplot in basic setting 

In [None]:
# --- Experiment configuration ------------------------------------------------
K = 100  # number of experiments
T = 1000  # number of Monte Carlo replications

N = 10  # number of samples per experiment (each replication draws N + K rows)
# NOTE(review): the OLS fit below has N + K rows and K + 1 columns, i.e. N - 1
# residual degrees of freedom, while this critical value uses N. Confirm which
# one is intended.
Z = t_dist.ppf(0.975, N)
anchor_mean = 1  # mean of the distribution the true effects are drawn from
std_niose = 3  # standard deviation of the observation noise

# Local generator so that re-running this cell reproduces the same draws
# without touching NumPy's global random state.
SEED = 0
rng = np.random.default_rng(SEED)

# Result arrays: row 0 is IHT, row 1 is DRT; column t is replication t.
# All of them are sized by T so that changing T cannot leave stale zero
# columns behind or index out of bounds.
cost = np.zeros((2, T))
accuracy = np.zeros((2, T))
recall = np.zeros((2, T))
FPR = np.zeros((2, T))
precision = np.zeros((2, T))

for t in tqdm(range(T)):

    # generate true ATEs and the intercept
    true_tao = rng.normal(anchor_mean, 3, K)
    constant = rng.normal(anchor_mean, 3)
    is_positive = true_tao > 0
    is_negative = true_tao < 0
    # Benchmark: total effect obtained by selecting exactly the positive ATEs.
    optimal_cost = np.sum(true_tao[is_positive])

    # generate data: intercept column followed by K Bernoulli(0.5) indicators
    feature = np.ones((N + K, K + 1))
    feature[:, 1:] = rng.binomial(1, 0.5, (N + K, K))

    # generate label
    label = constant + np.dot(feature[:, 1:], true_tao) + rng.normal(0, std_niose, N + K)

    # OLS estimates; index 0 is the intercept and is dropped everywhere below.
    model = sm.OLS(label, feature).fit()
    hat_ATE = model.params[1:]
    estimated_variance = np.diag(model.cov_params())[1:]
    std_err = np.sqrt(estimated_variance)

    # IHT: select an experiment when its estimate exceeds Z standard errors.
    selected_iht = hat_ATE > (Z * std_err)

    # DRT: shrink every estimate toward the grand mean before thresholding.
    tau_0 = np.mean(hat_ATE)
    if tau_0 == 0:
        # beta is undefined for a zero grand mean; fall back to the IHT
        # decisions so that every metric is still recorded for this draw.
        selected_drt = selected_iht
    else:
        mean_variance = np.mean(estimated_variance)
        beta = N * mean_variance / (np.var(hat_ATE) - mean_variance) + Z * N * np.sqrt(mean_variance) / tau_0
        beta = max(0, beta)
        theta = N / (N + beta)
        shrunken_ATE = theta * hat_ATE + (1 - theta) * tau_0
        selected_drt = shrunken_ATE > (theta * Z * std_err)

    # Classification metrics, treating "true ATE > 0" as the positive class.
    n_positive = np.sum(is_positive)
    n_negative = np.sum(is_negative)
    for method, selected in enumerate((selected_iht, selected_drt)):
        true_pos = np.sum(is_positive & selected)
        false_pos = np.sum(is_negative & selected)
        true_neg = np.sum(is_negative & ~selected)

        cost[method, t] = np.sum(true_tao[selected]) / optimal_cost
        accuracy[method, t] = (true_pos + true_neg) / K
        recall[method, t] = true_pos / n_positive
        FPR[method, t] = false_pos / n_negative

        # With nothing selected the precision is 0/0; record 1 for both
        # methods instead of letting a NaN propagate into the summaries.
        n_selected = true_pos + false_pos
        precision[method, t] = true_pos / n_selected if n_selected > 0 else 1

In [None]:
# Box plot of the per-replication difference between the two methods
# (row 1 minus row 0 of each metric array) for the basic setting.
fig, ax = plt.subplots(figsize=(8, 6))
ax.spines['top'].set_visible(False)
ax.spines['right'].set_visible(False)

# One series per x tick. The fourth entry is row 0 minus row 1 of FPR, which
# equals the difference in specificity (1 - FPR) taken as row 1 minus row 0.
data = [
    cost[1, :] - cost[0, :],
    accuracy[1, :] - accuracy[0, :],
    recall[1, :] - recall[0, :],
    FPR[0, :] - FPR[1, :],
    precision[1, :] - precision[0, :],
]

bp = ax.boxplot(data, showfliers=False, showmeans=True, patch_artist=True)

colors = ['#9DB4CE', '#EDA1A4', '#FCB462', '#7BC4C5', '#893E81']   # box, whisker, cap colours
colors1 = ['#A3A5A6', '#A3A5A6', '#FFE8CE', '#D9EEEE', '#DCA5C3']  # median-line colours

for i, (box, color) in enumerate(zip(bp['boxes'], colors)):
    box.set_facecolor(color)
    box.set_edgecolor(color)
    # Whiskers and caps come in pairs: entries 2*i and 2*i + 1 belong to box i.
    for line in bp['whiskers'][2 * i:2 * i + 2] + bp['caps'][2 * i:2 * i + 2]:
        line.set_color(color)

for median, color in zip(bp['medians'], colors1):
    median.set_color(color)

# Fliers come as ONE artist per box (not two), so they are paired directly
# with the box colours. With showfliers=False the list is empty and this loop
# is a no-op; it only matters if outliers are switched on.
for flier, color in zip(bp['fliers'], colors):
    flier.set_markerfacecolor(color)
    flier.set_markeredgecolor(color)

ax.set_ylim(-1.1, 1)
plt.yticks(fontsize=10)
ax.grid(axis='y', linestyle='--', alpha=0.6)
plt.xticks([1, 2, 3, 4, 5], ['OR', 'Accuracy', 'Recall', 'Specificity', 'Precision'], fontsize=15)

fig.savefig('boxplot_basic_overlap.png', dpi=300, bbox_inches='tight')
plt.show()

## 2 Cost calculation

#### 2.1, Change with anchor mean

In [None]:
# --- Experiment configuration ------------------------------------------------
K = 5  # number of experiments (K = 100 is the alternative setting)
T = 1000  # number of Monte Carlo replications per anchor mean
N = 10  # each replication draws N + K rows
# NOTE(review): the OLS fit below has N + K rows and K + 1 columns, i.e. N - 1
# residual degrees of freedom, while this critical value uses N. Confirm which
# one is intended.
Z = t_dist.ppf(0.975, N)
std_niose = 3  # standard deviation of the observation noise
anchor_set = [1, 2, 3, 4, 5]  # means of the true-effect distribution to sweep
ALPHA = 0.025  # the Bayesian rule selects k when P(effect_k > 0) > 1 - ALPHA

# Local generator so that re-running this cell reproduces the same draws
# without touching NumPy's global random state.
SEED = 0
rng = np.random.default_rng(SEED)

# cost[idx, method, t] is the optimality ratio for anchor_set[idx] in
# replication t; method 0 = IHT, 1 = shrinkage rule (DRT), 2 = Bayesian rule.
cost = np.zeros((len(anchor_set), 3, T))
for idx, anchor_mean in enumerate(anchor_set):
    for t in tqdm(range(T)):

        # generate true ATEs and the intercept
        true_tao = rng.normal(anchor_mean, 3, K)
        constant = rng.normal(anchor_mean, 3)
        # Benchmark: total effect obtained by selecting exactly the positive ATEs.
        optimal_cost = np.sum(true_tao[true_tao > 0])

        # generate data: intercept column followed by K Bernoulli(0.5) indicators
        feature = np.ones((N + K, K + 1))
        feature[:, 1:] = rng.binomial(1, 0.5, (N + K, K))

        # generate label
        label = constant + np.dot(feature[:, 1:], true_tao) + rng.normal(0, std_niose, N + K)

        # OLS estimates; index 0 is the intercept and is dropped below.
        model = sm.OLS(label, feature).fit()
        hat_ATE = model.params[1:]
        estimated_variance = np.diag(model.cov_params())[1:]
        std_err = np.sqrt(estimated_variance)

        tau_0 = np.mean(hat_ATE)
        mean_variance = np.mean(estimated_variance)
        # Spread of the estimates minus the average sampling variance
        # (called `denumerator` in the earlier version of this cell).
        prior_var = np.var(hat_ATE) - mean_variance

        # IHT: select when the estimate exceeds Z standard errors.
        selected_iht = hat_ATE > (Z * std_err)

        # Shrinkage rule: pull every estimate toward the grand mean first.
        if tau_0 == 0:
            # beta is undefined for a zero grand mean; reuse the IHT
            # decisions, which reproduces cost[idx, 1, t] = cost[idx, 0, t].
            selected_drt = selected_iht
        else:
            beta = N * mean_variance / prior_var + Z * N * np.sqrt(mean_variance) / tau_0
            beta = max(0, beta)
            theta = N / (N + beta)
            shrunken_ATE = theta * hat_ATE + (1 - theta) * tau_0
            selected_drt = shrunken_ATE > (theta * Z * std_err)

        # Bayesian decision rule: normal posterior per experiment.
        if prior_var <= 0:
            # No usable prior variance: theta = 1, so the posterior is just
            # the estimate with its own sampling variance.
            posteri_mean = hat_ATE
            posteri_var = estimated_variance
        else:
            bayesian_beta = np.maximum(N * estimated_variance / prior_var, 0)
            bayesian_theta = N / (N + bayesian_beta)
            posteri_mean = hat_ATE * bayesian_theta + (1 - bayesian_theta) * tau_0
            posteri_var = 1 / (1 / prior_var + 1 / estimated_variance)
        # Survival function at 0: posterior probability that the effect is > 0.
        prob = norm.sf(0, loc=posteri_mean, scale=np.sqrt(posteri_var))
        selected_bayes = prob > 1 - ALPHA

        # Optimality ratio for each rule. When no true effect is positive the
        # benchmark is 0: selecting nothing scores 1, anything else scores 0.
        for method, selected in enumerate((selected_iht, selected_drt, selected_bayes)):
            if optimal_cost == 0:
                cost[idx, method, t] = 0 if selected.any() else 1
            else:
                cost[idx, method, t] = np.sum(true_tao[selected]) / optimal_cost

In [None]:
# Mean optimality ratio of each decision rule as a function of the anchor mean.
plt.figure(figsize=(8, 6))
cost1 = np.mean(cost, axis=2)  # average over replications -> (len(anchor_set), 3)
plt.xlabel(r'The value $\tau_0$', fontsize=15)
plt.ylabel('Optimality Ratio (OR)', fontsize=15)

# (column of cost1, line style) in legend order.
# NOTE(review): the legend says "DTR" while the simulation comments say "DRT";
# confirm which spelling is intended before publishing the figure.
series = [
    (1, dict(color='#495373', marker="s", label="DTR")),
    (2, dict(color='#E3738B', marker="s", label="Bayesian", linestyle='--')),
    (0, dict(color='#8CA5EA', linestyle='--', marker='o', label="IHT")),
]
for method, line_style in series:
    plt.plot(anchor_set, cost1[:, method], **line_style)

# Ticks follow the swept values instead of a hard-coded copy of them.
plt.xticks(anchor_set, fontsize=15)
plt.yticks(fontsize=15)
plt.legend(fontsize=15)
plt.grid()
plt.savefig('performance_compare_with_tau0_correlation_K_{}.png'.format(K), dpi=300, bbox_inches='tight')
plt.show()

#### 2.2, Change with variance of error term

In [None]:
# parameters setting
K = 100  # number of experiments
T = 1000  # number of Monte Carlo replications per noise level
N = 10  # each replication draws N + K + 1 rows
# The fit below has N + K + 1 rows and K + 1 columns, i.e. N residual degrees
# of freedom, which is what this critical value uses.
Z = t_dist.ppf(0.975, N)
anchor_mean = 1  # mean of the distribution the true effects are drawn from
std_noise_set = [1, 2, 3, 4, 5]  # noise standard deviations to sweep

# Local generator so that re-running this cell reproduces the same draws
# without touching NumPy's global random state.
SEED = 0
rng = np.random.default_rng(SEED)

# cost[idx, method, t] is the optimality ratio for std_noise_set[idx] in
# replication t; method 0 = IHT, 1 = shrinkage rule. Slot 2 is never written
# in this cell; the 3-wide layout is kept unchanged.
cost = np.zeros((len(std_noise_set), 3, T))
for idx, std_niose in enumerate(std_noise_set):
    for t in tqdm(range(T)):
        # generate true ATEs and the intercept
        true_tao = rng.normal(anchor_mean, 3, K)
        constant = rng.normal(anchor_mean, 3)
        # Benchmark: total effect obtained by selecting exactly the positive ATEs.
        optimal_cost = np.sum(true_tao[true_tao > 0])

        # generate data: intercept column followed by K Bernoulli(0.5) indicators
        n_rows = N + K + 1
        feature = np.ones((n_rows, K + 1))
        feature[:, 1:] = rng.binomial(1, 0.5, (n_rows, K))

        # generate label
        label = constant + np.dot(feature[:, 1:], true_tao) + rng.normal(0, std_niose, n_rows)

        # OLS estimates; index 0 is the intercept and is dropped below.
        model = sm.OLS(label, feature).fit()
        hat_ATE = model.params[1:]
        estimated_variance = np.diag(model.cov_params())[1:]
        std_err = np.sqrt(estimated_variance)

        # IHT: select when the estimate exceeds Z standard errors.
        selected_iht = hat_ATE > (Z * std_err)

        # Shrinkage rule: pull every estimate toward the grand mean first.
        tau_0 = np.mean(hat_ATE)
        if tau_0 == 0:
            # beta is undefined for a zero grand mean; reuse the IHT
            # decisions, which reproduces cost[idx, 1, t] = cost[idx, 0, t].
            selected_drt = selected_iht
        else:
            mean_variance = np.mean(estimated_variance)
            beta = N * mean_variance / (np.var(hat_ATE) - mean_variance) + Z * N * np.sqrt(mean_variance) / tau_0
            beta = max(0, beta)
            theta = N / (N + beta)
            shrunken_ATE = theta * hat_ATE + (1 - theta) * tau_0
            selected_drt = shrunken_ATE > (Z * std_err * theta)

        # Optimality ratio for each rule. When no true effect is positive the
        # benchmark is 0: selecting nothing scores 1, anything else scores 0.
        for method, selected in enumerate((selected_iht, selected_drt)):
            if optimal_cost == 0:
                cost[idx, method, t] = 0 if selected.any() else 1
            else:
                cost[idx, method, t] = np.sum(true_tao[selected]) / optimal_cost

In [None]:
# Relative gain of the shrinkage rule (method 1) over IHT (method 0) in mean
# optimality ratio, plotted against the noise standard deviation.
y1 = np.mean(cost[:, 0, :], axis=1)  # mean ratio per noise level, method 0
y2 = np.mean(cost[:, 1, :], axis=1)  # mean ratio per noise level, method 1
plt.figure(figsize=(8, 6))
plt.xlabel(r"The standard deviation of noise $\sigma$", fontsize=15)
plt.ylabel('Value of Data Pooling (VDP)', fontsize=15)

# Element-wise ratio, so the curve always has one point per entry of
# std_noise_set instead of assuming exactly five noise levels.
plt.plot(std_noise_set, y2 / y1 - 1, color='#8CA5EA', linestyle='--', marker='o')

plt.xticks(std_noise_set, fontsize=15)
plt.yticks(fontsize=15)
plt.grid()
plt.savefig('performance_compare_with_sigma1_correlation_K_{}.png'.format(K), dpi=300, bbox_inches='tight')
plt.show()