In [None]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
from tqdm import tqdm
from scipy.stats import norm
from scipy.optimize import minimize
from copy import deepcopy
from sklearn.linear_model import LinearRegression
import torch
import torch.nn as nn

In [None]:
# custom loss function 
def custom_loss(output, target, treatment):
    """Mean squared error between ``target`` and the treatment-weighted output.

    Each row of ``output`` is multiplied element-wise by the matching row of
    ``treatment`` and summed along dim 1, giving one prediction per row. The
    returned value is the mean of the squared differences between ``target``
    and those predictions.
    """
    prediction = (output * treatment).sum(dim=1)
    residual = target - prediction
    return residual.pow(2).mean()

# function to calculate the Phi value
def Phi(x,theta,Gamma,treatment,y,k,device):
    """Evaluate the Phi value of coordinate ``k + 1`` for one observation.

    ``theta`` is switched to eval mode and applied to ``x`` (converted to a
    float32 tensor on ``device``) without tracking gradients. The result is

        output[k + 1] - e_{k+1}^T  Gamma^{-1}  (2 * (output . treatment - y) * treatment)

    where ``e_{k+1}`` is the one-hot vector with a 1 at position ``k + 1``.

    Assumes ``x`` is a single 1-D sample so that ``theta(x)`` is a 1-D vector
    whose length matches ``treatment`` and the side of the square matrix
    ``Gamma`` -- TODO confirm against callers. Under that assumption the
    return value is a 1x1 numpy array, because the correction term is a
    matrix product.
    """
    theta.eval()
    with torch.no_grad():
        features = torch.tensor(x, dtype=torch.float32).to(device)
        prediction = theta(features).detach().cpu().numpy()

    # one-hot selector for coordinate k + 1 of the network output
    selector = np.zeros(prediction.shape[0])
    selector[k + 1] = 1

    plug_in = np.dot(prediction, selector)

    # 2 * (predicted outcome - observed outcome) * treatment, as a column vector
    scaled_residual = 2*(np.dot(prediction, treatment) - y)*treatment.reshape(-1,1)
    correction = selector.reshape(1,-1) @ np.linalg.inv(Gamma) @ scaled_residual

    return plug_in - correction

# function to split the data into S parts
def data_split(S,Hist_feature,Hist_treatment,Hist_label):
    """Randomly partition the rows of the three arrays into ``S`` equal parts.

    The row indices are shuffled once with ``np.random.shuffle`` and cut into
    ``S`` consecutive chunks of ``int(N / S)`` indices each, where ``N`` is the
    number of rows of ``Hist_feature``. Any rows left over after the last
    chunk are not placed in any part. The same chunk of indices is used for
    the features, treatments and labels, so the parts stay row-aligned.

    Returns three lists of length ``S``: features, treatments and labels.
    """
    N = Hist_feature.shape[0]
    number_per_split = int(N/S)

    shuffled = np.arange(N)
    np.random.shuffle(shuffled)

    # row indices belonging to each part
    parts = [shuffled[j*number_per_split:(j+1)*number_per_split] for j in range(S)]

    feature_list = [Hist_feature[rows,:] for rows in parts]
    treatment_list = [Hist_treatment[rows,:] for rows in parts]
    label_list = [Hist_label[rows] for rows in parts]

    return feature_list,treatment_list,label_list

# A simple two-layer fully connected neural network
class TwoLayerFCN(nn.Module):
    """Fully connected network of the form Linear -> ReLU -> Linear."""

    def __init__(self, input_dim, hidden_dim, output_dim):
        super().__init__()
        # input -> hidden
        self.fc1 = nn.Linear(input_dim, hidden_dim)
        # hidden -> output
        self.fc2 = nn.Linear(hidden_dim, output_dim)
        # non-linearity applied between the two linear maps
        self.activation = nn.ReLU()

    def forward(self, x):
        """Return ``fc2(relu(fc1(x)))``; no activation is applied to the output."""
        hidden = self.activation(self.fc1(x))
        return self.fc2(hidden)

# Function to train the model
def train_model(S,s,feature_list,treatment_list,label_list,dim_feature,K,device,
                n_epochs=10,batch_size=32,lr=0.01):
    """Fit a ``TwoLayerFCN`` on every split except split ``s`` and estimate Gamma.

    Parameters
    ----------
    S : int
        Number of splits held in the three lists.
    s : int
        Index of the split that is left out of training.
    feature_list, treatment_list, label_list : list of np.ndarray
        Per-split arrays. The entries of all splits other than ``s`` are
        concatenated along axis 0 to form the training set.
    dim_feature : int
        Input width passed to ``TwoLayerFCN``.
    K : int
        The network is built with ``K + 10`` hidden units and ``K + 1`` outputs.
    device : torch.device
        Device the training tensors and the model are moved to.
    n_epochs, batch_size, lr : int, int, float, optional
        Number of passes over the data, mini-batch size and Adam learning
        rate. The defaults are the values that used to be hard-coded.

    Returns
    -------
    model : TwoLayerFCN
        The trained network (left in training mode).
    estimated_Gamma : np.ndarray
        ``2 * T.T @ T / n`` where ``T`` is the training treatment array and
        ``n`` its number of rows.

    Notes
    -----
    Every sample is used in every epoch: the last mini-batch of an epoch may
    be shorter than ``batch_size``. The previous loop iterated over
    ``n // batch_size`` batches only, which silently discarded the remaining
    samples and performed no optimisation step at all when ``n < batch_size``.
    """
    kept_splits = [i for i in range(S) if i != s]
    train_data_array = np.concatenate([feature_list[i] for i in kept_splits], axis=0)
    train_label_array = np.concatenate([label_list[i] for i in kept_splits], axis=0)
    train_treatment_array = np.concatenate([treatment_list[i] for i in kept_splits], axis=0)

    data = torch.tensor(train_data_array, dtype=torch.float32).to(device)
    label = torch.tensor(train_label_array, dtype=torch.float32).to(device)
    treatment = torch.tensor(train_treatment_array, dtype=torch.float32).to(device)

    model = TwoLayerFCN(dim_feature, K+10, K+1).to(device)
    optimizer = torch.optim.Adam(model.parameters(), lr=lr)

    n_samples = data.shape[0]
    model.train()
    for _ in range(n_epochs):
        # fresh random order of the training rows for this epoch
        index = np.arange(n_samples)
        np.random.shuffle(index)

        # Walk the shuffled indices in strides of batch_size; the final slice
        # is simply shorter when n_samples is not a multiple of batch_size.
        for start in range(0, n_samples, batch_size):
            batch_index = index[start:start+batch_size]
            data1 = data[batch_index,:]
            treatment1 = treatment[batch_index,:]
            label1 = label[batch_index]

            output = model(data1)
            loss = custom_loss(output, label1, treatment1)
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

    # twice the empirical second-moment matrix of the treatment vectors
    estimated_Gamma = 2 * np.dot(train_treatment_array.T, train_treatment_array) / train_treatment_array.shape[0]

    return model, estimated_Gamma

In [None]:
dim_feature = 4 # number of features
K = 5 # number of treatments
Z = norm.ppf(0.975) # 0.975 quantile of the standard normal (two-sided 95% level)
N = 100 # number of samples

T = 1000 # number of iterations
S = 2 # number of splits
number_per_split = int((K+N)/S)   # number of samples per split (K+N rows are generated per iteration)
noise_list = [1,2,3,4,5] # noise level

# Fix the random streams so that a fresh kernel reproduces the same numbers.
SEED = 0
np.random.seed(SEED)
torch.manual_seed(SEED)

true_ate_array   = np.zeros((T,K))
estimated_phi = np.zeros((len(noise_list),T,K,S,number_per_split))
device = torch.device('cpu')

# Columns 1..K of the (K+1)x(K+1) identity: selects the K treatment
# coordinates of the (K+1)-wide network output and drops coordinate 0.
indicator_matrix = np.eye(K+1)[:, 1:K+1]

for t in tqdm(range(T)):
    # coefficients of the treatment-independent linear term
    constant_linear = np.random.uniform(-0.3,0.5,dim_feature)

    # coefficients of the K treatment-specific linear terms
    cofficent_linear = np.random.uniform(-0.3,0.5,(K,dim_feature))

    # true average effect of each treatment; 0.5 is the mean of the
    # Uniform(0,1) features generated below
    true_ate = np.sum(cofficent_linear[:,:]*0.5, axis=1)
    true_ate_array[t] = true_ate

    # generate data
    Hist_feature = np.random.uniform(0,1,(K+N,dim_feature))

    # Bernoulli(0.5) treatment indicators; column 0 is forced to 1
    Hist_treatment = np.random.binomial(1,0.5,(K+N,K+1))
    Hist_treatment[:,0] = 1

    # `noise_idx` rather than `id`, so the builtin id() is not shadowed
    for noise_idx,std in enumerate(noise_list):

        # generate label: baseline term + effects of the active treatments + Gaussian noise
        Hist_label = np.dot(Hist_feature,constant_linear.T) + np.sum(np.dot(Hist_feature,cofficent_linear.T)*Hist_treatment[:,1:],axis = 1) + np.random.normal(0,std,K+N)
        # split data
        feature_list,treatment_list,label_list = data_split(S,Hist_feature,Hist_treatment,Hist_label)

        # train on all splits but s, then evaluate Phi on the held-out split s
        for s in range(S):
            model,estimated_Gamma = train_model(S,s,feature_list,treatment_list,label_list,dim_feature,K,device)
            feature_ate = feature_list[s]
            treatment_ate = treatment_list[s]
            label_ate = label_list[s]

            # inference only: no autograd graph is needed for the held-out predictions
            model.eval()
            with torch.no_grad():
                output = model(torch.tensor(feature_ate, dtype=torch.float32).to(device)).cpu().numpy()

            term1 = np.dot(output, indicator_matrix)
            term2 = indicator_matrix.T @ np.linalg.inv(estimated_Gamma) @ (2*(np.sum(output*treatment_ate,axis = 1) - label_ate)*treatment_ate.T)  # shape: (K, number_per_split)
            estimated_phi[noise_idx,t,:,s,:] = term1.T - term2

In [None]:
def _relative_cost(true_tao, selected, optimal_cost):
    """Share of the optimal total effect captured by the selected treatments.

    ``selected`` is a boolean mask over treatments. When ``optimal_cost`` is
    0 the result is 1 if nothing was selected and 0 otherwise; in every other
    case it is the sum of ``true_tao`` over the selected treatments divided
    by ``optimal_cost``.
    """
    if optimal_cost == 0:
        return 0 if selected.any() else 1
    return np.sum(true_tao[selected])/optimal_cost


# cost[i, 0, t]: plain thresholding rule; cost[i, 1, t]: shrunken (DPTR) rule.
# The first axis follows noise_list instead of a hard-coded 5.
n_noise = len(noise_list)
cost = np.zeros((n_noise,2,T))
for i in range(n_noise):
    for t in range(T):
        phi = estimated_phi[i,t]
        hat_ATE = np.mean(phi,axis=(1,2))

        # squared deviations from the per-treatment mean (broadcast over the last two axes)
        squared_dev = (phi - hat_ATE.reshape(K,1,1))**2
        estimated_variance1 = np.mean(squared_dev,axis=(1,2))  # one variance per treatment
        estimated_variance = np.mean(squared_dev)               # pooled over all treatments

        true_tao = true_ate_array[t]
        # total effect obtained by selecting exactly the treatments with positive true effect
        optimal_cost = np.sum(true_tao[true_tao>0])

        estimated_variance = estimated_variance*N/(N+K)

        # select treatments whose estimate exceeds its per-treatment threshold
        decision1 = hat_ATE>(Z*np.sqrt(estimated_variance1))/np.sqrt(N)
        cost[i,0,t] = _relative_cost(true_tao, decision1, optimal_cost)

        #DPTR
        anchor_tau = np.mean(hat_ATE)
        if anchor_tau == 0:
            # the shrinkage weight below divides by anchor_tau; reuse the plain rule's cost
            cost[i,1,t] = cost[i,0,t]
            continue
        # NOTE(review): the first denominator is not guarded against being zero or negative.
        beta = estimated_variance/(np.mean((hat_ATE - anchor_tau)**2) - estimated_variance/N) + Z*np.sqrt(N*estimated_variance)/anchor_tau
        theta = N/(N+beta)
        # shrink every estimate towards the common anchor
        hat_ATE_shrunken = theta*hat_ATE + (1-theta)*anchor_tau
        decision2 = hat_ATE_shrunken>(theta*Z*np.sqrt(estimated_variance1))/np.sqrt(N)
        cost[i,1,t] = _relative_cost(true_tao, decision2, optimal_cost)