In [3]:
import pandas as pd
import numpy as np
import random
pd.set_option('display.max_columns', 100)

#### preprocess data 

In [2]:
# Load the raw social-pressure experiment file and recode its string-valued
# columns as 0/1 indicators. Labels are matched exactly (case and the leading
# space in ' Control' included); every other value, NaN included, falls into the
# "else" branch of np.where.
df = pd.read_csv('GerberGreenLarimer_APSR_2008_social_pressure.csv')
df['treatment'] = np.where(df.treatment == ' Control', 0, 1)  # any non-control arm -> 1
df['voted'] = np.where(df.voted == 'Yes', 1, 0)
df['sex'] = np.where(df.sex == 'male', 1, 0)
df['g2000'] = np.where(df.g2000 == 'yes', 1, 0)
df['g2002'] = np.where(df.g2002 == 'yes', 1, 0)
df['g2004'] = np.where(df.g2004 == 'yes', 1, 0)
df['p2000'] = np.where(df.p2000 == 'yes', 1, 0)
df['p2002'] = np.where(df.p2002 == 'yes', 1, 0)
df['p2004'] = np.where(df.p2004 == 'Yes', 1, 0)  # this column is matched on capitalised 'Yes'

# Columns standardized to zero mean / unit variance.
# NOTE(review): 'treatment' is standardized here as a covariate AND stored below
# as d['W']; 'cluster' and 'hh_id' look like identifiers. Confirm downstream
# models are meant to see these columns.
cts_variables_names = ["yob","treatment","cluster","hh_id","hh_size","numberofnames","p2004_mean","g2004_mean"]
# Columns kept as raw 0/1 indicators.
# NOTE(review): 'g2004' is recoded above but is not listed here -- confirm intentional.
binary_variables_names = ["sex","g2000", "g2002", "p2000", "p2002", "p2004"]

# Standardize with the population standard deviation (ddof=0), which is what
# sklearn's StandardScaler computes. StandardScaler itself is never imported in
# this notebook, so calling it fails on a fresh kernel.
cts_covariates = df[cts_variables_names].astype(float)
cts_std = cts_covariates.std(ddof=0)
cts_std = cts_std.mask(cts_std == 0, 1.0)  # constant column: centre only, avoid 0/0
scaled_cts_covariates = ((cts_covariates - cts_covariates.mean()) / cts_std).to_numpy()
binary_covariates = df[binary_variables_names]

# Final modelling frame: scaled continuous columns, then binary columns, plus
# W (treatment indicator) and Y (turnout outcome), sharing df's index.
d = pd.DataFrame(np.concatenate((scaled_cts_covariates, binary_covariates), axis=1),
                        columns=cts_variables_names+binary_variables_names, index=df.index)
d["W"] = df["treatment"]
d["Y"] = df["voted"]


#### resample from GOTV 

In [10]:
class resample_from_GOTV:
    '''
    Split a preprocessed frame into control and treated subsets of a requested size.

    the data must be preprocessed, with d['W'] = the treatment indicator and d['Y'] = the outcome

    Rows are taken from the top of each arm in their existing order; no random
    draw is performed. When an arm holds fewer rows than requested, all of its
    rows are returned. Both getters return the tuple (control_rows, treated_rows).
    '''
    def __init__(self, data):
        self.data = data
        self.d_w1 = self.data[self.data['W'] == 1]
        self.d_w0 = self.data[self.data['W'] == 0]

    def get_treat_control_equalsize(self, n_sample = 10000):
        '''
        Return n_sample // 2 control rows and n_sample // 2 treated rows.

        Raises ValueError if n_sample is negative (a negative count would
        otherwise be read as "all but the last k rows" by the slice).
        '''
        if n_sample < 0:
            raise ValueError("n_sample must be non-negative, got %r" % (n_sample,))
        per_arm = n_sample // 2

        return self.d_w0.iloc[:per_arm], self.d_w1.iloc[:per_arm]

    def get_treat_control_diffsize(self, n_sample = 10000, ratio = 0.05):
        '''
        Return an imbalanced split: floor(n_sample * ratio) treated rows and the
        remaining n_sample - n_treated control rows.

        Raises ValueError if n_sample is negative or ratio is outside [0, 1].
        '''
        if n_sample < 0:
            raise ValueError("n_sample must be non-negative, got %r" % (n_sample,))
        if not 0 <= ratio <= 1:
            raise ValueError("ratio must be within [0, 1], got %r" % (ratio,))
        # The small offset absorbs binary floating-point error before truncating:
        # 100 * 0.29 evaluates to 28.999999999999996 and would otherwise give 28.
        n_w1 = int(n_sample * ratio + 1e-9)
        n_w0 = n_sample - n_w1

        return self.d_w0.iloc[:n_w0], self.d_w1.iloc[:n_w1]
        
        

In [11]:
# Build the sampler over the preprocessed frame `d`, then take the imbalanced
# split with its default arguments (n_sample=10000, ratio=0.05). The control
# slice is returned first, the treated slice second.
resample_class = resample_from_GOTV(d)
d_w0, d_w1 = resample_class.get_treat_control_diffsize()

#### resample from synthetic data

In [14]:
# Scratch draw from the standard-library RNG on [0, 0.1]; the value is only
# displayed, not stored. It does advance the global `random` state.
random.uniform(0,0.1)

0.03347229955231084

In [4]:
class resample_from_synthetic_data:
    '''
    Generate synthetic binary-outcome data with one covariate X ~ Uniform(0, 1).

    Every generator returns a DataFrame with columns, in this order:
        W   treatment indicator; the first n_w0 rows are 0, the remaining rows are 1
        Y0  label under control
        Y1  label under treatment
        X   the uniform draw
        Y   observed label (Y1 where W == 1, otherwise Y0)

    n_sample is the number of rows per generated frame. seed is optional: when
    given, the instance draws from its own random.Random(seed); when omitted, it
    draws from the shared `random` module, so a prior random.seed(...) call still
    controls the output.
    '''
    def __init__(self, n_sample, seed=None):
        self.n_sample = n_sample
        self._rng = random if seed is None else random.Random(seed)

    def get_linear_dist_label(self,x, w):
        '''
        Threshold label: 1 when x exceeds the arm's cut-off, otherwise 0.

        The cut-off is 0.8 for w == 0 (about 20% of uniform draws give 1) and
        0.5 for any other w (about 50% give 1).
        '''
        cutoff = 0.8 if w == 0 else 0.5
        return 1 if x > cutoff else 0

    def get_complex_dist_label(self,x):
        '''
        Non-monotone label built from the cubic |x**3 - x|: 0 when it exceeds 0.36, otherwise 1.

        On (0, 1) the cubic peaks near x = 0.577 and is above 0.36 roughly for
        0.45 < x < 0.69, so about 24% of uniform draws are labelled 0.
        '''
        generated_num = np.abs(x*x*x - x)
        return 0 if generated_num > 0.36 else 1

    def _draw_uniform(self):
        # One plain Uniform(0, 1) covariate draw.
        return self._rng.uniform(0,1)

    def _draw_with_equal_linear_labels(self):
        # Rejection-sample X until both linear labels agree (x <= 0.5 or x > 0.8).
        while True:
            ran_num = self._rng.uniform(0,1)
            if self.get_linear_dist_label(ran_num,0) == self.get_linear_dist_label(ran_num,1):
                return ran_num

    def _build_frame(self, ratio, draw_x, label_y1):
        # Shared row generator: control rows first, treated rows last; exactly
        # one covariate draw per row, in row order.
        total_size = self.n_sample
        # The small offset absorbs binary floating-point error before truncating
        # (100 * 0.29 evaluates to 28.999999999999996).
        n_w1 = int(total_size * ratio + 1e-9)
        n_w0 = total_size - n_w1
        W, Y0, Y1, X, Y = [], [], [], [], []
        for i in range(total_size):
            ran_num = draw_x()
            y0 = self.get_linear_dist_label(ran_num,0)
            y1 = label_y1(ran_num)
            w = 0 if i < n_w0 else 1
            W.append(w)
            Y0.append(y0)
            Y1.append(y1)
            X.append(ran_num)
            Y.append(y1 if w == 1 else y0)
        return pd.DataFrame({'W': W, 'Y0': Y0, 'Y1': Y1, 'X': X, 'Y': Y})

    def get_data_with_diff_distribution(self,ratio = 0.2):
        '''
        Control labels come from the linear rule (w=0); treated labels come from
        the complex rule. ratio is the treated share of n_sample.
        '''
        return self._build_frame(ratio, self._draw_uniform, self.get_complex_dist_label)

    def get_data_with_same_distribution(self,ratio = 0.2):
        '''
        Both arms use the linear rule; only the cut-off differs (0.8 for control,
        0.5 for treated). ratio is the treated share of n_sample.
        '''
        return self._build_frame(ratio, self._draw_uniform,
                                 lambda x: self.get_linear_dist_label(x,1))

    def get_data_with_zero_treatment_effect(self,ratio = 0.2):
        '''
        Same linear rules as get_data_with_same_distribution, but X is restricted
        to values where Y0 == Y1, so every unit has zero individual treatment
        effect. ratio is the treated share of n_sample.
        '''
        return self._build_frame(ratio, self._draw_with_equal_linear_labels,
                                 lambda x: self.get_linear_dist_label(x,1))
            
        
        

In [5]:
# Generate 10,000 synthetic rows restricted to units whose two potential
# outcomes agree. Note: this rebinds `df`, which earlier held the data read
# from the CSV file.
resample_sync = resample_from_synthetic_data(10000)
df = resample_sync.get_data_with_zero_treatment_effect()

In [6]:
# Sanity check: print every row whose potential outcomes disagree. A frame with
# zero treatment effect prints nothing.
potential_outcomes_differ = df['Y0'] != df['Y1']
for _, offending_row in df[potential_outcomes_differ].iterrows():
    print(offending_row)

In [7]:
# Display the generated synthetic frame (columns W, Y0, Y1, X, Y).
df

Unnamed: 0,W,Y0,Y1,X,Y
0,0,0,0,0.177085,0
1,0,0,0,0.020114,0
2,0,0,0,0.239876,0
3,0,0,0,0.294959,0
4,0,0,0,0.010985,0
...,...,...,...,...,...
9995,1,0,0,0.118749,0
9996,1,0,0,0.320573,0
9997,1,0,0,0.046747,0
9998,1,1,1,0.846893,1
