## Import packages and initial DataFrame

In [1]:
import pandas as pds
import numpy as np
import textdistance
import statsmodels.api as sm
import scipy
import math
import tqdm
import os
import seaborn as sns
from random import choices
from matplotlib import pyplot as plt
import scipy.interpolate as interpolate
import defs
from multiprocessing.pool import Pool
import simu

In [2]:
simu.do_simulation

<function simu.do_simulation(DF, nbr_in_common, sizeA, sizeB, identifiers, aaa, methods, a_sigma, b_sigma, a_sigma2, b_sigma2, a, b, alpha_pi, beta_pi, nbr_iter, approx_integral, covariates, montecarlo_size_for_coverage)>

In [2]:
name_DF = 'DF_N=4401_2023-01-16.csv'
DF = pds.read_csv(os.path.join('..', 'datasets', name_DF), delimiter = ',')
DF = DF[~DF.duplicated()] # delete duplicates
DF = DF.dropna() # delete NaN values
DF['was_assigned_female'] = DF['was_assigned_female'].astype('int32') # turn was_born_female into int type (once Nan values have been removed)
DF = DF.drop(['name', 'family_name'], axis=1)

# generate covariates
DF['X1'] = 2020 - DF['birth_year'] # age
DF['X2'] = np.random.normal(loc = 2.5, scale = 1, size = DF.shape[0])
DF['X3'] = np.random.normal(loc = 0, scale = 1, size = DF.shape[0])
DF['X4'] = np.random.normal(loc = 1, scale = 1, size = DF.shape[0])
DF['X5'] = np.random.normal(loc = 1, scale = 1, size = DF.shape[0])

# generate treatment
DF['treatment'] = np.random.binomial(n = 1, p = 1 / ( 1 + np.exp(0.1*DF.X1 -0.2*DF.X2 +0.3*DF.X3 -0.4*DF.X4 +0.5*DF.X5) )) # probability depending on covariates

# generate outcome
residual_errors = np.random.normal(size = DF.shape[0])
aaa = 5.5
bbb = 0.01
ccc = 0.08
ddd = 0.7

ate_on_DF = aaa * 2.5
DF['Y'] = - 10 + aaa*DF['treatment']*DF['X2'] + bbb*np.exp(DF['X4']) + ccc*DF['X3']*DF['X1'] + ddd*DF['X5'] 

# generate new categorical variables for linkage
DF['id1'] = np.random.choice(np.arange(10), size = DF.shape[0]).astype(int)
DF['id2'] = np.random.choice(np.arange(15), size = DF.shape[0]).astype(int)
DF['id3'] = np.random.choice(np.arange(5), size = DF.shape[0]).astype(int)

identifiers = {'was_assigned_female':'strict','country':'strict','birth_year':'strict','id1':'strict','id2':'strict','id3':'strict'} # 'family_name':'jaro-winkler',
covariates = ['X1','X2','X3','X4','X5']

nbr_in_common = 100

nbr_iter = 800

approx_integral = 50

methods = {'jaro-winkler':defs.jaro_winkler_similarity, 'levenshtein':defs.levenshtein_similarity, 'strict':defs.strict_equality, 'large':defs.large_equality}

a_sigma, b_sigma, a_sigma2, b_sigma2, a, b, alpha_pi, beta_pi = 1, 1, 1, 1, 1, 1, 1, 1

dict_params_legend = {
    "unmatch":identifiers.keys(), 
    "match":identifiers.keys(), 
    "sigma_square":[f"$\sigma^2$"], 
    "betas":[ r"$\beta_0$", r"$\beta_1$", r"$\beta_2$", r"$\beta_3$", r"$\beta_4$", r"$\beta_5$", r"$\beta_6$", r"$\beta_7$" ], 
    "sigma2_square":[r"$\sigma_2^2$"], 
    "mu2":[r"$\mu_2$"], 
    "atel":["atel"]}

dict_params_title = {
    "unmatch":"unmatch", 
    "match":"match", 
    "sigma_square":r"$\sigma^2$", 
    "betas":r"$\beta$",
    "sigma2_square":r"$\sigma_2^2$", 
    "mu2":r"$\mu_2$", 
    "atel":"atel"}

nbr_simulations = 2

montecarlo_size_for_coverage = 105

sizeA = 430
sizeB = 250

In [3]:
# plt.errorbar(np.arange(nbr_simulations), dict_coverage["mean"], yerr=np.array(dict_coverage["bounds_tuple"]).T, fmt='o')
# plt.plot(np.arange(nbr_simulations), dict_coverage["ate_common_records"], 'o')
# plt.xlabel("Simulations")
# plt.title("95% credible interval for the ATE")

In [4]:
# plt.errorbar(np.arange(nbr_simulations), dict_coverage["ate_mean"], yerr=np.array(dict_coverage["ate_bounds_tuple"]).T, fmt='o')
# plt.axhline(0)
# plt.xlabel("Simulations")
# plt.title("95% credible interval for the ATE centered around the ate_common_records")

In [5]:
# to do launch multiprocessing pour avoir plusieurs simulations

# I will still send you 

# Feel free not to take too much time answering my email, (except if you see some big mistakes I am making maybe), I need to work in autonomy but I also would like to tell you about my progress 

# Here is the link to follow my work on the slides: ...

# I can already show you some graphical results ...

# About the method, I am still not sure about the way I implemented everything, especially: ...

# Time of execution, multiprocessing, ..., size of the datasets, propensity score removed.

In [8]:
params = {'DF':DF, 
          'nbr_in_common':100, 
          'sizeA':430, 
          'sizeB':250, 
          'identifiers':identifiers, 
          'aaa':aaa, 
          'methods':methods, 
          'a_sigma':1, 
          'b_sigma':1, 
          'a_sigma2':1, 
          'b_sigma2':1, 
          'a':1, 
          'b':1, 
          'alpha_pi':1, 
          'beta_pi':1, 
          'nbr_iter':nbr_iter, 
          'approx_integral':approx_integral, 
          'covariates':covariates,
          'montecarlo_size_for_coverage':montecarlo_size_for_coverage}

if __name__ == "__main__":
    n_mcmc = 8
    mcmc_params = []
    for i in range(n_mcmc):
        tmp = params.copy()
        mcmc_params.append(tmp)
    print(mcmc_params)

    with Pool(processes=n_mcmc) as pool:
        results = pool.map(simu.do_simulation, mcmc_params)

[{'DF':      country  birth_year  was_assigned_female  X1        X2        X3  \
0         AL        1987                    1  33  1.239957 -0.001596   
1         AL        2013                    0   7  1.422702  1.129080   
2         AL        2004                    1  16  2.143477 -1.142105   
3         AL        1962                    0  58  3.238529 -1.124719   
4         AL        1962                    1  58  0.796064  0.872650   
...      ...         ...                  ...  ..       ...       ...   
4396      GB        1982                    0  38  1.105738  0.281285   
4397      GB        1982                    0  38  2.614166  0.522716   
4398      GB        1982                    0  38  0.341827  0.610043   
4399      GB        1949                    1  71  1.085068 -1.390973   
4400      GB        1982                    1  38  1.420454 -0.007566   

            X4        X5  treatment          Y  id1  id2  id3  
0    -0.515647 -2.342024          0 -11.637659    8

TypeError: do_simulation() missing 18 required positional arguments: 'nbr_in_common', 'sizeA', 'sizeB', 'identifiers', 'aaa', 'methods', 'a_sigma', 'b_sigma', 'a_sigma2', 'b_sigma2', 'a', 'b', 'alpha_pi', 'beta_pi', 'nbr_iter', 'approx_integral', 'covariates', and 'montecarlo_size_for_coverage'