In [1]:
import pandas as pds
import numpy as np
import textdistance
import timeit
import math
import statsmodels.api as sm
import scipy
from patsy import dmatrix
import time
import math
import tqdm

name_DF = 'DF_N=4401_2023-01-16.csv'
DF = pds.read_csv(os.path.join('..', 'datasets', name_DF), delimiter = ',')
DF = DF[~DF.duplicated()] # delete duplicates
DF = DF.dropna() # delete NaN values
DF['was_assigned_female'] = DF['was_assigned_female'].astype('int32') # turn was_born_female into int type (once Nan values have been removed)

identifiers = {'family_name':'jaro-winkler','was_assigned_female':'strict','country':'strict','birth_year':'large'}
covariates = ['X1','X2','X3','X4','X5']

########## GENERATES ASSOCIATION ##########           

# generate covariates
DF['X1'] = 2020 - DF['birth_year'] # age
DF['X2'] = np.random.normal(loc = 2.5, scale = 1, size = DF.shape[0])
DF['X3'] = np.random.normal(loc = 0, scale = 1, size = DF.shape[0])
DF['X4'] = np.random.normal(loc = 1, scale = 1, size = DF.shape[0])
DF['X5'] = np.random.normal(loc = 1, scale = 1, size = DF.shape[0])

# generate treatment
DF['treatment'] = np.random.binomial(n = 1, p = 1 / ( 1 + np.exp(0.1*DF.X1 -0.2*DF.X2 +0.3*DF.X3 -0.4*DF.X4 +0.5*DF.X5) )) # probability depending on covariates

# generate outcome
residual_errors = np.random.normal(size = DF.shape[0])
a = 5.5
b = 0.01
c = 0.08
d = 0.7

ate = a * 2.5
DF['Y'] = - 10 + a*DF['treatment']*DF['X2'] + b*np.exp(DF['X4']) + c*DF['X3']*DF['X1'] + d*DF['X5'] 

common_records = DF.sample(n = 800)

B = pds.concat([DF.sample(n = 1400), common_records]).drop(['Y'], axis = 1)
B = B.reset_index(drop=True)

A = pds.concat([DF.sample(n = 2000), common_records])[list(identifiers.keys())+['Y']]
A = A.reset_index(drop=True)

def levenshtein_similarity(a,b):

    """ Check that levenshtein similarity (in [0,1]) is above 0.95.
        
        a: string,
        b: string """

    if 1 - textdistance.levenshtein(a, b)/max(len(a),len(b)) >= 0.95:
        return 1
    else:
        return 0

def jaro_winkler_similarity(a,b):

    """ Check that jaro-winkler similarity (in [0,1]) is above 0.95.
        
        a: string,
        b: string """

    if textdistance.jaro_winkler(a,b) >= 0.99:
        return 1
    else:
        return 0

def strict_equality(a,b):

    """ Check that a and b values are equal.
        
        a: any value,
        b: any value """

    return a==b

def large_equality(a,b):

    """ Check that years a and b expressed with four numbers are within the same decade.
        
        a: year,
        b: year """

    return str(a)[:-1]==str(b)[:-1]

def logit(p):
    return np.log(p/(1-p))

def minmaxscaler(X):
    return (X - X.min(axis=0)) / (X.max(axis=0) - X.min(axis=0))
    
def propensity_score(DF, covariates, scaler, convert_to_logit):
    
    """ Compute propensity score estimates: the probability (logistic regression) that an observation is treated or not conditioned on some covariates.
        These estimates are built conditionaly on covariates passed using a logit after transformation by scaler (when one is specified).
        Estimated probabilities can be converted into logit (convert_to_logit parameter).

        DF:                dataframe,
        covariates:        list of strings for covariates variable in DF,
        scaler:            sklearn.preprocessing function scaler for exemple,
        convert_to_logit:  boolean for converting probabilities to logit when building the propensity score estimates based on a logistic regression
    """
    exog = covariates.copy()
    if scaler != None:
        DF[exog] = scaler(DF[exog])
    if 'intercept' not in DF.columns:
        DF['intercept'] = 1
    exog.append('intercept')
    model = sm.Logit(DF.treatment, DF[exog]).fit(disp=0)
    predictions = model.predict(DF[exog])
    if convert_to_logit:
        return logit(predictions)
    else: 
        return predictions

In [2]:
AB_for_z0 = B[~B.duplicated(keep=False)][identifiers.keys()].merge(A[~A.duplicated(keep=False)][identifiers.keys()], how='cross')
AB_for_z0["source_index_B"] = np.repeat(B[~B.duplicated(keep=False)].index, A[~A.duplicated(keep=False)].shape[0])
AB_for_z0["source_index_A"] = np.tile(A[~A.duplicated(keep=False)].index, B[~B.duplicated(keep=False)].shape[0])

In [3]:
#AB = B[identifiers.keys()].merge(A[identifiers.keys()], how='cross')
AB = B.merge(A, how='cross')
AB["source_index_B"] = np.repeat(B.index, A.shape[0])
AB["source_index_A"] = np.tile(A.index, B.shape[0])

In [4]:
methods = {'jaro-winkler':jaro_winkler_similarity, 'levenshtein':levenshtein_similarity, 'strict':strict_equality, 'large':large_equality}

for linking_var in identifiers.keys():
    method = methods[identifiers[linking_var]]
    df = AB.filter(regex=linking_var)
    AB[linking_var+"_comparison"] = np.array([method(a, b) for a,b in zip(df.iloc[:,0], df.iloc[:,1])]).astype(int).reshape(-1,1)
    df_z0 = AB_for_z0.filter(regex=linking_var)
    AB_for_z0[linking_var+"_comparison"] = np.array([method(a, b) for a,b in zip(df_z0.iloc[:,0], df_z0.iloc[:,1])]).astype(int).reshape(-1,1)
comparisons = AB.filter(regex="comparison")
comparisons_z0 = AB_for_z0.filter(regex="comparison")

unmatch = comparisons.sum(axis=0) / len(comparisons) # probability of having same linking var (at all)
match = np.repeat(0.95, len(identifiers.keys())) # probability of having same linking var when being matches

In [5]:
AB["linking_score"] = (np.multiply(comparisons, np.log2(match/unmatch)) + np.multiply(1-comparisons, np.log2((1-match)/(1-unmatch)))).sum(axis=1)
#AB['treatment'] = np.array(np.repeat(B.treatment, A.shape[0]))
#AB[covariates] = np.repeat(np.array(B[covariates]), A.shape[0], axis=0)
AB['propensity_score'] = propensity_score(AB, covariates, None, False)
#AB['Y'] = np.tile(A.Y, B.shape[0])

score = AB.linking_score.max()

data = AB_for_z0[comparisons_z0.all(axis=1)]

# we consider 1-2-1 matches in z
data_link = data[ (~data.source_index_A.duplicated(keep=False)) & (~data.source_index_B.duplicated(keep=False)) ]

from_A = A.iloc[data_link.source_index_A,:].reset_index(drop=True)
from_B = B.iloc[data_link.source_index_B,:].reset_index(drop=True)
linked_records = pds.concat([from_B, from_A.Y], axis=1)
linked_records['propensity_score'] = propensity_score(linked_records, covariates, None, False)

z0 = -np.ones(B.shape[0])
z0[data_link.source_index_B] = data_link.source_index_A

true_linkage_z = -np.ones(B.shape[0])
true_linkage_z[B.iloc[-800:,:].index] = A.iloc[-800:,:].index
(z0==true_linkage_z).sum()/len(z0)

0.5109090909090909

In [6]:
# def compute_posterior(file_A, file_B, file_mix_BA, linked_records, z, theta_m, theta_u, alpha_pi, beta_pi, sigma_square, a_sigma, b_sigma, beta0, beta1, alpha, mu2, sigma2_square, a_sigma2, b_sigma2):
#     result = 0
#     # likelihood 1
#     # need linked_records computed based on z
#     Y = np.array(linked_records.Y)
#     X = np.array([linked_records.propensity_score*linked_records.treatment, linked_records.propensity_score])
#     X = sm.add_constant(X)
#     X = X.T
#     model = sm.GLM(Y,X)
#     results = model.fit()
#     residuals = Y - X @ results.params
#     estimated_variance = residuals.T @ residuals / (len(residuals) - (X.shape[1]+1))
#     result += np.log(scipy.stats.norm.pdf(residuals, 0, np.sqrt(estimated_variance))).sum()
#     # likelihood 2
#     # need z
#     Y = scipy.stats.norm.rvs(mu2, np.sqrt(sigma2_square), size=len(z[z<0]))
#     result += np.log(scipy.stats.norm.pdf(Y, mu2, np.sqrt(sigma2_square))).sum()
#     # likelihood 3 and 4
#     # need AB, z, theta_m, theta_u ATTENTION AB SHOULD BE KNOWN (GLOBAL)
#     idx_A = z[z>=0]
#     idx_B = np.nonzero(z>=0)[0]
#     links = pds.MultiIndex.from_tuples(zip(idx_A,idx_B))
#     pairs = pds.MultiIndex.from_frame(file_mix_BA[["source_index_A", "source_index_B"]])
#     # 3
#     data = file_mix_BA[pairs.isin(links)] # 1-2-1 matches enforced by construction of z
#     pattern_match, count_match = np.unique(data.filter(regex="comparison"), return_counts=True, axis=0)
#     result += ((pattern_match @ np.log(theta_m) + (1-pattern_match) @ np.log(1-theta_m)) * count_match).sum()
#     # 4
#     data = file_mix_BA[(~file_mix_BA.source_index_B.duplicated())&(~pairs.isin(links))] # enforce 1-2-1 by removing duplicata
#     pattern_unmatch, count_unmatch = np.unique(data.filter(regex="comparison"), return_counts=True, axis=0)
#     result += ((pattern_unmatch @ np.log(theta_u) + (1-pattern_unmatch) @ np.log(1-theta_u)) * count_unmatch).sum()
#     # prior 1
#     # need z, alpha_pi, beta_pi
#     n_AB = (z>=0).sum()
#     result += math.log(math.factorial(file_A.shape[0]-n_AB)) - math.log(math.factorial(file_A.shape[0])) + scipy.special.betaln(n_AB + alpha_pi, file_B.shape[0] - n_AB + beta_pi) - scipy.special.betaln(alpha_pi, beta_pi)
#     # prior 2
#     # need theta_m QUESTION a-1 / b-1?
#     result += (1 * np.log(theta_m) + 1 * np.log(1-theta_m)).sum()
#     # prior 3
#     # need theta_u QUESTION a-1 / b-1?
#     result += (1 * np.log(theta_u) + 1 * np.log(1-theta_u)).sum()
#     # prior 4
#     # need sigma_square, a_sigma, b_sigma QUESTION unclear if it is the pdf?
#     result += np.log(scipy.stats.invgauss.pdf(sigma_square, a_sigma, b_sigma))
#     # prior 5
#     # need beta0, beta1, alpha QUESTION unclear if it is the pdf?
#     result += np.log(scipy.stats.multivariate_normal.pdf([beta0, beta1, alpha], [0,0,0], np.eye(3)))
#     # prior 6
#     # need mu2 QUESTION unclear if it is the pdf?
#     result += np.log(scipy.stats.norm.pdf(mu2, 0, 1))
#     # prior 7
#     # need sigma2_square, a_sigma2, b_sigma2 QUESTION unclear if it is the pdf?
#     result += np.log(scipy.stats.invgauss.pdf(sigma2_square, a_sigma2, b_sigma2))
#     return result

# def compute_proposal(file_A, file_B, file_mix_BA, z, sigma_square, a_sigma, b_sigma, beta0, beta1, alpha, mu2, sigma2_square, a_sigma2, b_sigma2):
#     result = [] # z, linked_records, theta_m, theta_u, beta0, beta1, alpha, sigma_square, mu2, sigma2_square
#     # z
#     linked_record_prop = (z<file_B.shape[0]).sum() / len(z) # proportion of linked records in previous z
#     new = np.random.choice(file_A.shape[0], size=file_B.shape[0], replace=False)
#     new = new * np.random.choice([1,-1], size=file_B.shape[0], p=[linked_record_prop, 1-linked_record_prop]) # randomly set negative some values so that the proportion of positive values corresponds to linked_record_prop
#     result.append(new)
#     # linked records
#     idx_A = z[z>=0]
#     idx_B = np.nonzero(z>=0)[0]
#     from_A = file_A.iloc[idx_A,:].reset_index(drop=True)
#     from_B = file_B.iloc[idx_B,:].reset_index(drop=True)
#     linked_records = pds.concat([from_B, from_A.Y], axis=1)
#     linked_records['propensity_score'] = propensity_score(linked_records, covariates, None, False)
#     result.append(linked_records)
#     # theta_u QUESTION we do not care of dependence on previous param
#     comparisons = file_mix_BA.filter(regex="comparison")
#     unmatch = comparisons.sum(axis=0) / len(comparisons)
#     result.append(unmatch)
#     # theta_m QUESTION we do not care of dependence on previous param
#     match = np.repeat(0.95, len(unmatch))
#     result.append(match)
#     # beta0, beta1, alpha, sigma_square
#     result.append(beta0+scipy.stats.norm.rvs(0,0.5))
#     result.append(beta1+scipy.stats.norm.rvs(0,0.5))
#     result.append(alpha+scipy.stats.norm.rvs(0,0.5))
#     result.append(sigma_square+scipy.stats.invgauss.rvs(a_sigma,b_sigma))
#     # mu2, sigma2_square
#     result.append(mu2+scipy.stats.norm.rvs(0,1))
#     result.append(sigma2_square+scipy.stats.invgauss.rvs(a_sigma2,b_sigma2))
#     return result

In [7]:
# z_k = [z.copy()]

# linked_records_k = [linked_records.copy()]

# theta_m_k = [match.copy()]

# theta_u_k = [unmatch.copy()]

# alpha_pi_k = [1]
# beta_pi_k = [1]

# a_sigma_k = [1]
# b_sigma_k = [1]

# a_sigma2_k = [1]
# b_sigma2_k = [1]

# beta0_k = [scipy.stats.norm.rvs(0,1)]
# beta1_k = [scipy.stats.norm.rvs(0,1)]
# alpha_k = [scipy.stats.norm.rvs(0,1)]
# sigma_square_k = [scipy.stats.invgauss.rvs(a_sigma_k[-1],b_sigma_k[-1])]

# mu2_k = [scipy.stats.norm.rvs(0,1)]
# sigma2_square_k = [scipy.stats.invgauss.rvs(a_sigma2_k[-1],b_sigma2_k[-1])]

# posteriors = []
# posterior = compute_posterior(A, B, AB, linked_records_k[-1], z_k[-1], theta_m_k[-1], theta_u_k[-1], alpha_pi_k[-1], beta_pi_k[-1], sigma_square_k[-1], a_sigma_k[-1], b_sigma_k[-1], beta0_k[-1], beta1_k[-1], alpha_k[-1], mu2_k[-1], sigma2_square_k[-1], a_sigma2_k[-1], b_sigma2_k[-1])
# posteriors.append(posterior)

# for _ in tqdm.tqdm(range(500)):
#     if  not(_%50):
#         print((z_k[-1]==true_linkage_z).sum()/len(true_linkage_z))

#     proposal = compute_proposal(A, B, AB, z_k[-1], sigma_square_k[-1], a_sigma_k[-1], b_sigma_k[-1], beta0_k[-1], beta1_k[-1], alpha_k[-1], mu2_k[-1], sigma2_square_k[-1], a_sigma2_k[-1], b_sigma2_k[-1])
#     new_z, new_linked_records, new_theta_m, new_theta_u, new_beta0, new_beta1, new_alpha, new_sigma_square, new_mu2, new_sigma2_square = proposal
#     # if update z and linked_records
#     new_posterior = compute_posterior(A, B, AB, new_linked_records, new_z, theta_m_k[-1], theta_u_k[-1], alpha_pi_k[-1], beta_pi_k[-1], sigma_square_k[-1], a_sigma_k[-1], b_sigma_k[-1], beta0_k[-1], beta1_k[-1], alpha_k[-1], mu2_k[-1], sigma2_square_k[-1], a_sigma2_k[-1], b_sigma2_k[-1])

#     posteriors.append(new_posterior)
#     U = scipy.stats.uniform.rvs(0,1)
#     if U < posteriors[-1] / posteriors[-2]:
#         z_k.append(new_z)
#         linked_records_k.append(new_linked_records)
#         theta_m_k.append(new_theta_m)
#         theta_u_k.append(new_theta_u)
#         beta0_k.append(new_beta0)
#         beta1_k.append(new_beta1)
#         alpha_k.append(new_alpha)
#         sigma_square_k.append(new_sigma_square)
#         mu2_k.append(new_mu2)
#         sigma2_square_k.append(new_sigma2_square)

In [8]:
# z_k = [z.copy()]

# linked_records_k = [linked_records.copy()]

# theta_m_k = [match.copy()]

# theta_u_k = [unmatch.copy()]

# alpha_pi_k = [1]
# beta_pi_k = [1]

# a_sigma_k = [1]
# b_sigma_k = [1]

# a_sigma2_k = [1]
# b_sigma2_k = [1]

# beta0_k = [scipy.stats.norm.rvs(0,1)]
# beta1_k = [scipy.stats.norm.rvs(0,1)]
# alpha_k = [scipy.stats.norm.rvs(0,1)]
# sigma_square_k = [scipy.stats.invgauss.rvs(a_sigma_k[-1],b_sigma_k[-1])]

# mu2_k = [scipy.stats.norm.rvs(0,1)]
# sigma2_square_k = [scipy.stats.invgauss.rvs(a_sigma2_k[-1],b_sigma2_k[-1])]

# posteriors = []
# posterior = compute_posterior(A, B, AB, linked_records_k[-1], z_k[-1], theta_m_k[-1], theta_u_k[-1], alpha_pi_k[-1], beta_pi_k[-1], sigma_square_k[-1], a_sigma_k[-1], b_sigma_k[-1], beta0_k[-1], beta1_k[-1], alpha_k[-1], mu2_k[-1], sigma2_square_k[-1], a_sigma2_k[-1], b_sigma2_k[-1])
# posteriors.append(posterior)

# for _ in tqdm.tqdm(range(500)):
#     if  not(_%50):
#         print((z_k[-1]==true_linkage_z).sum()/len(true_linkage_z))

#     proposal = compute_proposal(A, B, AB, z_k[-1], sigma_square_k[-1], a_sigma_k[-1], b_sigma_k[-1], beta0_k[-1], beta1_k[-1], alpha_k[-1], mu2_k[-1], sigma2_square_k[-1], a_sigma2_k[-1], b_sigma2_k[-1])
#     new_z, new_linked_records, new_theta_m, new_theta_u, new_beta0, new_beta1, new_alpha, new_sigma_square, new_mu2, new_sigma2_square = proposal
#     # if update z and linked_records
#     new_posterior = compute_posterior(A, B, AB, new_linked_records, new_z, theta_m_k[-1], theta_u_k[-1], alpha_pi_k[-1], beta_pi_k[-1], sigma_square_k[-1], a_sigma_k[-1], b_sigma_k[-1], beta0_k[-1], beta1_k[-1], alpha_k[-1], mu2_k[-1], sigma2_square_k[-1], a_sigma2_k[-1], b_sigma2_k[-1])
#     U = scipy.stats.uniform.rvs(0,1)
#     if U < new_posterior / posteriors[-1]:
#         posteriors.append(new_posterior)
#         z_k.append(new_z)
#         linked_records_k.append(new_linked_records)

#     proposal = compute_proposal(A, B, AB, z_k[-1], sigma_square_k[-1], a_sigma_k[-1], b_sigma_k[-1], beta0_k[-1], beta1_k[-1], alpha_k[-1], mu2_k[-1], sigma2_square_k[-1], a_sigma2_k[-1], b_sigma2_k[-1])
#     new_z, new_linked_records, new_theta_m, new_theta_u, new_beta0, new_beta1, new_alpha, new_sigma_square, new_mu2, new_sigma2_square = proposal
#     # if update theta_m, theta_u
#     new_posterior = compute_posterior(A, B, AB, linked_records_k[-1], z_k[-1], new_theta_m, new_theta_u, alpha_pi_k[-1], beta_pi_k[-1], sigma_square_k[-1], a_sigma_k[-1], b_sigma_k[-1], beta0_k[-1], beta1_k[-1], alpha_k[-1], mu2_k[-1], sigma2_square_k[-1], a_sigma2_k[-1], b_sigma2_k[-1])
#     U = scipy.stats.uniform.rvs(0,1)
#     if U < new_posterior / posteriors[-1]:
#         posteriors.append(new_posterior)
#         theta_m_k.append(new_theta_m)
#         theta_u_k.append(new_theta_u)

#     proposal = compute_proposal(A, B, AB, z_k[-1], sigma_square_k[-1], a_sigma_k[-1], b_sigma_k[-1], beta0_k[-1], beta1_k[-1], alpha_k[-1], mu2_k[-1], sigma2_square_k[-1], a_sigma2_k[-1], b_sigma2_k[-1])
#     new_z, new_linked_records, new_theta_m, new_theta_u, new_beta0, new_beta1, new_alpha, new_sigma_square, new_mu2, new_sigma2_square = proposal
#     # if update beta0, beta1, alpha, sigma_square, mu2, sigma2_square
#     new_posterior = compute_posterior(A, B, AB, linked_records_k[-1], z_k[-1], theta_m_k[-1], theta_u_k[-1], alpha_pi_k[-1], beta_pi_k[-1], new_sigma_square, a_sigma_k[-1], b_sigma_k[-1], new_beta0, new_beta1, new_alpha, new_mu2, new_sigma2_square, a_sigma2_k[-1], b_sigma2_k[-1])
#     U = scipy.stats.uniform.rvs(0,1)
#     if U < new_posterior / posteriors[-1]:
#         posteriors.append(new_posterior)
#         beta0_k.append(new_beta0)
#         beta1_k.append(new_beta1)
#         alpha_k.append(new_alpha)
#         sigma_square_k.append(new_sigma_square)
#         mu2_k.append(new_mu2)
#         sigma2_square_k.append(new_sigma2_square)

In [6]:
z_k = [z0.copy()]

linked_records_k = [linked_records.copy()]

theta_m_k = [match.copy()]

theta_u_k = [unmatch.copy()]

alpha_pi_k = [1]
beta_pi_k = [1]

a_sigma_k = [1]
b_sigma_k = [1]

a_sigma2_k = [1]
b_sigma2_k = [1]

beta0_k = [scipy.stats.norm.rvs(0,1)]
beta1_k = [scipy.stats.norm.rvs(0,1)]
alpha_k = [scipy.stats.norm.rvs(0,1)]
sigma_square_k = [scipy.stats.invgauss.rvs(a_sigma_k[-1],b_sigma_k[-1])]

mu2_k = [scipy.stats.norm.rvs(0,1)]
sigma2_square_k = [scipy.stats.invgauss.rvs(a_sigma2_k[-1],b_sigma2_k[-1])]

In [10]:
print(z_k)

print(theta_m_k)

print(theta_u_k)

print(alpha_pi_k)
print(beta_pi_k)

print(a_sigma_k)
print(b_sigma_k)

print(a_sigma2_k)
print(b_sigma2_k)

print(beta0_k)
print(beta1_k)
print(alpha_k)
print(sigma_square_k)

print(mu2_k)
print(sigma2_square_k)

[array([ 3.850e+02, -1.000e+00,  5.290e+02, ..., -1.000e+00,  2.798e+03,
       -1.000e+00])]
[array([0.95, 0.95, 0.95, 0.95])]
[family_name_comparison            0.000399
was_assigned_female_comparison    0.500373
country_comparison                0.110637
birth_year_comparison             0.144264
dtype: float64]
[1]
[1]
[1]
[1]
[1]
[1]
[-0.4688399322523959]
[-0.8071345978747391]
[-0.8579093407974692]
[1.6543537724423403]
[-1.8476769374347874]
[6.125059031251658]


In [11]:
# full conditional

# THETA M U
idx_A = z_k[-1][z_k[-1]>=0]
idx_B = np.nonzero(z_k[-1]>=0)[0]
links = pds.MultiIndex.from_tuples(zip(idx_A,idx_B))
pairs = pds.MultiIndex.from_frame(AB[["source_index_A", "source_index_B"]])

data_match = AB[pairs.isin(links)]
comparisons_match = data_match.filter(regex="comparison")
theta_m_k.append(scipy.stats.beta.rvs(comparisons_match.sum(axis=0) + 1, (1-comparisons_match).sum(axis=0) + 1))

data_unmatch = AB[(~pairs.isin(links)) & (~AB.source_index_B.duplicated()) & (~AB.source_index_A.duplicated())] # a revoir pour les duplicata
comparisons_unmatch = data_unmatch.filter(regex="comparison")
theta_u_k.append(scipy.stats.beta.rvs(comparisons_unmatch.sum(axis=0) + 1, (1-comparisons_unmatch).sum(axis=0) + 1))

# OUTCOME REGR. PARAM
tilde_y = linked_records_k[-1].Y
tilde_K = np.array([linked_records_k[-1].intercept, linked_records_k[-1].propensity_score, linked_records_k[-1].treatment]).T

sigma_matrix = np.linalg.inv( tilde_K.T @ tilde_K / sigma_square_k[-1] + np.eye(tilde_K.shape[1]) )
tilde_K.T @ tilde_y
mu_vector = sigma_matrix @ tilde_K.T @ tilde_y / sigma_square_k[-1]
beta0, beta1, alpha = scipy.stats.multivariate_normal.rvs(mu_vector, sigma_matrix)
beta0_k.append(beta0)
beta1_k.append(beta1)
alpha_k.append(alpha)

n_AB = (z_k[-1]>=0).sum()
sigma_square_k.append(scipy.stats.invgauss.rvs(a_sigma_k[-1] + n_AB/2, np.linalg.norm(tilde_y - tilde_K @ np.array([beta0_k[-1], beta1_k[-1], alpha_k[-1]]))**2 / 2 + b_sigma_k[-1]))
sigma2_square_k.append(scipy.stats.invgauss.rvs(a_sigma2_k[-1] + (A.shape[0]-n_AB)/2, b_sigma2_k[-1] + ((A.iloc[~A.index.isin(idx_A),:].Y - mu2_k[-1])**2 / 2).sum()))
sigma_mu_2_square = 1 / ((A.shape[0] -  n_AB)/sigma2_square_k[-1] + 1)
m_mu_2 = sigma_mu_2_square * (A.iloc[~A.index.isin(idx_A),:].Y).sum() / sigma2_square_k[-1]
mu2_k.append(scipy.stats.norm.rvs(m_mu_2, np.sqrt(sigma_mu_2_square)))


In [12]:
print(theta_m_k)

print(theta_u_k)

print(alpha_pi_k)
print(beta_pi_k)

print(a_sigma_k)
print(b_sigma_k)

print(a_sigma2_k)
print(b_sigma2_k)

print(beta0_k)
print(beta1_k)
print(alpha_k)
print(sigma_square_k)

print(mu2_k)
print(sigma2_square_k)

[array([0.95, 0.95, 0.95, 0.95]), array([0.9984138 , 0.99949996, 0.99905481, 0.99839724])]
[family_name_comparison            0.000399
was_assigned_female_comparison    0.500373
country_comparison                0.110637
birth_year_comparison             0.144264
dtype: float64, array([0.2305028 , 0.34686214, 0.38434979, 0.3808328 ])]
[1]
[1]
[1]
[1]
[1]
[1]
[-0.4688399322523959, -9.107019742346091]
[-0.8071345978747391, -0.3371613845969367]
[-0.8579093407974692, 13.489644061240778]
[1.6543537724423403, 6646.444204373614]
[-1.8476769374347874, -1.3456413632085549]
[6.125059031251658, 71186.44068373674]


In [12]:
# # FULFIL A NEW Z VECTOR:
# # 
# j = 3
#set_data = ...[AB['source_index_B']==j]
start = time.time()
new_z_vector = []
already_taken = []

for j in range(B.shape[0]):

    w1 = ((np.multiply(comparisons[AB['source_index_B']==j], np.log(theta_m_k[-1]/theta_u_k[-1])) + np.multiply(1-comparisons[AB['source_index_B']==j], np.log((1-theta_m_k[-1])/(1-theta_u_k[-1])))).sum(axis=1))
    # AB propensity score need to be updated at each iteration!!!
    data = AB[AB['source_index_B']==j]
    residuals = data['Y'] - np.array([data['intercept'], data['propensity_score'], data['propensity_score'] * data['treatment']]).T @ np.array([beta0_k[-1], beta1_k[-1], alpha_k[-1]]).T
    estimated_variance = residuals.T @ residuals / (len(residuals) - (data.shape[1]+1))
    aaa = scipy.stats.norm.pdf(residuals, 0, np.sqrt(estimated_variance))
    bbb = scipy.stats.norm.pdf(data['Y'], mu2_k[-1], np.sqrt(sigma2_square_k[-1]))
    w2 = np.log(aaa/bbb)
    probabilities = np.array(np.exp(w1+w2))

    #
    n_AB_ = (np.delete(z_k[-1], j)>=0).sum()
    probabilities = np.append(probabilities, (A.shape[0] - n_AB_) * (B.shape[0] - n_AB_ - 1 + beta_pi_k[-1]) / (n_AB_ + alpha_pi_k[-1]))

    probabilities = probabilities / (probabilities).sum()

    choice_array = np.arange(A.shape[0])
    choice_array = np.append(choice_array, -1)

    val = np.random.choice(choice_array, p = probabilities)
    # while val in already_taken:
    #     val = np.random.choice(choice_array, p = probabilities)
    if val != -1:
        already_taken.append(val)
    
    new_z_vector.append(val)
end = time.time()
print(end-start)
new_z_vector

       name family_name_x country_x  birth_year_x  was_assigned_female_x  X1  \
0     Paolo         Ratti        IT          1948                      0  72   
1     Paolo         Ratti        IT          1948                      0  72   
2     Paolo         Ratti        IT          1948                      0  72   
3     Paolo         Ratti        IT          1948                      0  72   
4     Paolo         Ratti        IT          1948                      0  72   
...     ...           ...       ...           ...                    ...  ..   
2795  Paolo         Ratti        IT          1948                      0  72   
2796  Paolo         Ratti        IT          1948                      0  72   
2797  Paolo         Ratti        IT          1948                      0  72   
2798  Paolo         Ratti        IT          1948                      0  72   
2799  Paolo         Ratti        IT          1948                      0  72   

            X2        X3        X4     

KeyboardInterrupt: 

In [16]:
#j = 3
#set_data = ...[AB['source_index_B']==j]

new_z_vector = []
already_taken = []

w1 = ((np.multiply(comparisons, np.log(theta_m_k[-1]/theta_u_k[-1])) + np.multiply(1-comparisons, np.log((1-theta_m_k[-1])/(1-theta_u_k[-1])))).sum(axis=1))
# AB propensity score need to be updated at each iteration!!!
data = AB
residuals = data['Y'] - np.array([data['intercept'], data['propensity_score'], data['propensity_score'] * data['treatment']]).T @ np.array([beta0_k[-1], beta1_k[-1], alpha_k[-1]]).T
estimated_variance = residuals.T @ residuals / (len(residuals) - (data.shape[1]+1))
aaa = scipy.stats.norm.pdf(residuals, 0, np.sqrt(estimated_variance))
bbb = scipy.stats.norm.pdf(data['Y'], mu2_k[-1], np.sqrt(sigma2_square_k[-1]))
w2 = np.log(aaa/bbb)
probabilities = np.array(np.exp(w1+w2))
probabilities = probabilities.reshape(B.shape[0], A.shape[0])

#
n_AB_ = np.array([(np.delete(z_k[-1], j)>=0).sum() for j in range(B.shape[0])])
proba_no_link = ((A.shape[0] - n_AB_) * (B.shape[0] - n_AB_ - 1 + beta_pi_k[-1]) / (n_AB_ + alpha_pi_k[-1])).reshape(-1,1)
probabilities = np.concatenate([probabilities, proba_no_link], axis=1)
probabilities = np.divide(probabilities.T, probabilities.sum(axis=1)).T

choice_array = np.arange(A.shape[0])
choice_array = np.append(choice_array, -1)
choice_array = np.tile(choice_array, B.shape[0]).reshape(B.shape[0], A.shape[0]+1)

for j in range(B.shape[0]):
    val = np.random.choice(choice_array[j,:], p = probabilities[j,:])
    # while val in already_taken:
    #     val = np.random.choice(choice_array[j,:], p = probabilities[j,:])
    # if val != -1:
    #     already_taken.append(val)
    new_z_vector.append(val)

new_z_vector

z_k.append(new_z_vector)
print(z_k)

[array([ 1.582e+03, -1.000e+00, -1.000e+00, ..., -1.000e+00, -1.000e+00,
       -1.000e+00]), [2294, 613, 2294, 2294, 2294, 613, 613, 2294, 613, 613, 2294, 613, 613, 613, 613, 2294, 2294, 2294, 613, 2294, 2294, 613, 2294, 613, 2294, 613, 2294, 613, 613, 613, 2294, 613, 2294, 613, 2294, 2294, 2294, 2294, 2294, 2294, 2294, 2294, 613, 613, 613, 613, 613, 613, 613, 613, 2294, 613, 2294, 613, 613, 2294, 613, 613, 2294, 613, 2294, 613, 2294, 613, 613, 613, 613, 2294, 2294, 613, 2294, 2294, 2294, 2294, 2294, 613, 2294, 613, 2294, 613, 613, 613, 2294, 613, 2294, 613, 2294, 2294, 613, 2294, 2294, 613, 613, 613, 2294, 2294, 2294, 2294, 613, 2294, 613, 2294, 2294, 613, 613, 613, 2294, 2294, 613, 613, 613, 613, 613, 613, 613, 2294, 613, 2294, 613, 613, 613, 613, 613, 613, 2294, 613, 613, 613, 613, 2294, 613, 2294, 2294, 613, 2294, 2294, 2294, 2294, 2294, 613, 2294, 613, 2294, 613, 2294, 613, 2294, 613, 613, 613, 2294, 2294, 2294, 2294, 613, 613, 2294, 2294, 613, 2294, 613, 613, 613, 2294, 613, 613

In [17]:
len(z_k)

4

In [113]:
z_k[-1]

array([ 1.000e+01,  8.130e+02, -1.000e+00, ..., -1.000e+00, -1.000e+00,
        2.799e+03])

In [109]:
choice_array

array([[   0,    1,    2, ..., 2798, 2799,   -1],
       [   0,    1,    2, ..., 2798, 2799,   -1],
       [   0,    1,    2, ..., 2798, 2799,   -1],
       ...,
       [   0,    1,    2, ..., 2798, 2799,   -1],
       [   0,    1,    2, ..., 2798, 2799,   -1],
       [   0,    1,    2, ..., 2798, 2799,   -1]])

In [104]:

np.divide(np.array([[1,2],[3,4],[5,6]]).T, [0.5,1,2]).T

array([[2. , 4. ],
       [3. , 4. ],
       [2.5, 3. ]])

In [89]:
proba_no_link.shape

(2200,)

In [83]:
np.array(AB['source_index_B']).reshape(B.shape[0], A.shape[0])

array([[   0,    0,    0, ...,    0,    0,    0],
       [   1,    1,    1, ...,    1,    1,    1],
       [   2,    2,    2, ...,    2,    2,    2],
       ...,
       [2197, 2197, 2197, ..., 2197, 2197, 2197],
       [2198, 2198, 2198, ..., 2198, 2198, 2198],
       [2199, 2199, 2199, ..., 2199, 2199, 2199]])

In [84]:
np.array(AB['source_index_A']).reshape(B.shape[0], A.shape[0])

array([[   0,    1,    2, ..., 2797, 2798, 2799],
       [   0,    1,    2, ..., 2797, 2798, 2799],
       [   0,    1,    2, ..., 2797, 2798, 2799],
       ...,
       [   0,    1,    2, ..., 2797, 2798, 2799],
       [   0,    1,    2, ..., 2797, 2798, 2799],
       [   0,    1,    2, ..., 2797, 2798, 2799]])

In [53]:
n_AB_ = (np.delete(z_k[-1], j)>=0).sum()

array([ 1.000e+01,  8.130e+02, -1.000e+00, ..., -1.000e+00, -1.000e+00,
        2.799e+03])

In [55]:
np.delete(z_k[-1], j)

array([ 1.000e+01,  8.130e+02, -1.000e+00, ..., -1.000e+00, -1.000e+00,
        2.799e+03])

In [46]:
np.array(np.exp(w1+w2)).reshape(B.shape[0], A.shape[0])

(2200, 2800)

In [None]:
np.random.choice(np.arange(A.shape[0]+B.shape[0]), p)

In [51]:
np.array(AB.filter(regex="source_index_B")).reshape( B.shape[0], A.shape[0],)

array([[   0,    0,    0, ...,    0,    0,    0],
       [   1,    1,    1, ...,    1,    1,    1],
       [   2,    2,    2, ...,    2,    2,    2],
       ...,
       [2197, 2197, 2197, ..., 2197, 2197, 2197],
       [2198, 2198, 2198, ..., 2198, 2198, 2198],
       [2199, 2199, 2199, ..., 2199, 2199, 2199]])

In [50]:
np.array(AB.filter(regex="source_index_B")).reshape( A.shape[0], B.shape[0])

array([[   0,    0,    0, ...,    0,    0,    0],
       [   0,    0,    0, ...,    1,    1,    1],
       [   1,    1,    1, ...,    2,    2,    2],
       ...,
       [2197, 2197, 2197, ..., 2198, 2198, 2198],
       [2198, 2198, 2198, ..., 2199, 2199, 2199],
       [2199, 2199, 2199, ..., 2199, 2199, 2199]])

In [63]:
# q in index A
# j in index B
# pair (q,j)

w1 = (np.multiply(comparisons, np.log(theta_m_k[-1]/theta_u_k[-1])) + np.multiply(1-comparisons, np.log2((1-match)/(1-unmatch)))).sum(axis=1)


array([-4616.20702877, -4723.47099306, -4612.39435274, -4611.17184777,
       -4611.29261829, -4421.83130061, -4614.27861164, -4615.23289085,
       -4611.87841234, -4616.88101574, -4734.35485002, -4612.7684864 ,
       -4645.26918847, -4615.75557251, -4612.14151868, -4502.96125641,
       -4501.25730073, -4616.93414224, -4619.6552678 , -4611.91790945,
       -4612.67831245, -4470.72403295, -4613.36683558, -4611.40406068,
       -4612.35739281, -4652.98793294, -4615.82487373, -4684.65668473,
       -4616.17899297, -4630.27275893, -4617.73209382, -4713.78898317,
       -4758.03880152, -4616.32907795, -4635.93313419, -4783.27154662,
       -4612.02873863, -4614.72871071, -4503.02962037, -4611.16916207,
       -4611.16043904, -4704.40190382, -4611.21317651, -4612.32472759,
       -4670.81338856, -4641.44790283, -4688.77715008, -4620.02177454,
       -4659.31281746, -4517.24026954, -4612.78805354, -4624.2181627 ,
       -4731.69496636, -4712.16807475, -4615.48772   , -4611.6012203 ,
      

In [53]:
(A.iloc[~A.index.isin(idx_A),:].Y - mu2_k[-1])**2 

1        77.485676
2        93.317290
3        55.752384
4         3.494127
5       107.302317
           ...    
2794     65.225749
2796    226.917124
2797     32.833933
2798    174.066581
2799      8.667763
Name: Y, Length: 2015, dtype: float64

In [19]:
%timeit scipy.linalg.cho_solve(scipy.linalg.cho_factor(sigma_matrix_test,lower=True), np.eye(sigma_matrix_test.shape[0]))

18.5 ms ± 2.15 ms per loop (mean ± std. dev. of 7 runs, 100 loops each)


In [21]:
%timeit np.linalg.inv(sigma_matrix_test)

16.4 ms ± 394 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)
