In [14]:
import pandas as pds
import numpy as np
import textdistance
import timeit
import math
import statsmodels.api as sm
import scipy
from patsy import dmatrix
import time
import math
import tqdm

name_DF = 'DF_N=4401_2023-01-16.csv'
DF = pds.read_csv(os.path.join('..', 'datasets', name_DF), delimiter = ',')
DF = DF[~DF.duplicated()] # delete duplicates
DF = DF.dropna() # delete NaN values
DF['was_assigned_female'] = DF['was_assigned_female'].astype('int32') # turn was_born_female into int type (once Nan values have been removed)

identifiers = {'family_name':'jaro-winkler','was_assigned_female':'strict','country':'strict','birth_year':'large'}
covariates = ['X1','X2','X3','X4','X5']

########## GENERATES ASSOCIATION ##########

# generate covariates
DF['X1'] = 2020 - DF['birth_year'] # age
DF['X2'] = np.random.normal(loc = 2.5, scale = 1, size = DF.shape[0])
DF['X3'] = np.random.normal(loc = 0, scale = 1, size = DF.shape[0])
DF['X4'] = np.random.normal(loc = 1, scale = 1, size = DF.shape[0])
DF['X5'] = np.random.normal(loc = 1, scale = 1, size = DF.shape[0])

# generate treatment
DF['treatment'] = np.random.binomial(n = 1, p = 1 / ( 1 + np.exp(0.1*DF.X1 -0.2*DF.X2 +0.3*DF.X3 -0.4*DF.X4 +0.5*DF.X5) )) # probability depending on covariates

# generate outcome
residual_errors = np.random.normal(size = DF.shape[0])
a = 5.5
b = 0.01
c = 0.08
d = 0.7

ate = a * 2.5
DF['Y'] = - 10 + a*DF['treatment']*DF['X2'] + b*np.exp(DF['X4']) + c*DF['X3']*DF['X1'] + d*DF['X5'] 

common_records = DF.sample(n = 800)

B = pds.concat([DF.sample(n = 1400), common_records]).drop(['Y'], axis = 1)
B = B.reset_index(drop=True)

A = pds.concat([DF.sample(n = 2000), common_records])[list(identifiers.keys())+['Y']]
A = A.reset_index(drop=True)

def levenshtein_similarity(a,b):

    """ Check that levenshtein similarity (in [0,1]) is above 0.95.
        
        a: string,
        b: string """

    if 1 - textdistance.levenshtein(a, b)/max(len(a),len(b)) >= 0.95:
        return 1
    else:
        return 0

def jaro_winkler_similarity(a,b):

    """ Check that jaro-winkler similarity (in [0,1]) is above 0.95.
        
        a: string,
        b: string """

    if textdistance.jaro_winkler(a,b) >= 0.99:
        return 1
    else:
        return 0

def strict_equality(a,b):

    """ Check that a and b values are equal.
        
        a: any value,
        b: any value """

    return a==b

def large_equality(a,b):

    """ Check that years a and b expressed with four numbers are within the same decade.
        
        a: year,
        b: year """

    return str(a)[:-1]==str(b)[:-1]

def logit(p):
    return np.log(p/(1-p))

def minmaxscaler(X):
    return (X - X.min(axis=0)) / (X.max(axis=0) - X.min(axis=0))
    
def propensity_score(DF, covariates, scaler, convert_to_logit):
    
    """ Compute propensity score estimates: the probability (logistic regression) that an observation is treated or not conditioned on some covariates.
        These estimates are built conditionaly on covariates passed using a logit after transformation by scaler (when one is specified).
        Estimated probabilities can be converted into logit (convert_to_logit parameter).

        DF:                dataframe,
        covariates:        list of strings for covariates variable in DF,
        scaler:            sklearn.preprocessing function scaler for exemple,
        convert_to_logit:  boolean for converting probabilities to logit when building the propensity score estimates based on a logistic regression
    """
    exog = covariates.copy()
    if scaler != None:
        DF[exog] = scaler(DF[exog])
    if 'intercept' not in DF.columns:
        DF['intercept'] = 1
    exog.append('intercept')
    model = sm.Logit(DF.treatment, DF[exog]).fit(disp=0)
    predictions = model.predict(DF[exog])
    if convert_to_logit:
        return logit(predictions)
    else: 
        return predictions

AB = B[identifiers.keys()].merge(A[identifiers.keys()], how='cross')

AB_for_z0 = B[~B.duplicated(keep=False)][identifiers.keys()].merge(A[~A.duplicated(keep=False)][identifiers.keys()], how='cross')

AB["source_index_B"] = np.repeat(B.index, A.shape[0])
AB["source_index_A"] = np.tile(A.index, B.shape[0])

AB_for_z0["source_index_B"] = np.repeat(B[~B.duplicated(keep=False)].index, A[~A.duplicated(keep=False)].shape[0])
AB_for_z0["source_index_A"] = np.tile(A[~A.duplicated(keep=False)].index, B[~B.duplicated(keep=False)].shape[0])

methods = {'jaro-winkler':jaro_winkler_similarity, 'levenshtein':levenshtein_similarity, 'strict':strict_equality, 'large':large_equality}

for linking_var in identifiers.keys():
    method = methods[identifiers[linking_var]]
    df = AB.filter(regex=linking_var)
    AB[linking_var+"_comparison"] = np.array([method(a, b) for a,b in zip(df.iloc[:,0], df.iloc[:,1])]).astype(int).reshape(-1,1)
    df_z0 = AB_for_z0.filter(regex=linking_var)
    AB_for_z0[linking_var+"_comparison"] = np.array([method(a, b) for a,b in zip(df_z0.iloc[:,0], df_z0.iloc[:,1])]).astype(int).reshape(-1,1)
comparisons = AB.filter(regex="comparison")
comparisons_z0 = AB_for_z0.filter(regex="comparison")
comparisons_z0[comparisons_z0.all(axis=1)]

Unnamed: 0,family_name_comparison,was_assigned_female_comparison,country_comparison,birth_year_comparison
8122,1,1,1,1
10373,1,1,1,1
13313,1,1,1,1
15492,1,1,1,1
19360,1,1,1,1
...,...,...,...,...
3363151,1,1,1,1
3369230,1,1,1,1
3371259,1,1,1,1
3373286,1,1,1,1


In [17]:
data = AB_for_z0[comparisons_z0.all(axis=1)]

# we consider 1-2-1 matches in z
data_link = data[ (~data.source_index_A.duplicated(keep=False)) & (~data.source_index_B.duplicated(keep=False)) ]

z0 = -np.ones(B.shape[0])
z0[data_link.source_index_B] = data_link.source_index_A

true_linkage_z = -np.ones(B.shape[0])
true_linkage_z[B.iloc[-800:,:].index] = A.iloc[-800:,:].index
(z0==true_linkage_z).sum()/len(z0)

0.5277272727272727

In [None]:
unmatch = comparisons.sum(axis=0) / len(comparisons) # probability of having same linking var (at all)

match = np.repeat(0.95, len(identifiers.keys())) # probability of having same linking var when being matches

AB["linking_score"] = (np.multiply(comparisons, np.log2(match/unmatch)) + np.multiply(1-comparisons, np.log2((1-match)/(1-unmatch)))).sum(axis=1)

score = AB.linking_score.max()
data = AB[AB.linking_score==score]

data = data[data.source_index_A.]

# we consider 1-2-1 matches in z
data_link = data[ (~data.source_index_A.duplicated(keep=False)) & (~data.source_index_B.duplicated(keep=False)) ]

In [11]:
A

Unnamed: 0,family_name,was_assigned_female,country,birth_year,Y
0,Владимр,0,RU,2018,7.563163
1,Mroczkowski,0,PL,1970,-9.690913
2,Исаев,0,RU,1961,-12.940553
3,Berisha,0,IT,1987,-8.048016
4,Lehner,0,AT,1952,-9.517789
...,...,...,...,...,...
2795,Rieutord,1,FR,1978,-13.895535
2796,Бондарь,0,RU,1976,-2.940923
2797,Mbaye,1,FR,1978,-10.439819
2798,Хакиев,0,RU,1976,-6.520781


In [12]:
A[~A.duplicated(keep=False)]

Unnamed: 0,family_name,was_assigned_female,country,birth_year,Y
0,Владимр,0,RU,2018,7.563163
1,Mroczkowski,0,PL,1970,-9.690913
2,Исаев,0,RU,1961,-12.940553
3,Berisha,0,IT,1987,-8.048016
4,Lehner,0,AT,1952,-9.517789
...,...,...,...,...,...
2785,Ковалева,1,RU,1998,-7.364520
2786,Nicolai,0,MD,1987,-8.514947
2791,Poulard,0,DE,2003,-9.580047
2795,Rieutord,1,FR,1978,-13.895535


In [7]:
z = -np.ones(B.shape[0])
z[data_link.source_index_B] = data_link.source_index_A

from_A = A.iloc[data_link.source_index_A,:].reset_index(drop=True)
from_B = B.iloc[data_link.source_index_B,:].reset_index(drop=True)
linked_records = pds.concat([from_B, from_A.Y], axis=1)
linked_records['propensity_score'] = propensity_score(linked_records, covariates, None, False)

      family_name_x  was_assigned_female_x country_x  birth_year_x  \
1999       Gehbauer                      1        DE          1995   
4351       Dujardin                      1        BE          2002   
11739           Cau                      1        IT          1987   
14472         Churm                      0        GB          1982   
17598     De Marchi                      0        IT          1987   

      family_name_y  was_assigned_female_y country_y  birth_year_y  \
1999       Gehbauer                      1        DE          1995   
4351       Dujardin                      1        BE          2002   
11739           Cau                      1        IT          1987   
14472         Churm                      0        GB          1982   
17598     De Marchi                      0        IT          1987   

       source_index_B  source_index_A  family_name_comparison  \
1999                0            1999                       1   
4351                1       

In [9]:
data_link

Unnamed: 0,family_name_x,was_assigned_female_x,country_x,birth_year_x,family_name_y,was_assigned_female_y,country_y,birth_year_y,source_index_B,source_index_A,family_name_comparison,was_assigned_female_comparison,country_comparison,birth_year_comparison,linking_score
6154397,Mbaye,1,FR,1978,Mbaye,1,FR,1978,2197,2797,1,1,1,1,18.017076
6148795,Rieutord,1,FR,1978,Rieutord,1,FR,1978,2195,2795,1,1,1,1,18.017076
6137591,Poulard,0,DE,2003,Poulard,0,DE,2003,2191,2791,1,1,1,1,18.017076
6123586,Nicolai,0,MD,1987,Nicolai,0,MD,1987,2186,2786,1,1,1,1,18.017076
6120785,Ковалева,1,RU,1998,Ковалева,1,RU,1998,2185,2785,1,1,1,1,18.017076
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
17598,De Marchi,0,IT,1987,De Marchi,0,IT,1987,6,798,1,1,1,1,18.017076
14472,Churm,0,GB,1982,Churm,0,GB,1982,5,472,1,1,1,1,18.017076
11739,Cau,1,IT,1987,Cau,1,IT,1987,4,539,1,1,1,1,18.017076
4351,Dujardin,1,BE,2002,Dujardin,1,BE,2002,1,1551,1,1,1,1,18.017076


In [5]:
A.iloc[-800:,:]

Unnamed: 0,family_name,was_assigned_female,country,birth_year,Y
2000,Larsen,0,NO,1985,-8.083628
2001,Broad,1,GB,2013,-9.408401
2002,Tarmure,0,ES,1981,-6.939427
2003,Шитова,1,RU,1976,-13.050829
2004,Лапшина,1,RU,1976,-5.950877
...,...,...,...,...,...
2795,Macahaniuc,1,IT,1948,-8.651966
2796,Wi,1,PL,1957,-10.299188
2797,Todorovic,0,RS,1964,-3.689590
2798,Shapkarin,0,RU,1961,-11.734335


In [10]:
# GET true z
# 
# common_records

true_linkage_z = -np.ones(B.shape[0])
true_linkage_z[B.iloc[-800:,:].index] = A.iloc[-800:,:].index
(z==true_linkage_z).sum()/len(z)

0.5163636363636364

In [None]:

def compute_posterior(file_A, file_B, file_mix_BA, linked_records, z, theta_m, theta_u, alpha_pi, beta_pi, sigma_square, a_sigma, b_sigma, beta0, beta1, alpha, mu2, sigma2_square, a_sigma2, b_sigma2):
    result = 0
    # likelihood 1
    # need linked_records computed based on z
    Y = np.array(linked_records.Y)
    X = np.array([linked_records.propensity_score*linked_records.treatment, linked_records.propensity_score])
    X = sm.add_constant(X)
    X = X.T
    model = sm.GLM(Y,X)
    results = model.fit()
    residuals = Y - X @ results.params
    estimated_variance = residuals.T @ residuals / (len(residuals) - (X.shape[1]+1))
    result += np.log(scipy.stats.norm.pdf(residuals, 0, np.sqrt(estimated_variance))).sum()
    # likelihood 2
    # need z
    Y = scipy.stats.norm.rvs(mu2, np.sqrt(sigma2_square), size=len(z[z<0]))
    result += np.log(scipy.stats.norm.pdf(Y, mu2, np.sqrt(sigma2_square))).sum()
    # likelihood 3 and 4
    # need AB, z, theta_m, theta_u ATTENTION AB SHOULD BE KNOWN (GLOBAL)
    idx_A = z[z>=0]
    idx_B = np.nonzero(z>=0)[0]
    links = pds.MultiIndex.from_tuples(zip(idx_A,idx_B))
    pairs = pds.MultiIndex.from_frame(file_mix_BA[["source_index_A", "source_index_B"]])
    # 3
    data = file_mix_BA[pairs.isin(links)] # 1-2-1 matches enforced by construction of z
    pattern_match, count_match = np.unique(data.filter(regex="comparison"), return_counts=True, axis=0)
    result += ((pattern_match @ np.log(theta_m) + (1-pattern_match) @ np.log(1-theta_m)) * count_match).sum()
    # 4
    data = file_mix_BA[(~file_mix_BA.source_index_B.duplicated())&(~pairs.isin(links))] # enforce 1-2-1 by removing duplicata
    pattern_unmatch, count_unmatch = np.unique(data.filter(regex="comparison"), return_counts=True, axis=0)
    result += ((pattern_unmatch @ np.log(theta_u) + (1-pattern_unmatch) @ np.log(1-theta_u)) * count_unmatch).sum()
    # prior 1
    # need z, alpha_pi, beta_pi
    n_AB = (z>=0).sum()
    result += math.log(math.factorial(file_A.shape[0]-n_AB)) - math.log(math.factorial(file_A.shape[0])) + scipy.special.betaln(n_AB + alpha_pi, file_B.shape[0] - n_AB + beta_pi) - scipy.special.betaln(alpha_pi, beta_pi)
    # prior 2
    # need theta_m QUESTION a-1 / b-1?
    result += (1 * np.log(theta_m) + 1 * np.log(1-theta_m)).sum()
    # prior 3
    # need theta_u QUESTION a-1 / b-1?
    result += (1 * np.log(theta_u) + 1 * np.log(1-theta_u)).sum()
    # prior 4
    # need sigma_square, a_sigma, b_sigma QUESTION unclear if it is the pdf?
    result += np.log(scipy.stats.invgauss.pdf(sigma_square, a_sigma, b_sigma))
    # prior 5
    # need beta0, beta1, alpha QUESTION unclear if it is the pdf?
    result += np.log(scipy.stats.multivariate_normal.pdf([beta0, beta1, alpha], [0,0,0], np.eye(3)))
    # prior 6
    # need mu2 QUESTION unclear if it is the pdf?
    result += np.log(scipy.stats.norm.pdf(mu2, 0, 1))
    # prior 7
    # need sigma2_square, a_sigma2, b_sigma2 QUESTION unclear if it is the pdf?
    result += np.log(scipy.stats.invgauss.pdf(sigma2_square, a_sigma2, b_sigma2))
    return result

def compute_proposal(file_A, file_B, file_mix_BA, z, sigma_square, a_sigma, b_sigma, beta0, beta1, alpha, mu2, sigma2_square, a_sigma2, b_sigma2):
    result = [] # z, linked_records, theta_m, theta_u, beta0, beta1, alpha, sigma_square, mu2, sigma2_square
    # z
    linked_record_prop = (z<file_B.shape[0]).sum() / len(z) # proportion of linked records in previous z
    new = np.random.choice(file_A.shape[0], size=file_B.shape[0], replace=False)
    new = new * np.random.choice([1,-1], size=file_B.shape[0], p=[linked_record_prop, 1-linked_record_prop]) # randomly set negative some values so that the proportion of positive values corresponds to linked_record_prop
    result.append(new)
    # linked records
    idx_A = z[z>=0]
    idx_B = np.nonzero(z>=0)[0]
    from_A = file_A.iloc[idx_A,:].reset_index(drop=True)
    from_B = file_B.iloc[idx_B,:].reset_index(drop=True)
    linked_records = pds.concat([from_B, from_A.Y], axis=1)
    linked_records['propensity_score'] = propensity_score(linked_records, covariates, None, False)
    result.append(linked_records)
    # theta_u QUESTION we do not care of dependence on previous param
    comparisons = file_mix_BA.filter(regex="comparison")
    unmatch = comparisons.sum(axis=0) / len(comparisons)
    result.append(unmatch)
    # theta_m QUESTION we do not care of dependence on previous param
    match = np.repeat(0.95, len(unmatch))
    result.append(match)
    # beta0, beta1, alpha, sigma_square
    result.append(beta0+scipy.stats.norm.rvs(0,0.5))
    result.append(beta1+scipy.stats.norm.rvs(0,0.5))
    result.append(alpha+scipy.stats.norm.rvs(0,0.5))
    result.append(sigma_square+scipy.stats.invgauss.rvs(a_sigma,b_sigma))
    # mu2, sigma2_square
    result.append(mu2+scipy.stats.norm.rvs(0,1))
    result.append(sigma2_square+scipy.stats.invgauss.rvs(a_sigma2,b_sigma2))
    return result

0.51

In [None]:
# z_k = [z.copy()]

# linked_records_k = [linked_records.copy()]

# theta_m_k = [match.copy()]

# theta_u_k = [unmatch.copy()]

# alpha_pi_k = [1]
# beta_pi_k = [1]

# a_sigma_k = [1]
# b_sigma_k = [1]

# a_sigma2_k = [1]
# b_sigma2_k = [1]

# beta0_k = [scipy.stats.norm.rvs(0,1)]
# beta1_k = [scipy.stats.norm.rvs(0,1)]
# alpha_k = [scipy.stats.norm.rvs(0,1)]
# sigma_square_k = [scipy.stats.invgauss.rvs(a_sigma_k[-1],b_sigma_k[-1])]

# mu2_k = [scipy.stats.norm.rvs(0,1)]
# sigma2_square_k = [scipy.stats.invgauss.rvs(a_sigma2_k[-1],b_sigma2_k[-1])]

# posteriors = []
# posterior = compute_posterior(A, B, AB, linked_records_k[-1], z_k[-1], theta_m_k[-1], theta_u_k[-1], alpha_pi_k[-1], beta_pi_k[-1], sigma_square_k[-1], a_sigma_k[-1], b_sigma_k[-1], beta0_k[-1], beta1_k[-1], alpha_k[-1], mu2_k[-1], sigma2_square_k[-1], a_sigma2_k[-1], b_sigma2_k[-1])
# posteriors.append(posterior)

# for _ in tqdm.tqdm(range(500)):
#     if  not(_%50):
#         print((z_k[-1]==true_linkage_z).sum()/len(true_linkage_z))

#     proposal = compute_proposal(A, B, AB, z_k[-1], sigma_square_k[-1], a_sigma_k[-1], b_sigma_k[-1], beta0_k[-1], beta1_k[-1], alpha_k[-1], mu2_k[-1], sigma2_square_k[-1], a_sigma2_k[-1], b_sigma2_k[-1])
#     new_z, new_linked_records, new_theta_m, new_theta_u, new_beta0, new_beta1, new_alpha, new_sigma_square, new_mu2, new_sigma2_square = proposal
#     # if update z and linked_records
#     new_posterior = compute_posterior(A, B, AB, new_linked_records, new_z, theta_m_k[-1], theta_u_k[-1], alpha_pi_k[-1], beta_pi_k[-1], sigma_square_k[-1], a_sigma_k[-1], b_sigma_k[-1], beta0_k[-1], beta1_k[-1], alpha_k[-1], mu2_k[-1], sigma2_square_k[-1], a_sigma2_k[-1], b_sigma2_k[-1])

#     posteriors.append(new_posterior)
#     U = scipy.stats.uniform.rvs(0,1)
#     if U < posteriors[-1] / posteriors[-2]:
#         z_k.append(new_z)
#         linked_records_k.append(new_linked_records)
#         theta_m_k.append(new_theta_m)
#         theta_u_k.append(new_theta_u)
#         beta0_k.append(new_beta0)
#         beta1_k.append(new_beta1)
#         alpha_k.append(new_alpha)
#         sigma_square_k.append(new_sigma_square)
#         mu2_k.append(new_mu2)
#         sigma2_square_k.append(new_sigma2_square)

In [8]:
z_k = [z.copy()]

linked_records_k = [linked_records.copy()]

theta_m_k = [match.copy()]

theta_u_k = [unmatch.copy()]

alpha_pi_k = [1]
beta_pi_k = [1]

a_sigma_k = [1]
b_sigma_k = [1]

a_sigma2_k = [1]
b_sigma2_k = [1]

beta0_k = [scipy.stats.norm.rvs(0,1)]
beta1_k = [scipy.stats.norm.rvs(0,1)]
alpha_k = [scipy.stats.norm.rvs(0,1)]
sigma_square_k = [scipy.stats.invgauss.rvs(a_sigma_k[-1],b_sigma_k[-1])]

mu2_k = [scipy.stats.norm.rvs(0,1)]
sigma2_square_k = [scipy.stats.invgauss.rvs(a_sigma2_k[-1],b_sigma2_k[-1])]

posteriors = []
posterior = compute_posterior(A, B, AB, linked_records_k[-1], z_k[-1], theta_m_k[-1], theta_u_k[-1], alpha_pi_k[-1], beta_pi_k[-1], sigma_square_k[-1], a_sigma_k[-1], b_sigma_k[-1], beta0_k[-1], beta1_k[-1], alpha_k[-1], mu2_k[-1], sigma2_square_k[-1], a_sigma2_k[-1], b_sigma2_k[-1])
posteriors.append(posterior)

for _ in tqdm.tqdm(range(500)):
    if  not(_%50):
        print((z_k[-1]==true_linkage_z).sum()/len(true_linkage_z))

    proposal = compute_proposal(A, B, AB, z_k[-1], sigma_square_k[-1], a_sigma_k[-1], b_sigma_k[-1], beta0_k[-1], beta1_k[-1], alpha_k[-1], mu2_k[-1], sigma2_square_k[-1], a_sigma2_k[-1], b_sigma2_k[-1])
    new_z, new_linked_records, new_theta_m, new_theta_u, new_beta0, new_beta1, new_alpha, new_sigma_square, new_mu2, new_sigma2_square = proposal
    # if update z and linked_records
    new_posterior = compute_posterior(A, B, AB, new_linked_records, new_z, theta_m_k[-1], theta_u_k[-1], alpha_pi_k[-1], beta_pi_k[-1], sigma_square_k[-1], a_sigma_k[-1], b_sigma_k[-1], beta0_k[-1], beta1_k[-1], alpha_k[-1], mu2_k[-1], sigma2_square_k[-1], a_sigma2_k[-1], b_sigma2_k[-1])
    U = scipy.stats.uniform.rvs(0,1)
    if U < new_posterior / posteriors[-1]:
        posteriors.append(new_posterior)
        z_k.append(new_z)
        linked_records_k.append(new_linked_records)

    proposal = compute_proposal(A, B, AB, z_k[-1], sigma_square_k[-1], a_sigma_k[-1], b_sigma_k[-1], beta0_k[-1], beta1_k[-1], alpha_k[-1], mu2_k[-1], sigma2_square_k[-1], a_sigma2_k[-1], b_sigma2_k[-1])
    new_z, new_linked_records, new_theta_m, new_theta_u, new_beta0, new_beta1, new_alpha, new_sigma_square, new_mu2, new_sigma2_square = proposal
    # if update theta_m, theta_u
    new_posterior = compute_posterior(A, B, AB, linked_records_k[-1], z_k[-1], new_theta_m, new_theta_u, alpha_pi_k[-1], beta_pi_k[-1], sigma_square_k[-1], a_sigma_k[-1], b_sigma_k[-1], beta0_k[-1], beta1_k[-1], alpha_k[-1], mu2_k[-1], sigma2_square_k[-1], a_sigma2_k[-1], b_sigma2_k[-1])
    U = scipy.stats.uniform.rvs(0,1)
    if U < new_posterior / posteriors[-1]:
        posteriors.append(new_posterior)
        theta_m_k.append(new_theta_m)
        theta_u_k.append(new_theta_u)

    proposal = compute_proposal(A, B, AB, z_k[-1], sigma_square_k[-1], a_sigma_k[-1], b_sigma_k[-1], beta0_k[-1], beta1_k[-1], alpha_k[-1], mu2_k[-1], sigma2_square_k[-1], a_sigma2_k[-1], b_sigma2_k[-1])
    new_z, new_linked_records, new_theta_m, new_theta_u, new_beta0, new_beta1, new_alpha, new_sigma_square, new_mu2, new_sigma2_square = proposal
    # if update beta0, beta1, alpha, sigma_square, mu2, sigma2_square
    new_posterior = compute_posterior(A, B, AB, linked_records_k[-1], z_k[-1], theta_m_k[-1], theta_u_k[-1], alpha_pi_k[-1], beta_pi_k[-1], new_sigma_square, a_sigma_k[-1], b_sigma_k[-1], new_beta0, new_beta1, new_alpha, new_mu2, new_sigma2_square, a_sigma2_k[-1], b_sigma2_k[-1])
    U = scipy.stats.uniform.rvs(0,1)
    if U < new_posterior / posteriors[-1]:
        posteriors.append(new_posterior)
        beta0_k.append(new_beta0)
        beta1_k.append(new_beta1)
        alpha_k.append(new_alpha)
        sigma_square_k.append(new_sigma_square)
        mu2_k.append(new_mu2)
        sigma2_square_k.append(new_sigma2_square)

  0%|          | 0/500 [00:00<?, ?it/s]

0.51


 10%|█         | 50/500 [04:44<43:08,  5.75s/it]

0.0


 20%|██        | 100/500 [09:32<38:26,  5.77s/it]

0.0


 21%|██        | 103/500 [09:52<38:01,  5.75s/it]


KeyboardInterrupt: 

In [9]:
z_k[-1]

array([  974,  1201,  2003, ..., -1307,  1860,   245])