In [1]:
import numpy as np
import textdistance
import timeit
import pandas as pds
from sklearn import mixture
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import MinMaxScaler

In [2]:
name_DF = 'DF_associations_N=43889_2023-01-03.csv'
# COMPLETE identifiers = {'name':'jaro-winkler','family name':'jaro-winkler','gender':'strict','country':'strict','birth year':'large'}
identifiers = {'family name':'jaro-winkler','gender':'strict','country':'strict','birth year':'large'}
covariates = ['X1','X2','X3','X4','X5']

DF = pds.read_csv(os.path.join('..', name_DF), delimiter = ',')

overlap = DF.sample(n = 150)

A = pds.concat([DF.sample(n = 450), overlap]).drop(['Y'], axis = 1)
A = A.reset_index(drop=True)

B = pds.concat([DF.sample(n = 350), overlap])[list(identifiers.keys())+['Y']]
B = B.reset_index(drop=True)

In [3]:
A.head()

Unnamed: 0,name,family name,gender,country,birth year,treatment,X1,X2,X3,X4,X5
0,Ariane,Hausmann,F,DE,1978,0,42,1.368755,3.970096,8.963813,-0.040154
1,Robert,Kycia,M,PL,1976,0,44,5.162158,4.167418,9.086824,0.533097
2,Gökhan,Subatan,M,BE,1993,1,27,4.15052,3.864387,9.262914,0.79825
3,Ilaria,Campanale,F,IT,1953,0,67,6.450749,3.525062,9.163808,0.216214
4,Nathalie,Kunkel,F,DE,1978,1,42,1.633013,3.099963,9.500583,1.822581


In [4]:
B.head()

Unnamed: 0,family name,gender,country,birth year,Y
0,Nicolini,F,IT,1963,52.519435
1,Ouaras,F,FR,1962,51.541661
2,Holl,F,IT,1963,-4.559265
3,Sh,F,DE,1957,59.016003
4,Nikolopoulos,M,GR,1958,-3.467832


In [5]:
def levenshtein_similarity(a,b):

    """ Check that levenshtein similarity (in [0,1]) is above 0.95.
        
        a: string,
        b: string """

    if 1 - textdistance.levenshtein(a, b)/max(len(a),len(b)) >= 0.95:
        return 1
    else:
        return 0

def jaro_winkler_similarity(a,b):

    """ Check that jaro-winkler similarity (in [0,1]) is above 0.95.
        
        a: string,
        b: string """

    if textdistance.jaro_winkler(a,b) >= 0.95:
        return 1
    else:
        return 0

def strict_equality(a,b):

    """ Check that a and b values are equal.
        
        a: any value,
        b: any value """

    return a==b

def large_equality(a,b):

    """ Check that years a and b expressed with four numbers are within the same decade.
        
        a: year,
        b: year """

    return str(a)[:-1]==str(b)[:-1]

In [6]:
def comparison_vector(A_record, B, identifiers):
    
    """ Compare one record in A with all records in B. 
        Return the binary comparison of the identifiers for one record in A with all records in B.

        A_record:     series of one row, 
        B:            dataframe, 
        identifiers:  dict: k = column name, 
                            v = method in {'large','strict','levenshtein','jaro-winkler'}
    """

    methods = {'jaro-winkler':jaro_winkler_similarity, 'levenshtein':levenshtein_similarity, 'strict':strict_equality, 'large':large_equality}
    comparisons = {}
    for linking_var in identifiers:
        method = methods[identifiers[linking_var]]
        comparisons[linking_var] = np.array(B.apply(lambda row: method(A_record[linking_var], row[linking_var]), axis=1)).reshape(-1,1)
    return np.concatenate(tuple(comparisons.values()), axis = 1) 
    
A_record = A.iloc[0,:]
comparison_vector(A_record, B, identifiers)

array([[0, 1, 0, 0],
       [0, 1, 0, 0],
       [0, 1, 0, 0],
       ...,
       [0, 1, 0, 0],
       [0, 1, 0, 0],
       [0, 0, 1, 0]])

In [18]:
def almost_exact_matches(A, B, compare_on):
    
    """ Compare A records and B records.
        Return the exact common records (where all identifiers are equals).
        Be aware that these record are not clearly exact matches in reality (if we do not have name + family name + birth date as identifiers for example).

        A: dataframe, 
        B: dataframe,
        identifiers:  dict: k = column name, 
                            v = method in {'large','strict','levenshtein','jaro-winkler'}
    """

    A['source'] = 'A'
    B['source'] = 'B'
    # we remove A and B duplicates to ensure returning 1-2-1 true matches:
    df = pds.concat([A[~A[identifiers.keys()].duplicated(keep=False)],B[~B[identifiers.keys()].duplicated(keep=False)]], join='inner')
    duplicata = df[df[compare_on.keys()].duplicated(keep=False)]
    duplicata = np.array(duplicata.groupby(list(df[compare_on.keys()])).apply(lambda row: row.index))
    print(duplicata)
    return {'A':np.array([idx[0] for idx in duplicata]), 'B':np.array([idx[1] for idx in duplicata])}

almost_exact_matches(A, B, identifiers)

[Int64Index([488, 388], dtype='int64')
 Int64Index([561, 461], dtype='int64')
 Int64Index([599, 499], dtype='int64')
 Int64Index([512, 412], dtype='int64')
 Int64Index([593, 493], dtype='int64')
 Int64Index([548, 448], dtype='int64')
 Int64Index([463, 363], dtype='int64')
 Int64Index([584, 484], dtype='int64')
 Int64Index([485, 385], dtype='int64')
 Int64Index([507, 407], dtype='int64')
 Int64Index([501, 401], dtype='int64')
 Int64Index([581, 481], dtype='int64')
 Int64Index([580, 480], dtype='int64')
 Int64Index([585, 485], dtype='int64')
 Int64Index([482, 382], dtype='int64')
 Int64Index([546, 446], dtype='int64')
 Int64Index([451, 351], dtype='int64')
 Int64Index([538, 438], dtype='int64')
 Int64Index([590, 490], dtype='int64')
 Int64Index([564, 464], dtype='int64')
 Int64Index([553, 453], dtype='int64')
 Int64Index([578, 478], dtype='int64')
 Int64Index([588, 488], dtype='int64')
 Int64Index([456, 356], dtype='int64')
 Int64Index([583, 483], dtype='int64')
 Int64Index([596, 496], d

{'A': array([488, 561, 599, 512, 593, 548, 463, 584, 485, 507, 501, 581, 580,
        585, 482, 546, 451, 538, 590, 564, 553, 578, 588, 456, 583, 596,
        525, 533, 555, 489, 595, 464, 511, 516, 552, 569, 527, 562, 523,
        571, 577, 558, 542, 494, 591, 479, 473, 587, 544, 589, 474, 518,
        597, 467, 537, 508, 503, 598, 457, 528, 500, 460, 549, 530, 476,
        478, 506, 502, 572, 499, 568, 579, 486, 509, 582, 554, 515, 566,
        557, 510, 417, 491, 526, 534,  32, 522, 472, 567, 481, 563, 492,
        540, 477, 576, 487, 455, 536, 475, 539, 462, 490, 459, 504, 573,
        529, 497, 556, 520, 543, 574, 547, 586, 454, 531, 541, 458, 517,
        545, 524, 471, 575, 498, 592, 551, 513, 468, 483,  71, 521, 484,
        461, 495, 535, 514, 453, 496, 550, 532, 465, 466, 505, 559, 470,
        493, 480, 560, 469, 450,  91, 519, 594]),
 'B': array([388, 461, 499, 412, 493, 448, 363, 484, 385, 407, 401, 481, 480,
        485, 382, 446, 351, 438, 490, 464, 453, 478, 488, 356, 4

In [9]:
# Parameters
# match: probability of having same linking var when being true matches
# unmatch: probability of having same linking var (at all)

match = np.repeat(0.95, len(identifiers.keys()))
unmatch = A.apply(lambda row: comparison_vector(row, B, identifiers).sum(axis=0), axis=1).sum() / (A.shape[0]*B.shape[0]) # for each A record and for each linking variable, we check 'the probability' to match (over the nA * nB pairs)

In [10]:
def linking_score(A, B, identifiers, match, unmatch):
        
    """ Compare records in A with records in B, computing all linking scores for records in A with records in B. 
        Return the indices of records in A with the best match index for record in B.

        A:            dataframe, 
        B:            dataframe, 
        identifiers:  dict: k = column name, 
                            v = method in {'large','strict','levenshtein','jaro-winkler'}
        match:        array of probabilities of having same linking variables when being a match,
        unmatch:      array of probabilities of having same linking variables (at all, among the nA x nB pairs of record).
    """

    def compute_max_linking_score(A_record, B, identifiers, match, unmatch):
        similarities = comparison_vector(A_record, B, identifiers)
        linking_score = (np.multiply(similarities, np.log2(match/unmatch)) + np.multiply(1-similarities, np.log2((1-match)/(1-unmatch)))).sum(axis=1)
        return linking_score.argmax(), linking_score.max()

    links = A.apply(lambda row: compute_max_linking_score(row[list(identifiers.keys())], B, identifiers, match, unmatch), axis=1)
    idx_in_A = np.arange(A.shape[0])
    idx_in_B = np.array([element[0] for element in links])
    matching_scores = np.array([element[1] for element in links])
    return {'A':idx_in_A, 'B':idx_in_B, 'scores':matching_scores}

linking_score(A, B, identifiers, match, unmatch)

{'A': array([  0,   1,   2,   3,   4,   5,   6,   7,   8,   9,  10,  11,  12,
         13,  14,  15,  16,  17,  18,  19,  20,  21,  22,  23,  24,  25,
         26,  27,  28,  29,  30,  31,  32,  33,  34,  35,  36,  37,  38,
         39,  40,  41,  42,  43,  44,  45,  46,  47,  48,  49,  50,  51,
         52,  53,  54,  55,  56,  57,  58,  59,  60,  61,  62,  63,  64,
         65,  66,  67,  68,  69,  70,  71,  72,  73,  74,  75,  76,  77,
         78,  79,  80,  81,  82,  83,  84,  85,  86,  87,  88,  89,  90,
         91,  92,  93,  94,  95,  96,  97,  98,  99, 100, 101, 102, 103,
        104, 105, 106, 107, 108, 109, 110, 111, 112, 113, 114, 115, 116,
        117, 118, 119, 120, 121, 122, 123, 124, 125, 126, 127, 128, 129,
        130, 131, 132, 133, 134, 135, 136, 137, 138, 139, 140, 141, 142,
        143, 144, 145, 146, 147, 148, 149, 150, 151, 152, 153, 154, 155,
        156, 157, 158, 159, 160, 161, 162, 163, 164, 165, 166, 167, 168,
        169, 170, 171, 172, 173, 174, 175, 176

In [11]:
def stratified_ATE(DF_group, pop_size):

    """ Compute the Average Treatment Effect on the specific stratum represented in DF_group.

        DF_group: dataframe. 
    """

    n_treated, n_untreated = DF_group[DF_group.treatment == 1].shape[0], DF_group[DF_group.treatment == 0].shape[0]
    assert (n_treated!=0)&(n_untreated!=0), ("One group among treated/untreated is empty with this stratification.")
    avg_outcome_treated = (DF_group.treatment * DF_group.Y).sum() / n_treated 
    avg_outcome_untreated = ((1 - DF_group.treatment) * DF_group.Y).sum() / n_untreated
    var_treated = ( (DF_group.treatment * DF_group.Y - avg_outcome_treated)**2 ).sum() / (n_treated - 1)
    var_untreated = ( ((1 - DF_group.treatment) * DF_group.Y - avg_outcome_untreated)**2 ).sum() / (n_untreated - 1)
    size_group = DF_group.shape[0]
    ATE_group = (size_group/pop_size) * (avg_outcome_treated - avg_outcome_untreated)
    variance_group = (size_group/pop_size)**2 * (var_untreated/n_untreated + var_treated/n_treated)
    return ATE_group, variance_group

stratified_ATE(DF, DF.shape[0])

(43.855209924135366, 0.15908948881291174)

In [12]:
def logit(p):
    return np.log(p/(1-p))

def propensity_score(DF, covariates, scaler, convert_to_logit):
    
    """ Compute propensity score estimates: the probability (logistic regression) that an observation is treated or not conditioned on some covariates.

        DF:                dataframe,
        covariates:        list of strings for covariates variable in DF,
        scaler:            sklearn.preprocessing function scaler for exemple,
        convert_to_logit:  boolean for converting probabilities to logit when building the propensity score estimates based on a logistic regression
    """

    if scaler != None:
        pipe = Pipeline([('scaler', scaler),('logistic_classifier', LogisticRegression())])
    else:
        pipe = Pipeline([('logistic_classifier', LogisticRegression())])
    pipe.fit(DF[covariates], DF.treatment)
    predictions = pipe.predict_proba(DF[covariates])
    if convert_to_logit:
        predictions_logit = logit(predictions[:,1])
        return predictions_logit
    else: 
        return predictions[:,1]

propensity_score(DF, covariates, MinMaxScaler(), True)

array([-0.41007383, -0.38821264, -0.39971652, ..., -0.40583505,
       -0.42754707, -0.43093322])

In [13]:
DF['propensity_score'] = propensity_score(DF, covariates, MinMaxScaler(), True)
q = 5 # recommended
DF['prop_score_quantile'] = pds.qcut(DF['propensity_score'], q, labels = False)

# check the balance
np.array([(DF[DF['prop_score_quantile']==value].treatment.value_counts() > DF.shape[0]/(q*3)).all() for value in range(q)]).all()

True

In [14]:
def ATE(DF, strata): # covariates=None, scaler=None, convert_to_logit=None,
    
    """ Compute the Average Treatment Effect in DF according to the stratification method:
        no stratification when strata is None, 
        stratified dataframe build based on the list of specific covariates when one is passed,
        or propensity score stratification.
        Propensity score estimates are built conditionaly on covariates passed using a logistic regression after transformation by scaler (when one is specified).
        Estimated probabilities can be converted into logit (convert_to_logit parameter).
        Quantiles are used to partition the data based on propensity score estimates.
        'covariates', 'scaler', 'convert_to_logit', 'quantiles' are only needed for 'propensity stratification' method.

        DF:                dataframe,
        strata:            value in {None, [...], 'propensity stratification'},
        covariates:        list of strings for covariates variable in DF,
        scaler:            sklearn.preprocessing function scaler for exemple,
        convert_to_logit:  boolean for converting probabilities to logit when building the propensity score estimates based on a logistic regression,
        quantiles:         list of quantiles (>0) to consider to build strata based on propensity score
    """
    
    pop_size = DF.shape[0]
    if strata == None: # no stratification
        return stratified_ATE(DF, pop_size)
    elif strata == 'propensity stratification': # propensity score stratification
        assert ('propensity_score' in DF.columns)&('prop_score_quantile' in DF.columns), ("For propensity score stratification you need first to add the propensity_score and prop_score_quantile columns into the dataframe.")
        ATE, variance = 0, 0
        for q in DF['prop_score_quantile'].unique():
            stratum_data = DF[DF['prop_score_quantile'] == q]
            ATE_stratum, variance_stratum = stratified_ATE(stratum_data, pop_size)
            ATE += ATE_stratum
            variance += variance_stratum
        return ATE, variance
    else: # stratification based on the covariates passed
        ATE, variance = 0, 0
        for stratum in DF.groupby(strata):
            stratum_id, stratum_data = stratum
            ATE_stratum, variance_stratum = stratified_ATE(stratum_data, pop_size)
            ATE += ATE_stratum
            variance += variance_stratum
        return ATE, variance

ATE(DF, 'propensity stratification')

(43.83599521070258, 0.15905622636676375)

In [15]:
ATE(DF, None)

(43.855209924135366, 0.15908948881291174)

In [16]:
ATE(DF, ['gender'])

  for stratum in DF.groupby(strata):


(43.85639628206826, 0.15913698174475693)

In [17]:
def Estimate_Tethered_Stopping_Rule(A, B, identifiers, match, unmatch, covariates, scaler, convert_to_logit):

    """ Compare records in A with records in B, computing all linking scores for records in A with records in B. 
        Return the indices of records in A with the best match index for record in B.

        A: dataframe, 
        B: dataframe, 
        identifiers: dict: k = column name, v = method in {'large','strict','levenshtein','jaro-winkler',
        match: array of probabilities of having same linking variables when being a match,
        unmatch: array of probabilities of having same linking variables (at all, among the nA x nB pairs of record),
        strata: .
    """
    
    correct_links = almost_exact_matches(A, B, identifiers)
    matchings = linking_score(A, B, identifiers, match, unmatch)
    from_A = A.iloc[correct_links['A'],:].drop('source', axis=1).reset_index(drop=True)
    from_B = B.iloc[correct_links['B'],:]['Y'].reset_index(drop=True)
    linked_records = pds.concat([from_A, from_B], axis = 1)
    ATE_links, variance_links = ATE(linked_records, 'propensity stratification')
    ATE_list = [ATE_links]
    Var_list = [variance_links]
    for score in np.sort(np.unique(matchings['scores']))[::-1][1:]:
        new_matchings = matchings['A'][matchings['scores'] == score]
        from_A = A.iloc[new_matchings,:].drop('source', axis=1).reset_index(drop=True)
        from_B = B.iloc[matchings['B'][new_matchings],:]['Y'].reset_index(drop=True)
        linked_records = pds.concat([linked_records, pds.concat([from_A,from_B], axis = 1)])
        ATE_links, variance_links = ATE(linked_records, 'propensity stratification')
        ATE_list.append(ATE_links)
        Var_list.append(variance_links)
    return ATE_list, Var_list

A['propensity_score'] = propensity_score(A, covariates, MinMaxScaler(), True)
q = 5 # recommended
A['prop_score_quantile'] = pds.qcut(A['propensity_score'], q, labels = False)

# check the balance
np.array([(A[A['prop_score_quantile']==value].treatment.value_counts() > A.shape[0]/(q*3)).all() for value in range(q)]).all()

Estimate_Tethered_Stopping_Rule(A, B, identifiers, match, unmatch, ['X1','X2','X3'], MinMaxScaler(), True)

([47.33719513914916,
  46.39276952192214,
  46.153054900522065,
  45.27140935392173,
  44.72825817031678,
  15.600965590480033,
  15.49195327496395,
  15.309637357517015,
  13.781570614273592,
  13.781329335788602],
 [69.30381736813325,
  65.3316786061727,
  67.11109893024654,
  64.5648768468791,
  61.89730849861741,
  13.821247094960613,
  13.805832166752769,
  13.432986717798697,
  11.510034145134389,
  11.402201028704862])