In [1]:
import numpy as np
import textdistance
import timeit
import pandas as pds
from sklearn import mixture

In [3]:
# Load the association data set; the file is expected in the working directory.
# (The former os.path.join(name_DF) call was a no-op with a single argument and
# failed with NameError because `os` is never imported in this notebook.)
name_DF = 'DF_associations_N=8795_2022-12-30.csv'
DF = pds.read_csv(name_DF, delimiter = ',')

# Fixed seeds so that "Restart Kernel -> Run All" draws the same samples every time.
# Distinct seeds are used for each draw so that B is not a subset of A by construction.
SEED = 2022

# A: 50 records drawn at random from the full data set.
A = DF.sample(n = 50, random_state = SEED)
A = A.reset_index(drop=True)

# B: 35 records drawn from the full data set plus 10 records taken from A,
# so at least 10 records of B are guaranteed to also appear in A.
B = pds.concat([DF.sample(n = 35, random_state = SEED + 1), A.sample(n = 10, random_state = SEED + 2)])
B = B.reset_index(drop=True)

In [4]:
# Preview the first rows of sample A.
A.head()

Unnamed: 0,name,family name,gender,country,birth year,treatment,X1,X2,X3,X4,X5,Y
0,Horst,Hentschel,M,DE,1939,0,81,7.812391,0.411026,6.373784,7.0,9.776982
1,Kaja,Margaret,F,PL,1979,1,41,10.425177,3.55657,6.950604,7.0,167.361336
2,Bernhard,Hajdinjak,M,AT,1948,1,72,6.210948,2.910845,6.463527,7.0,116.706191
3,Dalibor,Trajkovski,M,DE,1959,0,61,7.728919,-0.51214,5.957264,7.0,-17.615349
4,Arman,Tajik,M,DE,2001,1,19,11.650607,1.509486,7.145962,7.0,77.954851


In [5]:
# Preview the first rows of sample B.
B.head()

Unnamed: 0,name,family name,gender,country,birth year,treatment,X1,X2,X3,X4,X5,Y
0,Michael,Frenz,M,DE,1939,0,81,7.370717,0.402362,6.745988,7.0,8.969322
1,O Nyx,Dangi,F,DE,1939,0,81,6.101504,1.355549,6.910107,7.0,29.181946
2,Geraldine,Power,F,IE,2019,0,1,8.229069,-0.725252,6.964056,7.0,-24.909288
3,Frank,Schmitt,M,DE,1939,1,81,6.447628,3.086866,5.926535,7.0,129.746036
4,Juzew,Breszka,M,PL,2003,0,17,9.905752,1.023001,7.369937,7.0,36.40197


In [6]:
def levenshtein_similarity(a, b, threshold=0.95):
    """Binary agreement of two strings based on normalised Levenshtein similarity.

    The similarity is ``1 - distance / max(len(a), len(b))``.

    a, b: strings to compare,
    threshold: minimal similarity (inclusive) for the pair to count as agreeing.

    Return 1 when the similarity reaches the threshold, 0 otherwise.
    """
    longest = max(len(a), len(b))
    if longest == 0:
        # Two empty strings are identical; without this guard the
        # normalisation below would divide by zero.
        return 1
    similarity = 1 - textdistance.levenshtein(a, b) / longest
    return int(similarity >= threshold)

def jaro_winkler_similarity(a,b):
    """Binary agreement of two strings based on their Jaro-Winkler similarity.

    Return 1 when the similarity is at least 0.95, 0 otherwise.
    """
    score = textdistance.jaro_winkler(a, b)
    return int(score >= 0.95)

def strict_equality(a,b):
    """Exact comparison of two values: 1 when they are equal, 0 otherwise."""
    identical = (a == b)
    return 1 if identical else 0

def large_equality(a,b):
    """Tolerant comparison of two numeric scalars.

    Uses ``np.isclose`` with its default tolerances and returns 1 when the
    values are close, 0 otherwise, i.e. the same 0/1 integer convention as
    the other comparison functions of this notebook.
    """
    # TODO: to be reviewed (tolerances are numpy's defaults; NaN never matches NaN).
    return int(np.isclose(a, b))

In [7]:
def comparison_vector(A_record, B, identifiers):
    
    """ Compare one record in A with all records in B. 
        Return the binary comparison of the identifiers for one record in A with all records in B,
        as an array with one row per record in B and one column per identifier
        (columns follow the order of the keys in identifiers).

        A_record: series of one row, 
        B: dataframe, 
        identifiers: dict: k = column name, v = method in {'large','strict','levenshtein','jaro-winkler'}

        Raise KeyError when a method name is not one of the four listed above.
        With no identifiers, an array with zero columns is returned.
    """

    methods = {'jaro-winkler':jaro_winkler_similarity, 'levenshtein':levenshtein_similarity, 'strict':strict_equality, 'large':large_equality}

    if not identifiers:
        # np.concatenate cannot handle an empty list of columns.
        return np.empty((B.shape[0], 0), dtype=int)

    columns = []
    for linking_var, method_name in identifiers.items():
        method = methods[method_name]
        # The value of the A record does not change while scanning B: read it once.
        reference = A_record[linking_var]
        # Walk through the single relevant column of B instead of materialising
        # every full row of B as a Series for each identifier.
        outcomes = [method(reference, candidate) for candidate in B[linking_var]]
        columns.append(np.array(outcomes).reshape(-1,1))
    return np.concatenate(columns, axis = 1)
    
# Example: compare the first record of A with every record of B.
A_record = A.iloc[0,:]
# Linking variables (column names) and the comparison method applied to each of them.
identifiers = {'family name': 'jaro-winkler', 'gender': 'strict', 'birth year': 'strict'}
comparison_vector(A_record, B, identifiers)

array([[0, 1, 1],
       [0, 0, 1],
       [0, 0, 0],
       [0, 1, 1],
       [0, 1, 0],
       [0, 0, 0],
       [0, 1, 0],
       [0, 1, 1],
       [0, 0, 0],
       [0, 1, 0],
       [0, 1, 0],
       [0, 1, 0],
       [0, 1, 0],
       [0, 0, 0],
       [0, 0, 0],
       [0, 0, 0],
       [0, 0, 0],
       [0, 1, 1],
       [0, 0, 0],
       [0, 0, 0],
       [0, 1, 0],
       [0, 0, 0],
       [0, 1, 0],
       [0, 1, 0],
       [0, 1, 0],
       [0, 1, 0],
       [0, 0, 1],
       [0, 0, 0],
       [0, 1, 0],
       [0, 1, 0],
       [0, 1, 0],
       [0, 0, 0],
       [0, 1, 1],
       [0, 1, 0],
       [0, 1, 0],
       [0, 1, 0],
       [0, 0, 0],
       [0, 1, 0],
       [0, 0, 0],
       [0, 1, 1],
       [0, 0, 0],
       [0, 0, 0],
       [0, 1, 0],
       [0, 1, 0],
       [0, 1, 1]])

In [8]:
def exact_matches(A, B, identifiers):
    
    """ Compare A records and B records.
        Return the exact common records (where all identifiers are equals).

        A: dataframe, 
        B: dataframe,
        identifiers: dict: k = column name, v = method in {'large','strict','levenshtein','jaro-winkler'}
                     (only the column names are used here: the comparison is always exact)

        Return a dict {'A': array, 'B': array} of index labels: the i-th entries form one
        pair (record of A, record of B) sharing the same values on every identifier.
        Pairs are ordered by the values of the identifiers. When a combination of values
        occurs several times in A or in B, every A x B pairing is reported.
        A and B are left untouched.
    """

    linking_var = list(identifiers.keys())
    if not linking_var:
        # Nothing to compare on: no pair can be declared an exact match.
        return {'A':np.array([], dtype=int), 'B':np.array([], dtype=int)}

    # Keep only the linking variables plus the original index of each record.
    # Working on these copies avoids adding a helper column to the caller's frames.
    keys_A = A[linking_var].assign(_idx_A=A.index.to_numpy())
    keys_B = B[linking_var].assign(_idx_B=B.index.to_numpy())

    # An inner join pairs records across the two sources only, so duplicates
    # that live inside A alone (or inside B alone) are never reported as A-B matches.
    pairs = keys_A.merge(keys_B, on=linking_var, how='inner')
    pairs = pairs.sort_values(linking_var, kind='stable')
    return {'A':pairs['_idx_A'].to_numpy(), 'B':pairs['_idx_B'].to_numpy()}

# Example: records of A and B sharing identical values on all linking variables.
exact_matches(A, B, identifiers)    

{'A': array([ 9, 40, 32, 41,  1, 16, 39, 13,  4,  5]),
 'B': array([41, 39, 40, 44, 38, 43, 36, 42, 35, 37])}

In [9]:
# Parameters
# match: for each linking variable, probability that the values agree when the two records are a true match
#        (fixed here to 0.95 for every linking variable)
# unmatch: for each linking variable, probability that the values agree for any pair of records,
#          estimated as the share of agreeing pairs among all nA * nB pairs
# NOTE(review): an entry of unmatch equal to 0 or 1 would make the log-ratios used later infinite -- confirm this cannot happen.
match = np.repeat(0.95, len(identifiers.keys()))
unmatch = A.apply(lambda row: comparison_vector(row, B, identifiers).sum(axis=0), axis=1).sum() / (A.shape[0]*B.shape[0]) # per A record: number of B records agreeing on each linking variable; summed over A and divided by the nA * nB pairs

In [10]:
def linking_score(A, B, identifiers, match, unmatch):
        
    """ Compare records in A with records in B, computing all linking scores for records in A with records in B. 
        Return the indices of records in A with the best match index for record in B.

        The score of a pair is the sum, over the identifiers, of log2(match/unmatch) when the
        identifier agrees and of log2((1-match)/(1-unmatch)) when it disagrees.

        A: dataframe, 
        B: dataframe, 
        identifiers: dict: k = column name, v = method in {'large','strict','levenshtein','jaro-winkler'},
        match: array of probabilities of having same linking variables when being a match,
        unmatch: array of probabilities of having same linking variables (at all, among the nA x nB pairs of record).

        Return a dict with
            'A': positions 0..nA-1 of the records in A,
            'B': for each record in A, position in B of the record with the highest score
                 (first one in case of ties),
            'scores': the corresponding highest scores.

        Raise ValueError when A has records but B has none.
    """

    if A.shape[0] > 0 and B.shape[0] == 0:
        raise ValueError("B must contain at least one record to link against.")

    linking_vars = list(identifiers.keys())

    # Keep the probabilities strictly inside (0, 1): a value of exactly 0 or 1 would give
    # infinite log-ratios, hence NaN scores (0 * inf) and a meaningless argmax.
    eps = np.finfo(float).eps
    match_prob = np.clip(np.asarray(match, dtype=float), eps, 1 - eps)
    unmatch_prob = np.clip(np.asarray(unmatch, dtype=float), eps, 1 - eps)

    # The weights do not depend on the record: compute them once.
    agreement_weight = np.log2(match_prob / unmatch_prob)
    disagreement_weight = np.log2((1 - match_prob) / (1 - unmatch_prob))

    best_positions = []
    best_scores = []
    # Explicit loop rather than DataFrame.apply, whose return type depends on the
    # shape of A (an empty A does not yield a sequence of (position, score) tuples).
    for _, record in A.iterrows():
        similarities = comparison_vector(record[linking_vars], B, identifiers)
        scores = (similarities * agreement_weight + (1 - similarities) * disagreement_weight).sum(axis=1)
        best = scores.argmax()
        best_positions.append(best)
        best_scores.append(scores[best])

    idx_in_A = np.arange(A.shape[0])
    idx_in_B = np.array(best_positions, dtype=int)
    matching_scores = np.array(best_scores, dtype=float)
    return {'A':idx_in_A, 'B':idx_in_B, 'scores':matching_scores}

# Example: best-scoring record of B for every record of A.
linking_score(A, B, identifiers, match, unmatch)

{'A': array([ 0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 16,
        17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33,
        34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49]),
 'B': array([ 0, 38,  0, 28, 35, 37, 36,  5,  0, 41,  1, 22, 38, 42, 33, 33, 43,
        24, 22,  5,  1, 38,  5, 22,  5, 16,  5,  1, 22, 21,  0, 28, 40,  0,
        22,  0, 33,  0, 28, 36, 39, 44,  1, 33, 33, 29, 10,  0,  5,  0]),
 'scores': array([ 0.22196919, 12.13910389, -7.71401432,  0.22196919, 12.13910389,
        12.13910389,  4.20312039, -3.79463278, -7.71401432, 12.13910389,
        -7.71401432,  0.22196919, -3.79463278, 12.13910389,  0.22196919,
         0.22196919, 12.13910389,  0.22196919,  0.22196919,  0.22196919,
        -7.71401432,  0.22196919,  0.22196919,  0.22196919,  0.22196919,
         0.22196919, -3.79463278, -7.71401432,  0.22196919,  0.22196919,
        -7.71401432,  0.22196919, 12.13910389, -7.71401432,  0.22196919,
        -7.714014

In [6]:
def Estimate_Tethered_Stopping_Rule(A, B, identifiers, match, unmatch, strata):

    """ Work in progress: the stopping rule itself is not implemented yet.
        So far the function only computes the exact matches between A and B and the
        best-scoring link in B for every record in A; it returns nothing.

        A: dataframe, 
        B: dataframe, 
        identifiers: dict: k = column name, v = method in {'large','strict','levenshtein','jaro-winkler'},
        match: array of probabilities of having same linking variables when being a match,
        unmatch: array of probabilities of having same linking variables (at all, among the nA x nB pairs of record),
        strata: currently unused -- TODO document once the stopping rule is written.
    """
    # Records of A and B that agree exactly on every identifier.
    correct_links = exact_matches(A, B, identifiers)
    # Best candidate in B (and its score) for each record in A.
    matchings = linking_score(A, B, identifiers, match, unmatch)


SyntaxError: incomplete input (284073500.py, line 3)