In [1]:
from splink import DuckDBAPI, Linker, SettingsCreator, block_on, splink_datasets
from arfpy import arf
import numpy as np
import pandas as pds
from matplotlib import pyplot as plt
import xgboost as xgb
from sklearn.metrics import accuracy_score, roc_auc_score
from splink import DuckDBAPI, block_on
from splink.blocking_analysis import (
    cumulative_comparisons_to_be_scored_from_blocking_rules_chart,
)
import splink.comparison_level_library as cll
import splink.comparison_library as cl
import gc
import math

In [2]:
def one_round(list_params_key, linkage, nA, nB, overlap, save_df, setting_info, dedup=False):

    df_origin = pds.DataFrame(np.array([np.round(np.random.normal(size=nA)*list_params[list_params_key][0]+50,0),
                            np.round(np.random.beta(0.2, 0.2, size=nA)*list_params[list_params_key][1]+1,0),
                            np.random.randint(0, list_params[list_params_key][2], size=nA)+1,
                            np.random.randint(0, list_params[list_params_key][3], size=nA)+1,
                            np.round(np.random.beta(1, 2, size=nA)*list_params[list_params_key][4]+1,0)]).transpose())
    
    df_destination = pds.DataFrame(np.array([np.round(np.random.normal(size=nB)*list_params[list_params_key][0]+50,0),
                                np.round(np.random.beta(0.2, 0.2, size=nB)*list_params[list_params_key][1]+1,0),
                                np.random.randint(0, list_params[list_params_key][2], size=nB)+1,
                                np.random.randint(0, list_params[list_params_key][3], size=nB)+1,
                                np.round(np.random.beta(1, 2, size=nB)*list_params[list_params_key][4]+1,0)]).transpose())
            
    df_origin.loc[:,"link"] = False
    df_destination.loc[:,"link"] = False

    # "at_random", "strong_dep_pivs", "light_dep_pivs"

    if linkage == "strong_dep_pivs":

        # selection on piv
        df_origin.loc[ (df_origin[2]) > (df_origin[2]).quantile(1-overlap),"link"] = True
    
        df_origin.loc[ df_origin.link, 0 ] = np.round(np.random.normal(size=df_origin.link.sum())*(list_params[list_params_key][0]), 0)
        df_origin.loc[ df_origin.link, 2 ] = np.random.randint(list_params[list_params_key][2], 2* list_params[list_params_key][2], size=df_origin.link.sum())+1
        df_origin.loc[ df_origin.link, 4 ] = np.round(np.random.beta(2, 1, size=df_origin.link.sum())*list_params[list_params_key][4], 0)

    elif linkage == "light_dep_pivs":

        # selection on piv
        df_origin.loc[ (df_origin[2]) > (df_origin[2]).quantile(1-overlap),"link"] = True
    
        # df_origin.loc[ df_origin.link, 0 ] = np.round(np.random.normal(size=df_origin.link.sum())*list_params[list_params_key][0]+45, 0)
        # # df_origin.loc[ df_origin.link, 2 ] = np.random.randint(0, 0.5*list_params[list_params_key][2], size=df_origin.link.sum())+1
        # df_origin.loc[ df_origin.link, 4 ] = np.round(np.random.beta(2, 1, size=df_origin.link.sum())*list_params[list_params_key][4], 0)

    elif linkage == "at_random":

        rng = np.random.default_rng()
        rdgen = rng.choice(df_origin.shape[0], size=int(overlap*df_origin.shape[0]), replace=False)
        df_origin.loc[rdgen,"link"] = True

    df_origin.loc[:,"unique_id"] = range(1,1+df_origin.shape[0])
    df_destination.loc[:,"unique_id"] = range(5000001,5000001+df_destination.shape[0])

    df_origin.loc[:,"synthetic"] = False
    df_destination.loc[:,"synthetic"] = False

    df_origin.columns = ["piv1", "piv2", "piv3", "piv4", "piv5", "link", "unique_id", "synthetic"]
    df_destination.columns = ["piv1", "piv2", "piv3", "piv4", "piv5", "link", "unique_id", "synthetic"]
        
    # links get identical in the other file:
    df_destination.iloc[np.where(df_origin.link)[0]] = df_origin.iloc[np.where(df_origin.link)[0]]

    df_origin.loc[:,"duplication"] = False
    df_destination.loc[:,"duplication"] = False
    
    if dedup:
        
        # we augment each file with 5% of its size with duplicated records
        # 2.5% of the links get duplicated (at random)
        # 2.5% of the non-links get duplicated (at random)
        
        rng = np.random.default_rng()
        rdgen = rng.choice(np.where(df_origin.link)[0], size=int(0.025*df_origin.shape[0]), replace=False)
        df_origin.loc[rdgen,"duplication"] = True
        rng = np.random.default_rng()
        rdgen = rng.choice(np.where(~df_origin.link)[0], size=int(0.025*df_origin.shape[0]), replace=False)
        df_origin.loc[rdgen,"duplication"] = True
        
        rng = np.random.default_rng()
        rdgen = rng.choice(np.where(df_destination.link)[0], size=int(0.025*df_destination.shape[0]), replace=False)
        df_destination.loc[rdgen,"duplication"] = True
        rng = np.random.default_rng()
        rdgen = rng.choice(np.where(~df_destination.link)[0], size=int(0.025*df_destination.shape[0]), replace=False)
        df_destination.loc[rdgen,"duplication"] = True

        # duplicate:
        new_set = df_origin.loc[df_origin.duplication,:].copy()
        df_origin = pds.concat([df_origin, new_set])
        new_set = df_destination.loc[df_destination.duplication,:].copy()
        df_destination = pds.concat([df_destination, new_set])

        nA = df_origin.shape[0]
        nB = df_destination.shape[0]

        df_origin = df_origin.reset_index(drop=True)
        df_destination = df_destination.reset_index(drop=True)

    for column in df_destination.columns:
        unique_values = df_destination[column].unique()
        print(f"Unique values in column '{column}': {len(unique_values)}")

    # info to save
    overlapInA = sum(df_origin.link)/df_origin.shape[0]
    overlapInB = sum(df_destination.link)/df_destination.shape[0]
    difficultyInA = len(np.unique(df_origin[["piv1", "piv2", "piv3", "piv4", "piv5"]].apply(lambda row: '_'.join(row.values.astype(str)), axis=1)))/df_origin.shape[0]
    difficultyInB = len(np.unique(df_destination[["piv1", "piv2", "piv3", "piv4", "piv5"]].apply(lambda row: '_'.join(row.values.astype(str)), axis=1)))/df_destination.shape[0]
    difficultyInL = len(np.unique(df_destination.loc[df_destination.link,["piv1", "piv2", "piv3", "piv4", "piv5"]].apply(lambda row: '_'.join(row.values.astype(str)), axis=1)))/df_destination.link.sum()
    difficultyInNL = len(np.unique(df_destination.loc[~df_destination.link,["piv1", "piv2", "piv3", "piv4", "piv5"]].apply(lambda row: '_'.join(row.values.astype(str)), axis=1)))/(df_destination.shape[0]-df_destination.link.sum())

    print(difficultyInB)
    max_id_ = max( max(df_destination.unique_id), max(df_origin.unique_id) )

    #
    # print("PIV1")
    # print("links profile")
    # plt.show(plt.hist(df_destination.loc[df_destination.link,"piv1"]))
    # print("PIV1")
    # print("non-links profile")
    # plt.show(plt.hist(df_destination.loc[~df_destination.link,"piv1"]))
    # print("PIV2")
    # print("links profile")
    # plt.show(plt.hist(df_destination.loc[df_destination.link,"piv2"]))
    # print("PIV2")
    # print("non-links profile")
    # plt.show(plt.hist(df_destination.loc[~df_destination.link,"piv2"]))
    # print("PIV3")
    # print("links profile")
    # plt.show(plt.hist(df_destination.loc[df_destination.link,"piv3"]))
    # print("PIV3")
    # print("non-links profile")
    # plt.show(plt.hist(df_destination.loc[~df_destination.link,"piv3"]))
    # print("PIV4")
    # print("links profile")
    # plt.show(plt.hist(df_destination.loc[df_destination.link,"piv4"]))
    # print("PIV4")
    # print("non-links profile")
    # plt.show(plt.hist(df_destination.loc[~df_destination.link,"piv4"]))
    # print("PIV5")
    # print("links profile")
    # plt.show(plt.hist(df_destination.loc[df_destination.link,"piv5"]))
    # print("PIV5")
    # print("non-links profile")
    # plt.show(plt.hist(df_destination.loc[~df_destination.link,"piv5"]))
    #

    # Train the ARF
    my_arf = arf.arf(x = df_destination[["piv1", "piv2", "piv3", "piv4", "piv5"]])
    
    # Get density estimates
    my_arf.forde()

    nS = round(0.10*df_destination.shape[0])
    
    # Generate data
    synthdata = my_arf.forge(n = nS)
    
    synthdata['unique_id'] = max_id_ + np.array(range(1,1+nS))
    synthdata['link'] = False
    synthdata['synthetic'] = True
    
    synthetictest = synthdata.copy()
    synthetictest[["piv1", "piv2", "piv3", "piv4", "piv5"]] = round(synthetictest[["piv1", "piv2", "piv3", "piv4", "piv5"]],0)
    
    # investigate links profiles / synthetic profiles
    
    Xtrain = pds.concat([ df_destination.loc[0:round(nB/2,0),["piv1", "piv2", "piv3", "piv4", "piv5"]], synthetictest.loc[0:round(nS/2,0),["piv1", "piv2", "piv3", "piv4", "piv5"]]  ])
    ytrain = pds.concat([ df_destination.loc[0:round(nB/2,0),["synthetic"]], synthetictest.loc[0:round(nS/2,0),["synthetic"]]  ])

    print(round(nB/2,0))
    print(df_destination.index)
        
    Xtest = pds.concat([ df_destination.loc[round(nB/2,0):nB,["piv1", "piv2", "piv3", "piv4", "piv5"]], synthetictest.loc[round(nS/2,0):nS,["piv1", "piv2", "piv3", "piv4", "piv5"]]  ])
    ytest = pds.concat([ df_destination.loc[round(nB/2,0):nB,["synthetic"]], synthetictest.loc[round(nS/2,0):nS,["synthetic"]]  ])

    # info to save
    ytrain_synth_prop = ytrain.sum()/ytrain.shape[0]
    ytest_synth_prop = ytest.sum()/ytest.shape[0]
    
    model = xgb.XGBClassifier()
    model.fit(Xtrain, ytrain)
    predictions = model.predict(Xtest)
    y_score = model.predict_proba(Xtest)[:, 1]
    auc = roc_auc_score(ytest, y_score)

    # info to save
    auc_synth = auc

    rng = np.random.default_rng()
    rdgen = rng.choice(np.where(df_destination.link)[0], size=sum(df_destination.link), replace=False)
    rng2 = np.random.default_rng()
    rdgen2 = rng.choice(np.where(~df_destination.link)[0], size=sum(~df_destination.link), replace=False)
    
    XtrainL1 = df_destination.loc[rdgen[0:int(len(rdgen)/2)],["piv1", "piv2", "piv3", "piv4", "piv5"]]
    XtrainL2 = df_destination.loc[rdgen2[0:int(len(rdgen2)/2)],["piv1", "piv2", "piv3", "piv4", "piv5"]]
    XtrainL = pds.concat([XtrainL1,XtrainL2])
    ytrainL1 = df_destination.loc[rdgen[0:int(len(rdgen)/2)],["link"]]
    ytrainL2 = df_destination.loc[rdgen2[0:int(len(rdgen2)/2)],["link"]]
    ytrainL = pds.concat([ytrainL1,ytrainL2])
    
    XtestL1 = df_destination.loc[rdgen[int(len(rdgen)/2):len(rdgen)],["piv1", "piv2", "piv3", "piv4", "piv5"]]
    XtestL2 = df_destination.loc[rdgen2[int(len(rdgen2)/2):len(rdgen2)],["piv1", "piv2", "piv3", "piv4", "piv5"]]
    XtestL = pds.concat([XtestL1,XtestL2])
    ytestL1 = df_destination.loc[rdgen[int(len(rdgen)/2):len(rdgen)],["link"]]
    ytestL2 = df_destination.loc[rdgen2[int(len(rdgen2)/2):len(rdgen2)],["link"]]
    ytestL = pds.concat([ytestL1,ytestL2])

    # info to save
    ytrain_link_prop = ytrainL.sum()/ytrainL.shape[0]
    ytest_link_prop = ytestL.sum()/ytestL.shape[0]
    
    model = xgb.XGBClassifier()
    model.fit(XtrainL, ytrainL)
    predictions = model.predict(XtestL)
    y_scoreL = model.predict_proba(XtestL)[:, 1]
    auc = roc_auc_score(ytestL, y_scoreL)

    # info to save
    auc_link = auc

    NEW_df_destination = pds.concat([df_destination, synthetictest])

    NEW_df_destination[["piv1", "piv2", "piv3", "piv4", "piv5"]] = NEW_df_destination[["piv1", "piv2", "piv3", "piv4", "piv5"]].astype(int)
    df_origin[["piv1", "piv2", "piv3", "piv4", "piv5"]] = df_origin[["piv1", "piv2", "piv3", "piv4", "piv5"]].astype(int)

    if save_df:

        NEW_df_destination.to_csv(f'{setting_info}_df_destination{_}.csv', index=False)
        df_origin.to_csv(f'{setting_info}_df_origin{_}.csv', index=False)
    
    blocking_rule_1 = """
    round(l.piv1) = round(r.piv1) and
    round(l.piv2) = round(r.piv2) and
    round(l.piv3) = round(r.piv3) and
    round(l.piv4) = round(r.piv4) and
    round(l.piv5) = round(r.piv5)
    """
    
    db_api = DuckDBAPI()
    
    brs = [
      blocking_rule_1,
    ]
    
    comparison_1 = {
      "output_column_name": "piv1",
      "comparison_levels": [
          cll.NullLevel("piv1"),
          cll.ExactMatchLevel("piv1"),
          cll.ElseLevel(),
      ],
      "comparison_description": "piv1 difference",
    }
    
    comparison_2 = {
      "output_column_name": "piv2",
      "comparison_levels": [
          cll.NullLevel("piv2"),
          cll.ExactMatchLevel("piv2"),
          cll.ElseLevel(),
      ],
      "comparison_description": "piv2 difference",
    }
    
    comparison_3 = {
      "output_column_name": "piv3",
      "comparison_levels": [
          cll.NullLevel("piv3"),
          cll.ExactMatchLevel("piv3"),
          cll.ElseLevel(),
      ],
      "comparison_description": "piv3 difference",
    }
    
    comparison_4 = {
      "output_column_name": "piv4",
      "comparison_levels": [
          cll.NullLevel("piv4"),
          cll.ExactMatchLevel("piv4"),
          cll.ElseLevel(),
      ],
      "comparison_description": "piv4 difference",
    }
    
    comparison_5 = {
      "output_column_name": "piv5",
      "comparison_levels": [
          cll.NullLevel("piv5"),
          cll.ExactMatchLevel("piv5"),
          cll.ElseLevel(),
      ],
      "comparison_description": "piv5 difference",
    }
    
    settings = SettingsCreator(
      link_type="link_only",
      blocking_rules_to_generate_predictions=brs,
      comparisons=[
          comparison_1,
          comparison_2,
          comparison_3,
          comparison_4,
          comparison_5,
      ],
      retain_intermediate_calculation_columns=True,
    )
    
    linker = Linker(
      [df_origin, NEW_df_destination],
      settings,
      input_table_aliases=["__ori", "_dest"],
      db_api=db_api,
    )
    
    linker.training.estimate_u_using_random_sampling(max_pairs=1e7)
    
    # linker.training.estimate_parameters_using_expectation_maximisation(block_on("piv1"))
    # linker.training.estimate_parameters_using_expectation_maximisation(block_on("piv2"))
    # linker.training.estimate_parameters_using_expectation_maximisation(block_on("piv3"))
    # linker.training.estimate_parameters_using_expectation_maximisation(block_on("piv4"))
    # linker.training.estimate_parameters_using_expectation_maximisation(block_on("piv5"))
    linker.training.estimate_parameters_using_expectation_maximisation(block_on("piv1","piv2","piv3"))
    linker.training.estimate_parameters_using_expectation_maximisation(block_on("piv4","piv5"))
    
    df_predict = linker.inference.predict(threshold_match_probability=0.00001)

    # info to save
    max_proba_linked = max(df_predict.as_pandas_dataframe().match_probability)
    
    median_proba_linked = np.median(df_predict.as_pandas_dataframe().match_probability)
    median_proba_linked = math.floor(median_proba_linked * 100) / 100

    linkedpairs05 = df_predict.as_pandas_dataframe()

    # we cheat because the model sometimes does not link anything
    # if there are not enough linked pairs: threshold is set at median match_probability
    # if there are enough linked pairs: threshold is set at max(0.5, median match_probability) (this may help for 1-2-1 assignment constraint)
    # in general we cheat because we do not enforce 1-2-1 assignment constraint (it is not implemented in SPLink)
    if max_proba_linked >= 0.5:
        threshold_for_links = 0.5 # max(0.5, median_proba_linked)
    else:
        threshold_for_links = median_proba_linked
        
    linkedpairs05 = linkedpairs05.loc[linkedpairs05.match_probability>threshold_for_links,:]
    
    synthlinkedpairs = linkedpairs05[linkedpairs05.unique_id_r > max_id_]
    reallinkedpairs = linkedpairs05[linkedpairs05.unique_id_r <= max_id_]

    tp = 0
    fp = 0
    true_fdr = 0
    hat_fdr_us = 0
    hat_fdr_prob = 0

    if reallinkedpairs.shape[0]!=0 or synthlinkedpairs.shape[0]!=0:
    
        tp = (reallinkedpairs.unique_id_l == reallinkedpairs.unique_id_r).sum()
        fp = (reallinkedpairs.unique_id_l != reallinkedpairs.unique_id_r).sum()
        true_fdr = fp/(tp+fp)
        hat_fdr_us = ( synthlinkedpairs.shape[0] * (nB / nS) ) / reallinkedpairs.shape[0]
        hat_fdr_prob = 1 - reallinkedpairs[reallinkedpairs.match_probability>threshold_for_links].match_probability.sum() / reallinkedpairs[reallinkedpairs.match_probability>threshold_for_links].shape[0]
        
    # info to save
    synth_pairs_prop = synthlinkedpairs.shape[0] / nS
    real_pairs_prop = fp / nB
    
    return overlapInA, overlapInB, difficultyInA, difficultyInB, difficultyInL, difficultyInNL, ytrain_synth_prop, ytest_synth_prop, auc_synth, ytrain_link_prop, ytest_link_prop, auc_link, max_proba_linked, median_proba_linked, threshold_for_links, synth_pairs_prop, real_pairs_prop, true_fdr, hat_fdr_us, hat_fdr_prob        


In [None]:
# STUDY SCALABILITY

linkagestructures = ["at_random"]
nAlist = [100000]
list_params = { 85:[10,16,10,15,14], 95:[10,20,15,25,20] }       
set_of_overlap = [0.35, 0.75]      

nB = 200000
save_df = False # True

In [3]:
# STUDY ROBUSTNESS TO LINKS HAPPEN AT RANDOM

linkagestructures = ["at_random", "strong_dep_pivs"] # "light_dep_pivs"
nAlist = [2002, 4502]
list_params = { 85:[1.5,7,7,8,8], 95:[2,9,9,10,10] } 
set_of_overlap = [0.35, 0.75]      

nB = 5000
save_df = False # True

In [None]:
# STUDY ROBUSTNESS TO THE PRESENCE OF NON-DEDUPLICATED DATA

linkagestructures = ["at_random"]
nAlist = [2002, 4502]
list_params = { 85:[10,16,10,15,14], 95:[10,20,15,25,20] }       
set_of_overlap = [0.35, 0.75]      

nB = 5000
save_df = False # True
# change the dedup parameter below in the code!!!

In [None]:
                          
dico_results_final = {}

for linkage in linkagestructures:
    
    dico_results_linkage = {}
    
    for nA in nAlist:
    
        dico_results_size = {}
        
        for list_params_key in list_params.keys():
            
            dico_results_setting = {}
            
            for overlap in set_of_overlap:
            
                print("OVERLAP")
                print(overlap)
        
                iter_secu = 0
            
                overlapInA_res = []
                overlapInB_res = []
                difficultyInA_res = []
                difficultyInB_res = []
                difficultyInL_res = []
                difficultyInNL_res = []
                ytrain_synth_prop_res = []
                ytest_synth_prop_res = []
                auc_synth_res = []
                ytrain_link_prop_res = []
                ytest_link_prop_res = []
                auc_link_res = []
                max_proba_linked_res = []
                median_proba_linked_res = []
                threshold_for_links_res = []
                synth_pairs_prop_res = []
                real_pairs_prop_res = []
                true_fdr_res = []
                hat_fdr_us_res = []
                hat_fdr_prob_res = []
        
                setting_info = f"links_{linkage}_nA_{nA}_discrlevel_{list_params_key}_overlaplevel_{overlap}_highdim"   
            
                while len(np.nonzero(true_fdr_res)[0]) < 10 and iter_secu < 15:
        
                    iter_secu += 1
            
                    print("ITERATION")
                    print(iter_secu)
        
                    overlapInA, overlapInB, difficultyInA, difficultyInB, difficultyInL, difficultyInNL, ytrain_synth_prop, ytest_synth_prop, auc_synth, ytrain_link_prop, ytest_link_prop, auc_link, max_proba_linked, median_proba_linked, threshold_for_links, synth_pairs_prop, real_pairs_prop, true_fdr, hat_fdr_us, hat_fdr_prob = one_round(list_params_key, linkage, nA, nB, overlap, save_df, setting_info)   # # #   
        
                    overlapInA_res.append(overlapInA)
                    overlapInB_res.append(overlapInB)
                    difficultyInA_res.append(difficultyInA)
                    difficultyInB_res.append(difficultyInB)
                    difficultyInL_res.append(difficultyInL)
                    difficultyInNL_res.append(difficultyInNL)
                    ytrain_synth_prop_res.append(ytrain_synth_prop)
                    ytest_synth_prop_res.append(ytest_synth_prop)
                    auc_synth_res.append(auc_synth)
                    ytrain_link_prop_res.append(ytrain_link_prop)
                    ytest_link_prop_res.append(ytest_link_prop)
                    auc_link_res.append(auc_link)
                    max_proba_linked_res.append(max_proba_linked)
                    median_proba_linked_res.append(median_proba_linked)
                    threshold_for_links_res.append(threshold_for_links)
                    synth_pairs_prop_res.append(synth_pairs_prop)
                    real_pairs_prop_res.append(real_pairs_prop)
                    true_fdr_res.append(true_fdr)
                    hat_fdr_us_res.append(hat_fdr_us)
                    hat_fdr_prob_res.append(hat_fdr_prob)
                
                    dico_results_setting[overlap] = {
                        
                        "overlapInA" : overlapInA_res,
                        "overlapInB" : overlapInB_res,
                        "difficultyInA" : difficultyInA_res,
                        "difficultyInB" : difficultyInB_res,
                        "difficultyInL" : difficultyInL_res,
                        "difficultyInNL" : difficultyInNL_res,
                        "ytrain_synth_prop" : ytrain_synth_prop_res,
                        "ytest_synth_prop" : ytest_synth_prop_res,
                        "auc_synth" : auc_synth_res,
                        "ytrain_link_prop" : ytrain_link_prop_res,
                        "ytest_link_prop" : ytest_link_prop_res,
                        "auc_link" : auc_link_res,
                        "max_proba_linked" : max_proba_linked_res,
                        "median_proba_linked" : median_proba_linked_res,
                        "threshold_for_links" : threshold_for_links_res,
                        "synth_pairs_prop" : synth_pairs_prop_res,
                        "real_pairs_prop" : real_pairs_prop_res,
                        "true_fdr" : true_fdr_res,
                        "hat_fdr_us" : hat_fdr_us_res,
                        "hat_fdr_prob" : hat_fdr_prob_res
                        
                    }
        
                    print(nA)
                    print(list_params_key)
                    print(overlap)
                    print(dico_results_setting)
                
                    gc.collect()
        
            dico_results_size[list_params_key] = dico_results_setting
        
        dico_results_linkage[nA] = dico_results_size
    
    dico_results_final[linkage] = dico_results_linkage


OVERLAP
0.35
ITERATION
1
Unique values in column 'piv1': 11
Unique values in column 'piv2': 8
Unique values in column 'piv3': 7
Unique values in column 'piv4': 8
Unique values in column 'piv5': 9
Unique values in column 'link': 2
Unique values in column 'unique_id': 5000
Unique values in column 'synthetic': 1
Unique values in column 'duplication': 1
0.7932
Initial accuracy is 0.4904


  if self.factor_cols[j]:


2500.0
RangeIndex(start=0, stop=5000, step=1)


----- Estimating u probabilities using random sampling -----

Estimated u probabilities using random sampling

Your model is not yet fully trained. Missing estimates for:
    - piv1 (no m values are trained).
    - piv2 (no m values are trained).
    - piv3 (no m values are trained).
    - piv4 (no m values are trained).
    - piv5 (no m values are trained).

----- Starting EM training session -----

Estimating the m probabilities of the model by blocking on:
(l."piv1" = r."piv1") AND (l."piv2" = r."piv2") AND (l."piv3" = r."piv3")

Parameter estimates will be made for the following comparison(s):
    - piv4
    - piv5

Parameter estimates cannot be made for the following comparison(s) since they are used in the blocking rules: 
    - piv1
    - piv2
    - piv3

Iteration 1: Largest change in params was -0.00432 in the m_probability of piv5, level `Exact match on piv5`
Iteration 2: Largest change in params was 0.00281 in the m_probability of piv5, level `All other comparisons`
Iteratio

2002
85
0.35
{0.35: {'overlapInA': [0.34965034965034963], 'overlapInB': [0.14], 'difficultyInA': [0.8921078921078921], 'difficultyInB': [0.7932], 'difficultyInL': [np.float64(0.9542857142857143)], 'difficultyInNL': [np.float64(0.8181395348837209)], 'ytrain_synth_prop': [synthetic    0.091206
dtype: float64], 'ytest_synth_prop': [synthetic    0.090909
dtype: float64], 'auc_synth': [np.float64(0.4827088)], 'ytrain_link_prop': [link    0.14
dtype: float64], 'ytest_link_prop': [link    0.14
dtype: float64], 'auc_link': [np.float64(0.49102591362126247)], 'max_proba_linked': [0.41406256855084755], 'median_proba_linked': [0.41], 'threshold_for_links': [0.41], 'synth_pairs_prop': [0.212], 'real_pairs_prop': [np.float64(0.2148)], 'true_fdr': [np.float64(0.6054114994363021)], 'hat_fdr_us': [0.5975197294250282], 'hat_fdr_prob': [np.float64(0.5859374314491524)]}}
ITERATION
2
Unique values in column 'piv1': 12
Unique values in column 'piv2': 8
Unique values in column 'piv3': 7
Unique values in colu

  if self.factor_cols[j]:


2500.0
RangeIndex(start=0, stop=5000, step=1)


----- Estimating u probabilities using random sampling -----

Estimated u probabilities using random sampling

Your model is not yet fully trained. Missing estimates for:
    - piv1 (no m values are trained).
    - piv2 (no m values are trained).
    - piv3 (no m values are trained).
    - piv4 (no m values are trained).
    - piv5 (no m values are trained).

----- Starting EM training session -----

Estimating the m probabilities of the model by blocking on:
(l."piv1" = r."piv1") AND (l."piv2" = r."piv2") AND (l."piv3" = r."piv3")

Parameter estimates will be made for the following comparison(s):
    - piv4
    - piv5

Parameter estimates cannot be made for the following comparison(s) since they are used in the blocking rules: 
    - piv1
    - piv2
    - piv3

Iteration 1: Largest change in params was -0.00192 in the m_probability of piv5, level `Exact match on piv5`
Iteration 2: Largest change in params was 0.00116 in the m_probability of piv5, level `All other comparisons`
Iteratio

2002
85
0.35
{0.35: {'overlapInA': [0.34965034965034963, 0.34965034965034963], 'overlapInB': [0.14, 0.14], 'difficultyInA': [0.8921078921078921, 0.9000999000999002], 'difficultyInB': [0.7932, 0.7902], 'difficultyInL': [np.float64(0.9542857142857143), np.float64(0.9585714285714285)], 'difficultyInNL': [np.float64(0.8181395348837209), np.float64(0.8137209302325581)], 'ytrain_synth_prop': [synthetic    0.091206
dtype: float64, synthetic    0.091206
dtype: float64], 'ytest_synth_prop': [synthetic    0.090909
dtype: float64, synthetic    0.090909
dtype: float64], 'auc_synth': [np.float64(0.4827088), np.float64(0.5224783999999999)], 'ytrain_link_prop': [link    0.14
dtype: float64, link    0.14
dtype: float64], 'ytest_link_prop': [link    0.14
dtype: float64, link    0.14
dtype: float64], 'auc_link': [np.float64(0.49102591362126247), np.float64(0.5131189368770764)], 'max_proba_linked': [0.41406256855084755, 0.39850141654742727], 'median_proba_linked': [0.41, 0.39], 'threshold_for_links': [0.

  if self.factor_cols[j]:


2500.0
RangeIndex(start=0, stop=5000, step=1)


----- Estimating u probabilities using random sampling -----

Estimated u probabilities using random sampling

Your model is not yet fully trained. Missing estimates for:
    - piv1 (no m values are trained).
    - piv2 (no m values are trained).
    - piv3 (no m values are trained).
    - piv4 (no m values are trained).
    - piv5 (no m values are trained).

----- Starting EM training session -----

Estimating the m probabilities of the model by blocking on:
(l."piv1" = r."piv1") AND (l."piv2" = r."piv2") AND (l."piv3" = r."piv3")

Parameter estimates will be made for the following comparison(s):
    - piv4
    - piv5

Parameter estimates cannot be made for the following comparison(s) since they are used in the blocking rules: 
    - piv1
    - piv2
    - piv3

Iteration 1: Largest change in params was 0.00327 in the m_probability of piv5, level `All other comparisons`
Iteration 2: Largest change in params was 0.00232 in the m_probability of piv5, level `All other comparisons`
Iterati

2002
85
0.35
{0.35: {'overlapInA': [0.34965034965034963, 0.34965034965034963, 0.34965034965034963], 'overlapInB': [0.14, 0.14, 0.14], 'difficultyInA': [0.8921078921078921, 0.9000999000999002, 0.9075924075924076], 'difficultyInB': [0.7932, 0.7902, 0.789], 'difficultyInL': [np.float64(0.9542857142857143), np.float64(0.9585714285714285), np.float64(0.9728571428571429)], 'difficultyInNL': [np.float64(0.8181395348837209), np.float64(0.8137209302325581), np.float64(0.8118604651162791)], 'ytrain_synth_prop': [synthetic    0.091206
dtype: float64, synthetic    0.091206
dtype: float64, synthetic    0.091206
dtype: float64], 'ytest_synth_prop': [synthetic    0.090909
dtype: float64, synthetic    0.090909
dtype: float64, synthetic    0.090909
dtype: float64], 'auc_synth': [np.float64(0.4827088), np.float64(0.5224783999999999), np.float64(0.537624)], 'ytrain_link_prop': [link    0.14
dtype: float64, link    0.14
dtype: float64, link    0.14
dtype: float64], 'ytest_link_prop': [link    0.14
dtype: 

  if self.factor_cols[j]:


2500.0
RangeIndex(start=0, stop=5000, step=1)


----- Estimating u probabilities using random sampling -----

Estimated u probabilities using random sampling

Your model is not yet fully trained. Missing estimates for:
    - piv1 (no m values are trained).
    - piv2 (no m values are trained).
    - piv3 (no m values are trained).
    - piv4 (no m values are trained).
    - piv5 (no m values are trained).

----- Starting EM training session -----

Estimating the m probabilities of the model by blocking on:
(l."piv1" = r."piv1") AND (l."piv2" = r."piv2") AND (l."piv3" = r."piv3")

Parameter estimates will be made for the following comparison(s):
    - piv4
    - piv5

Parameter estimates cannot be made for the following comparison(s) since they are used in the blocking rules: 
    - piv1
    - piv2
    - piv3

Iteration 1: Largest change in params was 0.00243 in the m_probability of piv5, level `All other comparisons`
Iteration 2: Largest change in params was -0.00158 in the m_probability of piv5, level `Exact match on piv5`
Iteratio

2002
85
0.35
{0.35: {'overlapInA': [0.34965034965034963, 0.34965034965034963, 0.34965034965034963, 0.34965034965034963], 'overlapInB': [0.14, 0.14, 0.14, 0.14], 'difficultyInA': [0.8921078921078921, 0.9000999000999002, 0.9075924075924076, 0.8981018981018981], 'difficultyInB': [0.7932, 0.7902, 0.789, 0.7882], 'difficultyInL': [np.float64(0.9542857142857143), np.float64(0.9585714285714285), np.float64(0.9728571428571429), np.float64(0.9671428571428572)], 'difficultyInNL': [np.float64(0.8181395348837209), np.float64(0.8137209302325581), np.float64(0.8118604651162791), np.float64(0.8102325581395349)], 'ytrain_synth_prop': [synthetic    0.091206
dtype: float64, synthetic    0.091206
dtype: float64, synthetic    0.091206
dtype: float64, synthetic    0.091206
dtype: float64], 'ytest_synth_prop': [synthetic    0.090909
dtype: float64, synthetic    0.090909
dtype: float64, synthetic    0.090909
dtype: float64, synthetic    0.090909
dtype: float64], 'auc_synth': [np.float64(0.4827088), np.float6

  if self.factor_cols[j]:


2500.0
RangeIndex(start=0, stop=5000, step=1)


----- Estimating u probabilities using random sampling -----

Estimated u probabilities using random sampling

Your model is not yet fully trained. Missing estimates for:
    - piv1 (no m values are trained).
    - piv2 (no m values are trained).
    - piv3 (no m values are trained).
    - piv4 (no m values are trained).
    - piv5 (no m values are trained).

----- Starting EM training session -----

Estimating the m probabilities of the model by blocking on:
(l."piv1" = r."piv1") AND (l."piv2" = r."piv2") AND (l."piv3" = r."piv3")

Parameter estimates will be made for the following comparison(s):
    - piv4
    - piv5

Parameter estimates cannot be made for the following comparison(s) since they are used in the blocking rules: 
    - piv1
    - piv2
    - piv3

Iteration 1: Largest change in params was -0.00329 in the m_probability of piv4, level `Exact match on piv4`
Iteration 2: Largest change in params was 0.00193 in the m_probability of piv4, level `All other comparisons`
Iteratio

2002
85
0.35
{0.35: {'overlapInA': [0.34965034965034963, 0.34965034965034963, 0.34965034965034963, 0.34965034965034963, 0.34965034965034963], 'overlapInB': [0.14, 0.14, 0.14, 0.14, 0.14], 'difficultyInA': [0.8921078921078921, 0.9000999000999002, 0.9075924075924076, 0.8981018981018981, 0.8956043956043956], 'difficultyInB': [0.7932, 0.7902, 0.789, 0.7882, 0.7862], 'difficultyInL': [np.float64(0.9542857142857143), np.float64(0.9585714285714285), np.float64(0.9728571428571429), np.float64(0.9671428571428572), np.float64(0.9642857142857143)], 'difficultyInNL': [np.float64(0.8181395348837209), np.float64(0.8137209302325581), np.float64(0.8118604651162791), np.float64(0.8102325581395349), np.float64(0.8102325581395349)], 'ytrain_synth_prop': [synthetic    0.091206
dtype: float64, synthetic    0.091206
dtype: float64, synthetic    0.091206
dtype: float64, synthetic    0.091206
dtype: float64, synthetic    0.091206
dtype: float64], 'ytest_synth_prop': [synthetic    0.090909
dtype: float64, synt

  if self.factor_cols[j]:


2500.0
RangeIndex(start=0, stop=5000, step=1)


----- Estimating u probabilities using random sampling -----

Estimated u probabilities using random sampling

Your model is not yet fully trained. Missing estimates for:
    - piv1 (no m values are trained).
    - piv2 (no m values are trained).
    - piv3 (no m values are trained).
    - piv4 (no m values are trained).
    - piv5 (no m values are trained).

----- Starting EM training session -----

Estimating the m probabilities of the model by blocking on:
(l."piv1" = r."piv1") AND (l."piv2" = r."piv2") AND (l."piv3" = r."piv3")

Parameter estimates will be made for the following comparison(s):
    - piv4
    - piv5

Parameter estimates cannot be made for the following comparison(s) since they are used in the blocking rules: 
    - piv1
    - piv2
    - piv3

Iteration 1: Largest change in params was -0.00255 in the m_probability of piv4, level `Exact match on piv4`
Iteration 2: Largest change in params was 0.00161 in the m_probability of piv4, level `All other comparisons`
Iteratio

2002
85
0.35
{0.35: {'overlapInA': [0.34965034965034963, 0.34965034965034963, 0.34965034965034963, 0.34965034965034963, 0.34965034965034963, 0.34965034965034963], 'overlapInB': [0.14, 0.14, 0.14, 0.14, 0.14, 0.14], 'difficultyInA': [0.8921078921078921, 0.9000999000999002, 0.9075924075924076, 0.8981018981018981, 0.8956043956043956, 0.9015984015984015], 'difficultyInB': [0.7932, 0.7902, 0.789, 0.7882, 0.7862, 0.7836], 'difficultyInL': [np.float64(0.9542857142857143), np.float64(0.9585714285714285), np.float64(0.9728571428571429), np.float64(0.9671428571428572), np.float64(0.9642857142857143), np.float64(0.97)], 'difficultyInNL': [np.float64(0.8181395348837209), np.float64(0.8137209302325581), np.float64(0.8118604651162791), np.float64(0.8102325581395349), np.float64(0.8102325581395349), np.float64(0.7988372093023256)], 'ytrain_synth_prop': [synthetic    0.091206
dtype: float64, synthetic    0.091206
dtype: float64, synthetic    0.091206
dtype: float64, synthetic    0.091206
dtype: float6

  if self.factor_cols[j]:


2500.0
RangeIndex(start=0, stop=5000, step=1)


----- Estimating u probabilities using random sampling -----

Estimated u probabilities using random sampling

Your model is not yet fully trained. Missing estimates for:
    - piv1 (no m values are trained).
    - piv2 (no m values are trained).
    - piv3 (no m values are trained).
    - piv4 (no m values are trained).
    - piv5 (no m values are trained).

----- Starting EM training session -----

Estimating the m probabilities of the model by blocking on:
(l."piv1" = r."piv1") AND (l."piv2" = r."piv2") AND (l."piv3" = r."piv3")

Parameter estimates will be made for the following comparison(s):
    - piv4
    - piv5

Parameter estimates cannot be made for the following comparison(s) since they are used in the blocking rules: 
    - piv1
    - piv2
    - piv3

Iteration 1: Largest change in params was -0.00474 in the m_probability of piv5, level `Exact match on piv5`
Iteration 2: Largest change in params was 0.0033 in the m_probability of piv5, level `All other comparisons`
Iteration

2002
85
0.35
{0.35: {'overlapInA': [0.34965034965034963, 0.34965034965034963, 0.34965034965034963, 0.34965034965034963, 0.34965034965034963, 0.34965034965034963, 0.34965034965034963], 'overlapInB': [0.14, 0.14, 0.14, 0.14, 0.14, 0.14, 0.14], 'difficultyInA': [0.8921078921078921, 0.9000999000999002, 0.9075924075924076, 0.8981018981018981, 0.8956043956043956, 0.9015984015984015, 0.9105894105894106], 'difficultyInB': [0.7932, 0.7902, 0.789, 0.7882, 0.7862, 0.7836, 0.7898], 'difficultyInL': [np.float64(0.9542857142857143), np.float64(0.9585714285714285), np.float64(0.9728571428571429), np.float64(0.9671428571428572), np.float64(0.9642857142857143), np.float64(0.97), np.float64(0.98)], 'difficultyInNL': [np.float64(0.8181395348837209), np.float64(0.8137209302325581), np.float64(0.8118604651162791), np.float64(0.8102325581395349), np.float64(0.8102325581395349), np.float64(0.7988372093023256), np.float64(0.8123255813953488)], 'ytrain_synth_prop': [synthetic    0.091206
dtype: float64, synthe

  if self.factor_cols[j]:


2500.0
RangeIndex(start=0, stop=5000, step=1)


----- Estimating u probabilities using random sampling -----

Estimated u probabilities using random sampling

Your model is not yet fully trained. Missing estimates for:
    - piv1 (no m values are trained).
    - piv2 (no m values are trained).
    - piv3 (no m values are trained).
    - piv4 (no m values are trained).
    - piv5 (no m values are trained).

----- Starting EM training session -----

Estimating the m probabilities of the model by blocking on:
(l."piv1" = r."piv1") AND (l."piv2" = r."piv2") AND (l."piv3" = r."piv3")

Parameter estimates will be made for the following comparison(s):
    - piv4
    - piv5

Parameter estimates cannot be made for the following comparison(s) since they are used in the blocking rules: 
    - piv1
    - piv2
    - piv3

Iteration 1: Largest change in params was -0.00126 in the m_probability of piv4, level `Exact match on piv4`
Iteration 2: Largest change in params was 0.000599 in the m_probability of piv4, level `All other comparisons`
Iterati

2002
85
0.35
{0.35: {'overlapInA': [0.34965034965034963, 0.34965034965034963, 0.34965034965034963, 0.34965034965034963, 0.34965034965034963, 0.34965034965034963, 0.34965034965034963, 0.34965034965034963], 'overlapInB': [0.14, 0.14, 0.14, 0.14, 0.14, 0.14, 0.14, 0.14], 'difficultyInA': [0.8921078921078921, 0.9000999000999002, 0.9075924075924076, 0.8981018981018981, 0.8956043956043956, 0.9015984015984015, 0.9105894105894106, 0.8946053946053946], 'difficultyInB': [0.7932, 0.7902, 0.789, 0.7882, 0.7862, 0.7836, 0.7898, 0.7774], 'difficultyInL': [np.float64(0.9542857142857143), np.float64(0.9585714285714285), np.float64(0.9728571428571429), np.float64(0.9671428571428572), np.float64(0.9642857142857143), np.float64(0.97), np.float64(0.98), np.float64(0.9442857142857143)], 'difficultyInNL': [np.float64(0.8181395348837209), np.float64(0.8137209302325581), np.float64(0.8118604651162791), np.float64(0.8102325581395349), np.float64(0.8102325581395349), np.float64(0.7988372093023256), np.float64(0

  if self.factor_cols[j]:


2500.0
RangeIndex(start=0, stop=5000, step=1)


----- Estimating u probabilities using random sampling -----

Estimated u probabilities using random sampling

Your model is not yet fully trained. Missing estimates for:
    - piv1 (no m values are trained).
    - piv2 (no m values are trained).
    - piv3 (no m values are trained).
    - piv4 (no m values are trained).
    - piv5 (no m values are trained).

----- Starting EM training session -----

Estimating the m probabilities of the model by blocking on:
(l."piv1" = r."piv1") AND (l."piv2" = r."piv2") AND (l."piv3" = r."piv3")

Parameter estimates will be made for the following comparison(s):
    - piv4
    - piv5

Parameter estimates cannot be made for the following comparison(s) since they are used in the blocking rules: 
    - piv1
    - piv2
    - piv3

Iteration 1: Largest change in params was -0.00311 in the m_probability of piv4, level `Exact match on piv4`
Iteration 2: Largest change in params was 0.00214 in the m_probability of piv4, level `All other comparisons`
Iteratio

2002
85
0.35
{0.35: {'overlapInA': [0.34965034965034963, 0.34965034965034963, 0.34965034965034963, 0.34965034965034963, 0.34965034965034963, 0.34965034965034963, 0.34965034965034963, 0.34965034965034963, 0.34965034965034963], 'overlapInB': [0.14, 0.14, 0.14, 0.14, 0.14, 0.14, 0.14, 0.14, 0.14], 'difficultyInA': [0.8921078921078921, 0.9000999000999002, 0.9075924075924076, 0.8981018981018981, 0.8956043956043956, 0.9015984015984015, 0.9105894105894106, 0.8946053946053946, 0.9080919080919081], 'difficultyInB': [0.7932, 0.7902, 0.789, 0.7882, 0.7862, 0.7836, 0.7898, 0.7774, 0.7926], 'difficultyInL': [np.float64(0.9542857142857143), np.float64(0.9585714285714285), np.float64(0.9728571428571429), np.float64(0.9671428571428572), np.float64(0.9642857142857143), np.float64(0.97), np.float64(0.98), np.float64(0.9442857142857143), np.float64(0.9757142857142858)], 'difficultyInNL': [np.float64(0.8181395348837209), np.float64(0.8137209302325581), np.float64(0.8118604651162791), np.float64(0.81023255

  if self.factor_cols[j]:


2500.0
RangeIndex(start=0, stop=5000, step=1)


----- Estimating u probabilities using random sampling -----

Estimated u probabilities using random sampling

Your model is not yet fully trained. Missing estimates for:
    - piv1 (no m values are trained).
    - piv2 (no m values are trained).
    - piv3 (no m values are trained).
    - piv4 (no m values are trained).
    - piv5 (no m values are trained).

----- Starting EM training session -----

Estimating the m probabilities of the model by blocking on:
(l."piv1" = r."piv1") AND (l."piv2" = r."piv2") AND (l."piv3" = r."piv3")

Parameter estimates will be made for the following comparison(s):
    - piv4
    - piv5

Parameter estimates cannot be made for the following comparison(s) since they are used in the blocking rules: 
    - piv1
    - piv2
    - piv3

Iteration 1: Largest change in params was -0.00316 in the m_probability of piv5, level `Exact match on piv5`
Iteration 2: Largest change in params was -0.00199 in the m_probability of piv5, level `Exact match on piv5`
Iteration

2002
85
0.35
{0.35: {'overlapInA': [0.34965034965034963, 0.34965034965034963, 0.34965034965034963, 0.34965034965034963, 0.34965034965034963, 0.34965034965034963, 0.34965034965034963, 0.34965034965034963, 0.34965034965034963, 0.34965034965034963], 'overlapInB': [0.14, 0.14, 0.14, 0.14, 0.14, 0.14, 0.14, 0.14, 0.14, 0.14], 'difficultyInA': [0.8921078921078921, 0.9000999000999002, 0.9075924075924076, 0.8981018981018981, 0.8956043956043956, 0.9015984015984015, 0.9105894105894106, 0.8946053946053946, 0.9080919080919081, 0.8871128871128872], 'difficultyInB': [0.7932, 0.7902, 0.789, 0.7882, 0.7862, 0.7836, 0.7898, 0.7774, 0.7926, 0.778], 'difficultyInL': [np.float64(0.9542857142857143), np.float64(0.9585714285714285), np.float64(0.9728571428571429), np.float64(0.9671428571428572), np.float64(0.9642857142857143), np.float64(0.97), np.float64(0.98), np.float64(0.9442857142857143), np.float64(0.9757142857142858), np.float64(0.9528571428571428)], 'difficultyInNL': [np.float64(0.8181395348837209),

  if self.factor_cols[j]:


2500.0
RangeIndex(start=0, stop=5000, step=1)


----- Estimating u probabilities using random sampling -----

Estimated u probabilities using random sampling

Your model is not yet fully trained. Missing estimates for:
    - piv1 (no m values are trained).
    - piv2 (no m values are trained).
    - piv3 (no m values are trained).
    - piv4 (no m values are trained).
    - piv5 (no m values are trained).

----- Starting EM training session -----

Estimating the m probabilities of the model by blocking on:
(l."piv1" = r."piv1") AND (l."piv2" = r."piv2") AND (l."piv3" = r."piv3")

Parameter estimates will be made for the following comparison(s):
    - piv4
    - piv5

Parameter estimates cannot be made for the following comparison(s) since they are used in the blocking rules: 
    - piv1
    - piv2
    - piv3

Iteration 1: Largest change in params was 0.0126 in the m_probability of piv4, level `Exact match on piv4`
Iteration 2: Largest change in params was -0.00592 in the m_probability of piv4, level `All other comparisons`
Iteration

2002
85
0.75
{0.35: {'overlapInA': [0.34965034965034963, 0.34965034965034963, 0.34965034965034963, 0.34965034965034963, 0.34965034965034963, 0.34965034965034963, 0.34965034965034963, 0.34965034965034963, 0.34965034965034963, 0.34965034965034963], 'overlapInB': [0.14, 0.14, 0.14, 0.14, 0.14, 0.14, 0.14, 0.14, 0.14, 0.14], 'difficultyInA': [0.8921078921078921, 0.9000999000999002, 0.9075924075924076, 0.8981018981018981, 0.8956043956043956, 0.9015984015984015, 0.9105894105894106, 0.8946053946053946, 0.9080919080919081, 0.8871128871128872], 'difficultyInB': [0.7932, 0.7902, 0.789, 0.7882, 0.7862, 0.7836, 0.7898, 0.7774, 0.7926, 0.778], 'difficultyInL': [np.float64(0.9542857142857143), np.float64(0.9585714285714285), np.float64(0.9728571428571429), np.float64(0.9671428571428572), np.float64(0.9642857142857143), np.float64(0.97), np.float64(0.98), np.float64(0.9442857142857143), np.float64(0.9757142857142858), np.float64(0.9528571428571428)], 'difficultyInNL': [np.float64(0.8181395348837209),

  if self.factor_cols[j]:


2500.0
RangeIndex(start=0, stop=5000, step=1)


----- Estimating u probabilities using random sampling -----

Estimated u probabilities using random sampling

Your model is not yet fully trained. Missing estimates for:
    - piv1 (no m values are trained).
    - piv2 (no m values are trained).
    - piv3 (no m values are trained).
    - piv4 (no m values are trained).
    - piv5 (no m values are trained).

----- Starting EM training session -----

Estimating the m probabilities of the model by blocking on:
(l."piv1" = r."piv1") AND (l."piv2" = r."piv2") AND (l."piv3" = r."piv3")

Parameter estimates will be made for the following comparison(s):
    - piv4
    - piv5

Parameter estimates cannot be made for the following comparison(s) since they are used in the blocking rules: 
    - piv1
    - piv2
    - piv3

Iteration 1: Largest change in params was -0.0118 in the m_probability of piv5, level `All other comparisons`
Iteration 2: Largest change in params was -0.00563 in the m_probability of piv5, level `All other comparisons`
Iterat

2002
85
0.75
{0.35: {'overlapInA': [0.34965034965034963, 0.34965034965034963, 0.34965034965034963, 0.34965034965034963, 0.34965034965034963, 0.34965034965034963, 0.34965034965034963, 0.34965034965034963, 0.34965034965034963, 0.34965034965034963], 'overlapInB': [0.14, 0.14, 0.14, 0.14, 0.14, 0.14, 0.14, 0.14, 0.14, 0.14], 'difficultyInA': [0.8921078921078921, 0.9000999000999002, 0.9075924075924076, 0.8981018981018981, 0.8956043956043956, 0.9015984015984015, 0.9105894105894106, 0.8946053946053946, 0.9080919080919081, 0.8871128871128872], 'difficultyInB': [0.7932, 0.7902, 0.789, 0.7882, 0.7862, 0.7836, 0.7898, 0.7774, 0.7926, 0.778], 'difficultyInL': [np.float64(0.9542857142857143), np.float64(0.9585714285714285), np.float64(0.9728571428571429), np.float64(0.9671428571428572), np.float64(0.9642857142857143), np.float64(0.97), np.float64(0.98), np.float64(0.9442857142857143), np.float64(0.9757142857142858), np.float64(0.9528571428571428)], 'difficultyInNL': [np.float64(0.8181395348837209),

  if self.factor_cols[j]:


2500.0
RangeIndex(start=0, stop=5000, step=1)


----- Estimating u probabilities using random sampling -----

Estimated u probabilities using random sampling

Your model is not yet fully trained. Missing estimates for:
    - piv1 (no m values are trained).
    - piv2 (no m values are trained).
    - piv3 (no m values are trained).
    - piv4 (no m values are trained).
    - piv5 (no m values are trained).

----- Starting EM training session -----

Estimating the m probabilities of the model by blocking on:
(l."piv1" = r."piv1") AND (l."piv2" = r."piv2") AND (l."piv3" = r."piv3")

Parameter estimates will be made for the following comparison(s):
    - piv4
    - piv5

Parameter estimates cannot be made for the following comparison(s) since they are used in the blocking rules: 
    - piv1
    - piv2
    - piv3

Iteration 1: Largest change in params was -0.011 in the m_probability of piv4, level `All other comparisons`
Iteration 2: Largest change in params was -0.00522 in the m_probability of piv4, level `All other comparisons`
Iterati

2002
85
0.75
{0.35: {'overlapInA': [0.34965034965034963, 0.34965034965034963, 0.34965034965034963, 0.34965034965034963, 0.34965034965034963, 0.34965034965034963, 0.34965034965034963, 0.34965034965034963, 0.34965034965034963, 0.34965034965034963], 'overlapInB': [0.14, 0.14, 0.14, 0.14, 0.14, 0.14, 0.14, 0.14, 0.14, 0.14], 'difficultyInA': [0.8921078921078921, 0.9000999000999002, 0.9075924075924076, 0.8981018981018981, 0.8956043956043956, 0.9015984015984015, 0.9105894105894106, 0.8946053946053946, 0.9080919080919081, 0.8871128871128872], 'difficultyInB': [0.7932, 0.7902, 0.789, 0.7882, 0.7862, 0.7836, 0.7898, 0.7774, 0.7926, 0.778], 'difficultyInL': [np.float64(0.9542857142857143), np.float64(0.9585714285714285), np.float64(0.9728571428571429), np.float64(0.9671428571428572), np.float64(0.9642857142857143), np.float64(0.97), np.float64(0.98), np.float64(0.9442857142857143), np.float64(0.9757142857142858), np.float64(0.9528571428571428)], 'difficultyInNL': [np.float64(0.8181395348837209),

  if self.factor_cols[j]:


2500.0
RangeIndex(start=0, stop=5000, step=1)


----- Estimating u probabilities using random sampling -----

Estimated u probabilities using random sampling

Your model is not yet fully trained. Missing estimates for:
    - piv1 (no m values are trained).
    - piv2 (no m values are trained).
    - piv3 (no m values are trained).
    - piv4 (no m values are trained).
    - piv5 (no m values are trained).

----- Starting EM training session -----

Estimating the m probabilities of the model by blocking on:
(l."piv1" = r."piv1") AND (l."piv2" = r."piv2") AND (l."piv3" = r."piv3")

Parameter estimates will be made for the following comparison(s):
    - piv4
    - piv5

Parameter estimates cannot be made for the following comparison(s) since they are used in the blocking rules: 
    - piv1
    - piv2
    - piv3

Iteration 1: Largest change in params was -0.0124 in the m_probability of piv5, level `All other comparisons`
Iteration 2: Largest change in params was 0.00576 in the m_probability of piv5, level `Exact match on piv5`
Iteration

2002
85
0.75
{0.35: {'overlapInA': [0.34965034965034963, 0.34965034965034963, 0.34965034965034963, 0.34965034965034963, 0.34965034965034963, 0.34965034965034963, 0.34965034965034963, 0.34965034965034963, 0.34965034965034963, 0.34965034965034963], 'overlapInB': [0.14, 0.14, 0.14, 0.14, 0.14, 0.14, 0.14, 0.14, 0.14, 0.14], 'difficultyInA': [0.8921078921078921, 0.9000999000999002, 0.9075924075924076, 0.8981018981018981, 0.8956043956043956, 0.9015984015984015, 0.9105894105894106, 0.8946053946053946, 0.9080919080919081, 0.8871128871128872], 'difficultyInB': [0.7932, 0.7902, 0.789, 0.7882, 0.7862, 0.7836, 0.7898, 0.7774, 0.7926, 0.778], 'difficultyInL': [np.float64(0.9542857142857143), np.float64(0.9585714285714285), np.float64(0.9728571428571429), np.float64(0.9671428571428572), np.float64(0.9642857142857143), np.float64(0.97), np.float64(0.98), np.float64(0.9442857142857143), np.float64(0.9757142857142858), np.float64(0.9528571428571428)], 'difficultyInNL': [np.float64(0.8181395348837209),

  if self.factor_cols[j]:


2500.0
RangeIndex(start=0, stop=5000, step=1)


----- Estimating u probabilities using random sampling -----

Estimated u probabilities using random sampling

Your model is not yet fully trained. Missing estimates for:
    - piv1 (no m values are trained).
    - piv2 (no m values are trained).
    - piv3 (no m values are trained).
    - piv4 (no m values are trained).
    - piv5 (no m values are trained).

----- Starting EM training session -----

Estimating the m probabilities of the model by blocking on:
(l."piv1" = r."piv1") AND (l."piv2" = r."piv2") AND (l."piv3" = r."piv3")

Parameter estimates will be made for the following comparison(s):
    - piv4
    - piv5

Parameter estimates cannot be made for the following comparison(s) since they are used in the blocking rules: 
    - piv1
    - piv2
    - piv3

Iteration 1: Largest change in params was 0.0127 in the m_probability of piv4, level `Exact match on piv4`
Iteration 2: Largest change in params was -0.00612 in the m_probability of piv4, level `All other comparisons`
Iteration

2002
85
0.75
{0.35: {'overlapInA': [0.34965034965034963, 0.34965034965034963, 0.34965034965034963, 0.34965034965034963, 0.34965034965034963, 0.34965034965034963, 0.34965034965034963, 0.34965034965034963, 0.34965034965034963, 0.34965034965034963], 'overlapInB': [0.14, 0.14, 0.14, 0.14, 0.14, 0.14, 0.14, 0.14, 0.14, 0.14], 'difficultyInA': [0.8921078921078921, 0.9000999000999002, 0.9075924075924076, 0.8981018981018981, 0.8956043956043956, 0.9015984015984015, 0.9105894105894106, 0.8946053946053946, 0.9080919080919081, 0.8871128871128872], 'difficultyInB': [0.7932, 0.7902, 0.789, 0.7882, 0.7862, 0.7836, 0.7898, 0.7774, 0.7926, 0.778], 'difficultyInL': [np.float64(0.9542857142857143), np.float64(0.9585714285714285), np.float64(0.9728571428571429), np.float64(0.9671428571428572), np.float64(0.9642857142857143), np.float64(0.97), np.float64(0.98), np.float64(0.9442857142857143), np.float64(0.9757142857142858), np.float64(0.9528571428571428)], 'difficultyInNL': [np.float64(0.8181395348837209),

  if self.factor_cols[j]:


2500.0
RangeIndex(start=0, stop=5000, step=1)


----- Estimating u probabilities using random sampling -----

Estimated u probabilities using random sampling

Your model is not yet fully trained. Missing estimates for:
    - piv1 (no m values are trained).
    - piv2 (no m values are trained).
    - piv3 (no m values are trained).
    - piv4 (no m values are trained).
    - piv5 (no m values are trained).

----- Starting EM training session -----

Estimating the m probabilities of the model by blocking on:
(l."piv1" = r."piv1") AND (l."piv2" = r."piv2") AND (l."piv3" = r."piv3")

Parameter estimates will be made for the following comparison(s):
    - piv4
    - piv5

Parameter estimates cannot be made for the following comparison(s) since they are used in the blocking rules: 
    - piv1
    - piv2
    - piv3

Iteration 1: Largest change in params was 0.0121 in the m_probability of piv5, level `Exact match on piv5`
Iteration 2: Largest change in params was -0.00556 in the m_probability of piv5, level `All other comparisons`
Iteration

2002
85
0.75
{0.35: {'overlapInA': [0.34965034965034963, 0.34965034965034963, 0.34965034965034963, 0.34965034965034963, 0.34965034965034963, 0.34965034965034963, 0.34965034965034963, 0.34965034965034963, 0.34965034965034963, 0.34965034965034963], 'overlapInB': [0.14, 0.14, 0.14, 0.14, 0.14, 0.14, 0.14, 0.14, 0.14, 0.14], 'difficultyInA': [0.8921078921078921, 0.9000999000999002, 0.9075924075924076, 0.8981018981018981, 0.8956043956043956, 0.9015984015984015, 0.9105894105894106, 0.8946053946053946, 0.9080919080919081, 0.8871128871128872], 'difficultyInB': [0.7932, 0.7902, 0.789, 0.7882, 0.7862, 0.7836, 0.7898, 0.7774, 0.7926, 0.778], 'difficultyInL': [np.float64(0.9542857142857143), np.float64(0.9585714285714285), np.float64(0.9728571428571429), np.float64(0.9671428571428572), np.float64(0.9642857142857143), np.float64(0.97), np.float64(0.98), np.float64(0.9442857142857143), np.float64(0.9757142857142858), np.float64(0.9528571428571428)], 'difficultyInNL': [np.float64(0.8181395348837209),

  if self.factor_cols[j]:


2500.0
RangeIndex(start=0, stop=5000, step=1)


----- Estimating u probabilities using random sampling -----

Estimated u probabilities using random sampling

Your model is not yet fully trained. Missing estimates for:
    - piv1 (no m values are trained).
    - piv2 (no m values are trained).
    - piv3 (no m values are trained).
    - piv4 (no m values are trained).
    - piv5 (no m values are trained).

----- Starting EM training session -----

Estimating the m probabilities of the model by blocking on:
(l."piv1" = r."piv1") AND (l."piv2" = r."piv2") AND (l."piv3" = r."piv3")

Parameter estimates will be made for the following comparison(s):
    - piv4
    - piv5

Parameter estimates cannot be made for the following comparison(s) since they are used in the blocking rules: 
    - piv1
    - piv2
    - piv3

Iteration 1: Largest change in params was 0.0118 in the m_probability of piv4, level `Exact match on piv4`
Iteration 2: Largest change in params was -0.00568 in the m_probability of piv4, level `All other comparisons`
Iteration

2002
85
0.75
{0.35: {'overlapInA': [0.34965034965034963, 0.34965034965034963, 0.34965034965034963, 0.34965034965034963, 0.34965034965034963, 0.34965034965034963, 0.34965034965034963, 0.34965034965034963, 0.34965034965034963, 0.34965034965034963], 'overlapInB': [0.14, 0.14, 0.14, 0.14, 0.14, 0.14, 0.14, 0.14, 0.14, 0.14], 'difficultyInA': [0.8921078921078921, 0.9000999000999002, 0.9075924075924076, 0.8981018981018981, 0.8956043956043956, 0.9015984015984015, 0.9105894105894106, 0.8946053946053946, 0.9080919080919081, 0.8871128871128872], 'difficultyInB': [0.7932, 0.7902, 0.789, 0.7882, 0.7862, 0.7836, 0.7898, 0.7774, 0.7926, 0.778], 'difficultyInL': [np.float64(0.9542857142857143), np.float64(0.9585714285714285), np.float64(0.9728571428571429), np.float64(0.9671428571428572), np.float64(0.9642857142857143), np.float64(0.97), np.float64(0.98), np.float64(0.9442857142857143), np.float64(0.9757142857142858), np.float64(0.9528571428571428)], 'difficultyInNL': [np.float64(0.8181395348837209),

  if self.factor_cols[j]:


2500.0
RangeIndex(start=0, stop=5000, step=1)


----- Estimating u probabilities using random sampling -----

Estimated u probabilities using random sampling

Your model is not yet fully trained. Missing estimates for:
    - piv1 (no m values are trained).
    - piv2 (no m values are trained).
    - piv3 (no m values are trained).
    - piv4 (no m values are trained).
    - piv5 (no m values are trained).

----- Starting EM training session -----

Estimating the m probabilities of the model by blocking on:
(l."piv1" = r."piv1") AND (l."piv2" = r."piv2") AND (l."piv3" = r."piv3")

Parameter estimates will be made for the following comparison(s):
    - piv4
    - piv5

Parameter estimates cannot be made for the following comparison(s) since they are used in the blocking rules: 
    - piv1
    - piv2
    - piv3

Iteration 1: Largest change in params was 0.0126 in the m_probability of piv5, level `Exact match on piv5`
Iteration 2: Largest change in params was -0.00603 in the m_probability of piv5, level `All other comparisons`
Iteration

2002
85
0.75
{0.35: {'overlapInA': [0.34965034965034963, 0.34965034965034963, 0.34965034965034963, 0.34965034965034963, 0.34965034965034963, 0.34965034965034963, 0.34965034965034963, 0.34965034965034963, 0.34965034965034963, 0.34965034965034963], 'overlapInB': [0.14, 0.14, 0.14, 0.14, 0.14, 0.14, 0.14, 0.14, 0.14, 0.14], 'difficultyInA': [0.8921078921078921, 0.9000999000999002, 0.9075924075924076, 0.8981018981018981, 0.8956043956043956, 0.9015984015984015, 0.9105894105894106, 0.8946053946053946, 0.9080919080919081, 0.8871128871128872], 'difficultyInB': [0.7932, 0.7902, 0.789, 0.7882, 0.7862, 0.7836, 0.7898, 0.7774, 0.7926, 0.778], 'difficultyInL': [np.float64(0.9542857142857143), np.float64(0.9585714285714285), np.float64(0.9728571428571429), np.float64(0.9671428571428572), np.float64(0.9642857142857143), np.float64(0.97), np.float64(0.98), np.float64(0.9442857142857143), np.float64(0.9757142857142858), np.float64(0.9528571428571428)], 'difficultyInNL': [np.float64(0.8181395348837209),

  if self.factor_cols[j]:


2500.0
RangeIndex(start=0, stop=5000, step=1)


----- Estimating u probabilities using random sampling -----

Estimated u probabilities using random sampling

Your model is not yet fully trained. Missing estimates for:
    - piv1 (no m values are trained).
    - piv2 (no m values are trained).
    - piv3 (no m values are trained).
    - piv4 (no m values are trained).
    - piv5 (no m values are trained).

----- Starting EM training session -----

Estimating the m probabilities of the model by blocking on:
(l."piv1" = r."piv1") AND (l."piv2" = r."piv2") AND (l."piv3" = r."piv3")

Parameter estimates will be made for the following comparison(s):
    - piv4
    - piv5

Parameter estimates cannot be made for the following comparison(s) since they are used in the blocking rules: 
    - piv1
    - piv2
    - piv3

Iteration 1: Largest change in params was 0.0122 in the m_probability of piv5, level `Exact match on piv5`
Iteration 2: Largest change in params was -0.00577 in the m_probability of piv5, level `All other comparisons`
Iteration

2002
85
0.75
{0.35: {'overlapInA': [0.34965034965034963, 0.34965034965034963, 0.34965034965034963, 0.34965034965034963, 0.34965034965034963, 0.34965034965034963, 0.34965034965034963, 0.34965034965034963, 0.34965034965034963, 0.34965034965034963], 'overlapInB': [0.14, 0.14, 0.14, 0.14, 0.14, 0.14, 0.14, 0.14, 0.14, 0.14], 'difficultyInA': [0.8921078921078921, 0.9000999000999002, 0.9075924075924076, 0.8981018981018981, 0.8956043956043956, 0.9015984015984015, 0.9105894105894106, 0.8946053946053946, 0.9080919080919081, 0.8871128871128872], 'difficultyInB': [0.7932, 0.7902, 0.789, 0.7882, 0.7862, 0.7836, 0.7898, 0.7774, 0.7926, 0.778], 'difficultyInL': [np.float64(0.9542857142857143), np.float64(0.9585714285714285), np.float64(0.9728571428571429), np.float64(0.9671428571428572), np.float64(0.9642857142857143), np.float64(0.97), np.float64(0.98), np.float64(0.9442857142857143), np.float64(0.9757142857142858), np.float64(0.9528571428571428)], 'difficultyInNL': [np.float64(0.8181395348837209),

  if self.factor_cols[j]:


2500.0
RangeIndex(start=0, stop=5000, step=1)


----- Estimating u probabilities using random sampling -----

Estimated u probabilities using random sampling

Your model is not yet fully trained. Missing estimates for:
    - piv1 (no m values are trained).
    - piv2 (no m values are trained).
    - piv3 (no m values are trained).
    - piv4 (no m values are trained).
    - piv5 (no m values are trained).

----- Starting EM training session -----

Estimating the m probabilities of the model by blocking on:
(l."piv1" = r."piv1") AND (l."piv2" = r."piv2") AND (l."piv3" = r."piv3")

Parameter estimates will be made for the following comparison(s):
    - piv4
    - piv5

Parameter estimates cannot be made for the following comparison(s) since they are used in the blocking rules: 
    - piv1
    - piv2
    - piv3

Iteration 1: Largest change in params was -0.0126 in the m_probability of piv4, level `All other comparisons`
Iteration 2: Largest change in params was -0.00597 in the m_probability of piv4, level `All other comparisons`
Iterat

2002
85
0.75
{0.35: {'overlapInA': [0.34965034965034963, 0.34965034965034963, 0.34965034965034963, 0.34965034965034963, 0.34965034965034963, 0.34965034965034963, 0.34965034965034963, 0.34965034965034963, 0.34965034965034963, 0.34965034965034963], 'overlapInB': [0.14, 0.14, 0.14, 0.14, 0.14, 0.14, 0.14, 0.14, 0.14, 0.14], 'difficultyInA': [0.8921078921078921, 0.9000999000999002, 0.9075924075924076, 0.8981018981018981, 0.8956043956043956, 0.9015984015984015, 0.9105894105894106, 0.8946053946053946, 0.9080919080919081, 0.8871128871128872], 'difficultyInB': [0.7932, 0.7902, 0.789, 0.7882, 0.7862, 0.7836, 0.7898, 0.7774, 0.7926, 0.778], 'difficultyInL': [np.float64(0.9542857142857143), np.float64(0.9585714285714285), np.float64(0.9728571428571429), np.float64(0.9671428571428572), np.float64(0.9642857142857143), np.float64(0.97), np.float64(0.98), np.float64(0.9442857142857143), np.float64(0.9757142857142858), np.float64(0.9528571428571428)], 'difficultyInNL': [np.float64(0.8181395348837209),

  if self.factor_cols[j]:


2500.0
RangeIndex(start=0, stop=5000, step=1)


----- Estimating u probabilities using random sampling -----

Estimated u probabilities using random sampling

Your model is not yet fully trained. Missing estimates for:
    - piv1 (no m values are trained).
    - piv2 (no m values are trained).
    - piv3 (no m values are trained).
    - piv4 (no m values are trained).
    - piv5 (no m values are trained).

----- Starting EM training session -----

Estimating the m probabilities of the model by blocking on:
(l."piv1" = r."piv1") AND (l."piv2" = r."piv2") AND (l."piv3" = r."piv3")

Parameter estimates will be made for the following comparison(s):
    - piv4
    - piv5

Parameter estimates cannot be made for the following comparison(s) since they are used in the blocking rules: 
    - piv1
    - piv2
    - piv3

Iteration 1: Largest change in params was -0.00654 in the m_probability of piv4, level `Exact match on piv4`
Iteration 2: Largest change in params was 0.003 in the m_probability of piv4, level `All other comparisons`
Iteration 

2002
95
0.35
{0.35: {'overlapInA': [0.34965034965034963], 'overlapInB': [0.14], 'difficultyInA': [0.9595404595404595], 'difficultyInB': [0.9126], 'difficultyInL': [np.float64(0.99)], 'difficultyInNL': [np.float64(0.9232558139534883)], 'ytrain_synth_prop': [synthetic    0.091206
dtype: float64], 'ytest_synth_prop': [synthetic    0.090909
dtype: float64], 'auc_synth': [np.float64(0.5066392000000001)], 'ytrain_link_prop': [link    0.14
dtype: float64], 'ytest_link_prop': [link    0.14
dtype: float64], 'auc_link': [np.float64(0.48584916943521594)], 'max_proba_linked': [0.676784957278319], 'median_proba_linked': [0.67], 'threshold_for_links': [0.5], 'synth_pairs_prop': [0.066], 'real_pairs_prop': [np.float64(0.0784)], 'true_fdr': [np.float64(0.358974358974359)], 'hat_fdr_us': [0.3021978021978022], 'hat_fdr_prob': [np.float64(0.3232150427216809)]}}
ITERATION
2
Unique values in column 'piv1': 15
Unique values in column 'piv2': 10
Unique values in column 'piv3': 9
Unique values in column 'piv4

  if self.factor_cols[j]:


2500.0
RangeIndex(start=0, stop=5000, step=1)


----- Estimating u probabilities using random sampling -----

Estimated u probabilities using random sampling

Your model is not yet fully trained. Missing estimates for:
    - piv1 (no m values are trained).
    - piv2 (no m values are trained).
    - piv3 (no m values are trained).
    - piv4 (no m values are trained).
    - piv5 (no m values are trained).

----- Starting EM training session -----

Estimating the m probabilities of the model by blocking on:
(l."piv1" = r."piv1") AND (l."piv2" = r."piv2") AND (l."piv3" = r."piv3")

Parameter estimates will be made for the following comparison(s):
    - piv4
    - piv5

Parameter estimates cannot be made for the following comparison(s) since they are used in the blocking rules: 
    - piv1
    - piv2
    - piv3

Iteration 1: Largest change in params was -0.00458 in the m_probability of piv5, level `Exact match on piv5`
Iteration 2: Largest change in params was 0.00135 in the m_probability of piv5, level `All other comparisons`
Iteratio

2002
95
0.35
{0.35: {'overlapInA': [0.34965034965034963, 0.34965034965034963], 'overlapInB': [0.14, 0.14], 'difficultyInA': [0.9595404595404595, 0.9600399600399601], 'difficultyInB': [0.9126, 0.9138], 'difficultyInL': [np.float64(0.99), np.float64(0.9857142857142858)], 'difficultyInNL': [np.float64(0.9232558139534883), np.float64(0.9265116279069767)], 'ytrain_synth_prop': [synthetic    0.091206
dtype: float64, synthetic    0.091206
dtype: float64], 'ytest_synth_prop': [synthetic    0.090909
dtype: float64, synthetic    0.090909
dtype: float64], 'auc_synth': [np.float64(0.5066392000000001), np.float64(0.5099256)], 'ytrain_link_prop': [link    0.14
dtype: float64, link    0.14
dtype: float64], 'ytest_link_prop': [link    0.14
dtype: float64, link    0.14
dtype: float64], 'auc_link': [np.float64(0.48584916943521594), np.float64(0.49027707641196017)], 'max_proba_linked': [0.676784957278319, 0.671064325987925], 'median_proba_linked': [0.67, 0.67], 'threshold_for_links': [0.5, 0.5], 'synth_p

  if self.factor_cols[j]:


2500.0
RangeIndex(start=0, stop=5000, step=1)


----- Estimating u probabilities using random sampling -----

Estimated u probabilities using random sampling

Your model is not yet fully trained. Missing estimates for:
    - piv1 (no m values are trained).
    - piv2 (no m values are trained).
    - piv3 (no m values are trained).
    - piv4 (no m values are trained).
    - piv5 (no m values are trained).

----- Starting EM training session -----

Estimating the m probabilities of the model by blocking on:
(l."piv1" = r."piv1") AND (l."piv2" = r."piv2") AND (l."piv3" = r."piv3")

Parameter estimates will be made for the following comparison(s):
    - piv4
    - piv5

Parameter estimates cannot be made for the following comparison(s) since they are used in the blocking rules: 
    - piv1
    - piv2
    - piv3

Iteration 1: Largest change in params was 0.00656 in the m_probability of piv5, level `All other comparisons`
Iteration 2: Largest change in params was -0.00289 in the m_probability of piv5, level `Exact match on piv5`
Iteratio

2002
95
0.35
{0.35: {'overlapInA': [0.34965034965034963, 0.34965034965034963, 0.34965034965034963], 'overlapInB': [0.14, 0.14, 0.14], 'difficultyInA': [0.9595404595404595, 0.9600399600399601, 0.961038961038961], 'difficultyInB': [0.9126, 0.9138, 0.9174], 'difficultyInL': [np.float64(0.99), np.float64(0.9857142857142858), np.float64(0.9871428571428571)], 'difficultyInNL': [np.float64(0.9232558139534883), np.float64(0.9265116279069767), np.float64(0.9286046511627907)], 'ytrain_synth_prop': [synthetic    0.091206
dtype: float64, synthetic    0.091206
dtype: float64, synthetic    0.091206
dtype: float64], 'ytest_synth_prop': [synthetic    0.090909
dtype: float64, synthetic    0.090909
dtype: float64, synthetic    0.090909
dtype: float64], 'auc_synth': [np.float64(0.5066392000000001), np.float64(0.5099256), np.float64(0.5210752)], 'ytrain_link_prop': [link    0.14
dtype: float64, link    0.14
dtype: float64, link    0.14
dtype: float64], 'ytest_link_prop': [link    0.14
dtype: float64, link

In [None]:
print("done")

In [8]:
for linkage in dico_results_final.keys():

    first_dico = dico_results_final[linkage]
    
    for nA in first_dico.keys():

        second_dico = first_dico[nA]
    
        for list_params_key in second_dico.keys():
        
            third_dico = second_dico[list_params_key]
    
            for overlap in third_dico.keys():

                fourth_dico = third_dico[overlap]

                print(len(np.nonzero(fourth_dico["true_fdr"])[0]))

                print(linkage)
                print(nA)
                print(list_params_key)
                print(overlap)
        
                print(f"overlap A: {np.mean(fourth_dico["overlapInA"])}")
                print(f"difficulty A: {np.mean(fourth_dico["difficultyInA"])}")
                print(f"overlap B: {np.mean(fourth_dico["overlapInB"])}")
                print(f"difficulty B: {np.mean(fourth_dico["difficultyInB"])}")
        
                print(f"difficulty Links: {np.mean(fourth_dico["difficultyInL"])}")
        
                print(f"difficulty Non Links: {np.mean(fourth_dico["difficultyInNL"])}")

                print(np.mean(np.array([np.mean(fourth_dico["difficultyInNL"]),np.mean(fourth_dico["difficultyInL"])])))
                print(np.mean(np.array([np.mean(fourth_dico["difficultyInA"]),np.mean(fourth_dico["difficultyInB"])])))
        
                print(f"threshold max: {np.mean(fourth_dico["max_proba_linked"])}")
                print(f"threshold median: {np.mean(fourth_dico["median_proba_linked"])}")
                print(f"threshold applied: {np.mean(fourth_dico["threshold_for_links"])}")
        
                print(f"auc synth: {np.mean(fourth_dico["auc_synth"])} and {np.std(fourth_dico["auc_synth"])}")
                print(f"auc link: {np.mean(fourth_dico["auc_link"])} and {np.std(fourth_dico["auc_link"])}")
            
                cond_synthfpprop = np.array(fourth_dico["synth_pairs_prop"]) / nA
                cond_realfpprop = np.array(fourth_dico["real_pairs_prop"]) / nA
        
                equation_check = cond_synthfpprop - cond_realfpprop
        
                print(f"condition: {np.mean(equation_check)} and {np.std(equation_check)}")
        
                FDP = np.array(fourth_dico["true_fdr"])
            
                our_estimate = np.array(fourth_dico["hat_fdr_us"])
            
                probabilistic_estimate = np.array(fourth_dico["hat_fdr_prob"])
        
                print(f"True FDP: {np.mean(FDP[np.nonzero(FDP)[0]])} and {np.std(FDP[np.nonzero(FDP)[0]])}")
                print(f"bias our FDP: {np.mean(our_estimate[np.nonzero(FDP)[0]] - FDP[np.nonzero(FDP)[0]])} and {np.std(our_estimate[np.nonzero(FDP)[0]] - FDP[np.nonzero(FDP)[0]])}")
                print(f"bias other FDP: {np.mean(probabilistic_estimate[np.nonzero(FDP)[0]] - FDP[np.nonzero(FDP)[0]])} and {np.std(probabilistic_estimate[np.nonzero(FDP)[0]] - FDP[np.nonzero(FDP)[0]])}")
        
                print("\n")
        
                # plt.show(plt.hist(our_estimate - FDP))
                # plt.show(plt.hist(probabilistic_estimate - FDP))

10
at_random
2002
85
0.35
overlap A: 0.34965034965034963
difficulty A: 0.9973026973026974
overlap B: 0.14000000000000004
difficulty B: 0.9943200000000001
difficulty Links: 0.9994285714285714
difficulty Non Links: 0.9951162790697676
0.9972724252491696
0.9958113486513487
threshold max: 0.9715694467175883
threshold median: 0.9690000000000001
threshold applied: 0.5
auc synth: 0.527422 and 0.03421647340752694
auc link: 0.4944930897009967 and 0.015553908949681148
condition: 1.7982017982018014e-07 and 1.5228131078754226e-06
True FDP: 0.03205488790282754 and 0.004898661780338826
bias our FDP: 0.0025471585663164923 and 0.02115797189943636
bias other FDP: -0.0036243346204159104 and 0.004762302883172832


10
at_random
2002
85
0.75
overlap A: 0.7497502497502497
difficulty A: 0.997152847152847
overlap B: 0.3002000000000001
difficulty B: 0.9931599999999999
difficulty Links: 0.9979347101932046
difficulty Non Links: 0.9952843669619892
0.9966095385775968
0.9951564235764234
threshold max: 0.972680420319