## Reproduce results of Scheme A

Paper: "Statistical supervised meta-ensemble algorithm for data linkage"

Kha Vo, Jitendra Jonnagaddala, Siaw-Teng Liaw

February 2019

Jounal of Biomedical Informatics

Paper: "Statistical supervised meta-ensemble algorithm for data linkage"

Kha Vo, Jitendra Jonnagaddala, Siaw-Teng Liaw

February 2019

Jounal of Biomedical Informatics


In [1]:
import recordlinkage as rl, pandas as pd, numpy as np
from sklearn.model_selection import KFold
from sklearn import svm
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB
from sklearn.neural_network import MLPClassifier
from sklearn.utils import shuffle
from recordlinkage.preprocessing import phonetic
from numpy.random import choice
import collections, numpy
from IPython.display import clear_output
from sklearn.model_selection import train_test_split, KFold

In [2]:
def generate_true_links(df): 
    # although the match_id column is included in the original df to imply the true links,
    # this function will create the true_link object identical to the true_links properties
    # of recordlinkage toolkit, in order to exploit "Compare.compute()" from that toolkit
    # in extract_function() for extracting features quicker.
    # This process should be deprecated in the future release of the UNSW toolkit.
    df["rec_id"] = df.index.values.tolist()
    indices_1 = []
    indices_2 = []
    processed = 0
    for match_id in df["match_id"].unique():
        if match_id != -1:    
            processed = processed + 1
            # print("In routine generate_true_links(), count =", processed)
            # clear_output(wait=True)
            linkages = df.loc[df['match_id'] == match_id]
            for j in range(len(linkages)-1):
                for k in range(j+1, len(linkages)):
                    indices_1 = indices_1 + [linkages.iloc[j]["rec_id"]]
                    indices_2 = indices_2 + [linkages.iloc[k]["rec_id"]]    
    links = pd.MultiIndex.from_arrays([indices_1,indices_2])
    return links

def generate_false_links(df, size):
    # A counterpart of generate_true_links(), with the purpose to generate random false pairs
    # for training. The number of false pairs in specified as "size".
    df["rec_id"] = df.index.values.tolist()
    indices_1 = []
    indices_2 = []
    unique_match_id = df["match_id"].unique()
    for j in range(size):
            false_pair_ids = choice(unique_match_id, 2)
            candidate_1_cluster = df.loc[df['match_id'] == false_pair_ids[0]]
            candidate_1 = candidate_1_cluster.iloc[choice(range(len(candidate_1_cluster)))]
            candidate_2_cluster = df.loc[df['match_id'] == false_pair_ids[1]]
            candidate_2 = candidate_2_cluster.iloc[choice(range(len(candidate_2_cluster)))]    
            indices_1 = indices_1 + [candidate_1["rec_id"]]
            indices_2 = indices_2 + [candidate_2["rec_id"]]  
    links = pd.MultiIndex.from_arrays([indices_1,indices_2])
    return links

def swap_fields_flag(f11, f12, f21, f22):
    return int((f11 == f22) and (f12 == f21))

def extract_features(df, links):
    c = rl.Compare()
    c.string('given_name', 'given_name', method='jarowinkler', label='y_name')
    c.string('given_name_soundex', 'given_name_soundex', method='jarowinkler', label='y_name_soundex')
    c.string('given_name_nysiis', 'given_name_nysiis', method='jarowinkler', label='y_name_nysiis')
    c.string('surname', 'surname', method='jarowinkler', label='y_surname')
    c.string('surname_soundex', 'surname_soundex', method='jarowinkler', label='y_surname_soundex')
    c.string('surname_nysiis', 'surname_nysiis', method='jarowinkler', label='y_surname_nysiis')
    c.exact('street_number', 'street_number', label='y_street_number')
    c.string('address_1', 'address_1', method='levenshtein', threshold=0.7, label='y_address1')
    c.string('address_2', 'address_2', method='levenshtein', threshold=0.7, label='y_address2')
    c.exact('postcode', 'postcode', label='y_postcode')
    c.exact('day', 'day', label='y_day')
    c.exact('month', 'month', label='y_month')
    c.exact('year', 'year', label='y_year')
        
    # Build features
    feature_vectors = c.compute(links, df, df)
    return feature_vectors

def generate_train_X_y(df):
    # This routine is to generate the feature vector X and the corresponding labels y
    # with exactly equal number of samples for both classes to train the classifier.
    pos = extract_features(df, train_true_links)
    train_false_links = generate_false_links(df, len(train_true_links))    
    neg = extract_features(df, train_false_links)
    X = pos.values.tolist() + neg.values.tolist()
    y = [1]*len(pos)+[0]*len(neg)
    X, y = shuffle(X, y, random_state=0)
    X = np.array(X)
    y = np.array(y)
    return X, y

def train_model(modeltype, modelparam, train_vectors, train_labels, modeltype_2):
    if modeltype == 'svm': # Support Vector Machine
        model = svm.SVC(C = modelparam, kernel = modeltype_2)
        model.fit(train_vectors, train_labels) 
    elif modeltype == 'lg': # Logistic Regression
        model = LogisticRegression(C=modelparam, penalty = modeltype_2,class_weight=None, dual=False, fit_intercept=True, 
                                   intercept_scaling=1, max_iter=5000, multi_class='ovr', 
                                   n_jobs=1, random_state=None)
        model.fit(train_vectors, train_labels)
    elif modeltype == 'nb': # Naive Bayes
        model = GaussianNB()
        model.fit(train_vectors, train_labels)
    elif modeltype == 'nn': # Neural Network
        model = MLPClassifier(solver='lbfgs', alpha=modelparam, hidden_layer_sizes=(256, ), 
                              activation = modeltype_2,random_state=None, batch_size='auto', 
                              learning_rate='constant',  learning_rate_init=0.001, 
                              power_t=0.5, max_iter=10000, shuffle=True, 
                              tol=0.0001, verbose=False, warm_start=False, momentum=0.9, 
                              nesterovs_momentum=True, early_stopping=False, 
                              validation_fraction=0.1, beta_1=0.9, beta_2=0.999, epsilon=1e-08)
        model.fit(train_vectors, train_labels)
    return model

def classify(model, test_vectors):
    result = model.predict(test_vectors)
    return result

    
def evaluation(test_labels, result):
    true_pos = np.logical_and(test_labels, result)
    count_true_pos = np.sum(true_pos)
    true_neg = np.logical_and(np.logical_not(test_labels),np.logical_not(result))
    count_true_neg = np.sum(true_neg)
    false_pos = np.logical_and(np.logical_not(test_labels), result)
    count_false_pos = np.sum(false_pos)
    false_neg = np.logical_and(test_labels,np.logical_not(result))
    count_false_neg = np.sum(false_neg)
    precision = count_true_pos/(count_true_pos+count_false_pos)
    sensitivity = count_true_pos/(count_true_pos+count_false_neg) # sensitivity = recall
    confusion_matrix = [count_true_pos, count_false_pos, count_false_neg, count_true_neg]
    no_links_found = np.count_nonzero(result)
    no_false = count_false_pos + count_false_neg
    Fscore = 2*precision*sensitivity/(precision+sensitivity)
    metrics_result = {'no_false':no_false, 'confusion_matrix':confusion_matrix ,'precision':precision,
                     'sensitivity':sensitivity ,'no_links':no_links_found, 'F-score': Fscore}
    return metrics_result

def blocking_performance(candidates, true_links, df):
    count = 0
    for candi in candidates:
        if df.loc[candi[0]]["match_id"]==df.loc[candi[1]]["match_id"]:
            count = count + 1
    return count

In [3]:
trainset = 'febrl3_UNSW'
testset = 'febrl4_UNSW'

In [4]:
## TRAIN SET CONSTRUCTION

# Import
print("Import train set...")
df_train = pd.read_csv(trainset+".csv", index_col = "rec_id")
train_true_links = generate_true_links(df_train)
print("Train set size:", len(df_train), ", number of matched pairs: ", str(len(train_true_links)))

# Preprocess train set
df_train['postcode'] = df_train['postcode'].astype(str)
df_train['given_name_soundex'] = phonetic(df_train['given_name'], method='soundex')
df_train['given_name_nysiis'] = phonetic(df_train['given_name'], method='nysiis')
df_train['surname_soundex'] = phonetic(df_train['surname'], method='soundex')
df_train['surname_nysiis'] = phonetic(df_train['surname'], method='nysiis')

# Final train feature vectors and labels
X_train, y_train = generate_train_X_y(df_train)
print("Finished building X_train, y_train")

Import train set...
Train set size: 5000 , number of matched pairs:  1165


  s = s.str.replace(r"[\-\_\s]", "")


Finished building X_train, y_train


In [5]:
# Blocking Criteria: declare non-match of all of the below fields disagree
# Import
print("Import test set...")
df_test = pd.read_csv(testset+".csv", index_col = "rec_id")
test_true_links = generate_true_links(df_test)
leng_test_true_links = len(test_true_links)
print("Test set size:", len(df_test), ", number of matched pairs: ", str(leng_test_true_links))

print("BLOCKING PERFORMANCE:")
blocking_fields = ["given_name", "surname", "postcode"]
all_candidate_pairs = []
for field in blocking_fields:
    block_indexer = rl.BlockIndex(on=field)
    candidates = block_indexer.index(df_test)
    detects = blocking_performance(candidates, test_true_links, df_test)
    all_candidate_pairs = candidates.union(all_candidate_pairs)
    print("Number of pairs of matched "+ field +": "+str(len(candidates)), ", detected ",
         detects,'/'+ str(leng_test_true_links) + " true matched pairs, missed " + 
          str(leng_test_true_links-detects) )
detects = blocking_performance(all_candidate_pairs, test_true_links, df_test)
print("Number of pairs of at least 1 field matched: " + str(len(all_candidate_pairs)), ", detected ",
     detects,'/'+ str(leng_test_true_links) + " true matched pairs, missed " + 
          str(leng_test_true_links-detects) )

Import test set...
Test set size: 10000 , number of matched pairs:  5000
BLOCKING PERFORMANCE:
Number of pairs of matched given_name: 154898 , detected  3287 /5000 true matched pairs, missed 1713
Number of pairs of matched surname: 170843 , detected  3325 /5000 true matched pairs, missed 1675
Number of pairs of matched postcode: 53197 , detected  4219 /5000 true matched pairs, missed 781
Number of pairs of at least 1 field matched: 372073 , detected  4894 /5000 true matched pairs, missed 106


In [6]:
## TEST SET CONSTRUCTION

# Preprocess test set
print("Processing test set...")
print("Preprocess...")
df_test['postcode'] = df_test['postcode'].astype(str)
df_test['given_name_soundex'] = phonetic(df_test['given_name'], method='soundex')
df_test['given_name_nysiis'] = phonetic(df_test['given_name'], method='nysiis')
df_test['surname_soundex'] = phonetic(df_test['surname'], method='soundex')
df_test['surname_nysiis'] = phonetic(df_test['surname'], method='nysiis')

# Test feature vectors and labels construction
print("Extract feature vectors...")
df_X_test = extract_features(df_test, all_candidate_pairs)
vectors = df_X_test.values.tolist()
labels = [0]*len(vectors)
feature_index = df_X_test.index
for i in range(0, len(feature_index)):
    if df_test.loc[feature_index[i][0]]["match_id"]==df_test.loc[feature_index[i][1]]["match_id"]:
        labels[i] = 1
X_test, y_test = shuffle(vectors, labels, random_state=0)
X_test = np.array(X_test)
y_test = np.array(y_test)
print("Count labels of y_test:",collections.Counter(y_test))
print("Finished building X_test, y_test")

Processing test set...
Preprocess...
Extract feature vectors...
Count labels of y_test: Counter({0: 367179, 1: 4894})
Finished building X_test, y_test


In [7]:
## BASE LEARNERS CLASSIFICATION AND EVALUATION
# Choose model
print("BASE LEARNERS CLASSIFICATION PERFORMANCE:")
modeltype = 'nn' # choose between 'svm', 'lg', 'nn'
modeltype_2 = 'relu'  # 'linear' or 'rbf' for svm, 'l1' or 'l2' for lg, 'relu' or 'logistic' for nn
modelparam_range = [.001,.002,.005,.01,.02,.05,.1,.2,.5,1,5,10,20,50,100,200,500,1000,2000,5000] # C for svm, C for lg, alpha for NN
print("Model:",modeltype,", Param_1:",modeltype_2, ", tuning range:", modelparam_range)
precision = []
sensitivity = []
Fscore = []
nb_false = []

for modelparam in modelparam_range:
    md = train_model(modeltype, modelparam, X_train, y_train, modeltype_2)
    final_result = classify(md, X_test)
    final_eval = evaluation(y_test, final_result)
    precision += [final_eval['precision']]
    sensitivity += [final_eval['sensitivity']]
    Fscore += [final_eval['F-score']]
    nb_false  += [final_eval['no_false']]
    
print("No_false:",nb_false,"\n")
print("Precision:",precision,"\n")
print("Sensitivity:",sensitivity,"\n")
print("F-score:", Fscore,"\n")
print("")

BASE LEARNERS CLASSIFICATION PERFORMANCE:
Model: nn , Param_1: relu , tuning range: [0.001, 0.002, 0.005, 0.01, 0.02, 0.05, 0.1, 0.2, 0.5, 1, 5, 10, 20, 50, 100, 200, 500, 1000, 2000, 5000]


  precision = count_true_pos/(count_true_pos+count_false_pos)


No_false: [9437, 12318, 10063, 10025, 9861, 14780, 7256, 5283, 3272, 1651, 718, 564, 495, 411, 398, 391, 421, 419, 4894, 367179] 

Precision: [0.3414089227117224, 0.28423622413392235, 0.32708904796949223, 0.32794581181677956, 0.33159286634569746, 0.24865249669480322, 0.40273431065722287, 0.4808694796891905, 0.5994116204952195, 0.7480495640201927, 0.8732570611369325, 0.8982707873436351, 0.9099720410065237, 0.9252513754505787, 0.9281904761904762, 0.9310740953475014, 0.9298481645204689, 0.9318664350511484, nan, 0.013153332813722038] 

Sensitivity: [0.9991826726604005, 0.9991826726604005, 0.9989783408255006, 0.9991826726604005, 0.9991826726604005, 0.9991826726604005, 0.9991826726604005, 0.9989783408255006, 0.9991826726604005, 0.9991826726604005, 0.9981610134859011, 0.9977523498161014, 0.9975480179812015, 0.996526358806702, 0.9957090314671025, 0.9936657131181038, 0.9885574172456069, 0.9865140988966081, 0.0, 1.0] 

F-score: [0.508924389863142, 0.4425739885962531, 0.49281790232347167, 0.49381

In [8]:
## ENSEMBLE CLASSIFICATION AND EVALUATION

print("BAGGING PERFORMANCE:\n")
modeltypes = ['svm', 'nn', 'lg'] 
modeltypes_2 = ['linear', 'relu', 'l2']
modelparams = [0.005, 100, 0.2]
nFold = 10
kf = KFold(n_splits=nFold)
model_raw_score = [0]*3
model_binary_score = [0]*3
model_i = 0
for model_i in range(3):
    modeltype = modeltypes[model_i]
    modeltype_2 = modeltypes_2[model_i]
    modelparam = modelparams[model_i]
    print(modeltype, "per fold:")
    iFold = 0
    result_fold = [0]*nFold
    final_eval_fold = [0]*nFold
    for train_index, valid_index in kf.split(X_train):
        X_train_fold = X_train[train_index]
        y_train_fold = y_train[train_index]
        md =  train_model(modeltype, modelparam, X_train_fold, y_train_fold, modeltype_2)
        result_fold[iFold] = classify(md, X_test)
        final_eval_fold[iFold] = evaluation(y_test, result_fold[iFold])
        print("Fold", str(iFold), final_eval_fold[iFold])
        iFold = iFold + 1
    bagging_raw_score = np.average(result_fold, axis=0)
    bagging_binary_score  = np.copy(bagging_raw_score)
    bagging_binary_score[bagging_binary_score > 0.5] = 1
    bagging_binary_score[bagging_binary_score <= 0.5] = 0
    bagging_eval = evaluation(y_test, bagging_binary_score)
    print(modeltype, "bagging:", bagging_eval)
    print('')
    model_raw_score[model_i] = bagging_raw_score
    model_binary_score[model_i] = bagging_binary_score

BAGGING PERFORMANCE:

svm per fold:
Fold 0 {'no_false': 324, 'confusion_matrix': [4881, 311, 13, 366868], 'precision': 0.9401001540832049, 'sensitivity': 0.9973436861463016, 'no_links': 5192, 'F-score': 0.9678762641284949}
Fold 1 {'no_false': 252, 'confusion_matrix': [4881, 239, 13, 366940], 'precision': 0.9533203125, 'sensitivity': 0.9973436861463016, 'no_links': 5120, 'F-score': 0.9748352306770521}
Fold 2 {'no_false': 290, 'confusion_matrix': [4881, 277, 13, 366902], 'precision': 0.946297014346646, 'sensitivity': 0.9973436861463016, 'no_links': 5158, 'F-score': 0.971150019896538}
Fold 3 {'no_false': 269, 'confusion_matrix': [4881, 256, 13, 366923], 'precision': 0.9501654662254234, 'sensitivity': 0.9973436861463016, 'no_links': 5137, 'F-score': 0.9731831322899013}
Fold 4 {'no_false': 285, 'confusion_matrix': [4881, 272, 13, 366907], 'precision': 0.9472152144381913, 'sensitivity': 0.9973436861463016, 'no_links': 5153, 'F-score': 0.9716333233801134}
Fold 5 {'no_false': 267, 'confusion_m

In [9]:
thres = .99

print("STACKING PERFORMANCE:\n")
stack_raw_score = np.average(model_raw_score, axis=0)
stack_binary_score = np.copy(stack_raw_score)
stack_binary_score[stack_binary_score > thres] = 1
stack_binary_score[stack_binary_score <= thres] = 0
stacking_eval = evaluation(y_test, stack_binary_score)
print(stacking_eval)

STACKING PERFORMANCE:

{'no_false': 213, 'confusion_matrix': [4870, 189, 24, 366990], 'precision': 0.9626408381102984, 'sensitivity': 0.9950960359624029, 'no_links': 5059, 'F-score': 0.9785994172611272}


## CS598 Project Code

The following contain our own code for replicating and validating the results of the original paper. The data sets and therefore data preprocessing constructs from the original paper are reused here rather than being reimplemented.

The code is organized into sections for each base learner

### Neural Network Base Learner

#### Neural Network Model Implementation

In [67]:
# CS598 Project Code / Neural Network Model
import torch
import torch.nn as nn
import torch.nn.functional as F

# Reference implementation NN parameters
class FEBRLReproducerNN(nn.Module):
    def __init__(self, num_features=0, weight_decay=0.0):
        # Create the our PyTorch nn model
        super(FEBRLReproducerNN, self).__init__()

        # STEP 1
        # Specify parameters for our PyTorch nn model based upon the analogous
        # parameters used by the original paper's sklearn MLPClassifier

        # PyTorch nn concept                            Analogous sklearn MLPClassifier parameter
        # ------------------                            -----------------------------------------
        self.optimizer = None                           # solver (optimizer; original paper uses LBFGS, but we will use SGD (defined later) due to PyTorch-sklearn differences)
        self.optimizer_weight_decay = weight_decay      # alpha (L2 penalty/regularization term)
        self.num_hidden_layer_nodes = 256               # hidden_layer_sizes (tuple of hidden layer nodes)
        self.activation = F.relu                        # activation (activation function)
        self.random_state = 12345                       # random_state (static, random state for reproducibility)
        #                                               # batch_size (minibatch size; unused in our model)
        #                                               # learning_rate (tells the model to use the provided initial learning rate; n/a to our model)
        self.optimizer_learning_rate_init = 0.001       # learning_rate_init (initial learning rate)
        self.optimizer_dampening = 0.5                  # power_t (dampening)
        self.num_max_epochs = 10000                     # max_iter (maximum number of epochs when using stochastic optimizers)
        self.shuffle = True                             # shuffle (shuffle samples in each iteration)
        self.tolerance = 0.0001                         # tol (optimization tolorance; early training termination)
        #                                               # verbose (print model progress debug messages to console; specified by unused by original paper)
        #                                               # warm_start (initialize the model with the results of previous executions; specified but unused by original paper)
        self.optimizer_momentum = 0.9                   # momentum (optimizer momentum)
        self.use_nesterov_momentum = True               # nesterovs_momentum (use Nesterov's momentum in the optimizer)
        #                                               # early_stopping (terminate early when validation is not improving; 'False' in original paper)
        #                                               # validation_fraction (validation data set criteria for early stopping; specified by unused by original paper)
        #                                               # beta_1 (parameter for Adam optimizer; specified but unused by original paper)
        #                                               # beta_2 (parameter for Adam optimizer; specified but unused by original paper)
        #                                               # epsilon (parameter for Adam optimizer; specified but unused by original paper)

        # STEP 2
        # Define the layers for our nn model
        self.num_input_features = num_features

        self.fc1 = nn.Linear(in_features=self.num_input_features, out_features=self.num_hidden_layer_nodes, bias=False)
        self.fc2 = nn.Linear(in_features=self.num_hidden_layer_nodes, out_features=1, bias=False)

        # STEP 3
        # Define the criteria and optimizer
        self.criterion = nn.MSELoss()
        self.optimizer = torch.optim.SGD(self.parameters(),
            lr=self.optimizer_learning_rate_init,
            weight_decay=self.optimizer_weight_decay,
            momentum=self.optimizer_momentum,
            dampening=0,
            nesterov=self.use_nesterov_momentum)

    def forward(self, x):
        # Perform a forward pass on the nn; it is not recommended to call this
        # function directly, but to instead call fit(...) or predict(...) so that model's
        # mode is correctly set automatically
        x = self.activation(self.fc1(x))
        x = self.fc2(x)

        return torch.squeeze(x)

    def fit(self, X_train, y_train):
        # Train the nn with the specified parameters; analogous to sklearn's
        # MLPClassifier.fit(...) method
        self.train()

        kfold = KFold(n_splits=10, shuffle=self.shuffle, random_state=self.random_state)
        loss_previous_epoch = 1.0
        loss_consecutive_epochs_minimal = 0

        for epoch_i in np.arange(self.num_max_epochs):
            loss = None

            for train_indicies, test_indicies in kfold.split(X_train):
                self.optimizer.zero_grad()
                output = self.forward(X_train[train_indicies])
                loss = self.criterion(output, y_train[train_indicies])
                loss.backward()
                self.optimizer.step()

            # Determine if criteria for early training termination is satisfied
            if (loss_previous_epoch - loss.item()) <= self.tolerance:
                loss_consecutive_epochs_minimal = loss_consecutive_epochs_minimal + 1

                if(loss_consecutive_epochs_minimal == 10):
                    break
            else:
                loss_consecutive_epochs_minimal = 0

            loss_previous_epoch = loss.item()

    def predict(self, X_test):
        # Test the nn with the specified parameters; analogous to sklearn's
        # MLPClassifier.predict(...) method
        self.eval()
        return self.forward(X_test)

# Convert the numpy training and testing sets to torch.tensor for us with the PyTorch library
X_train_tensor = torch.from_numpy(X_train).float()
y_train_tensor = torch.from_numpy(y_train).float()
X_test_tensor = torch.from_numpy(X_test).float()
y_test_tensor = torch.from_numpy(y_test).long()

frn_weight_decay_range = [.001,.002,.005,.01,.02,.05,.1,.2,.5,1,5,10,20,50,100,200,500,1000,2000,5000]  # 0.5 produces highest f1 score in our model
frn_weight_decay_optimal = 0.5  # Determined through search

#### Neural Network Hyperparameter Search

Grid search is performed on the set of candidate hyperparameters from the original paper using our model. Some hyperparameters may take a long time to test if they do not cause the model's loss to converge early (or at all).

In [69]:
# CS598 Project Code / Neural Network Model

# Perform nn base learner evaluation using the hyperparameter search range provided by the original paper
for weight_decay in frn_weight_decay_range:
    # Create an instance of the feed-forward neural network
    febrl_reproducer_nn = FEBRLReproducerNN(num_features=X_train_tensor.shape[1], weight_decay=weight_decay)

    # Train the model
    febrl_reproducer_nn.fit(X_train_tensor, y_train_tensor)

    # Test the model
    frn_output = febrl_reproducer_nn.predict(X_test_tensor).detach()

    y_pred = np.asarray([1 if element > 0.5 else 0 for element in frn_output])

    print("weight_decay = {}: {}".format(weight_decay, evaluation(y_test, y_pred)))

weight_decay = 0.001: {'no_false': 432, 'confusion_matrix': [4854, 392, 40, 366787], 'precision': 0.92527640106748, 'sensitivity': 0.991826726604005, 'no_links': 5246, 'F-score': 0.9573964497041421}
weight_decay = 0.002: {'no_false': 322, 'confusion_matrix': [4862, 290, 32, 366889], 'precision': 0.9437111801242236, 'sensitivity': 0.9934613812832039, 'no_links': 5152, 'F-score': 0.9679474417678678}
weight_decay = 0.005: {'no_false': 464, 'confusion_matrix': [4853, 423, 41, 366756], 'precision': 0.9198256254738438, 'sensitivity': 0.9916223947691051, 'no_links': 5276, 'F-score': 0.9543756145526058}
weight_decay = 0.01: {'no_false': 498, 'confusion_matrix': [4850, 454, 44, 366725], 'precision': 0.9144042232277526, 'sensitivity': 0.9910093992644053, 'no_links': 5304, 'F-score': 0.9511668954696999}
weight_decay = 0.02: {'no_false': 348, 'confusion_matrix': [4854, 308, 40, 366871], 'precision': 0.9403332041844247, 'sensitivity': 0.991826726604005, 'no_links': 5162, 'F-score': 0.96539379474940

KeyboardInterrupt: 

In [63]:
# CS598 Project Code / Neural Network Model

# Perform 10 train-test execution cycles
for frn_i in np.arange(10):
    # Create an instance of the feed-forward neural network
    febrl_reproducer_nn = FEBRLReproducerNN(num_features=X_train_tensor.shape[1], weight_decay=frn_weight_decay_optimal)

    # Train the model
    febrl_reproducer_nn.fit(X_train_tensor, y_train_tensor)

    # Test the model
    frn_output = febrl_reproducer_nn.predict(X_test_tensor).detach()

    y_pred = np.asarray([1 if element > 0.5 else 0 for element in frn_output])

    print("Execution {}: {}".format(frn_i, evaluation(y_test, y_pred)))

Terminating early after 19 epochs
Execution 0: {'no_false': 165, 'confusion_matrix': [4849, 120, 45, 367059], 'precision': 0.9758502716844436, 'sensitivity': 0.9908050674295055, 'no_links': 4969, 'F-score': 0.9832708100983473}
Terminating early after 19 epochs
Execution 1: {'no_false': 394, 'confusion_matrix': [4852, 352, 42, 366827], 'precision': 0.9323597232897771, 'sensitivity': 0.9914180629342052, 'no_links': 5204, 'F-score': 0.9609823727470788}
Terminating early after 16 epochs
Execution 2: {'no_false': 115, 'confusion_matrix': [4851, 72, 43, 367107], 'precision': 0.9853747714808044, 'sensitivity': 0.9912137310993052, 'no_links': 4923, 'F-score': 0.9882856269736172}
Terminating early after 19 epochs
Execution 3: {'no_false': 529, 'confusion_matrix': [4797, 432, 97, 366747], 'precision': 0.9173838209982789, 'sensitivity': 0.9801798120147119, 'no_links': 5229, 'F-score': 0.947742764002766}
Terminating early after 19 epochs
Execution 4: {'no_false': 426, 'confusion_matrix': [4834, 36

## Reference Implementation Validation

### Base Learners

Each of the following cells executes one of the three base learner models from the sklearn library used by the reference implementation with identical parameterization and data set as a sanity check to validate the paper's models' results using the same data set used by the reference implementation above.

In [39]:
# CS598 Project Code / Reference Implementation Validation (Neural Network)

# Sanity check: Create, train, and test a new instance of an sklearn MLPClassifier from scratch using the original
# paper's parameters to confirm that results are reproducible and absent any unexpected/hidden dependencies
nn_validation_model = MLPClassifier(solver='lbfgs', alpha=200, hidden_layer_sizes=(256, ), 
                              activation = 'relu',random_state=None, batch_size='auto', 
                              learning_rate='constant',  learning_rate_init=0.001, 
                              power_t=0.5, max_iter=10000, shuffle=True, 
                              tol=0.0001, verbose=False, warm_start=False, momentum=0.9, 
                              nesterovs_momentum=True, early_stopping=False, 
                              validation_fraction=0.1, beta_1=0.9, beta_2=0.999, epsilon=1e-08)
nn_validation_model.fit(X_train, y_train)

nn_validation_model_results = classify(nn_validation_model, X_test)

print("Original paper nn: {}".format(evaluation(y_test, nn_validation_model_results)))

Original paper nn: {'no_false': 391, 'confusion_matrix': [4863, 360, 31, 366819], 'precision': 0.9310740953475014, 'sensitivity': 0.9936657131181038, 'no_links': 5223, 'F-score': 0.9613521794998516}


In [44]:
# CS598 Project Code / Reference Implementation Validation (Logistic Regression)

# Sanity check: Create, train, and test a new instance of an sklearn LogisticRegression from scratch using the original
# paper's parameters to confirm that results are reproducible and absent any unexpected/hidden dependencies
lg_validation_model = LogisticRegression(C=0.2, penalty = 'l2',class_weight=None, dual=False, fit_intercept=True, 
                                   intercept_scaling=1, max_iter=5000, multi_class='ovr', 
                                   n_jobs=1, random_state=None)
lg_validation_model.fit(X_train, y_train)

lg_validation_model_results = classify(lg_validation_model, X_test)

print("Original paper nn: {}".format(evaluation(y_test, lg_validation_model_results)))

Original paper nn: {'no_false': 488, 'confusion_matrix': [4882, 476, 12, 366703], 'precision': 0.9111608809257186, 'sensitivity': 0.9975480179812015, 'no_links': 5358, 'F-score': 0.9523995317986734}


In [47]:
# CS598 Project Code / Reference Implementation Validation (Support Vector Machine)

# Sanity check: Create, train, and test a new instance of an sklearn SVC from scratch using the original
# paper's parameters to confirm that results are reproducible and absent any unexpected/hidden dependencies
svm_validation_model = svm.SVC(C = 0.005, kernel = 'linear')
svm_validation_model.fit(X_train, y_train)

svm_validation_model_results = classify(svm_validation_model, X_test)

print("Original paper nn: {}".format(evaluation(y_test, svm_validation_model_results)))

Original paper nn: {'no_false': 329, 'confusion_matrix': [4881, 316, 13, 366863], 'precision': 0.9391956898210506, 'sensitivity': 0.9973436861463016, 'no_links': 5197, 'F-score': 0.9673966901199089}
