## Reproduce results of Scheme B

Paper: "Statistical supervised meta-ensemble algorithm for data linkage"

Kha Vo, Jitendra Jonnagaddala, Siaw-Teng Liaw

February 2019

Jounal of Biomedical Informatics

In [14]:
trainset = 'ePBRN_F_dup' 
testset = 'ePBRN_D_dup'

import recordlinkage as rl, pandas as pd, numpy as np
from sklearn.model_selection import KFold
from sklearn import svm
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB
from sklearn.neural_network import MLPClassifier
from sklearn.utils import shuffle
from recordlinkage.preprocessing import phonetic
from numpy.random import choice
import collections, numpy
from IPython.display import clear_output
from sklearn.model_selection import train_test_split, KFold


def generate_true_links(df): 
    # although the match_id column is included in the original df to imply the true links,
    # this function will create the true_link object identical to the true_links properties
    # of recordlinkage toolkit, in order to exploit "Compare.compute()" from that toolkit
    # in extract_function() for extracting features quicker.
    # This process should be deprecated in the future release of the UNSW toolkit.
    df["rec_id"] = df.index.values.tolist()
    indices_1 = []
    indices_2 = []
    processed = 0
    for match_id in df["match_id"].unique():
        if match_id != -1:    
            processed = processed + 1
            # print("In routine generate_true_links(), count =", processed)
            # clear_output(wait=True)
            linkages = df.loc[df['match_id'] == match_id]
            for j in range(len(linkages)-1):
                for k in range(j+1, len(linkages)):
                    indices_1 = indices_1 + [linkages.iloc[j]["rec_id"]]
                    indices_2 = indices_2 + [linkages.iloc[k]["rec_id"]]    
    links = pd.MultiIndex.from_arrays([indices_1,indices_2])
    return links

def generate_false_links(df, size):
    # A counterpart of generate_true_links(), with the purpose to generate random false pairs
    # for training. The number of false pairs in specified as "size".
    df["rec_id"] = df.index.values.tolist()
    indices_1 = []
    indices_2 = []
    unique_match_id = df["match_id"].unique()
    unique_match_id = unique_match_id[~np.isnan(unique_match_id)] # remove nan values
    for j in range(size):
            false_pair_ids = choice(unique_match_id, 2)
            candidate_1_cluster = df.loc[df['match_id'] == false_pair_ids[0]]
            candidate_1 = candidate_1_cluster.iloc[choice(range(len(candidate_1_cluster)))]
            candidate_2_cluster = df.loc[df['match_id'] == false_pair_ids[1]]
            candidate_2 = candidate_2_cluster.iloc[choice(range(len(candidate_2_cluster)))]    
            indices_1 = indices_1 + [candidate_1["rec_id"]]
            indices_2 = indices_2 + [candidate_2["rec_id"]]  
    links = pd.MultiIndex.from_arrays([indices_1,indices_2])
    return links

def swap_fields_flag(f11, f12, f21, f22):
    return ((f11 == f22) & (f12 == f21)).astype(float)

def join_names_space(f11, f12, f21, f22):
    return ((f11+" "+f12 == f21) | (f11+" "+f12 == f22)| (f21+" "+f22 == f11)| (f21+" "+f22 == f12)).astype(float)

def join_names_dash(f11, f12, f21, f22):
    return ((f11+"-"+f12 == f21) | (f11+"-"+f12 == f22)| (f21+"-"+f22 == f11)| (f21+"-"+f22 == f12)).astype(float)

def abb_surname(f1, f2):
    return ((f1[0]==f2) | (f1==f2[0])).astype(float)

def reset_day(f11, f12, f21, f22):
    return (((f11 == 1) & (f12 == 1))|((f21 == 1) & (f22 == 1))).astype(float)

def extract_features(df, links):
    c = rl.Compare()
    c.string('given_name', 'given_name', method='levenshtein', label='y_name_leven')
    c.string('surname', 'surname', method='levenshtein', label='y_surname_leven')  
    c.string('given_name', 'given_name', method='jarowinkler', label='y_name_jaro')
    c.string('surname', 'surname', method='jarowinkler', label='y_surname_jaro')  
    c.string('postcode', 'postcode', method='jarowinkler', label='y_postcode')      
    exact_fields = ['postcode', 'address_1', 'address_2', 'street_number']
    for field in exact_fields:
        c.exact(field, field, label='y_'+field+'_exact')
    c.compare_vectorized(reset_day,('day', 'month'), ('day', 'month'),label='reset_day_flag')    
    c.compare_vectorized(swap_fields_flag,('day', 'month'), ('day', 'month'),label='swap_day_month')    
    c.compare_vectorized(swap_fields_flag,('surname', 'given_name'), ('surname', 'given_name'),label='swap_names')    
    c.compare_vectorized(join_names_space,('surname', 'given_name'), ('surname', 'given_name'),label='join_names_space')
    c.compare_vectorized(join_names_dash,('surname', 'given_name'), ('surname', 'given_name'),label='join_names_dash')
    c.compare_vectorized(abb_surname,'surname', 'surname',label='abb_surname')
    # Build features
    feature_vectors = c.compute(links, df, df)
    return feature_vectors

def generate_train_X_y(df):
    # This routine is to generate the feature vector X and the corresponding labels y
    # with exactly equal number of samples for both classes to train the classifier.
    pos = extract_features(df, train_true_links)
    train_false_links = generate_false_links(df, len(train_true_links))    
    neg = extract_features(df, train_false_links)
    X = pos.values.tolist() + neg.values.tolist()
    y = [1]*len(pos)+[0]*len(neg)
    X, y = shuffle(X, y, random_state=0)
    X = np.array(X)
    y = np.array(y)
    return X, y

def train_model(modeltype, modelparam, train_vectors, train_labels, modeltype_2):
    if modeltype == 'svm': # Support Vector Machine
        model = svm.SVC(C = modelparam, kernel = modeltype_2)
        model.fit(train_vectors, train_labels) 
    elif modeltype == 'lg': # Logistic Regression
        model = LogisticRegression(C=modelparam, penalty = modeltype_2,class_weight=None, dual=False, fit_intercept=True, 
                                   intercept_scaling=1, max_iter=5000, multi_class='ovr', 
                                   n_jobs=1, random_state=None)
        model.fit(train_vectors, train_labels)
    elif modeltype == 'nb': # Naive Bayes
        model = GaussianNB()
        model.fit(train_vectors, train_labels)
    elif modeltype == 'nn': # Neural Network
        model = MLPClassifier(solver='lbfgs', alpha=modelparam, hidden_layer_sizes=(256, ), 
                              activation = modeltype_2,random_state=None, batch_size='auto', 
                              learning_rate='constant',  learning_rate_init=0.001, 
                              power_t=0.5, max_iter=30000, shuffle=True, 
                              tol=0.0001, verbose=False, warm_start=False, momentum=0.9, 
                              nesterovs_momentum=True, early_stopping=False, 
                              validation_fraction=0.1, beta_1=0.9, beta_2=0.999, epsilon=1e-08)
        model.fit(train_vectors, train_labels)
    return model

def classify(model, test_vectors):
    result = model.predict(test_vectors)
    return result

    
def evaluation(test_labels, result):
    true_pos = np.logical_and(test_labels, result)
    count_true_pos = np.sum(true_pos)
    true_neg = np.logical_and(np.logical_not(test_labels),np.logical_not(result))
    count_true_neg = np.sum(true_neg)
    false_pos = np.logical_and(np.logical_not(test_labels), result)
    count_false_pos = np.sum(false_pos)
    false_neg = np.logical_and(test_labels,np.logical_not(result))
    count_false_neg = np.sum(false_neg)
    precision = count_true_pos/(count_true_pos+count_false_pos)
    sensitivity = count_true_pos/(count_true_pos+count_false_neg) # sensitivity = recall
    confusion_matrix = [count_true_pos, count_false_pos, count_false_neg, count_true_neg]
    no_links_found = np.count_nonzero(result)
    no_false = count_false_pos + count_false_neg
    Fscore = 2*precision*sensitivity/(precision+sensitivity)
    metrics_result = {'no_false':no_false, 'confusion_matrix':confusion_matrix ,'precision':precision,
                     'sensitivity':sensitivity ,'no_links':no_links_found, 'F-score': Fscore}
    return metrics_result

def blocking_performance(candidates, true_links, df):
    count = 0
    for candi in candidates:
        if df.loc[candi[0]]["match_id"]==df.loc[candi[1]]["match_id"]:
            count = count + 1
    return count

In [35]:
## TRAIN SET CONSTRUCTION

# Import
print("Import train set...")
df_train = pd.read_csv(trainset+".csv", index_col = "rec_id")
train_true_links = generate_true_links(df_train)
print("Train set size:", len(df_train), ", number of matched pairs: ", str(len(train_true_links)))

# Preprocess train set
df_train['postcode'] = df_train['postcode'].astype(str)

# Final train feature vectors and labels
X_train, y_train = generate_train_X_y(df_train)
print("Finished building X_train, y_train")

Import train set...
Train set size: 14078 , number of matched pairs:  3192


  s = s.str.replace(r"[\-\_\s]", "")


Finished building X_train, y_train


In [36]:
# Blocking Criteria: declare non-match of all of the below fields disagree
# Import
print("Import test set...")
df_test = pd.read_csv(testset+".csv", index_col = "rec_id")
test_true_links = generate_true_links(df_test)
leng_test_true_links = len(test_true_links)
print("Test set size:", len(df_test), ", number of matched pairs: ", str(leng_test_true_links))

print("BLOCKING PERFORMANCE:")
blocking_fields = ["given_name", "surname", "postcode"]
all_candidate_pairs = []
for field in blocking_fields:
    block_indexer = rl.BlockIndex(on=field)
    candidates = block_indexer.index(df_test)
    detects = blocking_performance(candidates, test_true_links, df_test)
    all_candidate_pairs = candidates.union(all_candidate_pairs)
    print("Number of pairs of matched "+ field +": "+str(len(candidates)), ", detected ",
         detects,'/'+ str(leng_test_true_links) + " true matched pairs, missed " + 
          str(leng_test_true_links-detects) )
detects = blocking_performance(all_candidate_pairs, test_true_links, df_test)
print("Number of pairs of at least 1 field matched: " + str(len(all_candidate_pairs)), ", detected ",
     detects,'/'+ str(leng_test_true_links) + " true matched pairs, missed " + 
          str(leng_test_true_links-detects) )

Import test set...
Test set size: 11731 , number of matched pairs:  2653
BLOCKING PERFORMANCE:
Number of pairs of matched given_name: 252552 , detected  1567 /2653 true matched pairs, missed 1086
Number of pairs of matched surname: 33832 , detected  1480 /2653 true matched pairs, missed 1173
Number of pairs of matched postcode: 79940 , detected  2462 /2653 true matched pairs, missed 191
Number of pairs of at least 1 field matched: 362910 , detected  2599 /2653 true matched pairs, missed 54


In [37]:
## TEST SET CONSTRUCTION

# Preprocess test set
print("Processing test set...")
print("Preprocess...")
df_test['postcode'] = df_test['postcode'].astype(str)

# Test feature vectors and labels construction
print("Extract feature vectors...")
df_X_test = extract_features(df_test, all_candidate_pairs)
vectors = df_X_test.values.tolist()
labels = [0]*len(vectors)
feature_index = df_X_test.index
for i in range(0, len(feature_index)):
    if df_test.loc[feature_index[i][0]]["match_id"]==df_test.loc[feature_index[i][1]]["match_id"]:
        labels[i] = 1
X_test, y_test = shuffle(vectors, labels, random_state=0)
X_test = np.array(X_test)
y_test = np.array(y_test)
print("Count labels of y_test:",collections.Counter(y_test))
print("Finished building X_test, y_test")

Processing test set...
Preprocess...
Extract feature vectors...
Count labels of y_test: Counter({0: 360311, 1: 2599})
Finished building X_test, y_test


In [18]:
## BASE LEARNERS CLASSIFICATION AND EVALUATION
# Choose model
print("BASE LEARNERS CLASSIFICATION PERFORMANCE:")
modeltype = 'svm' # choose between 'svm', 'lg', 'nn'
modeltype_2 = 'rbf'  # 'linear' or 'rbf' for svm, 'l1' or 'l2' for lg, 'relu' or 'logistic' for nn
modelparam_range = [.001,.002,.005,.01,.02,.05,.1,.2,.5,1,5,10,20,50,100,200,500,1000,2000,5000] # C for svm, C for lg, alpha for NN

print("Model:",modeltype,", Param_1:",modeltype_2, ", tuning range:", modelparam_range)
precision = []
sensitivity = []
Fscore = []
nb_false = []

for modelparam in modelparam_range:
    md = train_model(modeltype, modelparam, X_train, y_train, modeltype_2)
    final_result = classify(md, X_test)
    final_eval = evaluation(y_test, final_result)
    precision += [final_eval['precision']]
    sensitivity += [final_eval['sensitivity']]
    Fscore += [final_eval['F-score']]
    nb_false  += [final_eval['no_false']]
    
print("No_false:",nb_false,"\n")
print("Precision:",precision,"\n")
print("Sensitivity:",sensitivity,"\n")
print("F-score:", Fscore,"\n")
print("")

BASE LEARNERS CLASSIFICATION PERFORMANCE:
Model: svm , Param_1: rbf , tuning range: [0.001, 0.002, 0.005, 0.01, 0.02, 0.05, 0.1, 0.2, 0.5, 1, 5, 10, 20, 50, 100, 200, 500, 1000, 2000, 5000]
No_false: [5468, 9207, 76204, 81599, 81385, 78545, 78233, 78568, 79306, 83314, 92616, 87369, 85390, 86085, 84369, 86029, 76675, 75894, 76380, 76731] 

Precision: [0.32048554623951947, 0.21909539194014624, 0.032957271031358266, 0.030856572758800892, 0.03093520040008573, 0.032017943851519556, 0.03214153160955091, 0.03200887081870264, 0.031731884500335754, 0.030251533528104012, 0.027296119308932415, 0.02887757597314541, 0.029527089229090663, 0.029285069914298602, 0.029862931787866243, 0.029303574652464345, 0.032761448214961526, 0.03308743900419167, 0.032883823994935106, 0.032738320348939816] 

Sensitivity: [0.9853789919199692, 0.9915352058484033, 0.9992304732589458, 0.9996152366294728, 0.9996152366294728, 0.9996152366294728, 0.9996152366294728, 0.9996152366294728, 1.0, 1.0, 1.0, 0.9996152366294728, 0.9

In [38]:
## ENSEMBLE CLASSIFICATION AND EVALUATION

print("BAGGING PERFORMANCE:\n")
modeltypes = ['svm', 'nn', 'lg'] 
modeltypes_2 = ['rbf', 'relu', 'l2']
modelparams = [0.001, 2000, 0.005]
nFold = 10
kf = KFold(n_splits=nFold)
model_raw_score = [0]*3
model_binary_score = [0]*3
model_i = 0
for model_i in range(3):
    modeltype = modeltypes[model_i]
    modeltype_2 = modeltypes_2[model_i]
    modelparam = modelparams[model_i]
    print(modeltype, "per fold:")
    iFold = 0
    result_fold = [0]*nFold
    final_eval_fold = [0]*nFold
    for train_index, valid_index in kf.split(X_train):
        X_train_fold = X_train[train_index]
        y_train_fold = y_train[train_index]
        md =  train_model(modeltype, modelparam, X_train_fold, y_train_fold, modeltype_2)
        result_fold[iFold] = classify(md, X_test)
        final_eval_fold[iFold] = evaluation(y_test, result_fold[iFold])
        print("Fold", str(iFold), final_eval_fold[iFold])
        iFold = iFold + 1
    bagging_raw_score = np.average(result_fold, axis=0)
    bagging_binary_score  = np.copy(bagging_raw_score)
    bagging_binary_score[bagging_binary_score > 0.5] = 1
    bagging_binary_score[bagging_binary_score <= 0.5] = 0
    bagging_eval = evaluation(y_test, bagging_binary_score)
    print(modeltype, "bagging:", bagging_eval)
    print('')
    model_raw_score[model_i] = bagging_raw_score
    model_binary_score[model_i] = bagging_binary_score
    
thres = .99
print("STACKING PERFORMANCE:\n")
stack_raw_score = np.average(model_raw_score, axis=0)
stack_binary_score = np.copy(stack_raw_score)
stack_binary_score[stack_binary_score > thres] = 1
stack_binary_score[stack_binary_score <= thres] = 0
stacking_eval = evaluation(y_test, stack_binary_score)
print(stacking_eval)

BAGGING PERFORMANCE:

svm per fold:
Fold 0 {'no_false': 4356, 'confusion_matrix': [2544, 4301, 55, 356010], 'precision': 0.3716581446311176, 'sensitivity': 0.9788380146210081, 'no_links': 6845, 'F-score': 0.5387547649301144}
Fold 1 {'no_false': 4032, 'confusion_matrix': [2544, 3977, 55, 356334], 'precision': 0.39012421407759545, 'sensitivity': 0.9788380146210081, 'no_links': 6521, 'F-score': 0.5578947368421053}
Fold 2 {'no_false': 4192, 'confusion_matrix': [2546, 4139, 53, 356172], 'precision': 0.38085265519820494, 'sensitivity': 0.9796075413620623, 'no_links': 6685, 'F-score': 0.5484704868591125}
Fold 3 {'no_false': 4233, 'confusion_matrix': [2544, 4178, 55, 356133], 'precision': 0.3784587920261827, 'sensitivity': 0.9788380146210081, 'no_links': 6722, 'F-score': 0.5458641776633409}
Fold 4 {'no_false': 4203, 'confusion_matrix': [2541, 4145, 58, 356166], 'precision': 0.3800478612025127, 'sensitivity': 0.9776837245094268, 'no_links': 6686, 'F-score': 0.5473344103392569}
Fold 5 {'no_false

## CS598 Project Code

The following contain our own code for replicating and validating the results of the original paper. The data sets and therefore data preprocessing constructs from the original paper are reused here rather than being reimplemented. Note that the implementation of both the paper's and our models are identical for both Schemes A and B; only hyperparameters differ.

The code is organized into sections:
1. Base Learner Bagging
     1. Support Vector Machine
     1. Neural Network
     1. Linear Regression
1. Stacking

Following our code is an "appendix" section which performs out-of-context validation on the reference implementation's base learners' performance using the same parameterization and data set.

### 1. Base Learners

In [20]:
# Import the torch library which we'll use for our implementation
import torch
import torch.nn as nn
import torch.nn.functional as F

In [21]:
# Convert the numpy training and testing sets to torch.tensor for us with the PyTorch library
X_train_tensor = torch.from_numpy(X_train).float()
y_train_tensor = torch.from_numpy(y_train).float()
X_test_tensor = torch.from_numpy(X_test).float()
y_test_tensor = torch.from_numpy(y_test).long()

#### 1.1. Support Vector Machine Base Learner

##### 1.1.1 SVM Model Implementation

The support vector machine implementation, FEBRLReproducerSVM, reproduces the results of the support vector machine base learner from the original paper. This model has two initialization parameters:
1. `num_features`: The number of features in the dataset; for the base dataset, this value is 13, but this value can differ if a dataset with fewer or additional features to be used.
1. `inverse_reg`: The hyperparameter that the paper explored via grid search to determine optimal inverse regularization strength for the optimizer.

In [24]:
# CS598 Project Code / Support Vector Machine / Implementation

# For SVM, shift our data from its native range of [0.0, 1.0] to [-1.0, 1.0]
X_train_tensor_svm = (X_train_tensor * 2) - 1
y_train_tensor_svm = (y_train_tensor * 2) - 1
X_test_tensor_svm = (X_test_tensor * 2) - 1
y_test_tensor_svm = (y_test_tensor * 2) - 1

class FEBRLReproducerSVM(nn.Module):
    def __init__(self, num_features, inverse_reg=0.0):
        # Create the our PyTorch support vector machine model
        super(FEBRLReproducerSVM, self).__init__()

        # STEP 1
        # Specify parameters for our PyTorch nn model based upon the analogous
        # parameters used by the original paper's sklearn SVC

        # PyTorch SVM (nn) concept                       Analogous sklearn SVC parameter
        # ------------------------                      -------------------------------
        self.inverse_reg = inverse_reg                  # C (inverse of the regularization strength)
        #                                               # kernel (original paper uses linear, as do we)

        # STEP 2
        # Define the layers for our lr model
        self.num_input_features = num_features
        
        self.fc1 = nn.Linear(in_features=self.num_input_features, out_features=1, bias=False)

        # STEP 3
        # Define the criteria and optimizer
        self.num_max_epochs = 5000
        self.criterion = nn.HingeEmbeddingLoss()
        self.optimizer = torch.optim.SGD(self.parameters(),
            lr = 0.001,
            weight_decay=inverse_reg)

    def forward(self, x):
        # Perform a forward pass on the nn; it is not recommended to call this
        # function directly, but to instead call fit(...) or predict(...) so that model's
        # mode is correctly set automatically
        x = self.fc1(x)

        return torch.squeeze(x)

    def fit(self, X_train, y_train):
        # Train the nn with the specified parameters; analogous to sklearn's
        # SVC.fit(...) method
        self.train()

        loss_previous_epoch = 1.0
        loss_consecutive_epochs_minimal = 0

        for epoch_i in np.arange(self.num_max_epochs):
            loss = None
            kfold = KFold(n_splits=10, shuffle=True, random_state=12345)

            for train_indicies, _ in kfold.split(X_train):
                self.optimizer.zero_grad()
                output = self.forward(X_train[train_indicies])
                output *= -1
                loss = self.criterion(output, y_train[train_indicies])
                loss.backward()
                self.optimizer.step()

            # Determine if criteria for early training termination is satisfied
            if (np.abs(loss_previous_epoch - loss.item())) <= 0.0001:
                loss_consecutive_epochs_minimal = loss_consecutive_epochs_minimal + 1

                if(loss_consecutive_epochs_minimal == 50):
                    break
            else:
                loss_consecutive_epochs_minimal = 0

            loss_previous_epoch = loss.item()

    def predict(self, X_test):
        # Test the nn with the specified parameters; analogous to sklearn's
        # SVC.predict(...) method
        self.eval()
        return self.forward(X_test)

frs_inverse_reg_range = [.001,.002,.005,.01,.02,.05,.1,.2,.5,1,5,10,20,50,100,200,500,1000,2000,5000] 
frs_inverse_reg_optimal = 1000  # Determined through search

##### 1.1.2. Support Vector Machine Hyperparameter Search

Grid search is performed on the set of candidate hyperparameters from the original paper using our model. Some hyperparameters may take a long time to test if they do not cause the model's loss to converge early (or at all).

In [25]:
# CS598 Project Code / Support Vector Machine / Hyperparameter Search

# Perform SVM base learner evaluation using the hyperparameter search range provided by the original paper
for inverse_reg in frs_inverse_reg_range:
    # Create an instance of the SVM
    febrl_reproducer_svm = FEBRLReproducerSVM(num_features=X_train_tensor.shape[1], inverse_reg=inverse_reg)

    # Train the model
    febrl_reproducer_svm.fit(X_train_tensor_svm, y_train_tensor_svm)

    # Test the model
    frs_output = febrl_reproducer_svm.predict(X_test_tensor_svm).detach()

    y_pred = np.asarray([1 if element > 0 else 0 for element in frs_output])

    print("weight_decay = {}: {}".format(inverse_reg, evaluation(y_test, y_pred)))

weight_decay = 0.001: {'no_false': 358153, 'confusion_matrix': [2597, 358151, 2, 2160], 'precision': 0.007198931109805183, 'sensitivity': 0.9992304732589458, 'no_links': 360748, 'F-score': 0.014294875146898145}
weight_decay = 0.002: {'no_false': 358202, 'confusion_matrix': [2597, 358200, 2, 2111], 'precision': 0.007197953419790076, 'sensitivity': 0.9992304732589458, 'no_links': 360797, 'F-score': 0.014292947638388976}
weight_decay = 0.005: {'no_false': 358167, 'confusion_matrix': [2597, 358165, 2, 2146], 'precision': 0.007198651742700173, 'sensitivity': 0.9992304732589458, 'no_links': 360762, 'F-score': 0.014294324377134585}
weight_decay = 0.01: {'no_false': 358147, 'confusion_matrix': [2597, 358145, 2, 2166], 'precision': 0.007199050845202388, 'sensitivity': 0.9992304732589458, 'no_links': 360742, 'F-score': 0.014295111204075511}
weight_decay = 0.02: {'no_false': 358154, 'confusion_matrix': [2597, 358152, 2, 2159], 'precision': 0.007198911154292874, 'sensitivity': 0.9992304732589458, 

  precision = count_true_pos/(count_true_pos+count_false_pos)


##### 1.1.3. Support Vector Machine Training, Evaluation, and Bagging

The reference implementation uses a 10-split k-fold as their bootstrapping technique. We will do the same here to ensure that our implementation is trained with the same data as the reference implementation. After each base learner is trained, it is evaluated with the test data set. After all base learners have evaluated the test data set, their outputs are passed through the bagging classifier.

In [26]:
# CS598 Project Code / Support Vector Machine / Training, Evaluation, and Bagging

# Perform bagging across 10 models
frs_kfold_count = 10
frs_kfold = KFold(n_splits=frs_kfold_count, shuffle=True, random_state=12345)
frs_kfold_i = 0

frs_results = [0] * frs_kfold_count

for train_indicies, _ in frs_kfold.split(X_train):
    # Create an instance of the feed-forward neural network
    febrl_reproducer_svm = FEBRLReproducerSVM(num_features=X_train_tensor.shape[1], inverse_reg=frs_inverse_reg_optimal)

    # Train the model
    febrl_reproducer_svm.fit(X_train_tensor_svm[train_indicies], y_train_tensor_svm[train_indicies])

    # Test the model
    frs_results[frs_kfold_i] = febrl_reproducer_svm.predict(X_test_tensor_svm).detach().numpy()

    # Print the results of the current base learner for convenience
    y_pred = np.asarray([1 if element > 0 else 0 for element in frs_results[frs_kfold_i]])
    print("Execution {}: {}".format(frs_kfold_i, evaluation(y_test, y_pred)))

    frs_kfold_i = frs_kfold_i + 1

frs_bagging_raw_score = np.average(frs_results, axis=0)
frs_bagging_binary_score = np.copy(frs_bagging_raw_score)
frs_bagging_binary_score[frs_bagging_binary_score > 0] = 1
frs_bagging_binary_score[frs_bagging_binary_score <= 0] = 0
frs_bagging_evaluation = evaluation(y_test, frs_bagging_binary_score)
print("SVM bagging: {}".format(frs_bagging_evaluation))

Execution 0: {'no_false': 271, 'confusion_matrix': [2486, 158, 113, 360153], 'precision': 0.9402420574886535, 'sensitivity': 0.9565217391304348, 'no_links': 2644, 'F-score': 0.9483120350944115}
Execution 1: {'no_false': 260, 'confusion_matrix': [2486, 147, 113, 360164], 'precision': 0.9441701481200152, 'sensitivity': 0.9565217391304348, 'no_links': 2633, 'F-score': 0.9503058103975535}
Execution 2: {'no_false': 337, 'confusion_matrix': [2489, 227, 110, 360084], 'precision': 0.916421207658321, 'sensitivity': 0.9576760292420161, 'no_links': 2716, 'F-score': 0.9365945437441204}
Execution 3: {'no_false': 355, 'confusion_matrix': [2490, 246, 109, 360065], 'precision': 0.9100877192982456, 'sensitivity': 0.9580607926125433, 'no_links': 2736, 'F-score': 0.9334582942830365}
Execution 4: {'no_false': 270, 'confusion_matrix': [2486, 157, 113, 360154], 'precision': 0.9405978055240257, 'sensitivity': 0.9565217391304348, 'no_links': 2643, 'F-score': 0.9484929416253338}
Execution 5: {'no_false': 320, 

#### 1.2 Neural Network Base Learner

##### 1.2.1. Neural Network Model Implementation

The neural network implementation, FEBRLReproducerNN, reproduces the results of the neural network base learner from the original paper. This model has two initialization parameters:
1. `num_features`: The number of features in the dataset; for the base dataset, this value is 13, but this value can differ if a dataset with fewer or additional features to be used.
1. `weight_decay`: The hyperparameter that the paper explored via grid search to determine optimal weight decay for the optimizer.

In [27]:
# CS598 Project Code / Neural Network Model / Implementation

class FEBRLReproducerNN(nn.Module):
    def __init__(self, num_features, weight_decay=0.0):
        # Create the our PyTorch nn model
        super(FEBRLReproducerNN, self).__init__()

        # STEP 1
        # Specify parameters for our PyTorch nn model based upon the analogous
        # parameters used by the original paper's sklearn MLPClassifier

        # PyTorch nn concept                            Analogous sklearn MLPClassifier parameter
        # ------------------                            -----------------------------------------
        self.optimizer = None                           # solver (optimizer; original paper uses LBFGS, but we will use SGD (defined later) due to PyTorch-sklearn differences)
        self.optimizer_weight_decay = weight_decay      # alpha (L2 penalty/regularization term)
        self.num_hidden_layer_nodes = 256               # hidden_layer_sizes (tuple of hidden layer nodes)
        self.activation = F.relu                        # activation (activation function)
        self.random_state = 12345                       # random_state (static, random state for reproducibility)
        #                                               # batch_size (minibatch size; unused in our model)
        #                                               # learning_rate (tells the model to use the provided initial learning rate; n/a to our model)
        self.optimizer_learning_rate_init = 0.001       # learning_rate_init (initial learning rate)
        self.optimizer_dampening = 0.5                  # power_t (dampening)
        self.num_max_epochs = 30000                     # max_iter (maximum number of epochs when using stochastic optimizers)
        self.shuffle = True                             # shuffle (shuffle samples in each iteration)
        self.tolerance = 0.0001                         # tol (optimization tolorance; early training termination)
        #                                               # verbose (print model progress debug messages to console; specified by unused by original paper)
        #                                               # warm_start (initialize the model with the results of previous executions; specified but unused by original paper)
        self.optimizer_momentum = 0.9                   # momentum (optimizer momentum)
        self.use_nesterov_momentum = True               # nesterovs_momentum (use Nesterov's momentum in the optimizer)
        #                                               # early_stopping (terminate early when validation is not improving; 'False' in original paper)
        #                                               # validation_fraction (validation data set criteria for early stopping; specified by unused by original paper)
        #                                               # beta_1 (parameter for Adam optimizer; specified but unused by original paper)
        #                                               # beta_2 (parameter for Adam optimizer; specified but unused by original paper)
        #                                               # epsilon (parameter for Adam optimizer; specified but unused by original paper)

        # STEP 2
        # Define the layers for our nn model
        self.num_input_features = num_features

        self.fc1 = nn.Linear(in_features=self.num_input_features, out_features=self.num_hidden_layer_nodes, bias=False)
        self.fc2 = nn.Linear(in_features=self.num_hidden_layer_nodes, out_features=1, bias=False)

        # STEP 3
        # Define the criteria and optimizer
        self.criterion = nn.MSELoss()
        self.optimizer = torch.optim.SGD(self.parameters(),
            lr=self.optimizer_learning_rate_init,
            weight_decay=self.optimizer_weight_decay,
            momentum=self.optimizer_momentum,
            dampening=0,
            nesterov=self.use_nesterov_momentum)

    def forward(self, x):
        # Perform a forward pass on the nn; it is not recommended to call this
        # function directly, but to instead call fit(...) or predict(...) so that model's
        # mode is correctly set automatically
        x = self.activation(self.fc1(x))
        x = self.fc2(x)

        return torch.squeeze(x)

    def fit(self, X_train, y_train):
        # Train the nn with the specified parameters; analogous to sklearn's
        # MLPClassifier.fit(...) method
        self.train()
    
        loss_previous_epoch = 1.0
        loss_consecutive_epochs_minimal = 0

        for epoch_i in np.arange(self.num_max_epochs):
            loss = None
            kfold = KFold(n_splits=10, shuffle=self.shuffle, random_state=self.random_state)

            for train_indicies, _ in kfold.split(X_train):
                self.optimizer.zero_grad()
                output = self.forward(X_train[train_indicies])
                loss = self.criterion(output, y_train[train_indicies])
                loss.backward()
                self.optimizer.step()

            # Determine if criteria for early training termination is satisfied
            if (loss_previous_epoch - loss.item()) <= self.tolerance:
                loss_consecutive_epochs_minimal = loss_consecutive_epochs_minimal + 1

                if(loss_consecutive_epochs_minimal == 50):
                    break
            else:
                loss_consecutive_epochs_minimal = 0

            loss_previous_epoch = loss.item()

    def predict(self, X_test):
        # Test the nn with the specified parameters; analogous to sklearn's
        # MLPClassifier.predict(...) method
        self.eval()
        return self.forward(X_test)

frn_weight_decay_range = [.001,.002,.005,.01,.02,.05,.1,.2,.5,1,5,10,20,50,100,200,500,1000,2000,5000]
frn_weight_decay_optimal = 0.5  # Determined through search

##### 1.2.2. Neural Network Hyperparameter Search

Grid search is performed on the set of candidate hyperparameters from the original paper using our model. Some hyperparameters may take a long time to test if they do not cause the model's loss to converge early (or at all).

In [34]:
# CS598 Project Code / Neural Network Model / Hyperparameter Search

# Perform nn base learner evaluation using the hyperparameter search range provided by the original paper
for weight_decay in frn_weight_decay_range:
    # Create an instance of the feed-forward neural network
    febrl_reproducer_nn = FEBRLReproducerNN(num_features=X_train_tensor.shape[1], weight_decay=weight_decay)

    # Train the model
    febrl_reproducer_nn.fit(X_train_tensor, y_train_tensor)

    # Test the model
    frn_output = febrl_reproducer_nn.predict(X_test_tensor).detach()

    y_pred = np.asarray([1 if element > 0.5 else 0 for element in frn_output])

    print("weight_decay = {}: {}".format(weight_decay, evaluation(y_test, y_pred)))

weight_decay = 0.001: {'no_false': 67060, 'confusion_matrix': [2581, 67042, 18, 293269], 'precision': 0.03707108283182282, 'sensitivity': 0.9930742593305117, 'no_links': 69623, 'F-score': 0.07147406607404946}
weight_decay = 0.002: {'no_false': 67160, 'confusion_matrix': [2579, 67140, 20, 293171], 'precision': 0.03699135099470732, 'sensitivity': 0.9923047325894575, 'no_links': 69719, 'F-score': 0.07132387510716556}
weight_decay = 0.005: {'no_false': 77438, 'confusion_matrix': [2580, 77419, 19, 282892], 'precision': 0.03225040313003912, 'sensitivity': 0.9926894959599846, 'no_links': 79999, 'F-score': 0.062471246277149575}
weight_decay = 0.01: {'no_false': 70490, 'confusion_matrix': [2582, 70473, 17, 289838], 'precision': 0.03534323454931216, 'sensitivity': 0.9934590227010388, 'no_links': 73055, 'F-score': 0.06825812250508895}
weight_decay = 0.02: {'no_false': 70099, 'confusion_matrix': [2580, 70080, 19, 290231], 'precision': 0.03550784475639967, 'sensitivity': 0.9926894959599846, 'no_lin

  precision = count_true_pos/(count_true_pos+count_false_pos)


weight_decay = 5: {'no_false': 2599, 'confusion_matrix': [0, 0, 2599, 360311], 'precision': nan, 'sensitivity': 0.0, 'no_links': 0, 'F-score': nan}
weight_decay = 10: {'no_false': 2599, 'confusion_matrix': [0, 0, 2599, 360311], 'precision': nan, 'sensitivity': 0.0, 'no_links': 0, 'F-score': nan}
weight_decay = 20: {'no_false': 2599, 'confusion_matrix': [0, 0, 2599, 360311], 'precision': nan, 'sensitivity': 0.0, 'no_links': 0, 'F-score': nan}
weight_decay = 50: {'no_false': 2599, 'confusion_matrix': [0, 0, 2599, 360311], 'precision': nan, 'sensitivity': 0.0, 'no_links': 0, 'F-score': nan}
weight_decay = 100: {'no_false': 2599, 'confusion_matrix': [0, 0, 2599, 360311], 'precision': nan, 'sensitivity': 0.0, 'no_links': 0, 'F-score': nan}
weight_decay = 200: {'no_false': 2599, 'confusion_matrix': [0, 0, 2599, 360311], 'precision': nan, 'sensitivity': 0.0, 'no_links': 0, 'F-score': nan}
weight_decay = 500: {'no_false': 2599, 'confusion_matrix': [0, 0, 2599, 360311], 'precision': nan, 'sensi

KeyboardInterrupt: 

##### 1.2.3. Neural Network Training, Evaluation, and Bagging

The reference implementation uses a 10-split k-fold as their bootstrapping technique. We will do the same here to ensure that our implementation is trained with the same data as the reference implementation. After each base learner is trained, it is evaluated with the test data set. After all base learners have evaluated the test data set, their outputs are passed through the bagging classifier.

In [28]:
# CS598 Project Code / Neural Network Model / Training, Evaluation, and Bagging

# Perform bagging across 10 models
frn_kfold_count = 10
frn_kfold = KFold(n_splits=frn_kfold_count, shuffle=True, random_state=12345)
frn_kfold_i = 0

frn_results = [0] * frn_kfold_count

for train_indicies, _ in frn_kfold.split(X_train):
    # Create an instance of the feed-forward neural network
    febrl_reproducer_nn = FEBRLReproducerNN(num_features=X_train_tensor.shape[1], weight_decay=frn_weight_decay_optimal)

    # Train the model
    febrl_reproducer_nn.fit(X_train_tensor[train_indicies], y_train_tensor[train_indicies])

    # Test the model
    frn_results[frn_kfold_i] = febrl_reproducer_nn.predict(X_test_tensor).detach().numpy()

    # Print the results of the current base learner for convenience
    y_pred = np.asarray([1 if element > 0.5 else 0 for element in frn_results[frn_kfold_i]])
    print("Execution {}: {}".format(frn_kfold_i, evaluation(y_test, y_pred)))

    frn_kfold_i = frn_kfold_i + 1

frn_bagging_raw_score = np.average(frn_results, axis=0)
frn_bagging_binary_score = np.copy(frn_bagging_raw_score)
frn_bagging_binary_score[frn_bagging_binary_score > 0.5] = 1
frn_bagging_binary_score[frn_bagging_binary_score <= 0.5] = 0
frn_bagging_evaluation = evaluation(y_test, frn_bagging_binary_score)
print("Neural Network bagging: {}".format(frn_bagging_evaluation))

Execution 0: {'no_false': 327, 'confusion_matrix': [2486, 214, 113, 360097], 'precision': 0.9207407407407407, 'sensitivity': 0.9565217391304348, 'no_links': 2700, 'F-score': 0.9382902434421589}
Execution 1: {'no_false': 281, 'confusion_matrix': [2483, 165, 116, 360146], 'precision': 0.9376888217522659, 'sensitivity': 0.9553674490188534, 'no_links': 2648, 'F-score': 0.9464455879550219}
Execution 2: {'no_false': 312, 'confusion_matrix': [2487, 200, 112, 360111], 'precision': 0.9255675474506885, 'sensitivity': 0.9569065025009619, 'no_links': 2687, 'F-score': 0.9409761634506242}
Execution 3: {'no_false': 333, 'confusion_matrix': [2486, 220, 113, 360091], 'precision': 0.9186991869918699, 'sensitivity': 0.9565217391304348, 'no_links': 2706, 'F-score': 0.9372290292177192}
Execution 4: {'no_false': 296, 'confusion_matrix': [2484, 181, 115, 360130], 'precision': 0.9320825515947467, 'sensitivity': 0.9557522123893806, 'no_links': 2665, 'F-score': 0.9437689969604864}
Execution 5: {'no_false': 356,

#### 1.3. Logistic Regression Base Learner

##### 1.3.1 Logistic Regression Model Implementation

The logistic regression implementation, FEBRLReproducerLR, reproduces the results of the logistic regression base learner from the original paper. This model has two initialization parameters:
1. `num_features`: The number of features in the dataset; for the base dataset, this value is 13, but this value can differ if a dataset with fewer or additional features to be used.
1. `inverse_reg`: The hyperparameter that the paper explored via grid search to determine optimal inverse regularization strength for the optimizer.

In [29]:
# CS598 Project Code / Logistic Regression / Implementation

class FEBRLReproducerLR(nn.Module):
    def __init__(self, num_features, inverse_reg=0.0):
        # Create the our PyTorch logistic regression model
        super(FEBRLReproducerLR, self).__init__()

        # STEP 1
        # Specify parameters for our PyTorch LR model based upon the analogous
        # parameters used by the original paper's sklearn LogisticRegression

        # PyTorch LR (nn) concept                       Analogous sklearn LogisticRegression parameter
        # -----------------------                       ----------------------------------------------
        self.inverse_reg = inverse_reg                  # C (inverse of the regularization strength)
        #                                               # penalty (original paper uses L2)
        #                                               # dual (specifies dual formulation; specified but unused by the original paper)
        self.use_bias = True                            # fit_intercept (specified if bias should be added to decision function; original paper specified this as true)
        #                                               # intercept_scaling (intercept scaling, the original paper specifies this as 1)
        self.num_max_epochs = 5000                      # max_iter (maximum number of epochs when using stochastic optimizers)
        #                                               # multi_class (specifies class of problem; ours is a binary classification problem)
        #                                               # n_jobs (the number of CPU cores used for parallelization; specified but unused by the original paper)
        self.random_state = 12345                       # random_state (static, random state for reproducibility)

        # STEP 2
        # Define the layers for our lr model
        self.num_input_features = num_features

        self.fc1 = nn.Linear(in_features=self.num_input_features, out_features=1, bias=self.use_bias)

        # STEP 3
        # Define the criteria and optimizer
        self.criterion = nn.BCEWithLogitsLoss()
        self.optimizer = torch.optim.SGD(self.parameters(),
            lr = 0.01,
            weight_decay=self.inverse_reg)

    def forward(self, x):
        # Perform a forward pass on the nn; it is not recommended to call this
        # function directly, but to instead call fit(...) or predict(...) so that model's
        # mode is correctly set automatically
        x = self.fc1(x)

        return torch.squeeze(x)

    def fit(self, X_train, y_train):
        # Train the nn with the specified parameters; analogous to sklearn's
        # LogisticRegression.fit(...) method
        self.train()

        loss_previous_epoch = 1.0
        loss_consecutive_epochs_minimal = 0

        for epoch_i in np.arange(self.num_max_epochs):
            loss = None
            kfold = KFold(n_splits=10, shuffle=True, random_state=self.random_state)

            for train_indicies, _ in kfold.split(X_train):
                self.optimizer.zero_grad()
                output = self.forward(X_train[train_indicies])
                loss = self.criterion(output, y_train[train_indicies])
                loss.backward()
                self.optimizer.step()

            # Determine if criteria for early training termination is satisfied
            if (loss_previous_epoch - loss.item()) <= 0.0001:
                loss_consecutive_epochs_minimal = loss_consecutive_epochs_minimal + 1

                if(loss_consecutive_epochs_minimal == 50):
                    break
            else:
                loss_consecutive_epochs_minimal = 0

            loss_previous_epoch = loss.item()

    def predict(self, X_test):
        # Test the nn with the specified parameters; analogous to sklearn's
        # LogisticRegression.predict(...) method
        self.eval()
        return self.forward(X_test)

frl_inverse_reg_range = [.001,.002,.005,.01,.02,.05,.1,.2,.5,1,5,10,20,50,100,200,500,1000,2000,5000] 
frl_inverse_reg_optimal = 0.5  # Determined through search

##### 1.3.2 Logistic Regression Hyperparameter Search

Grid search is performed on the set of candidate hyperparameters from the original paper using our model. Some hyperparameters may take a long time to test if they do not cause the model's loss to converge early (or at all).

In [33]:
# CS598 Project Code / Logistic Regression Model / Hyperparameter Search

# Perform logistic regression base learner evaluation using the hyperparameter search range provided by the original paper
for inverse_reg in frl_inverse_reg_range:
    # Create an instance of the logistic regression model
    febrl_reproducer_lr = FEBRLReproducerLR(num_features=X_train_tensor.shape[1], inverse_reg=inverse_reg)

    # Train the model
    febrl_reproducer_lr.fit(X_train_tensor, y_train_tensor)

    # Test the model
    frl_output = febrl_reproducer_lr.predict(X_test_tensor).detach()

    y_pred = np.asarray([1 if element > 0.5 else 0 for element in frl_output])

    print("inverse_reg = {}: {}".format(inverse_reg, evaluation(y_test, y_pred)))

inverse_reg = 0.001: {'no_false': 1331, 'confusion_matrix': [2503, 1235, 96, 359076], 'precision': 0.6696094168004281, 'sensitivity': 0.9630627164293959, 'no_links': 3738, 'F-score': 0.7899637052232918}
inverse_reg = 0.002: {'no_false': 1413, 'confusion_matrix': [2507, 1321, 92, 358990], 'precision': 0.6549111807732497, 'sensitivity': 0.9646017699115044, 'no_links': 3828, 'F-score': 0.7801462579741714}
inverse_reg = 0.005: {'no_false': 1277, 'confusion_matrix': [2502, 1180, 97, 359131], 'precision': 0.6795219989136339, 'sensitivity': 0.9626779530588688, 'no_links': 3682, 'F-score': 0.7966884254099664}
inverse_reg = 0.01: {'no_false': 1216, 'confusion_matrix': [2498, 1115, 101, 359196], 'precision': 0.6913921948519236, 'sensitivity': 0.9611388995767602, 'no_links': 3613, 'F-score': 0.8042498390212492}
inverse_reg = 0.02: {'no_false': 1209, 'confusion_matrix': [2499, 1109, 100, 359202], 'precision': 0.6926274944567627, 'sensitivity': 0.9615236629472874, 'no_links': 3608, 'F-score': 0.805

  precision = count_true_pos/(count_true_pos+count_false_pos)


inverse_reg = 5: {'no_false': 2599, 'confusion_matrix': [0, 0, 2599, 360311], 'precision': nan, 'sensitivity': 0.0, 'no_links': 0, 'F-score': nan}
inverse_reg = 10: {'no_false': 2599, 'confusion_matrix': [0, 0, 2599, 360311], 'precision': nan, 'sensitivity': 0.0, 'no_links': 0, 'F-score': nan}
inverse_reg = 20: {'no_false': 2599, 'confusion_matrix': [0, 0, 2599, 360311], 'precision': nan, 'sensitivity': 0.0, 'no_links': 0, 'F-score': nan}
inverse_reg = 50: {'no_false': 2599, 'confusion_matrix': [0, 0, 2599, 360311], 'precision': nan, 'sensitivity': 0.0, 'no_links': 0, 'F-score': nan}
inverse_reg = 100: {'no_false': 2599, 'confusion_matrix': [0, 0, 2599, 360311], 'precision': nan, 'sensitivity': 0.0, 'no_links': 0, 'F-score': nan}
inverse_reg = 200: {'no_false': 2599, 'confusion_matrix': [0, 0, 2599, 360311], 'precision': nan, 'sensitivity': 0.0, 'no_links': 0, 'F-score': nan}
inverse_reg = 500: {'no_false': 2599, 'confusion_matrix': [0, 0, 2599, 360311], 'precision': nan, 'sensitivity'

##### 1.3.3 Logistic Regression Training, Evaluation, and Bagging

The reference implementation uses a 10-split k-fold as their bootstrapping technique. We will do the same here to ensure that our implementation is trained with the same data as the reference implementation. After each base learner is trained, it is evaluated with the test data set. After all base learners have evaluated the test data set, their outputs are passed through the bagging classifier.

In [31]:
# CS598 Project Code / Logistic Regression Model / Training, Evaluation, and Bagging

# Perform bagging across 10 models
frl_kfold_count = 10
frl_kfold = KFold(n_splits=frl_kfold_count, shuffle=True, random_state=12345)
frl_kfold_i = 0

frl_results = [0] * frl_kfold_count

for train_indicies, _ in frl_kfold.split(X_train):
    # Create an instance of the feed-forward neural network
    febrl_reproducer_nn = FEBRLReproducerLR(num_features=X_train_tensor.shape[1], inverse_reg=frl_inverse_reg_optimal)

    # Train the model
    febrl_reproducer_nn.fit(X_train_tensor[train_indicies], y_train_tensor[train_indicies])

    # Test the model
    frl_results[frl_kfold_i] = febrl_reproducer_nn.predict(X_test_tensor).detach().numpy()

    # Print the results of the current base learner for convenience
    y_pred = np.asarray([1 if element > 0.5 else 0 for element in frl_results[frl_kfold_i]])
    print("Execution {}: {}".format(frl_kfold_i, evaluation(y_test, y_pred)))

    frl_kfold_i = frl_kfold_i + 1

frl_bagging_raw_score = np.average(frl_results, axis=0)
frl_bagging_binary_score = np.copy(frl_bagging_raw_score)
frl_bagging_binary_score[frl_bagging_binary_score > 0.5] = 1
frl_bagging_binary_score[frl_bagging_binary_score <= 0.5] = 0
frl_bagging_evaluation = evaluation(y_test, frl_bagging_binary_score)
print("Logistic Regression bagging: {}".format(frl_bagging_evaluation))

Execution 0: {'no_false': 243, 'confusion_matrix': [2391, 35, 208, 360276], 'precision': 0.9855729596042869, 'sensitivity': 0.9199692189303579, 'no_links': 2426, 'F-score': 0.9516417910447761}
Execution 1: {'no_false': 237, 'confusion_matrix': [2390, 28, 209, 360283], 'precision': 0.9884201819685691, 'sensitivity': 0.9195844555598307, 'no_links': 2418, 'F-score': 0.9527606139126967}
Execution 2: {'no_false': 247, 'confusion_matrix': [2392, 40, 207, 360271], 'precision': 0.9835526315789473, 'sensitivity': 0.9203539823008849, 'no_links': 2432, 'F-score': 0.9509043927648578}
Execution 3: {'no_false': 253, 'confusion_matrix': [2393, 47, 206, 360264], 'precision': 0.9807377049180328, 'sensitivity': 0.9207387456714121, 'no_links': 2440, 'F-score': 0.9497916253224846}
Execution 4: {'no_false': 246, 'confusion_matrix': [2391, 38, 208, 360273], 'precision': 0.9843557019349527, 'sensitivity': 0.9199692189303579, 'no_links': 2429, 'F-score': 0.951073985680191}
Execution 5: {'no_false': 250, 'conf

### 2. Stacking

The bagged base learners are stacked to produce a the model's final prediction.

In [32]:
# CS598 Project Code / Stacking / Ensemble Prediction

fr_stacking_threshold = 0.99

fr_stacking_binary_score = np.average([frs_bagging_binary_score, frn_bagging_binary_score, frl_bagging_binary_score], axis=0)
fr_stacking_binary_score[fr_stacking_binary_score > fr_stacking_threshold] = 1
fr_stacking_binary_score[fr_stacking_binary_score <= fr_stacking_threshold] = 0
fr_stacking_evaluation = evaluation(y_test, fr_stacking_binary_score)
print("Ensemble bagging-stacking: {}".format(fr_stacking_evaluation))

Ensemble bagging-stacking: {'no_false': 251, 'confusion_matrix': [2388, 40, 211, 360271], 'precision': 0.9835255354200988, 'sensitivity': 0.9188149288187765, 'no_links': 2428, 'F-score': 0.9500696240302366}


### Appendix: Reference Implementation Validation

#### Base Learners

Each of the following cells executes one of the three base learner models from the sklearn library used by the reference implementation with identical parameterization and data set as a sanity check to validate the paper's models' results using the same data set used by the reference implementation above.

In [None]:
# CS598 Project Code / Reference Implementation Validation (Neural Network)

# Sanity check: Create, train, and test a new instance of an sklearn MLPClassifier from scratch using the original
# paper's parameters to confirm that results are reproducible and absent any unexpected/hidden dependencies
nn_validation_model = MLPClassifier(solver='lbfgs', alpha=2000, hidden_layer_sizes=(256, ), 
                              activation = 'relu',random_state=None, batch_size='auto', 
                              learning_rate='constant',  learning_rate_init=0.001, 
                              power_t=0.5, max_iter=10000, shuffle=True, 
                              tol=0.0001, verbose=False, warm_start=False, momentum=0.9, 
                              nesterovs_momentum=True, early_stopping=False, 
                              validation_fraction=0.1, beta_1=0.9, beta_2=0.999, epsilon=1e-08)
nn_validation_model.fit(X_train, y_train)

nn_validation_model_results = classify(nn_validation_model, X_test)

print("Original paper nn: {}".format(evaluation(y_test, nn_validation_model_results)))

In [None]:
# CS598 Project Code / Reference Implementation Validation (Logistic Regression)

# Sanity check: Create, train, and test a new instance of an sklearn LogisticRegression from scratch using the original
# paper's parameters to confirm that results are reproducible and absent any unexpected/hidden dependencies
lg_validation_model = LogisticRegression(C=0.005, penalty = 'l2',class_weight=None, dual=False, fit_intercept=True, 
                                   intercept_scaling=1, max_iter=5000, multi_class='ovr', 
                                   n_jobs=1, random_state=None)
lg_validation_model.fit(X_train, y_train)

lg_validation_model_results = classify(lg_validation_model, X_test)

print("Original paper logistic regression: {}".format(evaluation(y_test, lg_validation_model_results)))

In [None]:
# CS598 Project Code / Reference Implementation Validation (Support Vector Machine)

# Sanity check: Create, train, and test a new instance of an sklearn SVC from scratch using the original
# paper's parameters to confirm that results are reproducible and absent any unexpected/hidden dependencies
svm_validation_model = svm.SVC(C = 0.001, kernel = 'linear')
svm_validation_model.fit(X_train, y_train)

svm_validation_model_results = classify(svm_validation_model, X_test)

print("Original paper support vector machine: {}".format(evaluation(y_test, svm_validation_model_results)))