In [1]:
# 6/16/21 BMNT/KCA Demo
# Based on JingWei's bmnt.ipynb notebook

# (assumption that before we get here, we would have talked about Orchestrator access - 
# running in cloud for performance reasons, discussion about certificate warning, as well as 
# characterization of the Breast Cancer data set)

# We will go through the Computation Orchestrator tool. Right now, as you can see, we use 
# Jupyter Lab as the interface to run Python scripts to get federated computation results from our
# federated data set.

# The current Orchestrator works with Python API - meaning any computation requires calling them using
# our API. The product roadmap has a GUI version coming in the near future.

# This part of the demo will be pretty "technical", in that you'll be executing actual Python code.
# But don't worry, we've prepared the script for you so you don't have to come up with them or even type them,
# and the tasks that we'll be performing are all pretty simple.

# To execute the script in this "notebook", all you do is to make sure that the cursor is in the "cell" that
# we want to execute, and hit "shift-return", and Jupyter Lab kernel will execute the script. 

# We're in the process of "refactoring" the API - meaning we're making it simpler to use, and also
# making it work with a more powerful version of the Orchestrator and the Job Engine (that's the software that
# actually runs all the SAFE Functions). We will be providing plenty of documentation and sample code so that
# the KCA staff member who will be actually executing the queries will know how to make this all work. 
# And of course we will be there to help as well.

# We have set up the demo to simulate the KCA use-case, but using 3 total enclaves: Hospital 1, Hospital 2, and KCA-Arm.
# We split up the Breast Cancer data set into three - the data size is not equal (300, 200, 69), just to 
# simulate what we'll most likely see. 

# On with the Demo!


In [2]:
# We'll start with basic setup - these are commands to the Python kernel to load in the necessary libraries
# used for accessing the Orchestrator API. You don't have to worry about this too much, it's just boilerplate code,
# and it should disappear as we polish the UI.

In [1]:
import pandas as pd
import numpy as np
from sail.data import DataFrameGroup
from sail.core import connect
from sail.algo import FdLogistic

In [2]:
# So in this next step, we are going to have the Orchestrator connect to the Secure Computation Enclaves, 
# and then define the federated computation cluster. 

# (TODO: rename variables so it directly ties to hospital1, hospital2, kca)

# By the way, you can see the connection credentials here, but we'll be moving this out of the script, and
# making the user type in the username/password at connection time, for obvious security reasons. You can of course
# use a password manager for that.

# "workplace" is the temporary file location. That will disappear as well.

In [3]:
# Connect to Secure Computation Enclave
vm1 = connect("23.100.16.62", 7000, "lbart@igr.com", "sailpassword")
vm2 = connect("104.41.147.38", 7000, "lbart@igr.com", "sailpassword")
vm3 = connect("104.41.150.207", 7000, "lbart@igr.com", "sailpassword")

# Set up federated computation cluster 
vms = [vm1, vm2, vm3]

# temporary file location
workplace = "/home/jjj/playground/tmp"

In [4]:
# "vms" contains the GUID (that's short for Globally Unique ID) - basically the identifiers for the 
# Secure Computation Enclaves.

In [5]:
vms

['92443FA3CCE24C189D647717506BC2A4',
 'CAA577ABAF294B98BEB6E6D16D79E985',
 'FFC431D8360241EEADA87CD21C134DB3']

In [8]:
# Next we will set up the actual federated data set. That is, the collection of tables from the three Enclaves.
# Here, the "[0,0,0]" refers to the tables within the dataset. It's just taking the first table from the three
# enclaves (because of zero-index), and putting them in the DataFrameGroup (which is the set of tables - i.e. the
# Federated Data Set). 

# We're omitting in this demo, but normally you'd be able to ask the system about the data sets - e.g. name and description
# of the tables and columns - and be able to reference them by name, not indices. 

In [9]:
dfg = DataFrameGroup(vms, workplace)
dfg.import_data([0,0,0])

In [10]:
# Now we will do some very simple "exploration" of the data. 

# First, we will look at "class distribution" across the hospitals. That is, the percentage of 'Benign' samples,
# as a percentage of all samples. I just want to see how evenly the data sets are distributed. 

In [11]:
# You'll see that there is a little bit of non-IIDness in terms of class distribution
precentage = dfg.label_precentage('diagnosis', 'B', dfg.df)
precentage

[0.5133333333333333, 0.755, 0.7536231884057971, 0.6274165202108963]

In [12]:
# There were four numbers returned. The first three for the three data sets, and the last is the 
# "federated percentage", that is, the percentage of the entire federation. (it's a weighted average, by the way)

In [13]:
# Next, we'll look at the distribution of the values of the data in each column. 
# We're just going to look at the per-hospital "mean" of each column just to see if there's anything unusual.

In [14]:
mean_arr = dfg.df_mean(dfg.df)
pd.DataFrame(mean_arr)

Unnamed: 0,id,radius_mean,texture_mean,perimeter_mean,area_mean,smoothness_mean,compactness_mean,concavity_mean,concave points_mean,symmetry_mean,...,radius_worst,texture_worst,perimeter_worst,area_worst,smoothness_worst,compactness_worst,concavity_worst,concave points_worst,symmetry_worst,fractal_dimension_worst
0,26229790.0,14.442953,19.319867,94.2509,683.729333,0.09798,0.110838,0.099588,0.054394,0.184987,...,16.83421,25.9028,111.0952,940.108,0.135359,0.27545,0.299886,0.124679,0.300126,0.085716
1,42231540.0,13.896085,18.736,90.1402,631.518,0.092976,0.093612,0.075471,0.042023,0.175944,...,15.803635,24.91705,104.003,829.846,0.126892,0.227636,0.241222,0.102931,0.28034,0.080871
2,14004730.0,13.425014,20.763043,87.348841,597.23913,0.099128,0.107191,0.080527,0.045103,0.179654,...,15.162014,26.899855,100.035797,768.843478,0.135241,0.239342,0.241523,0.104651,0.274594,0.085163
3,30371830.0,14.127292,19.289649,91.969033,654.889104,0.09636,0.104341,0.088799,0.048919,0.181162,...,16.26919,25.677223,107.261213,880.583128,0.132369,0.254265,0.272188,0.114606,0.290076,0.083946


In [15]:
# Of course, to do this "correctly", you can't just look at the mean and say everything is fine - 
# you'd want to look at the number of samples, look at variance, maybe even run some statistical tests
# to see if they are IID, stuff like that. 

# We'll be expanding this "exploration" use-case to include more visual explorations as well - things like 
# histograms and correlation tables or heat maps and such. 

# Also, in this demo, we are omitting Cohort Selection (i.e. filter by gender or admission dates or any other criteria), 
# and also any data preprocessing (e.g. flattening of longitudinal data, imputing missing values, detecting
# and removing outliers, or normalization/standardization that's often needed in certain statistical computations).
# We'll be supporting those use-cases in the production system. 

# As this is a per-hospital aggregation result, as we discussed, we shouldn't be showing results if 
# the number of samples is very low. It's not a problem here, but when we do cohorts, we don't want to be
# showing a mean value of a data set consisting of one subject!  We're working on what we are calling the 
# "Privacy Sentinel" - which will detect such potential privacy-leaking computation results, and either flag or 
# prevent such computation. Rules, including the threshold number of minimum sample size, will be configurable in 
# the system. Something that we'll discuss in the future in terms of details on how we want to set that up.

In [16]:
# Any questions on data exploration phase?

# So again, what we omitted today were:
#   Visualization, cohort selection, longitudinal data processing, other data cleaning and pre-processing including
#   normalization/standardization/mapping/transformation/etc, use of multiple tables, selection of columns to use

# Ok, so now, on to statistical test!

In [17]:
# So what we are going to do is to run a t-test, to see how the Malignant and Benign tumors are characterized
# by the data set. 
# Now, I know this isn't the best test to run here, but we wanted to run a simple test against this limited data set.
# So ignore the fact that this setup of the scientific inquiry is a classic case of p-value hacking!

In [18]:
# First we're just going to set up the data set so that we split up the Benign and Malignant samples.
# And then we'll run the Federated t-test.

In [19]:
b_df = dfg.df_select('diagnosis', 'B', dfg.df)
b_df_no_diag = dfg.drop(['diagnosis','diagnosis','diagnosis'], [1,1,1], b_df)
m_df = dfg.df_select('diagnosis', 'M', dfg.df)
m_df_no_diag = dfg.drop(['diagnosis','diagnosis','diagnosis'], [1,1,1], m_df)
tt_result = dfg.ttest_ind(b_df_no_diag, m_df_no_diag)
tt_result

(array([ -0.94945105, -25.48762285, -10.88607835, -26.45930869,
        -23.9911101 ,  -9.16195118, -17.73350088, -23.15059758,
        -29.41628155,  -8.3534167 ,   0.30626599, -16.43275127,
          0.19805554, -15.96984312, -15.6457782 ,   1.60214185,
         -7.31032795,  -6.25614616, -10.6610848 ,   0.1555981 ,
         -1.86534295, -29.40101867, -12.25246905, -30.02905861,
        -25.77968773, -11.08669353, -17.48110286, -20.93651786,
        -31.11302668, -10.9244796 ,  -8.16739497]),
 array([1.71397881e-001, 2.28554211e-096, 1.70986526e-025, 2.22091039e-101,
        1.26685305e-088, 4.62784167e-019, 1.31779585e-056, 2.85786047e-084,
        1.72387627e-116, 2.55825118e-016, 3.79757277e-001, 3.24291631e-050,
        4.21536297e-001, 5.56740067e-048, 1.97675845e-046, 5.48405852e-002,
        4.55845018e-013, 3.89020133e-010, 1.30067726e-024, 4.38202599e-001,
        3.13248054e-002, 2.05962158e-116, 4.37180719e-031, 1.38333014e-119,
        7.09000890e-098, 2.73752963e-026, 2.

In [20]:
tt_result_mono = dfg.ttest_ind_mono(b_df_no_diag, m_df_no_diag)
tt_result_mono

[[array([3.76408575e-01, 1.62492217e-45, 2.14872862e-19, 8.31833771e-48,
         6.46025624e-40, 5.84995816e-10, 2.63355755e-28, 3.35185652e-34,
         5.80571651e-52, 4.78364914e-08, 4.85825145e-01, 1.61804039e-21,
         2.86830402e-01, 4.38279479e-21, 1.48739378e-21, 5.80228217e-02,
         5.15329114e-06, 2.51454934e-03, 7.11783890e-09, 1.94603540e-01,
         2.85706389e-01, 3.22998774e-57, 3.07226678e-22, 7.05147308e-59,
         4.85971357e-47, 2.23680556e-15, 9.69473973e-30, 1.12269872e-33,
         8.69734087e-63, 1.20764436e-14, 3.09842477e-10]),
  array([ -0.31521748, -16.85006577,  -9.5698641 , -17.4591834 ,
         -15.35781645,  -6.28378501, -12.18996945, -13.81789851,
         -18.5657272 ,  -5.46946799,  -0.03556844, -10.21991504,
           0.5632903 , -10.08906542, -10.23093453,   1.57617847,
          -4.48747688,  -2.82618055,  -5.83210326,   0.86231593,
          -0.56659953, -19.9725185 , -10.43637941, -20.41999415,
         -17.25534771,  -8.27077478, -12

In [180]:
# So the result of the t-test is in two parts: the statistic and p-value.
# The first array contains the statistic, and the second array contains the p-values.

# Let's reformat that into a table with headers. We're using a lot of code to do the reformatting here, 
# but we'll be improving these federated function calls so that they return results that are easier for the user
# to parse.

In [181]:
df_new = dfg.drop(['diagnosis','diagnosis','diagnosis'], [1,1,1], dfg.df)
#h1_df_no_diag = df_new[0]
#h2_df_no_diag = df_new[1]
#kca_df_no_diag = df_new[2]
df_new

['D25896F6708B42D58694B25BD6D07957399B892847A743B3912272E4FA8CD060',
 '7BC1C7758FC446128B2A2DE7CFA4C489399B892847A743B3912272E4FA8CD060',
 '18E3E308502B421C9E8192A27A12875A399B892847A743B3912272E4FA8CD060']

In [182]:
col_name = dfg.df_col_name(df_new)[0]
tt_result_df = pd.DataFrame(columns = col_name)
tt_result_df = tt_result_df.append(pd.DataFrame(tt_result[0].reshape(1,-1), columns = col_name), ignore_index = True)
tt_result_df = tt_result_df.append(pd.DataFrame(tt_result[1].reshape(1,-1), columns = col_name), ignore_index = True)
tt_result_df.rename(index={0:'statistic'},inplace=True)
tt_result_df.rename(index={1:'pvalue'},inplace=True)
tt_result_df

Unnamed: 0,id,radius_mean,texture_mean,perimeter_mean,area_mean,smoothness_mean,compactness_mean,concavity_mean,concave points_mean,symmetry_mean,...,radius_worst,texture_worst,perimeter_worst,area_worst,smoothness_worst,compactness_worst,concavity_worst,concave points_worst,symmetry_worst,fractal_dimension_worst
statistic,-0.949451,-25.48762,-10.88608,-26.45931,-23.99111,-9.161951,-17.7335,-23.1506,-29.41628,-8.353417,...,-29.40102,-12.25247,-30.02906,-25.77969,-11.08669,-17.4811,-20.93652,-31.11303,-10.92448,-8.167395
pvalue,0.171398,2.285542e-96,1.709865e-25,2.22091e-101,1.266853e-88,4.627841999999999e-19,1.317796e-56,2.8578600000000002e-84,1.723876e-116,2.558251e-16,...,2.059622e-116,4.3718070000000005e-31,1.3833300000000002e-119,7.090009e-98,2.7375299999999997e-26,2.3528920000000003e-55,7.736438e-73,5.02376e-125,1.206099e-25,1.028157e-15


In [183]:
# So here you see that the p-values of the test are tiny! 10^ -100?? 
# Is that really true? Did we messs something up? 

# You might realize at this point that we just used the Student's t-test, and one of the assumptions is that
# the variance of the data points has to be similar across the two groups being tested. So let's just make sure 
# that assumption was met.

# We'll run Federated Variance for all the columns.

In [184]:
variance = []
variance.append(dfg.df_var(b_df_no_diag))
variance.append(dfg.df_var(m_df_no_diag))
variance = pd.DataFrame(variance)
variance

Unnamed: 0,id,radius_mean,texture_mean,perimeter_mean,area_mean,smoothness_mean,compactness_mean,concavity_mean,concave points_mean,symmetry_mean,...,radius_worst,texture_worst,perimeter_worst,area_worst,smoothness_worst,compactness_worst,concavity_worst,concave points_worst,symmetry_worst,fractal_dimension_worst
0,1.362815e+16,3.170222,15.961021,139.415582,18033.0301,0.000181,0.001139,0.001887,0.000253,0.000615,...,3.925817,30.183536,182.982188,26765.425899,0.000401,0.008497,0.019703,0.001281,0.001743,0.000191
1,1.901546e+16,10.265431,14.284393,477.62587,135378.355365,0.000159,0.002915,0.005628,0.001182,0.000764,...,18.348967,29.537095,867.718099,357565.42185,0.000478,0.029027,0.032945,0.002144,0.005578,0.000465


In [185]:
# comparable enough, but some look pretty different. Maybe we can understand more by taking the log difference in variance. 

In [186]:
# so the variance for some of the columns are pretty different across the cohorts

# to see the log difference of the variance among cohorts:
# (base 2... so not quite a whole magnitude difference in variance,
var_diff = np.abs(np.log(variance)[0:1].reset_index() - np.log(variance)[1:2].reset_index())
var_diff

Unnamed: 0,index,id,radius_mean,texture_mean,perimeter_mean,area_mean,smoothness_mean,compactness_mean,concavity_mean,concave points_mean,...,radius_worst,texture_worst,perimeter_worst,area_worst,smoothness_worst,compactness_worst,concavity_worst,concave points_worst,symmetry_worst,fractal_dimension_worst
0,1,0.333115,1.17498,0.110982,1.231368,2.015868,0.128674,0.939547,1.092631,1.540854,...,1.541999,0.02165,1.556478,2.592207,0.177406,1.228482,0.514054,0.514871,1.163409,0.891096


In [187]:
# Is it comparable enough? So at this point, the researcher may decide to run a different flavor of t-test,
# for example, Welch's un-equal variance t-test. But we'll just stop this path of inquiry here.

# And we are now going to move on to something a little different - i.e. Federated Learning.
# "Learning" in the sense that we will be actually training a model (you can say it's machine learning)
# for classifying tumors. In this scenario, it would be to create a computational model, which takes in the 
# features obtained from single cell images from fine needle biopsy.

In [188]:
# We'll start with some setup. We'll be splitting up a training set, and test set.
# The training set will be used to train the model, and then the test set will be used to 
# evaluate the model - we don't want to evaluate the model with the samples it has seen during training
# because that's cheating; instead we will hold out some samples to be used exclusively for evaluation.

In [189]:
df_tmp = dfg.bin_cate(dfg.df, 'diagnosis')
y_df = dfg.get_col(['diagnosis','diagnosis','diagnosis'], df_tmp)
X_df = dfg.drop(['diagnosis','diagnosis','diagnosis'], [1,1,1], df_tmp)
y_arr = dfg.to_numpy(y_df)
X_arr = dfg.to_numpy(X_df)
X_train = dfg.to_tensor(X_arr, 1)
y_train = dfg.to_tensor(y_arr, 0)

In [151]:
data = {}
data['X_train']=[X_train[0], X_train[1]]
data['y_train']=[y_train[0], y_train[1]]
X_test = X_train[2]
y_test = y_train[2]

In [152]:
# We'll set up the Logistic Regression function.
# There's a lot of code here - and again, we are improving the API to make it more user-friendly, so by the time
# we get to production, we would be using far less code to run simple models like Logistic Regression.

In [153]:
fdl = FdLogistic([vm1, vm2], vm3, data, workplace)
num_feature = dfg.df_get_feature_num(X_df[0])
model = fdl.initmodel(num_feature)

In [154]:
import torch.nn as nn
import torch
class LogisticRegression(nn.Module):
    def __init__(self, n_input_features):
        super(LogisticRegression, self).__init__()
        self.linear = nn.Linear(n_input_features, 1)
    
    def forward(self,x):
        y_predict = torch.sigmoid(self.linear(x))
        return y_predict

In [155]:
# Now the the setup is complete, one thing to note is that we are using a framework called PyTorch
# to run this Logistic Regression. It's a neural network framework used by Facebook to do very advanced
# machine learning - they use this for all the face or image recoginition, for example. We'll also be using 
# established and proven statistical libraries to make sure that the computation results are reliable.


In [156]:
# so now, let's Train the model. Since we are doing Federated Learning, it'll take longer than 
# it would for a "pooled data" model, in which one runs the algorithm on a single computer with all the data.

# We'll set it up to do 100 iterations - that means the Secure Computation Enclaves will run their "local" computations,
# and coordinate with the aggregator node up to 100 times. 

In [157]:
model = fdl.train(model, num_feature, 100)

In [158]:
# and now that it's done, let's evaluate the model

In [159]:
acc = fdl.test(model, num_feature, X_test, y_test)
acc

0.9275362491607666

In [160]:
# So 94% accuracy! What that means is that we tested the model using the held out data - and used the model to predict
# whether a sample was Benign or Malignant. And it got the prediction correct 91% of the time. 

# I'd say that's pretty good. Perhaps we can make the model perform even better by "tuning the hyperparameters" 
# things like "learning rate", "maximum number of iterations", "convergence criteria", etc.
# But we'll omit all that - we'll keep this demo short for now. And we'll be showing a more sophisticated use-case 
# in what we're calling the "KCA Stakeholder Demo" to come at a later date, for the PIs and also Pharma, non-member 
# hospitals, and potentially even donors.