In [19]:
import pandas as pd
import numpy as np
from sail.data import DataFrameGroup
from sail.core import connect
from sail.algo import FdLogistic

In [3]:
# notebook for preparing for BMNT/KCA Federated Mean demo: 
# https://secureailabs.atlassian.net/wiki/spaces/SAILConfluenceHome/pages/1736048641/KCA%2BSummer%2BTrial%2BFor%2BBMNT?focusedCommentId=1748303897#comment-1748303897 

# data from https://www.kaggle.com/uciml/breast-cancer-wisconsin-data

# What this notebook does:
# 1) clean the data
#    - no need for imputation or outlier removal, so simply remove the unneeded last column 
# 2) split the data set into 3 sets: hospital1, hospital2, kca
#    - The three data sets are not uniform in size. 569 = 300 + 200 + 69 subjects each
#    - The three data sets are split totally non-randomly; hospita1 contains the first 500 subjects, hospital2 the next 200, kca the rest
# 3) for fun, there's some exploration of the data post splitting (mean comparisons, t-test across data sets, t-test across diagnosis)

In [4]:
vm1 = connect("40.121.60.102", 7000, "lbart@igr.com", "sailpassword")
vm2 = connect("40.121.63.225", 7000, "lbart@igr.com", "sailpassword")
vm3 = connect("40.121.47.115", 7000, "lbart@igr.com", "sailpassword")
vms = [vm1, vm2, vm3]
workplace = "/home/jjj/playground/tmp/"

In [5]:
vms

['406874BC016F4CC9BB0F229CBBFD5469',
 '445378323A9A4FF493365FD97FA35A40',
 '5EA32D13A1C145B693154B97345FFE88']

In [6]:
dfg = DataFrameGroup(vms, workplace)
dfg.import_data([0,0,0])

In [7]:
# some exploration on distribution of data

# Let's see if the classes (i.e. diagnosis) are distributed evenly across data sets

# You'll see that there is a little bit of non-IIDness in terms of class distribution
precentage = dfg.label_precentage('diagnosis', 'B', dfg.df)
precentage

[0.5133333333333333, 0.755, 0.7536231884057971, 0.6274165202108963]

In [8]:
# let's look at the mean() of each column for each data set

# you can see that there's a bit (not much) of discrepancy of the mean across the data sets.
mean_arr = dfg.df_mean(dfg.df)
pd.DataFrame(mean_arr)

Unnamed: 0,id,radius_mean,texture_mean,perimeter_mean,area_mean,smoothness_mean,compactness_mean,concavity_mean,concave points_mean,symmetry_mean,...,radius_worst,texture_worst,perimeter_worst,area_worst,smoothness_worst,compactness_worst,concavity_worst,concave points_worst,symmetry_worst,fractal_dimension_worst
0,26229790.0,14.442953,19.319867,94.2509,683.729333,0.09798,0.110838,0.099588,0.054394,0.184987,...,16.83421,25.9028,111.0952,940.108,0.135359,0.27545,0.299886,0.124679,0.300126,0.085716
1,42231540.0,13.896085,18.736,90.1402,631.518,0.092976,0.093612,0.075471,0.042023,0.175944,...,15.803635,24.91705,104.003,829.846,0.126892,0.227636,0.241222,0.102931,0.28034,0.080871
2,14004730.0,13.425014,20.763043,87.348841,597.23913,0.099128,0.107191,0.080527,0.045103,0.179654,...,15.162014,26.899855,100.035797,768.843478,0.135241,0.239342,0.241523,0.104651,0.274594,0.085163
3,30371830.0,14.127292,19.289649,91.969033,654.889104,0.09636,0.104341,0.088799,0.048919,0.181162,...,16.26919,25.677223,107.261213,880.583128,0.132369,0.254265,0.272188,0.114606,0.290076,0.083946


In [9]:
df_new = dfg.drop(['diagnosis','diagnosis','diagnosis'], [1,1,1], dfg.df)
#h1_df_no_diag = df_new[0]
#h2_df_no_diag = df_new[1]
#kca_df_no_diag = df_new[2]
df_new

['01CF2B4FC7A24D13B7E7EBAF95FC67C2515D67064F4F46619BCFBB6E6869AC92',
 '685008C7E84B45EFB68E4E2E43411C28515D67064F4F46619BCFBB6E6869AC92',
 '4761D3D780F041CD9BE9FE9324B89D4C515D67064F4F46619BCFBB6E6869AC92']

In [10]:

# let's run a t-test!

# Will use t-test from scipy
#   https://docs.scipy.org/doc/scipy/reference/generated/scipy.stats.ttest_ind.html
# The return value is t-statistic, and p-value. The default is 2-sided test.
# Test assumes same variance (which I didn't check for), but seems like data sets are surely statistically different
# (i.e. non-IID) based on the way we split the data.

# Note I'm removing the diagnosis label temporarily to run the test, since it's just the label so it can't compute on it.

# Since there are three parties, the t-test is run 3 ways.
ttest_res = dfg.ttest_ind_mutual(df_new)
ttest_res

[id                        -1.324923
 radius_mean                1.729867
 texture_mean               1.540833
 perimeter_mean             1.886071
 area_mean                  1.643169
 smoothness_mean            4.080905
 compactness_mean           3.603032
 concavity_mean             3.331270
 concave points_mean        3.532788
 symmetry_mean              3.623981
 fractal_dimension_mean     1.715715
 radius_se                  2.639431
 texture_se                 0.589613
 perimeter_se               2.317933
 area_se                    1.795681
 smoothness_se              1.410884
 compactness_se             2.487478
 concavity_se               1.923935
 concave points_se          2.956684
 symmetry_se                1.487036
 fractal_dimension_se       2.088755
 radius_worst               2.364189
 texture_worst              1.788369
 perimeter_worst            2.347272
 area_worst                 2.126722
 smoothness_worst           4.152631
 compactness_worst          3.330422
 

In [11]:
# some of the p-values are pretty small, indicating they look like they come from distinct classes
# (indicating that the ordering is not really random)

# Actually something more interesting might be doing a t-test between the two classes
b_df = dfg.df_select('diagnosis', 'B', dfg.df)
b_df_no_diag = dfg.drop(['diagnosis','diagnosis','diagnosis'], [1,1,1], b_df)
m_df = dfg.df_select('diagnosis', 'M', dfg.df)
m_df_no_diag = dfg.drop(['diagnosis','diagnosis','diagnosis'], [1,1,1], m_df)
tt_result = dfg.ttest_ind(b_df_no_diag, m_df_no_diag)
tt_result

(array([ -0.94945105, -25.48762285, -10.88607835, -26.45930869,
        -23.9911101 ,  -9.16195118, -17.73350088, -23.15059758,
        -29.41628155,  -8.3534167 ,   0.30626599, -16.43275127,
          0.19805554, -15.96984312, -15.6457782 ,   1.60214185,
         -7.31032795,  -6.25614616, -10.6610848 ,   0.1555981 ,
         -1.86534295, -29.40101867, -12.25246905, -30.02905861,
        -25.77968773, -11.08669353, -17.48110286, -20.93651786,
        -31.11302668, -10.9244796 ,  -8.16739497]),
 array([1.71397881e-001, 2.28554211e-096, 1.70986526e-025, 2.22091039e-101,
        1.26685305e-088, 4.62784167e-019, 1.31779585e-056, 2.85786047e-084,
        1.72387627e-116, 2.55825118e-016, 3.79757277e-001, 3.24291631e-050,
        4.21536297e-001, 5.56740067e-048, 1.97675845e-046, 5.48405852e-002,
        4.55845018e-013, 3.89020133e-010, 1.30067726e-024, 4.38202599e-001,
        3.13248054e-002, 2.05962158e-116, 4.37180719e-031, 1.38333014e-119,
        7.09000890e-098, 2.73752963e-026, 2.

In [12]:
# let's reformat into a table with headers
col_name = dfg.df_col_name(df_new)[0]
tt_result_df = pd.DataFrame(columns = col_name)
tt_result_df = tt_result_df.append(pd.DataFrame(tt_result[0].reshape(1,-1), columns = col_name), ignore_index = True)
tt_result_df = tt_result_df.append(pd.DataFrame(tt_result[1].reshape(1,-1), columns = col_name), ignore_index = True)
tt_result_df.rename(index={0:'statistic'},inplace=True)
tt_result_df.rename(index={1:'pvalue'},inplace=True)
tt_result_df

Unnamed: 0,id,radius_mean,texture_mean,perimeter_mean,area_mean,smoothness_mean,compactness_mean,concavity_mean,concave points_mean,symmetry_mean,...,radius_worst,texture_worst,perimeter_worst,area_worst,smoothness_worst,compactness_worst,concavity_worst,concave points_worst,symmetry_worst,fractal_dimension_worst
statistic,-0.949451,-25.48762,-10.88608,-26.45931,-23.99111,-9.161951,-17.7335,-23.1506,-29.41628,-8.353417,...,-29.40102,-12.25247,-30.02906,-25.77969,-11.08669,-17.4811,-20.93652,-31.11303,-10.92448,-8.167395
pvalue,0.171398,2.285542e-96,1.709865e-25,2.22091e-101,1.266853e-88,4.627841999999999e-19,1.317796e-56,2.8578600000000002e-84,1.723876e-116,2.558251e-16,...,2.059622e-116,4.3718070000000005e-31,1.3833300000000002e-119,7.090009e-98,2.7375299999999997e-26,2.3528920000000003e-55,7.736438e-73,5.02376e-125,1.206099e-25,1.028157e-15


In [13]:
# the p-values are tiny! so it seems like t-test is successful?!
# but remember that variance matters - one of the t-test assumptions is that the variance across the cohorts are
# the same. so let's test for that
variance = []
variance.append(dfg.df_var(b_df_no_diag))
variance.append(dfg.df_var(m_df_no_diag))
variance = pd.DataFrame(variance)
variance

Unnamed: 0,id,radius_mean,texture_mean,perimeter_mean,area_mean,smoothness_mean,compactness_mean,concavity_mean,concave points_mean,symmetry_mean,...,radius_worst,texture_worst,perimeter_worst,area_worst,smoothness_worst,compactness_worst,concavity_worst,concave points_worst,symmetry_worst,fractal_dimension_worst
0,1.362815e+16,3.170222,15.961021,139.415582,18033.0301,0.000181,0.001139,0.001887,0.000253,0.000615,...,3.925817,30.183536,182.982188,26765.425899,0.000401,0.008497,0.019703,0.001281,0.001743,0.000191
1,1.901546e+16,10.265431,14.284393,477.62587,135378.355365,0.000159,0.002915,0.005628,0.001182,0.000764,...,18.348967,29.537095,867.718099,357565.42185,0.000478,0.029027,0.032945,0.002144,0.005578,0.000465


In [14]:
# so the variance for some of the columns are pretty different across the cohorts

# to see the log difference of the variance among cohorts:
# (base 2... so not quite a whole magnitude difference in variance,
var_diff = np.abs(np.log(variance)[0:1].reset_index() - np.log(variance)[1:2].reset_index())
var_diff

Unnamed: 0,index,id,radius_mean,texture_mean,perimeter_mean,area_mean,smoothness_mean,compactness_mean,concavity_mean,concave points_mean,...,radius_worst,texture_worst,perimeter_worst,area_worst,smoothness_worst,compactness_worst,concavity_worst,concave points_worst,symmetry_worst,fractal_dimension_worst
0,1,0.333115,1.17498,0.110982,1.231368,2.015868,0.128674,0.939547,1.092631,1.540854,...,1.541999,0.02165,1.556478,2.592207,0.177406,1.228482,0.514054,0.514871,1.163409,0.891096


In [15]:
df_tmp = dfg.bin_cate(dfg.df, 'diagnosis')
y_df = dfg.get_col(['diagnosis','diagnosis','diagnosis'], df_tmp)
X_df = dfg.drop(['diagnosis','diagnosis','diagnosis'], [1,1,1], df_tmp)
y_arr = dfg.to_numpy(y_df)
X_arr = dfg.to_numpy(X_df)
X_train = dfg.to_tensor(X_arr, 1)
y_train = dfg.to_tensor(y_arr, 0)

In [16]:
data = {}
data['X_train']=[X_train[0], X_train[1]]
data['y_train']=[y_train[0], y_train[1]]
X_test = X_train[2]
y_test = y_train[2]

In [20]:
fdl = FdLogistic([vm1, vm2], vm3, data, workplace)
num_feature = dfg.df_get_feature_num(X_df[0])
model = fdl.initmodel(num_feature)

In [None]:
import torch.nn as nn
import torch
class LogisticRegression(nn.Module):
    def __init__(self, n_input_features):
        super(LogisticRegression, self).__init__()
        self.linear = nn.Linear(n_input_features, 1)
    
    def forward(self,x):
        y_predict = torch.sigmoid(self.linear(x))
        return y_predict

In [21]:
model = fdl.train(model, num_feature, 100)

AttributeError: Can't get attribute 'LogisticRegression' on <module '__main__'>

In [None]:
acc = fdl.test(model, num_feature, X_test, y_test)
acc