In [80]:
import pandas as pd
import numpy as np
import nest_asyncio
import sklearn
import tensorflow as tf
import tensorflow_federated as tff
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from collections import defaultdict, OrderedDict

nest_asyncio.apply()
np.random.seed(10)
tf.random.set_seed(10)
tff.framework.set_default_context(tff.backends.native.create_thread_debugging_execution_context(clients_per_thread=50))

# Data initialization


In [81]:
def download_file(url, save_folder):
    '''If the file isn't already in the save folder, download it.'''
    local_filename = url.split('/')[-1]  #Get the name of the file being downloaded

    # if the file exists, don't download it again
    if os.path.isfile(os.path.join(save_folder, local_filename)): return os.path.join(save_folder, local_filename)

    # NOTE the stream=True parameter
    r = requests.get(url, stream=True)
    with open(os.path.join(save_folder, local_filename), 'wb') as f:
        for chunk in r.iter_content(chunk_size=1024):
            if chunk:  # filter out keep-alive new chunks
                f.write(chunk)
    return os.path.join(save_folder, local_filename)  #Returns the path pointing to the local file
cleveland = download_file(
        "https://archive.ics.uci.edu/ml/machine-learning-databases/heart-disease/processed.cleveland.data", os.getcwd())
switzerland = download_file(
        "https://archive.ics.uci.edu/ml/machine-learning-databases/heart-disease/processed.switzerland.data",
        os.getcwd())
va = download_file("https://archive.ics.uci.edu/ml/machine-learning-databases/heart-disease/processed.va.data",
                       os.getcwd())
hungarian = download_file(
        "https://archive.ics.uci.edu/ml/machine-learning-databases/heart-disease/reprocessed.hungarian.data", os.getcwd())


# Set column names and nan values

In [242]:
def import_data(cleveland, switzerland, va, hungarian):
    # Step 2: load the data into pandas DataFrames and then merge the four files into a single dataframe
    cleveland_df = pd.read_csv(cleveland, header=None, na_values =["?", -9.0])
    switzerland_df = pd.read_csv(switzerland, header=None, na_values =["?", -9.0])
    va_df = pd.read_csv(va, header=None, na_values =["?", -9.0])
    hungarian_df = pd.read_csv(hungarian, sep=" ", header=None, na_values =["?", -9.0])

    # add headers to the data frames
    headers = {0 : "Age",
            1 : "Sex", # 1 = male; 0 = female
            2 : "ChestPainType",  # chest pain type, 
                        # Value 1: typical angina, 
                        # Value 2: atypical angina, 
                        # Value 3: non-anginal pain
                        # Value 4: asymptomatic
            3 : "RestingBP", # resting blood pressure 
                            #(in mm Hg on admission to the hospital)
            4 : "Cholesterol", # serum cholestoral in mg/dl
            5 : "FastingBS", # fasting blood sugar > 120 mg/dl 
                        # (1 = true; 0 = false)
            6 : "RestingECG",#  resting electrocardiographic results
                            # Value 0: normal
                            # Value 1: having ST-T wave abnormality 
                            # (T wave inversions and/or ST elevation or depression of > 0.05 mV)
                            # Value 2: showing probable or definite left ventricular hypertrophy by Estes' criteria
            7 : "MaxHR", #maximum heart rate achieved
            8 : "ExerciseAngina",# exercise induced angina (1 = yes; 0 = no)
            9 : "Oldpeak", # ST depression induced by exercise relative to rest
            10 : "ST_Slope", # the slope of the peak exercise ST segment
                            # Value 1: upsloping
                            # Value 2: flat
                            # Value 3: downsloping
            11 : "ca", # number of major vessels (0-3) colored by flourosopy    
            12 : "thal", # Value 3: normal
                            # Value 6: fixed defect
                            # Value 7: reversable defect
            13 : "HeartDisease" # diagnosis of heart disease (angiographic disease status)
                                # Value 0 = < 50% diameter narrowing,
                                # Value 1 = > 50% diameter narrowing 
                                # It takes 5 levels based on angiographic disease status.
                                # 0-Healthy, 1-diagnosed with stage 1, 2-diagnosed with stage 2, 
                                # 3-diagnosed with stage 3, 4-diagnosed with stage 4.
            }
    cleveland_df = cleveland_df.rename(columns=headers)
    switzerland_df = switzerland_df.rename(columns=headers)
    va_df = va_df.rename(columns=headers)
    hungarian_df = hungarian_df.rename(columns=headers)
    cleveland_df['Location'] = 'Cleveland'
    switzerland_df['Location'] = 'Switzerland'
    va_df['Location']='VA'
    hungarian_df['Location']='Hungarian'
    hungarian_df = hungarian_df[:-1]
    
    return cleveland_df, switzerland_df, va_df, hungarian_df

    
    
    binary_feature_names = ['Sex', 'FastingBS', 'ExerciseAngina']
    categorical_feature_names = ['ChestPainType', 'RestingECG', 'ST_Slope']
    numeric_feature_names = ['Age', 'MaxHR', 'RestingBP',  'Cholesterol', 'Oldpeak']


# # make a binary target variable
# cleveland_df['target'] = cleveland_df['HeartDisease'].copy()
# cleveland_df.target = cleveland_df.target.replace([1, 2, 3, 4], 1)
# switzerland_df['target'] = switzerland_df['HeartDisease'].copy()
# switzerland_df.target = switzerland_df.target.replace([1, 2, 3, 4], 1)
# va_df['target'] = va_df['HeartDisease'].copy()
# va_df.target = va_df.target.replace([1, 2, 3, 4], 1)
# hungarian_df['target'] = hungarian_df['HeartDisease'].copy()
# hungarian_df.target = hungarian_df.target.replace([1, 2, 3, 4], 1)


In [243]:
df = pd.concat([cleveland_df, switzerland_df, va_df, hungarian_df])
df.HeartDisease = df.HeartDisease.replace([1, 2, 3, 4], 1)
df = df.drop(columns=['thal', 'ca'])
df = df.drop_duplicates()

# DROP ROWS WITH MORE THAN 35% MISSING VALUES
perc = 35.0 
min_count =  int(((100-perc)/100)*df.shape[1] + 1)
df = df.dropna( axis=0, thresh=min_count)
# msno.matrix(df)

# FILL MISSING VALUES WITH MEDIAN
df.ST_Slope = df.ST_Slope.fillna(df.ST_Slope.median())
df.RestingBP = df.RestingBP.fillna(df.RestingBP.median())
df.FastingBS = df.FastingBS.fillna(df.FastingBS.median())
df.RestingECG = df.RestingECG.fillna(df.RestingECG.median())
df.Cholesterol = df.Cholesterol.fillna(df.Cholesterol.median())
df.MaxHR = df.MaxHR.fillna(df.MaxHR.median())
df.ExerciseAngina = df.ExerciseAngina.fillna(df.ExerciseAngina.median())
df.Oldpeak = df.Oldpeak.fillna(df.Oldpeak.median())

# DROP ROW WHERE RESTING BP = 0
df = df[df.RestingBP != 0]

# # REPLACE CHOLESTROL VALUE = 0 WITH MEDIAN
# df.Cholesterol = df.Cholesterol.replace(0, df.Cholesterol.median())

## Split in x and y variables

In [244]:
# X = df[df.columns.drop('HeartDisease')]
# y = df.HeartDisease
hungarian_df = df[df.Location == 'Hungarian'].drop(columns=['Location'])
switzerland_df = df[df.Location == 'Switzerland'].drop(columns=['Location'])
cleveland_df = df[df.Location == 'Cleveland'].drop(columns=['Location'])
va_df = df[df.Location == 'VA'].drop(columns=['Location'])

In [245]:
dfs = [hungarian_df, switzerland_df, cleveland_df, va_df]

In [246]:
df

Unnamed: 0,Age,Sex,ChestPainType,RestingBP,Cholesterol,FastingBS,RestingECG,MaxHR,ExerciseAngina,Oldpeak,ST_Slope,HeartDisease,Location
0,63.0,1.0,1.0,145.0,233.0,1.0,2.0,150.0,0.0,2.3,3.0,0.0,Cleveland
1,67.0,1.0,4.0,160.0,286.0,0.0,2.0,108.0,1.0,1.5,2.0,1.0,Cleveland
2,67.0,1.0,4.0,120.0,229.0,0.0,2.0,129.0,1.0,2.6,2.0,1.0,Cleveland
3,37.0,1.0,3.0,130.0,250.0,0.0,0.0,187.0,0.0,3.5,3.0,0.0,Cleveland
4,41.0,0.0,2.0,130.0,204.0,0.0,2.0,172.0,0.0,1.4,1.0,0.0,Cleveland
...,...,...,...,...,...,...,...,...,...,...,...,...,...
289,48.0,0.0,2.0,130.0,308.0,0.0,1.0,140.0,0.0,2.0,1.0,0.0,Hungarian
290,36.0,1.0,2.0,120.0,166.0,0.0,0.0,180.0,0.0,0.0,2.0,0.0,Hungarian
291,48.0,1.0,3.0,110.0,211.0,0.0,0.0,138.0,0.0,0.0,2.0,0.0,Hungarian
292,47.0,0.0,2.0,140.0,257.0,0.0,0.0,135.0,0.0,1.0,1.0,0.0,Hungarian


In [86]:
client_id_colname = 'Location'
client_ids = df[client_id_colname].unique()
SHUFFLE_BUFFER = 1000
NUM_EPOCHS = 1
def create_tf_dataset_for_client_fn(client_id):
  # a function which takes a client_id and returns a
  # tf.data.Dataset for that client
  client_data = df[df[client_id_colname] == client_id[0]]
  dataset = tf.data.Dataset.from_tensor_slices(client_data.fillna('').to_dict("list"))
  dataset = dataset.shuffle(SHUFFLE_BUFFER).batch(1).repeat(NUM_EPOCHS)
  return dataset

# dataset_train = tf.data.Dataset.from_tensor_slices((data_train, labels_train)).batch(n_train)
# dataset_test = tf.data.Dataset.from_tensor_slices((data_test, labels_test)).batch(n_train)
# train_client_ids = pd.DataFrame(client_ids).sample(frac=0.5).values.tolist()
# test_client_ids = [x for x in client_ids if x not in train_client_ids]


cle_tf =  tff.simulation.datasets.ClientData.from_clients_and_tf_fn(
    client_ids=client_ids[0],
    serializable_dataset_fn=create_tf_dataset_for_client_fn
)

swi_tf =  tff.simulation.datasets.ClientData.from_clients_and_tf_fn(
    client_ids=client_ids[1],
    serializable_dataset_fn=create_tf_dataset_for_client_fn
)

va_tf =  tff.simulation.datasets.ClientData.from_clients_and_tf_fn(
    client_ids=client_ids[2],
    serializable_dataset_fn=create_tf_dataset_for_client_fn
)

hun_tf =  tff.simulation.datasets.ClientData.from_clients_and_tf_fn(
    client_ids=client_ids[3],
    serializable_dataset_fn=create_tf_dataset_for_client_fn
)


In [87]:

NUM_FEATURES = hungarian_df.shape[1]-1


def preprocess(dataset):
  return dataset.repeat(1).batch(1)

target = hungarian_df.iloc[:,-1]
hun_tf =  tf.data.Dataset.from_tensor_slices(
    (hungarian_df.iloc[:,:-1],
    target))

preprocessed_example_dataset = preprocess(hun_tf)

def create_keras_model():
  return tf.keras.models.Sequential([
      tf.keras.layers.Dense(
          1,
          activation='sigmoid',
          input_shape=(NUM_FEATURES,),
          kernel_regularizer=tf.keras.regularizers.l2(0.01),
      )
  ])

def model_fn():
  keras_model = create_keras_model()
  return tff.learning.from_keras_model(
      keras_model,
      input_spec=preprocessed_example_dataset.element_spec,
      loss=tf.keras.losses.BinaryCrossentropy(),
      metrics=[tf.keras.metrics.BinaryAccuracy(name='accuracy'),
               tf.keras.metrics.AUC(name='auc')])

def data_prep(data):

    def scale_variables(data):
        X = data.iloc[:,:-1]
        scaler = StandardScaler()
        scaler.fit(X)
        X_new = scaler.transform(X)
        return X_new

    X = scale_variables(data)
    y = data.HeartDisease

    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
    return X_train, X_test, y_train, y_test


iterative_process = tff.learning.build_federated_averaging_process(
    model_fn,
    client_optimizer_fn=lambda: tf.keras.optimizers.SGD(learning_rate=1.0),
    server_optimizer_fn=lambda: tf.keras.optimizers.Nadam(learning_rate=0.5),
    use_experimental_simulation_loop = True
)

state = iterative_process.initialize()
tff_model = create_keras_model()
tff_auc = defaultdict(lambda:0)
# NUM_clients = 4
# NUM_ROUNDS = 12
# # Test various sizes of subsets of eligible devices participating in each round.
# # for participation in list(range(1, NUM_clients+1)) :

#Per dataset train the model
for df in dfs:
    X_train, X_test, y_train, y_test = data_prep(df)
    federated_train_data = [preprocess(tf.data.Dataset.from_tensor_slices((X_train, y_train)))]
    state, metrics = iterative_process.next(state, federated_train_data)
    print( str(metrics))
    state.model.assign_weights_to(tff_model)
    labels_proba = tff_model.predict( tf.data.Dataset.from_tensor_slices((X_test,y_test)).batch(1))
    fpr, tpr, threshold = sklearn.metrics.roc_curve(y_test, labels_proba)
    test_loss = tf.keras.losses.binary_crossentropy(y_test, np.reshape(labels_proba, [-1]))
    print('validation auc={}, loss={}'.format(sklearn.metrics.auc(fpr, tpr), test_loss))


OrderedDict([('broadcast', ()), ('aggregation', OrderedDict([('mean_value', ()), ('mean_weight', ())])), ('train', OrderedDict([('accuracy', 0.75641024), ('auc', 0.7790288), ('loss', 0.9358528), ('num_examples', 234), ('num_batches', 234)]))])
validation auc=0.8302631578947368, loss=0.4817800223827362
OrderedDict([('broadcast', ()), ('aggregation', OrderedDict([('mean_value', ()), ('mean_weight', ())])), ('train', OrderedDict([('accuracy', 0.9072165), ('auc', 0.47311828), ('loss', 0.34987694), ('num_examples', 97), ('num_batches', 97)]))])
validation auc=0.6428571428571428, loss=1.1485294103622437
OrderedDict([('broadcast', ()), ('aggregation', OrderedDict([('mean_value', ()), ('mean_weight', ())])), ('train', OrderedDict([('accuracy', 0.6694215), ('auc', 0.71748006), ('loss', 1.2923571), ('num_examples', 242), ('num_batches', 242)]))])
validation auc=0.8168103448275862, loss=0.565361499786377
OrderedDict([('broadcast', ()), ('aggregation', OrderedDict([('mean_value', ()), ('mean_weigh

# Paillier encryption

In [268]:
# import tf_encrypted as tfe
# from tfe import ModelOwner, DataOwner
from phe import paillier
pubkey, privkey = paillier.generate_paillier_keypair()
newFeatures = hungarian_df.Age
np.array([pubkey.encrypt(x) for x in newFeatures])

array([<phe.paillier.EncryptedNumber object at 0x7fa36ff223a0>,
       <phe.paillier.EncryptedNumber object at 0x7fa36ff22d30>,
       <phe.paillier.EncryptedNumber object at 0x7fa36ff22820>,
       <phe.paillier.EncryptedNumber object at 0x7fa3576e2f70>,
       <phe.paillier.EncryptedNumber object at 0x7fa3577914c0>,
       <phe.paillier.EncryptedNumber object at 0x7fa357779fd0>,
       <phe.paillier.EncryptedNumber object at 0x7fa359e63400>,
       <phe.paillier.EncryptedNumber object at 0x7fa359e63760>,
       <phe.paillier.EncryptedNumber object at 0x7fa359e63790>,
       <phe.paillier.EncryptedNumber object at 0x7fa359e57e20>,
       <phe.paillier.EncryptedNumber object at 0x7fa359e57460>,
       <phe.paillier.EncryptedNumber object at 0x7fa359e572e0>,
       <phe.paillier.EncryptedNumber object at 0x7fa359e57310>,
       <phe.paillier.EncryptedNumber object at 0x7fa359e57d30>,
       <phe.paillier.EncryptedNumber object at 0x7fa359e57c40>,
       <phe.paillier.EncryptedNumber obj

In [233]:
X_train = []
X_test = [] 
y_train, y_test = []
for i in dfs:
    X_train.append(data_prep(dfs[i])[0])
    y_train.append(data_prep(dfs[i])[1])
    X_test.append(data_prep(dfs[i])[2])
    y_test.append(data_prep(dfs[i])[3])

In [234]:
X_train, X_test, y_train, y_test = test_train_split_dfs(dfs)

In [267]:
X = hungarian_df.iloc[:,:-1]
y = hungarian_df.HeartDisease
logit_model=sm.Logit(y,X)
result=logit_model.fit()
print(result.summary2())

Optimization terminated successfully.
         Current function value: 0.363655
         Iterations 7
                         Results: Logit
Model:              Logit            Pseudo R-squared: 0.444     
Dependent Variable: HeartDisease     AIC:              235.1019  
Date:               2022-04-19 16:21 BIC:              275.5838  
No. Observations:   293              Log-Likelihood:   -106.55   
Df Model:           10               LL-Null:          -191.75   
Df Residuals:       282              LLR p-value:      2.2973e-31
Converged:          1.0000           Scale:            1.0000    
No. Iterations:     7.0000                                       
-----------------------------------------------------------------
                   Coef.  Std.Err.    z    P>|z|   [0.025  0.975]
-----------------------------------------------------------------
Age               -0.0378   0.0228 -1.6580 0.0973 -0.0824  0.0069
Sex                1.2114   0.4465  2.7131 0.0067  0.3363  2.0865


In [266]:
x_cols = ['Sex', 'ChestPainType',  'MaxHR', 'ExerciseAngina', 'Oldpeak']
X_new = hungarian_df[x_cols]
logit_model=sm.Logit(y,X_new)
result=logit_model.fit()
print(result.summary2())

Optimization terminated successfully.
         Current function value: 0.394687
         Iterations 7
                         Results: Logit
Model:              Logit            Pseudo R-squared: 0.397     
Dependent Variable: HeartDisease     AIC:              241.2863  
Date:               2022-04-19 15:58 BIC:              259.6872  
No. Observations:   293              Log-Likelihood:   -115.64   
Df Model:           4                LL-Null:          -191.75   
Df Residuals:       288              LLR p-value:      6.8369e-32
Converged:          1.0000           Scale:            1.0000    
No. Iterations:     7.0000                                       
-----------------------------------------------------------------
                   Coef.  Std.Err.    z    P>|z|   [0.025  0.975]
-----------------------------------------------------------------
Sex                1.1653   0.4085  2.8527 0.0043  0.3647  1.9659
ChestPainType      0.4984   0.1660  3.0023 0.0027  0.1730  0.8238
