In [1]:
import os
from vantage6.tools.mock_client import ClientMockProtocol 

In [2]:
# Start the mock client on the two CSV files stored in ./local
data_dir = os.path.join(os.getcwd(), 'local')
client = ClientMockProtocol(
    module='v6_kmeans_py',
    datasets=[
        os.path.join(data_dir, csv_name)
        for csv_name in ('bcdata1scaledn.csv', 'bcdata2scaledn.csv')
    ],
)

In [3]:
# Fetch the mock organisations of this collaboration and keep their ids
organizations = client.get_organizations_in_my_collaboration()
ids = [org['id'] for org in organizations]
print(organizations)

[{'id': 0, 'name': 'mock-0', 'domain': 'mock-0.org'}, {'id': 1, 'name': 'mock-1', 'domain': 'mock-1.org'}]


In [4]:
# Run the 'master' method of the k-means algorithm through the mock client
# and collect its result.
master_task = client.create_new_task(
    input_={
        'master': True,
        'method': 'master',
        'kwargs': {
            # Use the organisation ids looked up from the mock client instead
            # of a hard-coded [0, 1], so this stays correct if the number of
            # mock datasets changes.
            'org_ids': ids,
            # Number of clusters (the log reports sampling 2 initial centroids).
            'k': 2,
            # NOTE(review): presumably the convergence tolerance and the
            # iteration cap of the algorithm -- confirm against v6_kmeans_py.
            'epsilon': 0.05,
            'max_iter': 30,
            # Features used for clustering; the returned centroids have one
            # coordinate per entry (17 in total).
            'columns': [
                "radius_mean",
                "texture_mean",
                "smoothness_mean",
                "compactness_mean",
                "concave points_mean",
                "fractal_dimension_mean",
                "concavity_mean",
                "symmetry_mean",
                "radius_se",
                "texture_se",
                "smoothness_se",
                "concavity_se",
                "concave points_se",
                "symmetry_se",
                "fractal_dimension_se",
                "symmetry_worst",
                "fractal_dimension_worst",
            ],
            'd_init': 'all',
            'init_method': 'random',
            'avg_method': 'k-means'
        }
    },
    # The master task itself is sent to organisation 0 only.
    organization_ids=[0]
)
results = client.get_results(master_task.get('id'))

info > Collecting participating organizations
info > Initializing k global cluster centres
info > Dispatching node tasks
info > Randomly sample 2 data points to use as initial centroids
info > Randomly sample 2 data points to use as initial centroids
info > Waiting for results
info > Obtaining results
{'id': 999, 'result': b"\x80\x03]q\x00(]q\x01(G?\xcc\xe4\x06'\xaa\xfeiG?\xd0\xfd}E\xdd\x8b\x1dG?\xdf\x96>\x89\xc9\x98\xcbG?\xce\x86\x96\x8b<\x92\x8cG?\xcc?\x1b\xfd\xf6\xd6\xc6G?\xd6\xa8\xa7\\\x17J\xd7G?\xbf\xae\x9e\x10\x06\xe8\xe6G?\xd1c\xd0k\x92\\\x10G?\xbeN\x00!9\x8f4G?\xd5\xe0\xdc'\x1b(0G?\xc557*2\xbc;G?\xa6uKgT\xb6lG?\xcf\xe8\xb9\r\xa0w{G?\xc2\xda\x94\x9c\x98\x19;G?\xa9,\xa6Iv\xdbNG?\xc7\tU\xf0[<\xd0G?\xc2\xeaV\xd84c\x88e]q\x02(G?\xe5^@0\xda\x16\xc2G?\xe2>\xdaW\x17\xfb&G?\xe1\xc2\x044\xc9\x10\xb6G?\xe9\xe5\x15\x17ZI\x17G?\xe7\xcb\x11\xd0(\xb79G?\xe00O]\xd0\xfb\xe8G?\xe2\xa5\x8c[+\xe8\x01G?\xe5\x93d\xd96M\x94G?\xc8\x06'\xce\x88\x96\xc5G?\xb6\xc8\x99\x87\x0f\x91\xe7G?\xb6\x1a\xae\x84\xd

In [5]:
# Show what get_results returned for the master task
print(results)

[{'centroids': [[0.2745526442714027, 0.2932606971916164, 0.3506039956766047, 0.17649737105242247, 0.13917874992027573, 0.23577635874677227, 0.105961314237903, 0.3329446453773701, 0.06897782763006408, 0.1812324230959341, 0.1722371912945126, 0.05539015803081464, 0.1769541786546261, 0.16516041550987032, 0.07510003459354264, 0.22677313502309085, 0.14740754441289766], [0.48025086104586, 0.39021859736390363, 0.49185068159248896, 0.44467499664150434, 0.47280369680650264, 0.3440154548750708, 0.4318222717188852, 0.4810491944764097, 0.18957288393530383, 0.2075405682662524, 0.2015095637027241, 0.1344339870221198, 0.324316995211502, 0.20691045231773028, 0.15411774838774345, 0.34260111236703616, 0.28032971503627424]]}]


In [6]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

In [7]:
# Wrap the result list in a DataFrame (it ends up with a single 'centroids' column)
df = pd.DataFrame(results)

In [8]:
# Take the first result row as a plain Python list, show it, and list the frame's columns
centroids_fl = df.iloc[0].tolist()
print(centroids_fl)
df.columns

[[[0.2745526442714027, 0.2932606971916164, 0.3506039956766047, 0.17649737105242247, 0.13917874992027573, 0.23577635874677227, 0.105961314237903, 0.3329446453773701, 0.06897782763006408, 0.1812324230959341, 0.1722371912945126, 0.05539015803081464, 0.1769541786546261, 0.16516041550987032, 0.07510003459354264, 0.22677313502309085, 0.14740754441289766], [0.48025086104586, 0.39021859736390363, 0.49185068159248896, 0.44467499664150434, 0.47280369680650264, 0.3440154548750708, 0.4318222717188852, 0.4810491944764097, 0.18957288393530383, 0.2075405682662524, 0.2015095637027241, 0.1344339870221198, 0.324316995211502, 0.20691045231773028, 0.15411774838774345, 0.34260111236703616, 0.28032971503627424]]]


Index(['centroids'], dtype='object')

In [9]:
# Flatten the per-row centroid lists into a single list with one centroid per cluster.
# The values are read from `df` rather than from `centroids_fl` itself, so re-running
# this cell gives the same result instead of collapsing the centroids into one flat
# list of floats.
# Reference: https://stackoverflow.com/questions/25674169/how-does-the-list-comprehension-to-flatten-a-python-list-work
centroids_fl = [centroid for row in df['centroids'] for centroid in row]

In [10]:
# Display the flattened centroids (one inner list per cluster)
centroids_fl

[[0.2745526442714027,
  0.2932606971916164,
  0.3506039956766047,
  0.17649737105242247,
  0.13917874992027573,
  0.23577635874677227,
  0.105961314237903,
  0.3329446453773701,
  0.06897782763006408,
  0.1812324230959341,
  0.1722371912945126,
  0.05539015803081464,
  0.1769541786546261,
  0.16516041550987032,
  0.07510003459354264,
  0.22677313502309085,
  0.14740754441289766],
 [0.48025086104586,
  0.39021859736390363,
  0.49185068159248896,
  0.44467499664150434,
  0.47280369680650264,
  0.3440154548750708,
  0.4318222717188852,
  0.4810491944764097,
  0.18957288393530383,
  0.2075405682662524,
  0.2015095637027241,
  0.1344339870221198,
  0.324316995211502,
  0.20691045231773028,
  0.15411774838774345,
  0.34260111236703616,
  0.28032971503627424]]

In [11]:
# The federated run can return the two clusters in either order, so put them in a
# fixed order instead of swapping them by hand: the centroid with the larger
# coordinate sum comes first. That is the order of the centralized reference
# centroids (centroids_cent), whose first centroid is larger in every coordinate.
# Unlike a manual swap, this cell is safe to run always and to re-run.
centroids_fl = sorted(centroids_fl, key=sum, reverse=True)

In [12]:
# Display the centroids in the order used for the comparison below
centroids_fl

[[0.48025086104586,
  0.39021859736390363,
  0.49185068159248896,
  0.44467499664150434,
  0.47280369680650264,
  0.3440154548750708,
  0.4318222717188852,
  0.4810491944764097,
  0.18957288393530383,
  0.2075405682662524,
  0.2015095637027241,
  0.1344339870221198,
  0.324316995211502,
  0.20691045231773028,
  0.15411774838774345,
  0.34260111236703616,
  0.28032971503627424],
 [0.2745526442714027,
  0.2932606971916164,
  0.3506039956766047,
  0.17649737105242247,
  0.13917874992027573,
  0.23577635874677227,
  0.105961314237903,
  0.3329446453773701,
  0.06897782763006408,
  0.1812324230959341,
  0.1722371912945126,
  0.05539015803081464,
  0.1769541786546261,
  0.16516041550987032,
  0.07510003459354264,
  0.22677313502309085,
  0.14740754441289766]]

In [19]:
# Build a DataFrame of the centroids obtained in the federated run:
# one column per cluster, one row per feature.
df_cols = pd.DataFrame(centroids_fl)
df_new = df_cols.T.set_axis(['cluster1', 'cluster2'], axis='columns')

In [20]:
# Display the federated centroids side by side
df_new

Unnamed: 0,cluster1,cluster2
0,0.480251,0.274553
1,0.390219,0.293261
2,0.491851,0.350604
3,0.444675,0.176497
4,0.472804,0.139179
5,0.344015,0.235776
6,0.431822,0.105961
7,0.481049,0.332945
8,0.189573,0.068978
9,0.207541,0.181232


In [13]:
#We copy here the centroids obtained form centralized kmeans clustering
centroids_cent= [[0.48010439, 0.39144693, 0.49183404, 0.44705496, 0.47317353,
        0.34751595, 0.43359673, 0.48305339, 0.19052399, 0.20721244,
        0.20037102, 0.1359733 , 0.32548816, 0.20864259, 0.15629238,
        0.34490158, 0.28278368],
       [0.27520311, 0.29399223, 0.35167877, 0.17778447, 0.14096328,
        0.23611812, 0.10788271, 0.33365764, 0.06895603, 0.18137869,
        0.17256804, 0.05591816, 0.17813427, 0.16459688, 0.07527563,
        0.22706555, 0.14820564]]

In [14]:
#To find Euclidean distance
from scipy.spatial import distance
distance_c1 = distance.euclidean(centroids_fl[0],centroids_cent[0])
distance_c2 = distance.euclidean(centroids_fl[1],centroids_cent[1])

In [15]:
# Federated-vs-centralized centroid distance for cluster 1
distance_c1

0.007200248954613143

In [16]:
# Federated-vs-centralized centroid distance for cluster 2
distance_c2

0.003760994762333353

In [532]:
# Cluster-1 distances between the centralized and federated centroids,
# entered by hand: one value per iteration of every experiment.
distances1 = [
    0.03143528272765035,
    0.033716997322332146,
    0.025872045757177546,
    0.028847437850228905,
    0.027581230767583984,
]

In [533]:
# Cluster-2 distances between the centralized and federated centroids,
# entered by hand: one value per iteration of every experiment.
distances2 = [
    0.014121646365191778,
    0.016620494676596795,
    0.012792047188747525,
    0.01067331262228887,
    0.013322705176279008,
]

In [469]:
import numpy as np
from statistics import mean

In [534]:
# Average cluster-1 centroid distance over the recorded values
mean(distances1)

0.029490598884994588

In [535]:
# Spread of the cluster-1 distances. np.std defaults to ddof=0 (population
# standard deviation); NOTE(review): pass ddof=1 if the sample standard
# deviation over these five values is what should be reported.
np.std(distances1)

0.0027855882130090664

In [536]:
# Average cluster-2 centroid distance over the recorded values
mean(distances2)

0.013506041205820796

In [537]:
# Spread of the cluster-2 distances. np.std defaults to ddof=0 (population
# standard deviation); NOTE(review): pass ddof=1 if the sample standard
# deviation over these five values is what should be reported.
np.std(distances2)

0.0019311416787991607