In [1]:
import os
from getpass import getpass

import optuna

from sail.algo import Fdlr
from sail.core import spawnvms, connect
from sail.data import DataFrameGroup

In [2]:
# Connect to two data servers (vms) and one aggregation server (vmagg).
# Alternative kept from the original notebook (spawn VMs instead of connecting):
#vms = spawnvms(3)
#vmagg = vms[-1]
#vms = vms[:-1]

# Credentials come from the environment so the password is never stored in the
# notebook; when SAIL_PASSWORD is unset the user is prompted for it instead.
sail_port = 7000
sail_user = os.environ.get("SAIL_USER", "lbart@igr.com")
sail_password = os.environ.get("SAIL_PASSWORD") or getpass("SAIL password: ")

vm1 = connect("52.179.194.116", sail_port, sail_user, sail_password)
vm2 = connect("52.167.141.119", sail_port, sail_user, sail_password)
vmagg = connect("52.167.162.82", sail_port, sail_user, sail_password)
vms = [vm1, vm2]

In [3]:
# Each connected VM is represented by a VM identifier (displayed below),
# which is what later computations use to address that VM.
vms

['A18096AA26E8417C9A8B8C6CF6B6E16E', '5737EF4C9E8B4C199895A3BCB1211846']

In [4]:
# workplace is a local directory in the file system used to store temporary files,
# including some of the parameters users supply for model training.
# It can be overridden through the SAIL_WORKPLACE environment variable so the
# notebook is not tied to one user's home directory; the default is the original path.
workplace = os.environ.get("SAIL_WORKPLACE", "/home/jjj/playground/tmp")

In [None]:
# DataFrameGroup is the collection of all datasets distributed among the VMs.
dfg = DataFrameGroup(vms, workplace)
# import_data takes the data IDs and imports the data from the data connector.
# NOTE(review): presumably one data ID per VM, in the order of vms — confirm.
dfg.import_data([0,0])

In [5]:
# dfg.shape is a public data field holding the (rows, cols) shape of each dataset.
dfg.shape

[(3, 6), (5, 6)]

In [6]:
# The col_label field holds the feature (column) names of each dataset.
dfg.col_label

[Index(['Region', 'Temp(F)', 'Rainfall(mm)', 'Humidity(%)', 'Apples(ton)',
        'Oranges(ton)'],
       dtype='object'),
 Index(['Region', 'Temp(F)', 'Rainfall(mm)', 'Humidity(%)', 'Apples(ton)',
        'Oranges(ton)'],
       dtype='object')]

In [7]:
# The dtypes function gives the data type of each column, one listing per dataset.
dfg.dtypes()

[Region          object
 Temp(F)          int64
 Rainfall(mm)     int64
 Humidity(%)      int64
 Apples(ton)      int64
 Oranges(ton)     int64
 dtype: object,
 Region          object
 Temp(F)          int64
 Rainfall(mm)     int64
 Humidity(%)      int64
 Apples(ton)      int64
 Oranges(ton)     int64
 dtype: object]

In [8]:
# Call sample on the dataset handle held by the first VM.
# TODO: make this part use actual synthetic data.
dfg.sample(vms[0], dfg.df[0])

Unnamed: 0,Region,Temp(F),Rainfall(mm),Humidity(%),Apples(ton),Oranges(ton)
0,Kanto,73,67,43,56,70
1,Johto,91,88,64,81,101
2,Hoenn,87,124,58,119,133


In [9]:
# private_intersect computes the items of the 'Region' column that are duplicated
# between the two datasets; droprow then drops the duplicated rows from the
# second dataset.
# NOTE(review): dfg.df[1] is overwritten with a value derived from itself, so
# re-running this cell operates on the already-deduplicated data — confirm that
# this is harmless.
duplicate = dfg.private_intersect(vms[0], vms[1], dfg.df[0], dfg.df[1], 'Region')
dfg.df[1] = dfg.droprow(vms[1], duplicate[1], dfg.df[1])

In [11]:
# Add a new feature 'Temp(C)' (temperature in Celsius) derived from 'Temp(F)',
# using one source/target column name per VM.
# NOTE(review): no conversion function is passed here, so the F-to-C transform is
# presumably built into apply_and_append — confirm.
newdf = dfg.apply_and_append(['Temp(F)']*len(vms), ['Temp(C)']*len(vms), dfg.df)

In [12]:
# Compute the Pearson correlation between the new feature and a target column
# to evaluate the feature's usefulness.
dfg.pearson_corr('Temp(C)', 'Apples(ton)', newdf)

-0.47835011346874645

In [None]:
# Apply a normal transformation to the newly generated feature.
# NOTE(review): the return value is not captured; presumably newdf is updated
# remotely / in place — confirm.
dfg.norm_transform('Temp(C)', newdf)

In [15]:
# Select the feature and target columns on every VM and convert them to arrays.
# get_col and to_numpy are executed remotely.
# NOTE(review): the derived 'Temp(C)' feature is not among the selected feature
# columns, and the columns are read from dfg.df — confirm this is intended.
feature_cols = ['Temp(F)', 'Rainfall(mm)', 'Humidity(%)']
target_cols = ['Apples(ton)', 'Oranges(ton)']
X = dfg.get_col([feature_cols] * len(vms), dfg.df)
y = dfg.get_col([target_cols] * len(vms), dfg.df)
data = {
    'X_train': dfg.to_numpy(X),
    'y_train': dfg.to_numpy(y),
}
#data['X_train'], data['X_test'], data['y_train'], data['y_test'] = dfg.train_test_split(X, y, 0.2, 42)

In [16]:
# Construct the model from the data VMs, the aggregation VM, the training data
# dictionary and the local workplace directory.
model_one = Fdlr(vms, vmagg, data, workplace)

In [17]:
# Single-round training.
# NOTE(review): 3 and 2 presumably are the numbers of feature and target columns,
# and 5e-5 the learning rate (the same argument is tuned as 'n_lr' in the Optuna
# objective) — confirm against Fdlr.initmodel.
model_one.initmodel(3,2,5e-5)
# TODO: explain what fit is doing.
model_one.fit(100)

processing round: 1
processing round: 10
processing round: 19
processing round: 28
processing round: 37
processing round: 46
processing round: 55
processing round: 64
processing round: 73
processing round: 82
processing round: 91
processing round: 100
processing round: 109
processing round: 118
processing round: 127
processing round: 136
processing round: 145
processing round: 154
processing round: 163
processing round: 172
processing round: 181
processing round: 190
processing round: 199
processing round: 208
processing round: 217
processing round: 226
processing round: 235
processing round: 244
processing round: 253
processing round: 262
processing round: 271
processing round: 280
processing round: 289
processing round: 298
processing round: 307
processing round: 316
processing round: 325
processing round: 334
processing round: 343
processing round: 352
processing round: 361
processing round: 370
processing round: 379
processing round: 388
processing round: 397
processing round: 406


In [19]:
# Compute the mean absolute error.
# NOTE(review): this is evaluated on the training data itself; the notebook does
# not create a held-out test split, so the value is not a generalization estimate.
model_one.mae(data['X_train'], data['y_train'])

[1.2953567504882812, 0.7586970329284668]

In [20]:
# The cells below perform hyperparameter optimization using the Optuna package.
# A separate Fdlr instance is created for the search.
model_op = Fdlr(vms, vmagg, data, workplace)

In [21]:
def objective(trail, model):
    """Optuna objective: re-initialise and train ``model`` with a sampled rate.

    Args:
        trail: Trial-like object providing ``suggest_float`` (the parameter name
            is kept exactly as spelled in the original signature).
        model: Object providing ``initmodel``, ``fit`` and ``score``.

    Returns:
        The value returned by ``model.score()`` after ``model.fit(100)``.
    """
    # Sample the third initmodel argument from the range [1e-5, 1e-4].
    learning_rate = trail.suggest_float('n_lr', 1e-5, 1e-4)
    model.initmodel(3, 2, learning_rate)
    model.fit(100)
    trial_score = model.score()
    return trial_score

In [22]:
# Create an in-memory study that minimizes the value returned by objective.
# The sampler is seeded so the suggested 'n_lr' values are reproducible when the
# notebook is re-run from a fresh kernel.
sampler = optuna.samplers.TPESampler(seed=42)
study = optuna.create_study(direction="minimize", sampler=sampler)
# objective takes the model as a second argument, so it is bound with a lambda.
study.optimize(lambda trial: objective(trial, model_op), n_trials=3)

[32m[I 2021-04-15 18:49:56,980][0m A new study created in memory with name: no-name-db7f2f1c-6202-4411-85ba-21502e55fd3f[0m


processing round: 1
processing round: 10
processing round: 19
processing round: 28
processing round: 37
processing round: 46
processing round: 55
processing round: 64
processing round: 73
processing round: 82
processing round: 91
processing round: 100


[32m[I 2021-04-15 18:54:12,676][0m Trial 0 finished with value: 4.267003456751506 and parameters: {'n_lr': 2.6397712625955436e-05}. Best is trial 0 with value: 4.267003456751506.[0m


processing round: 1
processing round: 10
processing round: 19
processing round: 28
processing round: 37
processing round: 46
processing round: 55
processing round: 64
processing round: 73
processing round: 82
processing round: 91
processing round: 100


[32m[I 2021-04-15 18:58:28,845][0m Trial 1 finished with value: 5.76754872004191 and parameters: {'n_lr': 2.4960121595429822e-05}. Best is trial 0 with value: 4.267003456751506.[0m


processing round: 1
processing round: 10
processing round: 19
processing round: 28
processing round: 37
processing round: 46
processing round: 55
processing round: 64
processing round: 73
processing round: 82
processing round: 91
processing round: 100


[32m[I 2021-04-15 19:02:47,076][0m Trial 2 finished with value: 1.6436131000518799 and parameters: {'n_lr': 7.985380168418907e-05}. Best is trial 2 with value: 1.6436131000518799.[0m


In [None]:
# TODO: show predictions?