In [1]:
import os
from getpass import getpass

import optuna

from sail.algo import Fdlr
from sail.core import spawnvms, connect
from sail.data import DataFrameGroup

In [2]:
# Connect to two data servers (vms) and an aggregation server (vmagg).
# The account password is never stored in the notebook: it is taken from the
# SAIL_PASSWORD environment variable, or prompted for when that is unset/empty.
# The user name may be overridden with SAIL_USER.
SAIL_PORT = 7000
sail_user = os.environ.get("SAIL_USER", "lbart@igr.com")
sail_password = os.environ.get("SAIL_PASSWORD") or getpass("SAIL password: ")

vm1 = connect("20.44.107.205", SAIL_PORT, sail_user, sail_password)
vm2 = connect("20.44.107.254", SAIL_PORT, sail_user, sail_password)
vmagg = connect("20.44.107.111", SAIL_PORT, sail_user, sail_password)
# Only the data servers go into `vms`; the aggregation server is kept separate.
vms = [vm1, vm2]

In [3]:
# Each connected VM is represented by an identifier string (see the output below);
# these identifiers are what the later cells pass around to address a VM.
vms

['6816123528CD41A5A6675236637F72C2', '66F116B630914FB59832CE9884D00A34']

In [4]:
# workplace is a local place in the file system to store temporary files,
# which include some of the parameters users input for model training.
# Set the SAIL_WORKPLACE environment variable to use a different directory
# without editing the notebook; the default below is the original location.
workplace = os.environ.get("SAIL_WORKPLACE", "/home/jjj/playground/tmp")

In [5]:
# A DataFrameGroup is the collection of all datasets distributed among the VMs.
dfg = DataFrameGroup(vms, workplace)
# import_data takes the data IDs and imports the data from the data connector.
# NOTE(review): presumably one data ID per VM, in the same order as `vms` -- confirm.
data_ids = [0, 0]
dfg.import_data(data_ids)

In [6]:
# dfg.shape is a public data field giving the shape of each dataset as (rows, cols).
dfg.shape

[(3, 6), (5, 6)]

In [7]:
# The col_label field holds the feature (column) names of each dataset.
dfg.col_label

[Index(['Region', 'Temp(F)', 'Rainfall(mm)', 'Humidity(%)', 'Apples(ton)',
        'Oranges(ton)'],
       dtype='object'),
 Index(['Region', 'Temp(F)', 'Rainfall(mm)', 'Humidity(%)', 'Apples(ton)',
        'Oranges(ton)'],
       dtype='object')]

In [8]:
# dfg.df evaluates to a list of identifier strings (see the output below);
# presumably one remote dataframe handle per VM -- TODO confirm.
dfg.df

['32CC56FB661546ED99F429130490E9B2D8732F22BD0941F386DF3AEE048F3736',
 '161CA6F34EE049A39D0B2165598D74A0D8732F22BD0941F386DF3AEE048F3736']

In [9]:
# The dtypes function gives the data type of each column, one listing per dataset.
dfg.dtypes()

[Region          object
 Temp(F)          int64
 Rainfall(mm)     int64
 Humidity(%)      int64
 Apples(ton)      int64
 Oranges(ton)     int64
 dtype: object,
 Region          object
 Temp(F)          int64
 Rainfall(mm)     int64
 Humidity(%)      int64
 Apples(ton)      int64
 Oranges(ton)     int64
 dtype: object]

In [10]:
# private_intersect computes the items duplicated between the two datasets
# (the 'Region' argument presumably names the column to match on -- confirm);
# droprow then drops the duplicated rows from the second dataset.
vm_a, vm_b = vms[0], vms[1]
duplicate = dfg.private_intersect(vm_a, vm_b, dfg.df[0], dfg.df[1], 'Region')
# NOTE: this overwrites dfg.df[1], so re-running the cell operates on the
# already de-duplicated dataset.
dfg.df[1] = dfg.droprow(vm_b, duplicate[1], dfg.df[1])

In [11]:
# Add a new feature 'Temp(C)' (temperature in Celsius) derived from 'Temp(F)'.
# One source/new column name is supplied per VM.
n_vms = len(vms)
source_cols = ['Temp(F)'] * n_vms
new_cols = ['Temp(C)'] * n_vms
newdf = dfg.apply_and_append(source_cols, new_cols, dfg.df)

In [12]:
# Compute the Pearson correlation between the new feature and the target
# to evaluate its usefulness.
dfg.pearson_corr('Temp(C)', 'Apples(ton)', newdf)

-0.47835011346874645

In [13]:
# Apply a normal transformation to the new feature we generated.
# NOTE(review): the return value is not assigned -- confirm whether norm_transform
# updates `newdf` remotely or returns a new handle that should be kept.
dfg.norm_transform('Temp(C)', newdf)

In [13]:
# Define the feature and target columns, then pull them out as arrays.
# get_col / to_numpy are executed remotely.
# NOTE(review): the columns are read from dfg.df, not from `newdf`, so the
# 'Temp(C)' feature created earlier is not part of the training data.
feature_cols = ['Temp(F)', 'Rainfall(mm)', 'Humidity(%)']
label_cols = ['Apples(ton)', 'Oranges(ton)']
X = dfg.get_col([feature_cols] * len(vms), dfg.df)
y = dfg.get_col([label_cols] * len(vms), dfg.df)
data = {
    'X_train': dfg.to_numpy(X),
    'y_train': dfg.to_numpy(y),
}
# Alternative kept for reference (train/test split instead of using all rows):
#data['X_train'], data['X_test'], data['y_train'], data['y_test'] = dfg.train_test_split(X, y, 0.2, 42)

In [15]:
# Construct the model from the data VMs, the aggregation VM, the training data
# and the local workplace directory.
model_one = Fdlr(vms, vmagg, data, workplace)

In [16]:
# Single-round training.
# NOTE(review): initmodel's positional arguments are not documented here; presumably
# (number of input features, number of targets, learning rate) -- TODO confirm.
model_one.initmodel(3,2,5e-5)
# TODO: explain what fit is doing. fit(100) logs progress up to "processing round: 100".
model_one.fit(100)

processing round: 1
processing round: 10
processing round: 19
processing round: 28
processing round: 37
processing round: 46
processing round: 55
processing round: 64
processing round: 73
processing round: 82
processing round: 91
processing round: 100


In [17]:
# Compute the mean absolute error. Note that it is evaluated on the training data.
model_one.mae(data['X_train'], data['y_train'])

1.4131690979003906

In [14]:
# The cells below do hyperparameter optimization using the Optuna package.
# A separate Fdlr instance is created for the search.
model_op = Fdlr(vms, vmagg, data, workplace)

In [15]:
def objective(trail, model, n_rounds=100, lr_low=1e-5, lr_high=1e-4):
    """Optuna objective: re-initialise ``model`` with a sampled ``n_lr``, train, and score it.

    Args:
        trail: The Optuna trial used to sample the hyperparameter; only its
            ``suggest_float`` method is used. (The parameter name is kept as-is
            so existing callers keep working.)
        model: Object exposing ``initmodel``, ``fit`` and ``score``.
        n_rounds: Value passed to ``model.fit``. Defaults to 100.
        lr_low: Lower bound of the range searched for ``'n_lr'``. Defaults to 1e-5.
        lr_high: Upper bound of the range searched for ``'n_lr'``. Defaults to 1e-4.

    Returns:
        Whatever ``model.score()`` returns; this is the value the study optimises.
    """
    # 'n_lr' is forwarded as the third initmodel argument
    # (presumably the learning rate -- confirm against Fdlr.initmodel).
    n_lr = trail.suggest_float('n_lr', lr_low, lr_high)
    # initmodel is called on every trial so each one starts from a fresh model.
    model.initmodel(3, 2, n_lr)
    model.fit(n_rounds)
    return model.score()

In [16]:
# Create an in-memory study that minimizes the objective's return value.
# The sampler is seeded so the sequence of suggested n_lr values is the same
# on every run (the model training itself may still be non-deterministic).
study = optuna.create_study(
    direction="minimize",
    sampler=optuna.samplers.TPESampler(seed=42),
)
# Optuna calls the objective with a single trial argument, so model_op is
# bound through a lambda. Three trials are run.
study.optimize(lambda trial: objective(trial, model_op), n_trials=3)

[32m[I 2021-04-16 12:34:34,860][0m A new study created in memory with name: no-name-36983a54-92da-4a90-b8ea-ea70d6838532[0m


processing round: 1
processing round: 10
processing round: 19
processing round: 28
processing round: 37
processing round: 46
processing round: 55
processing round: 64
processing round: 73
processing round: 82
processing round: 91
processing round: 100


[32m[I 2021-04-16 12:40:44,032][0m Trial 0 finished with value: 2.5347335815429686 and parameters: {'n_lr': 7.77735837165351e-05}. Best is trial 0 with value: 2.5347335815429686.[0m


processing round: 1
processing round: 10
processing round: 19
processing round: 28
processing round: 37
processing round: 46
processing round: 55
processing round: 64
processing round: 73
processing round: 82
processing round: 91
processing round: 100


[32m[I 2021-04-16 12:46:22,573][0m Trial 1 finished with value: 2.8942440032958983 and parameters: {'n_lr': 6.03870188968855e-05}. Best is trial 0 with value: 2.5347335815429686.[0m


processing round: 1
processing round: 10
processing round: 19
processing round: 28
processing round: 37
processing round: 46
processing round: 55
processing round: 64
processing round: 73
processing round: 82
processing round: 91
processing round: 100


[32m[I 2021-04-16 12:52:05,408][0m Trial 2 finished with value: 4.319192504882812 and parameters: {'n_lr': 3.09930875201408e-05}. Best is trial 0 with value: 2.5347335815429686.[0m


In [None]:
# Thanks