In [1]:
from sail.data import DataFrameGroup
from sail.algo import Fdlr
from sail.core import spawnvms, connect
import optuna

In [2]:
# show two data servers (vms) and an aggregation server (vmagg)
vm1 = connect("52.242.77.90", 7000, "lbart@igr.com", "sailpassword")
vm2 = connect("52.242.73.31", 7000, "lbart@igr.com", "sailpassword")
vmagg = connect("52.242.77.87", 7000, "lbart@igr.com", "sailpassword")
vms = [vm1, vm2]

In [3]:
# vm will be represented by a VM identifier, which will be used for future computations
vms

['5D296CBCC0974D62B77001460398FB89', 'DE4839CF6EB04E659907A6DBA10AAA77']

In [4]:
# workplace is a local place in the file system to store temporary files, 
# which includes some parameters users input for model training
workplace = "/home/jjj/playground/tmp"

In [None]:
#DataFrameGroup is the collection of all dataset distributed among the VMs.
dfg = DataFrameGroup(vms, workplace)
#The import_data function takes in the data IDs and import the data from the data connector
dfg.import_data([0,0])

In [6]:
#dfg has a public data field shape, which depicts the shape of the datasets in rows and cols
dfg.shape

[(3, 6), (5, 6)]

In [7]:
# the col_label field has the fetures names of each col.
dfg.col_label

[Index(['Region', 'Temp(F)', 'Rainfall(mm)', 'Humidity(%)', 'Apples(ton)',
        'Oranges(ton)'],
       dtype='object'),
 Index(['Region', 'Temp(F)', 'Rainfall(mm)', 'Humidity(%)', 'Apples(ton)',
        'Oranges(ton)'],
       dtype='object')]

In [8]:
dfg.df

['11B0D640ABDF4C6DAE59DD9D153CAD64490ACAFCEAD74A10AEDFAD471A705271',
 'F70C8BE87AE7486183AD526E9AA561C9490ACAFCEAD74A10AEDFAD471A705271']

In [9]:
#the dtypes function gives the data types of each column
dfg.dtypes()

[Region          object
 Temp(F)          int64
 Rainfall(mm)     int64
 Humidity(%)      int64
 Apples(ton)      int64
 Oranges(ton)     int64
 dtype: object,
 Region          object
 Temp(F)          int64
 Rainfall(mm)     int64
 Humidity(%)      int64
 Apples(ton)      int64
 Oranges(ton)     int64
 dtype: object]

In [10]:
#The private_intersect compute the duplicate items between two data sets, 
#the droprow function drops the duplicated terms
duplicate = dfg.private_intersect(vms[0], vms[1], dfg.df[0], dfg.df[1], 'Region')
dfg.df[1] = dfg.droprow(vms[1], duplicate[1], dfg.df[1])

In [11]:
#We add a new feature 'Temp(C)' temperature in celsius by doing transformation from the 'Temp(F)'
newdf = dfg.apply_and_append(['Temp(F)']*len(vms), ['Temp(C)']*len(vms), dfg.df)

In [12]:
#We can calculate the pearson correlation between the new feature and the target 
#to evaluate its usefulness
dfg.pearson_corr('Temp(C)', 'Apples(ton)', newdf)

-0.47835011346874645

In [13]:
#We can do a normal transformation on the new feature we generated
dfg.norm_transform('Temp(C)', newdf)

In [14]:
#Defining the features and targets cols
#The get_col/to_numpy are done remotely
data = {}
X = dfg.get_col([['Temp(F)', 'Rainfall(mm)', 'Humidity(%)']]*len(vms), dfg.df)
y = dfg.get_col([['Apples(ton)', 'Oranges(ton)']]*len(vms), dfg.df)
data['X_train'] = dfg.to_numpy(X)
data['y_train'] = dfg.to_numpy(y)
#data['X_train'], data['X_test'], data['y_train'], data['y_test'] = dfg.train_test_split(X, y, 0.2, 42)

In [15]:
#construct the model
model_one = Fdlr(vms, vmagg, data, workplace)

In [17]:
#sigle round training
model_one.initmodel(3,2,5e-5)
#explain what fit is doing
model_one.fit(100)

processing round: 1
processing round: 10
processing round: 19
processing round: 28
processing round: 37
processing round: 46
processing round: 55
processing round: 64
processing round: 73
processing round: 82
processing round: 91
processing round: 100


In [18]:
#Compute the mean absolute error
model_one.mae(data['X_train'], data['y_train'])

1.854658317565918

In [19]:
#The block below does hyperparameter optimization by using Optuna package
model_op = Fdlr(vms, vmagg, data, workplace)

In [20]:
def objective(trail, model):
    n_lr = trail.suggest_float('n_lr', 1e-5, 1e-4)
    model.initmodel(3,2,n_lr)
    model.fit(100)
    return model.score()

In [21]:
study = optuna.create_study(direction="minimize")
study.optimize(lambda trial: objective(trial, model_op), n_trials=3)

[32m[I 2021-04-16 11:19:08,399][0m A new study created in memory with name: no-name-dd8ebd8c-81c8-4029-9f1c-a71e03138527[0m


processing round: 1
processing round: 10
processing round: 19
processing round: 28
processing round: 37
processing round: 46
processing round: 55
processing round: 64
processing round: 73
processing round: 82
processing round: 91
processing round: 100


[33m[W 2021-04-16 11:24:40,352][0m Trial 0 failed because of the following error: TypeError("'numpy.float64' object is not iterable")
Traceback (most recent call last):
  File "/usr/local/lib/python3.8/dist-packages/optuna/_optimize.py", line 217, in _run_trial
    value_or_values = func(trial)
  File "<ipython-input-21-c878e1811333>", line 2, in <lambda>
    study.optimize(lambda trial: objective(trial, model_op), n_trials=3)
  File "<ipython-input-20-df54f1688262>", line 5, in objective
    return model.score()
  File "/usr/local/lib/python3.8/dist-packages/sail/algo/fdlr.py", line 110, in score
    return sum(scores)/len(scores)
TypeError: 'numpy.float64' object is not iterable[0m


TypeError: 'numpy.float64' object is not iterable