In [1]:
%cd ../

/Users/sarchey1/paper/FederatedLearning


In [2]:
import warnings
warnings.simplefilter('ignore')

## Stochastic Block Model Experiment

Before geting into the experiment details, let's review algorithm 1 and the primal and dual updates.

### Algorithm 1

![title](../algorithm1.png)

In [2]:
# %load algorithm/main.py
from sklearn.metrics import mean_squared_error
from scipy.spatial.distance import pdist

# from algorithm.penalty import *


def get_B(N):
    E = int(N*(N-1)/2)
    
    B = np.zeros((E, N))
    cnt = 0
    for i in range(N):
        for j in range(N):
            if i >= j:
                continue
            B[cnt, i] = 1
            B[cnt, j] = -1

            cnt += 1
    return B


def algorithm_1(K, datapoints, true_labels, samplingset, lambda_lasso, penalty_func_name='norm1', calculate_score=False):
    '''
    :param K: the number of iterations
    :param D: the block incidence matrix
    :param weight_vec: a list containing the edges's weights of the graph
    :param datapoints: a dictionary containing the data of each node in the graph needed for the algorithm 1
    :param true_labels: a list containing the true labels of the nodes
    :param samplingset: the sampling set
    :param lambda_lasso: the parameter lambda
    :param penalty_func_name: the name of the penalty function used in the algorithm

    :return iteration_scores: the mean squared error of the predicted weight vectors in each iteration
    :return new_w: the predicted weigh vectors for each node
    '''

#     Sigma = np.diag(np.full(weight_vec.shape, 0.9 / 2))
#     '''
#     Sigma: the block diagonal matrix Sigma
#     '''
#     T_matrix = np.diag(np.array((1.0 / (np.sum(abs(D), 0)))).ravel())
#     '''
#     T_matrix: the block diagonal matrix T
#     '''

#     if np.linalg.norm(np.dot(Sigma ** 0.5, D).dot(T_matrix ** 0.5), 2) > 1:
#         print ('product norm', np.linalg.norm(np.dot(Sigma ** 0.5, D).dot(T_matrix ** 0.5), 2))

    N = len(datapoints)
    E = int(N*(N-1)/2)
    
    D = get_B(N)
    T_matrix = np.diag(np.array((1.0 / (np.sum(abs(D), 0)))).ravel())
    '''
    T_matrix: the block diagonal matrix T
    '''
    Sigma = np.diag(np.full((E), 0.9 / 2))
    '''
    Sigma: the block diagonal matrix Sigma
    '''
        
    
    m, n = datapoints[0]['features'].shape

    # starting algorithm 1

    new_w = np.array([np.zeros(n) for i in range(N)])
    '''
    new_w: the primal variable of the algorithm 1
    '''
    new_u = np.array([np.zeros(n) for i in range(E)])
    '''
    new_u: the dual variable of the algorithm 1
    '''

    iteration_scores = []
    for iterk in range(K):
        # if iterk % 20 == 0:
        #     print ('iter:', iterk)
        #     print('-------------------------')
        prev_w = np.copy(new_w)

        # algorithm 1, line 2
        hat_w = new_w - np.dot(T_matrix, np.dot(D.T, new_u))

        
        for i in range(N):
            optimizer = datapoints[i]['optimizer']
            new_w[i] = optimizer.optimize(datapoints[i]['features'], datapoints[i]['label'], hat_w[i], datapoints[i]['degree'])
        
        weight_vec = np.exp(-pdist(new_w, metric='euclidean'))
        
        # define the penalty function
        if penalty_func_name == 'norm1':
            penalty_func = Norm1Pelanty(lambda_lasso, weight_vec, Sigma, n)

        elif penalty_func_name == 'norm2':
            penalty_func = Norm2Pelanty(lambda_lasso, weight_vec, Sigma, n)

        elif penalty_func_name == 'mocha':
            penalty_func = MOCHAPelanty(lambda_lasso, weight_vec, Sigma, n)
            
        elif penalty_func_name == 'sq_norm2':
            penalty_func = SquaredNorm2Pelanty(lambda_lasso, weight_vec, Sigma, n)

        else:
            raise Exception('Invalid penalty name')
        

        # algorithm 1, line 9
        tilde_w = 2 * new_w - prev_w
        new_u = new_u + np.dot(Sigma, np.dot(D, tilde_w))

        # algorithm 1, line 10
        new_u = penalty_func.update(new_u)

        # calculate the MSE of the predicted weight vectors
        if calculate_score:
            Y_pred = []
            for i in range(N):
                Y_pred.append(np.dot(datapoints[i]['features'], new_w[i]))

            iteration_scores.append(mean_squared_error(true_labels.reshape(N, m), Y_pred))

    # print (np.max(abs(new_w - prev_w)))

    return iteration_scores, new_w


### FedAvg

In [3]:
from sklearn.metrics import mean_squared_error

# from algorithm.penalty import *


def fedAvg(K, datapoints, true_labels, samplingset, calculate_score=False):
    '''
    :param K: the number of iterations
    :param datapoints: a dictionary containing the data of each node in the graph needed for the algorithm 1
    :param true_labels: a list containing the true labels of the nodes
    :param penalty_func_name: the name of the penalty function used in the algorithm

    :return iteration_scores: the mean squared error of the predicted weight vectors in each iteration
    :return new_w: the predicted weigh vectors for each node
    '''

    N = len(datapoints)
    m, n = datapoints[0]['features'].shape

    new_w = np.array([np.zeros(n) for i in range(N)])
    '''
    new_w: the weights
    '''

    iteration_scores = []
    for iterk in range(K):
        for i in samplingset:
            optimizer = datapoints[i]['optimizer']
            new_w[i] = optimizer.optimize(
                datapoints[i]['features'], 
                datapoints[i]['label'], 
                new_w[i], 
                datapoints[i]['degree']
            )
        new_w[:, :] = np.mean(new_w[samplingset], axis=0)

        # calculate the MSE of the predicted weight vectors
        if calculate_score:
            Y_pred = []
            for i in range(N):
                Y_pred.append(np.dot(datapoints[i]['features'], new_w[i]))

            iteration_scores.append(mean_squared_error(true_labels.reshape(N, m), Y_pred))

    # print (np.max(abs(new_w - prev_w)))

    return iteration_scores, new_w



### Primal Update 

As you see in the algorithm picture, the primal update needs a optimizer operator for the sampling set (line 6). We have implemented the optimizers discussed in the paper, both the logistic loss and squared error loss optimizers implementations with pytorch is available, also we have implemented the squared error loss optimizer using the fixed point equation in the `Networked Linear Regression` section of the paper.  

In [4]:
# %load algorithm/optimizer.py 
import torch
import abc
import numpy as np

from abc import ABC


# The linear model which is implemented by pytorch
class TorchLinearModel(torch.nn.Module):
    def __init__(self, n):
        super(TorchLinearModel, self).__init__()
        self.linear = torch.nn.Linear(n, 1, bias=False)

    def forward(self, x):
        y_pred = self.linear(x)
        return y_pred


# The abstract optimizer model which should have model, optimizer, and criterion as the input
class Optimizer(ABC):
    def __init__(self, model, optimizer, criterion):
        self.model = model
        self.optimizer = optimizer
        self.criterion = criterion

    @abc.abstractmethod
    def optimize(self, x_data, y_data, old_weight, regularizer_term):
        torch_old_weight = torch.from_numpy(np.array(old_weight, dtype=np.float32))
        self.model.linear.weight.data = torch_old_weight
        for iterinner in range(40):
            self.optimizer.zero_grad()
            y_pred = self.model(x_data)
            loss1 = self.criterion(y_pred, y_data)
            loss2 = 1 / (2 * regularizer_term) * torch.mean((self.model.linear.weight - torch_old_weight) ** 2)  # + 10000*torch.mean((model.linear.bias+0.5)**2)#model.linear.weight.norm(2)
            loss = loss1 + loss2
            loss.backward()
            self.optimizer.step()

        return self.model.linear.weight.data.numpy()


# The linear model in Networked Linear Regression section of the paper
class LinearModel:
    def __init__(self, degree, features, label):
        mtx1 = 2 * degree * np.dot(features.T, features).astype('float64')
        mtx1 += 1 * np.eye(mtx1.shape[0])
        mtx1_inv = np.linalg.inv(mtx1)

        mtx2 = 2 * degree * np.dot(features.T, label).T

        self.mtx1_inv = mtx1_inv
        self.mtx2 = mtx2

    def forward(self, x):
        mtx2 = x + self.mtx2
        mtx_inv = self.mtx1_inv

        return np.dot(mtx_inv, mtx2)
    

class RidgeLinearModel:
    def __init__(self, degree, features, label, eta=1.0):
        m, n = features.shape
        self.m = m
        z_denominator = 2.0/m * np.sum(features*features) + 2.0*eta + 1.0/degree
        z_partly_nominator = 2.0/m * np.dot(features.T, label)

        self.z_denom = z_denominator
        self.z_nom = z_partly_nominator
        
        self.degree = degree
        self.features = features
        self.label = label
        self.eta = eta

    def forward(self, x):
        z = (1.0/self.degree*x + self.z_nom) / self.z_denom
        return z


# The Linear optimizer in Networked Linear Regression section of the paper
class LinearOptimizer(Optimizer):

    def __init__(self, model):
        super(LinearOptimizer, self).__init__(model, None, None)

    def optimize(self, x_data, y_data, old_weight, regularizer_term):
        return self.model.forward(old_weight)


# The Linear optimizer model which is implemented by pytorch
class TorchLinearOptimizer(Optimizer):
    def __init__(self, model):
        criterion = torch.nn.MSELoss(reduction='mean')
        optimizer = torch.optim.RMSprop(model.parameters())
        super(TorchLinearOptimizer, self).__init__(model, optimizer, criterion)

    def optimize(self, x_data, y_data, old_weight, regularizer_term):
        return super(TorchLinearOptimizer, self).optimize(x_data, y_data, old_weight, regularizer_term)


# The Logistic optimizer model which is implemented by pytorch
class TorchLogisticOptimizer(Optimizer):
    def __init__(self, model):
        criterion = torch.nn.BCELoss(reduction='mean')
        optimizer = torch.optim.RMSprop(model.parameters())
        super(TorchLogisticOptimizer, self).__init__(model, optimizer, criterion)

    def optimize(self, x_data, y_data, old_weight, regularizer_term):
        return super(TorchLogisticOptimizer, self).optimize(x_data, y_data, old_weight, regularizer_term)


### Dual Update 

As mentioned in the paper, the dual update has a penalty function(line 10) which is either norm1, norm2, or mocha.

In [5]:
# %load algorithm/penalty.py
import abc
import numpy as np

from abc import ABC


# The abstract penalty function which has a function update
class Penalty(ABC):
    def __init__(self, lambda_lasso, weight_vec, Sigma, n):
        self.lambda_lasso = lambda_lasso
        self.weight_vec = weight_vec
        self.Sigma = Sigma

    @abc.abstractmethod
    def update(self, new_u):
        pass


# The norm2 penalty function
class Norm2Pelanty(Penalty):
    def __init__(self, lambda_lasso, weight_vec, Sigma, n):
        super(Norm2Pelanty, self).__init__(lambda_lasso, weight_vec, Sigma, n)
        self.limit = np.array(lambda_lasso * weight_vec)

    def update(self, new_u):
        normalized_u = np.where(np.linalg.norm(new_u, axis=1) >= self.limit)
        new_u[normalized_u] = (new_u[normalized_u].T * self.limit[normalized_u] / np.linalg.norm(new_u[normalized_u], axis=1)).T
        return new_u


# The squared norm2 penalty function
class SquaredNorm2Pelanty(Penalty):
    def __init__(self, lambda_lasso, weight_vec, Sigma, n):
        super(SquaredNorm2Pelanty, self).__init__(lambda_lasso, weight_vec, Sigma, n)
        self.normalize_factor = 1 + np.dot(2 * self.Sigma, 1/(self.lambda_lasso * self.weight_vec))

    def update(self, new_u):
        for i in range(new_u.shape[1]):
            new_u[:, i] /= self.normalize_factor

        return new_u
    
    

# The MOCHA penalty function
class MOCHAPelanty(Penalty):
    def __init__(self, lambda_lasso, weight_vec, Sigma, n):
        super(MOCHAPelanty, self).__init__(lambda_lasso, weight_vec, Sigma, n)
        self.normalize_factor = 1 + np.dot(self.Sigma, 1/(self.lambda_lasso * self.weight_vec))

    def update(self, new_u):
        for i in range(new_u.shape[1]):
            new_u[:, i] /= self.normalize_factor

        return new_u


# The norm1 penalty function
class Norm1Pelanty(Penalty):
    def __init__(self, lambda_lasso, weight_vec, Sigma, n):
        super(Norm1Pelanty, self).__init__(lambda_lasso, weight_vec, Sigma, n)
        self.limit = np.array([np.zeros(n) for i in range(len(weight_vec))])
        for i in range(n):
            self.limit[:, i] = lambda_lasso * weight_vec

    def update(self, new_u):
        normalized_u = np.where(abs(new_u) >= self.limit)
        new_u[normalized_u] = self.limit[normalized_u] * new_u[normalized_u] / abs(new_u[normalized_u])
        return new_u


## Create Graph

Each node $i \in V$ represents a local dataset consisting of $m$ feature vectors $x^{(i,1)}, ... , x^{(i,m)} \in R^n$. The feature vectors are i.i.d. realizations of a standard Gaussian random vector x ∼ N(0,I). The labels $y_1^{(i)}, . . . , y_m^{(i)} \in R$ of the nodes $i \in V$ are generated according to the linear model $y_r^{(i)} = (x^{(i, r)})^T w^{(i)} + \epsilon$, with $\epsilon ∼ N(0,\sigma)$. To learn the weight $w^{(i)}$ ,we apply Algorithm 1 to a training set M obtained by randomly selecting 40% of the nodes.

In [6]:
# from algorithm.optimizer import *
from torch.autograd import Variable


def _get_graph_data(cluster_sizes, W, m=100, n=1000, noise_sd=0, is_torch_model=True, eta=1.0):
    '''
    :param W: a list containing the weight vectors for each cluster
    :param m, n: shape of features vector for each node
    :param pin: the probability of edges inside each cluster
    :param pout: the probability of edges between the clusters
    :param noise_sd: the standard deviation of the noise for calculating the labels
    
    :return B: adjacency matrix of the graph
    :return weight_vec: a list containing the edges's weights of the graph
    :return true_labels: a list containing the true labels of the nodes
    :return datapoints: a dictionary containing the data of each node in the graph needed for the algorithm 1 
    '''

    N = np.sum(cluster_sizes)
    '''
    N: total number of nodes
    '''
        
    # create the data of each node needed for the algorithm 1 
    
    datapoints = {}
    '''
    datapoints: a dictionary containing the data of each node in the graph needed for the algorithm 1,
    which are features, label, degree, and also the optimizer model for each node
    '''
    true_labels = []
    '''
    true_labels: the true labels for the nodes of the graph
    '''
    cnt = 0
    for i, cluster_size in enumerate(cluster_sizes):
        for j in range(cluster_size):
            features = np.random.normal(loc=0.0, scale=1.0, size=(m, n))
            '''
            features: the feature vector of node i which are i.i.d. realizations of a standard Gaussian random vector x~N(0,I)
            '''
            label = np.dot(features, W[i]) + np.random.normal(0,noise_sd)
            '''
            label: the label of the node i that is generated according to the linear model y = x^T w + e
            '''
            
            true_labels.append(label)
            
            node_degree = 1.0/(N-1)

            if is_torch_model:
                features = Variable(torch.from_numpy(features)).to(torch.float32)
                label = Variable(torch.from_numpy(label)).to(torch.float32)      
                model = TorchLinearModel(n)
                optimizer = TorchLinearOptimizer(model)
            else:
                if eta:
                    model = RidgeLinearModel(node_degree, features, label, eta)
                else:
                    model = LinearModel(node_degree, features, label)
                optimizer = LinearOptimizer(model) 
            '''
            model : the linear model for the node i 
            optimizer : the optimizer model for the node i 
            ''' 
            
            datapoints[cnt] = {
                'features': features,
                'w': W[i],
                'label': label,
                'optimizer': optimizer,
                'degree': node_degree
            }
            cnt += 1

    return np.array(true_labels), datapoints




### Compare Results

As the result we compare the MSE of Algorithm 1 with plain linear regression 
and decision tree regression

In [7]:
# %load results/compare_results.py
import numpy as np
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.metrics import mean_squared_error



def get_algorithm1_w_MSE(datapoints, predicted_w):
    '''
    :param datapoints:  a dictionary containing the data of each node in the graph needed for the algorithm 1
    :param predicted_w: the predicted weigh vectors for each node
    :param samplingset: the sampling set for the algorithm 1

    :return alg1_MSE: the MSE of the algorithm 1 for all the nodes, the samplingset and other nodes (test set)
    '''
    true_w = []
    for i in range(len(datapoints)):
        true_w.append(datapoints[i]['w'])

    true_w = np.array(true_w)

    alg1_MSE = {'total': mean_squared_error(true_w, predicted_w)}

    return alg1_MSE


def get_linear_regression_MSE(x, y, samplingset, not_samplingset):
    '''
    :param x: a list containing the features of the nodes
    :param y: a list containing the labels of the nodes
    :param samplingset: the training dataset
    :param not_samplingset: the test dataset
    :return linear_regression_MSE : the MSE of linear regression for all the nodes, the samplingset and other nodes (test set)
    '''

    model = LinearRegression().fit(x[samplingset], y[samplingset])
    pred_y = model.predict(x)

    linear_regression_MSE = {'total': mean_squared_error(y, pred_y),
                             'train': mean_squared_error(y[samplingset],
                                                         pred_y[samplingset]),
                             'test': mean_squared_error(y[not_samplingset],
                                                        pred_y[not_samplingset])}

    return linear_regression_MSE


def get_decision_tree_MSE(x, y, samplingset, not_samplingset):
    '''
    :param x: a list containing the features of the nodes
    :param y: a list containing the labels of the nodes
    :param samplingset: the training dataset
    :param not_samplingset: the test dataset
    :return decision_tree_MSE : the MSE of decision tree for all the nodes, the samplingset and other nodes (test set)
    '''

    max_depth = 2

    regressor = DecisionTreeRegressor(max_depth=max_depth)
    regressor.fit(x[samplingset], y[samplingset])
    pred_y = regressor.predict(x)

    decision_tree_MSE = {'total': mean_squared_error(y, pred_y),
                         'train': mean_squared_error(y[samplingset],
                                                     pred_y[samplingset]),
                         'test': mean_squared_error(y[not_samplingset],
                                                    pred_y[not_samplingset])}
    return decision_tree_MSE


def get_scores(datapoints, predicted_w, fl_w, samplingset, others=True):
    N = len(datapoints)
    '''
    N : the total number of nodes
    '''

    # calculate algorithm1 MSE
    alg_1_score = get_algorithm1_MSE(datapoints, predicted_w, samplingset)
    fl_score = get_algorithm1_MSE(datapoints, fl_w, samplingset)
    
    linear_regression_score = None
    decision_tree_score = None

    if others:
        # prepare the data for calculating the linear regression and decision tree regression MSEs
        X = []
        '''
        X: an array containing the features of all the nodes
        '''
        true_labels = []
        '''
        true_labels: an array containing the labels of all the nodes
        '''
        for i in range(len(datapoints)):
            X.append(np.array(datapoints[i]['features']))
            true_labels.append(np.array(datapoints[i]['label']))

        X = np.array(X)
        true_labels = np.array(true_labels)
        m, n = X[0].shape

        x = X.reshape(-1, n)
        y = true_labels.reshape(-1, 1)

        reformated_samplingset = []
        for item in samplingset:
            for i in range(m):
                reformated_samplingset.append(m * item + i)
        reformated_not_samplingset = [i for i in range(m * N) if i not in reformated_samplingset]
        
        # calculate linear regression MSE
        linear_regression_score = get_linear_regression_MSE(x, y, reformated_samplingset, reformated_not_samplingset)

        # calculate decision tree MSE
        decision_tree_score = get_decision_tree_MSE(x, y, reformated_samplingset, reformated_not_samplingset)

    return alg_1_score, fl_score, linear_regression_score, decision_tree_score


### Graph with Two Clusters

This chain graph has two clusters $|C_1| = |C_2| = 100$.
Each node $i \in V$ represents a local dataset consisting of feature vectors $x^{(i,1)}, ... , x^{(i,5)} \in R^2$.
The feature vectors are i.i.d. realizations of a standard Gaussian random vector x ~ N(0,I).
The labels $y_1^{(i)}, . . . , y_5^{(i)} \in R$ for each node $i \in V$
are generated according to the linear model $y_r^{(i)} = (x^{(i, r)})^T w^{(i)} + \epsilon$, with $\epsilon = 0$. 
The tuning parameter $\lambda$ in algorithm1 
is manually chosen, guided by the resulting MSE, as $\lambda=0.01$ for norm1 and norm2 and also $\lambda=0.05$ for mocha penalty function. 
To learn the weight $w^{(i)}$ ,we apply Algorithm 1 to a training set M obtained by randomly selecting 40% of the nodes and use the rest as test set. As the result we compare the mean MSE of Algorithm 1 with plain linear regression and decision tree regression with respect to the different random sampling sets.

In [8]:
%%capture
# from sparsebm import generate_SBM_dataset
import networkx as nx


# def get_graph_data(m=100, n=1000, p=2, noise_sd=0.001, r=1.0,is_torch_model=True):
def get_graph_data(m=10, n=100, p=2, noise_sd=0.001, r=1.0,is_torch_model=True, eta=1.0):

    '''
    :param m, n: shape of features vector for each node
    :param pin: the probability of edges inside each cluster
    :param pout: the probability of edges between the clusters
    :param noise_sd: the standard deviation of the noise for calculating the labels
    
    :return B: adjacency matrix of the graph
    :return weight_vec: a list containing the edges's weights of the graph
    :return true_labels: a list containing the true labels of the nodes
    :return datapoints: a dictionary containing the data of each node in the graph needed for the algorithm 1 
    '''
    cluster_sizes = [50, 50]
    
    # define weight vectors for each cluster of the graph
    param_settings = [ -p + 1 + 2*i for i in range(p)]
    params = []
    for p_i in range(p):
        loc = param_settings[p_i]

        param = np.random.binomial(1, 0.5, size=(n)).astype(np.float32) * r

        params.append(param)
    
    return _get_graph_data(cluster_sizes, params, m, n, noise_sd, is_torch_model, eta)



In [67]:
import datetime


PENALTY_FUNCS = ['norm1', 'norm2', 'mocha']
lambda_lasso=0.1
start =datetime.datetime.now()

true_labels, datapoints = get_graph_data(m=10, n=30, is_torch_model=False, eta=None)
N = len(true_labels)
K=500
penalty_func = PENALTY_FUNCS[0]
samplingset = [i for i in range(N)]
start = datetime.datetime.now()
_, predicted_w = algorithm_1(K, datapoints, true_labels, samplingset, 
                                 lambda_lasso, penalty_func)
alg1_score = get_algorithm1_w_MSE(datapoints, predicted_w)
print('linear regression w_mse:', alg1_score)
print('pred_w:', predicted_w[0][:5], ', true_w', datapoints[0]['w'][:5])

print()
true_labels, datapoints = get_graph_data(m=10, n=30, is_torch_model=False, eta=0.0)
N = len(true_labels)
K=500
penalty_func = PENALTY_FUNCS[0]
samplingset = [i for i in range(N)]
start = datetime.datetime.now()
_, predicted_w = algorithm_1(K, datapoints, true_labels, samplingset, 
                                 lambda_lasso, penalty_func)
alg1_score = get_algorithm1_w_MSE(datapoints, predicted_w)
print('ridge eta:', 0.0, ', w_mse:', alg1_score)
print('pred_w:', predicted_w[0][:5], ', true_w', datapoints[0]['w'][:5])
# print(datapoints[0]['w'])
# print(predicted_w[0])
# print(predicted_w[1])
# print('-------')
# print(datapoints[N-1]['w'])
# print(predicted_w[-2])
# print(predicted_w[-1])

print()
true_labels, datapoints = get_graph_data(m=10, n=30, is_torch_model=False, eta=0.001)
N = len(true_labels)
K=500
penalty_func = PENALTY_FUNCS[0]
samplingset = [i for i in range(N)]
start = datetime.datetime.now()
_, predicted_w = algorithm_1(K, datapoints, true_labels, samplingset, 
                                 lambda_lasso, penalty_func)
alg1_score = get_algorithm1_w_MSE(datapoints, predicted_w)
print('ridge eta:', 0.001, ', w_mse:', alg1_score)
print('pred_w:', predicted_w[0][:5], ', true_w', datapoints[0]['w'][:5])

print()
true_labels, datapoints = get_graph_data(m=10, n=30, is_torch_model=False, eta=0.01)
N = len(true_labels)
K=500
penalty_func = PENALTY_FUNCS[0]
samplingset = [i for i in range(N)]
start = datetime.datetime.now()
_, predicted_w = algorithm_1(K, datapoints, true_labels, samplingset, 
                                 lambda_lasso, penalty_func)
alg1_score = get_algorithm1_w_MSE(datapoints, predicted_w)
print('ridge eta:', 0.01, ', w_mse:', alg1_score)
print('pred_w:', predicted_w[0][:5], ', true_w', datapoints[0]['w'][:5])


print()
true_labels, datapoints = get_graph_data(m=10, n=30, is_torch_model=False, eta=0.1)
N = len(true_labels)
K=500
penalty_func = PENALTY_FUNCS[0]
samplingset = [i for i in range(N)]
start = datetime.datetime.now()
_, predicted_w = algorithm_1(K, datapoints, true_labels, samplingset, 
                                 lambda_lasso, penalty_func)
alg1_score = get_algorithm1_w_MSE(datapoints, predicted_w)
print('ridge eta:', 0.1, ', w_mse:', alg1_score)
print('pred_w:', predicted_w[0][:5], ', true_w', datapoints[0]['w'][:5])

print()
true_labels, datapoints = get_graph_data(m=10, n=30, is_torch_model=False, eta=1.0)
N = len(true_labels)
K=500
penalty_func = PENALTY_FUNCS[0]
samplingset = [i for i in range(N)]
start = datetime.datetime.now()
_, predicted_w = algorithm_1(K, datapoints, true_labels, samplingset, 
                                 lambda_lasso, penalty_func)
alg1_score = get_algorithm1_w_MSE(datapoints, predicted_w)
print('ridge eta:', 1.0, ', w_mse:', alg1_score)
print('pred_w:', predicted_w[0][:5], ', true_w', datapoints[0]['w'][:5])


linear regression w_mse: {'total': 1.5895201802962097e-05}
pred_w: [3.19220066e-03 1.18855848e-03 9.93331085e-01 5.62435903e-04
 3.10671049e-04] , true_w [0. 0. 1. 0. 0.]

ridge eta: 0.0 , w_mse: {'total': 6.905440276222356e-06}
pred_w: [-2.29442434e-04  9.96019392e-01  4.09318381e-03  9.96967124e-01
  3.65567669e-03] , true_w [0. 1. 0. 1. 0.]

ridge eta: 0.001 , w_mse: {'total': 0.4483309325708109}
pred_w: [0.02247281 0.01332811 0.001777   0.00948261 0.01271683] , true_w [0. 1. 0. 0. 0.]

ridge eta: 0.01 , w_mse: {'total': 0.4922345929629344}
pred_w: [ 0.0168096  -0.00152984  0.01306541  0.01408835  0.02998295] , true_w [1. 0. 0. 0. 1.]

ridge eta: 0.1 , w_mse: {'total': 0.49037770411352727}
pred_w: [-0.00486668  0.03201527  0.01630071 -0.00592273  0.01622616] , true_w [0. 1. 1. 0. 0.]

ridge eta: 1.0 , w_mse: {'total': 0.49278227136247527}
pred_w: [ 0.03461774  0.03357921  0.01177233  0.00401238 -0.00162545] , true_w [1. 1. 0. 0. 0.]
