article : https://arxiv.org/pdf/1307.0048.pdf

L'idée du projet est d'implementer un algorithme mapreduce sur la régression linéaire pénalisée, quand $X \in \mathbb{R}^{n \times p}$ avec $p << n$.

Cela correspond à un type de problème ou le nombre de features $p$ (les caractéristiques d'un individu ou un produit) est assez petit et il est envisageable de les stocker en mémoire, alors que la taille du dataset $n$ et très grande et on voudrait faire du calcul distributé dessus

L'idée de l'agorithme est alors d'exprimer la quantité à minimiser en fonction des matrices ou des vecteurs dont la dimension est une fonction de $p$ ($p\times p$ ou $p$ en fait). Et calculer ces quantitées à partir de $X \in \mathbb{R}^{n \times p}$ en faisant une reduction sur $n$ (qui est la taille de notre dataset)

In [106]:
import numpy as np
from numpy.random import multivariate_normal
from scipy.linalg.special_matrices import toeplitz

p = 10
n = 100
cov = toeplitz(0.5 ** np.arange(p))
X = multivariate_normal(np.zeros(p), np.eye(p), 100)

if False:
    idx = np.arange(p)
    coefs = (idx % 2) * np.exp(-idx / 10.)
    coefs[20:] = 0.
    
coefs = np.ones(p)

y = X.dot(coefs)

Data = [(X[i], y[i]) for i in range(n)]

Data = sc.parallelize(Data)




# Algorithm 1

En notant $X_c \in \mathbb{R}^{n \times p }$ la matrice centrée réduite de $X$. On a que minimiser:
$$||Y - \alpha \mathbb{1} - X \beta||_2 + p_\lambda(\beta)$$

Revient au même que minimiser:
$$\begin{align}
||Y - \hat{\alpha} \mathbb{1} - X_c \hat{\beta}||_2 + p_\lambda(\hat{\beta})
\end{align}$$
Avec le changement de variable:

$$\begin{align} 
\hat{\alpha}&= \alpha + \left(\bar{X_1}, \dots, \bar{X_p}\right) \beta \\
\hat{\beta} &= D \beta
\end{align}
 $$
 
 avec D la matrice diagonale des déviations standards.
 
 
 Comme maintenant les variables sont centrées, la minimisation en $\hat{\alpha}$ donne $\hat{\alpha} = \bar{Y}$, et:
 $$\begin{align}
 \hat{\beta}^* &= \arg\min_{\hat{\beta}} ||Y - \hat{\alpha} \mathbb{1} - X_c \hat{\beta}||_2 + p_\lambda(\hat{\beta}) \\
             &= \arg\min_{\hat{\beta}} ||(Y - \hat{\alpha} \mathbb{1})||_2^2  + ||X_c \hat{\beta}||_2^2 - 2(Y - \hat{\alpha} \mathbb{1})^T X_c \hat{\beta}  + p_\lambda(\hat{\beta}) \\
             &= \arg\min_{\hat{\beta}} ||X_c \hat{\beta}||_2^2 - 2(Y - \hat{\alpha} \mathbb{1})^T X_c \hat{\beta}  + p_\lambda(\hat{\beta}) \\
             &= \arg\min_{\hat{\beta}} \hat{\beta}^TX_c^T X_c \hat{\beta} - 2Y^T X_c\hat{\beta}  + p_\lambda(\hat{\beta})
 \end{align}$$
 
 $X_c^TX_c, Y^TX_c$ sont une matrice de taille $p \times p$ et un vecteur de taille $p$. On peut donc par hypothèse les stocker en mémoire et résoudre ce problème par une des méthodes d'optimisation classiques (coordinate descent par exemple).
 
 Les quantités qu'on doit calculer sont $X_c^TX_c$ qui est la matrice de correlation de $X$. $Y^TX_c$ et $(\bar{X_1}, \dots, \bar{X_p})$ (pour faire le changement de variables inverse).
 
 # En fait pas sûr de mon truc car quand on fait du cross validation centrée sur k-1 partition  с'est pas pareil que de centrer sur tout


In [109]:
from scipy.optimize import minimize 
def PenalizedLR_MR(Data, k, lambdas, penalizer="ridge"):
    """
    Data: an RDD each rows of which is a tuple (x, y)
    k: number of partitions for splitting
    lambdas: list of lambdas to test on
    penalizer: penalization term (only "ridge" is avaialble for now)
    
    """
    

    #calculate means, variance,  standardize X
    def reduce_mean(row1, row2):
        s1 = row1[0]
        s2 = row2[0]
        
        return (s1 + s2, s1 / (s1 + s2) * row1[1] + s2 / (s1 + s2) * row2[1])
    
    #vector of means of length p    
    means_X = np.array(Data.map(lambda row: (1, row[0])).reduce(reduce_mean)[1]) 
    mean_Y = Data.map(lambda row: (1, row[1])).reduce(reduce_mean)[1]
    
    p = len(means_X)
    #center X
    Data.map(lambda row: (np.arrray([row[0][i] - means_X[i] for i in range(p)]), row[1]))
    
    # vector of variance of X of length p (using the fact that now X is centered)
    vars_X = np.array(Data.map(lambda row: (1, row[0]**2)).reduce(reduce_mean)[1])
    
    #standardize X
    Data.map(lambda row: (np.array([row[1][i] / vars_X[i] for i in range(p)]), row[1]))
    
        
    def map_statistics(row):
        # calculate statistics for one row [size, mean(x), mean(y), Y^TY, y * x, cov(x)]
        x = row[0]
        y = row[1]
        return (np.random.randint(k), [1, x, y, y**2, y * x, np.zeros((len(row[0]), len(row[0])))])

    statistics = Data.map(map_statistics)

    def reduce_statistics(row1, row2):
        #returns [size, mean(X), mean(Y), Y^TY, Y^TY, Cov(X)]
        s_1 = row1[0]
        s_2 = row2[0]
        mean_x = s_1 / (s_1 + s_2) * row1[1] + s_2 / (s_1 + s_2) * row2[1]
        mean_y = s_1 / (s_1 + s_2) * row1[2] + s_2 / (s_1 + s_2) * row2[2]
        cov = s_1 / (s_1 + s_2) * row1[5] + s_2 / (s_1 + s_2) * row2[5] + s_1 * s_2 / (
            s_1 + s_2)**2 * (row1[1] - row2[1]).T.dot(row1[1] - row2[1])
        emit = [s_1 + s_2, mean_x, mean_y, row1[3] +
                row2[3], row1[4] + row2[4], cov]
        return emit

    statistics = statistics.reduceByKey(reduce_statistics)

    # Cross validation
    test_errors = []
    for lmbda in lambdas:
        error = 0
        for i in range(k):
            #do the split
            statistics_train = statistics.filter(lambda row: row[0] != i)
            statistics_train = statistics_train.reduce(reduce_statistics)[1]
            
            statistics_test = statistics.filter(lambda row: row[0] == i).collect()[0][1]
            #calculate statistics for our train dataset
            size = statistics_train[0]
            means_X = statistics_train[1]
            mean_Y = np.array(statistics_train[2])
            YT_Y = statistics_train[3]
            YT_X = np.array(statistics_train[4])
            COV_X = np.array(statistics_train[5])
            XT_X = COV_X + (size - 1) * means_X.T.dot(means_X)
            
            D_inv = np.diag([1 / np.sqrt(COV_X[i,i]) for i in range(p)])
            
            #calculate statistics foro our test dataset:
            size_test = statistics_test[0]
            means_X_test = statistics_test[1]
            mean_Y_test = statistics_test[2]
            YT_Y_test = statistics_test[3]
            YT_X_test = statistics_test[4]
            COV_X_test = statistics_test[5]
            
            XT_X_test = COV_X_test + (size_test - 1) * means_X_test.T.dot(means_X_test)
            D_inv_test = np.diag([1 / np.sqrt(COV_X_test[i,i]) for i in range(p)])
            
            def beta_objective(beta):
                #the simplified objective function for beta
                linear_term = (YT_X - size * mean_Y * means_X).dot(D_inv).dot(beta)
                quadratic_term = 1 /2 * beta.dot(D_inv).dot(XT_X - size * means_X.T.dot(means_X)).dot(D_inv).dot(beta)
                if penalizer=="ridge":
                    penalization_term = 1 / 2 * np.linalg.norm(beta, ord=2)
                else:
                    penalization_term = 0
                return linear_term + quadratic_term + lmbda * penalization_term
            
                
            def beta_objective_gradient(beta):
                #calculate gradients of each term
                if penalizer=="ridge":
                    penalization_term = beta
                else:
                    penalization_term = 0
                linear_term = (YT_X - size * mean_Y * means_X).dot(D_inv)
                quadratic_term = beta.dot(D_inv).dot(XT_X - size * means_X.T.dot(means_X)).dot(D_inv)
                
                return linear_term + quadratic_term + lmbda * penalization_term
            
            alpha_hat = mean_Y
            beta_hat = minimize(beta_objective, np.zeros(p), method="CG", jac=beta_objective_gradient).x
            
            def test_error(alpha, beta):
                quadratic_term = YT_Y_test + size_test * alpha**2 + beta.dot(D_inv_test).dot(XT_X_test - size_test * means_X_test.T.dot(means_X_test)).dot(D_inv).dot(beta)
                double_term = -2 * alpha * mean_Y_test -2 * (YT_X_test - size_test * mean_Y * means_X).dot(D_inv).dot(beta)
                return quadratic_term + double_term
            error += test_error(alpha_hat, beta_hat)
        test_errors.append(error)
        
    best_i = np.argmin(test_errors)
    best_lambda = lambdas[best_i]

    #calculate statistics
    statistics = statistics.map(lambda row: row[1]).reduce(reduce_statistics)
    
    size = statistics[0]
    means_X = statistics[1]
    mean_Y = statistics[2]
    YT_Y = statistics[3]
    YT_X = statistics[4]
    COV_X = statistics[5]

    XT_X = COV_X + (size - 1) * means_X.T.dot(means_X)
    
    D_inv = np.diag([1 / np.sqrt(COV_X[i,i]) for i in range(p)])
    
    def beta_objective(beta):
        #the simplified objective function for beta
        linear_term = (YT_X - size * mean_Y * means_X).dot(D_inv).dot(beta)
        quadratic_term = 1 /2 * beta.dot(D_inv).dot(XT_X - size * means_X.T.dot(means_X)).dot(D_inv).dot(beta)
        if penalizer=="ridge":
            penalization_term = 1 / 2 * np.linalg.norm(beta, ord=2)
        else:
            penalization_term = 0
        return linear_term + quadratic_term + lmbda * penalization_term


    def beta_objective_gradient(beta):
        #calculate gradients of each term
        if penalizer=="ridge":
            penalization_term = beta
        else:
            penalization_term = 0
        linear_term = (YT_X - size * mean_Y * means_X).dot(D_inv)
        quadratic_term = beta.dot(D_inv).dot(XT_X - size * means_X.T.dot(means_X)).dot(D_inv)

        return linear_term + quadratic_term + lmbda * penalization_term
    alpha_hat = mean_Y
    beta_hat = minimize(beta_objective, np.zeros(p), method="CG", jac=beta_objective_gradient).x
    
    print(alpha_hat, "\n")
    print(means_X)
    print(beta_hat)  
    print(beta_objective(beta_hat))
    beta = D_inv.dot(beta_hat)
    alpha = alpha_hat - means_X.T.dot(beta)
    
    return (alpha, beta, best_lambda)

In [110]:
PenalizedLR_MR(Data=Data, k=2, lambdas=[0], penalizer=0)

-0.048164441201812055 

[ 2.03101752e-01 -1.98978099e-02  1.81806798e-02 -1.21368099e-01
  6.94640114e-02  2.26584509e-03 -1.29893643e-04 -6.94826179e-02
  6.17316673e-02 -1.92029976e-01]
[ 1.99131038e+17  4.08601611e+17 -1.74548145e+16 -1.93643184e+17
 -3.46814935e+17  7.16903245e+16 -5.81265492e+17  1.75790948e+17
  1.64903021e+17  1.19061479e+17]
-6.259820860377334e+18


(-2191945045476063.0,
 array([ 6.46922703e+16,  1.32743575e+17, -5.67059557e+15, -6.29094154e+16,
        -1.12670761e+17,  2.32902410e+16, -1.88837384e+17,  5.71097085e+16,
         5.35725166e+16,  3.86798436e+16]),
 0)

In [9]:
test_data = sc.parallelize([(np.array([i]), i) for i in range(1, 10)])
test_data.take(4)

[(array([1]), 1), (array([2]), 2), (array([3]), 3), (array([4]), 4)]

In [10]:
stat = PenalizedLR_MR(test_data, 1, [1])
stat.collect()

[(0, [9, array([[5.]]), 5.0, 285, array([[285]]), array([[6.66666667]])])]

In [11]:
np.cov(np.arange(1,10), bias=True)

array(6.66666667)