article : https://arxiv.org/pdf/1307.0048.pdf

L'idée du projet est d'implementer un algorithme mapreduce sur la régression linéaire pénalisée, quand $X \in \mathbb{R}^{n \times p}$ avec $p << n$.

Cela correspond à un type de problème ou le nombre de features $p$ (les caractéristiques d'un individu ou un produit) est assez petit et il est envisageable de les stocker en mémoire, alors que la taille du dataset $n$ et très grande et on voudrait faire du calcul distributé dessus

L'idée de l'agorithme est alors d'exprimer la quantité à minimiser en fonction des matrices ou des vecteurs dont la dimension est une fonction de $p$ ($p\times p$ ou $p$ en fait). Et calculer ces quantitées à partir de $X \in \mathbb{R}^{n \times p}$ en faisant une reduction sur $n$ (qui est la taille de notre dataset)

In [148]:
import numpy as np
from numpy.random import multivariate_normal
from scipy.linalg.special_matrices import toeplitz

p = 100
n = 10000
cov = toeplitz(0.5 ** np.arange(p))
X = multivariate_normal(np.zeros(p), np.eye(p), n)

if True:
    idx = np.arange(p)
    coefs = (idx % 2) * np.exp(-idx / 10.)
    coefs[40:] = 0.
    
y = X.dot(coefs)

Data_array = [(X[i], y[i]) for i in range(n)]

Data_rdd = sc.parallelize(Data_array)




In [2]:

p = 10
n = 100
cov = toeplitz(0.5 ** np.arange(p))
X = multivariate_normal(np.zeros(p), np.eye(p), n)

coefs = np.ones(p)

y = X.dot(coefs)

# Algorithm 1

En notant $X_c \in \mathbb{R}^{n \times p }$ la matrice centrée réduite de $X$. On a :
$$X_c = (X - \mathbb{1} (\bar{X_1}, \dots, \bar{X_p}))D^{-1}$$

et
$$\begin{align}
&||Y - \alpha \mathbb{1} - X \beta||_2 + p_\lambda(\beta) \\
= & ||Y - \alpha \mathbb{1} - (X_cD + \mathbb{1} (\bar{X_1}, \dots, \bar{X_p})) \beta||_2 + p_\lambda(\beta) \\
= & ||Y - (\alpha + (\bar{X_1}, \dots, \bar{X_p})) \beta) \mathbb{1} - X_c D \beta ||_2 + p_\lambda(\beta)
\end{align}$$

On a donc que minimiser:
$$||Y - \alpha \mathbb{1} - X \beta||_2 + p_\lambda(\beta) $$
revient à minimiser:

$$\begin{align}
||Y - \hat{\alpha} \mathbb{1} - X_c \hat{\beta}||_2 + p_\lambda(\hat{\beta})
\end{align}$$
Avec le changement de variable:

$$\begin{align} 
\hat{\alpha}&= \alpha + \left(\bar{X_1}, \dots, \bar{X_p}\right) \beta \\
\hat{\beta} &= D \beta
\end{align}
 $$
 
 avec D la matrice diagonale des déviations standards.
 
 
 Comme maintenant les variables sont centrées, la minimisation en $\hat{\alpha}$ donne $\hat{\alpha} = \bar{Y}$, et:
 $$\begin{align}
 \hat{\beta}^* &= \arg\min_{\hat{\beta}} ||Y - \hat{\alpha} \mathbb{1} - X_c \hat{\beta}||_2 + p_\lambda(\hat{\beta}) \\
             &= \arg\min_{\hat{\beta}} ||(Y - \hat{\alpha} \mathbb{1})||_2^2  + ||X_c \hat{\beta}||_2^2 - 2(Y - \hat{\alpha} \mathbb{1})^T X_c \hat{\beta}  + p_\lambda(\hat{\beta}) \\
             &= \arg\min_{\hat{\beta}} ||X_c \hat{\beta}||_2^2 - 2(Y - \hat{\alpha} \mathbb{1})^T X_c \hat{\beta}  + p_\lambda(\hat{\beta}) \\
             &= \arg\min_{\hat{\beta}} \hat{\beta}^TX_c^T X_c \hat{\beta} - 2Y^T X_c\hat{\beta}  + p_\lambda(\hat{\beta})
 \end{align}$$
 
 $X_c^TX_c, Y^TX_c$ sont une matrice de taille $p \times p$ et un vecteur de taille $p$. On peut donc par hypothèse les stocker en mémoire et résoudre ce problème par une des méthodes d'optimisation classiques (coordinate descent par exemple).
 
 Les quantités qu'on doit calculer sont $X_c^TX_c$ qui est la matrice de correlation de $X$. $Y^TX_c$ et $(\bar{X_1}, \dots, \bar{X_p})$ (pour faire le changement de variables inverse).
 
 # En fait pas sûr de mon truc car quand on fait du cross validation centrée sur k-1 partition  с'est pas pareil que de centrer sur tout


In [146]:
n, p = 100, 50
X = np.random.randint(20, size=(n,p))
coeffs = np.ones(p)
y = X.dot(coeffs)
Data_array = [(X[i], y[i]) for i in range(n)]

Data_rdd = sc.parallelize(Data_array)

In [149]:
from scipy import optimize
def LR_MR(Data):
    def reduce_mean(row1, row2):
        s1 = row1[0]
        s2 = row2[0]
        
        return (s1 + s2, s1 / (s1 + s2) * row1[1] + s2 / (s1 + s2) * row2[1])
    
    def map_statistics(row):
        # calculate statistics for one row [size, mean(x), mean(y), Y^TY, y * x, cov(x)]
        x = row[0]
        y = row[1]
        return [1, x, y, y**2, y * x, np.zeros((len(row[0]), len(row[0])))]

    statistics = Data.map(map_statistics)

    def reduce_statistics(row1, row2):
        #combined with map_statistics returns [size, mean(X), mean(Y), Y^TY, Y^TX, Cov(X)]
        s_1 = row1[0]
        s_2 = row2[0]
        mean_x = s_1 / (s_1 + s_2) * row1[1] + s_2 / (s_1 + s_2) * row2[1]
        mean_y = s_1 / (s_1 + s_2) * row1[2] + s_2 / (s_1 + s_2) * row2[2]
        
        mean_substraction = (row1[1] - row2[1]).reshape((1, -1))
        cov = s_1 / (s_1 + s_2) * row1[5] + s_2 / (s_1 + s_2) * row2[5] + s_1 * s_2 / (
            s_1 + s_2)**2 * (mean_substraction).T.dot(mean_substraction)
        emit = [s_1 + s_2, mean_x, mean_y, row1[3] +
                row2[3], row1[4] + row2[4], cov]
        return emit
    
    statistics = statistics.reduce(reduce_statistics)
    
    size = statistics[0]
    means_X = statistics[1].reshape(-1, 1)
    mean_Y = statistics[2]
    YT_Y = statistics[3]
    YT_X = statistics[4].reshape(-1, 1)
    COV_X = statistics[5] 
    p = COV_X.shape[0]
    XT_X = size * (COV_X + means_X.dot(means_X.T))
    D_inv = np.diag([1 / np.sqrt(COV_X[i,i]) for i in range(p)])
    D = np.diag([np.sqrt(COV_X[i,i]) for i in range(p)])
    
    if False:
        #c'était pour vérifier que les valeurs sont bien calculées mais a priori c'est bon 
        print("COV_X \n {}".format(COV_X))
        print(size)
        print("XT_X \n {} \n".format(XT_X))
        print("means X \n {} \n".format(means_X))
    
    def beta_objective(beta):
        beta = D.dot(beta) #changement de variable en beta_hat
        #the simplified objective function for beta
        linear_term = -(YT_X - size * mean_Y * means_X).T.dot(D_inv).dot(beta)
        quadratic_term = 1 /2 * beta.dot(D_inv).dot(XT_X - size * means_X.dot(means_X.T)).dot(D_inv).dot(beta)
        return linear_term + quadratic_term

    def beta_objective_gradient(beta):
        #calculate gradients of each term
        linear_term = -(YT_X - size * mean_Y * means_X).T.dot(D_inv)
        quadratic_term = beta.dot(D_inv).dot(XT_X - size * means_X.dot(means_X.T)).dot(D_inv)

        return np.ravel(linear_term + quadratic_term)
    
    #changement de variable inverse:
    alpha_hat = mean_Y
    beta = minimize(beta_objective, np.zeros(p), method="CG").x 
    alpha = alpha_hat - means_X.T.dot(beta)
    print("alpha, beta \n {} \n  {} \n".format(alpha, beta))
    return (alpha, beta)

alpha1, beta1 = LR_MR(Data=Data_rdd)


# pour comparer avec la vrai valeur

def objective(beta_alpha):
    alpha = beta_alpha[0]
    beta = beta_alpha[1:]
    return np.linalg.norm(y - X.dot(beta) - np.ones(y.shape[0]), ord=2)


beta_alpha = minimize(objective, np.zeros(p + 1)).x
alpha = beta_alpha[0]
beta = beta_alpha[1:]
print(alpha, beta)

if True:
    COV_X = np.cov(X, rowvar=False, bias=True)
    n = X.shape[0]
    p = X.shape[1]
    means_X = np.mean(X, axis=0).reshape(1, -1)
    D_inv = np.diag([COV_X[i, i] for i in range(p)])
    mean_Y = np.mean(y)
    if False:
        print("COV_X \n {} \n".format(COV_X))
        print(" X.T.X \n {} \n".format(X.T.dot(X)))
        means_X = np.mean(X, axis=0).reshape(1, -1)
        print("means X \n {} \n".format(means_X))
        n = X.shape[0]
        print("size {}".format(n))
        print("cool XTX \n {} \n".format(n * (COV_X + means_X.T.dot(means_X))))

alpha, beta 
 [-1.65064462e-10] 
  [ 5.75388372e-09  9.04837409e-01 -1.81901021e-09  7.40818214e-01
 -3.51445617e-09  6.06530669e-01 -5.25056089e-09  4.96585306e-01
  9.91301333e-09  4.06569657e-01  3.39766618e-09  3.32871085e-01
 -6.24473255e-09  2.72531797e-01 -9.30253687e-09  2.23130166e-01
 -7.65121052e-09  1.82683513e-01 -6.04026846e-09  1.49568617e-01
 -2.54119166e-09  1.22456434e-01  1.37212654e-09  1.00258837e-01
 -3.23362769e-09  8.20849966e-02  7.24651633e-09  6.72055213e-02
 -6.74280835e-09  5.50232244e-02  4.82137097e-09  4.50492016e-02
  5.94053877e-09  3.68831817e-02 -3.35281421e-09  3.01973947e-02
  5.82446470e-09  2.47235294e-02 -7.69020203e-09  2.02419170e-02
 -7.63466638e-09 -1.01384692e-08 -3.89536463e-09  3.07001461e-09
 -2.64306746e-09 -1.33516612e-08 -1.54638222e-09 -8.76548470e-09
  1.84372819e-09 -4.80020189e-09 -3.08247523e-09  4.75321205e-09
 -4.98373406e-09  3.35765596e-09  5.93557678e-09  2.00095395e-08
 -4.10275571e-09 -1.55033461e-09 -1.04717361e-08 -1.040

In [177]:
from scipy.optimize import minimize 
def PenalizedLR_MR(Data, k, lambdas, penalizer="ridge"):
    """
    Data: an RDD each rows of which is a tuple (x, y)
    k: number of partitions for splitting
    lambdas: list of lambdas to test on
    penalizer: penalization term (only "ridge" is avaialble for now)
    
    """
    

    #calculate means, variance,  standardize X
    def reduce_mean(row1, row2):
        s1 = row1[0]
        s2 = row2[0]
        return (s1 + s2, s1 / (s1 + s2) * row1[1] + s2 / (s1 + s2) * row2[1])
    
    #vector of means of length p    
    means_X = np.array(Data.map(lambda row: (1, row[0])).reduce(reduce_mean)[1]) 
    mean_Y = Data.map(lambda row: (1, row[1])).reduce(reduce_mean)[1]
    
    p = len(means_X)
    #center X
    Data.map(lambda row: (np.arrray([row[0][i] - means_X[i] for i in range(p)]), row[1]))
    
    # vector of variance of X of length p (using the fact that now X is centered)
    vars_X = np.array(Data.map(lambda row: (1, row[0]**2)).reduce(reduce_mean)[1])
    
    #standardize X
    Data.map(lambda row: (np.array([row[1][i] / np.sqrt(vars_X[i]) for i in range(p)]), row[1]))
    
        
    def map_statistics(row):
        # calculate statistics for one row [size, mean(x), mean(y), Y^TY, y * x, cov(x)]
        x = row[0]
        y = row[1]
        return (np.random.randint(k), [1, x, y, y**2, y * x, np.zeros((len(row[0]), len(row[0])))])

    statistics = Data.map(map_statistics)

    def reduce_statistics(row1, row2):
        #combined with map_statistics returns [size, mean(X), mean(Y), Y^TY, Y^TX, Cov(X)]
        s_1 = row1[0]
        s_2 = row2[0]
        mean_x = s_1 / (s_1 + s_2) * row1[1] + s_2 / (s_1 + s_2) * row2[1]
        mean_y = s_1 / (s_1 + s_2) * row1[2] + s_2 / (s_1 + s_2) * row2[2]
        cov = s_1 / (s_1 + s_2) * row1[5] + s_2 / (s_1 + s_2) * row2[5] + s_1 * s_2 / (
            s_1 + s_2)**2 * (row1[1] - row2[1]).T.dot(row1[1] - row2[1])
        emit = [s_1 + s_2, mean_x, mean_y, row1[3] +
                row2[3], row1[4] + row2[4], cov]
        return emit

    statistics = statistics.reduceByKey(reduce_statistics)

    # Cross validation
    test_errors = []
    for lmbda in lambdas:
        error = 0
        for i in range(k):
            #do the split
            statistics_train = statistics.filter(lambda row: row[0] != i)
            statistics_train = statistics_train.reduce(reduce_statistics)[1]
            
            statistics_test = statistics.filter(lambda row: row[0] == i).collect()[0][1]
            #calculate statistics for our train dataset
            size = statistics_train[0]
            means_X = statistics_train[1].reshape(-1, 1)
            print(means_X.shape)
            mean_Y = np.array(statistics_train[2])
            YT_Y = statistics_train[3]
            YT_X = np.array(statistics_train[4]).reshape(-1, 1)
            COV_X = np.array(statistics_train[5])
            XT_X = size * ( COV_X + means_X.T.dot(means_X))
            
            D_inv = np.diag([1 / np.sqrt(COV_X[i,i]) for i in range(p)])
            
            #calculate statistics foro our test dataset:
            size_test = statistics_test[0]
            means_X_test = statistics_test[1].reshape(-1, 1)
            mean_Y_test = statistics_test[2]
            YT_Y_test = statistics_test[3]
            YT_X_test = statistics_test[4].reshape(-1, 1)
            COV_X_test = statistics_test[5]
            
            
            XT_X_test = size_test * (COV_X_test + means_X_test.T.dot(means_X_test))
            D_inv_test = np.diag([1 / np.sqrt(COV_X_test[i,i]) for i in range(p)])
            
            def beta_objective(beta):
                #the simplified objective function for beta
                linear_term = (YT_X - size * mean_Y * means_X).T.dot(D_inv).dot(beta)
                quadratic_term = 1 /2 * beta.dot(D_inv).dot(XT_X - size * means_X.T.dot(means_X)).dot(D_inv).dot(beta)
                if penalizer=="ridge":
                    penalization_term = 1 / 2 * np.linalg.norm(beta, ord=2)
                else:
                    penalization_term = 0
                return linear_term + quadratic_term + lmbda * penalization_term
            
                
            def beta_objective_gradient(beta):
                #calculate gradients of each term
                if penalizer=="ridge":
                    penalization_term = beta
                else:
                    penalization_term = 0
                linear_term = (YT_X - size * mean_Y * means_X).T.dot(D_inv)
                quadratic_term = beta.dot(D_inv).dot(XT_X - size * means_X.T.dot(means_X)).dot(D_inv)
                
                return np.ravel(linear_term + quadratic_term + lmbda * penalization_term)
            
            alpha_hat = mean_Y
            print(beta_objective(np.zeros(p)))
            print(beta_objective_gradient(np.zeros(p)).shape)
            beta_hat = minimize(beta_objective, np.zeros(p), method="CG", jac=beta_objective_gradient).x
            
            def test_error(alpha, beta):
                quadratic_term = YT_Y_test + size_test * alpha**2 + beta.dot(D_inv_test).dot(XT_X_test - size_test * means_X_test.T.dot(means_X_test)).dot(D_inv).dot(beta)
                print(mean_Y_test.shape, YT_X_test.shape, size_test, means_X_test.shape)

                double_term = -2 * alpha * mean_Y_test -2 * (YT_X_test - size_test * mean_Y_test * means_X_test).T.dot(D_inv).dot(beta)
                return quadratic_term + double_term
            error += test_error(alpha_hat, beta_hat)
        test_errors.append(error)
        
    best_i = np.argmin(test_errors)
    best_lambda = lambdas[best_i]

    #calculate statistics
    statistics = statistics.map(lambda row: row[1]).reduce(reduce_statistics)
    
    size = statistics[0]
    means_X = statistics[1].reshape(-1, 1)
    mean_Y = statistics[2]
    YT_Y = statistics[3]
    YT_X = statistics[4].reshape(-1, 1)
    COV_X = statistics[5]

    XT_X = size * (COV_X + means_X.T.dot(means_X))
    
    D_inv = np.diag([1 / np.sqrt(COV_X[i,i]) for i in range(p)])
    
    def beta_objective(beta):
        #the simplified objective function for beta
        linear_term = (YT_X - size * mean_Y * means_X).T.dot(D_inv).dot(beta)
        quadratic_term = 1 /2 * beta.dot(D_inv).dot(XT_X - size * means_X.T.dot(means_X)).dot(D_inv).dot(beta)
        if penalizer=="ridge":
            penalization_term = 1 / 2 * np.linalg.norm(beta, ord=2)
        else:
            penalization_term = 0
        return linear_term + quadratic_term + lmbda * penalization_term


    def beta_objective_gradient(beta):
        #calculate gradients of each term
        if penalizer=="ridge":
            penalization_term = beta
        else:
            penalization_term = 0
        linear_term = (YT_X - size * mean_Y * means_X).T.dot(D_inv)
        quadratic_term = beta.dot(D_inv).dot(XT_X - size * means_X.T.dot(means_X)).dot(D_inv)

        return np.ravel(linear_term + quadratic_term + lmbda * penalization_term)
    alpha_hat = mean_Y
    beta_hat = minimize(beta_objective, np.zeros(p), method="CG", jac=beta_objective_gradient).x
    

    beta = D_inv.dot(beta_hat)
    alpha = alpha_hat - means_X.T.dot(beta)
    
    return (alpha, beta, best_lambda)

In [79]:
PenalizedLR_MR(Data=Data_rdd)

(10, 1)
[0.]
(10,)
() (10, 1) 68 (10, 1)
(10, 1)
[0.]
(10,)
() (10, 1) 32 (10, 1)


(array([4.26519645e+14]),
 array([ 1.33572468e+15,  1.14037887e+14, -1.87496611e+15, -8.50901452e+14,
         2.70020556e+14, -1.38159084e+15, -1.27280006e+14,  3.45736829e+12,
         1.62230867e+15,  8.89189237e+14]),
 0)