# Reproduction of Chapter 2 Slide 75

In [1]:
import numpy as np
import pandas as pd
from scipy import linalg
import tensorflow as tf

# Switch TensorFlow to eager execution so ops run immediately; later cells rely
# on this when they call `.numpy()` on tensors and use `tf.GradientTape`.
tf.enable_eager_execution()

In [2]:
# Read the measurements and key every row by its subject identifier.
orthodont_data = pd.read_csv('orthodont.csv').set_index('Subject')
orthodont_data.head(8)

Unnamed: 0_level_0,distance,age,Sex
Subject,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
M01,26.0,8,Male
M01,25.0,10,Male
M01,29.0,12,Male
M01,31.0,14,Male
M02,21.5,8,Male
M02,22.5,10,Male
M02,23.0,12,Male
M02,26.5,14,Male


In [3]:
def make_covariates(data_frame):
    """Build the fixed-effects design matrix for the rows of ``data_frame``.

    The returned array has one row per input row and four columns, in order:
    an intercept of ones, ``age - 8``, a 0/1 indicator for ``Sex == 'Female'``,
    and the product of the previous two columns.
    """
    # Shift age so that the intercept refers to age 8.
    shifted_age = (data_frame['age'] - 8).values
    female = data_frame['Sex'].eq('Female').values.astype(np.float64)
    intercept = np.ones(len(data_frame))
    interaction = shifted_age*female
    return np.column_stack([intercept, shifted_age, female, interaction])

def make_response(data_frame):
    """Return the ``distance`` column of ``data_frame`` as a NumPy array."""
    distance = data_frame['distance']
    return distance.values

# One design matrix per subject, stacked along a leading subject axis.
# NOTE(review): stacking presumably needs every subject to have the same
# number of rows -- confirm against the data.
X = tf.convert_to_tensor(
    [make_covariates(orthodont_data.loc[subject])
     for subject in np.unique(orthodont_data.index)],
    dtype=tf.float32)

# Matching responses per subject, in the same subject order as X, with a
# trailing unit axis so each subject's response is a column vector.
y = tf.convert_to_tensor(
    [make_response(orthodont_data.loc[subject])
     for subject in np.unique(orthodont_data.index)],
    dtype=tf.float32)
y = tf.expand_dims(y, axis=-1)

In [4]:
def solve_beta(X, y, weights):
    """Solve the weighted normal equations for the coefficient vector.

    Returns ``beta`` satisfying
    ``(sum_i X_i^T W X_i) beta = sum_i X_i^T W y_i``, where the sum runs over
    the leading axis of ``X`` and ``y`` and ``W`` is ``weights``. The system is
    solved through a Cholesky factorisation of the left-hand matrix.

    Args:
        X: rank-3 tensor of design matrices, indexed by the leading axis.
        y: rank-3 tensor of responses with the same leading axis as ``X``.
        weights: square weight matrix shared by every slice of ``X``.

    Returns:
        The solution of the normal equations as a column tensor.
    """
    # X_i^T W appears on both sides of the normal equations, so build it once.
    weighted_X_t = tf.tensordot(tf.transpose(X, [0, 2, 1]), weights, 1)
    projected_X = tf.reduce_sum(tf.matmul(weighted_X_t, X), 0)
    projected_y = tf.reduce_sum(tf.matmul(weighted_X_t, y), 0)
    return tf.linalg.cholesky_solve(tf.linalg.cholesky(projected_X), projected_y)

def loss_fn(X, y, covariance, reml=False):
    """Profiled Gaussian loss for a within-subject covariance matrix.

    With ``W = covariance^{-1}`` and ``beta = solve_beta(X, y, W)``, the value
    returned is ``mean_i(r_i^T W r_i) + logdet(covariance)``, where
    ``r_i = y_i - X_i beta`` and the mean runs over the leading axis. When
    ``reml`` is true, ``logdet(sum_i X_i^T W X_i)`` divided by the size of the
    leading axis of ``y`` is added.

    Args:
        X: rank-3 tensor of design matrices, indexed by the leading axis.
        y: rank-3 tensor of responses with the same leading axis as ``X``.
        covariance: square covariance matrix; it is Cholesky-factorised, so it
            must be positive definite.
        reml: if true, include the extra log-determinant term.

    Returns:
        A scalar tensor holding the loss.
    """
    # Size the identity from the covariance itself instead of hard-coding 4,
    # so the loss works for any number of measurements per subject.
    identity = tf.eye(int(covariance.shape[-1]), dtype=covariance.dtype)
    # Solving against the identity yields the inverse covariance.
    weights = tf.linalg.cholesky_solve(tf.linalg.cholesky(covariance), identity)
    beta = solve_beta(X, y, weights)
    residuals = y - tf.tensordot(X, beta, 1)
    weighted_squared_error = tf.matmul(
        tf.tensordot(tf.transpose(residuals, [0, 2, 1]), weights, 1), residuals)
    loss = tf.reduce_mean(weighted_squared_error) + tf.linalg.logdet(covariance)

    if reml:
        # Same matrix as the left-hand side of the normal equations in solve_beta.
        weighted_gram = tf.reduce_sum(
            tf.matmul(tf.tensordot(tf.transpose(X, [0, 2, 1]), weights, 1), X), 0)
        return loss + tf.linalg.logdet(weighted_gram) / tf.cast(tf.shape(y)[0], tf.float32)

    return loss

## Compound Symmetric

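This means each cluster has an exchangeable correlation structure: every pair of measurements within a subject shares the same correlation, while each measurement occasion has its own standard error.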

In [5]:
# Starting values: four log standard errors of 0 (standard error 1 each) and a
# log correlation of -1 (correlation exp(-1)).
log_standard_errors = tf.Variable([0.] * 4)
log_correlation = tf.Variable([-1.], dtype=tf.float32)

def make_compound_symmetric_covariance(log_standard_errors, log_correlation):
    """Build a covariance matrix with one shared correlation and per-entry scales.

    The correlation matrix has ones on the diagonal and
    ``rho = exp(log_correlation)`` everywhere else. It is then scaled so that
    entry ``(i, j)`` is multiplied by ``exp(log_standard_errors[i])`` and
    ``exp(log_standard_errors[j])``.

    Note that ``rho`` is positive by construction, but this parameterization
    does not bound it above by 1.

    Args:
        log_standard_errors: 1-D tensor or variable of log standard errors; its
            length sets the size of the returned matrix.
        log_correlation: log of the common correlation.

    Returns:
        A square float32 tensor whose side equals the number of standard errors.
    """
    rho = tf.exp(log_correlation)
    standard_errors = tf.exp(log_standard_errors)
    # Take the size from the parameter vector instead of hard-coding 4.
    size = int(standard_errors.shape[-1])
    correlation = (tf.ones((size, size), dtype=tf.float32)*rho
                   + tf.eye(size, dtype=tf.float32)*(1. - rho))
    return correlation*standard_errors*tf.expand_dims(standard_errors, -1)

# Display the covariance implied by the starting parameter values.
make_compound_symmetric_covariance(log_standard_errors, log_correlation).numpy()

array([[1.        , 0.36787945, 0.36787945, 0.36787945],
       [0.36787945, 1.        , 0.36787945, 0.36787945],
       [0.36787945, 0.36787945, 1.        , 0.36787945],
       [0.36787945, 0.36787945, 0.36787945, 1.        ]], dtype=float32)

### Maximum Likelihood

In [6]:
# Adagrad with learning rate 0.1, updating both parameter sets jointly.
optimizer = tf.train.AdagradOptimizer(0.1)
variables = [log_standard_errors, log_correlation]

for _ in range(2048):
    # Record only the forward pass on the tape; the gradient is taken after
    # the context closes, as shown in the tf.GradientTape documentation.
    with tf.GradientTape() as tape:
        loss = loss_fn(X, y, make_compound_symmetric_covariance(log_standard_errors, log_correlation))
    gradients = tape.gradient(loss, variables)
    optimizer.apply_gradients(zip(gradients, variables))

make_compound_symmetric_covariance(log_standard_errors, log_correlation).numpy()

array([[5.341891 , 2.8692682, 3.472892 , 3.0481002],
       [2.869268 , 3.9785721, 2.9971426, 2.6305428],
       [3.472892 , 2.9971426, 5.828644 , 3.1839447],
       [3.0481005, 2.630543 , 3.183945 , 4.4899707]], dtype=float32)

In [7]:
def estimate_compound_symmetric_S(log_standard_errors, log_correlation):
    """Summarise the fitted parameters as a single matrix.

    The returned NumPy array has ``exp(log_standard_errors)`` on the diagonal
    and ``exp(log_correlation)`` in every off-diagonal cell.

    Args:
        log_standard_errors: 1-D tensor or variable of log standard errors.
        log_correlation: single-element tensor or variable holding the log of
            the common correlation.

    Returns:
        A square NumPy array whose side equals the number of standard errors.
    """
    standard_errors = tf.exp(log_standard_errors).numpy()
    S = np.diag(standard_errors)
    # Pick off-diagonal cells by position rather than by `S == 0`, so a
    # standard error that is exactly zero is never overwritten.
    off_diagonal = ~np.eye(len(standard_errors), dtype=bool)
    S[off_diagonal] = tf.exp(log_correlation).numpy()
    return S

# Display fitted standard errors (diagonal) and the common correlation (off-diagonal).
estimate_compound_symmetric_S(log_standard_errors, log_correlation)

array([[2.311253 , 0.6223863, 0.6223863, 0.6223863],
       [0.6223863, 1.9946358, 0.6223863, 0.6223863],
       [0.6223863, 0.6223863, 2.4142585, 0.6223863],
       [0.6223863, 0.6223863, 0.6223863, 2.1189551]], dtype=float32)

### Restricted Maximum Likelihood

In [8]:
# Start again from the same initial values so this fit does not continue from
# the maximum likelihood estimates above.
log_standard_errors = tf.Variable([0., 0., 0., 0.])
log_correlation = tf.Variable([-1.], dtype=tf.float32)

optimizer = tf.train.AdagradOptimizer(0.1)
variables = [log_standard_errors, log_correlation]

for _ in range(2048):
    # Record only the forward pass on the tape; the gradient is taken after
    # the context closes, as shown in the tf.GradientTape documentation.
    with tf.GradientTape() as tape:
        loss = loss_fn(X, y, make_compound_symmetric_covariance(log_standard_errors, log_correlation), reml=True)
    gradients = tape.gradient(loss, variables)
    optimizer.apply_gradients(zip(gradients, variables))

estimate_compound_symmetric_S(log_standard_errors, log_correlation)

array([[2.3867726, 0.6352883, 0.6352883, 0.6352883],
       [0.6352883, 2.0582683, 0.6352883, 0.6352883],
       [0.6352883, 0.6352883, 2.4678137, 0.6352883],
       [0.6352883, 0.6352883, 0.6352883, 2.1967258]], dtype=float32)

## Symmetric

This is an unstructured covariance matrix: the only structural constraint enforced is symmetry. Because each entry is parameterized as an exponential, every entry is also positive.

In [9]:
# Ten free parameters laid out as the lower triangle of a 4x4 matrix, row by
# row: 0 on the diagonal (entry exp(0) = 1) and -1 elsewhere.
log_covariance = tf.Variable(
    [
        0.,
        -1., 0.,
        -1., -1., 0.,
        -1., -1., -1., 0.,
    ],
    dtype=tf.float32)

def make_symmetric_covariance(log_covariance):
    """Expand ten log-parameters into a symmetric 4x4 matrix.

    Each entry of the result is ``exp`` of one element of ``log_covariance``.
    Positions ``(i, j)`` and ``(j, i)`` read the same element, which is what
    makes the result symmetric. All entries are positive by construction.
    """
    parameter_index = [
        [0, 1, 3, 6],
        [1, 2, 4, 7],
        [3, 4, 5, 8],
        [6, 7, 8, 9],
    ]
    entries = tf.exp(log_covariance)
    return tf.gather(entries, parameter_index)

# Display the covariance implied by the starting parameter values.
make_symmetric_covariance(log_covariance).numpy()

array([[1.        , 0.36787945, 0.36787945, 0.36787945],
       [0.36787945, 1.        , 0.36787945, 0.36787945],
       [0.36787945, 0.36787945, 1.        , 0.36787945],
       [0.36787945, 0.36787945, 0.36787945, 1.        ]], dtype=float32)

### Maximum Likelihood

In [10]:
# Adagrad with learning rate 0.1 over the ten log-covariance parameters.
optimizer = tf.train.AdagradOptimizer(0.1)
variables = [log_covariance]

for _ in range(2048):
    # Record only the forward pass on the tape; the gradient is taken after
    # the context closes, as shown in the tf.GradientTape documentation.
    with tf.GradientTape() as tape:
        loss = loss_fn(X, y, make_symmetric_covariance(log_covariance))
    gradients = tape.gradient(loss, variables)
    optimizer.apply_gradients(zip(gradients, variables))

make_symmetric_covariance(log_covariance).numpy()

array([[5.113675 , 2.4365215, 3.6041563, 2.5170095],
       [2.4365215, 3.9241848, 2.7122397, 3.0578787],
       [3.6041563, 2.7122397, 5.9719524, 3.8170369],
       [2.5170095, 3.0578787, 3.8170369, 4.6125784]], dtype=float32)

In [11]:
def estimate_symmetric_S(log_covariance):
    """Summarise the fitted covariance as standard errors and correlations.

    The covariance from ``make_symmetric_covariance`` is divided by the square
    roots of its diagonal along both rows and columns, giving a correlation
    matrix. The diagonal is then shifted so that it holds those square roots
    instead of ones.
    """
    covariance = make_symmetric_covariance(log_covariance).numpy()
    scale = np.sqrt(np.diag(covariance))
    # Divide column j by scale[j], then row i by scale[i].
    S = np.divide(np.divide(covariance, scale), scale[:, np.newaxis])
    # Adding (scale - 1) on the diagonal turns the unit diagonal into `scale`.
    diagonal_shift = np.diag(scale) - np.eye(len(scale))
    np.add(S, diagonal_shift, out=S)
    return S

# Display fitted standard errors (diagonal) and pairwise correlations (off-diagonal).
estimate_symmetric_S(log_covariance)

array([[2.2613435 , 0.54391235, 0.6521971 , 0.51825845],
       [0.5439124 , 1.9809556 , 0.5602672 , 0.718743  ],
       [0.6521971 , 0.5602672 , 2.443758  , 0.727271  ],
       [0.51825845, 0.7187431 , 0.7272711 , 2.1476912 ]], dtype=float32)

### Restricted Maximum Likelihood

In [12]:
# Start again from the same initial values so this fit does not continue from
# the maximum likelihood estimates above.
log_covariance = tf.Variable([0., -1., 0., -1., -1., 0., -1., -1., -1., 0.], dtype=tf.float32)

optimizer = tf.train.AdagradOptimizer(0.1)
variables = [log_covariance]

for _ in range(2048):
    # Record only the forward pass on the tape; the gradient is taken after
    # the context closes, as shown in the tf.GradientTape documentation.
    with tf.GradientTape() as tape:
        loss = loss_fn(X, y, make_symmetric_covariance(log_covariance), reml=True)
    gradients = tape.gradient(loss, variables)
    optimizer.apply_gradients(zip(gradients, variables))

estimate_symmetric_S(log_covariance)

array([[2.3269567, 0.5674492, 0.6582997, 0.5211466],
       [0.5674493, 2.0452266, 0.5798044, 0.7243952],
       [0.6582997, 0.5798044, 2.4997234, 0.7390662],
       [0.5211466, 0.7243952, 0.7390661, 2.2305765]], dtype=float32)