In [1]:
import matplotlib.pyplot as plt
import numpy as np
import tensorflow as tf

In [2]:
class PCA(object):
    """Class that performs PCA projection and reconstruction.

    Typical usage: call `calculate_eigenvalues_and_eigenvectors` once to
    compute the column means, the covariance matrix and its
    eigendecomposition, then call `pca_projection_to_top_k_pc` and/or
    `pca_reconstruction_from_top_k_pc`.

    Attributes:
        num_rows: int, number of data instances.
        num_cols: int, number of dimensions for each data instance.
        use_sample_covariance: bool, whether using sample or population
            covariance.
        top_k_pc: int, number of top principal components to keep.
        col_mean_vector: tensor, rank 1 tensor of shape (num_cols,) containing
            column means.
        covariance_matrix: tensor, rank 2 tensor of shape (num_cols, num_cols)
            containing covariance matrix.
        eigenvalues: tensor, rank 1 tensor of shape (num_cols,) containing
            the eigenvalues of the covariance matrix. Only exists after
            `calculate_eigenvalues_and_eigenvectors` has been called.
        eigenvectors: tensor, rank 2 tensor of shape (num_cols, num_cols)
            whose columns are the eigenvectors of the covariance matrix.
            Only exists after `calculate_eigenvalues_and_eigenvectors` has
            been called.
        top_k_eigenvectors: tensor, rank 2 tensor of shape
            (num_cols, top_k_pc) containing the eigenvectors associated with
            the top_k_pc eigenvalues.
    """
    def __init__(self, num_rows, num_cols, use_sample_covariance, top_k_pc):
        """Initializes `PCA` class instance.

        Args:
            num_rows: int, number of data instances.
            num_cols: int, number of dimensions for each data instance.
            use_sample_covariance: bool, whether using sample or population
                covariance.
            top_k_pc: int, number of top principal components to keep.
        """
        self.num_rows = num_rows
        self.num_cols = num_cols
        self.use_sample_covariance = use_sample_covariance
        self.top_k_pc = top_k_pc

        # Zero placeholders; they are replaced (with tensors of the input
        # data's dtype) once statistics are calculated.
        self.col_mean_vector = tf.zeros(
            shape=(self.num_cols,), dtype=tf.float32
        )
        self.covariance_matrix = tf.zeros(
            shape=(self.num_cols, self.num_cols), dtype=tf.float32
        )

        self.top_k_eigenvectors = tf.zeros(
            shape=(self.num_cols, self.top_k_pc), dtype=tf.float32
        )

    def calculate_data_stats(self, data):
        """Calculates statistics of data.

        Args:
            data: tensor, rank 2 tensor of shape (num_rows, num_cols)
                containing input data.

        Returns:
            Column mean rank 1 tensor and covariance matrix rank 2 tensor.
            Both have the same dtype as the centered data.
        """
        # shape = (num_cols,)
        col_mean_vector = tf.reduce_mean(input_tensor=data, axis=0)

        # shape = (num_rows, num_cols)
        centered_data = data - col_mean_vector

        # shape = (num_cols, num_cols)
        unscaled_covariance = tf.matmul(
            a=centered_data,
            b=centered_data,
            transpose_a=True
        )

        # shape = ()
        # Sample covariance divides by (num_rows - 1), population by num_rows.
        scale = (
            self.num_rows - 1 if self.use_sample_covariance else self.num_rows
        )

        # shape = (num_cols, num_cols)
        # Cast the scale to the dtype of the tensor being divided instead of
        # a hard-coded float64, so that e.g. float32 data does not hit a
        # dtype mismatch in tf.divide.
        covariance_matrix = tf.divide(
            x=unscaled_covariance,
            y=tf.cast(x=scale, dtype=unscaled_covariance.dtype)
        )

        return col_mean_vector, covariance_matrix

    def calculate_eigenvalues_and_eigenvectors(self, data):
        """Calculates eigenvalues and eigenvectors of data.

        Stores the results on the instance (`col_mean_vector`,
        `covariance_matrix`, `eigenvalues`, `eigenvectors`) and returns
        nothing.

        Args:
            data: tensor, rank 2 tensor of shape (num_rows, num_cols)
                containing input data.
        """
        # shape = (num_cols,) & (num_cols, num_cols)
        (self.col_mean_vector,
         self.covariance_matrix) = self.calculate_data_stats(data=data)

        # shape = (num_cols,) & (num_cols, num_cols)
        self.eigenvalues, self.eigenvectors = tf.linalg.eigh(
            tensor=self.covariance_matrix
        )

    def pca_projection_to_top_k_pc(self, data):
        """Projects data down to top_k principal components.

        Requires `calculate_eigenvalues_and_eigenvectors` to have been called
        first. Also updates `self.top_k_eigenvectors`.

        Args:
            data: tensor, rank 2 tensor of shape (num_rows, num_cols)
                containing input data.

        Returns:
            Rank 2 tensor of shape (num_rows, top_k_pc) containing
                projected centered data.
        """
        # The top components are taken from the trailing columns of
        # `eigenvectors`. An explicit start index is used because the
        # negative-slice form `[:, -k:]` would select every column when
        # k == 0; the start is clamped at 0 so that k > num_cols still
        # selects all columns.
        num_components = int(self.eigenvectors.shape[-1])
        first_kept_col = max(num_components - self.top_k_pc, 0)

        # shape = (num_cols, top_k_pc)
        self.top_k_eigenvectors = self.eigenvectors[:, first_kept_col:]

        # shape = (num_rows, num_cols)
        centered_data = data - self.col_mean_vector

        # shape = (num_rows, top_k_pc)
        projected_centered_data = tf.matmul(
            a=centered_data,
            b=self.top_k_eigenvectors
        )

        return projected_centered_data

    def pca_reconstruction_from_top_k_pc(self, data):
        """Projects data to top_k principal components and reconstructs it.

        The projection is performed inside this method, so `data` must be
        the original (unprojected) input, not already projected data.

        Args:
            data: tensor, rank 2 tensor of shape (num_rows, num_cols)
                containing input data.

        Returns:
            Rank 2 tensor of shape (num_rows, num_cols) containing
                lossy, reconstructed input data.
        """
        # shape = (num_rows, top_k_pc)
        projected_centered_data = self.pca_projection_to_top_k_pc(data=data)

        # shape = (num_rows, num_cols)
        unprojected_centered_data = tf.matmul(
            a=projected_centered_data,
            b=self.top_k_eigenvectors,
            transpose_b=True
        )

        # shape = (num_rows, num_cols)
        # Add the column means back to undo the centering.
        data_reconstructed = unprojected_centered_data + self.col_mean_vector

        return data_reconstructed


In [3]:
# Toy dataset: 10 observations (rows) x 3 features (columns), float64.
data = np.asarray(
    [
        (7.0, 4.0, 3.0),
        (4.0, 1.0, 8.0),
        (6.0, 3.0, 5.0),
        (8.0, 6.0, 1.0),
        (8.0, 5.0, 7.0),
        (7.0, 2.0, 9.0),
        (5.0, 3.0, 3.0),
        (9.0, 5.0, 8.0),
        (7.0, 4.0, 5.0),
        (8.0, 2.0, 2.0),
    ],
    dtype=np.float64,
)

In [4]:
# Report the dataset dimensions (rows first, then columns).
print("num_rows = {}, num_cols = {}".format(*data.shape))

num_rows = 10, num_cols = 3


## Instantiate class.

In [5]:
# Take the dimensions from `data` instead of repeating them as literals, so
# this cell stays correct if the dataset is edited. Keep the top 2 of the 3
# principal components and use the sample (n - 1) covariance.
pca = PCA(
    num_rows=data.shape[0],
    num_cols=data.shape[1],
    use_sample_covariance=True,
    top_k_pc=2,
)

## Calculate data statistics.

In [6]:
# Computes the column means, covariance matrix and its eigendecomposition,
# storing them on `pca` (col_mean_vector, covariance_matrix, eigenvalues,
# eigenvectors). Returns nothing; the following cells inspect the results.
pca.calculate_eigenvalues_and_eigenvectors(data=data)

In [7]:
# Per-column means of `data`; these are subtracted before projecting.
pca.col_mean_vector

<tf.Tensor: shape=(3,), dtype=float64, numpy=array([6.9, 3.5, 5.1])>

In [8]:
# Covariance matrix of the centered data (divided by num_rows - 1 here,
# because the instance was created with use_sample_covariance=True).
pca.covariance_matrix

<tf.Tensor: shape=(3, 3), dtype=float64, numpy=
array([[ 2.32222222,  1.61111111, -0.43333333],
       [ 1.61111111,  2.5       , -1.27777778],
       [-0.43333333, -1.27777778,  7.87777778]])>

In [9]:
# Eigenvalues of the covariance matrix from tf.linalg.eigh; the output below
# lists them smallest first.
pca.eigenvalues

<tf.Tensor: shape=(3,), dtype=float64, numpy=array([0.74992815, 3.67612927, 8.27394258])>

In [10]:
# Eigenvectors of the covariance matrix, one per column; the projection step
# keeps the trailing `top_k_pc` columns.
pca.eigenvectors

<tf.Tensor: shape=(3, 3), dtype=float64, numpy=
array([[ 0.70172743,  0.69903712, -0.1375708 ],
       [-0.70745703,  0.66088917, -0.25045969],
       [-0.08416157,  0.27307986,  0.95830278]])>

## Projection

In [11]:
# Centers `data` with the stored column means and projects it onto the kept
# eigenvectors. As a side effect this sets `pca.top_k_eigenvectors`.
pca.pca_projection_to_top_k_pc(data=data)

<tf.Tensor: shape=(10, 2), dtype=float64, numpy=
array([[-0.17311941, -2.15142276],
       [-2.88749898,  3.80418259],
       [-0.98688598,  0.15321328],
       [ 1.30153634, -4.7065185 ],
       [ 2.27912632,  1.29375788],
       [ 0.1435814 ,  4.0993133 ],
       [-2.23208282, -1.62582148],
       [ 3.2512433 ,  2.11448986],
       [ 0.37304031, -0.2348172 ],
       [-1.06894049, -2.74637697]])>

In [12]:
# Set by the projection call above: the last `top_k_pc` columns of
# `pca.eigenvectors`.
pca.top_k_eigenvectors

<tf.Tensor: shape=(3, 2), dtype=float64, numpy=
array([[ 0.69903712, -0.1375708 ],
       [ 0.66088917, -0.25045969],
       [ 0.27307986,  0.95830278]])>

## Reconstruction.

In [13]:
# Pass the original (unprojected) data: the method performs the projection
# itself before mapping back to the input space and re-adding the column means.
recon = pca.pca_reconstruction_from_top_k_pc(data=data)
recon

<tf.Tensor: shape=(10, 3), dtype=float64, numpy=
array([[7.07495606, 3.92443193, 2.99101016],
       [4.35818659, 0.63888882, 7.95704095],
       [6.18905239, 2.80940399, 4.97732603],
       [8.45730172, 5.53896441, 0.94515359],
       [8.31521059, 4.68221571, 6.96219527],
       [6.43642293, 2.56817868, 9.06759253],
       [5.56335682, 2.43204338, 2.93243389],
       [8.88184769, 5.11911702, 8.01417058],
       [7.19307301, 3.80535054, 4.97684382],
       [6.53059219, 3.48140552, 2.17623319]])>

## Reconstruction error (sum of squared errors / (num_rows - 1))

In [14]:
# Sum of squared reconstruction errors, rescaled from 1 / n to 1 / (n - 1) to
# match the sample covariance used above. The row count is read from `data`
# instead of being hard-coded. The result equals the smallest value in
# `pca.eigenvalues`, i.e. the eigenvalue of the one discarded component.
n_obs = data.shape[0]
(np.sum((data - recon) ** 2) / n_obs) * (n_obs / (n_obs - 1))

0.7499281527948033