## Perform PCA by using eigendecomposition

In [None]:
import numpy as np
import pandas as pd

import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.preprocessing import StandardScaler

import random

# Fix both random number generators so that reruns give the same results
SEED = 10
random.seed(SEED)
np.random.seed(SEED)

# Apply seaborn's default theme to the plots drawn below
sns.set_theme()

### 2D dataset example

In [None]:
# Load the 2D example dataset and preview its first rows
dataset_path = 'data/pca_chapter_synthetical/pca_2d_dataset.csv'
artificial_df = pd.read_csv(dataset_path)
artificial_df.head()

In [None]:
# Scatter plot of the raw observations together with the coordinate axes

x_range = [-5, 30]
y_range = [-5, 50]

sns.scatterplot(data=artificial_df, x='Feature 1', y='Feature 2')

# Black lines through the origin mark the x and y axes
plt.hlines(y=0, xmin=x_range[0], xmax=x_range[1], color='black')
plt.vlines(x=0, ymin=y_range[0], ymax=y_range[1], color='black')

plt.xlim(x_range)
plt.ylim(y_range)

plt.title('Example dataset')


In [None]:
# Standardise the features in place, then redraw the scatter plot

scaled_values = StandardScaler().fit_transform(artificial_df)
artificial_df[:] = scaled_values

sns.scatterplot(data=artificial_df, x='Feature 1', y='Feature 2')

# Coordinate axes through the origin
plt.hlines(y=0, xmin=-5, xmax=30, color='black')
plt.vlines(x=0, ymin=-5, ymax=50, color='black')

# Same visible window on both axes
view_range = [-2, 2]
plt.xlim(view_range)
plt.ylim(view_range)

# Equal aspect ratio keeps directions undistorted on screen
plt.gca().set_aspect('equal', adjustable='box')
plt.title('Scaled dataset')

In [None]:
# Covariance matrix of the features, converted to a plain NumPy array

feature_cov = artificial_df.cov()
cov_mtx = feature_cov.to_numpy()

cov_mtx

In [None]:
# Calculate eigenvalues and eigenvectors of the covariance matrix.
# A covariance matrix is symmetric, so `eigh` is the appropriate solver: it
# always returns real eigenvalues and orthonormal eigenvectors (as columns).

eigenvalues, eigenvectors = np.linalg.eigh(cov_mtx)

# `eigh` returns the eigenvalues in ascending order. Reorder to descending so
# that column 0 of `eigenvectors` is PC1 (largest variance) and column 1 is
# PC2, which is how the following cells label them.
order = np.argsort(eigenvalues)[::-1]
eigenvalues = eigenvalues[order]
eigenvectors = eigenvectors[:, order]

In [None]:
# Display the eigenvalues of the covariance matrix
eigenvalues

In [None]:
# Display the eigenvectors; column i belongs to eigenvalues[i]
eigenvectors

In [None]:
# Draw the principal directions on top of the data, in the original axes

plt.figure()

sns.scatterplot(data=artificial_df, x='Feature 1', y='Feature 2')

# Coordinate axes through the origin
plt.hlines(y=0, xmin=-5, xmax=30, color='black')
plt.vlines(x=0, ymin=-5, ymax=50, color='black')

plt.xlim([-2, 2])
plt.ylim([-2, 2])

# One arrow per eigenvector (stored as columns), starting at the origin
arrow_specs = [
    (0, 'r', "PC1 vector"),
    (1, 'b', "PC2 vector"),
]
for col, arrow_color, arrow_label in arrow_specs:
    plt.quiver(
        0, 0,
        eigenvectors[0, col],
        eigenvectors[1, col],
        angles='xy', scale_units='xy',
        scale=1, color=arrow_color,
        label=arrow_label,
    )

plt.gca().set_aspect('equal', adjustable='box')
plt.legend()
plt.title('PC1 and PC2 in original coordinate system')

In [None]:
# Perform change of basis: express every sample in the eigenvector basis.
# Eigenvectors are the columns of `eigenvectors`, so its transpose maps
# original coordinates to principal-component coordinates.

transformed_data = eigenvectors.T @ artificial_df.to_numpy().T

transformed_df = pd.DataFrame(transformed_data.T, columns=['PC1', 'PC2'])

# Change basis for the eigenvectors themselves. Column i holds eigenvector i
# expressed in the new basis; for orthonormal eigenvectors (as expected for a
# symmetric covariance matrix) this is the identity matrix, i.e. the PCs
# coincide with the new coordinate axes.

transformed_axis = eigenvectors.T @ eigenvectors

plt.figure()

sns.scatterplot(transformed_df,x='PC1',y='PC2')

plt.hlines(xmin=-5, xmax=30, y=0, color='black')
plt.vlines(ymin=-5, ymax=50, x=0, color='black')

# Read each transformed eigenvector from its column, matching the column
# convention used for `eigenvectors`
plt.quiver(
    0, 0,
    transformed_axis[0, 0],
    transformed_axis[1, 0],
    angles='xy', scale_units='xy',
    scale=1, color='r',
    label="PC1 vector"
)
plt.quiver(
    0, 0,
    transformed_axis[0, 1],
    transformed_axis[1, 1],
    angles='xy', scale_units='xy',
    scale=1, color='b',
    label="PC2 vector"
)

plt.xlim([-3, 3])
plt.ylim([-3, 3])

plt.gca().set_aspect('equal', adjustable='box')
# Show the arrow labels; without a legend the `label=` arguments have no effect
plt.legend()
# This figure shows the data after the change of basis, not the original axes
plt.title('PC1 and PC2 in the principal component coordinate system')

### 3D case

In [None]:
def plot_data_and_axis(data, eigenvectors=None, axis_min=None,
                       axis_max=None, figsize=None):
    """
    Plot a dataset in a 3D coordinate system together with a set of arrows.

    The arrows start at the origin and are either the coordinate system
    axes (default) or an arbitrary set of vectors passed by the caller.

    :param data: numpy array with 3 features; columns 0, 1 and 2 are
        plotted as x, y and z
    :param eigenvectors: optional vectors to draw, one 3-component vector
        per row (e.g. the transposed eigenvector matrix). When None, the
        unit vectors of the x, y and z axes are drawn instead
    :param axis_min: min value for each axis on the plot
    :param axis_max: max value for each axis on the plot. The limits are
        applied only when both axis_min and axis_max are given
    :param figsize: figure size passed on to plt.figure
    :return: None, the figure is displayed with plt.show()
    """

    # Create 3D figure
    fig = plt.figure(figsize=figsize)
    ax = fig.add_subplot(111, projection='3d')

    # Plot the data
    ax.scatter(data[:, 0], data[:, 1], data[:, 2], alpha=0.2)

    # Set axis labels
    ax.set_xlabel('X axis')
    ax.set_ylabel('Y axis')
    ax.set_zlabel('Z axis')

    # Set minimum and maximum value for axis.
    # Compare against None explicitly so that 0 is accepted as a limit.
    if axis_min is not None and axis_max is not None:
        ax.set_xlim(axis_min, axis_max)
        ax.set_ylim(axis_min, axis_max)
        ax.set_zlim(axis_min, axis_max)

    ax.view_init(elev=20, azim=-35, roll=0)

    # Choose which axis will be plotted (arrow tips)
    if eigenvectors is not None:
        plot_vectors = eigenvectors
    else:
        plot_vectors = [
            [1, 0, 0],
            [0, 1, 0],
            [0, 0, 1]
        ]

    # Always set origin at [0,0,0] (arrow starting point)
    origin = [0, 0, 0]

    # Arrow colours; reused cyclically if more than three vectors are given
    colors = ['r', 'g', 'b']

    # Iterate over coordinate system arrows
    for i, component in enumerate(plot_vectors):

        # Plot arrow
        ax.quiver(
            origin[0],
            origin[1],
            origin[2],
            component[0],
            component[1],
            component[2],
            color=colors[i % len(colors)],
            lw=2
        )

    plt.show()

In [None]:
# Draw correlated, normally distributed (x, y) samples
n_samples = 500
mean = np.array([0, 0])
cov = np.array([[1, 0.5], [0.5, 1]])
x_y_data = np.random.multivariate_normal(mean, cov, size=n_samples)

# Feature z is an exact multiple of y
z_col = x_y_data[:, 1] * 1.5

# Append z as a third column next to x and y
np_data_3D = np.column_stack((x_y_data, z_col))

# Wrap the array in a DataFrame in order to display it
column_names = ['x', 'y', 'z']
df_3D = pd.DataFrame(data=np_data_3D, columns=column_names)
df_3D.head()

In [None]:
# Plot the pairwise relationships between the columns of df_3D
sns.pairplot(df_3D)

In [None]:
# Show the 3D data together with the x, y and z axes of the original basis

axis_limit = 3

plot_data_and_axis(
    data=np_data_3D,
    axis_min=-axis_limit,
    axis_max=axis_limit,
    figsize=(8, 8),
)

In [None]:
# Covariance matrix of the x, y and z columns, as a plain NumPy array

feature_cov = df_3D.cov()
cov_mtx = feature_cov.to_numpy()

cov_mtx

In [None]:
# Calculate eigenvalues and eigenvectors of covariance matrix.
# The covariance matrix is symmetric, so use `eigh`: unlike the general `eig`
# solver it guarantees real eigenvalues and orthonormal eigenvectors (stored
# as columns). `eigh` returns the eigenvalues in ascending order; the cells
# below sort them explicitly into descending order.

eigenvalues, eigenvectors = np.linalg.eigh(cov_mtx)

In [None]:
# Display the eigenvalues in the order returned by the solver
eigenvalues

In [None]:
# Share of the total variance carried by each eigenvalue, in percent.
# suppress=True prints very small floats in fixed-point notation instead of
# scientific notation.
np.set_printoptions(suppress=True)
total_variance = eigenvalues.sum()
eigenvalues / total_variance * 100

In [None]:
# Display the eigenvectors; column i belongs to eigenvalues[i]
eigenvectors

In [None]:
# Indices that order the eigenvalues from largest to smallest
ascending_idx = np.argsort(eigenvalues)
sort_idx = ascending_idx[::-1]
sort_idx

In [None]:
# Reorder the eigenvalues with sort_idx (largest first) and display them
eigenvalues = eigenvalues[sort_idx]
eigenvalues

In [None]:
# Reorder the eigenvector columns with the same index so that column i
# still belongs to eigenvalues[i]
eigenvectors = eigenvectors[:, sort_idx]
eigenvectors

In [None]:
# Transposed view: each row is now one eigenvector
eigenvectors.T

In [None]:
# Overlay the principal components on the 3D data.
# plot_data_and_axis draws one arrow per row, so the eigenvector matrix
# (eigenvectors as columns) is passed transposed.

principal_axes = eigenvectors.T
axis_limit = 3

plot_data_and_axis(
    data=np_data_3D,
    eigenvectors=principal_axes,
    axis_min=-axis_limit,
    axis_max=axis_limit,
    figsize=(8, 8),
)

In [None]:
# Transform data to 2D

# Keep the two leading principal directions, one direction per row.
# The eigenvector columns were sorted by descending eigenvalue in an earlier
# cell, so this drops the direction with the smallest variance.
n_components = 2
transformation_matrix = eigenvectors[:, :n_components].T

# PCA projects mean-centred data: the covariance matrix was computed around
# the feature means, so subtract them before projecting.
data_3D = df_3D.to_numpy()
centered_data = data_3D - data_3D.mean(axis=0)

# Result has one row per principal component and one column per sample
transformed_data = transformation_matrix @ centered_data.T

In [None]:
# Wrap the projected samples (one row per sample) in a DataFrame
pc_columns = ['PC1', 'PC2']
transformed_df = pd.DataFrame(data=transformed_data.T, columns=pc_columns)

In [None]:
# Plot reduced data

plt.figure(figsize=(7,7))

sns.scatterplot(transformed_df,x='PC1',y='PC2', alpha=0.5)

# In PC coordinates the principal directions are the unit axis vectors
plt.quiver(0, 0, 1, 0, angles='xy', scale_units='xy', scale=1, color='r', label="PC1 vector")
plt.quiver(0, 0, 0, 1, angles='xy', scale_units='xy', scale=1, color='b', label="PC2 vector")


plt.xlim([-6, 6])
plt.ylim([-4, 4])

# Show the arrow labels and name the figure, like the other plots in this
# file; without a legend the `label=` arguments above have no effect
plt.legend()
plt.title('Data projected onto PC1 and PC2')

plt.gca().set_aspect('equal', adjustable='box')
