## Selecting number of PCs

In [None]:
# pandas and numpy imports
import pandas as pd
import numpy as np

# plotting imports
import matplotlib.pyplot as plt
import seaborn as sns

# import PCA
from sklearn.datasets import load_digits
from sklearn.datasets import load_wine
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler

# import knee detection algorithm
from kneed import KneeLocator

# Apply seaborn's default theme and make 10x6 the default figure size
sns.set_theme()
plt.rc('figure', figsize=(10, 6))

## Load the dataset

In [None]:
# Fetch the wine dataset bundled with scikit-learn
wine_data = load_wine()

# Feature-only frame, plus a separate copy that also carries the class label
wine_df_numeric = pd.DataFrame(data=wine_data.data, columns=wine_data.feature_names)
wine_df = wine_df_numeric.assign(label=wine_data.target)

# Standardize every feature (fit the scaler and apply it in one step)
standard_scaler = StandardScaler()
wine_df_numeric_scaled = standard_scaler.fit_transform(wine_df_numeric)

# Wrap the scaled array back into a DataFrame, keeping the feature names
wine_df_numeric = pd.DataFrame(wine_df_numeric_scaled, columns=wine_df_numeric.columns)

# One box per scaled feature; rotate the labels so the long names stay readable
g = sns.boxplot(data=wine_df_numeric)
_ = g.set_xticklabels(g.get_xticklabels(), rotation=90)

## n_components parameter

In [None]:
# Full PCA: no n_components is given, so no component is discarded
pca = PCA().fit(wine_df_numeric)
pca_transformed_data = pca.transform(wine_df_numeric)

print(f'Number of PCA components {pca_transformed_data.shape[1]}')
print('\nVariances explained by components:')

# Explained variance of each component in percent (displayed as the cell output)
list(pca.explained_variance_ratio_ * 100)

In [None]:
# PCA restricted to the first two principal components
pca = PCA(n_components=2).fit(wine_df_numeric)
pca_transformed_data = pca.transform(wine_df_numeric)

print(f'Number of PCA components {pca_transformed_data.shape[1]}')
print('\nVariances explained by components:')

# Explained variance of each kept component in percent (displayed as the cell output)
list(pca.explained_variance_ratio_ * 100)

In [None]:
# A fractional n_components lets PCA pick the number of components from the
# desired share of explained variance (here 0.7, i.e. 70 %)
pca = PCA(n_components=0.7).fit(wine_df_numeric)
pca_transformed_data = pca.transform(wine_df_numeric)

print(f'Number of PCA components {pca_transformed_data.shape[1]}')

print('\nVariances explained by components:')
print(list(pca.explained_variance_ratio_ * 100))

# Total share of variance (in percent) captured by the components that were kept
explained_sum = sum(pca.explained_variance_ratio_ * 100)
print(f'\nTotal variance explained by returned components : {explained_sum}')


## Selecting the right number of principal components - cumulative variance

In [None]:
# Fit PCA with every component kept so the whole variance spectrum is available
pca = PCA()
pca.fit(wine_df_numeric)
pca_transformed_data = pca.transform(wine_df_numeric)

print('Number of PCA components {}'.format(pca_transformed_data.shape[1]))

# Running total of explained variance, in percent
cumulative_explained_variance = np.cumsum(pca.explained_variance_ratio_)*100

# Entry i of the cumulative curve covers the first i + 1 components, so the
# x axis counts components from 1 (not 0) to agree with its label.
component_counts = np.arange(1, len(cumulative_explained_variance) + 1)

plt.plot(component_counts, cumulative_explained_variance)
plt.scatter(component_counts, cumulative_explained_variance)
plt.xlabel('Number of Components')
plt.ylabel('Cumulative Explained Variance')
plt.show()

## Selecting the right number of principal components - elbow method



In [None]:
from sklearn.datasets import fetch_openml

# Fetch the 'mnist_784' dataset from OpenML
mnist = fetch_openml('mnist_784', parser='auto')

# Pixel values as a plain NumPy array; labels are kept as returned by OpenML
mnist_data = mnist.data.to_numpy()
mnist_label = mnist.target

In [None]:
# Show the dimensions of the MNIST data array (rows are reshaped to 28x28
# images below, so each row is expected to hold 784 values)
mnist_data.shape

In [None]:
# View the first sample (one flat row) as a 28x28 pixel grid
image = mnist_data[0].reshape(28, 28)

# Render it in grayscale, titled with its label
plt.imshow(image, cmap='gray')
plt.title(f"Label: {mnist_label[0]}")
plt.show()

In [None]:
# Custom normalization: rescale the pixel values by dividing them by 255
# (note: re-running this cell divides again)
mnist_data = np.divide(mnist_data, 255)

In [None]:
# Perform PCA with every component kept to obtain the full variance spectrum
pca = PCA()
pca.fit(mnist_data)
pca_transformed_data = pca.transform(mnist_data)

print('Number of PCA components {}'.format(pca_transformed_data.shape[1]))

# Running total of explained variance, in percent
cumulative_explained_variance = np.cumsum(pca.explained_variance_ratio_)*100

# Entry i of the cumulative curve covers the first i + 1 components. Feeding
# 1-based component counts (instead of 0-based indices) to KneeLocator makes
# `kneedle.elbow` a number of components, which is what PCA(n_components=...)
# expects. The detected knee point itself is unaffected by the shift.
component_counts = np.arange(1, len(cumulative_explained_variance) + 1)

kneedle = KneeLocator(
    component_counts,
    cumulative_explained_variance,
    curve="concave",
    direction="increasing"
)
kneedle.plot_knee()

In [None]:
# x value at which KneeLocator detected the knee of the cumulative variance curve
kneedle.elbow

In [None]:
# If we do not perform the inverse transform:

# pca.components_ = pca.components_[:kneedle.elbow, :]

In [None]:
# Refit PCA, passing the detected knee value as the number of components to keep
pca = PCA(n_components=kneedle.elbow).fit(mnist_data)
pca_transformed_data = pca.transform(mnist_data)

In [None]:
# Dimensions of the reduced data: one row per sample, one column per kept component
pca_transformed_data.shape

In [None]:
# Map the reduced representation back into the original pixel space
reverse_transformed_data = pca.inverse_transform(pca_transformed_data)

# Undo the earlier division by 255
unscaled_data = np.multiply(reverse_transformed_data, 255)

In [None]:
# First reconstructed sample viewed as a 28x28 pixel grid
image = unscaled_data[0].reshape(28, 28)

# Render it in grayscale, titled with the label of the first sample
plt.imshow(image, cmap='gray')
plt.title(f"Label: {mnist_label[0]}")
plt.show()

## Denoise with PCA

In [None]:
# Fetch 'mnist_784' again so this section starts from the raw pixel values
mnist = fetch_openml('mnist_784', parser='auto')

# Pixel values as a plain NumPy array; labels are kept as returned by OpenML
mnist_data = mnist.data.to_numpy()
mnist_label = mnist.target

In [None]:
# Rescale the pixel values by dividing them by 255
# (note: re-running this cell divides again)
mnist_data = np.divide(mnist_data, 255)

In [None]:
# First clean sample viewed as a 28x28 pixel grid
image = mnist_data[0].reshape(28, 28)

# Render it in grayscale, titled with its label
plt.imshow(image, cmap='gray')
plt.title(f"Label: {mnist_label[0]}")
plt.show()

In [None]:
# Seeded generator so the noisy images, the detected knee and the figures
# that follow are reproducible from run to run
rng = np.random.default_rng(42)

# Additive Gaussian noise (mean 0, standard deviation 0.2), one value per pixel
noise = rng.normal(0, 0.2, mnist_data.shape)
mnist_data_noisy = mnist_data + noise

In [None]:
# First noisy sample viewed as a 28x28 pixel grid
image = mnist_data_noisy[0].reshape(28, 28)

# Render it in grayscale, titled with the label of the first sample
plt.imshow(image, cmap='gray')
plt.title(f"Label: {mnist_label[0]}")
plt.show()

In [None]:
# Denoise with PCA: start from a full PCA of the noisy data to get the
# complete variance spectrum
pca = PCA()
pca.fit(mnist_data_noisy)
pca_transformed_data = pca.transform(mnist_data_noisy)
print('Number of PCA components {}'.format(pca_transformed_data.shape[1]))

# Running total of explained variance, in percent
cumulative_explained_variance = np.cumsum(pca.explained_variance_ratio_)*100

# Entry i of the cumulative curve covers the first i + 1 components. Feeding
# 1-based component counts (instead of 0-based indices) to KneeLocator makes
# `kneedle.elbow` a number of components, which is what PCA(n_components=...)
# expects. The detected knee point itself is unaffected by the shift.
component_counts = np.arange(1, len(cumulative_explained_variance) + 1)

kneedle = KneeLocator(
    component_counts,
    cumulative_explained_variance,
    S=1.0,
    curve="concave",
    direction="increasing"
)
kneedle.plot_knee()

In [None]:
# x value at which KneeLocator detected the knee for the noisy data
kneedle.elbow

In [None]:
# Refit PCA on the noisy data, passing the detected knee value as the number
# of components to keep
pca = PCA(n_components=kneedle.elbow).fit(mnist_data_noisy)
pca_transformed_data = pca.transform(mnist_data_noisy)

In [None]:
# Dimensions of the reduced noisy data: one row per sample, one column per kept component
pca_transformed_data.shape

In [None]:
# Rebuild the images from the kept components only
reverse_transformed_data = pca.inverse_transform(pca_transformed_data)

# Undo the earlier division by 255
unscaled_data = np.multiply(reverse_transformed_data, 255)

In [None]:
# First reconstructed sample viewed as a 28x28 pixel grid
image = unscaled_data[0].reshape(28, 28)

# Render it in grayscale, titled with the label of the first sample
plt.imshow(image, cmap='gray')
plt.title(f"Label: {mnist_label[0]}")
plt.show()

### Display digits dataset with PCA

In [None]:
# Fit a full PCA on the clean (noise-free) digits
pca = PCA()
pca.fit(mnist_data)

# Re-project the clean digits with this PCA. Without this step,
# pca_transformed_data would still hold the projection of the noisy data from
# the denoising section, which does not match the PCA fitted here.
pca_transformed_data = pca.transform(mnist_data)

# Percentage of variance explained by the first two components
pca.explained_variance_ratio_[:2]*100

In [None]:
# Scatter of the first two principal-component scores, coloured by digit label
plt.figure(figsize=(8,8))
sns.scatterplot(x=pca_transformed_data[:,0], y=pca_transformed_data[:,1], hue=mnist_label, alpha=0.5)

# Read the axis percentages from the fitted PCA instead of hard-coding them,
# and label the y axis as the second component (it was mislabelled 'PC1')
pc1_pct, pc2_pct = pca.explained_variance_ratio_[:2]*100

plt.title('Digits - top 2 principal components')
plt.xlabel(f'PC1 [{pc1_pct:.1f} % variance]')
plt.ylabel(f'PC2 [{pc2_pct:.1f} % variance]')