# DIMENSIONALITY REDUCTION USING PRINCIPAL COMPONENTS

#### Given a set of features, you want to reduce the number of features while retaining the variance in the data

Feature Extraction: Reduce the dimensionality of our feature matrix by creating new features with "ideally" similar ability to train quality models but with significantly fewer dimensions

In [1]:
# Load Libraries

import numpy as np
from sklearn import datasets
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler

In [2]:
# Load data

digits = datasets.load_digits()
df= digits.data

In [3]:
# Standardize the feature matrix

features  = StandardScaler().fit_transform(df)
print("The Shape[0] of the feature is: ", str(features.shape[0]) + " & the Shape[1] is : ", str(features.shape[1]))


The Shape[0] of the feature is:  1797 & the Shape[1] is :  64


In [4]:
# Create a PCA that will retain 99% of the variance

pca = PCA(n_components = 0.99, whiten = True)

In [5]:
# Conduct PCA

features_pca = pca.fit_transform(features)
print("The Shape[0] of the feature_pca is: ", str(features_pca.shape[0]) + " & the Shape[1] is : ", str(features_pca.shape[1]))


The Shape[0] of the feature_pca is:  1797 & the Shape[1] is :  54


In [6]:
# Show results

print("Original number of features : ", features.shape[1])
print("Reduced number of features : ", features_pca.shape[1])

Original number of features :  64
Reduced number of features :  54


## Reducing Features When Data Is Not Linearly Separable

In [7]:
# Load Libraries

from sklearn.decomposition import KernelPCA
from sklearn.datasets import make_circles

In [8]:
# Create linearly inseparable data
feature_not, _ = make_circles(n_samples = 1000, random_state = 1, noise = 0.1, factor = 0.1)
print("The Shape[0] of the features_not is: ", str(feature_not.shape[0]) + " & the Shape[1] is : ", str(feature_not.shape[1]))


The Shape[0] of the features_not is:  1000 & the Shape[1] is :  2


In [9]:
# Apply Kernel PCA with a radial basis function (RBF) kernel
kpca = KernelPCA(kernel = "rbf", gamma = 15, n_components = 1)
features_not_kpca = kpca.fit_transform(feature_not)
print("The Shape[0] of the features_not_kpca is: ", str(features_not_kpca.shape[0]) + " & the Shape[1] is : ", str(features_not_kpca.shape[1]))


The Shape[0] of the features_not_kpca is:  1000 & the Shape[1] is :  1


In [10]:
# Show results

print("Original number of features : ", feature_not.shape[1])
print("Reduced number of features : ", features_not_kpca.shape[1])

Original number of features :  2
Reduced number of features :  1


## Reducing Features by Maximizing Class Separability
#### Try Linear Discriminant Analysis (LDA) to project the features onto component axes that maximize the separation of classes
Maximizing the difference between classes

In [11]:
# Load libraries
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis

In [14]:
# Load Iris Data Set
iris = datasets.load_iris()
features_class = iris.data
target = iris.target

In [15]:
# Create and run an LDA, then use it to transform the features
lda = LinearDiscriminantAnalysis(n_components = 1)
features_lda = lda.fit(features_class, target).transform(features_class)

In [17]:
# Show results

print("Original number of features : ", features_class.shape[1])
print("Reduced number of features : ", features_lda.shape[1])

Original number of features :  4
Reduced number of features :  1


In [20]:
# See the amount of variance explained
lda.explained_variance_ratio_

array([0.9912126])

### How to select N_Components??

Set LDA to "n_components=None" to return the ratio of variance explained by every component feature, then calculate how many components are required to get above some threshold of variance explained (often 95% or 99%).

In [33]:
# Create and run LDA

lda = LinearDiscriminantAnalysis(n_components= None)
features_lda_none = lda.fit(features_class, target)

In [34]:
# Create array of explained variance ratios

lda_var_ratios = lda.explained_variance_ratio_

In [41]:
# Create function

def select_n_components(var_ratio, goal_var: float) ->int:
    """Count the leading components needed to reach a target explained variance.

    Walks ``var_ratio`` in order while keeping a running sum of the ratios,
    and stops at the first component whose inclusion brings that sum to at
    least ``goal_var``.

    Args:
        var_ratio: Iterable of explained-variance ratios, one per component.
        goal_var: Cumulative explained variance to reach.

    Returns:
        The number of components consumed when the target was reached, or
        the total number of components if the target was never reached
        (0 when ``var_ratio`` is empty).
    """
    cumulative = 0.0
    # Stays at 0 if var_ratio yields nothing
    count = 0
    for count, ratio in enumerate(var_ratio, start=1):
        cumulative += ratio
        # Stop as soon as the running sum meets the goal
        if cumulative >= goal_var:
            return count
    # Goal never reached: every component was consumed
    return count

In [44]:
# Run Function
run = select_n_components(lda_var_ratios, 0.95)
run

1

# Reducing Features Using Matrix Factorization
#### Use non negative matrix factorization (NMF) to reduce the dimensionality of the feature matrix


In [51]:
# Load libraries
from sklearn.decomposition import NMF

In [54]:
# Load Data
digits = datasets.load_digits()
features_digits = digits.data

In [55]:
# Create fit and apply NMF
nmf = NMF(n_components = 10, random_state = 1)
features_nmf  = nmf.fit_transform(features_digits)

In [56]:
# Show results

print("Original number of features : ", features_digits.shape[1])
print("Reduced number of features : ", features_nmf.shape[1])

Original number of features :  64
Reduced number of features :  10


# Reducing Features on Sparse Data

#### You have a sparse matrix and want to reduce the dimensionality using Truncated Singular Value Decomposition (TSVD)


In [59]:
# Load Libraries 

from sklearn.decomposition import TruncatedSVD
from scipy.sparse import csr_matrix
import numpy as np

In [62]:
# Load Data

df_digits = datasets.load_digits()
df = df_digits.data

In [66]:
# Standardize feature matrix

features_df = StandardScaler().fit_transform(df)

In [68]:
# Make Sparse Matrix
features_sparse = csr_matrix(features_df)

In [69]:
# Create a TSVD 
tsvd = TruncatedSVD(n_components = 10)

In [70]:
# Conduct TSVD on sparse_matrix
features_sparse_tsvd = tsvd.fit(features_sparse).transform(features_sparse)

In [71]:
# Show results

print("Original number of features : ", features_sparse.shape[1])
print("Reduced number of features : ", features_sparse_tsvd.shape[1])

Original number of features :  64
Reduced number of features :  10


In [74]:
# Sum of first three components' explained variance ratio
tsvd.explained_variance_ratio_[0:3].sum()

0.30039385377447186

We can automate the process by creating a function that runs TSVD with n_components set to one less than the number of original features and then calculates the number of components that explain a desired amount of the original data's variance:
    

In [76]:
# Create an SVD that keeps one fewer component than the number of features,
# and fit it on the same sparse matrix that n_components is derived from
tsvd = TruncatedSVD(n_components= features_sparse.shape[1]-1)
# NOTE: fit() returns the fitted estimator itself, not a transformed matrix
features_tsvd = tsvd.fit(features_sparse)

In [77]:
# List of explained variances
tsvd_var_ratios = tsvd.explained_variance_ratio_

In [78]:
# Create function

def select_n_components(var_ratio, goal_var: float) ->int:
    """Return how many leading components reach ``goal_var`` explained variance.

    The ratios in ``var_ratio`` are summed in iteration order; the answer is
    the position (1-based) of the first component at which the running sum
    is greater than or equal to ``goal_var``.

    Args:
        var_ratio: Iterable of per-component explained-variance ratios.
        goal_var: Threshold for the cumulative explained variance.

    Returns:
        The 1-based position of the component that met the threshold. If the
        threshold is never met, the number of ratios seen (0 for empty input).
    """
    # Number of components consumed so far; remains 0 for empty input
    needed = 0
    running_sum = 0.0
    for needed, share in enumerate(var_ratio, 1):
        running_sum += share
        if running_sum >= goal_var:
            # Threshold met; no need to look at further components
            return needed
    return needed

In [81]:
# Run function 95%
select_n_components(tsvd_var_ratios, 0.95)

40

In [82]:
# Run function 99%
select_n_components(tsvd_var_ratios, 0.99)

54