## Q2 dimesnionalityReduction on at least 4 datasets using at least 6 techniques

In [1]:
from sklearn.datasets import load_breast_cancer, load_wine, load_digits, load_iris
from sklearn.decomposition import PCA, FastICA, TruncatedSVD, FactorAnalysis
#from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.neighbors import NeighborhoodComponentsAnalysis
from sklearn.manifold import LocallyLinearEmbedding

# Define the datasets
datasets = {'breast_cancer': load_breast_cancer(), 'wine': load_wine(), 'digits': load_digits(), 'iris': load_iris()}

# Define the dimensionality reduction techniques to use
dim_reduction_methods = [
    PCA(n_components=2),
    FastICA(n_components=2),
    TruncatedSVD(n_components=2),
    FactorAnalysis(n_components=2),
    #LinearDiscriminantAnalysis(n_components=2),
    LocallyLinearEmbedding(n_components=2),
    NeighborhoodComponentsAnalysis(n_components=2)
]

# Loop over the datasets
for dataset_name, dataset in datasets.items():
    X, y = dataset.data, dataset.target
    print(f"Dataset: {dataset_name}")
    
    # Loop over the dimensionality reduction methods
    for dim_reduction in dim_reduction_methods:
        # Apply dimensionality reduction to the data
        X_reduced = dim_reduction.fit_transform(X, y)
        
        # Print the shape of the reduced data
        print(f"{type(dim_reduction).__name__} - Reduced shape: {X_reduced.shape}")
        
    print()

Dataset: breast_cancer
PCA - Reduced shape: (569, 2)
FastICA - Reduced shape: (569, 2)
TruncatedSVD - Reduced shape: (569, 2)
FactorAnalysis - Reduced shape: (569, 2)
LocallyLinearEmbedding - Reduced shape: (569, 2)
NeighborhoodComponentsAnalysis - Reduced shape: (569, 2)

Dataset: wine
PCA - Reduced shape: (178, 2)
FastICA - Reduced shape: (178, 2)
TruncatedSVD - Reduced shape: (178, 2)
FactorAnalysis - Reduced shape: (178, 2)
LocallyLinearEmbedding - Reduced shape: (178, 2)
NeighborhoodComponentsAnalysis - Reduced shape: (178, 2)

Dataset: digits
PCA - Reduced shape: (1797, 2)
FastICA - Reduced shape: (1797, 2)
TruncatedSVD - Reduced shape: (1797, 2)
FactorAnalysis - Reduced shape: (1797, 2)
LocallyLinearEmbedding - Reduced shape: (1797, 2)
NeighborhoodComponentsAnalysis - Reduced shape: (1797, 2)

Dataset: iris
PCA - Reduced shape: (150, 2)
FastICA - Reduced shape: (150, 2)
TruncatedSVD - Reduced shape: (150, 2)
FactorAnalysis - Reduced shape: (150, 2)
LocallyLinearEmbedding - Reduc

In [2]:
#importing necessary libraries
from sklearn.datasets import load_breast_cancer, load_wine, load_digits, load_iris
from sklearn.decomposition import PCA, FastICA, TruncatedSVD, FactorAnalysis
from sklearn.neighbors import KNeighborsClassifier
from sklearn.pipeline import make_pipeline
from sklearn.model_selection import cross_val_score
import numpy as np

#defining the datasets
datasets = {'breast_cancer': load_breast_cancer(), 'wine': load_wine(), 'digits': load_digits(), 'iris': load_iris()}

#defining the dimensionality reduction techniques to use
dim_reduction_methods = [
    PCA(n_components=2), #principal component analysis
    FastICA(n_components=2), #fast independent component analysis
    TruncatedSVD(n_components=2), #truncated singular value decomposition
    FactorAnalysis(n_components=2), #factor analysis
    LocallyLinearEmbedding(n_components=2), #locally linear embedding
    NeighborhoodComponentsAnalysis(n_components=2) #neighborhood components analysis
]

#defining the classifier to use
classifier = KNeighborsClassifier(n_neighbors=5)

#defining the test sizes to use
test_sizes = [0.1, 0.2, 0.3, 0.4]

#looping over the datasets
for dataset_name, dataset in datasets.items():
    X, y = dataset.data, dataset.target
    print(f"Dataset: {dataset_name}")
    
    #looping over the test sizes
    for test_size in test_sizes:
        #looping over the dimensionality reduction methods
        for dim_reduction in dim_reduction_methods:
            #defining the pipeline to use
            pipeline = make_pipeline(dim_reduction, classifier)
            
            #computing the cross-validation score
            scores = cross_val_score(pipeline, X, y, cv=5)
            mean_score = np.mean(scores)
            
            #printing the results
            print(f"Test size: {test_size:.1f}, {type(dim_reduction).__name__} - Mean CV score: {mean_score:.3f}\n")
        
    print()


Dataset: breast_cancer
Test size: 0.1, PCA - Mean CV score: 0.923

Test size: 0.1, FastICA - Mean CV score: 0.921

Test size: 0.1, TruncatedSVD - Mean CV score: 0.923

Test size: 0.1, FactorAnalysis - Mean CV score: 0.919

Test size: 0.1, LocallyLinearEmbedding - Mean CV score: 0.879

Test size: 0.1, NeighborhoodComponentsAnalysis - Mean CV score: 0.919

Test size: 0.2, PCA - Mean CV score: 0.923

Test size: 0.2, FastICA - Mean CV score: 0.921

Test size: 0.2, TruncatedSVD - Mean CV score: 0.923

Test size: 0.2, FactorAnalysis - Mean CV score: 0.919

Test size: 0.2, LocallyLinearEmbedding - Mean CV score: 0.881

Test size: 0.2, NeighborhoodComponentsAnalysis - Mean CV score: 0.919

Test size: 0.3, PCA - Mean CV score: 0.923

Test size: 0.3, FastICA - Mean CV score: 0.921

Test size: 0.3, TruncatedSVD - Mean CV score: 0.923

Test size: 0.3, FactorAnalysis - Mean CV score: 0.919

Test size: 0.3, LocallyLinearEmbedding - Mean CV score: 0.884

Test size: 0.3, NeighborhoodComponentsAnalysis