In [18]:
import numpy as np
from sklearn.tree import DecisionTreeClassifier
from sklearn.datasets import fetch_openml
from sklearn.decomposition import PCA, IncrementalPCA, KernelPCA
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.manifold import LocallyLinearEmbedding

In [2]:
# Fetch version 1 of the 'mnist_784' dataset from OpenML.
mnist = fetch_openml(name='mnist_784', version=1, parser='auto')

In [3]:
# Pull the feature table and the label column out of the fetched dataset
# and convert each one to a NumPy array.
X = mnist['data'].to_numpy()
y = mnist['target'].to_numpy()

In [16]:
# Baseline classifier: a decision tree with all-default hyperparameters.
# NOTE(review): no random_state is passed, so scores may vary between runs —
# consider fixing a seed for reproducibility.
tree_clf = DecisionTreeClassifier()

In [7]:
# 3-fold cross-validated score of the baseline tree on the full feature matrix.
cross_val_score(estimator=tree_clf, X=X, y=y, cv=3)

array([0.86286106, 0.8607123 , 0.87129816])

In [17]:
# PCA configured to keep the first 10 principal components.
pca = PCA(n_components=10)

In [17]:
# Fit the 10-component PCA on X and project X onto those components.
X_10D = pca.fit_transform(X)

In [20]:
# Sanity check: the projected data should have 10 columns.
X_10D.shape

(70000, 10)

In [19]:
# Same 3-fold evaluation as before, now on the 10-dimensional projection.
# NOTE(review): the PCA was fitted on all of X before the folds were split;
# wrapping PCA and the tree in a Pipeline would keep the fit inside each fold.
cross_val_score(estimator=tree_clf, X=X_10D, y=y, cv=3)

array([0.81631953, 0.81511164, 0.82145459])

In [24]:
# First ten coefficients of the first principal component
# (row 0 of components_, columns 0-9).
pca.components_[0, :10]

array([-5.05811077e-18, -1.05163307e-19,  4.97869471e-20,  1.20857437e-19,
        1.26448010e-19, -8.10415031e-23,  1.22144561e-21, -8.23343057e-22,
       -1.92930090e-21,  7.04963863e-22])

In [25]:
# Fraction of the total variance captured by each of the 10 components.
pca.explained_variance_ratio_

array([0.09746116, 0.07155445, 0.06149531, 0.05403385, 0.04888934,
       0.04305227, 0.03278261, 0.02889641, 0.02758364, 0.02342134])

In [26]:
# Total fraction of variance retained by the 10 components together.
sum(pca.explained_variance_ratio_)

0.4891703682171665

In [4]:
# PCA with no n_components argument (library default), rebinding `pca`.
pca = PCA()

In [5]:
# Fit the PCA on X so explained_variance_ratio_ is available
# for the cumulative-variance computation that follows.
pca.fit(X)

In [12]:
# Running total of the explained-variance ratios: entry i is the fraction of
# variance captured by the first i + 1 components.
cumsum = pca.explained_variance_ratio_.cumsum()

In [22]:
# Smallest number of components whose cumulative explained variance reaches 95%.
# argmax returns the index of the first entry >= 0.95; adding 1 converts that
# index to a component count. Counting the entries <= 0.95 instead would stop
# one component short of the threshold.
d = int(np.argmax(cumsum >= 0.95)) + 1
d

154

In [15]:
# A float n_components asks PCA to pick the number of components itself so
# that this fraction (95%) of the variance is retained.
pca = PCA(n_components=0.95)

In [16]:
# Fit the 95%-variance PCA on X and project X onto the selected components.
X_95 = pca.fit_transform(X)

In [17]:
# The column count shows how many components were kept for 95% variance.
X_95.shape

(70000, 154)

In [4]:
# Incremental PCA: learn the components one mini-batch at a time instead of
# fitting on the whole matrix at once. 154 is the component count that
# PCA(n_components=0.95) selected earlier in this notebook.
n_batches = 100
inc_pca = IncrementalPCA(n_components=154)

# Split X row-wise into n_batches chunks and feed them to the model in order.
for X_batch in np.array_split(X, indices_or_sections=n_batches):
    inc_pca.partial_fit(X_batch)

# Project the full dataset with the incrementally fitted components.
X_reduced = inc_pca.transform(X)

In [5]:
# Sanity check: the incrementally reduced data should have 154 columns.
X_reduced.shape

(70000, 154)

In [15]:
# Restrict the kernel experiments to the first 1000 samples and their labels.
X_ = X[:1000]
y_ = y[:1000]

# Kernel PCA with an RBF kernel, reducing the subset to two components.
rbf_pca = KernelPCA(kernel='rbf', gamma=0.04, n_components=2)
X_kernel_reduced = rbf_pca.fit_transform(X_)

X_kernel_reduced.shape

(1000, 2)

In [16]:
# Two-step pipeline: kernel PCA down to 2 components, then logistic regression
# on the reduced features.
pipe = Pipeline(steps=[
    ('kpca', KernelPCA(n_components=2)),
    ('log_reg', LogisticRegression()),
])

# Search space for the kernel PCA step: ten evenly spaced gamma values between
# 0.03 and 0.05, each tried with the RBF and the sigmoid kernel.
param_grid = [
    {
        'kpca__gamma': np.linspace(start=0.03, stop=0.05, num=10),
        'kpca__kernel': ['rbf', 'sigmoid'],
    }
]

# 3-fold grid search over the pipeline, run on the 1000-sample subset.
grid_search = GridSearchCV(estimator=pipe, param_grid=param_grid, cv=3)

grid_search.fit(X_, y_)

In [17]:
# Kernel and gamma combination that scored best in the grid search.
grid_search.best_params_

{'kpca__gamma': 0.03222222222222222, 'kpca__kernel': 'rbf'}

In [19]:
# Locally Linear Embedding of the 1000-sample subset into two dimensions,
# using 10 neighbours per point.
# NOTE(review): this rebinds X_reduced, which earlier held the
# IncrementalPCA projection of the full dataset.
lle = LocallyLinearEmbedding(n_neighbors=10, n_components=2)
X_reduced = lle.fit_transform(X_)
X_reduced.shape

(1000, 2)