In [1]:
import numpy as np

In [2]:
from sklearn.datasets import load_breast_cancer
# Load the breast-cancer dataset bundled with scikit-learn; later cells read
# its `data`, `target`, `target_names` and `feature_names` entries.
data = load_breast_cancer()

In [3]:
# Show the class label names of the dataset.
data.target_names

array(['malignant', 'benign'], dtype='<U9')

In [9]:
# Count the feature columns of the dataset.
len(data.feature_names)

30

### PCA

In [10]:
# Feature matrix, taken from the dataset's `data` entry.
X = data.data

In [12]:
# Centre every feature column on zero by subtracting its column mean.
X_centered = X - np.mean(X, axis=0)

In [13]:
# Singular value decomposition of the centred data. The rows of `vt` are the
# right singular vectors, which the following cells use as principal axes.
U, s, vt = np.linalg.svd(X_centered, full_matrices=True)

In [15]:
# First two principal axes: rows 0 and 1 of `vt` (equivalently, the first
# two columns of `vt.T`).
c1, c2 = vt[0], vt[1]

In [17]:
# Projection matrix whose two columns are the leading principal axes.
W2 = vt[:2].T

In [18]:
# Project the centred samples onto the two leading principal axes.
X2D = X_centered @ W2

In [19]:
# Display the 2-D projection computed from the manual SVD.
X2D

array([[-1160.1425737 ,  -293.91754364],
       [-1269.12244319,    15.63018184],
       [ -995.79388896,    39.15674324],
       ...,
       [ -314.50175618,    47.55352518],
       [-1124.85811531,    34.12922497],
       [  771.52762188,   -88.64310636]])

In [20]:
# Display the mean-centred feature matrix.
X_centered

array([[ 3.86270826e+00, -8.90964851e+00,  3.08309666e+01, ...,
         1.50793777e-01,  1.70024429e-01,  3.49541828e-02],
       [ 6.44270826e+00, -1.51964851e+00,  4.09309666e+01, ...,
         7.13937768e-02, -1.50755712e-02,  5.07418278e-03],
       [ 5.56270826e+00,  1.96035149e+00,  3.80309666e+01, ...,
         1.28393777e-01,  7.12244288e-02,  3.63418278e-03],
       ...,
       [ 2.47270826e+00,  8.79035149e+00,  1.63309666e+01, ...,
         2.71937768e-02, -6.82755712e-02, -5.74581722e-03],
       [ 6.47270826e+00,  1.00403515e+01,  4.81309666e+01, ...,
         1.50393777e-01,  1.18624429e-01,  4.00541828e-02],
       [-6.36729174e+00,  5.25035149e+00, -4.40490334e+01, ...,
        -1.14606223e-01, -2.97557118e-03, -1.35558172e-02]])

In [21]:
from sklearn.decomposition import PCA

In [22]:
# scikit-learn PCA keeping two components, for comparison with the manual
# SVD projection computed earlier.
pca = PCA(n_components=2)

In [23]:
# Fit on the raw X (not X_centered) and project it in a single step.
# NOTE: `X2d` differs from the earlier `X2D` only in the case of the last
# letter; the two names are easy to confuse.
X2d = pca.fit_transform(X)

In [24]:
# Display the scikit-learn projection. In the recorded output the first column
# has the opposite sign to X2D; the sign of a principal axis is arbitrary.
X2d

array([[1160.1425737 , -293.91754364],
       [1269.12244319,   15.63018184],
       [ 995.79388896,   39.15674324],
       ...,
       [ 314.50175618,   47.55352518],
       [1124.85811531,   34.12922497],
       [-771.52762188,  -88.64310636]])

In [25]:
# Fraction of the total variance captured by each of the two components.
# NOTE(review): X is used unscaled in this notebook, so features with large
# numeric ranges probably dominate the first component — consider
# standardizing the features before PCA.
pca.explained_variance_ratio_

array([0.98204467, 0.01617649])

In [27]:
from sklearn.model_selection import train_test_split

In [28]:
# Hold out a test set. A fixed random_state makes the split, and every result
# derived from it in the cells below, reproducible across kernel restarts.
X_train, X_test, y_train, y_test = train_test_split(
    X, data['target'], random_state=42
)

In [29]:
# Fit a full PCA on the training set, then find the smallest number of
# components whose cumulative explained-variance ratio reaches 95%.
pca = PCA().fit(X_train)
cumsum = pca.explained_variance_ratio_.cumsum()
# argmax on the boolean mask gives the index of the first True; +1 turns the
# zero-based index into a component count.
d = 1 + np.argmax(cumsum >= 0.95)

In [30]:
# Number of components needed to reach 95% cumulative explained variance.
d

1

In [31]:
# A float n_components between 0 and 1 asks PCA to keep just enough components
# to explain that fraction of the variance (here 95%).
pca = PCA(n_components=0.95)
X_reduced = pca.fit_transform(X_train)

In [33]:
# Map the reduced data back to the original feature space (a lossy
# reconstruction: the variance in the discarded components is not restored).
X_recovered = pca.inverse_transform(X_reduced)

### Randomized PCA

In [38]:
# Randomized SVD solver: a stochastic approximation of the leading components.
# Seeding it makes the projection reproducible across kernel restarts.
rnd_pca = PCA(n_components=4, svd_solver="randomized", random_state=42)
# NOTE: this rebinds X_reduced, replacing the 95%-variance projection.
X_reduced = rnd_pca.fit_transform(X_train)

### Kernel PCA

In [39]:
from sklearn.decomposition import KernelPCA
# Kernel PCA with an RBF kernel, projecting the training set to 2 dimensions.
# NOTE(review): gamma=0.04 is applied to unscaled features here; confirm the
# value suits the feature magnitudes (or scale the inputs first).
rbf_pca = KernelPCA(n_components = 2, kernel="rbf", gamma=0.04)
X_reduced = rbf_pca.fit_transform(X_train)

In [40]:
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline

In [42]:
# Pipeline: kernel-PCA projection to 2 dimensions, then logistic regression.
clf = Pipeline([
    ("kpca", KernelPCA(n_components=2)),
    ("log_reg", LogisticRegression()),
])
# Search jointly over the kernel type and its gamma coefficient.
param_grid = [{
    "kpca__gamma": np.linspace(0.03, 0.05, 10),
    "kpca__kernel": ["rbf", "sigmoid"],
}]
grid_search = GridSearchCV(clf, param_grid, cv=3)
# Tune on the training split only, so the held-out test set stays unseen
# during model selection and can still give an unbiased final estimate.
grid_search.fit(X_train, y_train)

GridSearchCV(cv=3, error_score=nan,
             estimator=Pipeline(memory=None,
                                steps=[('kpca',
                                        KernelPCA(alpha=1.0, coef0=1,
                                                  copy_X=True, degree=3,
                                                  eigen_solver='auto',
                                                  fit_inverse_transform=False,
                                                  gamma=None, kernel='linear',
                                                  kernel_params=None,
                                                  max_iter=None, n_components=2,
                                                  n_jobs=None,
                                                  random_state=None,
                                                  remove_zero_eig=False,
                                                  tol=0)),
                                       ('log_reg',
                                 

In [43]:
# Report the hyper-parameter combination with the best cross-validated score.
print(grid_search.best_params_)

{'kpca__gamma': 0.03, 'kpca__kernel': 'rbf'}
