# Ch.8 차원축소

## 8.3 PCA

### 8.3.2 주성분

In [12]:
import numpy as np
np.random.seed(4)

m = 60
noise = 0.1

X = np.empty((m, 3))

angles = np.random.rand(m) * 3 * np.pi / 2 - 0.5
X[:, 0] = np.cos(angles) + np.sin(angles)/2 + noise * np.random.randn(m) / 2   # x 좌표
X[:, 1] = np.sin(angles) * 0.7 + noise * np.random.randn(m) / 2                # y 좌표

w1, w2 = 0.1, 0.3
X[:, 2] = X[:, 0] * w1 + X[:, 1] * w2 + noise * np.random.randn(m)             # z 좌표 (초평면 + 잡음)

In [13]:
X_centered = X - X.mean(axis= 0)
U,s,Vt = np.linalg.svd(X_centered)
c1 = Vt.T[:,0]
c2 = Vt.T[:,1]

### 8.3.3 d차원으로 투영하기

In [3]:
w2 = Vt.T[:,:2]
X2D = X_centered.dot(w2)

### 8.3.4 사이킷런 사용하기

In [4]:
from sklearn.decomposition import PCA

pca=PCA(n_components=2)
X2D = pca.fit_transform(X)

In [5]:
pca.explained_variance_ratio_

array([0.84248607, 0.14631839])

### 8.3.6 적절한 차원 수 선택하기

In [6]:
from sklearn.datasets import fetch_openml

mnist = fetch_openml('mnist_784', version=1, as_frame=False)
mnist.target = mnist.target.astype(np.uint8)

X = mnist["data"]
y = mnist["target"]

from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y)

  warn(


In [7]:
pca = PCA()
pca.fit(X_train)
cumsum = np.cumsum(pca.explained_variance_ratio_)
d = np.argmax(cumsum >= 0.95) + 1

In [8]:
pca = PCA(n_components=0.95)
X_reduced = pca.fit_transform(X_train)

### 8.3.7 압축을 위한 PCA

In [9]:
pca = PCA(n_components = 154)
X_reduced = pca.fit_transform(X_train)
X_recovered = pca.inverse_transform(X_reduced)

### 8.3.8 랜덤 PCA

In [10]:
rnd_pca = PCA(n_components=154,svd_solver="randomized")
X_reduced = rnd_pca.fit_transform(X_train)

### 8.3.9 점진적 PCA

In [11]:
from sklearn.decomposition import IncrementalPCA
n_batches = 100
inc_pca = IncrementalPCA(n_components=154)
for X_batch in np.array_split(X_train,n_batches):
    inc_pca.partial_fit(X_batch)
    
X_reduced = inc_pca.transform(X_train)

In [None]:
filename = "my_mnist.data"
m, n = X_train.shape

X_mm = np.memmap(filename,dtype="float32",mode="readonly",shape=(m,n))

batch_size = m // n_batches
inc_pca = IncrementalPCA(n_components=154,batch_size=batch_size)
inc_pca.fit(X_mm)

In [None]:
from sklearn.decomposition import KernelPCA
rbf_pca = KernelPCA(n_components =2,kernel="rbf",gamma=0.04)
X_reduced = rbf_pca.fit_transform(X)

In [None]:
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline

clf = Pipeline([
    ("kpca",KernelPCA(n_components=2)),
    ("log_reg",LogicRegression())
])
param_grid = [{
    "kpca__gamma":np.linspace(0.03,0.05,10),
    "kpca__kernel":["rbf","sigmoid"]
}]
grid_search = GridSearchCV(clf,param_grid,cv=3)
grid_search.fit(X,y)

In [None]:
print(grid_search_best_params_)

In [14]:
rbf_pca = KernelPCA(n_components = 2,kernel="rbf",gamma=0.0433,
                   fit_inverse_transform=True)
X_reduced = rbf_pca.fit_transform(X)
X_preimage = rbf_pca.inverse_transform(X_reduced)

NameError: name 'KernelPCA' is not defined

In [None]:
from sklearn.metrics import mean_squared_error
mean_squared_error(X,X_preimage)

In [None]:
from sklearn.manifold import LocallyLinearEmbedding

lle = LocallyLinearEmbedding(n_components=2,n_neighbors=10)
X_reduced = lle.fit_transform(X)
