In [1]:
import os
import urllib
from sklearn.datasets import load_svmlight_file
from matplotlib import pyplot as plt
import numpy as np

In [2]:
GITHUB_HOST = "https://raw.githubusercontent.com/apache/spark/master/data/mllib/"
BINOMIAL_DATA_URL = GITHUB_HOST + "sample_libsvm_data.txt"
MULTICLASS_DATA_URL = GITHUB_HOST + "sample_multiclass_classification_data.txt"
DATA_FILENAME = "data.txt"

In [3]:
def get_data(url, filename):
    if not os.path.isfile(filename):
        urllib.request.urlretrieve(url, "{}".format(filename))
    data = load_svmlight_file(filename)
    return data[0], data[1]

In [4]:
X, y = get_data(MULTICLASS_DATA_URL, DATA_FILENAME)

In [5]:
X_centered = X - X.mean(axis=0)
U, s, Vt = np.linalg.svd(X_centered)
c1 = Vt.T[:, 0]
c2 = Vt.T[:, 1]

In [6]:
W2 = Vt.T[:, :2]

In [7]:
X2D = X_centered.dot(W2)

In [8]:
from sklearn.decomposition import PCA

In [9]:
pca = PCA(n_components = 2)
X = X.todense() # since PCA does not support sparse input
X2D = pca.fit_transform(X)

In [10]:
pca.explained_variance_ratio_

array([0.84141901, 0.11732473])

In [11]:
X.shape

(150, 4)

In [12]:
X_train, X_test = X[:100], X[:150]

In [13]:
pca = PCA()
pca.fit(X_train)
cumsum = np.cumsum(pca.explained_variance_ratio_)
d = np.argmax(cumsum >= 0.95) + 1

In [14]:
pca = PCA(n_components=0.95)
X_reduced = pca.fit_transform(X_train)

In [15]:
print("kindle pos 4734")

from sklearn.decomposition import IncrementalPCA
n_batches = 100
inc_pca = IncrementalPCA(n_components=154)
for X_batch in np.array_split(X_train, n_batches):
    inc_pca.partial_fit(X_batch)

X_reduced = inc_pca.transform(X_train)

kindle pos 4734


ValueError: n_components=154 invalid for n_features=4, need more rows than columns for IncrementalPCA processing

In [16]:
from sklearn.decomposition import KernelPCA
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline

clf = Pipeline([
    ("kpca", KernelPCA(n_components=2)),
    ("log_reg", LogisticRegression())
])

param_grid = [{
    "kpca__gamma": np.linspace(0.03, 0.05, 10),
    "kpca__kernel": ["rbf", "sigmoid"]
}]

grid_search = GridSearchCV(clf, param_grid, cv=3)
grid_search.fit(X, y)
print(grid_search.best_params_)

{'kpca__gamma': 0.034444444444444444, 'kpca__kernel': 'rbf'}
