In [1]:
# import libraries
from sklearn.datasets import load_boston
import numpy as np
import pandas as pd

In [27]:
# create the dataset: load the Boston bunch, wrap its feature matrix in a
# DataFrame labelled with the feature names, and keep the target as y
boston = load_boston()
X = pd.DataFrame(boston.data, columns=boston.feature_names)
y = boston.target

In [5]:
# standardize data in place, column by column: centre on zero first, then
# divide by the standard deviation of the centred column
# (pandas' .std() is the sample standard deviation, ddof=1)
X -= X.mean(axis=0)
X /= X.std(axis=0)

In [21]:
# import PCA and create an estimator that keeps every component
# (n_components=None is the default, spelled out here for clarity)
from sklearn.decomposition import PCA
pca = PCA(n_components=None)

In [22]:
# PCA is a scikit-learn transformer, so the usual fit / transform /
# fit_transform methods apply: fit_transform learns the principal components
# from X and returns X projected onto them in a single call
X_pca = pca.fit_transform(X)

In [23]:
# PCA transforms your dataset into a new set of columns (principal components)
# ordered by explained variance: the first column captures the most, and each
# following column captures less than the one before it, so the running total
# of explained variance grows as you add columns
X_pca

(506, 9)

In [None]:
# without any specification (n_components left at its default), PCA keeps
# every component, so the transformed dataset has the same dimensions as
# your original one
# NOTE(review): the (506, 9) recorded in this notebook does not match that
# statement; it looks like X_pca came from a PCA fitted with n_components=0.95
# in an out-of-order run -- confirm with Restart & Run All
X_pca.shape

In [29]:
# however, each column of the transformed dataset explains more of the
# variance of your data than the next one
# explained_variance_ratio_ gives that share per component, in order:
# in the recorded output the first column accounts for about 47% of the
# variance, the second for about 11%, and so on
# NOTE(review): the recorded output lists only 9 ratios that sum to ~0.95,
# which suggests `pca` was fitted with n_components=0.95 when this cell last
# ran (the execution counts are out of order) -- confirm by rerunning in order
pca.explained_variance_ratio_

array([0.47129606, 0.11025193, 0.0955859 , 0.06596732, 0.06421661,
       0.05056978, 0.04118124, 0.03046902, 0.02130333])

In [None]:
# the cumulative sum turns the per-component ratios into a running total:
# entry k is the share of variance explained by the first k+1 columns
# using this reasoning, we can see that 95% of the explained variance
# is contained within the first 9 columns (the first entry that reaches 0.95)
np.cumsum(pca.explained_variance_ratio_)

In [None]:
# when you initialize PCA, n_components can be given in two ways:
#   - an integer: keep exactly that many components
#   - a float between 0 and 1: keep as many components as are needed to
#     retain that fraction of the variance
# this says, 'keep the number of columns that contains 95% of the explained
# variance within the dataset, whatever that number is'
# note: this rebinds `pca` to a new, unfitted estimator
pca = PCA(n_components=0.95)

In [24]:
# you typically run Linear Regression
# on the transformed version of your dataset after
# running pca
from sklearn.linear_model import LinearRegression

lreg = LinearRegression()
X_pca = pca.fit_transform(X)
# the score below is R^2 measured on the same data the model was fitted on
# (a training score, not a held-out estimate)
# caution: n_components=0.95 keeps 95% of the variance of X; that does NOT
# mean the regression keeps 95% of the R^2 obtained on the full feature set.
# How much R^2 is lost depends on how strongly the dropped components relate
# to y, so compare against a model fitted on the original X to quantify it
lreg.fit(X_pca, y).score(X_pca, y)

0.7056708975511301

In [26]:
# PCA can also be used as one stage of a scikit-learn pipeline
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler

# create the three stages in the order they will run:
# scaling -> PCA keeping 95% of the variance -> linear regression
# (`pca` and `lreg` are rebound to fresh, unfitted estimators here)
sc, pca, lreg = StandardScaler(), PCA(n_components=0.95), LinearRegression()
# chain the stages into a single estimator
pipe = make_pipeline(sc, pca, lreg)

In [28]:
# and then fit and predict: fitting the pipeline fits each stage in turn,
# passing the transformed output of one stage on to the next
pipe.fit(X, y)

Pipeline(memory=None,
         steps=[('standardscaler',
                 StandardScaler(copy=True, with_mean=True, with_std=True)),
                ('pca',
                 PCA(copy=True, iterated_power='auto', n_components=0.95,
                     random_state=None, svd_solver='auto', tol=0.0,
                     whiten=False)),
                ('linearregression',
                 LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None,
                                  normalize=False))],
         verbose=False)

In [30]:
# in-sample predictions: X is the same data the pipeline was fitted on
pipe.predict(X)

array([30.63642022, 25.18516656, 31.79983299, 30.64413274, 30.22344187,
       26.37788509, 23.63759093, 20.76872368, 12.83567246, 20.62847172,
       21.09449375, 22.68967376, 22.02765556, 20.94437623, 20.29331339,
       20.41090304, 22.18999374, 17.96016423, 16.39706513, 18.55408942,
       12.62677238, 18.22053712, 16.96194261, 14.48553549, 16.67824427,
       13.96389084, 16.52693316, 15.95305268, 21.15917633, 22.59206961,
       12.39796892, 18.66774255, 10.21229099, 14.37893265, 14.26211063,
       22.15784682, 20.7968095 , 22.41251577, 22.62003856, 28.63557712,
       32.06223404, 29.88878742, 26.18809775, 25.80700256, 23.41760163,
       21.52238157, 20.31374658, 18.2448109 ,  9.32097416, 17.49249487,
       21.51203311, 23.7706348 , 28.55148972, 24.42677584, 16.50558352,
       30.93917264, 25.91703597, 31.01082862, 23.00273673, 20.86801104,
       17.62654896, 17.68000936, 24.26922133, 24.79756734, 27.41214366,
       28.1450399 , 22.74023482, 22.25558863, 18.23129834, 21.76