In [None]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler

In [None]:
# Read the training set, then keep only the columns whose name contains 'feature'.
dtrain = pd.read_parquet('../input/dtrain-parquet/dtrain.parquet')
dtrain = dtrain.loc[:, [name for name in dtrain.columns if 'feature' in name]]

# Missing Values

Note that `sklearn.decomposition.PCA` cannot handle missing values. I simply drop the incomplete observations here, but there are implementations of probabilistic PCA (PPCA) that can handle missing values directly. See https://stackoverflow.com/a/56576569/1838257 for pointers.

In [None]:
# Fraction of rows that contain at least one missing value.
dtrain.isna().any(axis='columns').sum() / len(dtrain)

In [None]:
# Discard every row that has a missing value in any column (the defaults, spelled out).
dtrain = dtrain.dropna(axis='index', how='any')

# Standardisation
In order to perform PCA we must first standardise the data, i.e. bring all features onto the same scale. `StandardScaler` does exactly this, rescaling each column so that its mean is 0 and its standard deviation is 1. It does so with:
$z_i = (x_i - \mu) / \sigma$. Binary data (`feature_0`) should not be handled like this; instead, it is centred and divided by $2\sigma$. See [Gelman and Hill, 2006, p. 57](https://doi.org/10.1017/CBO9780511790942).

In [None]:
# Standardise every column to zero mean and unit standard deviation.
scaled = StandardScaler().fit_transform(dtrain)

# The binary column is instead centred and divided by TWO standard deviations.
# The divisor must be parenthesised: `x / 2 * sd` evaluates as `(x / 2) * sd`,
# which multiplies by the standard deviation instead of dividing by it.
# NOTE(review): pandas `.std()` is the sample estimate (ddof=1), whereas
# `StandardScaler` uses the population estimate; the difference is negligible
# for large samples.
# Assumes `feature_0` is the first column of `dtrain` -- TODO confirm.
feature_0 = dtrain['feature_0']
scaled[:, 0] = ((feature_0 - feature_0.mean()) / (2 * feature_0.std())).to_numpy()

# PCA

In [None]:
# A fractional `n_components` asks PCA to keep as many components as are needed
# to explain that share (here 95 %) of the variance.
pca = PCA(n_components=0.95).fit(X=scaled)

In [None]:
# Number of components that were retained.
print(len(pca.explained_variance_ratio_))

# Cumulative explained-variance ratio, one dot per component.
cumulative_ratio = pd.DataFrame(pca.explained_variance_ratio_.cumsum())
ax = cumulative_ratio.plot(style='.', legend=False)
ax.set_xlabel('principal component #')
ax.set_ylabel('explained variance')
plt.show()

Just 38 principal components already explain 95% of the variance in the data, which is only about 29% of the original number of features!

In [None]:
# Per-component summary: eigenvalue, share of variance, and running total of that share.
variance_ratio = pca.explained_variance_ratio_
pd.DataFrame(
    data={
        'eigenvalues': pca.explained_variance_,
        'explained variance': variance_ratio,
        'cumulative expl. var.': variance_ratio.cumsum(),
    }
)

In [None]:
# How many components are needed to explain 99 % of the variance?
pca_99 = PCA(n_components=0.99).fit(scaled)
print(pca_99.n_components_)