# Principal Component Analysis
* `dim1` -- PCA with given number of components
* `dim2` -- PCA with Minka's MLE
* `dim3` -- PCA with a required explained variance

In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
# Add the parent directory to the module search path so that packages located
# there can be imported (presumably `datasets` and `verto` -- confirm).
import sys; import os; sys.path.append(os.path.realpath("../"))

In [3]:
# demo datasets
from datasets.demo1 import X_train, Y_train, fold_ids, X_valid, Y_valid, meta as meta_data
#meta_data

In [4]:
# transformer implementations
# Pick the PCA variant to demo:
#   'dim1' -- PCA with a given number of components
#   'dim2' -- PCA with Minka's MLE
#   'dim3' -- PCA with a required explained variance
typ = 'dim1'

# Compare by value with `==`. `is` tests object identity, which only happens to
# work for string literals because of interning and triggers a SyntaxWarning
# on Python 3.8+.
if typ == 'dim1':
    from verto.dim1 import trans, meta
    trans.set_params(**{'pca__n_components': 3})
elif typ == 'dim2':
    from verto.dim2 import trans, meta
elif typ == 'dim3':
    from verto.dim3 import trans, meta
    trans.set_params(**{'required_ev': 0.8})  # slowly increase from 0.1 towards 100%
else:
    # Fail here instead of with a NameError on `trans`/`meta` in a later cell.
    raise ValueError("unknown transformer type: {!r}".format(typ))

In [5]:
meta

{'id': 'dim1',
 'name': 'PCA',
 'description': 'PCA with given number of components',
 'keywords': ['dimensionality reduction',
  'principal component anlysis',
  'StandardScaler',
  'PCA'],
 'feature_names_prefix': 'pca'}

## Transform

In [6]:
%%time
# Fit the transformer on the training features only (no labels are passed).
trans.fit(X_train)

CPU times: user 4.8 ms, sys: 2.21 ms, total: 7.01 ms
Wall time: 6.89 ms


Pipeline(memory=None,
     steps=[('scl', StandardScaler(copy=True, with_mean=True, with_std=True)), ('pca', PCA(copy=True, iterated_power='auto', n_components=3, random_state=None,
  svd_solver='auto', tol=0.0, whiten=False))])

In [7]:
%%time
# Apply the fitted transformer to the same training features.
X_new = trans.transform(X_train)

CPU times: user 1.05 ms, sys: 2.52 ms, total: 3.56 ms
Wall time: 2.97 ms


In [8]:
from seasalt import create_feature_names

# Build one name per output column from the prefix declared in the
# transformer's metadata.
prefix = meta['feature_names_prefix']
n_columns = X_new.shape[1]
feature_names = create_feature_names(prefix, n_columns)
print(feature_names)

['pca_0', 'pca_1', 'pca_2']


In [9]:
import pandas as pd

# Wrap the transformed array in a DataFrame labelled with the generated column names.
df_new = pd.DataFrame(X_new, columns=feature_names)

## Evaluate
- check if the PCA components are "good" predictors
- eyeball the p-values of the logistic regression coefficients

In [10]:
df_new.head()

Unnamed: 0,pca_0,pca_1,pca_2
0,-3.084842,-2.158704,-0.339875
1,-2.182647,-0.617571,0.447208
2,2.049959,2.328953,1.169407
3,-2.809267,-0.950791,-0.305102
4,-1.913537,-1.072582,0.498365


In [11]:
import statsmodels.api as sm

# Logistic regression of the labels on the transformed features plus a constant column.
# Alternative kept for reference: an unpenalised fit via `.fit()` instead of
# `.fit_regularized(...)`.
design = sm.add_constant(X_new)
logit_model = sm.Logit(Y_train, design)
lr = logit_model.fit_regularized(method='l1', alpha=.5)
print(lr.summary())

Optimization terminated successfully.    (Exit mode 0)
            Current function value: 0.1295775447691668
            Iterations: 33
            Function evaluations: 33
            Gradient evaluations: 33
                           Logit Regression Results                           
Dep. Variable:                      y   No. Observations:                  398
Model:                          Logit   Df Residuals:                      394
Method:                           MLE   Df Model:                            3
Date:                Tue, 02 Apr 2019   Pseudo R-squ.:                  0.8117
Time:                        16:08:43   Log-Likelihood:                -49.556
converged:                       True   LL-Null:                       -263.17
                                        LLR p-value:                 2.786e-92
                 coef    std err          z      P>|z|      [0.025      0.975]
------------------------------------------------------------------------------