# Principal Component Analysis
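* `dim1` -- PCA with a given number of components
* `dim2` -- PCA with Minka's MLE
* `dim3` -- PCA with a required explained variance
* `dim4` -- PCA with the number of components chosen by minimizing the BIC (used in this notebook)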

In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
# add path
import sys; import os; sys.path.append(os.path.realpath("../"))

In [3]:
# demo datasets
from datasets.demo1 import X_train, Y_train, fold_ids, X_valid, Y_valid, meta as meta_data
#meta_data

In [4]:
# Select the transformer implementation to demo.
# `typ` names a submodule of the `verto` package; every branch below reads a
# `trans` object and a `meta` dict from that submodule.
import importlib

typ = 'dim4'

# Strings are compared by value (`==`). The identity operator `is` only works
# by accident of string interning and triggers a SyntaxWarning on Python 3.8+.
if typ == 'dim1':
    from verto.dim1 import trans, meta
    trans.set_params(pca__n_components=3)
elif typ == 'dim3':
    from verto.dim3 import trans, meta
    trans.set_params(required_ev=0.8)  # slowly increase from 0.1 towards 100%
else:
    # Any other variant is loaded by name and used with its default parameters.
    tmp = importlib.import_module("verto." + typ)
    trans = tmp.trans
    meta = tmp.meta

In [5]:
# Display the metadata dict that ships with the selected transformer.
meta

{'id': 'dim4',
 'name': 'PCA BIC',
 'description': 'Number of components is determined by fitting a Lasso-Logit model and minimize the BIC criteron.',
 'keywords': ['dimensionality reduction',
  'principal component anlysis',
  'StandardScaler',
  'PCA',
  'BIC',
  'Logistic Regression',
  'statsmodels'],
 'feature_names_prefix': 'dim_bic'}

## Transform

In [6]:
%%time
# Fit the selected transformer on the training split; the labels are passed along with the features.
trans.fit(X_train, Y_train)

CPU times: user 127 ms, sys: 12 ms, total: 139 ms
Wall time: 73.7 ms


In [7]:
%%time
# Apply the fitted transformer to the training features.
X_new = trans.transform(X_train)

CPU times: user 1.3 ms, sys: 567 µs, total: 1.86 ms
Wall time: 737 µs


In [8]:
from seasalt import create_feature_names
# Build column names from the prefix declared in `meta` and the number of
# columns in the transformed matrix (X_new.shape[1]).
feature_names = create_feature_names(meta['feature_names_prefix'], X_new.shape[1])
print(feature_names)

['dim_bic_0', 'dim_bic_1', 'dim_bic_2', 'dim_bic_3', 'dim_bic_4', 'dim_bic_5']


In [9]:
import pandas as pd
# Wrap the transformed matrix in a DataFrame labelled with the generated feature names.
df_new = pd.DataFrame(data=X_new, columns=feature_names)

## Evaluate
- check if the PCA components are "good" predictors
- eyeball the p-values of the logistic regression coefficients

In [10]:
# Preview the first rows of the transformed features.
df_new.head()

Unnamed: 0,dim_bic_0,dim_bic_1,dim_bic_2,dim_bic_3,dim_bic_4,dim_bic_5
0,-3.083984,-2.158203,-0.339844,-0.216675,1.544922,0.258057
1,-2.183594,-0.617676,0.447266,-0.150879,2.859375,-0.353027
2,2.050781,2.328125,1.168945,-1.814453,-0.115601,0.280762
3,-2.808594,-0.950684,-0.305176,-0.380859,0.546875,1.203125
4,-1.913086,-1.072266,0.498291,1.311523,-0.204224,0.064941


In [11]:
import statsmodels.api as sm
# Unregularized fit, kept for comparison with the penalized fit below:
#lr = sm.Logit(Y_train, sm.add_constant(X_new)).fit()
# Logit of Y_train on the transformed features plus a constant column
# (sm.add_constant), fitted with an L1 penalty (method='l1', alpha=.5).
lr = sm.Logit(Y_train, sm.add_constant(X_new)).fit_regularized(method='l1', alpha=.5)
# The summary table lists the coefficients and p-values to eyeball.
print(lr.summary())

Optimization terminated successfully.    (Exit mode 0)
            Current function value: 0.09103455213203748
            Iterations: 59
            Function evaluations: 59
            Gradient evaluations: 59
                           Logit Regression Results                           
Dep. Variable:                      y   No. Observations:                  398
Model:                          Logit   Df Residuals:                      391
Method:                           MLE   Df Model:                            6
Date:                Wed, 03 Apr 2019   Pseudo R-squ.:                  0.8759
Time:                        17:11:28   Log-Likelihood:                -32.665
converged:                       True   LL-Null:                       -263.17
                                        LLR p-value:                 2.086e-96
                 coef    std err          z      P>|z|      [0.025      0.975]
-----------------------------------------------------------------------------