# Table of Contents
 <p><div class="lev1"><a href="#Principal-Component-Analysis-on-Threshold-Crossing-Events-(PCA-on-TCE-Data)"><span class="toc-item-num">1&nbsp;&nbsp;</span>Principal Component Analysis on Threshold Crossing Events (PCA on TCE Data)</a></div><div class="lev2"><a href="#Load-Column-Names"><span class="toc-item-num">1.1&nbsp;&nbsp;</span>Load Column Names</a></div><div class="lev2"><a href="#Load-TCE-Data"><span class="toc-item-num">1.2&nbsp;&nbsp;</span>Load TCE Data</a></div><div class="lev2"><a href="#Drop-all-the-NaN-columns"><span class="toc-item-num">1.3&nbsp;&nbsp;</span>Drop all the NaN columns</a></div><div class="lev1"><a href="#PCA"><span class="toc-item-num">2&nbsp;&nbsp;</span>PCA</a></div>

# Principal Component Analysis on Threshold Crossing Events (PCA on TCE Data)

In [2]:
import pandas as pd
import numpy as np
%matplotlib inline
import matplotlib
import matplotlib.pyplot as plt
from sklearn.decomposition import PCA
from IPython.display import display, HTML
from sympy import init_printing, Matrix, symbols, sqrt
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import scale
init_printing(use_latex = 'mathjax')
# pd.set_option('max_columns', 50)
# pd.set_option('max_rows', 10)

In [3]:
pd.set_option('display.max_rows', 20)
pd.set_option('display.max_columns', 18)
pd.set_option('display.width', 500)

In [4]:
%qtconsole

## Load Column Names

TCE parameters are in the tec_cols.csv file. The parameters are used in the calculations are uncommented

In [8]:
tceColDescriptions_tmp = pd.read_table("../../data/raw/tce/tce_cols.csv", comment="#")
tceColDescriptions = tceColDescriptions_tmp.rename(columns={
        tceColDescriptions_tmp.columns[0]: 'data'
    })
tceColDescriptions['parameter'] = tceColDescriptions.data.apply(lambda x: x.split(':')[0])
tceColDescriptions['description'] = tceColDescriptions.data.apply(lambda x: x.split(':')[1])
def findUnits(val):
    sval = val.split('[')
    if sval[-1][-1] == ']':
        return sval[-1][:-1]

tceColDescriptions['units'] = tceColDescriptions.description.apply(findUnits)
tceColDescriptions['description'] = tceColDescriptions.description.apply(lambda x: x.split('[')[0])
tceColDescriptions['parameter'] = tceColDescriptions.parameter.apply(lambda x: x.split('COLUMN ')[1])
del tceColDescriptions['data']
del tceColDescriptions_tmp

IOError: File ../../data/raw/tce/tce_cols.csv does not exist

## Load TCE Data
Load only columns defined in the **_tceColDescriptions_** data frame

In [5]:
tceRawDf = pd.read_csv(
    "../../../data/total/q1_q17_dr24_tce.csv", 
    comment="#", 
    usecols=list(tceColDescriptions.parameter)
)
n_rows, n_cols = tceRawDf.shape

## Drop all the NaN columns 

Some parameters may not have any values

In [6]:
tceRawDf = tceRawDf.dropna(axis='columns', how='all')
tceRawDf[:1000].to_csv("../../../data/total/d.csv", index=True)
# tceRawDf = tceRawDf - tceRawDf.mean(axis=0)

In [7]:
display(tceRawDf.head())

Unnamed: 0,tce_period,tce_ror,tce_dor,tce_incl,tce_impact,tce_duration,tce_ingress,tce_depth,tce_eccen,...,tce_max_mult_ev,tce_bin_oedp_stat,tce_rmesmad,tce_rsnrmes,tce_rminmes,boot_fap,boot_mesthresh,boot_mesmean,boot_messtd
0,0.83185,0.002704,1.022012,25.04,0.926,2.392,0.04453,6.092,0,...,10.41,0.001097,2.416,0.2834,0.6137,1.848598e-57,2.271512,-4.180471,0.915663
1,1.09524,0.00544,3.834003,86.17,0.2563,2.122,0.01228,34.53,0,...,9.321,0.374,6.461,1.137,0.5694,9.717469e-16,8.228319,-0.24854,1.204526
2,0.719273,0.005357,2.606748,74.42,0.7,1.521,0.01581,29.68,0,...,9.099,8.453999999999999e-24,5.155,0.9124,0.4886,1.316926e-13,8.749945,-0.208627,1.272514
3,0.831833,0.004806,2.914834,88.13,0.09536,2.181,0.01053,27.1,0,...,10.03,1.251e-05,8.5,1.192,0.5317,3.886701e-45,2.252824,-5.596324,1.112148
4,622.408,0.011713,400.63642,90.0,0.00319,12.01,0.139,161.3,0,...,9.081,0.01448,12.73,0.4095,0.4821,7.372099e-13,9.047263,-1.496304,1.49981


# PCA

In [6]:
pca = PCA(n_components=70)
pca.fit(tceRawDf)

PCA(copy=True, n_components=70, whiten=False)

In [7]:
pca.explained_variance_

array([  6.33595101e+28,   2.45019746e+18,   1.56195600e+15,
         6.20839495e+13,   3.10990917e+12,   6.43194378e+11,
         1.21560825e+11,   1.06637167e+11,   1.25901982e+10,
         7.00330475e+09,   4.09011434e+09,   4.52914827e+08,
         2.60557047e+08,   3.57646942e+07,   1.68238557e+07,
         4.34640437e+06,   2.00075643e+06,   1.68782096e+06,
         1.25966334e+06,   3.54345014e+05,   1.62420410e+05,
         1.04769487e+05,   2.56253716e+04,   5.21889429e+03,
         4.25533350e+03,   2.07442918e+03,   1.95643515e+03,
         1.41561177e+03,   7.34128832e+02,   5.17815494e+02,
         3.33609395e+02,   2.72459323e+02,   1.33833965e+02,
         3.68607527e+01,   2.36503798e+01,   1.34486767e+01,
         1.13890889e+01,   1.10039238e+01,   9.37704403e+00,
         7.34992460e+00,   6.52989737e+00,   4.17309728e+00,
         1.83075142e+00,   8.54796416e-01,   6.40340577e-01,
         6.08262899e-01,   4.89375154e-01,   1.74497347e-01,
         7.33777557e-02,

In [8]:
pca.explained_variance_ratio_

array([  1.00000000e+00,   3.86713447e-11,   2.46522739e-14,
         9.79867890e-16,   4.90835419e-17,   1.01515049e-17,
         1.91858846e-18,   1.68304910e-18,   1.98710473e-19,
         1.10532811e-19,   6.45540714e-20,   7.14833222e-21,
         4.11235893e-21,   5.64472392e-22,   2.65530078e-22,
         6.85990842e-23,   3.15778393e-23,   2.66387943e-23,
         1.98812039e-23,   5.59260975e-24,   2.56347325e-24,
         1.65357161e-24,   4.04443967e-25,   8.23695493e-26,
         6.71617172e-26,   3.27406128e-26,   3.08783187e-26,
         2.23425302e-26,   1.15867189e-26,   8.17265622e-27,
         5.26534051e-27,   4.30021196e-27,   2.11229481e-27,
         5.81771428e-28,   3.73272770e-28,   2.12259795e-28,
         1.79753424e-28,   1.73674382e-28,   1.47997420e-28,
         1.16003495e-28,   1.03061046e-28,   6.58637870e-29,
         2.88946588e-29,   1.34912094e-29,   1.01064635e-29,
         9.60018312e-30,   7.72378375e-30,   2.75408296e-30,
         1.15811747e-30,