In [None]:
import numpy as np
import matplotlib.pyplot as plt
from sklearn import datasets
from sklearn.decomposition import PCA
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer

import matplotlib.pyplot as plt
%matplotlib notebook

### Impute missing values with the column mean to obtain a first-order result.

In [None]:
# Read the interim dataset, then discard the leftover 'Unnamed: 0' column
# that the CSV carries.
df = pd.read_csv('../data/interim/Third_order_clean_confidential.csv')
df = df.drop(columns=['Unnamed: 0'])

# Imputer that replaces NaN entries (the default missing-value marker) with the column mean.
imp_mean = SimpleImputer(strategy='mean')

In [None]:
# Feature matrix: every non-object column of df, with gaps filled by the column mean.
# NOTE(review): if 'Enrolled' is itself a non-object column it is part of X as well
# as being the target below -- confirm that this is intended.
X = imp_mean.fit_transform(df.select_dtypes(exclude=['object']))

# Target column, used later to colour the biplot points.
y = df['Enrolled']

### Fit PCA Transform

In [None]:
# PCA is sensitive to feature scale, so standardise every column first.
scaler = StandardScaler()
X = scaler.fit_transform(X)

# Fit a PCA that keeps all components and project the standardised data onto them.
pca = PCA()
x_new = pca.fit_transform(X)

### Scree Plot

In [None]:
# Scree-style curve: percentage of total variance still unexplained after
# keeping the first k principal components.
f, axes = plt.subplots(figsize=(8,6))
unexplained_pct = (1 - np.cumsum(pca.explained_variance_ratio_)) * 100
# Entry i of the cumulative sum covers i+1 components, so the x axis starts at 1
# rather than at the implicit 0-based index.
component_counts = np.arange(1, len(unexplained_pct) + 1)
plt.plot(component_counts, unexplained_pct, 'o-');
plt.xlabel("Number of Components",size=15);
# The plotted quantity is the residual, so label it as unexplained variance.
plt.ylabel("Percent of Variance Unexplained",size=15);
plt.title('Scree Plot of Data',size=15);

In [None]:
# Display the fraction of total variance attributed to each principal component.
pca.explained_variance_ratio_

### Biplot

In [None]:
# Open a new 10x6 figure; the pyplot calls later in this cell draw on it.
plt.subplots(figsize=(10,6))
def myplot(score,coeff,labels=None):
    """Draw a PCA biplot on the current matplotlib figure.

    The first two columns of ``score`` are plotted as points, each axis
    rescaled by the reciprocal of its range. Each row of ``coeff`` is drawn
    as a red arrow from the origin to (column 0, column 1) with a green text
    label placed slightly beyond the arrow tip.

    Parameters
    ----------
    score : 2-D array
        Projected samples; only columns 0 and 1 are read.
    coeff : 2-D array
        One row per variable; only columns 0 and 1 are read.
    labels : sequence, optional
        Text for each row of ``coeff``. When omitted, rows are labelled
        "Var1", "Var2", ...

    Notes
    -----
    Point colours come from the notebook-level variable ``y``, which must
    exist before this function is called.
    """
    pc1_scores = score[:, 0]
    pc2_scores = score[:, 1]

    # Divide each axis by its range so the point cloud fits the fixed [-1, 1] window.
    pc1_scale = 1.0 / (pc1_scores.max() - pc1_scores.min())
    pc2_scale = 1.0 / (pc2_scores.max() - pc2_scores.min())
    plt.scatter(pc1_scores * pc1_scale, pc2_scores * pc2_scale, c = y)

    for idx in range(coeff.shape[0]):
        load_x = coeff[idx, 0]
        load_y = coeff[idx, 1]
        plt.arrow(0, 0, load_x, load_y, color = 'r', alpha = 0.5)
        # Fall back to a generic 1-based name when no labels were supplied.
        caption = "Var" + str(idx + 1) if labels is None else labels[idx]
        plt.text(load_x * 1.15, load_y * 1.15, caption, color = 'g', ha = 'center', va = 'center')

    plt.xlim(-1, 1)
    plt.ylim(-1, 1)
    plt.xlabel("PC{}".format(1))
    plt.ylabel("PC{}".format(2))
    plt.grid()

# Draw the biplot. All scores and loadings are handed over; myplot itself
# reads only the first two components. Labels are the non-object column names of df.
myplot(
    score=x_new,
    coeff=pca.components_.T,
    labels=df.select_dtypes(exclude=['object']).columns.values,
)
plt.savefig('../reports/figures/biplot.png')

In [None]:
# Loadings of the first principal component on every feature.
# pca.components_ has shape (n_components, n_features), so PC1 is ROW 0;
# column 0 would instead be the first feature's weight in every component.
pc1_loadings = pca.components_[0]

# Feature names taken from the same column selection the feature matrix was built from,
# so they line up with the loadings.
# NOTE(review): assumes the imputer did not drop any column -- confirm.
pc1_feature_names = df.select_dtypes(exclude=['object']).columns.values

# Keep features whose absolute PC1 loading exceeds 0.1. The target column is
# left out of the listing by name (rather than by dropping it from the frame,
# which would misalign the names with the loadings).
pc1_strong_mask = (np.abs(pc1_loadings) > 0.1) & (pc1_feature_names != 'Enrolled')

# Apply the same mask to both arrays so labels and values stay paired.
comp1_over_10_labels = pc1_feature_names[pc1_strong_mask]
comp1_over_10_ = pc1_loadings[pc1_strong_mask]