In [None]:
import numpy as np
import pandas as pd
import seaborn as sns

from sklearn import datasets
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer

import warnings
warnings.filterwarnings('ignore')

import matplotlib.pyplot as plt
import matplotlib.cm as cm
%matplotlib notebook

### For missing data, impute the mean value to obtain a first-order result.

In [None]:
# Load the interim dataset, then discard its 'Unnamed: 0' column
# (presumably an index column written by an earlier export — confirm).
df = pd.read_csv('../data/interim/Third_order_clean_confidential.csv')
df = df.drop(columns='Unnamed: 0')
# imp_mean = SimpleImputer(missing_values=np.nan, strategy='mean')

In [None]:
# Feature matrix: every non-object column except the target 'Enrolled'.
features = df.select_dtypes(exclude='object').drop(columns='Enrolled')

# Impute each column's own mean instead of a -999 sentinel. A sentinel far
# outside the observed range inflates the variance of any column with gaps,
# which then dominates the components once the data is standardised for PCA.
column_means = features.mean(numeric_only=True)

# A column with no observed values has no mean; fall back to 0 so it stays a
# constant column instead of carrying NaN into the scaler.
X = features.fillna(column_means).fillna(0)

# Target used to colour the biplot.
y = df.Enrolled

### Fit PCA Transform

In [None]:
# Standardise the features before PCA so that no column dominates the
# components purely because of the scale it is measured on.
scaler = StandardScaler()
X = scaler.fit_transform(X)

# No n_components given, so every component is kept; x_new holds the
# observations projected onto those components.
pca = PCA()
x_new = pca.fit_transform(X)

### Scree Plot

In [None]:
# Cumulative percentage of variance captured by the first k components.
cumulative_pct = np.cumsum(pca.explained_variance_ratio_) * 100

# Component counts start at 1. Plotting against the default 0-based index
# would place every point one position left of the count named on the x-axis.
component_counts = np.arange(1, len(cumulative_pct) + 1)

f, axes = plt.subplots(figsize=(8,6))
axes.plot(component_counts, cumulative_pct, 'o-')
axes.set_xlabel("Number of Components", size=15)
axes.set_ylabel("Percent of Data Explained", size=15)
axes.set_title('Scree Plot of Data', size=15);

In [None]:
# Show the fraction of total variance attributed to each fitted component
# (the same values that are accumulated for the scree plot).
pca.explained_variance_ratio_

### Biplot

In [None]:
def myplot(score, coeff, labels=None, n=None, c=None):
    """Draw a PCA biplot of the first two components on the current figure.

    Observations are drawn as a scatter of their first two scores, each axis
    divided by its own range, and the selected variables are overlaid as red
    arrows from the origin with a green text label near each arrow tip.

    Parameters
    ----------
    score : 2-D array
        Projected observations; only columns 0 and 1 are used.
    coeff : 2-D array
        One row per variable; columns 0 and 1 give the arrow end point.
    labels : sequence, optional
        Text for each variable, indexed like the rows of ``coeff``. When
        omitted, variables are labelled "Var1", "Var2", ...
    n : iterable of int, optional
        Row indices of ``coeff`` to draw. Defaults to every row.
    c : array-like, optional
        Values used to colour the scatter points. Defaults to the
        notebook-level variable ``y``, which must then exist.

    Returns
    -------
    None
        Everything is drawn through ``pyplot`` on the active figure.
    """
    xs = score[:, 0]
    ys = score[:, 1]

    # Identity test, not `== None`: comparing a NumPy array with None is
    # elementwise, and using that result in an `if` raises ValueError.
    if n is None:
        n = np.arange(coeff.shape[0])

    # Backward-compatible default: colour by the notebook-level target.
    if c is None:
        c = y

    # Divide each score axis by its range so the point cloud spans one unit.
    scalex = 1.0 / (xs.max() - xs.min())
    scaley = 1.0 / (ys.max() - ys.min())

    plt.scatter(xs * scalex, ys * scaley, c=c, alpha=0.3, cmap=cm.bone)
    plt.colorbar()

    for i in n:
        plt.arrow(0, 0, coeff[i, 0], coeff[i, 1], color='r', alpha=0.5)
        caption = "Var" + str(i + 1) if labels is None else labels[i]
        # Place the text 15% beyond the arrow tip so it does not sit on the head.
        plt.text(coeff[i, 0] * 1.15, coeff[i, 1] * 1.15, caption,
                 color='g', ha='center', va='center')

    plt.xlabel("PC{}".format(1))
    plt.ylabel("PC{}".format(2))

In [None]:
# Call the function; only the first two PCs are drawn.

# Labels must come from the same columns PCA was fitted on. The target
# 'Enrolled' was dropped before fitting, so leaving it in this list would shift
# the label of every feature positioned after it.
feature_names = (df.select_dtypes(exclude=['object'])
                   .drop(columns='Enrolled')
                   .columns.values)

# Open a dedicated figure so the biplot is not drawn onto whichever figure
# pyplot still considers active (e.g. the scree plot under an interactive backend).
plt.figure(figsize=(10,6))
myplot(score = x_new[:, :2],
       coeff = np.transpose(pca.components_[:2, :]),
       labels = feature_names,
       n = [3,4,5,6])
plt.savefig('../reports/figures/biplot.png')