### This notebook standardizes every imputed and enriched data set and applies PCA to it, keeping enough components to explain 95% of the training variance

In [1]:
import pandas as pd
from datetime import datetime
import numpy as np
from sklearn.decomposition import PCA
from sklearn import preprocessing

In [2]:
def read_csv(file):
    """Echo the path of a CSV file and load it into a DataFrame.

    Parameters
    ----------
    file : str
        Path of the CSV file to load.

    Returns
    -------
    pandas.DataFrame
        The file contents, with the first column used as the row index.
        ``parse_dates=True`` asks pandas to try to parse that index as dates.
    """
    # Printing the path gives a simple progress trace in the notebook output.
    print(file)
    return pd.read_csv(file, index_col=0, parse_dates=True)

In [3]:
def scale(df, df_test):
    """Standardize a training frame and a test frame with one shared scaler.

    The ``StandardScaler`` is fitted on ``df`` only and then applied to both
    frames, so the test data is transformed with the training statistics.

    Parameters
    ----------
    df : pandas.DataFrame
        Training data used to fit the scaler.
    df_test : pandas.DataFrame
        Test data transformed with the scaler fitted on ``df``.

    Returns
    -------
    tuple of pandas.DataFrame
        ``(df_scaled, df_test_scaled)``; each keeps the index and column
        labels of the frame it was derived from.
    """
    fitted_scaler = preprocessing.StandardScaler().fit(df)

    def _rescale(frame):
        # transform() returns a bare array, so re-attach the original labels.
        return pd.DataFrame(
            data=fitted_scaler.transform(frame),
            index=frame.index,
            columns=frame.columns,
        )

    return _rescale(df), _rescale(df_test)

In [4]:
def calculate_number_components(df, threshold=0.95):
    """Find how many leading principal components reach a variance target.

    A full PCA is fitted on ``df`` and the explained-variance ratios are
    accumulated in order until the running total is at least ``threshold``.

    Parameters
    ----------
    df : array-like or pandas.DataFrame
        Data the PCA is fitted on (the notebook passes the scaled training set).
    threshold : float, optional
        Fraction of total variance that must be explained, in ``(0, 1]``.
        Defaults to 0.95, the value that was previously hard-coded.

    Returns
    -------
    int
        Smallest number of leading components whose cumulative explained
        variance ratio is ``>= threshold``. If the target is never reached
        (e.g. floating-point round-off with ``threshold=1``), all components
        are counted.

    Raises
    ------
    ValueError
        If ``threshold`` is not in ``(0, 1]``.
    """
    if not (0 < threshold <= 1):
        raise ValueError("threshold must be in the interval (0, 1]")

    pca = PCA(svd_solver='full')
    pca.fit(df)

    # Running total of explained variance, component by component.
    cumulative = np.cumsum(pca.explained_variance_ratio_)
    reached = np.flatnonzero(cumulative >= threshold)
    # First position meeting the target, or every component if none does.
    num_components = int(reached[0]) + 1 if reached.size else len(cumulative)
    explained_var = cumulative[num_components - 1] if num_components else 0

    print(explained_var, " is the percentage of variance explained by ", num_components, " components.")
    return num_components

In [5]:
def apply_pca(df, df_test, num_components):
    """Project a training set and a test set onto principal components.

    The PCA is fitted on ``df`` only and then used to transform both inputs.

    Parameters
    ----------
    df : pandas.DataFrame or array-like
        Training data the PCA is fitted on.
    df_test : pandas.DataFrame or array-like
        Test data projected with the PCA fitted on ``df``.
    num_components : int
        Number of principal components to keep.

    Returns
    -------
    tuple of pandas.DataFrame
        ``(principalDf, principalDf_test)`` with one column per component
        (default integer column labels). When an input is a DataFrame its row
        index is carried over, so rows written to CSV stay identifiable;
        inputs without an index get the default 0..n-1 index.
    """
    pca = PCA(n_components=num_components)
    pca.fit(df)

    # transform() returns a bare array; re-attach the row labels of the input
    # (as scale() does) instead of silently replacing them with 0..n-1.
    principalDf = pd.DataFrame(
        data=pca.transform(df),
        index=getattr(df, 'index', None),
    )
    principalDf_test = pd.DataFrame(
        data=pca.transform(df_test),
        index=getattr(df_test, 'index', None),
    )
    return principalDf, principalDf_test

In [6]:
# Base names of the imputed data sets; each one is expected to have
# '<name>_training_FE.csv' and '<name>_test_FE.csv' in the working directory.
files = [
    'KNNWeeklyImputed',
    'MeanWeeklyImputed',
    'MIIWeeklyImputed',
    'MovingAverageWeeklyImputed',
    'RegressionWeeklyImputed',
]

for file in files:
    # Load the training split first, then the test split.
    df, df_test = (
        read_csv(file + suffix)
        for suffix in ('_training_FE.csv', '_test_FE.csv')
    )

    # Standardize both splits using statistics from the training split.
    df_scaled, df_test_scaled = scale(df, df_test)

    # Choose the component count from the scaled training data.
    num_components = calculate_number_components(df_scaled)
    print("___________________")

    # Project both splits and write the results next to the inputs.
    principalDf, testDf = apply_pca(df_scaled, df_test_scaled, num_components)
    for frame, suffix in (
        (principalDf, '_PCA_training_FE.csv'),
        (testDf, '_PCA_test_FE.csv'),
    ):
        frame.to_csv(file + suffix, index=True)
    

KNNWeeklyImputed_training_FE.csv
KNNWeeklyImputed_test_FE.csv
0.951136005825  is the percentage of variance explained by  80  components.
___________________
MeanWeeklyImputed_training_FE.csv
MeanWeeklyImputed_test_FE.csv
0.950343855415  is the percentage of variance explained by  92  components.
___________________
MIIWeeklyImputed_training_FE.csv
MIIWeeklyImputed_test_FE.csv
0.950510428493  is the percentage of variance explained by  89  components.
___________________
MovingAverageWeeklyImputed_training_FE.csv
MovingAverageWeeklyImputed_test_FE.csv
0.950239884274  is the percentage of variance explained by  63  components.
___________________
RegressionWeeklyImputed_training_FE.csv
RegressionWeeklyImputed_test_FE.csv
0.951727612001  is the percentage of variance explained by  57  components.
___________________
