### This notebook standardizes every imputed and enriched data set and applies PCA, keeping enough components to explain 95% of the variance

In [9]:
import pandas as pd
from datetime import datetime
import numpy as np
from sklearn.decomposition import PCA
from sklearn import preprocessing

In [10]:
def read_csv(file):
    """Load a CSV into a DataFrame, echoing the source to stdout first.

    Parameters
    ----------
    file : path or file-like object accepted by ``pd.read_csv``.

    Returns
    -------
    pandas.DataFrame
        The first CSV column is used as the index, and pandas is asked to
        parse that index as dates (``parse_dates=True``).
    """
    # Echo the source so the notebook output shows which file is being handled.
    print(file)
    return pd.read_csv(file, index_col=0, parse_dates=True)

In [11]:
def scale(df, scaler=None):
    """Standardize the columns of ``df`` and return the result as a DataFrame.

    Parameters
    ----------
    df : pandas.DataFrame
        Data to scale. Its index and column labels are carried over to the
        result.
    scaler : fitted scaler object, optional
        An already-fitted object exposing ``transform`` (for example a
        ``StandardScaler`` fit on training data). When given, ``df`` is only
        transformed with it, so held-out data can be scaled with the
        training statistics. When omitted, a new ``StandardScaler`` is fit on
        ``df`` itself, which is the original behaviour.

    Returns
    -------
    pandas.DataFrame
        The scaled values with the same index and columns as ``df``.
    """
    if scaler is None:
        # No scaler supplied: learn the scaling parameters from this frame.
        scaler = preprocessing.StandardScaler()
        scaler.fit(df)
    array = scaler.transform(df)
    # transform() returns a bare array; restore the original labels.
    df_scaled = pd.DataFrame(data=array, index=df.index, columns=df.columns)
    return df_scaled

In [12]:
def descaler(df, scaler):
    """Undo a scaling step by applying ``scaler.inverse_transform`` to ``df``.

    Parameters
    ----------
    df : pandas.DataFrame
        Scaled data; its index and columns are reused for the result.
    scaler : object exposing ``inverse_transform``
        The scaler whose transformation should be reverted.

    Returns
    -------
    pandas.DataFrame
        The inverse-transformed values, labelled like ``df``.
    """
    restored = scaler.inverse_transform(df)
    # Re-attach the row/column labels that the array result does not carry.
    return pd.DataFrame(restored, index=df.index, columns=df.columns)

In [13]:
def calculate_number_components(df, threshold=0.95):
    """Return how many leading principal components reach ``threshold`` variance.

    A full PCA is fit on ``df`` and the explained-variance ratios are
    accumulated in the order PCA reports them until the running total is at
    least ``threshold``.

    Parameters
    ----------
    df : array-like accepted by ``PCA.fit``
        The (already scaled) data to analyse.
    threshold : float, optional
        Fraction of total variance to retain, in ``(0, 1]``. Defaults to
        ``0.95``, the value previously hard-coded here.

    Returns
    -------
    int
        Number of components needed. If the running total never reaches
        ``threshold``, every component is counted.

    Raises
    ------
    ValueError
        If ``threshold`` is not in ``(0, 1]``.
    """
    if not 0 < threshold <= 1:
        raise ValueError("threshold must be in the interval (0, 1]")

    pca = PCA(svd_solver='full')
    pca.fit(df)

    explained_var = 0
    num_components = 0
    for var in pca.explained_variance_ratio_:
        explained_var += var
        num_components += 1
        # Stop at the first component that pushes the total over the threshold.
        if explained_var >= threshold:
            break

    print(explained_var, " is the percentage of variance explained by ", num_components, " components.")
    return num_components

In [14]:
def apply_pca(df, df_test, num_components):
    """Fit a PCA on ``df`` and project both ``df`` and ``df_test`` onto it.

    Parameters
    ----------
    df : pandas.DataFrame or array-like
        Training data; the PCA is fit on this only.
    df_test : pandas.DataFrame or array-like
        Held-out data, projected with the PCA fit on ``df``.
    num_components : int
        Passed to ``PCA(n_components=...)``.

    Returns
    -------
    (pandas.DataFrame, pandas.DataFrame)
        Projected training and test data. Columns are the default integer
        labels. Rows keep the index of the corresponding input when it has
        one, so row labels are not replaced by 0..n-1 when the frames are
        later written out with their index.
    """
    pca = PCA(n_components=num_components)
    pca.fit(df)

    principalComponents = pca.transform(df)
    # getattr keeps plain-array inputs working: they fall back to a default index.
    principalDf = pd.DataFrame(data=principalComponents,
                               index=getattr(df, "index", None))

    principalComponents_test = pca.transform(df_test)
    principalDf_test = pd.DataFrame(data=principalComponents_test,
                                    index=getattr(df_test, "index", None))
    return principalDf, principalDf_test

In [15]:
# Base names of the imputed data sets; each has a "<name>_training_FE.csv" and
# a "<name>_test_FE.csv" input file.
files = ['KNNWeeklyImputed','MeanWeeklyImputed', 'MIIWeeklyImputed', 'MovingAverageWeeklyImputed', 'RegressionWeeklyImputed']
for file in files:
    df = read_csv(file+'_training_FE.csv')
    df_test = read_csv(file+'_test_FE.csv')

    # Fit the scaler on the training data only and reuse it for the test data.
    # Scaling the test set with its own statistics would put it on a different
    # scale than the data the PCA is fit on.
    scaler = preprocessing.StandardScaler().fit(df)
    df_scaled = pd.DataFrame(data=scaler.transform(df),
                             index=df.index, columns=df.columns)
    df_test_scaled = pd.DataFrame(data=scaler.transform(df_test),
                                  index=df_test.index, columns=df_test.columns)

    # The number of components is chosen from the training data alone.
    num_components = calculate_number_components(df_scaled)
    print("___________________")
    principalDf, testDf = apply_pca(df_scaled, df_test_scaled, num_components)
    principalDf.to_csv(file + '_PCA_training_FE.csv', index = True)
    testDf.to_csv(file + '_PCA_test_FE.csv', index = True)
    

KNNWeeklyImputed_training_FE.csv
KNNWeeklyImputed_test_FE.csv
0.9511360058248994  is the percentage of variance explained by  80  components.
___________________
MeanWeeklyImputed_training_FE.csv
MeanWeeklyImputed_test_FE.csv
0.9503438554151121  is the percentage of variance explained by  92  components.
___________________
MIIWeeklyImputed_training_FE.csv
MIIWeeklyImputed_test_FE.csv
0.9501726005056654  is the percentage of variance explained by  27  components.
___________________
MovingAverageWeeklyImputed_training_FE.csv
MovingAverageWeeklyImputed_test_FE.csv
0.9501059524334041  is the percentage of variance explained by  64  components.
___________________
RegressionWeeklyImputed_training_FE.csv
RegressionWeeklyImputed_test_FE.csv
0.951722703764459  is the percentage of variance explained by  57  components.
___________________
