### This notebook applies PCA to every imputed and enriched data set (training and test) and saves the resulting principal components as CSV files

In [1]:
import pandas as pd
from datetime import datetime
import numpy as np
from sklearn.decomposition import PCA
from sklearn import preprocessing

In [2]:
def read_csv(file):
    """Load a CSV file into a DataFrame, echoing what is being read.

    The first column is used as the index and pandas is asked to parse
    that index as dates (``parse_dates=True``).

    Parameters
    ----------
    file : path or buffer
        Forwarded unchanged to ``pd.read_csv``.

    Returns
    -------
    pandas.DataFrame
        The parsed contents of ``file``.
    """
    print(file)  # progress trace: shows which data set is currently loading
    return pd.read_csv(file, index_col=0, parse_dates=True)

In [3]:
def scale(df, scaler=None):
    """Standardise the columns of ``df`` and return the result as a DataFrame.

    Parameters
    ----------
    df : pandas.DataFrame
        Data to scale. Its index and column labels are kept on the result.
    scaler : object with a ``transform`` method, optional
        An already-fitted scaler (for example a ``StandardScaler`` fitted on
        the training split). When given it is used as-is and is NOT re-fitted,
        so a test split can be scaled with the training statistics and the
        same object can later be passed to ``descaler``. When omitted, a new
        ``StandardScaler`` is fitted on ``df`` itself, which is the original
        behaviour of this function.

    Returns
    -------
    pandas.DataFrame
        Scaled values with the same index and columns as ``df``.
    """
    if scaler is None:
        # No scaler supplied: learn the scaling statistics from df itself.
        scaler = preprocessing.StandardScaler().fit(df)
    array = scaler.transform(df)
    return pd.DataFrame(data=array, index=df.index, columns=df.columns)

In [4]:
def descaler(df, scaler):
    """Undo a scaling step and return the data in its original units.

    Parameters
    ----------
    df : pandas.DataFrame
        Scaled data; its index and column labels are reused for the result.
    scaler : object with an ``inverse_transform`` method
        The scaler whose transformation should be reversed.

    Returns
    -------
    pandas.DataFrame
        Output of ``scaler.inverse_transform(df)`` re-wrapped with the
        index and columns of ``df``.
    """
    restored = scaler.inverse_transform(df)
    return pd.DataFrame(data=restored, index=df.index, columns=df.columns)

In [5]:
def calculate_number_components(df, variance_threshold=0.95):
    """Count the leading principal components needed to reach a variance target.

    A PCA with ``svd_solver='full'`` is fitted on ``df`` and its
    explained-variance ratios are accumulated in order until the running
    total reaches ``variance_threshold``. The running total and the count
    are printed before returning.

    Parameters
    ----------
    df : pandas.DataFrame or array-like
        Data to decompose; passed directly to ``PCA.fit``.
    variance_threshold : float, optional
        Fraction of the total variance (between 0 and 1) that the retained
        components must explain. Defaults to 0.95, the value that used to
        be hard-coded here.

    Returns
    -------
    int
        Number of components required. If the running total never reaches
        the threshold, this is the total number of components.
    """
    pca = PCA(svd_solver='full')
    pca.fit(df)

    explained_var = 0
    num_components = 0
    for var in pca.explained_variance_ratio_:
        explained_var += var
        num_components += 1
        if explained_var >= variance_threshold:
            # Enough variance is covered; later components are not needed.
            break

    print(explained_var, " is the percentage of variance explained by ", num_components, " components.")
    return num_components

In [6]:
def apply_pca(df, df_test, num_components):
    """Fit a PCA on the training data and project both splits onto it.

    The PCA is fitted on ``df`` only; ``df_test`` is merely transformed with
    the fitted model.

    Parameters
    ----------
    df : pandas.DataFrame or array-like
        Training data used to fit the PCA.
    df_test : pandas.DataFrame or array-like
        Test data projected with the PCA fitted on ``df``.
    num_components : int
        Passed to ``PCA`` as ``n_components``.

    Returns
    -------
    tuple of pandas.DataFrame
        ``(train_components, test_components)``. Columns are the default
        integer labels. Each frame keeps the row index of its input when the
        input has one, so the index survives into any CSV written from the
        result; inputs without an ``index`` attribute get a default index.
    """
    pca = PCA(n_components=num_components)
    pca.fit(df)

    principalComponents = pca.transform(df)
    # getattr keeps plain arrays (no .index) working exactly as before.
    principalDf = pd.DataFrame(data=principalComponents, index=getattr(df, "index", None))

    principalComponents_test = pca.transform(df_test)
    principalDf_test = pd.DataFrame(data=principalComponents_test, index=getattr(df_test, "index", None))

    return principalDf, principalDf_test

In [8]:
# Base names of the imputed data sets. For each name the loop reads
# '<name>_training_FE.csv' and '<name>_test_FE.csv' and writes
# '<name>_PCA_training_FE.csv' and '<name>_PCA_test_FE.csv'.
files = ['KNNWeeklyImputed','MeanWeeklyImputed', 'MIIWeeklyImputed', 'MovingAverageWeeklyImputed', 'MovingAverageImputed', 'RegressionImputed', 'RegressionWeeklyImputed']
for file in files:
    df = read_csv(file+'_training_FE.csv')
    df_test = read_csv(file+'_test_FE.csv')

    # Fit the scaler on the training split only and reuse it for the test
    # split. Scaling the test split with its own statistics would place it in
    # a different feature space from the one the PCA below is fitted in.
    scaler = preprocessing.StandardScaler().fit(df)
    df_scaled = pd.DataFrame(data=scaler.transform(df), index=df.index, columns=df.columns)
    df_test_scaled = pd.DataFrame(data=scaler.transform(df_test), index=df_test.index, columns=df_test.columns)

    # Number of components is chosen from the training split alone.
    num_components = calculate_number_components(df_scaled)
    print("___________________")
    principalDf, testDf = apply_pca(df_scaled, df_test_scaled, num_components)
    principalDf.to_csv(file + '_PCA_training_FE.csv', index = True)
    testDf.to_csv(file + '_PCA_test_FE.csv', index = True)
    

KNNWeeklyImputed
KNNWeeklyImputed_training.csv
KNNWeeklyImputed_test.csv
0.9511360058248994  is the percentage of variance explained by  80  components.
___________________
MeanWeeklyImputed
MeanWeeklyImputed_training.csv
MeanWeeklyImputed_test.csv
0.9503438554151121  is the percentage of variance explained by  92  components.
___________________
MIIWeeklyImputed
MIIWeeklyImputed_training.csv
MIIWeeklyImputed_test.csv
0.9501726005056654  is the percentage of variance explained by  27  components.
___________________
MovingAverageWeeklyImputed
MovingAverageWeeklyImputed_training.csv


FileNotFoundError: [Errno 2] File b'MovingAverageWeeklyImputed_training.csv' does not exist: b'MovingAverageWeeklyImputed_training.csv'