### This notebook applies PCA to every imputed and enriched data set, keeping enough components to explain at least 95% of the variance

In [17]:
import pandas as pd
from datetime import datetime
import numpy as np
from sklearn.decomposition import PCA
from sklearn import preprocessing

In [30]:
def read_csv(file):
    """Load a CSV file into a DataFrame, echoing the path being read.

    Parameters
    ----------
    file : str or path-like
        Location of the CSV file; it is printed before loading so the
        notebook output shows which file is being processed.

    Returns
    -------
    pandas.DataFrame
        The file contents, with the first column used as the index and
        pandas asked to parse that index as dates.
    """
    print(file)
    return pd.read_csv(file, index_col=0, parse_dates=True)

In [19]:
def scale(df):
    """Standardise every column of ``df`` with a freshly fitted StandardScaler.

    The scaler is fitted on ``df`` itself and is not returned, so the same
    fitted transformation cannot be re-applied to another frame (or undone)
    through this function.

    Parameters
    ----------
    df : pandas.DataFrame
        Data to standardise.

    Returns
    -------
    pandas.DataFrame
        The transformed values, carrying the index and column labels of ``df``.
    """
    standardizer = preprocessing.StandardScaler().fit(df)
    return pd.DataFrame(
        data=standardizer.transform(df),
        index=df.index,
        columns=df.columns,
    )

In [20]:
def descaler(df, scaler):
    """Undo a scaling step by applying ``scaler.inverse_transform`` to ``df``.

    Parameters
    ----------
    df : pandas.DataFrame
        Scaled data to map back.
    scaler : object
        Any object exposing ``inverse_transform``; presumably the fitted
        scaler that produced ``df`` (the caller is responsible for that).

    Returns
    -------
    pandas.DataFrame
        The inverse-transformed values, carrying the index and column labels
        of ``df``.
    """
    restored = scaler.inverse_transform(df)
    return pd.DataFrame(restored, index=df.index, columns=df.columns)

In [21]:
def calculate_number_components(df, threshold=0.95):
    """Find how many leading principal components reach a variance threshold.

    A full PCA is fitted on ``df`` and the explained-variance ratios are
    accumulated in component order until their running sum reaches
    ``threshold``. The reached sum and the component count are printed.

    Parameters
    ----------
    df : pandas.DataFrame or array-like
        Data to analyse; it is passed to ``PCA.fit`` unchanged, so any
        scaling must already have been applied by the caller.
    threshold : float, default 0.95
        Fraction of total variance (in ``(0, 1]``) that the retained
        components must explain.

    Returns
    -------
    int
        Number of components needed. If the running sum never reaches
        ``threshold``, this is the total number of components.

    Raises
    ------
    ValueError
        If ``threshold`` is not in the interval ``(0, 1]``.
    """
    if not 0 < threshold <= 1:
        raise ValueError("threshold must be in the interval (0, 1], got %r" % (threshold,))

    # svd_solver='full' computes every component, so the ratios cover all variance.
    pca = PCA(svd_solver='full')
    pca.fit(df)

    explained_var = 0
    num_components = 0
    for var in pca.explained_variance_ratio_:
        explained_var += var
        num_components += 1
        if explained_var >= threshold:
            break

    print(explained_var, " is the percentage of variance explained by ", num_components, " components.")
    return num_components

In [22]:
def apply_pca(df, df_test, num_components):
    """Fit a PCA on ``df`` and project both ``df`` and ``df_test`` with it.

    Parameters
    ----------
    df : pandas.DataFrame or array-like
        Data the PCA is fitted on.
    df_test : pandas.DataFrame or array-like
        Data that is only transformed with the PCA fitted on ``df``.
    num_components : int
        Value passed to ``PCA(n_components=...)``.

    Returns
    -------
    tuple of pandas.DataFrame
        ``(projected df, projected df_test)``. Both frames are built from the
        bare transformed arrays, so they carry default integer row and column
        labels rather than the index of the inputs.
    """
    reducer = PCA(n_components=num_components).fit(df)
    projected_train, projected_test = (
        pd.DataFrame(data=reducer.transform(frame)) for frame in (df, df_test)
    )
    return projected_train, projected_test

In [34]:
# Imputed training sets to reduce. Each one must have a sibling
# '<stem>_test.<ext>' file holding the matching test split.
files = [
    'KNNImputed.csv',
    'KNNWeeklyImputed.csv',
    'MeanWeeklyImputed.csv',
    'MeanImputed.csv',
    'MIImputed.csv',
    'MIIWeeklyImputed.csv',
    'MovingAverageWeeklyImputed.csv',
    'MovingAverageImputed.csv',
    'RegressionImputed.csv',
    'RegressionWeeklyImputed.csv',
]
for file in files:
    # Split once, on the last dot only, so every derived file name uses the
    # same stem/extension pair even if the stem itself contains dots.
    stem, extension = file.rsplit('.', 1)

    df = read_csv(file)
    df_test = read_csv(stem + '_test.' + extension)
    print(df_test.shape)

    # Fit the scaler on the training data only and reuse it for the test data.
    # Scaling the test set with its own statistics would put it in a different
    # coordinate system from the one the PCA below is fitted in.
    scaler = preprocessing.StandardScaler().fit(df)
    df_scaled = pd.DataFrame(data=scaler.transform(df), index=df.index, columns=df.columns)
    df_test_scaled = pd.DataFrame(
        data=scaler.transform(df_test), index=df_test.index, columns=df_test.columns
    )

    num_components = calculate_number_components(df_scaled)
    print("___________________")
    principalDf, testDf = apply_pca(df_scaled, df_test_scaled, num_components)

    # Outputs are written next to the inputs as '<stem>PCA.<ext>' and '<stem>PCA_test.<ext>'.
    principalDf.to_csv(stem + 'PCA.' + extension, index = True)
    testDf.to_csv(stem + 'PCA_test.' + extension, index = True)
    

KNNImputed.csv
KNNImputed_test.csv
(1903, 22)
0.96073254145736  is the percentage of variance explained by  14  components.
___________________
KNNWeeklyImputed.csv
KNNWeeklyImputed_test.csv
(272, 22)
0.9591116414751806  is the percentage of variance explained by  12  components.
___________________
MeanWeeklyImputed.csv
MeanWeeklyImputed_test.csv
(272, 22)
0.9597093637409934  is the percentage of variance explained by  14  components.
___________________
MeanImputed.csv
MeanImputed_test.csv
(1903, 22)
0.9591078203302558  is the percentage of variance explained by  14  components.
___________________
MIImputed.csv
MIImputed_test.csv
(1903, 44)
0.9631230808370136  is the percentage of variance explained by  5  components.
___________________
MIIWeeklyImputed.csv
MIIWeeklyImputed_test.csv
(272, 44)
0.9529938401585641  is the percentage of variance explained by  6  components.
___________________
MovingAverageWeeklyImputed.csv
MovingAverageWeeklyImputed_test.csv
(272, 22)
0.95003780298006