In [64]:
import pandas as pd
from datetime import datetime
import numpy as np
from sklearn.decomposition import PCA
from sklearn import preprocessing

In [65]:
def read_csv(file):
    df = pd.read_csv(file, parse_dates=True, index_col=0)
    df = df.loc['2000-01-01':'2015-01-01'] #Selecting training data
    return df

In [66]:
def scale(df):
    scaler = preprocessing.StandardScaler()
    scaler.fit(df)
    array = scaler.transform(df)
    df_scaled = pd.DataFrame(data=array, index=df.index, columns=df.columns)
    return df_scaled, scaler

In [67]:
def descaler(df, scaler):
    array = scaler.inverse_transform(df)
    df_final = pd.DataFrame(data=array, index=df.index, columns=df.columns)
    return df_final

In [68]:
def calculate_number_components(df):
    pca = PCA(svd_solver='full')
    pca.fit(df)
    pca.explained_variance_ratio_
    explained_var = 0
    num_components = 0
    for var in pca.explained_variance_ratio_:
        explained_var += var
        num_components +=1
        if explained_var >=0.95:
            break

    print(explained_var, num_components)
    return num_components

In [69]:
def apply_pca(df, num_components):
    pca = PCA(n_components=num_components)
    principalComponents = pca.fit_transform(df)
    principalDf = pd.DataFrame(data = principalComponents)
    return principalDf

In [71]:
files = ['KNNImputed.csv', 'KNNWeeklyImputed.csv','MeanWeeklyImputed.csv','MeanImputed.csv', 'MIImputed.csv', 'MIIWeeklyImputed.csv', 'MovingAverageWeeklyImputed.csv', 'MovingAverageWeeklyImputed.csv', 'RegressionImputed.csv', 'RegressionWeeklyImputed.csv']
for file in files:
    df = read_csv(file)
    df_scaled, scaler = scale(df)
    print(file)
    num_components = calculate_number_components(df_scaled)
    print("___________________")
    principalDf = apply_pca(df_scaled, num_components)
    output = file.split('.')
    principalDf.to_csv(output[0]+'PCA.'+output[1], index = True)
    

KNNImputed.csv
0.96073254145736 14
___________________
KNNWeeklyImputed.csv
0.9591116414751806 12
___________________
MeanWeeklyImputed.csv
0.9597093637409934 14
___________________
MeanImputed.csv
0.9591078203302558 14
___________________
MIImputed.csv
0.9631230808370136 5
___________________
MIIWeeklyImputed.csv
0.9529938401585641 6
___________________
MovingAverageWeeklyImputed.csv
0.9500378029800636 10
___________________
MovingAverageWeeklyImputed.csv
0.9500378029800636 10
___________________
RegressionImputed.csv
0.9572465199801969 10
___________________
RegressionWeeklyImputed.csv
0.9521290229332674 10
___________________
