In [None]:
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA

In [None]:
def do_PCA(file_name,window=[0,44640],num_components=-1,plot=False):
    
    '''
    Performs principal component analysis on a set of data for a given GLE.
    
    Arguments
    file_name:
    Path corresponding to the input data file. This file should be generated by NMDB_access_data.ipynb.
    window:
    A list of two minutes [start, stop) between which you wish to perform PCA. For example, window=[40000,44640]
    only considers the period from minute 40000 of the GLE's month to the final minute of a 31-day month.
    If stop lies beyond the last row of the file (e.g. the default window applied to a shorter month),
    it is clamped to the end of the data instead of raising an IndexError.
    num_components:
    Number of principal components to extract from the data. If -1, will extract as many components as possible.
    Within limitations of sklearn.decomposition.PCA, this is equal to the smaller of the number of rows in the
    window and the number of features of the input data.
    plot:
    Boolean. If true, will create a plot of each principal component. If false, does nothing. If there are a lot of components,
    plotting them all may eat up a lot of your computer's memory.
    
    Returns
    X_pca:
    An array of type np.ndarray whose columns correspond to chronologically-ordered data for each principal component,
    each row representing one timestep (one minute).
    
    Raises
    ValueError:
    If the window selects no rows, or if a column has no valid values inside the window (so its NaNs cannot be filled).
    '''

    import os #Only needed to strip directories from file_name when building plot titles

    X = pd.read_csv(file_name, index_col=0) #Read in the file for the month in question
    
    #Isolate and only consider the time range within the chosen window.
    #The end of the window is clamped to the number of rows actually present, so that the default window
    #(sized for a 31-day month) also works on shorter months.
    window_stop = min(window[1], len(X))
    range_to_take = np.arange(window[0], window_stop)
    if range_to_take.size == 0:
        raise ValueError('window={} selects no rows from {!r}, which has {} rows'.format(list(window), file_name, len(X)))
    X = X.take(range_to_take)
    
    #Interpolate all NaN values. If they can't be interpolated (i.e. at the beginning or end of a series), then try to forward-fill them, then try to backward-fill them
    X = X.interpolate(axis=0)
    X = X.ffill(axis=0) #DataFrame.fillna(method=...) is deprecated in recent pandas
    X = X.bfill(axis=0)
    
    #Anything still missing belongs to a column with no data at all in this window; PCA cannot handle NaN
    unfilled_columns = X.columns[X.isna().any(axis=0)]
    if len(unfilled_columns) > 0:
        raise ValueError('No valid data inside window={} for column(s): {}'.format(list(window), list(unfilled_columns)))
    
    #Scale the input data
    scaler = StandardScaler()
    scaler.fit(X)
    X_scaled = scaler.transform(X)
    
    #By default, find the maximum number of principal components possible.
    #PCA cannot return more components than min(number of rows, number of features).
    if num_components == -1:
        num_components = min(X_scaled.shape)
    
    #Instantiate, fit, and then perform PCA
    pca = PCA(n_components=num_components)
    pca.fit(X_scaled)
    X_pca = pca.transform(X_scaled)
    
    #Plot only if enabled when this function was called
    if plot:
        #The title slices assume a file name shaped like 'Nov_2003_NMDB.txt' (3-letter month, 4-digit year).
        #Slice the base name so that a directory prefix in file_name does not garble the title.
        base_name = os.path.basename(os.fspath(file_name))
        month_label = base_name[:-14]
        year_label = base_name[4:-9]
        for component in range(X_pca.shape[1]):
            plt.figure(figsize=(10,4))
            plt.plot(range_to_take,X_pca[:,component])
            plt.title('Principal component #{} of the {} {} GLE'.format(component+1,month_label,year_label))
            plt.xlabel('Time (minutes since beginning of month)',fontsize=12)
            plt.ylabel('Principal component {}'.format(component+1),fontsize=12)
    
    return X_pca

In [None]:
#Run PCA on minutes 2500 up to (not including) 3000 of the November 2003 file,
#keeping every available component (num_components defaults to -1) and plotting each one
do_PCA('Nov_2003_NMDB.txt', window=[2500,3000], plot=True)