In [None]:
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
import numpy as np

In [None]:
def _gle_month_year(file_name):
    '''
    Extracts the month and year labels used in the plot titles from a file name shaped like '<month>_<year>_NMDB.txt'.

    Any leading directory and the extension are dropped first, so 'data/Nov_2003_NMDB.txt' and 'Nov_2003_NMDB.txt' both give
    ('Nov', '2003'). If the name has no underscore, the bare stem is returned as the month and the year is left empty.
    '''
    import os  # Local import: this helper is the only place the module is needed

    stem = os.path.splitext(os.path.basename(str(file_name)))[0]
    pieces = stem.split('_')
    if len(pieces) >= 2:
        return pieces[0], pieces[1]
    return stem, ''


def autocorrelate(file_name, key_time, tau_range, plot_columns=None):
    '''
    Does autocorrelation analysis on each series in the input file. Each file should be created by the download_data function
    from NMDB_access_data.ipynb. Returns the discrete autocorrelation function for each series in the input file, and plots
    the functions that the user selects.

    Arguments
    file_name:
    The name of the file containing the timeseries you want to be autocorrelated. This file should have been created by the
    download_data function from NMDB_access_data.ipynb. It is read with pandas.read_csv, using the first column as the index.
    key_time:
    An integer that corresponds to the minute of the month you want to define the autocorrelation bounds with. This minute will
    become the right bound of the autocorrelation window (the window in which the two series are summed once multiplied).
    It is used as a row position in the file, and the row at key_time itself is excluded from the window.
    tau_range:
    A non-negative integer giving the number of values of tau to evaluate. The left bound of the autocorrelation window will
    be key_time-tau_range. Autocorrelation is done for tau=0, 1, ..., tau_range-1, in steps of one row.
    plot_columns:
    A list-like containing the string codenames of the stations whose autocorrelation functions you wish to plot. For instance,
    if plot_columns=['SOPO','INVK'], then the autocorrelation functions of the SOPO and INVK monitors for the corresponding year,
    month, and time range will be plotted. A single string is treated as a one-element list. By default, no plots will be created.

    Returns
    all_corr_values:
    A list of lists, one per column of the file and in the same order as the columns; each constituent list has tau_range
    elements corresponding to the autocorrelation values of the time series at the different values of tau. Each list of
    autocorrelation values can be plotted individually against tau.

    Raises
    ValueError:
    If tau_range is negative, or if key_time-tau_range is negative (the window would start before the first row; a negative
    slice bound would otherwise silently wrap around to the end of the series).
    '''
    if tau_range < 0:
        raise ValueError('tau_range must be non-negative, got {}'.format(tau_range))
    window_start = key_time - tau_range
    if window_start < 0:
        raise ValueError('key_time - tau_range must be non-negative, got {}'.format(window_start))

    #Normalise plot_columns: None means "plot nothing"; a bare string would otherwise be matched by substring
    if plot_columns is None:
        plot_columns = ()
    elif isinstance(plot_columns, str):
        plot_columns = (plot_columns,)

    X = pd.read_csv(file_name, index_col=0) #load in the requested file

    #Interpolate all NaN values. If they can't be interpolated (i.e. at the beginning or end of a series), then try to
    #forward-fill them, then try to backward-fill them. ffill()/bfill() replace the deprecated fillna(method=...) form.
    X = X.interpolate(axis=0).ffill(axis=0).bfill(axis=0)

    window = slice(window_start, key_time) #Positional row window; the same for every column and every tau
    context_before, context_after = 60, 100 #Extra rows shown on either side of the window in the data plot

    all_corr_values = [] #Initialize a list to return all the autocorrelation functions
    for column in X:
        series = X[column] - X[column].min() #Shift the series so that its minimum is zero, aka change of origin
        windowed = series.iloc[window] #The unshifted factor does not depend on tau

        #For each tau, multiply the window of the series by the same window of the series shifted tau rows earlier and sum.
        #Rows shifted in from beyond the end of the series are NaN and are skipped by sum().
        corr_values = [(series.shift(-tau).iloc[window] * windowed).sum() for tau in range(tau_range)]
        all_corr_values.append(corr_values) #Append this autocorrelation function to the list of all autocorrelation functions

        if column in plot_columns:
            month, year = _gle_month_year(file_name)

            corr_fig, corr_ax = plt.subplots(figsize=(10,3.8))
            corr_ax.plot(np.arange(len(corr_values)), corr_values)
            corr_ax.set_title('Autocorrelation as a function of $\\tau$ for {}, {} {} GLE'.format(column, month, year))
            corr_ax.set_xlabel('Timeshift $ \\tau $', size=14)
            corr_ax.set_ylabel('Correlation Factor $C(t, \\tau)$', size=14)

            #Plot the (gap-filled) data around the window. The left edge is clamped at 0 so that a window starting near the
            #beginning of the file does not produce a negative bound, which would wrap around to the end of the series.
            data_fig, data_ax = plt.subplots(figsize=(10,3.8))
            X[column].iloc[max(window_start - context_before, 0):key_time + context_after].plot(ax=data_ax)
            data_ax.set_title('Raw data for {}, {} {} GLE'.format(column, month, year))
            data_ax.set_xlabel('Time', size=14)
            data_ax.set_ylabel('Count/s', size=14)

    return all_corr_values

In [None]:
# Example run on the file 'Nov_2003_NMDB.txt': the window ends at row 2550, 160 values of tau
# are evaluated, and only the SOPO column is plotted.
autocorrelate(
    'Nov_2003_NMDB.txt',
    key_time=2550,
    tau_range=160,
    plot_columns=['SOPO'],
)