<a href="https://colab.research.google.com/github/shanks847/EV-Grid-Integration-Study/blob/master/notebooks/Quantifying_Time_Series_Distances.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Install the third-party packages imported in the next cell
# (dtaidistance, pyts, python-Levenshtein); versions are not pinned.
!pip install dtaidistance pyts python-Levenshtein

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [None]:
import numpy as np
import pandas as pd
from scipy import stats
from scipy.signal import coherence
from statsmodels.tsa.stattools import acf
from pyts.approximation import SymbolicAggregateApproximation
from sklearn.utils import resample
from statsmodels.tsa.statespace.sarimax import SARIMAX
from scipy.signal import correlate
from scipy.signal import welch
from scipy.stats import f_oneway
from scipy.stats import linregress
from sklearn.metrics import mutual_info_score
from scipy.spatial.distance import euclidean
from dtaidistance import dtw
from dtaidistance import dtw_visualisation as dtwvis
from scipy.stats import entropy
from pyts.metrics.dtw import dtw
from Levenshtein import distance as lcs_distance


ModuleNotFoundError: ignored

In [None]:
# DISTANCES
def calculate_correlation(X, Y):
    """
    Return the Pearson correlation coefficient between two series.

    Parameters:
        X (array-like): First time series.
        Y (array-like): Second time series.

    Returns:
        float: The off-diagonal entry of the 2x2 matrix from ``np.corrcoef``.
    """
    corr_matrix = np.corrcoef(X, Y)
    return corr_matrix[0, 1]

def calculate_cross_correlation(X, Y):
    """
    Return the largest value of the cross-correlation of X with Y.

    Parameters:
        X (array-like): First time series.
        Y (array-like): Second time series.

    Returns:
        The maximum of ``scipy.signal.correlate(X, Y, mode='same')``.
        The value is not normalised, so it scales with the amplitudes
        of the inputs.
    """
    return np.max(correlate(X, Y, mode='same'))

def calculate_euclidean_distance(X, Y):
    """
    Return the Euclidean (L2) distance between two series.

    Parameters:
        X (numpy.ndarray): First time series.
        Y (numpy.ndarray): Second time series; ``X - Y`` must be defined.

    Returns:
        float: Square root of the sum of squared element-wise differences.
    """
    squared_differences = np.square(X - Y)
    return np.sqrt(np.sum(squared_differences))

def calculate_dtw_distance(X, Y):
    """
    Return the dynamic time warping distance between two series.

    The dtaidistance module is imported locally under its own alias because
    the notebook's import cell binds the bare name ``dtw`` twice: first to
    the ``dtaidistance.dtw`` module and afterwards to the
    ``pyts.metrics.dtw.dtw`` function. Once that cell has run in full,
    ``dtw.distance`` no longer resolves to dtaidistance.

    Parameters:
        X (array-like): First time series.
        Y (array-like): Second time series.

    Returns:
        float: Value returned by ``dtaidistance.dtw.distance``.
    """
    from dtaidistance import dtw as dtaidistance_dtw

    # dtaidistance works on sequences of doubles; convert once up front so
    # integer inputs behave the same as float inputs.
    series_x = np.asarray(X, dtype=np.double)
    series_y = np.asarray(Y, dtype=np.double)
    return dtaidistance_dtw.distance(series_x, series_y)

def calculate_cosine_similarity(X, Y):
    """
    Return the cosine of the angle between two series viewed as vectors.

    Parameters:
        X (numpy.ndarray): First time series.
        Y (numpy.ndarray): Second time series.

    Returns:
        float: ``dot(X, Y) / (|X| * |Y|)``. No special handling is done
        for zero-magnitude inputs.
    """
    magnitudes = [np.sqrt(np.sum(series**2)) for series in (X, Y)]
    return np.dot(X, Y) / (magnitudes[0] * magnitudes[1])

def calculate_mutual_information(X, Y):
    """
    Return the mutual information score between two series.

    Parameters:
        X (array-like): First series, passed as the first argument of
            ``sklearn.metrics.mutual_info_score``.
        Y (array-like): Second series, passed as the second argument.

    Returns:
        float: Whatever ``mutual_info_score(X, Y)`` returns.
    """
    # NOTE(review): mutual_info_score is a clustering metric; presumably it
    # treats every distinct value as its own label, so continuous series
    # would need binning first - confirm against the sklearn docs.
    score = mutual_info_score(X, Y)
    return score

def calculate_lcs_distance(X, Y):
    """
    Return a distance based on the longest common subsequence (LCS).

    The distance is ``len(X) + len(Y) - 2 * LCS(X, Y)``: the number of
    elements that are not part of the longest common subsequence. Elements
    are compared with ``==``.

    Parameters:
        X (sequence): First series; must support ``len`` and integer indexing.
        Y (sequence): Second series; must support ``len`` and integer indexing.

    Returns:
        int: The LCS-based distance.
    """
    n_rows, n_cols = len(X), len(Y)
    # Only the previous DP row is needed to fill in the current one.
    previous_row = [0] * (n_cols + 1)
    for row in range(n_rows):
        current_row = [0]
        for col in range(n_cols):
            if X[row] == Y[col]:
                current_row.append(previous_row[col] + 1)
            else:
                current_row.append(max(previous_row[col + 1], current_row[col]))
        previous_row = current_row
    return n_rows + n_cols - 2 * previous_row[n_cols]

def calculate_power_spectrum_distance(X, Y):
    """
    Return the Euclidean distance between the Welch power spectra of X and Y.

    Both spectra are estimated with the same segment length so that they have
    the same number of frequency bins. Choosing the segment length per series
    gives spectra of different lengths for unequal-length inputs, which
    cannot be subtracted.

    Parameters:
        X (array-like): First time series.
        Y (array-like): Second time series.

    Returns:
        float: L2 distance between the two power spectral density estimates.
    """
    # One shared segment length, capped at 256 samples.
    segment_length = min(len(X), len(Y), 256)
    _, X_power_spectrum = welch(X, nperseg=segment_length)
    _, Y_power_spectrum = welch(Y, nperseg=segment_length)
    return np.sqrt(np.sum((X_power_spectrum - Y_power_spectrum)**2))

def test_variance(X, Y):
    statistic, p_value = f_oneway(X, Y)
    return p_value

def calculate_combined_distance(X, Y, weights):
    """
    Return a weighted sum of nine individual measures computed on X and Y.

    The terms, in order, are: correlation, cross-correlation, Euclidean
    distance, DTW distance, cosine similarity, mutual information, LCS
    distance, Hurst exponent of X, and power-spectrum distance.

    Parameters:
        X (array-like): First time series.
        Y (array-like): Second time series.
        weights (array-like): One weight per term, in the order listed above.

    Returns:
        float: ``np.dot(weights, terms)``.
    """
    # NOTE(review): going by their names, several terms are similarity scores
    # (larger = more alike) while others are distances, and they are summed
    # unscaled - confirm this mix is intended.
    pairwise_measures = (
        calculate_correlation,
        calculate_cross_correlation,
        calculate_euclidean_distance,
        calculate_dtw_distance,
        calculate_cosine_similarity,
        calculate_mutual_information,
        calculate_lcs_distance,
    )
    terms = [measure(X, Y) for measure in pairwise_measures]
    # NOTE(review): calculate_hurst_exponent is not defined in the visible
    # part of this notebook - confirm where it comes from.
    terms.append(calculate_hurst_exponent(X))
    terms.append(calculate_power_spectrum_distance(X, Y))
    return np.dot(weights, terms)

def calculate_mutual_info(X, Y):
    """
    Return the mutual information score between two series.

    This has the same body as ``calculate_mutual_information``.

    Parameters:
        X (array-like): First series.
        Y (array-like): Second series.

    Returns:
        float: Whatever ``mutual_info_score(X, Y)`` returns.
    """
    score = mutual_info_score(X, Y)
    return score

def calculate_entropy(X):
    """
    Return ``scipy.stats.entropy`` of X using that function's defaults.

    Parameters:
        X (array-like): Values passed as the distribution argument.

    Returns:
        float: The entropy value.
    """
    entropy_value = entropy(X)
    return entropy_value

def calculate_sax_distance(X, Y, n_bins=4):
    """
    Return the DTW distance between the SAX representations of X and Y.

    Each series is discretised into ``n_bins`` symbols. The ordinal alphabet
    is requested so the symbols are integers that a numeric DTW can compare;
    the default alphabet yields letters, which cannot be differenced. The
    single row of each transformed array is extracted so that DTW receives
    one-dimensional series.

    Parameters:
        X (array-like): First time series (1-D).
        Y (array-like): Second time series (1-D).
        n_bins (int, optional): Number of SAX symbols. Default is 4.

    Returns:
        float: DTW score between the two symbol sequences.
    """
    # Local import: the bare name ``dtw`` is bound to two different objects
    # by the notebook's import cell, so refer to the pyts one explicitly.
    from pyts.metrics import dtw as pyts_dtw

    sax = SymbolicAggregateApproximation(n_bins=n_bins, alphabet='ordinal')
    # transform expects (n_samples, n_timestamps); wrap, then unwrap row 0.
    X_sax = sax.transform(np.expand_dims(X, axis=0))[0]
    Y_sax = sax.transform(np.expand_dims(Y, axis=0))[0]
    return pyts_dtw(X_sax, Y_sax)

def calculate_edit_distance(X, Y, normalized=True):
    """
    Return the Levenshtein (edit) distance between two sequences.

    The distance is computed element by element. Comparing ``str(X)`` with
    ``str(Y)`` instead would compare the printed representations, where
    multi-digit values count as several characters and NumPy abbreviates
    long arrays with "...", while the normalisation divides by the number
    of elements. Strings are still compared character by character.

    Parameters:
        X (sequence): First series; must support ``len`` and integer indexing.
        Y (sequence): Second series; must support ``len`` and integer indexing.
        normalized (bool, optional): If True (default), divide the distance
            by the length of the longer input so the result lies in [0, 1].

    Returns:
        int or float: Raw edit distance, or the normalised distance.
        Two empty inputs give 0 (0.0 when normalised).
    """
    len_x, len_y = len(X), len(Y)
    # Row 0 of the DP table: turning an empty prefix into Y[:j] costs j.
    previous_row = list(range(len_y + 1))
    for i in range(1, len_x + 1):
        current_row = [i]
        for j in range(1, len_y + 1):
            substitution_cost = 0 if X[i - 1] == Y[j - 1] else 1
            current_row.append(min(
                previous_row[j] + 1,                      # deletion
                current_row[j - 1] + 1,                   # insertion
                previous_row[j - 1] + substitution_cost,  # match / substitution
            ))
        previous_row = current_row
    distance = previous_row[len_y]
    if not normalized:
        return distance
    longest = max(len_x, len_y)
    # Guard against dividing by zero when both inputs are empty.
    return distance / longest if longest else 0.0




In [None]:
# HYPOTHESIS TESTS
def autocorrelation_test(X, Y, alpha=0.05, nlags=None):
    """
    Perform autocorrelation test on two time series.

    The sample autocorrelation functions of X and Y are computed up to the
    same lag and their values are compared with a two-sample t-test.

    The number of lags is deliberately kept well below the series length.
    With the standard mean-subtracted estimator, the autocorrelations at
    lags 1..n-1 always sum to -1/2, so over *all* lags the mean ACF is
    0.5/n for every series and a t-test on those means could never tell
    two equal-length series apart.

    Parameters:
        X (array-like): First time series.
        Y (array-like): Second time series.
        alpha (float, optional): Significance level. Default is 0.05.
        nlags (int, optional): Highest lag to include. Defaults to
            min(10*log10(n), n // 2) with n the shorter series length,
            and at least 1.

    Returns:
        str: Test result and interpretation.

    Raises:
        ValueError: If either series has fewer than two observations.
    """
    shortest = min(len(X), len(Y))
    if shortest < 2:
        raise ValueError("Both time series need at least two observations.")
    if nlags is None:
        nlags = max(1, min(int(10 * np.log10(shortest)), shortest // 2))
    # Same lag range for both series so the two samples are comparable.
    acf_X = acf(X, nlags=nlags)
    acf_Y = acf(Y, nlags=nlags)
    _, p_value = stats.ttest_ind(acf_X, acf_Y)
    if p_value < alpha:
        return "Reject the null hypothesis. The autocorrelation structures of the two time series are significantly different."
    else:
        return "Fail to reject the null hypothesis. The autocorrelation structures of the two time series are similar."

def waveform_distance_test(X, Y, alpha=0.05):
    """
    Perform waveform distance test on two time series.

    The DTW score between X and Y is centred and scaled by the average of
    the two self-scores (X with X, Y with Y), and the result is converted
    to a one-sided tail probability of the standard normal distribution.

    Parameters:
        X (array-like): First time series.
        Y (array-like): Second time series.
        alpha (float, optional): Significance level. Default is 0.05.

    Returns:
        str: Test result and interpretation.
    """
    # ``dtw`` is called as a function here; the import cell's last binding of
    # that name is pyts.metrics.dtw.dtw.
    cross_score = dtw(X, Y)
    self_score_X = dtw(X, X)
    self_score_Y = dtw(Y, Y)
    # NOTE(review): the DTW score of a series with itself is presumably 0,
    # which would make ``baseline`` (and the denominator below) zero -
    # confirm that this statistic is well defined.
    baseline = (self_score_X + self_score_Y) / 2
    z_score = (cross_score - baseline) / np.sqrt(baseline)
    p_value = stats.norm.sf(np.abs(z_score))
    if p_value < alpha:
        return "Reject the null hypothesis. The waveform distances of the two time series are significantly different."
    return "Fail to reject the null hypothesis. The waveform distances of the two time series are similar."

def bootstrap_test(X, Y, statistic, n_bootstrap=1000, alpha=0.05):
    """
    Perform bootstrap test on two time series.

    The observed effect is ``statistic(X) - statistic(Y)`` on the original
    data. Each bootstrap iteration resamples X and Y with replacement and
    recomputes that difference. The bootstrap differences are centred on the
    observed one to approximate the distribution under the null hypothesis
    of no difference. The two-sided p-value is the share of centred
    differences at least as extreme as the observed effect.

    Resampling is i.i.d., so temporal ordering within each series is not
    preserved; use an order-insensitive statistic such as a mean or variance.

    Parameters:
        X (array-like): First time series.
        Y (array-like): Second time series.
        statistic (function): Function to calculate the test statistic.
        n_bootstrap (int, optional): Number of bootstrap iterations. Default is 1000.
        alpha (float, optional): Significance level. Default is 0.05.

    Returns:
        str: Test result and interpretation.
    """
    X = np.asarray(X)
    Y = np.asarray(Y)
    # Local generator with a fixed seed: reproducible without reseeding
    # NumPy's global random state as a side effect.
    rng = np.random.default_rng(0)
    observed_difference = statistic(X) - statistic(Y)
    bootstrap_differences = np.empty(n_bootstrap)
    for i in range(n_bootstrap):
        X_bootstrap = rng.choice(X, size=len(X), replace=True)
        Y_bootstrap = rng.choice(Y, size=len(Y), replace=True)
        bootstrap_differences[i] = statistic(X_bootstrap) - statistic(Y_bootstrap)
    # Shift the bootstrap distribution so it is centred on zero (the null).
    centred_differences = bootstrap_differences - observed_difference
    p_value = np.mean(np.abs(centred_differences) >= np.abs(observed_difference))
    if p_value < alpha:
        return "Reject the null hypothesis. The two time series are significantly different based on the bootstrap test."
    else:
        return "Fail to reject the null hypothesis. The two time series are similar based on the bootstrap test."

def kolmogorov_smirnov_test(X, Y, alpha=0.05):
    """
    Perform Kolmogorov-Smirnov test on two time series.

    Runs the two-sample KS test (``scipy.stats.ks_2samp``) on the values of
    X and Y and compares its p-value with ``alpha``.

    Parameters:
        X (array-like): First time series.
        Y (array-like): Second time series.
        alpha (float, optional): Significance level. Default is 0.05.

    Returns:
        str: Test result and interpretation.
    """
    ks_result = stats.ks_2samp(X, Y)
    if ks_result.pvalue < alpha:
        return "Reject the null hypothesis. The two time series are significantly different based on the Kolmogorov-Smirnov test."
    return "Fail to reject the null hypothesis. The two time series are similar based on the Kolmogorov-Smirnov test."

def spectral_coherence_test(X, Y, fs, alpha=0.05):
    """
    Perform spectral coherence test on two time series.

    The coherence estimate from ``scipy.signal.coherence`` is averaged over
    all returned frequencies. That mean is then evaluated against a
    chi-square distribution whose degrees of freedom equal the number of
    frequencies.

    Parameters:
        X (array-like): First time series.
        Y (array-like): Second time series.
        fs (float): Sampling frequency of the time series.
        alpha (float, optional): Significance level. Default is 0.05.

    Returns:
        str: Test result and interpretation.
    """
    _, coherence_by_frequency = coherence(X, Y, fs=fs)
    mean_coherence = np.mean(coherence_by_frequency)
    n_frequencies = len(coherence_by_frequency)
    # NOTE(review): coherence values should lie in [0, 1], so the statistic
    # is at most 1 and the chi-square upper tail at such a small value stays
    # well above common alpha levels - this test presumably never rejects.
    # Confirm the intended null distribution.
    p_value = 1 - stats.chi2.cdf(mean_coherence, df=n_frequencies)
    if p_value < alpha:
        return "Reject the null hypothesis. The spectral coherence of the two time series is significantly different."
    return "Fail to reject the null hypothesis. The spectral coherence of the two time series is similar."

def nonlinear_measures_test(X, Y, alpha=0.05):
    """
    Perform nonlinear measures test on two time series.

    The ``lcss`` score between X and Y is centred and scaled by the average
    of the two self-scores (X with X, Y with Y), and the result is converted
    to a one-sided tail probability of the standard normal distribution.

    Parameters:
        X (array-like): First time series.
        Y (array-like): Second time series.
        alpha (float, optional): Significance level. Default is 0.05.

    Returns:
        str: Test result and interpretation.
    """
    # NOTE(review): ``lcss`` is neither imported nor defined in the visible
    # part of this notebook - confirm where it comes from and whether it
    # returns a similarity (non-zero for identical inputs) or a distance.
    cross_score = lcss(X, Y)
    self_score_X = lcss(X, X)
    self_score_Y = lcss(Y, Y)
    baseline = (self_score_X + self_score_Y) / 2
    z_score = (cross_score - baseline) / np.sqrt(baseline)
    p_value = stats.norm.sf(np.abs(z_score))
    if p_value < alpha:
        return "Reject the null hypothesis. The nonlinear measures of the two time series are significantly different."
    return "Fail to reject the null hypothesis. The nonlinear measures of the two time series are similar."

def state_space_modeling_test(X, Y, alpha=0.05):
    """
    Perform state space modeling test on two time series.

    A SARIMAX model with ``order=(1, 0, 0)`` is fitted to each series, and
    the AIC values of the two fits are passed to a two-sample t-test.

    Parameters:
        X (array-like): First time series.
        Y (array-like): Second time series.
        alpha (float, optional): Significance level. Default is 0.05.

    Returns:
        str: Test result and interpretation.
    """
    ar1_order = (1, 0, 0)
    model_X = SARIMAX(X, order=ar1_order)
    model_Y = SARIMAX(Y, order=ar1_order)
    fit_X = model_X.fit()
    fit_Y = model_Y.fit()
    # NOTE(review): each AIC is a single number, so the t-test sees one
    # observation per group; presumably the p-value is NaN and the
    # comparison below is always False - confirm and pick a proper test.
    _, p_value = stats.ttest_ind(fit_X.aic, fit_Y.aic)
    if p_value < alpha:
        return "Reject the null hypothesis. The state space models of the two time series are significantly different."
    return "Fail to reject the null hypothesis. The state space models of the two time series are similar."


# Testing Distance Measurement

In [None]:
# Example usage: Y is exactly 2 * X, so the scale-invariant measures
# (correlation, cosine similarity) evaluate to 1.
X = np.array([1, 2, 3, 4, 5])
Y = np.array([2, 4, 6, 8, 10])
# Nine weights, one per term summed in calculate_combined_distance.
weights = np.array([0.2, 0.1, 0.3, 0.2, 0.1, 0.05, 0.05, 0.05, 0.05])

# Calculate individual distances
correlation = calculate_correlation(X, Y)
cross_correlation = calculate_cross_correlation(X, Y)
euclidean_distance = calculate_euclidean_distance(X, Y)
dtw_distance = calculate_dtw_distance(X, Y)
cosine_similarity = calculate_cosine_similarity(X, Y)
mutual_information = calculate_mutual_information(X, Y)
# NOTE(review): this rebinds the module-level name ``lcs_distance``, which
# the import cell bound to Levenshtein.distance. calculate_edit_distance
# looks that name up at call time, so after this cell runs it would find
# this number instead of the function.
lcs_distance = calculate_lcs_distance(X, Y)
power_spectrum_distance = calculate_power_spectrum_distance(X, Y)

# Calculate combined distance
combined_distance = calculate_combined_distance(X, Y, weights)


In [None]:
# Collect the individual measures under display labels for the summary table.
# FIXME(review): ``hurst_exponent`` is not assigned in any visible cell of this
# notebook, so this cell raises NameError on a fresh kernel - define it (or
# drop the entry) before running.
distances = {
    'Correlation': correlation,
    'Cross Correlation': cross_correlation,
    'Euclidean Distance': euclidean_distance,
    'DTW Distance': dtw_distance,
    'Cosine Similarity': cosine_similarity,
    'Mutual Information': mutual_information,
    'LCS Distance': lcs_distance,
    'Hurst Exponent': hurst_exponent,
    'PSD Distance': power_spectrum_distance,
}


In [None]:
# Show the collected measures as a one-column table indexed by measure name.
pd.DataFrame.from_dict(distances, orient='index')

Unnamed: 0,0
Correlation,1.0
Corss Correlation,110.0
Euclidean Distance,7.416198
DTW Distance,6.082763
Cosine Similarity,1.0
Mutual Information,1.609438
LCS Distance,6.0
Hurst Exponent,999.0
PSD Distance,8.975587


# Hypothesis Testing

In [None]:
# Two short series that differ only in their last element.
# X and Y are rebound here, replacing the arrays used in the distance example.
X = np.array([1, 2, 3, 4, 5])
Y = np.array([1, 2, 3, 4, 6])

# Each test returns a sentence stating the decision at the default alpha=0.05.
print("Autocorrelation test:")
print(autocorrelation_test(X, Y))
print()

print("Waveform distance test:")
print(waveform_distance_test(X, Y))
print()

# TODO: the remaining hypothesis tests defined above are not exercised here.
# Perform other hypothesis tests...

# Experimentation

# Further Work

I will use the following works to improve the analysis and make it more detailed.


1.   [Bag of recurrence patterns representation for time-series
classification](https://hal.science/hal-01774237)

2.   [Analyzing Multivariate Dynamics Using Cross-Recurrence Quantification Analysis (CRQA), Diagonal-Cross-Recurrence Profiles (DCRP), and Multidimensional Recurrence Quantification Analysis (MdRQA) – A Tutorial in R](https://doi.org/10.3389/fpsyg.2018.02232)

3. [Parametric recurrence quantification analysis of
autoregressive processes for pattern recognition in
multichannel electroencephalographic data](https://hal.science/hal-02921847)

4. [An Empirical Evaluation of Similarity Measures for Time Series Classification](https://arxiv.org/abs/1401.3973)

5. [On the Similarity and Dependence of Time Series](https://doi.org/10.3390/math9050550)

6. [A Competitive Measure to Assess the Similarity between Two Time Series](https://link.springer.com/chapter/10.1007/978-3-642-32986-9_31)

7. [A new similarity index for nonlinear signal analysis based on local
extrema patterns](https://doi.org/10.1016/j.physleta.2017.11.022)

8. See Reading list in the Notion Project Page 


I will attempt using the following work to bolster my analysis.