# Dynamic Time Warping:

Dynamic Time Warping (DTW) is a path-searching algorithm. It finds the minimum-cost path through the complete matrix of pairwise distances between the points of two time series.

This matrix of pairwise distances is referred to as the cost matrix.


Low cost implies similarity, high cost implies dissimilarity. DTW finds a path through the cost matrix of minimum total cost. Each valid path through the cost matrix is called a “warping” path.




In [None]:
# Render the screenshot shipped in the "dtw-image" Kaggle input dataset inline
# in the notebook (the path only exists inside the Kaggle environment).
from IPython.display import Image
Image("/kaggle/input/dtw-image/Screenshot 2020-06-07 at 10.53.09.png")

In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

!pip install tslearn
import tslearn

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt
import seaborn as sns
from datetime import date
from tslearn.clustering import TimeSeriesKMeans 
from tslearn import metrics

from scipy.signal import hilbert, butter, filtfilt
from scipy.fftpack import fft,fftfreq,rfft,irfft,ifft
import numpy as np
import seaborn as sns
import pandas as pd
import scipy.stats as stats

import warnings
warnings.filterwarnings('ignore')

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Print the full path of every file found anywhere under the input directory.
input_files = (
    os.path.join(folder, name)
    for folder, _, names in os.walk('/kaggle/input')
    for name in names
)
for input_file in input_files:
    print(input_file)

seed = 0
np.random.seed(seed)

# You can write up to 5GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

# Reading Dataset

In [None]:
df_covid = pd.read_csv('/kaggle/input/corona-virus-report/covid_19_clean_complete.csv')

In [None]:
df_covid.head()

In [None]:
#Datatype conversion
df_covid.Date = pd.to_datetime(df_covid.Date)

In [None]:
#Checking the number of entries for each country
df_covid['Country/Region'].value_counts()

In [None]:
 # Keep the countries that appear in more than 500 rows.  The counts are
 # filtered with a boolean mask instead of reset_index()/query(): the name of
 # the column produced by reset_index() differs between pandas versions
 # ('index' before 2.0, 'Country/Region' from 2.0 on), so looking it up by the
 # literal 'index' raises KeyError on current pandas.
 country_counts = df_covid['Country/Region'].value_counts()
 selected_countries = pd.Series(country_counts[country_counts > 500].index, name='Country/Region')

In [None]:
# Rows belonging to the selected countries.  An explicit copy is taken because
# a column of this frame is assigned to later on; assigning into a boolean-mask
# slice of df_covid is a chained assignment whose effect pandas does not
# guarantee (SettingWithCopyWarning).
df_covid_filtered = df_covid[df_covid['Country/Region'].isin(selected_countries)].copy()

In [None]:
# Multi-line plot: one line of confirmed cases per selected country.

fig, ax1 = plt.subplots(nrows=1, ncols=1, figsize=(15, 15))
# Group by the bare column name rather than a one-element list: with a list
# key, pandas >= 2.0 hands back 1-tuples, and the legend would read
# "('China',)" instead of "China".
# NOTE(review): every row of a country is plotted as-is; if a country has
# several rows per date the line will zig-zag — confirm whether the values
# should be summed per date first.
for key, grp in df_covid_filtered.groupby('Country/Region'):
    ax1.plot(grp['Date'], grp['Confirmed'], label="{}".format(key))
plt.legend(loc='best')
plt.show()


In [None]:
# Make sure the filtered frame's Date column is datetime-typed.
# NOTE(review): df_covid.Date is already converted before the filtered frame is
# derived from it, so this looks redundant — confirm before removing.
df_covid_filtered['Date'] = pd.to_datetime(df_covid_filtered['Date'])

In [None]:

# Wide table built from the full (unfiltered) frame: one row per date, one
# column per country; rows sharing a date and country are summed.
df_covid_pivot = pd.pivot_table(
    df_covid.reset_index(),
    index='Date',
    columns='Country/Region',
    values='Confirmed',
    aggfunc='sum',
)

In [None]:
# Transpose so that each row holds one country's series and each column a date.
df_covid_pivot = df_covid_pivot.transpose()

In [None]:
df_covid_pivot.index

In [None]:
# Convert the frame to a 3-D array of shape (n_rows, n_columns, 1) by appending
# a trailing axis of length one to its values.
df_train = np.expand_dims(np.array(df_covid_pivot), axis=-1)

# Dynamic time warping - Time series clustering

In [None]:
plt.figure(figsize=(20,15))
sz = df_train.shape[1]

# DBA k-means: time-series k-means using the DTW metric.  One colour per
# cluster; the number of clusters follows the length of this list (the 2x2
# subplot grid below assumes four).
cluster_colors = ['blue', 'green', 'magenta', 'purple']
print("DBA k-means")
dba_km = TimeSeriesKMeans(n_clusters=len(cluster_colors),
                          n_init=2,
                          metric="dtw",
                          verbose=True,
                          max_iter_barycenter=10,
                          random_state=seed)
y_pred = dba_km.fit_predict(df_train)
labels = dict(zip(df_covid_pivot.index, y_pred))

for yi, cluster_color in enumerate(cluster_colors):
    plt.subplot(2, 2, 1 + yi)
    in_cluster = y_pred == yi
    # Attach every name to its own line through label=, so the legend cannot
    # drift out of step with the plotted lines (a positional list of names
    # would be matched against member and centre lines alike).
    for name, xx in zip(df_covid_pivot.index[in_cluster], df_train[in_cluster]):
        plt.plot(xx.ravel(), "-", alpha=.5, color=cluster_color, label=name)
    # Draw the cluster centre once, on top of its members.
    plt.plot(dba_km.cluster_centers_[yi].ravel(), "r-", label='cluster centre')
    plt.legend(loc='upper left')
    plt.xlim(0, sz)
    plt.ylim(-5, 1000000)
    plt.text(0.55, 0.85,'Cluster %d ' % (yi + 1),
             transform=plt.gca().transAxes)
    

# Euclidean distance - Timeseries clustering


The Euclidean distance between two time series is the square root of the sum of the squared point-to-point differences, i.e. of the squared lengths of the vertical hatch lines that join the two series at each time step.


In [None]:

plt.figure(figsize=(20,15))
sz = df_train.shape[1]

# Euclidean k-means: TimeSeriesKMeans with its default metric.  One colour per
# cluster; the number of clusters follows the length of this list (the 2x2
# subplot grid below assumes four).
cluster_colors = ['blue', 'green', 'magenta', 'purple']
print("Euclidean k-means")
km = TimeSeriesKMeans(n_clusters=len(cluster_colors), verbose=True, random_state=seed)
y_pred = km.fit_predict(df_train)
labels = dict(zip(df_covid_pivot.index, y_pred))

for yi, cluster_color in enumerate(cluster_colors):
    plt.subplot(2, 2, 1 + yi)
    in_cluster = y_pred == yi
    # Attach every name to its own line through label=, so the legend cannot
    # drift out of step with the plotted lines.
    for name, xx in zip(df_covid_pivot.index[in_cluster], df_train[in_cluster]):
        plt.plot(xx.ravel(), "-", alpha=.5, color=cluster_color, label=name)
    # Centre of the model fitted in THIS cell (km), drawn once per subplot —
    # not the centres of the separate DTW model.
    plt.plot(km.cluster_centers_[yi].ravel(), "r-", label='cluster centre')
    plt.legend(loc='upper left')
    plt.xlim(0, sz)
    plt.ylim(-5, 1000000)
    plt.text(0.55, 0.85,'Cluster %d ' % (yi + 1),
             transform=plt.gca().transAxes)
    

# Soft Dynamic Time Warping - Timeseries Clustering

Soft-DTW is a differentiable loss function, and both its value and gradient can be computed with quadratic time/space complexity (DTW has quadratic time but linear space complexity). 

In [None]:

plt.figure(figsize=(20,15))
sz = df_train.shape[1]

# Soft-DTW k-means.  One colour per cluster; the number of clusters follows the
# length of this list (the 2x2 subplot grid below assumes four).
cluster_colors = ['blue', 'green', 'magenta', 'purple']
print("Soft dtw k-means")
km = TimeSeriesKMeans(n_clusters=len(cluster_colors),
                      metric="softdtw",
                      metric_params={"gamma": .01},
                      verbose=True,
                      random_state=seed)
y_pred = km.fit_predict(df_train)
labels = dict(zip(df_covid_pivot.index, y_pred))

for yi, cluster_color in enumerate(cluster_colors):
    plt.subplot(2, 2, 1 + yi)
    in_cluster = y_pred == yi
    # Attach every name to its own line through label=, so the legend cannot
    # drift out of step with the plotted lines.
    for name, xx in zip(df_covid_pivot.index[in_cluster], df_train[in_cluster]):
        plt.plot(xx.ravel(), "-", alpha=.5, color=cluster_color, label=name)
    # Centre of the model fitted in THIS cell (km), drawn once per subplot —
    # not the centres of the separate DTW model.
    plt.plot(km.cluster_centers_[yi].ravel(), "r-", label='cluster centre')
    plt.legend(loc='upper left')
    plt.xlim(0, sz)
    plt.ylim(-5, 1000000)
    plt.text(0.55, 0.85,'Cluster %d ' % (yi + 1),
             transform=plt.gca().transAxes)
    

# Finding similarity between the clusters

In [None]:
# Work on a copy: columns added to the results frame must not appear in
# df_covid_pivot, whose rows are read again later as pure time series.
df_covid_cluster_results = df_covid_pivot.copy()

In [None]:
# Store the cluster label of every row in a new 'Cluster' column.  y_pred holds
# whatever the most recently executed fit_predict returned (the soft-DTW model
# when the notebook is run top to bottom).
df_covid_cluster_results['Cluster'] = y_pred

In [None]:
# Display the rows labelled as cluster 2, with every column except 'Cluster'.
df_covid_cluster_results.loc[df_covid_cluster_results.Cluster==2,df_covid_cluster_results.columns !='Cluster']

In [None]:
# Print the DTW distance for every unordered pair of clusters (i < j), computed
# on the member rows of each cluster with the 'Cluster' column excluded.
# NOTE(review): each argument is a (rows-in-cluster x remaining-columns) frame,
# so dtw_path presumably walks over the rows as its time axis — confirm this is
# the intended comparison rather than a distance between cluster centres.
series_columns = df_covid_cluster_results.columns != 'Cluster'
for i in range(4):
    members_i = df_covid_cluster_results.loc[df_covid_cluster_results.Cluster == i, series_columns]
    for j in range(i + 1, 4):
        members_j = df_covid_cluster_results.loc[df_covid_cluster_results.Cluster == j, series_columns]
        path, dist = metrics.dtw_path(members_i, members_j)
        print('Distance between cluster {} and {} is  {:.2f}'.format(i, j, dist))

# Phase Synchrony

Phase synchrony measures moment-to-moment synchrony between two signals. It can be somewhat subjective because you need to filter the data to the wavelength of interest, but you might have theoretical reasons for choosing such bands. To calculate phase synchrony, we need to extract the phase of the signal, which can be done with the Hilbert transform, which splits the signal into its phase and power. This allows us to assess whether two signals are in phase (moving up and down together) or out of phase.

If we want to compare the patterns of two specific countries and see how similar they are, we can use the phase synchrony approach. Let's pick Brazil and Russia, since DTW clustered these two countries together because of their similar patterns.

In [None]:
#Band pass filter
def butter_bandpass(lowcut, highcut, fs, order=5):
    """Design a Butterworth band-pass filter.

    Both cut-off frequencies are divided by the Nyquist frequency (half of
    ``fs``) before being passed to ``scipy.signal.butter``.

    Args:
        lowcut: Lower cut-off frequency, in the same units as ``fs``.
        highcut: Upper cut-off frequency, in the same units as ``fs``.
        fs: Sampling frequency.
        order: Filter order handed to ``butter``.

    Returns:
        The ``(b, a)`` coefficient pair returned by ``butter``.
    """
    nyquist = fs / 2
    band_edges = [lowcut / nyquist, highcut / nyquist]
    return butter(order, band_edges, btype='band')


def butter_bandpass_filter(data, lowcut, highcut, fs, order=5):
    """Band-pass filter ``data`` with a Butterworth filter.

    The coefficients come from ``butter_bandpass`` and are applied with
    ``scipy.signal.filtfilt``.

    Args:
        data: Sequence of samples to filter.
        lowcut: Lower cut-off frequency, in the same units as ``fs``.
        highcut: Upper cut-off frequency, in the same units as ``fs``.
        fs: Sampling frequency.
        order: Filter order forwarded to ``butter_bandpass``.

    Returns:
        The filtered signal as returned by ``filtfilt``.
    """
    numerator, denominator = butter_bandpass(lowcut, highcut, fs, order=order)
    return filtfilt(numerator, denominator, data)

# Band-pass settings used for the phase-synchrony comparisons.
# NOTE(review): the units are not stated anywhere in the notebook — fs = 30.
# presumably treats the daily series as 30 samples per month, which would make
# lowcut/highcut cycles per month; confirm.
lowcut  = .01
highcut = .5
fs = 30.
order = 1

In [None]:
# Phase synchrony between Brazil and Russia.
# Drop the 'Cluster' label column if an earlier cell added it to this frame;
# otherwise the cluster id would be filtered as if it were the last data point.
case_counts = df_covid_pivot.drop(columns='Cluster', errors='ignore')
y1 = butter_bandpass_filter(case_counts.loc['Brazil',:],lowcut=lowcut,highcut=highcut,fs=fs,order=order)
y2 = butter_bandpass_filter(case_counts.loc['Russia',:],lowcut=lowcut,highcut=highcut,fs=fs,order=order)

# Instantaneous phase angle (radians) of each filtered signal.
al1 = np.angle(hilbert(y1),deg=False)
al2 = np.angle(hilbert(y2),deg=False)
# 1 when the two phases coincide, falling to 0 when they are pi apart.
phase_synchrony = 1-np.sin(np.abs(al1-al2)/2)
N = len(al1)

# Plot results
f,ax = plt.subplots(3,1,figsize=(14,7),sharex=True)
ax[0].plot(y1,color='r',label='y1')
ax[0].plot(y2,color='b',label='y2')
ax[0].legend(bbox_to_anchor=(0., 1.02, 1., .102),ncol=2)
ax[0].set(xlim=[0,N], title='Filtered Timeseries Data')
ax[1].plot(al1,color='r')
ax[1].plot(al2,color='b')
ax[1].set(ylabel='Angle',title='Angle at each Timepoint',xlim=[0,N])
ax[2].plot(phase_synchrony)
ax[2].set(ylim=[0,1.1],xlim=[0,N],title='Instantaneous Phase Synchrony',xlabel='Time',ylabel='Phase Synchrony')
plt.tight_layout()
plt.show()

As we can see, the phase synchrony between these two countries is quite high: they show a similar pattern of increase in COVID cases, which is consistent with DTW having clustered them together.

In [None]:
phase_synchrony.mean()

Let's check the synchrony between Brazil and the US, which are in different clusters and have different patterns of increase.

In [None]:
# Phase synchrony between Brazil and the US.
# Drop the 'Cluster' label column if an earlier cell added it to this frame;
# otherwise the cluster id would be filtered as if it were the last data point.
case_counts = df_covid_pivot.drop(columns='Cluster', errors='ignore')
y1 = butter_bandpass_filter(case_counts.loc['Brazil',:],lowcut=lowcut,highcut=highcut,fs=fs,order=order)
y2 = butter_bandpass_filter(case_counts.loc['US',:],lowcut=lowcut,highcut=highcut,fs=fs,order=order)

# Instantaneous phase angle (radians) of each filtered signal.
al1 = np.angle(hilbert(y1),deg=False)
al2 = np.angle(hilbert(y2),deg=False)
# 1 when the two phases coincide, falling to 0 when they are pi apart.
phase_synchrony = 1-np.sin(np.abs(al1-al2)/2)
N = len(al1)

# Plot results
f,ax = plt.subplots(3,1,figsize=(14,7),sharex=True)
ax[0].plot(y1,color='r',label='y1')
ax[0].plot(y2,color='b',label='y2')
ax[0].legend(bbox_to_anchor=(0., 1.02, 1., .102),ncol=2)
ax[0].set(xlim=[0,N], title='Filtered Timeseries Data')
ax[1].plot(al1,color='r')
ax[1].plot(al2,color='b')
ax[1].set(ylabel='Angle',title='Angle at each Timepoint',xlim=[0,N])
ax[2].plot(phase_synchrony)
ax[2].set(ylim=[0,1.1],xlim=[0,N],title='Instantaneous Phase Synchrony',xlabel='Time',ylabel='Phase Synchrony')
plt.tight_layout()
plt.show()

In [None]:
# phase synchrony is lower as compared to same cluster countries

phase_synchrony.mean()

Ref:
https://towardsdatascience.com/four-ways-to-quantify-synchrony-between-time-series-data-b99136c4a9c9
