# DTW Survey

## Sample data preparation

In [1]:
%matplotlib inline
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

In [2]:
# Load the daily warranty counts (columns: model_cd, part_cd, date, n) from the interim feather file.
df_warranty_model_part_num_daily = pd.read_feather(
    "../data/interim/df_warranty_model_part_num_daily.feather"
)

  labels, = index.labels


In [3]:
# Parse the `date` column into pandas datetimes so it can be bucketed by month later on.
df_warranty_model_part_num_daily["date"] = pd.to_datetime(
    df_warranty_model_part_num_daily["date"]
)

In [4]:
# Preview the first rows of the daily table (model_cd is empty in the rows shown).
df_warranty_model_part_num_daily.head()

Unnamed: 0,model_cd,part_cd,date,n
0,,*0000-000,2003-01-22,1
1,,*1110-CT5,2000-06-16,1
2,,*1110-CT5,2001-07-06,1
3,,*1110-DC5,2001-01-10,1
4,,*1110-MA6,2000-02-05,1


In [5]:
# Aggregate the daily warranty counts into monthly totals per part.
# Only `part_cd` is filled with the "!Unknown" placeholder: a frame-wide fillna would
# also write that string into `date` / `n` if they ever contain missing values, which
# would break the datetime Grouper and the numeric sum.
df_warranty_part_num_monthly = (
    df_warranty_model_part_num_daily
    .fillna({"part_cd": "!Unknown"})
    # freq="MS" buckets each date into the first day of its month
    .groupby(["part_cd", pd.Grouper(key="date", freq="MS")])["n"]
    .sum()
    .reset_index()
)

In [6]:
# Preview the monthly table: one row per (part_cd, month start) with the summed count n.
df_warranty_part_num_monthly.head()

Unnamed: 0,part_cd,date,n
0,*0000-000,2002-05-01,1
1,*0000-000,2002-06-01,1
2,*0000-000,2002-11-01,1
3,*0000-000,2003-01-01,1
4,*0000-000,2003-05-01,1


In [7]:
# Total warranty count per part over the whole period, largest first.
# Note: groupby skips rows whose part_cd is missing, so those rows are not ranked here.
parts_top = (
    df_warranty_model_part_num_daily.groupby("part_cd")["n"]
    .sum()
    .sort_values(ascending=False)
)

In [8]:
# Show the five parts with the most warranty claims.
parts_top.head()

part_cd
27610-76G    340055
95411-82K    107101
14200-588    104781
95410-82K     85231
85104-82K     83442
Name: n, dtype: int64

In [9]:
# Keep only the monthly rows that belong to the 30 parts with the highest total count.
df_warranty_part_num_monthly_top30 = df_warranty_part_num_monthly[
    df_warranty_part_num_monthly["part_cd"].isin(parts_top.head(30).index)
]

In [10]:
# Preview the filtered monthly table (the original row index is kept).
df_warranty_part_num_monthly_top30.head()

Unnamed: 0,part_cd,date,n
11142,09262-350,1997-01-01,3
11143,09262-350,1997-02-01,6
11144,09262-350,1997-03-01,239
11145,09262-350,1997-04-01,336
11146,09262-350,1997-05-01,356


## DTW

Standard (exact) DTW. See [Wikipedia](https://en.wikipedia.org/wiki/Dynamic_time_warping) for a detailed description.  
Its computational complexity is $O(N^{2})$ in the series length $N$.

In [11]:
from dtw import dtw # you can install this module with `conda install dtw -c temporary-recipes`
from scipy.spatial.distance import euclidean # euclidean distance

In [12]:
# Reshape to wide form: one row per month, one column per part.
# Months in which a part has no record become 0.
df_warranty_part_num_monthly_top30_pivot = (
    df_warranty_part_num_monthly_top30
    .pivot_table(
        index="date",
        columns="part_cd",
        values="n",
        aggfunc="mean",  # pivot_table's default aggregation, spelled out
    )
    .fillna(0)
)

In [13]:
# Preview the wide month-by-part table.
df_warranty_part_num_monthly_top30_pivot.head()

part_cd,09262-350,09283-320,09283-350,14200-588,17400-588,17521-50M,17521-70G,27610-76G,33220-76G,33400-76G,...,55311-75F,55311-81M,84701-70K,84701-82K,84702-70K,85104-82K,95200-58J,95410-82K,95411-72M,95411-82K
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1997-01-01,3.0,3.0,4.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1997-02-01,6.0,14.0,50.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1997-03-01,239.0,305.0,862.0,0.0,0.0,0.0,7.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1997-04-01,336.0,386.0,1124.0,0.0,0.0,0.0,15.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1997-05-01,356.0,431.0,1267.0,0.0,0.0,0.0,20.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [14]:
# Compute DTW between two parts' monthly series, using Euclidean distance as the
# point-wise cost. dtw() returns the distance, the cost matrix, the accumulated
# cost matrix and the warping path.
(dtw_1, cost_matrix_1, acc_cost_matrix_1, path_1) = dtw(
    df_warranty_part_num_monthly_top30_pivot["09262-350"].fillna(0),
    df_warranty_part_num_monthly_top30_pivot["09283-320"].fillna(0),
    dist=euclidean,
)

In [15]:
# DTW distance between the two series.
# NOTE(review): the value looks length-normalised, since 9.7127 * (268 + 268) = 5206 exactly;
# confirm against the installed dtw release before comparing with other implementations.
dtw_1

9.712686567164178

In [16]:
# DTW warping path: a tuple of index arrays pairing positions of the two time series.
path_1

(array([  0,   1,   2,   3,   4,   4,   5,   6,   7,   8,   8,   8,   8,
          8,   9,  10,  11,  12,  13,  14,  15,  16,  17,  18,  19,  19,
         19,  20,  21,  22,  23,  24,  25,  26,  27,  28,  28,  28,  28,
         28,  28,  29,  30,  31,  32,  33,  34,  35,  36,  37,  38,  38,
         38,  38,  38,  38,  38,  39,  39,  40,  41,  42,  43,  44,  44,
         44,  44,  45,  46,  47,  48,  49,  50,  50,  51,  51,  52,  53,
         54,  55,  55,  55,  55,  55,  55,  56,  57,  58,  58,  58,  58,
         59,  60,  61,  62,  63,  64,  65,  65,  66,  67,  68,  69,  70,
         71,  72,  72,  72,  73,  74,  75,  76,  77,  78,  79,  80,  81,
         82,  83,  84,  85,  86,  87,  88,  89,  90,  91,  92,  93,  94,
         95,  96,  97,  98,  99, 100, 101, 102, 103, 104, 105, 106, 107,
        108, 109, 110, 111, 112, 113, 114, 115, 116, 117, 118, 119, 120,
        121, 122, 123, 124, 125, 126, 127, 128, 129, 130, 131, 132, 133,
        134, 135, 136, 137, 138, 139, 140, 141, 142

In [17]:
# DTW also works when the two series have different lengths: the second series is
# cut down to rows 1..99. `*_` discards the two cost matrices returned in between.
dtw_2, *_, path_2 = dtw(
    df_warranty_part_num_monthly_top30_pivot["09262-350"].fillna(0),
    df_warranty_part_num_monthly_top30_pivot["09283-320"].iloc[1:100].fillna(0),
    dist=euclidean,
)

In [18]:
# DTW distance for the unequal-length pair.
# NOTE(review): 14.9973 * (268 + 99) = 5504, consistent with a length-normalised value — confirm.
dtw_2

14.997275204359672

In [19]:
# Warping path for the unequal-length pair.
path_2

(array([  0,   1,   2,   3,   4,   4,   5,   6,   7,   8,   8,   8,   8,
          8,   9,  10,  11,  12,  13,  14,  15,  16,  17,  18,  19,  19,
         19,  20,  21,  22,  23,  24,  25,  26,  27,  28,  28,  28,  28,
         28,  28,  29,  30,  31,  32,  33,  34,  35,  36,  37,  38,  38,
         38,  38,  38,  38,  38,  39,  39,  40,  41,  42,  43,  44,  44,
         44,  44,  45,  46,  47,  48,  49,  50,  50,  51,  51,  52,  53,
         54,  55,  55,  55,  55,  55,  55,  56,  57,  58,  58,  58,  58,
         59,  60,  61,  62,  63,  64,  65,  65,  66,  67,  68,  69,  70,
         71,  72,  72,  72,  73,  74,  75,  76,  77,  78,  79,  80,  81,
         82,  83,  84,  85,  86,  87,  88,  89,  90,  91,  92,  93,  94,
         95,  96,  97,  98,  99, 100, 101, 102, 103, 104, 105, 106, 107,
        108, 109, 110, 111, 112, 113, 114, 115, 116, 117, 118, 119, 120,
        121, 122, 123, 124, 125, 126, 127, 128, 129, 130, 131, 132, 133,
        134, 135, 136, 137, 138, 139, 140, 141, 142

In [20]:
# Build two artificial multivariate series with two features per time step:
# series_a uses part columns 0 and 1 over all months,
# series_b uses part columns 2 and 3 over rows 1..99 only.
series_a = np.column_stack((
    df_warranty_part_num_monthly_top30_pivot.iloc[:, 0].fillna(0).values,
    df_warranty_part_num_monthly_top30_pivot.iloc[:, 1].fillna(0).values,
))
series_b = np.column_stack((
    df_warranty_part_num_monthly_top30_pivot.iloc[1:100, 2].fillna(0).values,
    df_warranty_part_num_monthly_top30_pivot.iloc[1:100, 3].fillna(0).values,
))

In [21]:
# Check the shapes: (time steps, features) for each series.
series_a.shape, series_b.shape

((268, 2), (99, 2))

In [22]:
# DTW can also be computed between multivariate series: here every time step is a
# 2-element row (see the shapes above) and the series lengths differ.
# `*_` discards the return values between the distance and the warping path.
dtw_3, *_, path_3 = dtw(series_a, series_b, dist=euclidean)

In [23]:
# DTW distance between the two multivariate series.
dtw_3

169.12354985651956

## FastDTW

FastDTW is an algorithm for computing DTW (Dynamic Time Warping) **approximately**.  
Its computational complexity is $O(N)$, whereas that of standard DTW is $O(N^{2})$.  
For further detail, please see the [original paper](https://pdfs.semanticscholar.org/05a2/0cde15e172fc82f32774dd0cf4fe5827cad2.pdf).

In [24]:
from fastdtw import fastdtw

In [25]:
# Approximate DTW (FastDTW) on the same pair of series used for dtw_1.
# fastdtw() returns only the distance and the warping path.
(dtw_4, path_4) = fastdtw(
    df_warranty_part_num_monthly_top30_pivot["09262-350"].fillna(0),
    df_warranty_part_num_monthly_top30_pivot["09283-320"].fillna(0),
    dist=euclidean,
)

In [26]:
# FastDTW distance; it differs from dtw_1 for two reasons:
#   * FastDTW is an approximation, so it can exceed the exact DTW cost;
#   * NOTE(review): dtw_1 looks normalised by len(x) + len(y). Un-normalised it is
#     9.7127 * 536 = 5206, the value to compare with the 5464 here — confirm.
dtw_4

5464.0

In [27]:
# FastDTW warping path: a list of (index in first series, index in second series) pairs.
path_4 

[(0, 0),
 (1, 1),
 (2, 2),
 (3, 2),
 (4, 2),
 (5, 2),
 (6, 2),
 (7, 2),
 (8, 3),
 (9, 4),
 (10, 5),
 (11, 5),
 (12, 5),
 (13, 6),
 (13, 7),
 (14, 8),
 (14, 9),
 (14, 10),
 (14, 11),
 (14, 12),
 (15, 13),
 (16, 14),
 (17, 15),
 (18, 16),
 (19, 17),
 (19, 18),
 (19, 19),
 (20, 20),
 (21, 20),
 (22, 20),
 (23, 20),
 (24, 20),
 (25, 20),
 (26, 21),
 (27, 22),
 (28, 23),
 (28, 24),
 (28, 25),
 (28, 26),
 (28, 27),
 (28, 28),
 (29, 29),
 (30, 29),
 (31, 29),
 (32, 30),
 (33, 31),
 (34, 32),
 (34, 33),
 (34, 34),
 (34, 35),
 (34, 36),
 (34, 37),
 (34, 38),
 (35, 39),
 (35, 40),
 (35, 41),
 (36, 42),
 (37, 43),
 (38, 44),
 (39, 45),
 (39, 46),
 (39, 47),
 (40, 48),
 (41, 48),
 (42, 48),
 (43, 48),
 (44, 48),
 (45, 48),
 (46, 49),
 (47, 49),
 (48, 49),
 (49, 49),
 (50, 50),
 (50, 51),
 (51, 52),
 (51, 53),
 (52, 54),
 (53, 54),
 (54, 55),
 (55, 56),
 (55, 57),
 (55, 58),
 (55, 59),
 (55, 60),
 (55, 61),
 (56, 62),
 (57, 63),
 (58, 64),
 (58, 65),
 (58, 66),
 (58, 67),
 (59, 68),
 (60, 69),
 (61

In [28]:
# FastDTW also accepts two series of different lengths
# (the second series is cut down to rows 1..99).
(dtw_5, path_5) = fastdtw(
    df_warranty_part_num_monthly_top30_pivot["09262-350"].fillna(0),
    df_warranty_part_num_monthly_top30_pivot["09283-320"].iloc[1:100].fillna(0),
    dist=euclidean,
)

In [29]:
# FastDTW can also be computed between multivariate series.
# fastdtw() returns exactly (distance, path), as in the two fastdtw calls above, so the
# result is unpacked directly. A `*_` catch-all would always bind an empty list and
# rebind `_`, which shadows IPython's "last output" variable.
dtw_6, path_6 = fastdtw(series_a, series_b, dist=euclidean)

## Scalable implementation

- Implementation by Spark https://github.com/zaratsian/dynamic_time_warping
- Implementation by Teradata https://docs.teradata.com/reader/AI5zLpFtwEQWIVvpnZgB8g/uUaUVLaXSRPTSh~ZmJ4RMw

I expect DTW can be implemented with a recursive aggregation query, but I couldn't find any web page describing one...