In [1]:
# K Nearest Neighbors (KNN) with Dynamic Time Warping (DTW)

import numpy as np
import pandas as pd
from scipy.spatial import distance
from sklearn.model_selection import train_test_split, KFold, TimeSeriesSplit
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import classification_report

# https://stackoverflow.com/questions/57015499/how-to-use-dynamic-time-warping-with-knn-in-python

# custom metric
def DTW(a, b):
    """Dynamic Time Warping (DTW) distance between two sequences.

    Both inputs are flattened to one dimension and compared element-wise
    with the default ``scipy.spatial.distance.cdist`` metric (Euclidean,
    i.e. absolute difference for scalars). The result is the cost of the
    cheapest monotone alignment path through the pairwise-distance table.

    Parameters
    ----------
    a, b : array-like
        Numeric sequences. Anything ``np.asarray`` can convert is accepted;
        multi-dimensional input is flattened. The two sequences may have
        different lengths.

    Returns
    -------
    float
        Accumulated alignment cost. ``inf`` when exactly one of the
        sequences is empty (no alignment path exists).
    """
    a = np.asarray(a, dtype=float).reshape(-1, 1)
    b = np.asarray(b, dtype=float).reshape(-1, 1)
    an, bn = a.shape[0], b.shape[0]
    pointwise_distance = distance.cdist(a, b)

    # Plain ndarray instead of np.matrix: np.matrix is discouraged by NumPy
    # and its scalar indexing is much slower inside the double loop below.
    # Row/column 0 is an "infinite cost" border so the recurrence needs no
    # special-casing at the edges; only the origin is reachable for free.
    cumdist = np.full((an + 1, bn + 1), np.inf)
    cumdist[0, 0] = 0.0

    for ai in range(an):
        for bi in range(bn):
            # Cheapest way to arrive here: from above, from the left, or
            # diagonally. np.min (not builtin min) so that a NaN in any
            # predecessor keeps propagating.
            minimum_cost = np.min([cumdist[ai, bi + 1],
                                   cumdist[ai + 1, bi],
                                   cumdist[ai, bi]])
            cumdist[ai + 1, bi + 1] = pointwise_distance[ai, bi] + minimum_cost

    return float(cumdist[an, bn])



In [2]:

# Number of trailing rows held out as the test set.
TEST_SIZE = 96

df_feature = pd.read_csv('../data/df_feature.csv')
X = pd.read_csv('../data/X_data_tr.csv', index_col='date', parse_dates=True)
y = pd.read_csv('../data/y_data_tr.csv', index_col='date', parse_dates=True)

# Keep only the variables flagged with select == 1 in the feature table.
# Bracket access on purpose: `df_feature.select` is attribute lookup, which
# resolves to the DataFrame.select *method* (not the column) on pandas
# versions that still ship it.
selected_features = df_feature.loc[df_feature['select'] == 1, 'variable'].tolist()

# Positional split that preserves row order: everything except the last
# TEST_SIZE rows is used for training, the last TEST_SIZE rows for testing.
X_train = X[selected_features].iloc[:-TEST_SIZE]
y_train = y['y_oecd'].iloc[:-TEST_SIZE]
X_test = X[selected_features].iloc[-TEST_SIZE:]
y_test = y['y_oecd'].iloc[-TEST_SIZE:]

In [3]:
# Train: grid-search the number of neighbours for a KNN classifier that
# uses the custom DTW function as its distance metric.
parameters = {'n_neighbors': [2, 3]}

# Cross-validation options noted for this search:
#   cv=TimeSeriesSplit(n_splits=3)
#   cv=KFold(n_splits=3, shuffle=False, random_state=14)
knn_dtw = KNeighborsClassifier(metric=DTW)
cv_splitter = TimeSeriesSplit(n_splits=3)

clf = GridSearchCV(
    estimator=knn_dtw,
    param_grid=parameters,
    cv=cv_splitter,
    verbose=3,
    n_jobs=-1,
)
clf.fit(X_train, y_train)



Fitting 3 folds for each of 2 candidates, totalling 6 fits


In [None]:
# Evaluate the fitted grid search on the last 96 rows.
# The hold-out slices are rebuilt here so this cell does not depend on
# whatever X_test / y_test currently hold in the kernel.
X_test = X[selected_features][-96:]
y_test = y['y_oecd'][-96:]

y_pred = clf.predict(X_test)
report = classification_report(y_test, y_pred)
print(report)

In [None]:
import matplotlib as mpl
import matplotlib.pyplot as plt

# Plot the y_agg column over the last 96 rows.
# Stored under its own name so that `y_test` (the y_oecd classification
# target used by the evaluation cell) is not silently replaced by a
# different column.
y_agg_test = y['y_agg'][-96:]

fig, ax = plt.subplots()
ax.plot(y_agg_test)
# x values come from the frame's index, which is loaded from the 'date' column.
ax.set(xlabel='date', ylabel='y_agg', title='y_agg, last 96 observations');

In [None]:
import pickle
from pathlib import Path

# Persist the fitted grid-search object to disk.
# NOTE: the estimator was built with metric=DTW, a function defined in this
# notebook; pickle stores functions by reference, so DTW must be defined
# under the same name before the file is loaded again.
model_path = Path('../result/clf_1.pkl')

# Create the output directory if needed so a long fit is not lost to a
# FileNotFoundError at save time.
model_path.parent.mkdir(parents=True, exist_ok=True)

with model_path.open('wb') as f:
    pickle.dump(clf, f)

In [None]:


# Count the non-null y_agg values within each y_oecd class.
y.groupby('y_oecd')['y_agg'].count()

In [None]:
## Test split = last 36 rows (36 months per the original note).
## Train/validation use the remaining rows (47 years per the original note).

from sklearn.model_selection import train_test_split, KFold, TimeSeriesSplit
kfold = KFold(n_splits=3, shuffle=False)

X_tv = X[selected_features][:-36]
X_test = X[selected_features][-36:]
y_tv = y['y_oecd'][:-36]
y_test = y['y_oecd'][-36:]

# Fold indices must be generated from the same data they are applied to:
# split the train/validation frame, not the 36-row test frame.
for train_idx, val_idx in kfold.split(X_tv):
    # KFold yields positional integer arrays, so index with .iloc.
    # Plain X_tv[train_idx] would be interpreted as column selection on a
    # DataFrame and fail.
    X_train, X_val = X_tv.iloc[train_idx], X_tv.iloc[val_idx]
    y_train, y_val = y_tv.iloc[train_idx], y_tv.iloc[val_idx]
    

                                    

In [None]:
# Show the signature and documentation of KFold.split.
# (Calling KFold.split() directly on the class with no arguments raises a
# TypeError, because the method needs an instance and the data to split.)
help(KFold.split)