In [45]:
from imputationLibrary import forwardFilling, hotDeck, meanImputation, movingAverage, splineInterpolation, randomSampleImputation
from imputationLibrary.util import util
import pandas as pd
from datetime import datetime, timedelta
import numpy as np
from sklearn.cluster import AgglomerativeClustering
from sklearn import preprocessing
from scipy import signal
import matplotlib.pyplot as plt

In [46]:
# Path (without the '.csv' extension) of the weekly index table read below.
# The previous value had a stray backslash after the slash ('\C' is not a valid
# escape sequence), which only resolved on Windows; a plain forward slash
# works on every platform.
INPUT = 'output/CompleteWeeklyIndexes'

In [50]:
def calc_acc(col):
    """Return the self-correlation profile of one column.

    Missing values in ``col`` are replaced by 0, the values are scaled as a
    single row by ``preprocessing.normalize`` with its default settings, and
    the row is correlated with itself in 'full' mode.  Only the last
    ``len(col)`` values of that result are kept (the zero-lag term followed
    by the positive lags of the full self-correlation).

    Parameters
    ----------
    col : pandas.Series
        Column to analyse.

    Returns
    -------
    pandas.Series
        The retained correlation values, labelled with ``col.index``; the
        labels are reused from the input, each position holds one lag.
    """
    filled_values = np.array(col.fillna(0))
    # normalize() works on 2-D input, so the column is passed as one row.
    unit_row = preprocessing.normalize([filled_values])
    full_corr = signal.correlate(unit_row, unit_row, mode='full')

    n_keep = len(col.index)
    kept = full_corr.T[-n_keep:]
    return pd.Series(kept.reshape(-1), index=col.index)

In [51]:
# Read the table with its first CSV column as a parsed date index and leave
# out the 'vix_Close' column.
df = (
    pd.read_csv(INPUT + '.csv', index_col=0, parse_dates=True)
    .drop(columns='vix_Close')
)

# Label-based date slices: training rows up to 2015-01-01, test rows after it.
df_train = df.loc['2000-01-01':'2015-01-01']
df_test = df.loc['2015-01-02':]

# One self-correlation profile (see calc_acc) per training column.
df_train_acc = df_train.apply(calc_acc)

In [52]:
# Flip the profiles so that each row is one series and each column one
# position of its self-correlation profile.
df_train_T = df_train_acc.T

In [53]:
# Replace any remaining missing values with 0, then standardise every column
# of the transposed table.
df_train_0_imp = df_train_T.fillna(0)

scaler = preprocessing.StandardScaler()
array = scaler.fit_transform(df_train_0_imp)  # fit and transform on the same data

# Wrap the scaled values back into a frame carrying the original labels.
df_scaled = pd.DataFrame(
    array,
    index=df_train_T.index,
    columns=df_train_T.columns,
)

In [54]:
def _fit_complete_manhattan(data, n_clusters):
    """Fit complete-linkage agglomerative clustering with Manhattan distance.

    Parameters
    ----------
    data : array-like
        Rows to cluster.
    n_clusters : int
        Number of clusters to form.

    Returns
    -------
    AgglomerativeClustering
        The fitted estimator (its ``labels_`` hold one label per row).
    """
    # scikit-learn 1.2 introduced ``metric`` as the replacement for
    # ``affinity`` and 1.4 removed ``affinity`` altogether, so pass the
    # distance under whichever keyword the installed estimator accepts.
    accepted = AgglomerativeClustering().get_params()
    distance_kw = 'metric' if 'metric' in accepted else 'affinity'
    model = AgglomerativeClustering(
        n_clusters=n_clusters,
        linkage='complete',
        **{distance_kw: 'manhattan'},
    )
    return model.fit(data)


clustering_4 = _fit_complete_manhattan(df_scaled, n_clusters=4)
clustering_3 = _fit_complete_manhattan(df_scaled, n_clusters=3)

In [55]:
# Store the label each fitted model gave to every row as a new column of
# df_train_T (the table is modified in place).
for label_column, fitted_model in (
    ("clusters_4", clustering_4),
    ("clusters_3", clustering_3),
):
    df_train_T[label_column] = fitted_model.labels_

In [56]:
# List the row labels assigned to each of the four cluster labels (0-3);
# the printed headings are numbered from 1.
for label, heading in enumerate(("Cluster 1 ", "Cluster 2 ", "Cluster 3 ", "Cluster 4 ")):
    in_cluster = df_train_T['clusters_4'] == label
    print(heading, df_train_T.loc[in_cluster].index)

Cluster 1  Index(['sp500_Close', 'dji_Close', 'ndx_Close', 'ndx_Volume', 'n225_Close',
       'ftse_Close', 'hsi_Close', 'n100_Close'],
      dtype='object')
Cluster 2  Index(['vix_Volume', 'Overall EMV Tracker',
       'infectious_daily_infect_emv_index', 'GPR',
       'trade_US Trade Policy Uncertainty',
       'trade_Japanese Trade Policy Uncertainty',
       'trade_Trade Policy EMV Fraction'],
      dtype='object')
Cluster 3  Index(['sp500_Volume', 'dji_Volume', 'n225_Volume', 'ftse_Volume'], dtype='object')
Cluster 4  Index(['hsi_Volume', 'n100_Volume'], dtype='object')


In [57]:
# List the row labels assigned to each of the three cluster labels (0-2);
# the printed headings are numbered from 1.
for label, heading in enumerate(("Cluster 1 ", "Cluster 2 ", "Cluster 3 ")):
    in_cluster = df_train_T['clusters_3'] == label
    print(heading, df_train_T.loc[in_cluster].index)

Cluster 1  Index(['sp500_Volume', 'dji_Volume', 'n225_Volume', 'ftse_Volume',
       'hsi_Volume', 'n100_Volume'],
      dtype='object')
Cluster 2  Index(['vix_Volume', 'Overall EMV Tracker',
       'infectious_daily_infect_emv_index', 'GPR',
       'trade_US Trade Policy Uncertainty',
       'trade_Japanese Trade Policy Uncertainty',
       'trade_Trade Policy EMV Fraction'],
      dtype='object')
Cluster 3  Index(['sp500_Close', 'dji_Close', 'ndx_Close', 'ndx_Volume', 'n225_Close',
       'ftse_Close', 'hsi_Close', 'n100_Close'],
      dtype='object')
