In [None]:
import glob
import umap
import umap.plot
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib as mpl
import matplotlib.pyplot as plt

from joblib import Parallel, delayed
from sklearn.manifold import TSNE
from sklearn.preprocessing import minmax_scale
from sklearn.decomposition import PCA
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis as LDA
from yellowbrick.cluster import KElbowVisualizer
from sklearn.cluster import KMeans, DBSCAN
from sklearn.neighbors import NearestNeighbors,KNeighborsClassifier, NeighborhoodComponentsAnalysis, KNeighborsRegressor, LocalOutlierFactor
from sklearn.cluster import AgglomerativeClustering

import plotly.io as plt_io
import plotly.graph_objects as go
%matplotlib inline

%config InlineBackend.figure_format = 'retina'

In [None]:
%%capture
!pip install umap-learn[plot]

In [None]:
# Mean `target` per time_id, averaged over every row that shares that time_id.
# The result is a Series indexed by time_id.
df_target = (
    pd.read_csv('/kaggle/input/optiver-realized-volatility-prediction/train.csv')
    .groupby('time_id')['target']
    .mean()
)

In [None]:
# Install hdbscan from a wheel shipped as an input dataset instead of from PyPI.
# Step 1: create a local directory that pip can use as a package source.
!mkdir -p /tmp/pip/cache/
# Step 2: copy the pre-built wheel into it.
# NOTE(review): the file name is tagged cp37 / linux_x86_64, so this presumably
# only installs on a Python 3.7 Linux kernel — confirm before changing the image.
!cp ../input/hdbscan0827-whl/hdbscan-0.8.27-cp37-cp37m-linux_x86_64.whl /tmp/pip/cache/
# Step 3: `--no-index` disables the package index, so pip resolves hdbscan only
# from the `--find-links` directory.
!pip install --no-index --find-links /tmp/pip/cache/ hdbscan

In [None]:
import hdbscan

In [None]:
# Build two scaled views of `target` and their correlation matrices:
#   train_index_time_id  : rows = time_id,  columns = stock_id
#   train_index_stock_id : rows = stock_id, columns = time_id
train = pd.read_csv('../input/optiver-realized-volatility-prediction/train.csv')

# Missing (stock_id, time_id) combinations become 0 before scaling.
train_index_time_id = train.pivot(index='time_id', columns='stock_id', values='target').fillna(0)
train_index_stock_id = train.pivot(index='stock_id', columns='time_id', values='target').fillna(0)

# minmax_scale hands back a bare array, so both the index AND the columns are
# re-attached. Without `columns=` the frames (and the correlation matrices
# derived from them) would be labelled 0..n-1 by position instead of by the
# real stock_id / time_id values.
train_index_time_id = pd.DataFrame(
    minmax_scale(train_index_time_id),
    index=train_index_time_id.index,
    columns=train_index_time_id.columns,
)
train_index_stock_id = pd.DataFrame(
    minmax_scale(train_index_stock_id),
    index=train_index_stock_id.index,
    columns=train_index_stock_id.columns,
)

# DataFrame.corr() correlates columns, so the time_id-indexed frame yields the
# stock-by-stock matrix and the stock_id-indexed frame the time-by-time matrix.
corr_stock_id = train_index_time_id.corr()
corr_time_id = train_index_stock_id.corr()

# K-distance Graph between STOCK_ID

In [None]:
# k-distance curve for the stock_id rows: the distance from every row to its
# second-ranked neighbour, sorted ascending.
# n_neighbors=2 is used because the rows being queried are the fitted rows, so
# the first-ranked neighbour is normally the row itself.
neigh = NearestNeighbors(n_neighbors=2)
nbrs = neigh.fit(train_index_stock_id)
distances, indices = nbrs.kneighbors(train_index_stock_id)
distances = np.sort(distances[:, 1])

fig, ax = plt.subplots(figsize=(20, 10))
ax.plot(distances)
ax.set_title('K-distance Graph', fontsize=20)
ax.set_xlabel('Data Points sorted by distance', fontsize=14)
ax.set_ylabel('Epsilon', fontsize=14)
plt.show()

# Choosing the number of clusters for KMeans

In [None]:
# Elbow analysis to pick the number of KMeans clusters for the stock_id rows.
# The estimator is seeded so the curve is reproducible across runs and
# consistent with the seeded KMeans fits used elsewhere in this notebook.
model = KMeans(random_state=0)
# `k=(2, 10)` is the candidate range of cluster counts.
# NOTE(review): Yellowbrick documents a 2-tuple as np.arange(k[0], k[1]),
# i.e. k = 2..9 — confirm against the installed version.
visualizer = KElbowVisualizer(model, k=(2, 10), timings=True)
visualizer.fit(train_index_stock_id)  # fit one model per candidate k
visualizer.show()

In [None]:
# Heat-map of the stock-by-stock correlation matrix, with one tick per column
# on both axes and a colour bar.
n_ticks = corr_stock_id.shape[1]
tick_labels = corr_stock_id.columns

f = plt.figure(figsize=(19, 15))
plt.matshow(corr_stock_id, fignum=f.number)  # draw into the figure created above
plt.xticks(range(n_ticks), tick_labels, fontsize=10, rotation=45)
plt.yticks(range(n_ticks), tick_labels, fontsize=10)
cb = plt.colorbar()
cb.ax.tick_params(labelsize=14)
plt.title('Correlation Matrix', fontsize=16);

# KMEANS

In [None]:
# Four-cluster KMeans on the scaled stock_id x time_id matrix; prints the
# cluster ids followed by the number of stocks assigned to each.
kmeans = KMeans(n_clusters=4, random_state=0)
kmeans.fit(train_index_stock_id.values)
u, counts = np.unique(kmeans.labels_, return_counts=True)
print(u, counts, sep='\n')

In [None]:
# Same four-cluster KMeans, this time using each row of the stock-by-stock
# correlation matrix as the feature vector. Prints cluster ids, then sizes.
kmeans = KMeans(n_clusters=4, random_state=0)
kmeans.fit(corr_stock_id.values)
u, counts = np.unique(kmeans.labels_, return_counts=True)
for summary in (u, counts):
    print(summary)

# HDBSCAN alone (no dimensionality reduction)

In [None]:
# HDBSCAN fitted directly on the rows of the stock-by-stock correlation matrix.
# Prints the distinct labels and how many stocks carry each one.
clusterer = hdbscan.HDBSCAN(
    prediction_data=True,
    cluster_selection_epsilon=12.5,
    cluster_selection_method='eom',
)
clusterer.fit(corr_stock_id)
u, counts = np.unique(clusterer.labels_, return_counts=True)
print(u, counts, sep='\n')

# UMAP + HDBSCAN

In [None]:
# Project the stock-by-stock correlation matrix to 2-D with a seeded UMAP.
reducer = umap.UMAP(random_state=42, n_components=2)
embedding = reducer.fit_transform(corr_stock_id)

# k-distance curve in the embedded space: distance from every point to its
# second-ranked neighbour (the first is normally the point itself), ascending.
neigh = NearestNeighbors(n_neighbors=2)
nbrs = neigh.fit(embedding)
distances, indices = nbrs.kneighbors(embedding)
distances = np.sort(distances[:, 1])

fig, ax = plt.subplots(figsize=(20, 10))
ax.plot(distances)
ax.set_title('K-distance Graph', fontsize=20)
ax.set_xlabel('Data Points sorted by distance', fontsize=14)
ax.set_ylabel('Epsilon', fontsize=14)
plt.show()

In [None]:
# Cluster the 2-D embedding with HDBSCAN and print label ids and their sizes.
clusterer2 = hdbscan.HDBSCAN(
    prediction_data=True,
    cluster_selection_epsilon=0.35,
    cluster_selection_method='eom',
)
clusterer2.fit(embedding)
u, counts = np.unique(clusterer2.labels_, return_counts=True)
print(u, counts, sep='\n')

# Scatter of the embedding (two components, so the transpose unpacks to x, y).
# NOTE(review): points are coloured by the frame's index on a log scale, not by
# the cluster labels computed above, and a log norm cannot represent values
# <= 0 — confirm this is the intended colouring.
plt.figure(figsize=(10, 8))
plt.scatter(
    *embedding.T,
    s=35,
    c=train_index_stock_id.index,
    edgecolors='none',
    cmap='jet',
    norm=mpl.colors.LogNorm(),
);


In [None]:
# Attach the HDBSCAN label of every stock to the frame, then print the index
# values belonging to each label, one list per label in ascending label order.
# NOTE(review): the labels are stored on the feature frame itself, so any model
# later fitted on `train_index_stock_id` sees 'label' as an input column unless
# it is dropped first.
train_index_stock_id['label'] = clusterer2.labels_
for _, members in train_index_stock_id.groupby('label'):
    print(list(members.index))


In [None]:
# Project the scaled stock_id x time_id matrix to 2-D with a seeded UMAP.
# An earlier cell writes cluster labels into a 'label' column of this frame
# (and the outlier cell writes 'test'); those are results, not features, so
# they are excluded here. `errors='ignore'` keeps the cell runnable whether or
# not those columns exist yet.
stock_features = train_index_stock_id.drop(columns=['label', 'test'], errors='ignore')
reducer = umap.UMAP(random_state=42, n_components=2)
embedding = reducer.fit_transform(stock_features)

# k-distance curve in the embedded space: distance from every point to its
# second-ranked neighbour (the first is normally the point itself), ascending.
neigh = NearestNeighbors(n_neighbors=2)
nbrs = neigh.fit(embedding)
distances, indices = nbrs.kneighbors(embedding)
distances = np.sort(distances, axis=0)
distances = distances[:, 1]

plt.figure(figsize=(20, 10))
plt.plot(distances)
plt.title('K-distance Graph', fontsize=20)
plt.xlabel('Data Points sorted by distance', fontsize=14)
plt.ylabel('Epsilon', fontsize=14)
plt.show()

In [None]:
# HDBSCAN on the current 2-D embedding; prints label ids, then label sizes.
hdbscan_params = dict(
    prediction_data=True,
    cluster_selection_epsilon=0.3,
    cluster_selection_method='eom',
)
clusterer2 = hdbscan.HDBSCAN(**hdbscan_params).fit(embedding)
u, counts = np.unique(clusterer2.labels_, return_counts=True)
for summary in (u, counts):
    print(summary)

# Scatter of the two embedding components.
# NOTE(review): colour encodes the frame's index on a log scale rather than the
# cluster labels, and a log norm cannot represent values <= 0 — confirm intent.
xs, ys = embedding[:, 0], embedding[:, 1]
plt.figure(figsize=(10, 8))
plt.scatter(xs, ys, s=35, c=train_index_stock_id.index, edgecolors='none', cmap='jet', norm=mpl.colors.LogNorm());


# KNN

In [None]:
# y_train = train_index_stock_id.index
# X_train = train_index_stock_id
# nca = NeighborhoodComponentsAnalysis(random_state=42)
# x_new = nca.fit_transform(X_train, y_train)

In [None]:
# Five nearest neighbours of every stock in the scaled target space.
# By this point the frame also carries a 'label' column (and 'test' if the
# outlier cell has been run); they are results, not features, so they are
# excluded from the distance computation.
stock_features = train_index_stock_id.drop(columns=['label', 'test'], errors='ignore')
neigh = NearestNeighbors(n_neighbors=5)
nbrs = neigh.fit(stock_features)
# `neighb` holds row positions into `stock_features`, one row of 5 per stock.
neighb = nbrs.kneighbors(stock_features, return_distance=False)

In [None]:
# Print the five neighbours of every stock, one line per stock.
# `neighb` contains row positions, which only coincide with stock_id when the
# ids are contiguous from 0; they are mapped through the index so the output
# always shows real stock_id values.
stock_ids = train_index_stock_id.index.to_numpy()
for neighbour_positions in neighb:
    print(*stock_ids[neighbour_positions[:5]])

# Outlier detection with LocalOutlierFactor (kNN-based)

stock_id

In [None]:
# Flag outlying stocks with Local Outlier Factor (5 neighbours).
# The result is stored in a 'test' column: -1 marks an outlier, 1 an inlier.
lof = LocalOutlierFactor(n_neighbors=5)
# Fit on the real features only: 'label' (cluster ids from an earlier cell) and
# 'test' (left over from a previous run of this cell) must not influence the
# outlier score.
stock_features = train_index_stock_id.drop(columns=['label', 'test'], errors='ignore')
# fit_predict is called once; a second call would only repeat the same fit.
train_index_stock_id['test'] = lof.fit_predict(stock_features)


In [None]:
# Index values (stock_id) of the rows whose 'test' flag is -1.
train_index_stock_id.loc[lambda frame: frame['test'].eq(-1)].index

time_id

In [None]:
# Flag outlying time_ids with Local Outlier Factor (5 neighbours) and display
# the ones marked -1 (outliers).
lof = LocalOutlierFactor(n_neighbors=5)
# Exclude a 'test' column left by a previous run of this cell so that
# re-running does not feed the old flags back in as a feature.
time_features = train_index_time_id.drop(columns=['test'], errors='ignore')
# fit_predict is called once; a second call would only repeat the same fit.
train_index_time_id['test'] = lof.fit_predict(time_features)
train_index_time_id[train_index_time_id['test'] == -1].index