In [1]:
import pandas as pd
import numpy as np
from functools import partial
from collections import OrderedDict
from sklearn import manifold
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from scipy.linalg import hankel, eigh
import plotly.graph_objects as go
from plotly.subplots import make_subplots

In [2]:
# Shared hyper-parameters for every manifold learner configured below.
n_neighbors = 10   # neighbourhood size for the neighbour-graph based methods
n_components = 1   # dimensionality of the embedding

# Set-up manifold methods.
# All hyper-parameters are passed by keyword: recent scikit-learn releases
# declare these constructor parameters keyword-only, so passing
# n_neighbors / n_components positionally raises a TypeError.
LLE = partial(manifold.LocallyLinearEmbedding,
              n_neighbors=n_neighbors, n_components=n_components,
              eigen_solver='auto')

# Display name -> unfitted estimator, kept in insertion order.
methods = OrderedDict()
methods['LLE'] = LLE(method='standard')
methods['LTSA'] = LLE(method='ltsa')
methods['Hessian LLE'] = LLE(method='hessian')
methods['Modified LLE'] = LLE(method='modified')
methods['Isomap'] = manifold.Isomap(n_neighbors=n_neighbors,
                                    n_components=n_components)
# random_state is fixed so that the MDS result is reproducible across reruns
# (the other seeded estimator below, t-SNE, already pins random_state=0).
methods['MDS'] = manifold.MDS(n_components=n_components, max_iter=100,
                              n_init=1, random_state=0)
methods['SE'] = manifold.SpectralEmbedding(n_components=n_components,
                                           n_neighbors=n_neighbors)
methods['t-SNE'] = manifold.TSNE(n_components=n_components, init='pca',
                                 random_state=0)

In [3]:
# Display the names of the configured embedding methods.
methods.keys()

odict_keys(['LLE', 'LTSA', 'Hessian LLE', 'Modified LLE', 'Isomap', 'MDS', 'SE', 't-SNE'])

In [4]:
# Training path: the file is read without a header row, its first line is
# skipped, and only columns 1 and 2 of the next 110 rows are kept.
df = pd.read_csv(
    'steps_train.txt',
    header=None,
    skiprows=1,
    nrows=110,
    usecols=[1, 2],
)

# Testing path: same layout, but 150 rows are read.
df_test = pd.read_csv(
    'steps_test.txt',
    header=None,
    skiprows=1,
    nrows=150,
    usecols=[1, 2],
)

## Locally Linear Embedding
Locally linear embedding (LLE) seeks a lower-dimensional projection of the data which preserves distances within local neighborhoods. It can be thought of as a series of local Principal Component Analyses which are globally compared to find the best non-linear embedding.

In [59]:
# Fit the LLE estimator on the training frame and keep its embedding.
method = methods['LLE']
X_train_data = method.fit_transform(df)
N = len(X_train_data)

# Window length of the trajectory matrix (alternative: L = N // 2).
L = 25

# Trajectory (Hankel) matrix built from the embedded training series.
X_train = hankel(X_train_data[:L], X_train_data[L - 1:])

# Eigen-decomposition of X_train @ X_train.T, re-ordered so that the
# largest eigenvalue (and its eigenvector) comes first.
eigenValues, eigenVectors = eigh(X_train @ X_train.T)
idx = np.argsort(eigenValues)[::-1]
eigenValues, eigenVectors = eigenValues[idx], eigenVectors[:, idx]

# Scree plot of (at most) the ten leading eigenvalues.
eigen_data = go.Scatter(
    x=list(range(len(eigenValues[:10]))),
    y=eigenValues[:10],
)
eigen_layout = go.Layout(
    title='ScreePlot',
    title_x=0.5,
    xaxis_title="No of eigen values",
    yaxis_title="eigen value",
)
fig = go.Figure(data=[eigen_data], layout=eigen_layout)
fig.show()

In [6]:
# Setting statistical dimension based on screeplot above
r = 2
# Performing singular value decomposition
U, Sigma, V = np.linalg.svd(X_train)
V = V.T
X_elem = np.array( [Sigma[i] * np.outer(U[:,i], V[:,i]) for i in range(0,r)] )
X_train_extracted = X_elem.sum(axis=0)
U = eigenVectors[:,:r] # r as statistical dimension
UT = U.T
pX = np.matmul(UT,X_train_extracted)
centroid = np.mean(pX, axis=1)
centroid = centroid[:,np.newaxis]
pXt = np.matmul(UT,X_train)
dt_matrix = centroid - pXt
dt_scores = np.linalg.norm(dt_matrix, axis=0, ord=2)

# Testing positional Deviation
X_test = method.fit_transform(df_test)
Xj = hankel(X_test[:L],X_test[L-1:])
pXj = np.matmul(UT, Xj)
dj_matrix = centroid - pXj
dj_scores = np.linalg.norm(dj_matrix, axis=0, ord=2)
dj_scores = np.asarray(dj_scores)


In [8]:
# 2x2 grid: training and testing paths on the top row, both anomaly-score
# curves in a single panel spanning the bottom row.
fig = make_subplots(rows=2, cols=2, specs=[[{}, {}], [{"colspan": 2}, None]],
    subplot_titles=("Training plot", "Testing plot", "Anomaly score by Locally Linear Embedding"))
fig.add_trace(go.Scatter(x=df[1], y=df[2], marker_color='blue',
                         name="Training path"), row=1, col=1)
# This trace shows the testing *path*, so it is named accordingly.
fig.add_trace(go.Scatter(x=df_test[1], y=df_test[2], marker_color='red',
                         name="Testing path"), row=1, col=2)
# The first and last 5 training scores are left out of the plot. The x values
# are sliced the same way so that x and y have equal length and every score
# is drawn at its own window index.
fig.add_trace(go.Scatter(x=list(range(len(dt_scores)))[5:-5], y=dt_scores[5:-5],
                         marker_color='blue', name="Training Score"), row=2, col=1)
fig.add_trace(go.Scatter(x=list(range(len(dj_scores))), y=dj_scores, marker_color='red',
                         name="Testing Score"), row=2, col=1)
fig.update_layout(showlegend=False)


## Multidimensional scaling (MDS)
Multidimensional scaling (MDS) seeks a low-dimensional representation of the data in which the distances respect well the distances in the original high-dimensional space.

In general, MDS is a technique used for analyzing similarity or dissimilarity data. It attempts to model similarity or dissimilarity data as distances in a geometric space. The data can be ratings of similarity between objects, interaction frequencies of molecules, or trade indices between countries.

There exist two types of MDS algorithm: metric and non-metric. In scikit-learn, the class MDS implements both. In metric MDS, the input similarity matrix arises from a metric (and thus respects the triangle inequality); the distances between two output points are then set to be as close as possible to the similarity or dissimilarity data. In the non-metric version, the algorithm tries to preserve the order of the distances, and hence seeks a monotonic relationship between the distances in the embedded space and the similarities/dissimilarities.

In [58]:
# Fit the MDS estimator on the training frame and keep its embedding.
method = methods['MDS']
X_train_data = method.fit_transform(df)
N = len(X_train_data)

# Window length of the trajectory matrix (alternative: L = N // 2).
L = 25

# Trajectory (Hankel) matrix built from the embedded training series.
X_train = hankel(X_train_data[:L], X_train_data[L - 1:])

# Eigen-decomposition of X_train @ X_train.T, re-ordered so that the
# largest eigenvalue (and its eigenvector) comes first.
eigenValues, eigenVectors = eigh(X_train @ X_train.T)
idx = np.argsort(eigenValues)[::-1]
eigenValues, eigenVectors = eigenValues[idx], eigenVectors[:, idx]

# Scree plot of (at most) the ten leading eigenvalues.
eigen_data = go.Scatter(
    x=list(range(len(eigenValues[:10]))),
    y=eigenValues[:10],
)
eigen_layout = go.Layout(
    title='ScreePlot',
    title_x=0.5,
    xaxis_title="No of eigen values",
    yaxis_title="eigen value",
)
fig = go.Figure(data=[eigen_data], layout=eigen_layout)
fig.show()

In [10]:
# Setting statistical dimension based on screeplot above
r = 2
# Performing singular value decomposition
U, Sigma, V = np.linalg.svd(X_train)
V = V.T
X_elem = np.array( [Sigma[i] * np.outer(U[:,i], V[:,i]) for i in range(0,r)] )
X_train_extracted = X_elem.sum(axis=0)
U = eigenVectors[:,:r] # r as statistical dimension
UT = U.T
pX = np.matmul(UT,X_train_extracted)
centroid = np.mean(pX, axis=1)
centroid = centroid[:,np.newaxis]
pXt = np.matmul(UT,X_train)
dt_matrix = centroid - pXt
dt_scores = np.linalg.norm(dt_matrix, axis=0, ord=2)

# Testing positional Deviation
X_test = method.fit_transform(df_test)
Xj = hankel(X_test[:L],X_test[L-1:])
pXj = np.matmul(UT, Xj)
dj_matrix = centroid - pXj
dj_scores = np.linalg.norm(dj_matrix, axis=0, ord=2)
dj_scores = np.asarray(dj_scores)


In [12]:
# 2x2 grid: training and testing paths on the top row, both anomaly-score
# curves in a single panel spanning the bottom row.
fig = make_subplots(rows=2, cols=2, specs=[[{}, {}], [{"colspan": 2}, None]],
    subplot_titles=("Training plot", "Testing plot", "Anomaly score by Multidimensional scaling"))
fig.add_trace(go.Scatter(x=df[1], y=df[2], marker_color='blue',
                         name="Training path"), row=1, col=1)
# This trace shows the testing *path*, so it is named accordingly.
fig.add_trace(go.Scatter(x=df_test[1], y=df_test[2], marker_color='red',
                         name="Testing path"), row=1, col=2)
# The first and last 5 training scores are left out of the plot. The x values
# are sliced the same way so that x and y have equal length and every score
# is drawn at its own window index.
fig.add_trace(go.Scatter(x=list(range(len(dt_scores)))[5:-5], y=dt_scores[5:-5],
                         marker_color='blue', name="Training Score"), row=2, col=1)
fig.add_trace(go.Scatter(x=list(range(len(dj_scores))), y=dj_scores, marker_color='red',
                         name="Testing Score"), row=2, col=1)
fig.update_layout(showlegend=False)


## Spectral Embedding
Spectral Embedding is an approach to calculating a non-linear embedding. Scikit-learn implements Laplacian Eigenmaps, which finds a low dimensional representation of the data using a spectral decomposition of the graph Laplacian. The graph generated can be considered as a discrete approximation of the low dimensional manifold in the high dimensional space. Minimization of a cost function based on the graph ensures that points close to each other on the manifold are mapped close to each other in the low dimensional space, preserving local distances. Spectral embedding can be performed with the function spectral_embedding or its object-oriented counterpart SpectralEmbedding.

In [50]:
# Display the names of the configured embedding methods.
methods.keys()

odict_keys(['LLE', 'LTSA', 'Hessian LLE', 'Modified LLE', 'Isomap', 'MDS', 'SE', 't-SNE'])

In [57]:
# Fit the Spectral Embedding estimator on the training frame and keep its
# embedding.
method = methods['SE']
X_train_data = method.fit_transform(df)
N = len(X_train_data)

# Window length of the trajectory matrix: half the series length
# (alternative: L = 50).
L = N // 2

# Trajectory (Hankel) matrix built from the embedded training series.
X_train = hankel(X_train_data[:L], X_train_data[L - 1:])

# Eigen-decomposition of X_train @ X_train.T, re-ordered so that the
# largest eigenvalue (and its eigenvector) comes first.
eigenValues, eigenVectors = eigh(X_train @ X_train.T)
idx = np.argsort(eigenValues)[::-1]
eigenValues, eigenVectors = eigenValues[idx], eigenVectors[:, idx]

# Scree plot of (at most) the ten leading eigenvalues.
eigen_data = go.Scatter(
    x=list(range(len(eigenValues[:10]))),
    y=eigenValues[:10],
)
eigen_layout = go.Layout(
    title='ScreePlot',
    title_x=0.5,
    xaxis_title="No of eigen values",
    yaxis_title="eigen value",
)
fig = go.Figure(data=[eigen_data], layout=eigen_layout)
fig.show()


In [39]:
# Setting statistical dimension based on screeplot above
# Statistical dimension (number of leading components kept), set from the
# scree plot above.
r = 1

# Singular value decomposition of the training trajectory matrix and its
# rank-r reconstruction (sum of the first r elementary matrices).
U, Sigma, V = np.linalg.svd(X_train)
V = V.T
X_elem = np.array([Sigma[i] * np.outer(U[:, i], V[:, i]) for i in range(r)])
X_train_extracted = X_elem.sum(axis=0)

# Basis of the r-dimensional subspace: the r leading eigenvectors computed
# in the previous cell (this replaces the SVD's U).
U = eigenVectors[:, :r]
UT = U.T

# Centroid of the projected rank-r training windows, as a column vector.
pX = UT @ X_train_extracted
centroid = pX.mean(axis=1, keepdims=True)

# Training scores: Euclidean distance of each projected training window
# (one per column) from the centroid.
pXt = UT @ X_train
dt_matrix = centroid - pXt
dt_scores = np.linalg.norm(dt_matrix, ord=2, axis=0)

# Testing positional deviation.
# NOTE(review): the embedding is re-fit on the test frame here rather than
# reusing the fit obtained on the training frame - confirm this is intended.
X_test = method.fit_transform(df_test)
Xj = hankel(X_test[:L], X_test[L - 1:])
pXj = UT @ Xj
dj_matrix = centroid - pXj
dj_scores = np.asarray(np.linalg.norm(dj_matrix, ord=2, axis=0))


In [49]:
# 2x2 grid: training and testing paths on the top row, both anomaly-score
# curves in a single panel spanning the bottom row.
# The score panel is titled after the method actually used in this section
# (the 'SE' estimator, i.e. Spectral Embedding), not t-SNE.
fig = make_subplots(rows=2, cols=2, specs=[[{}, {}], [{"colspan": 2}, None]],
    subplot_titles=("Training plot", "Testing plot", "Anomaly score by Spectral Embedding"))
fig.add_trace(go.Scatter(x=df[1], y=df[2], marker_color='blue',
                         name="Training path"), row=1, col=1)
# This trace shows the testing *path*, so it is named accordingly.
fig.add_trace(go.Scatter(x=df_test[1], y=df_test[2], marker_color='red',
                         name="Testing path"), row=1, col=2)
fig.add_trace(go.Scatter(x=list(range(len(dt_scores))), y=dt_scores, marker_color='blue',
                         name="Training Score"), row=2, col=1)
fig.add_trace(go.Scatter(x=list(range(len(dj_scores))), y=dj_scores, marker_color='red',
                         name="Testing Score"), row=2, col=1)
fig.update_layout(showlegend=False)


## Isomap
One of the earliest approaches to manifold learning is the Isomap algorithm, short for Isometric Mapping. Isomap can be viewed as an extension of Multi-dimensional Scaling (MDS) or Kernel PCA. Isomap seeks a lower-dimensional embedding which maintains geodesic distances between all points.


In [60]:
# Fit the Isomap estimator on the training frame and keep its embedding.
method = methods['Isomap']
X_train_data = method.fit_transform(df)
N = len(X_train_data)

# Window length of the trajectory matrix (alternative: L = N // 2).
L = 50

# Trajectory (Hankel) matrix built from the embedded training series.
X_train = hankel(X_train_data[:L], X_train_data[L - 1:])

# Eigen-decomposition of X_train @ X_train.T, re-ordered so that the
# largest eigenvalue (and its eigenvector) comes first.
eigenValues, eigenVectors = eigh(X_train @ X_train.T)
idx = np.argsort(eigenValues)[::-1]
eigenValues, eigenVectors = eigenValues[idx], eigenVectors[:, idx]

# Scree plot of (at most) the ten leading eigenvalues.
eigen_data = go.Scatter(
    x=list(range(len(eigenValues[:10]))),
    y=eigenValues[:10],
)
eigen_layout = go.Layout(
    title='ScreePlot',
    title_x=0.5,
    xaxis_title="No of eigen values",
    yaxis_title="eigen value",
)
fig = go.Figure(data=[eigen_data], layout=eigen_layout)
fig.show()

In [47]:
# Setting statistical dimension based on screeplot above
# Statistical dimension (number of leading components kept), set from the
# scree plot above.
r = 1

# Singular value decomposition of the training trajectory matrix and its
# rank-r reconstruction (sum of the first r elementary matrices).
U, Sigma, V = np.linalg.svd(X_train)
V = V.T
X_elem = np.array([Sigma[i] * np.outer(U[:, i], V[:, i]) for i in range(r)])
X_train_extracted = X_elem.sum(axis=0)

# Basis of the r-dimensional subspace: the r leading eigenvectors computed
# in the previous cell (this deliberately replaces the SVD's U).
U = eigenVectors[:, :r]
UT = U.T

# Centroid of the projected rank-r training windows, as a column vector.
pX = np.matmul(UT, X_train_extracted)
centroid = np.mean(pX, axis=1)[:, np.newaxis]

# Training scores: Euclidean distance of each projected training window
# (one per column) from the centroid.
pXt = np.matmul(UT, X_train)
dt_matrix = centroid - pXt
dt_scores = np.linalg.norm(dt_matrix, axis=0, ord=2)

# Testing positional deviation.
# The test path is mapped with the embedding already fitted on the training
# data (transform) instead of being re-fitted (fit_transform): a fresh fit
# produces a separate embedding whose coordinates need not be comparable
# with the training ones, and it would also overwrite the trained model.
X_test = method.transform(df_test)
Xj = hankel(X_test[:L], X_test[L - 1:])
pXj = np.matmul(UT, Xj)
dj_matrix = centroid - pXj
dj_scores = np.linalg.norm(dj_matrix, axis=0, ord=2)


In [48]:
# 2x2 grid: training and testing paths on the top row, both anomaly-score
# curves in a single panel spanning the bottom row.
fig = make_subplots(rows=2, cols=2, specs=[[{}, {}], [{"colspan": 2}, None]],
    subplot_titles=("Training plot", "Testing plot", "Anomaly score by Isomap"))
fig.add_trace(go.Scatter(x=df[1], y=df[2], marker_color='blue',
                         name="Training path"), row=1, col=1)
# This trace shows the testing *path*, so it is named accordingly
# (it was previously labelled "Testing Score", duplicating the score trace).
fig.add_trace(go.Scatter(x=df_test[1], y=df_test[2], marker_color='red',
                         name="Testing path"), row=1, col=2)
fig.add_trace(go.Scatter(x=list(range(len(dt_scores))), y=dt_scores, marker_color='blue',
                         name="Training Score"), row=2, col=1)
fig.add_trace(go.Scatter(x=list(range(len(dj_scores))), y=dj_scores, marker_color='red',
                         name="Testing Score"), row=2, col=1)
fig.update_layout(showlegend=False)
