# Dimensionality reduction using graphs

In [None]:
%matplotlib widget

from utils import visualization, weights, features, build, embedding, evaluation
from sklearn.datasets import load_digits, load_iris, make_swiss_roll
from data import preprocessing, small
import networkx as nx
import umap
from sklearn.decomposition import PCA
from sklearn.manifold import TSNE
import keras

## Load dataset

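Select a dataset for testing the algorithms. Run only one of the dataset cells below: each of them overwrites `data` and `labels`, so the last one executed wins.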

### Small toy dataset

In [None]:
# Uncomment exactly one data/labels pair; each pair must come from the same
# toy dataset so that labels match the points.

# two point clouds in 2D
# data = small.TWO_CLOUDS_2D
# labels = small.TWO_CLOUDS_2D_LABELS

# four point clouds in 2D
# data = small.FOUR_CLOUDS_2D
# labels = small.FOUR_CLOUDS_2D_LABELS

# five point clouds in 2D
data = small.FIVE_CLOUDS_2D
labels = small.FIVE_CLOUDS_2D_LABELS

# three point clouds in 3D
# data = small.THREE_CLOUDS_3D
# labels = small.THREE_CLOUDS_3D_LABELS

### Iris

In [None]:
# Iris: use the feature matrix as data and the class targets as labels.
iris = load_iris()
data, labels = iris.data, iris.target

### Digits

In [None]:
# Digits: use the feature matrix as data and the class targets as labels.
digits = load_digits()
data, labels = digits.data, digits.target

### Swiss roll

In [None]:
# make_swiss_roll returns the points together with each sample's position
# along the roll; that continuous value is used here in place of class labels.
# random_state is fixed so the same points are generated on every run.
data, labels = make_swiss_roll(
    n_samples=1000,
    noise=0.0,
    random_state=0,
)

## Remove duplicates

Important to do for every dataset: duplicate points have zero distance between them, so a reciprocal weight function (such as 1/x^k) would otherwise produce infinite edge weights.

In [None]:
# Data and labels are passed in together and both are replaced by the
# returned pair, so they stay aligned after the clean-up.
data, labels = preprocessing.remove_duplicities(
    data,
    labels,
    shuffle=True,
    normalize=False,
)

## Visualize dataset

Note that the function raises an error for datasets whose dimension is not 2 or 3.

In [None]:
# Plot the selected dataset on its own; graph=None means no graph is passed.
visualization.show_data(
    data,
    graph=None,
    labels=labels,
    square=False,
    outpath='',
    show_numbers=False,
    title='Original space',
    dpi=300,
)

## Select builder

1. select weight function and feature function

In [None]:
# Edge-weight function: reciprocal power 1/x^k (k=1 gives 1/x).
# NOTE(review): x is presumably the distance between two points, which would
# explain why duplicate points must be removed first - confirm in utils.weights.
weight_fun = weights.get_reciprocal_pow(k=1)

# Node-feature function - necessary only for GraphSAGE; it can stay None for
# the other embedders. Uncomment one alternative below when using GraphSAGE.
feature_fun = None
# feature_fun = features.feature_coords # features are coordinates in the original space
# feature_fun = features.feature_deg_weight # features are degree of a node and average outgoing edge weight

2. select builder method

In [None]:
# Keep exactly one builder uncommented. Every builder receives the same
# weight_fun and feature_fun; the nearest-neighbour and hierarchical variants
# additionally take knn.
# builder = build.FullBuilder(weight_fun=weight_fun, feature_fun=feature_fun)
# builder = build.CheapestBuilder(weight_fun=weight_fun, feature_fun=feature_fun)
# builder = build.CheapestNNBuilder(weight_fun=weight_fun, feature_fun=feature_fun, knn=2)
builder = build.SpanningTreeBuilder(weight_fun=weight_fun, feature_fun=feature_fun)
# builder = build.SpanningNNBuilder(weight_fun=weight_fun, feature_fun=feature_fun, knn=2)
# builder = build.HierarchicalBuilder(weight_fun=weight_fun, feature_fun=feature_fun, knn=2)
# builder = build.HierarchicalClusterBuilder(weight_fun=weight_fun, feature_fun=feature_fun, knn=2)
# builder = build.SpanningTreeDenseBuilder(weight_fun=weight_fun, feature_fun=feature_fun)

3. build the graph with the selected builder

In [None]:
# The return value of build() is not used; the resulting graph is read back
# from the builder's `graph` attribute afterwards.
builder.build(data)
graph = builder.graph

## Visualize the graph and connections in original space

In [None]:
# Draw the built graph by itself, passing the dataset labels along.
visualization.show_graph(
    graph,
    labels=labels,
    outpath='',
    show_numbers=False,
    title='Graph',
    dpi=300,
)

In [None]:
# Same plot as for the raw dataset, but this time the built graph is passed
# in as well so its connections are shown in the original space.
visualization.show_data(
    data,
    graph=graph,
    labels=labels,
    square=False,
    outpath='',
    show_numbers=False,
    title='Original space with built connections',
    dpi=300,
)

## Embed the graph

1. select target dimension

In [None]:
# Dimension of the embedding space; the same value is passed to the graph
# embedder and to the PCA / UMAP / TSNE comparisons.
target_dimension = 2

2. select the embedder method

In [None]:
# Keep exactly one embedder uncommented.

# embedder = embedding.Node2VecEmbedder(embedding_dim=target_dimension, walk_length=100, num_walks=10, window=10, min_count=1, batch_words=4)

# embedder = embedding.WatchYourStepEmbedder(embedding_dim=target_dimension, adjacency_powers=10, num_walks=150, attention_regularization=0.5, batch_size=12, epochs=100)

# NOTE: GraphSAGE needs node features, so feature_fun must be set to one of
# the feature functions (not None) before the graph is built.
# embedder = embedding.GraphSAGEEmbedder(embedding_dim=target_dimension, num_walks=10, walk_length=10, batch_size=50, epochs=4, num_samples=[10, 5], layer_sizes=[20, 2], dropout=0.05, bias=False, loss=keras.losses.binary_crossentropy, normalize=None)

# NOTE(review): SpringEmbedder is created without embedding_dim, so it does
# not follow target_dimension here - confirm its output dimension.
# embedder = embedding.SpringEmbedder()

embedder = embedding.KamadaKawaiEmbedder(embedding_dim=target_dimension, scale=1)

3. embed the data

In [None]:
# The return value of embed() is not used; the computed embeddings are read
# back from the embedder's `embeddings` attribute afterwards.
embedder.embed(graph)
embeddings = embedder.embeddings

4. compute metrics

In [None]:
# Evaluate the graph embedding against the original data. The same call is
# used for the PCA / UMAP / TSNE results, so the outputs can be compared.
evaluation.print_evaluation(data=data, embeddings=embeddings)

## Visualize the result

Note that this is possible only for target dimension 2 or 3.

In [None]:
# Plot the embedded points with the original labels; no graph is passed.
visualization.show_data(
    embeddings,
    labels=labels,
    square=False,
    outpath='',
    show_numbers=False,
    title='Embedded data',
    dpi=300,
)

## Compare with SOTA

### PCA

In [None]:
# Baseline: reduce the same data to target_dimension components with PCA and
# report it through the same evaluation call as the graph embedding.
pca = PCA(n_components=target_dimension)
pca_embeddings = pca.fit_transform(data)
evaluation.print_evaluation(data=data, embeddings=pca_embeddings)

In [None]:
# Plot the PCA result with the original labels.
visualization.show_data(
    pca_embeddings,
    labels=labels,
    square=False,
    outpath='',
    show_numbers=False,
    title='PCA embeddings',
    dpi=300,
)

### UMAP

In [None]:
# Baseline: UMAP to target_dimension components. random_state is fixed so the
# embedding is reproducible between runs. The result goes through the same
# evaluation call as the graph embedding.
umap_obj = umap.UMAP(n_components=target_dimension, random_state=0)
umap_embeddings = umap_obj.fit_transform(data)
evaluation.print_evaluation(data=data, embeddings=umap_embeddings)

In [None]:
# Plot the UMAP result with the original labels.
visualization.show_data(
    umap_embeddings,
    labels=labels,
    square=False,
    outpath='',
    show_numbers=False,
    title='UMAP embeddings',
    dpi=300,
)

### TSNE

In [None]:
# Baseline: t-SNE to target_dimension components. random_state is fixed so the
# embedding is reproducible between runs. The result goes through the same
# evaluation call as the graph embedding.
tsne = TSNE(n_components=target_dimension, random_state=0)
tsne_embeddings = tsne.fit_transform(data)
evaluation.print_evaluation(data=data, embeddings=tsne_embeddings)

In [None]:
# Plot the t-SNE result with the original labels.
visualization.show_data(
    tsne_embeddings,
    labels=labels,
    square=False,
    outpath='',
    show_numbers=False,
    title='TSNE embeddings',
    dpi=300,
)