In [None]:
import os
# Print the full path of every file found under the Kaggle input directory
for folder, _, names in os.walk('/kaggle/input'):
    for path in (os.path.join(folder, name) for name in names):
        print(path)

In [None]:
# Install needed dependencies. Giotto for TDA stuff and cairocffi to
# visualize the iGraph objects return by Giotto's mapper API

!pip install giotto-tda
!pip install cairocffi

In [None]:
# Basics
import numpy as np
import pandas as pd

# Graphics
import matplotlib.pyplot as plt
import igraph as ig
import matplotlib.cm as cm

# TDA 
from gtda.mapper import (CubicalCover, Projection,
                        plot_static_mapper_graph, make_mapper_pipeline)

# Sklearn 
from sklearn.preprocessing import MinMaxScaler
from sklearn.decomposition import PCA
from sklearn.manifold import TSNE
from sklearn.cluster import KMeans

# Warnings
import warnings

# Ask the inline matplotlib backend to emit figures in SVG format
%config InlineBackend.figure_formats = ['svg']

In [None]:
# Read the Iris table from the Kaggle input directory
df = pd.read_csv("../input/iris-flower-dataset/IRIS.csv")

# Encode every species name as an integer category code
df['labels'] = pd.Categorical(df['species']).codes

## PCA as filter

In [None]:
# Mapper ingredients: a 2-component PCA as the filter, a uniform cubical cover
# with overlapping intervals, and a MinMaxScaler as the pipeline's scaler
scaler = MinMaxScaler()
filter_func = PCA(n_components=2)
cover = CubicalCover(kind='uniform', n_intervals=10, overlap_frac=0.4)

pipe = make_mapper_pipeline(
    scaler=scaler,
    filter_func=filter_func,
    cover=cover,
    verbose=True,
)

# Run the four measurement columns through the pipeline to get the mapper graph
data = df.iloc[:, :4].to_numpy()
graph = pipe.fit_transform(data)

# For every node, keep the row indices of the data points it contains
list_nodes = graph.vs['node_elements']

### Get colors
To color the nodes, we compute the mean of the labels of all the data points
contained in each mapper node. The mean rounded to the nearest integer gives
the assigned color (for a node mixing two neighbouring labels this is just
the most repeated label), and the distance between the mean and that integer
defines the alpha of the node: the closer the mean is to an integer, the more
opaque the node, so we can get an idea of "how mixed" a given mapper node is.

In [None]:
# Standalone igraph rendering of the mapper graph. It is kept in its own cell
# because the per-node alpha does not show up when the graph is drawn inside a
# pyplot figure.

graph_layout = graph.layout()

# Mean species label of the data points in each node. The label column is
# selected by name rather than by a hard-coded column position.
species_codes = df['labels'].to_numpy()
labs = np.array([species_codes[np.asarray(nodes)].mean() for nodes in list_nodes])
# Alpha is 1 when the mean label is an integer and falls to 0.5 when the mean
# lies halfway between two labels
alphas = 1 - np.abs(np.round(labs) - labs)

# Specify attributes
# plt.get_cmap is used because matplotlib.cm.get_cmap was removed in Matplotlib 3.9
tab10 = plt.get_cmap('tab10')
visual_style = {}
visual_style["bbox"] = (500, 300)
visual_style["vertex_size"] = 10
visual_style['vertex_color'] = [tab10(0.1 * val, alpha) for
                                (val, alpha) in zip(np.round(labs, 0), alphas)]


# Plot, reusing the layout computed above instead of letting igraph compute
# another one
ig.plot(graph, layout=graph_layout, mark_groups={}, **visual_style)

### Comparison with PCA

In [None]:
# Save graph layout
graph_layout = graph.layout()

# Mean species label of the data points in each node. The label column is
# selected by name rather than by a hard-coded column position.
species_codes = df['labels'].to_numpy()
labs = np.array([species_codes[np.asarray(nodes)].mean() for nodes in list_nodes])
# Alpha is 1 when the mean label is an integer and falls to 0.5 when the mean
# lies halfway between two labels
alphas = 1 - np.abs(np.round(labs) - labs)


fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(12, 4))
fig.suptitle('Simple PCA and Mapper', size=16)

# Specify attributes
# plt.get_cmap is used because matplotlib.cm.get_cmap was removed in Matplotlib 3.9
tab10 = plt.get_cmap('tab10')
visual_style = {}
visual_style["bbox"] = (500, 300)
visual_style["vertex_size"] = 10
visual_style['vertex_color'] = [tab10(0.1 * val, alpha) for
                                (val, alpha) in zip(np.round(labs, 0), alphas)]  # No alpha :(


# Right panel: mapper graph
ig.plot(graph, layout=graph_layout, mark_groups={}, **visual_style, target=ax2)
ax2.axis('off')
ax2.set_title('Mapper graph')

# Left panel: plain 2-component PCA of the four measurement columns
xpca, ypca = PCA(n_components=2).fit_transform(df.iloc[:, 0:4].values).T
ax1.scatter(xpca, ypca, c=df['labels'])
ax1.set_title('PCA projection')
ax1.set(xlabel='PC 1', ylabel='PC 2');

In [None]:
# For each node, average on the feature
def plot_feature(feature_col,
                 color_map='viridis', target=None):
    """Draw the current mapper graph with nodes coloured by one data column.

    For every node the mean of ``data[:, feature_col]`` over the data points
    in that node is computed, the node means are min-max rescaled to [0, 1]
    and mapped through ``color_map``.

    Reads the notebook-level ``list_nodes``, ``data``, ``graph``,
    ``graph_layout`` and ``visual_style``; none of them is modified.

    Parameters
    ----------
    feature_col : int
        Column index into ``data`` of the feature to display.
    color_map : str, default 'viridis'
        Name of the matplotlib colormap used for the node colours.
    target : optional
        Forwarded unchanged to ``ig.plot`` as its ``target`` argument.

    Returns
    -------
    Whatever ``ig.plot`` returns for the given ``target``.
    """
    node_value = np.array([data[np.asarray(nodes), feature_col].mean()
                           for nodes in list_nodes])

    # Min-max rescale across nodes. When every node has the same mean the range
    # is zero, so map all nodes to 0 instead of dividing by zero (NaN colours).
    lowest = node_value.min()
    value_range = node_value.max() - lowest
    if value_range > 0:
        node_value = (node_value - lowest) / value_range
    else:
        node_value = np.zeros_like(node_value)

    # plt.get_cmap is used because matplotlib.cm.get_cmap was removed in Matplotlib 3.9
    cmap = plt.get_cmap(color_map)

    # Work on a copy so the shared visual_style dict keeps its own vertex colours
    style = dict(visual_style, vertex_color=[cmap(nval) for nval in node_value])

    return ig.plot(graph, layout=graph_layout, target=target, mark_groups={}, **style)

In [None]:
fig, ((ax1, ax2, ax3), (ax4, ax5, ax6)) = plt.subplots(2, 3, figsize=(13, 10))

features = [0, 1, 2, 3]
titles = df.columns[:4]

# plt.get_cmap is used because matplotlib.cm.get_cmap was removed in Matplotlib 3.9
tab10 = plt.get_cmap('tab10')
visual_style['vertex_color'] = [tab10(0.1 * val, alpha) for
                                (val, alpha) in zip(np.round(labs, 0), alphas)]


# Plot species
ig.plot(graph, layout=graph_layout, mark_groups={}, **visual_style, target=ax1)
ax1.axis('off')
ax1.set_title('Species')

# Plot features distribution: each feature is paired with its panel explicitly
# instead of looking the axes up by name through eval()
for axs, feat, tl in zip((ax2, ax3, ax4, ax5), features, titles):
    plot_feature(feat, target=axs)
    axs.axis('off')
    axs.set_title(tl)

# The sixth panel is not used
ax6.axis('off');
    

## TSNE as filter

In [None]:
# Thin wrapper class that adds a transform method to TSNE so it can be used as the filter in the mapper pipeline
class TSNE_filter():
    """Wrap a TSNE estimator behind fit / transform / fit_transform methods.

    The wrapped estimator is stored in ``self.tsne``. Both ``transform`` and
    ``fit_transform`` delegate to ``TSNE.fit_transform``, so every call
    computes a new embedding of the data it is given.

    Parameters
    ----------
    n_components, perplexity, early_exaggeration, init
        Forwarded unchanged to ``TSNE``.
    random_state : optional, default None
        Forwarded to ``TSNE``; pass a fixed value to make repeated embeddings
        reproducible. ``None`` keeps the previous, unseeded behaviour.
    """

    def __init__(self, n_components=2, perplexity=30.0, early_exaggeration=12.0, init='pca',
                 random_state=None):
        self.tsne = TSNE(n_components=n_components,
                         perplexity=perplexity,
                         early_exaggeration=early_exaggeration,
                         init=init,
                         random_state=random_state)

    def fit(self, X, y=None):
        """Fit the wrapped TSNE on ``X`` and return this wrapper.

        Returning ``self`` (not the inner TSNE object) keeps chained calls
        such as ``fit(X).transform(X)`` working.
        """
        self.tsne.fit(X)
        return self

    def transform(self, X, y=None):
        """Return the embedding of ``X`` by re-running ``TSNE.fit_transform``."""
        return self.tsne.fit_transform(X)

    def fit_transform(self, X, y=None):
        """Fit the wrapped TSNE on ``X`` and return the resulting embedding."""
        return self.tsne.fit_transform(X)

In [None]:
# Mapper ingredients for the t-SNE variant: the TSNE wrapper as the filter, a
# uniform cubical cover, and a MinMaxScaler as the pipeline's scaler
scaler = MinMaxScaler()
cover = CubicalCover(kind='uniform', n_intervals=6, overlap_frac=0.5)
filter_func = TSNE_filter(early_exaggeration=60.0, perplexity=30.0)

pipe = make_mapper_pipeline(
    scaler=scaler,
    filter_func=filter_func,
    cover=cover,
    verbose=True,
)

# Run the four measurement columns through the pipeline to get the mapper graph
data = df.iloc[:, :4].to_numpy()
graph = pipe.fit_transform(data)

# For every node, keep the row indices of the data points it contains
list_nodes = graph.vs['node_elements']

In [None]:
graph_layout = graph.layout()

fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(12, 4))
fig.suptitle('TSNE and Mapper', size=16)

# Mean species label of the data points in each node. The label column is
# selected by name rather than by a hard-coded column position.
species_codes = df['labels'].to_numpy()
labs = np.array([species_codes[np.asarray(nodes)].mean() for nodes in list_nodes])
# Alpha is 1 when the mean label is an integer and falls to 0.5 when the mean
# lies halfway between two labels
alphas = 1 - np.abs(np.round(labs) - labs)

# Specify attributes
# plt.get_cmap is used because matplotlib.cm.get_cmap was removed in Matplotlib 3.9
tab10 = plt.get_cmap('tab10')
visual_style = {}
visual_style["bbox"] = (500, 300)
visual_style["vertex_size"] = 10
visual_style['vertex_color'] = [tab10(0.1 * val, alpha) for
                                (val, alpha) in zip(np.round(labs, 0), alphas)]


# Left panel: plain t-SNE embedding. The seed is fixed so the scatter plot is
# the same on every run of the notebook.
xtsne, ytsne = TSNE(n_components=2, random_state=0).fit_transform(df.iloc[:, 0:4].values).T
ax1.scatter(xtsne, ytsne, c=df['labels'])
ax1.set_title('t-SNE embedding')

# Right panel: mapper graph
ig.plot(graph, layout=graph_layout, mark_groups={}, **visual_style, target=ax2)
ax2.set_title('Mapper graph')
ax2.axis('off');


In [None]:
fig, ((ax1, ax2, ax3), (ax4, ax5, ax6)) = plt.subplots(2, 3, figsize=(13, 10))

features = [0, 1, 2, 3]
titles = df.columns[:4]

# plt.get_cmap is used because matplotlib.cm.get_cmap was removed in Matplotlib 3.9
tab10 = plt.get_cmap('tab10')
visual_style['vertex_color'] = [tab10(0.1 * val, alpha) for
                                (val, alpha) in zip(np.round(labs, 0), alphas)]


# Plot species
ig.plot(graph, layout=graph_layout, mark_groups={}, **visual_style, target=ax1)
ax1.axis('off')
ax1.set_title('Species')


# Plot features distribution: each feature is paired with its panel explicitly
# instead of looking the axes up by name through eval()
for axs, feat, tl in zip((ax2, ax3, ax4, ax5), features, titles):
    plot_feature(feat, target=axs)
    axs.axis('off')
    axs.set_title(tl)

# The sixth panel is not used
ax6.axis('off');
    