In [1]:
import ipywidgets as widgets
import kmapper as km
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn.decomposition import TruncatedSVD
from sklearn.ensemble import IsolationForest
from sklearn.cluster import DBSCAN, AgglomerativeClustering, KMeans
from umap import UMAP
from hdbscan import HDBSCAN
from ipywidgets import fixed, interact, interact_manual, interactive
from IPython.display import display, clear_output

import graph_tool.all as gt
from functools import lru_cache

%matplotlib inline

## Data Cleaning

We're using the a Big Five Personality Factors dataset from [here](https://openpsychometrics.org/_rawdata/), with the idea that there should be a grouping of *five* that we might be able to pull out from the data.

The only real data cleaning step is to reverse some of the negatively phrased questions for each factor (i.e. a higher score should align with a higher factor score). This will just make interpreting and analysis easier. [See here](https://ipip.ori.org/new_ipip-50-item-scale.htm) for more info on the questions.

In [2]:
df = pd.read_csv('../data/data.csv', sep="\t")

In [3]:
df.head()

Unnamed: 0,race,age,engnat,gender,hand,source,country,E1,E2,E3,...,O1,O2,O3,O4,O5,O6,O7,O8,O9,O10
0,3,53,1,1,1,1,US,4,2,5,...,4,1,3,1,5,1,4,2,5,5
1,13,46,1,2,1,1,US,2,2,3,...,3,3,3,3,2,3,3,1,3,2
2,1,14,2,2,1,1,PK,5,1,1,...,4,5,5,1,5,1,5,5,5,5
3,3,19,2,2,1,1,RO,2,5,2,...,4,3,5,2,4,2,5,2,5,5
4,11,25,2,2,1,2,US,3,1,3,...,3,1,1,1,3,1,3,1,5,3


In [4]:
demo_cols = ['race', 'age', 'engnat', 'gender', 'hand', 'source', 'country']
question_columns = [col for col in df.columns if col not in demo_cols]
o_columns = [col for col in df.columns if col.startswith("O")]
c_columns = [col for col in df.columns if col.startswith("C")]
e_columns = [col for col in df.columns if col.startswith("E")]
a_columns = [col for col in df.columns if col.startswith("A")]
n_columns = [col for col in df.columns if col.startswith("N")]

In [5]:
flip_qs = 'E2 E4 E6 E8 E10 N2 N4 A1 A3 A5 A7 C2 C4 C6 C8 O2 O4 O6'.split()
for q in flip_qs:
    df[q] = df[q].apply(lambda x: abs(x-6))

In [6]:
for columns in [o_columns, c_columns, e_columns, a_columns, n_columns]:
    assert (df[columns].corr() > 0).all(axis=None)

In [7]:
X = df[question_columns].drop_duplicates()

## Creating Widgets
We'll need 4 parameters for MAPPER: the lens, the clusterer, the number of cubes, and the overlap percentage. Since both the lens creation and the mapper can be computationally intensive, we'll include a run button to run on command rather than update continually.

In [8]:
def get_lens(value):
    if value == "svd":
        mapper = km.KeplerMapper(verbose=0)
        projection = TruncatedSVD(n_components=2, random_state=1234)
        lens = mapper.fit_transform(X, projection=projection)
    if value == "iso_l2":
        model = IsolationForest(random_state=1234, contamination="auto", behaviour="new")
        model.fit(X)
        lens1 = model.decision_function(X).reshape((X.shape[0], 1))
        
        mapper = km.KeplerMapper(verbose=0)
        lens2 = mapper.fit_transform(X, projection="l2norm")

        lens = np.c_[lens1, lens2]
    if value == "umap":
        mapper = km.KeplerMapper(verbose=0)
        projection = UMAP(n_neighbors=30, min_dist=0.01, random_state=1234)
        lens = mapper.fit_transform(
            X, projection=projection)
    return lens

def get_clusterer(value):
    if value == "dbscan":
        clusterer = DBSCAN(eps=0.5, min_samples=5)
    if value == "hdbscan":
        clusterer = HDBSCAN(allow_single_cluster=True)
    if value == "agglomerative_3":
        clusterer = AgglomerativeClustering(n_clusters=3)
    if value == "kmeans_3":
        clusterer = KMeans(n_clusters=3)
    return clusterer

In [9]:
# Input Widgets

lens_selector = widgets.Dropdown(
    options=['svd', 'iso_l2', 'umap'],
    value="svd",
    description="Projection")

clusterer_selector = widgets.Dropdown(
    options=['dbscan', 'hdbscan', 'agglomerative_3', 'kmeans_3'],
    value="dbscan",
    description="Clusterer")

cubes_slider = widgets.IntSlider(
    value=10,
    min=5,
    max=50,
    step=5,
    description="N Cubes",
    disabled=False,
    continuous_update=False,
)

overlap_slider = widgets.FloatSlider(
    value=0.5,
    min=0.05,
    max=1.0,
    step=0.05,
    description="Overlap Percentage",
    disabled=False,
    continuous_update=False,
    style={"description_width": "initial"},
)

run_button = widgets.Button(description="Run")

# Output Widgets
out_image = widgets.Output()

# Layout
controls = widgets.VBox(
    [lens_selector, clusterer_selector, cubes_slider, overlap_slider, run_button]
)

layout = widgets.HBox([controls, out_image])

In [10]:
def draw_graph(graph):
    node_map = {index : node for index, node in enumerate(sorted(graph['nodes'].keys()))}
    node_map_inv = {v : k for (k, v) in node_map.items()}
    
    g = gt.Graph(directed=False)
    g.add_vertex(len(node_map))
    for node, links in graph['links'].items():
        source_id = node_map_inv[node]
        for link in links:
            target_id = node_map_inv[link]
            g.add_edge(g.vertex(source_id), g.vertex(target_id))
            

    pos = gt.sfdp_layout(g)
    gt.graph_draw(g, pos=pos, output_size=(300,300));

In [11]:
# This is the callback that will run when we click the button
# The only argument is the button, where we'll save the mapper graph for potential later use

def use_mapper(b):
    lens = get_lens(lens_selector.value)
    clusterer = get_clusterer(clusterer_selector.value)
    cover = km.Cover(n_cubes=cubes_slider.value,
                     perc_overlap=overlap_slider.value)
    mapper = km.KeplerMapper(verbose=0)
    b.value = mapper.map(lens=lens, X=X, cover=cover, clusterer=clusterer)
    graph = b.value
    with out_image:
        clear_output()
        draw_graph(graph)


run_button.on_click(use_mapper)

In [12]:
display(layout)

HBox(children=(VBox(children=(Dropdown(description='Projection', options=('svd', 'iso_l2', 'umap'), value='svd…

In [13]:
graph = run_button.value

AttributeError: 'Button' object has no attribute 'value'