In [None]:
# pip
!pip install umap-learn
!pip install umap-learn[plot]
# conda
!conda install -c conda-forge umap-learn
!conda install seaborn datashader bokeh holoviews

In [None]:
import pandas as pd
import umap
import umap.plot

from sklearn.datasets import fetch_20newsgroups
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

import matplotlib.pyplot as plt
%matplotlib notebook
%matplotlib inline
from bokeh.plotting import show, save, output_notebook, output_file
from bokeh.resources import INLINE
output_notebook(resources=INLINE)

In [None]:
# Load the 20 Newsgroups corpus through scikit-learn's fetcher.
# `subset='all'` is requested, and the documents are shuffled with a fixed
# seed so the ordering is the same on every run.
dataset = fetch_20newsgroups(
    subset='all',
    shuffle=True,
    random_state=42,
)

In [None]:
# Quick sanity check of what was loaded: corpus size, number of classes,
# and the class names numbered from 1 (one `(number, name)` tuple per line).
print(f'{len(dataset.data)} documents')
print(f'{len(dataset.target_names)} categories')
print('\n'.join(str(numbered) for numbered in enumerate(dataset.target_names, 1)))

In [None]:
# Translate each document's integer target into its newsgroup name ...
category_labels = [dataset.target_names[target_idx] for target_idx in dataset.target]
# ... and wrap the names in a one-column frame (passed as `hover_data` and as
# plot labels further down).
hover_df = pd.DataFrame({'category': category_labels})

In [None]:
# Bag-of-words counts per document. English stop words are removed and
# `min_df=5` is applied to filter the vocabulary.
vectorizer = CountVectorizer(
    stop_words='english',
    min_df=5,
)
# Learn the vocabulary and build the document-term count matrix in one pass.
word_doc_matrix = vectorizer.fit_transform(dataset.data)

In [None]:
# Fit a 2-component UMAP projection of the word counts using the Hellinger metric.
#
# `random_state` is fixed (same seed as the dataset shuffle) because UMAP is
# stochastic: without it the layout changes on every Restart & Run All.
# NOTE(review): per the umap-learn reproducibility notes, a fixed seed may make
# the fit run single-threaded and therefore slower -- confirm this is acceptable.
#
# NOTE: `.fit(...)` is called rather than `.fit_transform(...)`, so despite its
# name `embedding` holds the fitted UMAP model (which is what the `umap.plot`
# calls below are given), not a raw coordinate array.
embedding = umap.UMAP(
    n_components=2,
    metric='hellinger',
    random_state=42,
).fit(word_doc_matrix)

In [None]:
# Plot the fitted model's points, labelled by newsgroup name.
f = umap.plot.points(
    embedding,
    labels=hover_df['category'],
)

In [None]:
# Interactive version of the plot: points are labelled by integer target,
# while `hover_df` supplies the hover data (the newsgroup name).
f = umap.plot.interactive(
    embedding,
    labels=dataset.target,
    hover_data=hover_df,
    point_size=1,
)
# Render the figure in the cell output.
show(f)