In [None]:
import kmapper as km
from kmapper import Cover, jupyter
import numpy as np, pandas as pd

from sklearn.cluster import AgglomerativeClustering
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import TruncatedSVD, LatentDirichletAllocation
from sklearn.manifold import Isomap
from sklearn import cluster
from sklearn.preprocessing import MinMaxScaler, StandardScaler

In [None]:
# Number of leading rows of the CSV kept for the analysis.
n_sample = 3000

# Read the full file once under its own name, then slice the first
# `n_sample` rows so `train` only ever refers to the text array.
comments_raw = pd.read_csv("../input/jigsaw-toxic-severity-rating/comments_to_score.csv")
comments_subset = comments_raw.iloc[:n_sample]

train_id = np.array(comments_subset["comment_id"])  # ids, later used as tooltips
train = np.array(comments_subset["text"])           # raw comment strings

In [None]:
# Display the first loaded comment text as a quick sanity check.
train[0]

In [None]:
# Display the comment text at index 2 as a second sample.
train[2]

## Kepler Mapper Algorithm

## Tf-Idf 

In [None]:
# Build the Mapper lens from the raw comments in three chained stages:
# character n-gram TF-IDF -> 50-component truncated SVD -> 2-D Isomap.
# Only the final (Isomap) stage is rescaled, with MinMaxScaler.
mapper = km.KeplerMapper(verbose = 1)

projected_X = mapper.fit_transform(train,
                                   projection = [TfidfVectorizer(analyzer = "char",
                                                               ngram_range = (1,6),
                                                               max_df = 0.90,
                                                               min_df = 0.05),
                                               TruncatedSVD(n_components = 50,
                                                            random_state = 2048),
                                               Isomap(n_components = 2,
                                                      n_jobs = -1)],
                                   scaler = [None, None, MinMaxScaler()])

# scikit-learn renamed AgglomerativeClustering's `affinity` argument to
# `metric` (introduced in 1.2, `affinity` removed in 1.4). Ask the estimator
# which keyword the installed version supports so the cell runs on both old
# and new releases; `metric` is preferred when both exist to avoid the
# deprecation warning.
distance_kwarg = ("metric"
                  if "metric" in cluster.AgglomerativeClustering().get_params()
                  else "affinity")

# Cluster inside each cover cube; `X = None` is passed through unchanged from
# the original call, so the clusterer receives whatever kmapper uses by
# default in that case.
graph = mapper.map(projected_X,
                   X = None,
                   clusterer = cluster.AgglomerativeClustering(n_clusters = 5,
                                                             linkage = "complete",
                                                             **{distance_kwarg: "cosine"}),
                   cover = Cover(n_cubes = 10, perc_overlap = 0.33))

In [None]:
# Word-level TF-IDF matrix (1- to 3-grams, English stop words removed,
# accents stripped). It is handed to the visualisation as the per-node
# feature data and is also the input of the LDA model.
vec = TfidfVectorizer(analyzer = "word",
                      strip_accents = "unicode",
                      stop_words = "english",
                      ngram_range = (1,3),
                      max_df = 0.90,
                      min_df = 0.10)

interpretable_inverse_X = vec.fit_transform(train).toarray()

# `get_feature_names()` was deprecated in scikit-learn 1.0 and removed in 1.2
# in favour of `get_feature_names_out()`. Use the new method when it exists
# and fall back otherwise; `.tolist()` keeps the result a plain list of str,
# exactly what the old method returned.
if hasattr(vec, "get_feature_names_out"):
    interpretable_inverse_X_names = vec.get_feature_names_out().tolist()
else:
    interpretable_inverse_X_names = vec.get_feature_names()

## Latent Dirichlet Allocation

In [None]:
# Number of latent topics to extract.
n_topics = 8

# All LDA hyper-parameters gathered in one place so they are easy to tweak.
lda_params = dict(n_components = n_topics,
                  learning_method = "online",
                  batch_size = 128,
                  evaluate_every = -1,
                  max_iter = 20,
                  random_state = 32,
                  n_jobs = -1)
lda_model = LatentDirichletAllocation(**lda_params)

# Fit on the word-level TF-IDF matrix and keep the document-topic weights.
comment_topics = lda_model.fit_transform(interpretable_inverse_X)

In [None]:
# Normalise each comment's topic weights so every row sums to 1, then record
# the index of its strongest topic.
# A plain ndarray replaces the discouraged `np.matrix`; `keepdims = True`
# keeps the row sums as a column so the division broadcasts row-wise.
comment_topics = np.asarray(comment_topics)
doc_topics = comment_topics / comment_topics.sum(axis = 1, keepdims = True)

# One vectorised argmax over the topic axis instead of a per-row Python loop;
# `.tolist()` keeps `lda_keys` a list with one entry per comment.
lda_keys = doc_topics.argmax(axis = 1).tolist()

In [None]:
# Single definition of the output location, shared by the writer and the viewer.
html_path = "./toxic_comments.html"

# Write the Mapper graph to HTML: nodes are coloured by each comment's dominant
# LDA topic and described with the word-level TF-IDF features; comment ids are
# attached as tooltips.
_ = mapper.visualize(graph,
                     path_html = html_path,
                     title = "Latent Semantic Char-gram Analysis with Isometric Embedding",
                     X = interpretable_inverse_X,
                     X_names = interpretable_inverse_X_names,
                     lens = projected_X,
                     lens_names = ["ISOMAP1", "ISOMAP2"],
                     color_values = lda_keys,
                     color_function_name = 'topic',
                     custom_tooltips = train_id)

# Embed the generated HTML file in the notebook output.
jupyter.display(html_path)

## References
* [Kepler Mapper](https://kepler-mapper.scikit-tda.org/en/latest/notebooks/KeplerMapper-Newsgroup20-Pipeline.html)
* [Topological Methods for the Analysis of High Dimensional Data Sets and 3D Object Recognition](https://research.math.osu.edu/tgda/mapperPBG.pdf)
* [Cover](https://arxiv.org/pdf/1706.00204.pdf)
* [Latent Dirichlet Allocation](https://www.jmlr.org/papers/volume3/blei03a/blei03a.pdf?TB_iframe=true&width=370.8&height=658.8)
* [Agglomerative Clustering](https://link.springer.com/article/10.1007/s00453-012-9717-4)