In [None]:
import os
import re
import pandas as pd
import numpy as np

from random import randint

from sklearn.feature_extraction import text #to access stop words
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer

from sklearn.decomposition import PCA
from sklearn.manifold import TSNE
import umap

from sklearn.cluster import AgglomerativeClustering

from bokeh.plotting import figure, output_notebook, show, ColumnDataSource
from bokeh.models import HoverTool, CategoricalColorMapper
from bokeh.palettes import d3
from bokeh.transform import factor_cmap

# Configure Bokeh to render figures inline in the notebook output when show() is called.
output_notebook()

In [None]:
# Input directory: its entries are listed and each one is loaded with pd.read_csv
# in the following cells (one file per document in the corpus).
dir_name = '../data_preprocessed_csv/'

In [None]:
# Collect the CSV files to process.
# os.listdir() returns entries in arbitrary, platform-dependent order and includes
# everything in the folder (hidden files, checkpoint directories, ...). Keep only
# *.csv names so pd.read_csv never sees a non-CSV entry, and sort them so the row
# order of the corpus - and therefore the clustering and the plots - is reproducible.
filenames = sorted(
    name for name in os.listdir(dir_name)
    if name.lower().endswith('.csv')
)

In [None]:
# Build one document per input file by concatenating the `text` column of the rows
# whose `text_type` is 'a' or 'q' (presumably answers and questions - confirm).
#
# A file whose text cannot be joined (e.g. a missing/NaN text cell, which is a float
# and makes str.join raise TypeError) is skipped, as before. Its name is also dropped
# from `filenames`, because later cells pair `filenames` with the rows of the TF-IDF
# matrix by position; leaving skipped names in the list would shift every label.
corpus = []
loaded_filenames = []
for filename in filenames:
    df = pd.read_csv(os.path.join(dir_name, filename), index_col=0)
    df = df[df.text_type.isin(['a', 'q'])]
    try:
        document = ' '.join(df.text.values)
    except (TypeError, AttributeError) as err:
        # Best effort: report the offending file and carry on with the others.
        print(f'skipping {filename}: {err}')
        continue
    corpus.append(document)
    loaded_filenames.append(filename)

# From here on `filenames[i]` is guaranteed to be the source of `corpus[i]`.
filenames = loaded_filenames

In [None]:
# Words that were deemed uninformative were manually added to the text file
# words_to_remove.txt (one word per line). The selection was done by hand; no formal
# criterion was used to decide which words to remove.
with open('words_to_remove.txt', 'r') as f:
    # strip() rather than strip('\n') so that '\r' and stray spaces are removed too;
    # blank lines are ignored. Words are lower-cased because the vectorizer lower-cases
    # tokens (its default) before comparing them with the stop-word list.
    words_to_remove = [line.strip().lower() for line in f if line.strip()]

# scikit-learn >= 1.2 validates `stop_words` and accepts only 'english', a list or None,
# so turn the frozenset returned by union() into a list (sorted, hence deterministic).
my_stop_words = sorted(text.ENGLISH_STOP_WORDS.union(words_to_remove))

In [None]:
# TF-IDF representation of the corpus: one row per document, one column per term.
# `stop_words` is passed as a list because scikit-learn >= 1.2 rejects other containers
# (such as the frozenset produced by ENGLISH_STOP_WORDS.union).
tfidf_vectorizer = TfidfVectorizer(use_idf=True, stop_words=list(my_stop_words))

# Densify the sparse result: later cells iterate over the rows and feed the matrix
# to the clustering and embedding estimators.
vectors = tfidf_vectorizer.fit_transform(corpus).toarray()

# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2 in favour of
# get_feature_names_out(); fall back to the old name only on versions that predate it.
if hasattr(tfidf_vectorizer, 'get_feature_names_out'):
    features = tfidf_vectorizer.get_feature_names_out()
else:
    features = tfidf_vectorizer.get_feature_names()

In [None]:
# For each deposition (one row of `vectors`), take the N_KEYWORDS features with the
# largest TF-IDF weights; these are the keywords shown for that deposition.
N_KEYWORDS = 10  # number of keywords per deposition; change to whatever you want

feature_names = np.asarray(features)
keywords = []
for row in vectors:
    # Stable descending sort: ties are broken by vocabulary order, so the selection is
    # reproducible. This avoids building and sorting a DataFrame for every document.
    top = np.argsort(-row, kind='stable')[:N_KEYWORDS]
    # A zero weight means the term does not occur in this document, so it must not be
    # reported as a keyword (matters for documents with fewer than N_KEYWORDS terms).
    top = top[row[top] > 0]
    keywords.append(', '.join(feature_names[top]))

In [None]:
# Agglomerative clustering is fitted on the full TF-IDF matrix, i.e. before any
# dimensionality reduction. Swapping that order, or trying a different number of
# clusters, are both experiments worth running.
clustering = AgglomerativeClustering(n_clusters=10)
clustering.fit(vectors)

# One integer cluster label per row of `vectors`.
clusters = clustering.labels_

In [None]:
# Embed the TF-IDF rows with two different reducers, t-SNE and UMAP, so the resulting
# layouts can be compared. Both are created with random_state=0.
tsne_reducer = TSNE(random_state=0)
umap_reducer = umap.UMAP(random_state=0)

tfidf1_tsne = tsne_reducer.fit_transform(vectors)
tfidf1_umap = umap_reducer.fit_transform(vectors)

In [None]:
x = tfidf1_tsne[:, 0]
y = tfidf1_tsne[:, 1]

# categorical variables have to be strings in bokeh
clusters_s = [f'{i}' for i in clusters]

source = ColumnDataSource(
    data=dict(
        x=x,
        y=y,
        filenames=filenames,
        keywords=keywords,
        clusters = clusters_s
    )
)

palette = d3['Category10'][10]
color_map = CategoricalColorMapper(factors=[f'{i}' for i in range(10)],
                                   palette=palette)

TOOLS="box_zoom,hover,reset"
p = figure(tools=TOOLS)
p.background_fill_color = "black"
p.xgrid.grid_line_color = None
p.ygrid.grid_line_color = None

p.scatter(x='x', y='y',
          color={'field': 'clusters', 'transform': color_map},
          source=source)

hover = p.select(dict(type=HoverTool))
hover.tooltips = [
    ("filename", "@filenames"),
    ("keywords", "@keywords")
]

show(p)

In [None]:
x = tfidf1_umap[:, 0]
y = tfidf1_umap[:, 1]

# categorical variables have to be strings in bokeh
clusters_s = [f'{i}' for i in clusters]

source = ColumnDataSource(
    data=dict(
        x=x,
        y=y,
        filenames=filenames,
        keywords=keywords,
        clusters = clusters_s
    )
)

palette = d3['Category10'][10]
color_map = CategoricalColorMapper(factors=[f'{i}' for i in range(10)],
                                   palette=palette)

TOOLS="box_zoom,hover,reset"
p = figure(tools=TOOLS)
p.background_fill_color = "black"
p.xgrid.grid_line_color = None
p.ygrid.grid_line_color = None

p.scatter(x='x', y='y',
          color={'field': 'clusters', 'transform': color_map},
          source=source)

hover = p.select(dict(type=HoverTool))
hover.tooltips = [
    ("filename", "@filenames"),
    ("keywords", "@keywords")
]

show(p)