## Imports

In [1]:
import pytz
import pandas as pd
import numpy as np

from sklearn.manifold import TSNE
from sklearn.decomposition import PCA
from sklearn.cluster import KMeans
from sklearn import preprocessing
from sklearn.feature_extraction.text import CountVectorizer

from sentence_transformers import SentenceTransformer, util
import umap
import hdbscan

from bokeh.io import output_notebook, curdoc, output_file
from bokeh.plotting import figure, show, ColumnDataSource
from bokeh.layouts import gridplot, column, row
from bokeh.models import CustomJS, Slider, Title, LinearColorMapper, ColorBar, Dropdown
from bokeh.palettes import Magma, Inferno, Plasma, Viridis, Cividis
# Route Bokeh output to the notebook so show() renders plots inline.
output_notebook()
# Use Bokeh's dark theme; the embedding scatter plots further down draw
# white points, which assumes a dark background.
curdoc().theme = 'dark_minimal'

## Fetching model from HuggingFace (will yield embeddings with 384 dimensions)

In [2]:
# Identifier of the pretrained sentence-embedding checkpoint to load.
EMBEDDING_MODEL_NAME = 'all-MiniLM-L6-v2'
model = SentenceTransformer(EMBEDDING_MODEL_NAME)

## Reading the tweets data and converting timestamps from EET to US Pacific time

In [3]:
def _eet_to_pacific(raw_timestamps):
    """Parse raw `created_at` value(s), localize them to EET and convert to Pacific time."""
    parsed = pd.to_datetime(raw_timestamps)
    # ambiguous='infer' asks pandas to resolve DST-ambiguous wall-clock times itself.
    localized = parsed.tz_localize('EET', ambiguous='infer')
    return localized.tz_convert('America/Los_Angeles')


# NOTE(review): `date_parser` is deprecated in pandas 2.x; confirm against
# the pandas version this notebook is pinned to before upgrading.
df = pd.read_csv(
    'data/elon/TweetsElonMusk.csv',
    parse_dates=['created_at'],
    date_parser=_eet_to_pacific,
)



## Fetching all English language Tweets

In [4]:
# Keep only the text of tweets tagged as English. A single .loc call avoids
# chained indexing, and reset_index(drop=True) returns a fresh Series with a
# 0..N-1 index instead of mutating a slice of `df` in place.
all_english_tweets = df.loc[df['language'] == 'en', 'tweet'].reset_index(drop=True)
all_english_tweets

0        @vincent13031925 For now. Costs are decreasing...
1                                 Love this beautiful shot
2        @agnostoxxx @CathieDWood @ARKInvest Trust the ...
3                       The art In Cyberpunk is incredible
4        @WholeMarsBlog If you don’t say anything &amp;...
                               ...                        
11133    @eugenelee3 @PPathole @SpaceX @Tesla Yeah, not...
11134    @PPathole @SpaceX @Tesla That was my night job...
11135    @PPathole @SpaceX @Tesla True. Ancient times …...
11136                    @Erdayastronaut @Tesla Absolutely
11137    @Erdayastronaut @Tesla Tesla is building up co...
Name: tweet, Length: 11138, dtype: object

In [5]:
type(all_english_tweets)

pandas.core.series.Series

## Generating embeddings for all tweets

In [6]:
# encode() is documented for a string or a list of strings, so pass a plain
# list instead of a pandas Series (whose integer indexing is label-based and
# only works here because the index was reset to 0..N-1).
embeddings = model.encode(all_english_tweets.tolist(), show_progress_bar=True)

Batches:   0%|          | 0/349 [00:00<?, ?it/s]

#### Sanity Checks

In [7]:
type(embeddings)

numpy.ndarray

In [8]:
len(embeddings)

11138

In [9]:
len(embeddings[0])

384

In [10]:
type(embeddings[0])

numpy.ndarray

## Generating DataFrame of tweets

In [11]:
# Promote the Series of tweet texts to a one-column DataFrame (column "tweet")
# so the 2-D coordinates and cluster labels computed below can be attached as
# extra columns. Note that the name is reused: from here on
# `all_english_tweets` is a DataFrame, not a Series.
all_english_tweets = pd.DataFrame(all_english_tweets)
all_english_tweets

Unnamed: 0,tweet
0,@vincent13031925 For now. Costs are decreasing...
1,Love this beautiful shot
2,@agnostoxxx @CathieDWood @ARKInvest Trust the ...
3,The art In Cyberpunk is incredible
4,@WholeMarsBlog If you don’t say anything &amp;...
...,...
11133,"@eugenelee3 @PPathole @SpaceX @Tesla Yeah, not..."
11134,@PPathole @SpaceX @Tesla That was my night job...
11135,@PPathole @SpaceX @Tesla True. Ancient times …...
11136,@Erdayastronaut @Tesla Absolutely


## Reducing embeddings to 2 dimensions using PCA decomposition

In [12]:
# PCA's 'randomized' and 'arpack' solvers are stochastic; pin the seed so the
# 2-D projection is reproducible whichever solver the 'auto' policy selects.
pca = PCA(n_components=2, random_state=42)

In [13]:
# Fit PCA on the sentence embeddings and project them onto its two
# components; the trailing expression displays the projected array.
pca_result = pca.fit_transform(embeddings)
pca_result

array([[ 0.2573638 ,  0.13831055],
       [-0.11503306, -0.27879494],
       [ 0.31690452, -0.07570227],
       ...,
       [-0.12437396, -0.00871725],
       [ 0.40388852,  0.37416613],
       [-0.1291023 ,  0.46907255]], dtype=float32)

In [14]:
len(pca_result)

11138

In [15]:
print(len(pca_result[:,0]))
pca_result[:,0]

11138


array([ 0.2573638 , -0.11503306,  0.31690452, ..., -0.12437396,
        0.40388852, -0.1291023 ], dtype=float32)

In [16]:
print(len(pca_result[:,1]))
pca_result[:,1]

11138


array([ 0.13831055, -0.27879494, -0.07570227, ..., -0.00871725,
        0.37416613,  0.46907255], dtype=float32)

In [17]:
pca.explained_variance_ratio_

array([0.0799657 , 0.03894202], dtype=float32)

In [18]:
pca.explained_variance_

array([0.06799825, 0.03311406], dtype=float32)

In [19]:
# Store each principal-component axis as its own plotting column.
for axis_idx, column_name in enumerate(("pca_x", "pca_y")):
    all_english_tweets[column_name] = pca_result[:, axis_idx]

In [20]:
all_english_tweets

Unnamed: 0,tweet,pca_x,pca_y
0,@vincent13031925 For now. Costs are decreasing...,0.257364,0.138311
1,Love this beautiful shot,-0.115033,-0.278795
2,@agnostoxxx @CathieDWood @ARKInvest Trust the ...,0.316905,-0.075702
3,The art In Cyberpunk is incredible,-0.150569,-0.191344
4,@WholeMarsBlog If you don’t say anything &amp;...,-0.038624,-0.039384
...,...,...,...
11133,"@eugenelee3 @PPathole @SpaceX @Tesla Yeah, not...",-0.265304,0.074655
11134,@PPathole @SpaceX @Tesla That was my night job...,0.079781,0.200149
11135,@PPathole @SpaceX @Tesla True. Ancient times …...,-0.124374,-0.008717
11136,@Erdayastronaut @Tesla Absolutely,0.403889,0.374166


## Reducing embeddings to 2 dimensions using t-SNE

In [21]:
# t-SNE is stochastic: without a fixed seed every run yields a different
# layout, so the plots below could not be reproduced.
tsne = TSNE(n_components=2, perplexity=50, random_state=42)

In [22]:
# Run t-SNE directly on the full-dimensional embeddings; the result holds one
# 2-D point per tweet.
tsne_result = tsne.fit_transform(embeddings)

In [23]:
tsne.n_iter_

999

In [24]:
len(tsne_result)

11138

In [25]:
# Column 0 of the t-SNE output is the x coordinate, column 1 the y coordinate.
tsne_x_values = tsne_result[:, 0]
tsne_y_values = tsne_result[:, 1]
all_english_tweets["tsne_x"] = tsne_x_values
all_english_tweets["tsne_y"] = tsne_y_values

In [26]:
all_english_tweets

Unnamed: 0,tweet,pca_x,pca_y,tsne_x,tsne_y
0,@vincent13031925 For now. Costs are decreasing...,0.257364,0.138311,31.621279,-14.537434
1,Love this beautiful shot,-0.115033,-0.278795,7.037644,-26.465370
2,@agnostoxxx @CathieDWood @ARKInvest Trust the ...,0.316905,-0.075702,33.125565,-33.858826
3,The art In Cyberpunk is incredible,-0.150569,-0.191344,-22.708525,6.065120
4,@WholeMarsBlog If you don’t say anything &amp;...,-0.038624,-0.039384,-24.977428,21.508242
...,...,...,...,...,...
11133,"@eugenelee3 @PPathole @SpaceX @Tesla Yeah, not...",-0.265304,0.074655,-29.497694,61.767452
11134,@PPathole @SpaceX @Tesla That was my night job...,0.079781,0.200149,15.589565,30.932650
11135,@PPathole @SpaceX @Tesla True. Ancient times …...,-0.124374,-0.008717,-15.166372,8.459781
11136,@Erdayastronaut @Tesla Absolutely,0.403889,0.374166,20.994228,17.985449


## Reducing embeddings to 2 dimensions using UMAP

In [27]:
# UMAP is stochastic; fix the seed so the 2-D layout is reproducible.
# (umap-learn trades some parallelism for determinism when a seed is set.)
umap_mod = umap.UMAP(
    n_neighbors=50,
    n_components=2,
    min_dist=0.0,
    metric='cosine',
    random_state=42,
)

In [28]:
# Fit the 2-component UMAP model on the embeddings and keep the projected
# coordinates (one row per tweet).
umap_result = umap_mod.fit_transform(embeddings)

In [29]:
umap_result

array([[11.422199 ,  8.478857 ],
       [11.630537 ,  5.93848  ],
       [12.146359 ,  8.303116 ],
       ...,
       [ 9.245365 ,  7.187661 ],
       [10.823826 ,  9.525682 ],
       [ 8.427513 ,  8.5744915]], dtype=float32)

In [30]:
# Attach both UMAP axes to the tweet table for plotting.
for axis_idx, column_name in enumerate(("umap_x", "umap_y")):
    all_english_tweets[column_name] = umap_result[:, axis_idx]

In [31]:
all_english_tweets

Unnamed: 0,tweet,pca_x,pca_y,tsne_x,tsne_y,umap_x,umap_y
0,@vincent13031925 For now. Costs are decreasing...,0.257364,0.138311,31.621279,-14.537434,11.422199,8.478857
1,Love this beautiful shot,-0.115033,-0.278795,7.037644,-26.465370,11.630537,5.938480
2,@agnostoxxx @CathieDWood @ARKInvest Trust the ...,0.316905,-0.075702,33.125565,-33.858826,12.146359,8.303116
3,The art In Cyberpunk is incredible,-0.150569,-0.191344,-22.708525,6.065120,8.981881,6.478120
4,@WholeMarsBlog If you don’t say anything &amp;...,-0.038624,-0.039384,-24.977428,21.508242,8.670399,7.335625
...,...,...,...,...,...,...,...
11133,"@eugenelee3 @PPathole @SpaceX @Tesla Yeah, not...",-0.265304,0.074655,-29.497694,61.767452,7.579733,8.462407
11134,@PPathole @SpaceX @Tesla That was my night job...,0.079781,0.200149,15.589565,30.932650,9.954867,9.520404
11135,@PPathole @SpaceX @Tesla True. Ancient times …...,-0.124374,-0.008717,-15.166372,8.459781,9.245365,7.187661
11136,@Erdayastronaut @Tesla Absolutely,0.403889,0.374166,20.994228,17.985449,10.823826,9.525682


## Clustering

## HDBSCAN

In [32]:
# Reduce the embeddings to 5 dimensions before density-based clustering.
# NOTE: despite its name, this variable holds the reduced array returned by
# fit_transform, not the UMAP model itself.
# The seed is fixed because UMAP is stochastic and the HDBSCAN labels derived
# from this array would otherwise change on every run.
umap_mod_for_clustering = umap.UMAP(
    n_neighbors=15,
    n_components=5,
    metric='cosine',
    random_state=42,
).fit_transform(embeddings)

In [33]:
# Cluster the 5-D UMAP output; groups smaller than 25 points are not
# accepted as clusters.
hdbscan_model = hdbscan.HDBSCAN(min_cluster_size=25)
hdbscan_clustering = hdbscan_model.fit(umap_mod_for_clustering)

In [34]:
len(hdbscan_clustering.labels_)

11138

In [35]:
set(hdbscan_clustering.labels_)

{-1,
 0,
 1,
 2,
 3,
 4,
 5,
 6,
 7,
 8,
 9,
 10,
 11,
 12,
 13,
 14,
 15,
 16,
 17,
 18,
 19,
 20,
 21,
 22,
 23,
 24,
 25,
 26,
 27,
 28,
 29,
 30,
 31,
 32,
 33,
 34,
 35,
 36,
 37,
 38,
 39,
 40,
 41,
 42,
 43,
 44,
 45,
 46,
 47}

In [36]:
# One HDBSCAN label per tweet. The label -1 marks tweets that were not
# assigned to any cluster (they are plotted as "unclustered" further down).
all_english_tweets["hdbscan_cluster_labels"] = hdbscan_clustering.labels_
all_english_tweets

Unnamed: 0,tweet,pca_x,pca_y,tsne_x,tsne_y,umap_x,umap_y,hdbscan_cluster_labels
0,@vincent13031925 For now. Costs are decreasing...,0.257364,0.138311,31.621279,-14.537434,11.422199,8.478857,-1
1,Love this beautiful shot,-0.115033,-0.278795,7.037644,-26.465370,11.630537,5.938480,-1
2,@agnostoxxx @CathieDWood @ARKInvest Trust the ...,0.316905,-0.075702,33.125565,-33.858826,12.146359,8.303116,-1
3,The art In Cyberpunk is incredible,-0.150569,-0.191344,-22.708525,6.065120,8.981881,6.478120,-1
4,@WholeMarsBlog If you don’t say anything &amp;...,-0.038624,-0.039384,-24.977428,21.508242,8.670399,7.335625,43
...,...,...,...,...,...,...,...,...
11133,"@eugenelee3 @PPathole @SpaceX @Tesla Yeah, not...",-0.265304,0.074655,-29.497694,61.767452,7.579733,8.462407,-1
11134,@PPathole @SpaceX @Tesla That was my night job...,0.079781,0.200149,15.589565,30.932650,9.954867,9.520404,35
11135,@PPathole @SpaceX @Tesla True. Ancient times …...,-0.124374,-0.008717,-15.166372,8.459781,9.245365,7.187661,-1
11136,@Erdayastronaut @Tesla Absolutely,0.403889,0.374166,20.994228,17.985449,10.823826,9.525682,35


## K-Means

In [37]:
# L2-normalise the embeddings so that Euclidean K-Means ranks pairs the same
# way cosine similarity does.
normalized_embeddings = preprocessing.normalize(embeddings)
# K-Means depends on random centroid initialisation: fix the seed for
# reproducible labels, and set n_init explicitly because its default differs
# between scikit-learn versions.
kmeans_clustering = KMeans(n_clusters=51, n_init=10, random_state=42).fit(normalized_embeddings)



In [38]:
len(kmeans_clustering.labels_)

11138

In [39]:
set(kmeans_clustering.labels_)

{0,
 1,
 2,
 3,
 4,
 5,
 6,
 7,
 8,
 9,
 10,
 11,
 12,
 13,
 14,
 15,
 16,
 17,
 18,
 19,
 20,
 21,
 22,
 23,
 24,
 25,
 26,
 27,
 28,
 29,
 30,
 31,
 32,
 33,
 34,
 35,
 36,
 37,
 38,
 39,
 40,
 41,
 42,
 43,
 44,
 45,
 46,
 47,
 48,
 49,
 50}

In [40]:
# One K-Means label per tweet, in the same row order as the embeddings the
# model was fitted on.
all_english_tweets["kmeans_cluster_labels"] = kmeans_clustering.labels_
all_english_tweets

Unnamed: 0,tweet,pca_x,pca_y,tsne_x,tsne_y,umap_x,umap_y,hdbscan_cluster_labels,kmeans_cluster_labels
0,@vincent13031925 For now. Costs are decreasing...,0.257364,0.138311,31.621279,-14.537434,11.422199,8.478857,-1,8
1,Love this beautiful shot,-0.115033,-0.278795,7.037644,-26.465370,11.630537,5.938480,-1,25
2,@agnostoxxx @CathieDWood @ARKInvest Trust the ...,0.316905,-0.075702,33.125565,-33.858826,12.146359,8.303116,-1,8
3,The art In Cyberpunk is incredible,-0.150569,-0.191344,-22.708525,6.065120,8.981881,6.478120,-1,43
4,@WholeMarsBlog If you don’t say anything &amp;...,-0.038624,-0.039384,-24.977428,21.508242,8.670399,7.335625,43,35
...,...,...,...,...,...,...,...,...,...
11133,"@eugenelee3 @PPathole @SpaceX @Tesla Yeah, not...",-0.265304,0.074655,-29.497694,61.767452,7.579733,8.462407,-1,13
11134,@PPathole @SpaceX @Tesla That was my night job...,0.079781,0.200149,15.589565,30.932650,9.954867,9.520404,35,1
11135,@PPathole @SpaceX @Tesla True. Ancient times …...,-0.124374,-0.008717,-15.166372,8.459781,9.245365,7.187661,-1,13
11136,@Erdayastronaut @Tesla Absolutely,0.403889,0.374166,20.994228,17.985449,10.823826,9.525682,35,27


In [41]:
# Persist the tweets together with their 2-D coordinates and cluster labels
# (the row index is written as the first, unnamed column).
CLUSTERED_TWEETS_CSV = "tweet_2d_map_and_clusters.csv"
all_english_tweets.to_csv(CLUSTERED_TWEETS_CSV)

## Topic Extraction (code adapted from Maarten Grootendorst's blog post "Topic Modeling with BERT": https://towardsdatascience.com/topic-modeling-with-bert-779f7db187e6)

In [42]:
def c_tf_idf(documents, m, ngram_range=(1, 1)):
    """Compute class-based TF-IDF scores for a set of merged class documents.

    Args:
        documents: Sequence of strings, one per class (all texts of a class
            joined into a single document).
        m: Number used as the numerator of the idf term; the caller in this
            notebook passes the total count of un-merged tweets.
        ngram_range: Passed through to ``CountVectorizer``.

    Returns:
        A ``(tf_idf, count)`` tuple: ``tf_idf`` is an array of shape
        ``(n_terms, n_classes)`` and ``count`` is the fitted vectorizer.
    """
    count = CountVectorizer(ngram_range=ngram_range, stop_words="english").fit(documents)
    t = count.transform(documents).toarray()
    # Total number of counted tokens in each class document.
    w = t.sum(axis=1)
    # A class document can end up with zero tokens (e.g. only stop words).
    # Dividing by that zero would fill its column with NaN, which argsort
    # then ranks as the "top" words; give such classes a term frequency of 0.
    tf = np.divide(t.T, w, out=np.zeros(t.T.shape, dtype=float), where=w != 0)
    # Frequency of each term summed over all classes; never zero because the
    # vocabulary is fitted on these same documents.
    sum_t = t.sum(axis=0)
    idf = np.log(np.divide(m, sum_t)).reshape(-1, 1)
    tf_idf = np.multiply(tf, idf)

    return tf_idf, count

def extract_top_n_words_per_topic(tf_idf, count, docs_per_topic, label_column_name, n=20):
    """Return the ``n`` highest-scoring words of every topic.

    Args:
        tf_idf: Score array of shape ``(n_terms, n_topics)``.
        count: Fitted vectorizer exposing ``get_feature_names_out()``.
        docs_per_topic: DataFrame with one row per topic, in the same order
            as the columns of ``tf_idf``.
        label_column_name: Column of ``docs_per_topic`` holding topic labels.
        n: How many words to keep per topic.

    Returns:
        Dict mapping each topic label to a list of ``(word, score)`` tuples
        ordered from highest to lowest score.
    """
    vocabulary = count.get_feature_names_out()
    scores_by_topic = tf_idf.T
    # argsort is ascending, so the last n columns index the n largest scores.
    best_columns = scores_by_topic.argsort()[:, -n:]

    top_n_words = {}
    for row, label in enumerate(list(docs_per_topic[label_column_name])):
        topic_scores = scores_by_topic[row]
        # Reverse so the strongest word comes first.
        descending = best_columns[row][::-1]
        top_n_words[label] = [(vocabulary[col], topic_scores[col]) for col in descending]
    return top_n_words

def extract_topic_sizes(df, label_column_name):
    """Count the tweets of each topic, largest topic first.

    Args:
        df: DataFrame with a ``tweet`` column and the label column.
        label_column_name: Column holding each row's topic label.

    Returns:
        DataFrame with the label column and a ``Size`` column (number of
        non-null tweets), sorted by ``Size`` in descending order.
    """
    tweet_counts = df.groupby([label_column_name])["tweet"].count()
    size_table = tweet_counts.reset_index().rename(columns={"tweet": "Size"})
    return size_table.sort_values("Size", ascending=False)

In [43]:
# Merge all tweets of a K-Means cluster into one space-separated document.
kmeans_groups = all_english_tweets.groupby(['kmeans_cluster_labels'], as_index=False)
tweets_per_kmeans_cluster = kmeans_groups.agg({'tweet': ' '.join})
tweets_per_kmeans_cluster

Unnamed: 0,kmeans_cluster_labels,tweet
0,0,@CathieDWood @wintonARK @ARKInvest What do you...
1,1,@Teslarati @ResidentSponge Special mention of ...
2,2,"@Erdayastronaut Yes, Booster 1 is a production..."
3,3,@cleantechnica Congrats to NIO. That is a toug...
4,4,A monkey is literally playing a video game tel...
5,5,@TechAmazing @UniverCurious Tunnels!! Under Ve...
6,6,@DeltavPhotos @PortCanaveral That rocket is a ...
7,7,@SpaceX Looks like engine 2 had issues on asce...
8,8,@vincent13031925 For now. Costs are decreasing...
9,9,@tesla_adri @WholeMarsBlog These things are be...


In [44]:
# Merge all tweets of an HDBSCAN cluster (including the -1 group) into one
# space-separated document.
hdbscan_groups = all_english_tweets.groupby(['hdbscan_cluster_labels'], as_index=False)
tweets_per_hdbscan_cluster = hdbscan_groups.agg({'tweet': ' '.join})

In [45]:
tweets_per_hdbscan_cluster

Unnamed: 0,hdbscan_cluster_labels,tweet
0,-1,@vincent13031925 For now. Costs are decreasing...
1,0,I just want to start a flame in your heart @Te...
2,1,"@ID_AA_Carmack Some kind of ELO level, updated..."
3,2,Nicheman — his superpower is appealing to smal...
4,3,The device is implanted flush with skull &amp;...
5,4,@EvaFoxU We will have special colors for new R...
6,5,"@engineers_feed Ultimately, yes @engineers_fee..."
7,6,"@teslaownerssv @AstroJordy @flcnhvy No, just a..."
8,7,@PPathole @Supernova_Style New York seeing sam...
9,8,@cleantechnica So crazy to subsidize pollution...


In [46]:
# Score words per HDBSCAN cluster: one merged document per cluster, with the
# total number of individual tweets as the idf numerator.
hdbscan_cluster_docs = tweets_per_hdbscan_cluster.tweet.values
total_tweet_count = len(all_english_tweets)
tf_idf, count = c_tf_idf(hdbscan_cluster_docs, m=total_tweet_count)

In [47]:
len(tweets_per_hdbscan_cluster.tweet.values)

49

In [48]:
len(all_english_tweets)

11138

In [49]:
# Top 20 words per HDBSCAN cluster, keyed by cluster label.
hdbscan_cluster_top_n_words = extract_top_n_words_per_topic(
    tf_idf=tf_idf,
    count=count,
    docs_per_topic=tweets_per_hdbscan_cluster,
    label_column_name="hdbscan_cluster_labels",
    n=20,
)
# Number of tweets in each HDBSCAN cluster, largest first.
hdbscan_cluster_sizes = extract_topic_sizes(
    df=all_english_tweets,
    label_column_name="hdbscan_cluster_labels",
)

hdbscan_cluster_sizes

Unnamed: 0,hdbscan_cluster_labels,Size
0,-1,6135
48,47,714
23,22,501
36,35,491
28,27,294
45,44,207
38,37,193
31,30,187
22,21,185
44,43,151


In [50]:
hdbscan_cluster_top_n_words[0][:10]

[('flamethrower', 0.3585438910249149),
 ('hat', 0.2289633403805179),
 ('flamethrowers', 0.14089881114296404),
 ('boring', 0.13910192912552036),
 ('hats', 0.10987384664119064),
 ('sold', 0.08312909141559141),
 ('https', 0.07301097605933168),
 ('extinguisher', 0.05999639179183579),
 ('company', 0.05662626362576091),
 ('apocalypse', 0.05626773760376287)]

In [52]:
hdbscan_cluster_top_n_words[20][:10]

[('regulatory', 0.23514035067923655),
 ('approval', 0.21642933664743327),
 ('approvals', 0.18791605904880987),
 ('regulators', 0.14475233526994194),
 ('eu', 0.12202163061360023),
 ('authorities', 0.09536910785910796),
 ('time', 0.08045502586250547),
 ('working', 0.07989162281591111),
 ('formal', 0.07700866771062204),
 ('takes', 0.07680266552268086)]

In [53]:
hdbscan_cluster_top_n_words[25][:10]

[('orbit', 0.09908680818799585),
 ('payload', 0.08468055153059553),
 ('starship', 0.07451965930991951),
 ('rocket', 0.07435135725388031),
 ('cost', 0.07020895420213157),
 ('earth', 0.06651963628253296),
 ('tons', 0.06562785989343209),
 ('rockets', 0.06538746381092915),
 ('orbital', 0.05496947293521482),
 ('sciguyspace', 0.0492441686785702)]

In [55]:
hdbscan_cluster_top_n_words[45][:10]

[('safety', 0.2013396167444198),
 ('nhtsa', 0.10763831111479223),
 ('crash', 0.08044305992108411),
 ('probability', 0.06758388207577842),
 ('car', 0.06485450458251686),
 ('tesla', 0.06384901148928704),
 ('model', 0.06197363723907851),
 ('safest', 0.053819155557396116),
 ('injury', 0.05145146982417281),
 ('accident', 0.05145146982417281)]

In [56]:
hdbscan_cluster_top_n_words[40][:10]

[('supercharger', 0.26018061621478783),
 ('superchargers', 0.13439731318254602),
 ('tesla', 0.06668528709386996),
 ('network', 0.0605444636970077),
 ('battery', 0.050476862455142456),
 ('charge', 0.05014793125822334),
 ('v3', 0.04915979362850082),
 ('la', 0.04729787060927834),
 ('year', 0.04601277663822443),
 ('power', 0.041907128460748316)]

In [57]:
# NOTE(review): this cell is disabled. If re-enabled as written it would pair
# the `tf_idf`/`count` computed from `tweets_per_hdbscan_cluster` with the
# K-Means cluster table, so rows and labels would not line up. Recompute
# them for K-Means first, e.g.:
# tf_idf, count = c_tf_idf(tweets_per_kmeans_cluster.tweet.values, m=len(all_english_tweets))
# kmeans_cluster_top_n_words = extract_top_n_words_per_topic(tf_idf, count, tweets_per_kmeans_cluster, "kmeans_cluster_labels", n=20)
# kmeans_cluster_sizes = extract_topic_sizes(all_english_tweets, "kmeans_cluster_labels")

# kmeans_cluster_sizes

## Visualizing embeddings in 2-D

In [58]:
# output_file('dr_algo_vis.html')
source = ColumnDataSource(data=all_english_tweets)
TOOLTIPS = [
    ("(x,y)", "($x, $y)"),
    ("Tweet", "@tweet")
]
slider = Slider(start=1, end=6, value=1, step=1, title="Point Size", align="center")
point_color = "white"


def _embedding_scatter(method_name, x_col, y_col, data_source,
                       border_top=None, border_bottom=None, header=None):
    """Build one 500x500 scatter plot of a 2-D projection of the embeddings.

    Args:
        method_name: Name of the reduction method, used in the plot title.
        x_col, y_col: Columns of ``data_source`` holding the coordinates.
        data_source: ColumnDataSource shared by all plots.
        border_top, border_bottom: Optional minimum border sizes in pixels.
        header: Optional bold heading placed above the plot.

    Returns:
        ``(figure, renderer)``; the renderer's glyph exposes the ``size``
        property that the slider is linked to.
    """
    fig = figure(
        title=f"Reducing embeddings to 2-D via {method_name}",
        title_location="below",
        width=500,
        height=500,
        tooltips=TOOLTIPS,
    )
    fig.title.align = 'center'
    if header is not None:
        fig.add_layout(
            Title(text=header, text_font_style="bold", text_font_size="16pt", align="center"),
            'above',
        )
    # scatter() replaces circle(size=...), which newer Bokeh releases deprecate.
    points = fig.scatter(x_col, y_col, size=1, source=data_source, color=point_color, alpha=0.5)
    if border_bottom is not None:
        fig.min_border_bottom = border_bottom
    if border_top is not None:
        fig.min_border_top = border_top
    fig.xgrid.grid_line_color = None
    fig.ygrid.grid_line_color = None
    fig.xaxis.major_label_text_font_size = '6pt'
    fig.yaxis.major_label_text_font_size = '6pt'
    return fig, points


# The border settings mirror the stacked layout: the top plot carries the
# shared heading, the lower two leave room above themselves.
pca_2d, pca_2d_points = _embedding_scatter(
    "PCA", "pca_x", "pca_y", source,
    border_bottom=70, header="Reducing embeddings to 2-D",
)
tsne_2d, tsne_2d_points = _embedding_scatter(
    "t-SNE", "tsne_x", "tsne_y", source,
    border_top=30, border_bottom=70,
)
umap_2d, umap_2d_points = _embedding_scatter(
    "UMAP", "umap_x", "umap_y", source,
    border_top=30,
)

# One slider drives the marker size of all three plots (client-side link).
for linked_points in (pca_2d_points, tsne_2d_points, umap_2d_points):
    slider.js_link('value', linked_points.glyph, 'size')

grid = gridplot([[slider], [pca_2d], [tsne_2d], [umap_2d]], merge_tools=False)

show(grid)

In [59]:
all_english_tweets

Unnamed: 0,tweet,pca_x,pca_y,tsne_x,tsne_y,umap_x,umap_y,hdbscan_cluster_labels,kmeans_cluster_labels
0,@vincent13031925 For now. Costs are decreasing...,0.257364,0.138311,31.621279,-14.537434,11.422199,8.478857,-1,8
1,Love this beautiful shot,-0.115033,-0.278795,7.037644,-26.465370,11.630537,5.938480,-1,25
2,@agnostoxxx @CathieDWood @ARKInvest Trust the ...,0.316905,-0.075702,33.125565,-33.858826,12.146359,8.303116,-1,8
3,The art In Cyberpunk is incredible,-0.150569,-0.191344,-22.708525,6.065120,8.981881,6.478120,-1,43
4,@WholeMarsBlog If you don’t say anything &amp;...,-0.038624,-0.039384,-24.977428,21.508242,8.670399,7.335625,43,35
...,...,...,...,...,...,...,...,...,...
11133,"@eugenelee3 @PPathole @SpaceX @Tesla Yeah, not...",-0.265304,0.074655,-29.497694,61.767452,7.579733,8.462407,-1,13
11134,@PPathole @SpaceX @Tesla That was my night job...,0.079781,0.200149,15.589565,30.932650,9.954867,9.520404,35,1
11135,@PPathole @SpaceX @Tesla True. Ancient times …...,-0.124374,-0.008717,-15.166372,8.459781,9.245365,7.187661,-1,13
11136,@Erdayastronaut @Tesla Absolutely,0.403889,0.374166,20.994228,17.985449,10.823826,9.525682,35,27


## Visualizing Clusters in 2-D

In [60]:
source = ColumnDataSource(data=all_english_tweets)
# HDBSCAN label -1 marks tweets left out of every cluster; they are drawn
# separately as faint grey points.
is_unclustered = all_english_tweets['hdbscan_cluster_labels'] == -1
source_with_unclustered_hdbscan = ColumnDataSource(data=all_english_tweets[is_unclustered])
source_with_clustered_hdbscan = ColumnDataSource(data=all_english_tweets[~is_unclustered])
KMEANS_TOOLTIPS = [
    ("(x,y)", "($x, $y)"),
    ("Tweet", "@tweet"),
    ("Cluster", "@kmeans_cluster_labels")
]
HDBSCAN_TOOLTIPS = [
    ("(x,y)", "($x, $y)"),
    ("Tweet", "@tweet"),
    ("Cluster", "@hdbscan_cluster_labels")
]
slider = Slider(start=4, end=6, value=5, step=1, title="Point Size", align="start")
# Cluster labels are mapped linearly onto the palette over the range 0..50.
exp_cmap = LinearColorMapper(palette=Inferno[256], low=0, high=50)
bar = ColorBar(color_mapper=exp_cmap, location=(0,0), width=5)


def _cluster_scatter(title, x_col, y_col, label_col, tooltips, clustered_source,
                     unclustered_source=None, header=None, header_align="center"):
    """Build one 600x600 scatter plot coloured by cluster label.

    Args:
        title: Caption shown below the plot.
        x_col, y_col: Coordinate columns in the data sources.
        label_col: Column whose value is mapped to a colour via ``exp_cmap``.
        tooltips: Hover tooltip specification.
        clustered_source: Data source for the coloured points.
        unclustered_source: Optional data source drawn first as faint grey
            points underneath the coloured ones.
        header: Optional bold heading placed above the plot.
        header_align: Alignment of that heading.

    Returns:
        ``(figure, clustered_renderer, unclustered_renderer)``; the last item
        is ``None`` when no ``unclustered_source`` is given.
    """
    fig = figure(title=title, title_location="below", width=600, height=600, tooltips=tooltips)
    fig.title.align = 'center'
    if header is not None:
        fig.add_layout(
            Title(text=header, text_font_style="bold", text_font_size="20pt", align=header_align),
            'above',
        )
    # scatter() replaces circle(size=...), which newer Bokeh releases deprecate.
    unclustered_points = None
    if unclustered_source is not None:
        unclustered_points = fig.scatter(
            x_col, y_col, size=4, source=unclustered_source, alpha=0.07, color='grey',
        )
    clustered_points = fig.scatter(
        x_col, y_col, size=5, source=clustered_source, alpha=0.5,
        color={"field": label_col, "transform": exp_cmap},
    )
    fig.min_border_bottom = 50
    fig.min_border_top = 30
    fig.xgrid.grid_line_color = None
    fig.ygrid.grid_line_color = None
    # The axis values of t-SNE/UMAP layouts carry no meaning, so hide the tick labels.
    fig.xaxis.major_label_text_font_size = '0pt'
    fig.yaxis.major_label_text_font_size = '0pt'
    fig.add_layout(bar, "left")
    return fig, clustered_points, unclustered_points


# K MEANS CLUSTERING
tsne_2d_kmeans, tsne_2d_points_kmeans = _cluster_scatter(
    "Representing K-MEANS Clustering in 2-D via t-SNE",
    "tsne_x", "tsne_y", "kmeans_cluster_labels", KMEANS_TOOLTIPS, source,
    header="Clustering Embeddings:", header_align="right",
)[:2]
umap_2d_kmeans, umap_2d_points_kmeans = _cluster_scatter(
    "Representing K-MEANS Clustering in 2-D via UMAP",
    "umap_x", "umap_y", "kmeans_cluster_labels", KMEANS_TOOLTIPS, source,
)[:2]

# HDBSCAN CLUSTERING
# The two headings (right-aligned on the left plot, left-aligned on the right
# plot) read as one line across the top row of the grid.
tsne_2d_hdbscan, tsne_2d_points_hdbscan, tsne_2d_points_hdbscan_unclustered = _cluster_scatter(
    "Representing HDBSCAN Clustering in 2-D via t-SNE",
    "tsne_x", "tsne_y", "hdbscan_cluster_labels", HDBSCAN_TOOLTIPS,
    source_with_clustered_hdbscan,
    unclustered_source=source_with_unclustered_hdbscan,
    header="K-MEANS vs HDBSCAN", header_align="left",
)
umap_2d_hdbscan, umap_2d_points_hdbscan, umap_2d_points_hdbscan_unclustered = _cluster_scatter(
    "Representing HDBSCAN Clustering in 2-D via UMAP",
    "umap_x", "umap_y", "hdbscan_cluster_labels", HDBSCAN_TOOLTIPS,
    source_with_clustered_hdbscan,
    unclustered_source=source_with_unclustered_hdbscan,
)

# The slider resizes only the coloured (clustered) points; the grey
# unclustered points keep their fixed size.
for linked_points in (
    tsne_2d_points_kmeans,
    umap_2d_points_kmeans,
    tsne_2d_points_hdbscan,
    umap_2d_points_hdbscan,
):
    slider.js_link('value', linked_points.glyph, 'size')

grid = gridplot([[slider], [tsne_2d_kmeans, tsne_2d_hdbscan], [umap_2d_kmeans, umap_2d_hdbscan]], merge_tools=False)

show(grid)