In [None]:
# Print the path of the Python interpreter backing this kernel; useful for
# checking which environment the packages installed below end up in.
import sys
print(sys.executable)

In [None]:
pip install tensorflow tensorflow_hub scikit-learn seaborn plotly nbformat

In [None]:
import tensorflow as tf
import tensorflow_hub as hub
import numpy as np

In [None]:
# Load version 4 of the Universal Sentence Encoder module from TF Hub.
# The returned object is called directly on a list of strings in the cells below.
embed = hub.load("https://tfhub.dev/google/universal-sentence-encoder/4")

In [None]:
# Inspect what kind of object hub.load returned.
type(embed)

In [None]:
# Get the embedding of a single word. The encoder is called with a list of
# strings, so the word is wrapped in a one-element list.
embed(["apple"])

In [None]:
# Sample vocabulary: five pairs of related words (royalty, media, vehicles,
# cities, bodies of water).
words = ['king', 'queen', 'radio', 'TV', 'bike', 'car', 'Boston', 'London', 'lake', 'river']

# Embed all words in one call and display the shape of the result.
embeddings = embed(words)
embeddings.shape

In [None]:
from numpy.linalg import norm

def cosineSimilarity(vec1, vec2):
    """Calculate the cosine similarity between two vectors.

    Args:
        vec1: First vector (anything ``np.asarray`` can convert).
        vec2: Second vector, of the same length as ``vec1``.

    Returns:
        The dot product of the two vectors divided by the product of their
        norms. When either vector has zero norm the similarity is undefined;
        0.0 is returned in that case instead of NaN.
    """
    v1 = np.asarray(vec1)
    v2 = np.asarray(vec2)
    denominator = norm(v1) * norm(v2)
    if denominator == 0:
        # A zero vector has no direction; dividing would give 0/0, i.e. NaN
        # plus a RuntimeWarning, which would poison the similarity matrix.
        return 0.0
    return np.dot(v1, v2) / denominator

In [None]:
def pairwiseSimilarity(embeddings):
    """Build the full cosine-similarity matrix for a collection of embeddings.

    Every embedding is compared with every embedding (itself included), so the
    result has one row and one column per input vector.

    Returns:
        A list of lists in which entry [i][j] is the cosine similarity between
        the i-th and the j-th embedding.
    """
    return [
        [cosineSimilarity(rowVec, colVec) for colVec in embeddings]
        for rowVec in embeddings
    ]
    
# Compute the similarity matrix for the sample words and print the raw values.
simMatrix = pairwiseSimilarity(embeddings)
print(simMatrix)

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt

def drawHeatmap(labels, simMtrx, plotTitle):
    """Render a similarity matrix as a labelled heatmap and show it.

    Args:
        labels: Tick labels, used for both the x and the y axis.
        simMtrx: Matrix of cosine-similarity values to plot.
        plotTitle: Title displayed above the heatmap.

    The colour scale is fixed to the range 0..1.
    """
    sns.set(font_scale=0.9)
    axes = sns.heatmap(
        simMtrx,
        xticklabels=labels,
        yticklabels=labels,
        vmin=0,
        vmax=1,
        cmap="YlOrRd",
    )
    # turn the x labels vertical
    axes.set_xticklabels(labels, rotation=90)
    axes.set_title(plotTitle, fontsize=14)
    plt.show()

In [None]:
# Visualize how similar the sample words are to one another.
drawHeatmap(words, simMatrix, "Similarity for Word Embeddings")

In [None]:
def elbowMethod(embeddings, maxK):
    """Plot the elbow curve used to choose the number of clusters k.

    For every k from 1 up to, but not including, ``maxK`` a KMeans model is
    fitted on ``embeddings`` and its ``inertia_`` (plotted as the sum of
    squared distances) is recorded. The values are then drawn against k so
    the "elbow" can be read off the chart.

    Args:
        embeddings: Data to cluster, passed straight to ``KMeans.fit``.
        maxK: Exclusive upper bound for k (``maxK=20`` tries k = 1..19).

    NOTE(review): KMeans is not imported in the visible cells; presumably it
    is sklearn.cluster.KMeans imported elsewhere -- confirm.
    """
    kValues = list(range(1, maxK))
    # one fitted model per candidate k; only its inertia is kept
    sumSquaredDistances = [
        KMeans(n_clusters=numClusters, random_state=42).fit(embeddings).inertia_
        for numClusters in kValues
    ]

    # draw the curve so the elbow can be spotted visually
    plt.plot(kValues, sumSquaredDistances, 'ro-')
    plt.title('Elbow Method For Optimal k')
    plt.xlabel('k')
    plt.ylabel('Sum of squared distances')
    plt.xticks(kValues)
    plt.show()

In [None]:
# Try k = 1..19 on the news embeddings (elbowMethod treats its upper bound as exclusive).
# NOTE(review): newsEmbed is not defined in the visible cells -- confirm it is created earlier.
elbowMethod(newsEmbed, 20)

In [None]:
# Cluster the news embeddings into k groups.
# NOTE(review): k = 3 is presumably read off the elbow plot above -- confirm.
k = 3
kmeans = KMeans(n_clusters=k, random_state=42)
# fit the model and get the cluster index assigned to each embedding
clusters = kmeans.fit_predict(newsEmbed)

# peek at the first ten cluster assignments
clusters[:10]

In [None]:
# Count how many items were assigned to each cluster.
from collections import Counter
Counter(clusters)

In [None]:
# Project the news embeddings down to two components with t-SNE so they can
# be drawn on a 2-D scatter plot.
from sklearn.manifold import TSNE

# fixed random_state, presumably so the projection is repeatable across runs
tsne = TSNE(n_components=2, random_state=42)  
tsne_results = tsne.fit_transform(newsEmbed)

In [None]:
import pandas as pd

# One row per item: its two t-SNE coordinates, its hashtag text and the index
# of the cluster it was assigned to.
df = pd.DataFrame(tsne_results, columns=['tsne_1', 'tsne_2']).assign(
    hashtag=news,
    cluster=clusters,
)
df.head()

In [None]:
import plotly.express as px

# Scatter the 2-D t-SNE coordinates, label every point with its hashtag and
# colour it by cluster index; then style the marker text (small font, placed
# above each point) and set the title and size of the figure.
fig = (
    px.scatter(
        df,
        x='tsne_1',
        y='tsne_2',
        text='hashtag',
        color="cluster",
        color_continuous_scale="BlueRed",
    )
    .update_traces(
        textposition='top center',
        mode='markers+text',
        textfont=dict(size=6),
    )
    .update_layout(
        title='Embeddings of TikTok News Hashtags',
        width=800,
        height=800,
    )
)
fig.show()
