In [1]:
import plotly.graph_objects as go
import matplotlib.pyplot as plt
import plotly.express as px
import plotly.io as pio
import seaborn as sns
import pandas as pd
import numpy as np
import random
import json

from sklearn.metrics import silhouette_score
from sklearn.decomposition import PCA
from sklearn.cluster import KMeans
from sklearn import preprocessing

In [2]:
pio.templates.default = "plotly_dark"

In [3]:
original = list(json.loads(open("./res.json").read()).values())

In [4]:
df = pd.read_json("./res.json")
df = df.transpose()

In [5]:
df.head(1)

Unnamed: 0,title,href,desc,likes,duration,channelName,subCount,dateAndViews,embedded
In Rainbows but its just my voice,In Rainbows but its just my voice,https://youtube.com/watch?v=Fno_k0VZtKs,my favorite is nude\n\n0:00 3x5 Stride\n3:54 C...,6.6K,41:12,giideon,1.22K subscribers,"82,457 views 25 Mar 2023","[0.012158858589828, -0.009881187230348, -0.016..."


In [6]:
df = df.reset_index()

In [7]:
df = df.drop("index", axis=1)

In [8]:
kmeans = KMeans(n_clusters=20)

In [9]:
dense_vectors = np.array([np.array(item) for item in df["embedded"].to_numpy()])

In [10]:
kmeans.fit(dense_vectors)

In [11]:
cluster_labels = kmeans.labels_

In [12]:
print("Cluster assignments:", cluster_labels)

Cluster assignments: [ 2  2  2 ... 15 17 15]


In [13]:
silhouette_score(dense_vectors, cluster_labels, metric="euclidean")

0.017074417956952116

In [14]:
pca = PCA(n_components=2)
reduced_vectors = pca.fit_transform(dense_vectors)

unique_labels = np.unique(cluster_labels)

df = pd.DataFrame(
    {
        "PCA Component 1": reduced_vectors[:, 0],
        "PCA Component 2": reduced_vectors[:, 1],
        "Cluster": [f"Cluster {label}" for label in cluster_labels],
    }
)

fig = px.scatter(
    df,
    x="PCA Component 1",
    y="PCA Component 2",
    color="Cluster",
    color_discrete_sequence=px.colors.qualitative.Plotly,
    title="K-means Clustering Visualization (PCA)",
)

fig.update_layout(
    xaxis_title="PCA Component 1",
    yaxis_title="PCA Component 2",
    legend_title="Cluster",
    width=1000,
    height=1000,
)

fig.show()

In [15]:
num_samples = 10
sampled_data = {}

for label, data in zip(cluster_labels, original):
    if label not in sampled_data:
        sampled_data[label] = []
    if len(sampled_data[label]) < num_samples:
        sampled_data[label].append(data)

for label, data in sampled_data.items():
    print(f"Cluster {label}:")
    for item in data:
        print(item["desc"].replace("\n", ""))
    print()

Cluster 2:
my favorite is nude0:00 3x5 Stride3:54 Corpseretrievers7:55 Unclothed12:06 Strange Aquatic Creatures17:22 Everything That I Require21:09 Faustus Synthesizer 23:15 Considerer27:56 Building Comprised of Stiff Paper32:26 Puzzle Piece Arriving At Its Intended Location36:34 Magnetic Tape For Recording and Reproducing Visual Images and Soundthx @ooflespoofle3691 for the song names
Title: 'maybe we'll hug each other in a past life'Genre: avant-garde, psychedelic/indie, post-rock, weirdcoreRelease Date: 14 aug 2024Type: EPArtist/Project: three lices and a molly / rolly abore | Dream Radio FMBandcamp: https://threelicesandamolly.bandcamp....Tracklist:0:00 - Dream Radio Introduction0:16 - over the cotton hills.4:44 - i'm dead.8:00 - Commercial Break [distant reality ad] '  Hey Billy, don't you think we should eat the kids before the eggs? We have plenty of time to consume food anyway.   I think you're right, Daisy. Let's make a blockage inside their airway so they stop screaming first

In [16]:
def visualize_cluster(target_cluster, cluster_labels, reduced_vectors, sampled_data):
    # Create a boolean mask to select data points belonging to the target cluster
    mask = [label == target_cluster for label in cluster_labels]

    # Extract the reduced vectors for the target cluster and other data points
    cluster_vectors = [vector for vector, include in zip(reduced_vectors, mask) if include]
    other_vectors = [
        vector for vector, include in zip(reduced_vectors, mask) if not include
    ]

    # Create a list to store the data for the plot
    data = []

    # Add the trace for other clusters
    data.append(
        go.Scatter(
            x=[vector[0] for vector in other_vectors],
            y=[vector[1] for vector in other_vectors],
            mode="markers",
            marker=dict(color="gray", size=6, opacity=0.3),
            name="Other Clusters",
            hoverinfo="skip",  # Skip hover information for other clusters
        )
    )

    # Add the trace for the target cluster
    data.append(
        go.Scatter(
            x=[vector[0] for vector in cluster_vectors],
            y=[vector[1] for vector in cluster_vectors],
            mode="markers",
            marker=dict(color="blue", size=10),
            name=f"Cluster {target_cluster}",
            hoverinfo="skip",  # Skip hover information for the target cluster
        )
    )

    # Add traces for the samples in the target cluster
    for i, sample in enumerate(sampled_data[target_cluster]):
        data.append(
            go.Scatter(
                x=[cluster_vectors[i][0]],
                y=[cluster_vectors[i][1]],
                mode="markers",
                marker=dict(color=f"rgb({i*50}, {i*80}, {i*120})", size=12, symbol="star"),
                name=f"Sample {i+1}",
                hovertemplate="<b>Sample %{text}</b><br>" +
                "<br>".join([f"{key}: {value[:60]}{"..." if len(value) > 60 else ""}" if key != "embedded" else "" for key, value in sample.items()]),
                text=[f"{i+1}"],
            )
        )

    # Create the layout for the plot
    layout = go.Layout(
        title=f"Visualization of Cluster {target_cluster} relative to Other Clusters",
        xaxis=dict(title="Dimension 1"),
        yaxis=dict(title="Dimension 2"),
        hovermode="closest",
        width=1000,
        height=1000
    )

    # Create the figure
    fig = go.Figure(data=data, layout=layout)

    # Display the plot
    fig.show()

In [17]:
visualize_cluster(
    1,
    cluster_labels=cluster_labels,
    reduced_vectors=reduced_vectors,
    sampled_data=sampled_data,
)