In [None]:
import pandas as pd
import numpy as np
import plotly.express as px
from sklearn.cluster import KMeans
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.decomposition import PCA # Import PCA

def cluster_and_plot_latent(df, numeric_cols, categorical_cols, 
                            true_label_col='main_genre',
                            n_clusters=20, random_state=42):
    """
    Cluster rows with K-Means and plot them on two PCA components.

    The selected columns are preprocessed (numeric columns standardised,
    categorical columns one-hot encoded), K-Means is fitted on the
    preprocessed matrix, and the same matrix is projected onto two
    principal components that serve as the scatter plot axes. The figure
    is displayed via ``fig.show()`` as a side effect.

    Parameters
    ----------
    df : pandas.DataFrame
        Input data; must contain every column in ``numeric_cols``,
        ``categorical_cols`` and ``true_label_col``.
    numeric_cols : sequence of str
        Columns passed through ``StandardScaler``.
    categorical_cols : sequence of str
        Columns passed through ``OneHotEncoder``; may be empty.
    true_label_col : str, default 'main_genre'
        Column copied into the result as 'True Genre' (shown on hover only;
        it is not used for fitting).
    n_clusters : int, default 20
        Number of K-Means clusters.
    random_state : int, default 42
        Seed forwarded to ``KMeans``.

    Returns
    -------
    tuple
        ``(plot_df, kmeans, pca)``: the plotting frame (feature columns,
        'True Genre', 'Cluster' as strings, 'Latent Feature 1/2', plus any
        hover columns copied from ``df``), the fitted ``KMeans`` and the
        fitted ``PCA``.
    """
    # Coerce to lists so tuples / pandas Index objects concatenate as expected
    # (Index + Index would concatenate element-wise instead of appending).
    numeric_cols = list(numeric_cols)
    categorical_cols = list(categorical_cols)

    # 1. Prepare Data
    features = numeric_cols + categorical_cols
    X = df[features]
    true_labels = df[true_label_col]

    preprocessor = ColumnTransformer(
        transformers=[
            ('num', StandardScaler(), numeric_cols),
            ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_cols)
        ],
        # Always return a dense array: one-hot output is sparse and PCA does
        # not accept sparse input on every scikit-learn version.
        # NOTE: very high-cardinality categoricals will cost memory here.
        sparse_threshold=0,
    )

    X_processed = preprocessor.fit_transform(X)

    # 2. Clustering (fitted on the full preprocessed matrix, not on the
    #    2-D projection used for plotting)
    kmeans = KMeans(n_clusters=n_clusters, random_state=random_state)
    cluster_labels = kmeans.fit_predict(X_processed)

    # Dimensionality Reduction (PCA)
    # Compress the processed data into 2 latent features
    pca = PCA(n_components=2)
    latent_components = pca.fit_transform(X_processed)

    # Percentage of total variance captured by the two components
    explained_variance = pca.explained_variance_ratio_.sum() * 100

    # 3. Setup Plot Data
    # Every piece is re-indexed 0..n-1 so the positional arrays from
    # scikit-learn line up with the pandas columns.
    plot_df = X.reset_index(drop=True)
    plot_df['True Genre'] = true_labels.reset_index(drop=True)
    plot_df['Cluster'] = cluster_labels.astype(str)

    # Add the new latent features to the dataframe
    plot_df['Latent Feature 1'] = latent_components[:, 0]
    plot_df['Latent Feature 2'] = latent_components[:, 1]

    # Hover columns: copy any that are not already present from df, then keep
    # only those that actually exist so px.scatter does not raise when a
    # column (e.g. 'score') is neither a feature nor in df.
    hover_candidates = ['True Genre', 'reviewer_reviews', 'score', 'log_length']
    for col in hover_candidates:
        if col not in plot_df.columns and col in df.columns:
            plot_df[col] = df[col].reset_index(drop=True)
    hover_cols = [col for col in hover_candidates if col in plot_df.columns]

    # 4. Plot using Latent Features as axes
    fig = px.scatter(
        plot_df, 
        x='Latent Feature 1', 
        y='Latent Feature 2', 
        color='Cluster',
        title=f'K-Means Clusters on Latent Features (PCA) - {explained_variance:.1f}% Variance Captured',
        labels={'Cluster': 'Cluster Label'},
        hover_data=hover_cols
    )

    fig.update_traces(marker=dict(size=8, line=dict(width=1, color='DarkSlateGrey')))
    fig.show()

    return plot_df, kmeans, pca

In [None]:
# Features fed to the clustering pipeline; no categorical features are used.
numeric_features = ['score', 'log_length', 'log_followers_count']
categorical_features = []

raw_df = pd.read_csv('data/clean/Cleaned_Data.csv')

# Build the modelling frame in one expression so the cell is idempotent:
#  - drop rows with a missing value in any column other than 'label'
#  - add log-transformed copies of the two count-like columns;
#    np.log1p(x) computes log(1 + x) and stays accurate for small x.
df = (
    raw_df
    .dropna(subset=raw_df.columns.drop('label'))
    .assign(
        log_followers_count=lambda d: np.log1p(d['followers_count']),
        log_length=lambda d: np.log1p(d['length']),
    )
)

# Displays the figure and returns (plot_df, kmeans, pca) as the cell output.
cluster_and_plot_latent(df, numeric_features, categorical_features)

(      score  log_length  log_followers_count  True Genre Cluster  \
 0       7.2    6.747587            12.800691        Rock       7   
 1       5.8    6.298949            11.499840  Electronic      12   
 2       8.0    6.561031            13.213545         Rap      17   
 3       7.5    6.274762            13.213545         Rap      17   
 4       7.4    6.652863            13.213545         Rap       7   
 ...     ...         ...                  ...         ...     ...   
 8786    7.7    6.748760             9.449357  Electronic       0   
 8787    7.9    6.646391             9.449357  Electronic       0   
 8788    7.7    6.298949             9.449357  Electronic       6   
 8789    7.8    6.504288            11.150333       Metal      19   
 8790    9.0    7.531016            15.411139        Rock      13   
 
       Latent Feature 1  Latent Feature 2  reviewer_reviews  
 0             0.381764          0.425580                61  
 1            -1.555564          0.985432     