In [1]:
import pandas as pd

In [2]:
# Source table with the text-embedding columns used by the cells below.
# NOTE(review): this is an absolute, machine-specific path; it must be adjusted
# before the notebook can run anywhere else.
similarity_csv_path = r'/Users/aliciayang/Downloads/music data - musical similarity.csv'
df = pd.read_csv(similarity_csv_path)

In [3]:
# Per text-cluster summary: number of non-null `text-x` values and the centroid
# (mean `text-x`, mean `text-y`) of each cluster.
centroid_aggs = {
    'count': ('text-x', 'count'),
    'mean_x': ('text-x', 'mean'),
    'mean_y': ('text-y', 'mean'),
}
rows_by_text_cluster = df.groupby('text-clusters')
cluster_centroids = rows_by_text_cluster.agg(**centroid_aggs).reset_index()
print("Cluster Counts and Centroids:\n", cluster_centroids, "\n")

Cluster Counts and Centroids:
    text-clusters  count     mean_x     mean_y
0            0.0     13  -1.591165  -3.843169
1            1.0     42   8.436730  10.678971
2            2.0     35   3.678081 -16.761030
3            3.0     16  -0.627809  12.212778
4            4.0     42 -11.110566   5.713136 



In [5]:
# Per text-cluster spread: standard deviation of each coordinate.
spread_aggs = {
    'std_x': ('text-x', 'std'),
    'std_y': ('text-y', 'std'),
}
rows_by_text_cluster = df.groupby('text-clusters')
cluster_spreads = rows_by_text_cluster.agg(**spread_aggs).reset_index()
print("Coordinate Spread (Standard Deviations):\n", cluster_spreads)


Coordinate Spread (Standard Deviations):
    text-clusters      std_x      std_y
0            0.0  23.305238  10.223643
1            1.0   9.432079  12.603261
2            2.0  12.852867  11.571277
3            3.0  22.849989  20.717936
4            4.0  14.672001  20.271497


In [16]:
def categorize(sim):
    """Map a raw `Similarity` value to one of four category labels.

    Returns 'Artist', 'Genre' or 'Genre-Artist' when the value matches one of
    those labels, and 'Sampling' for everything else (including missing /
    non-string values such as NaN).

    Matching ignores surrounding whitespace and treats an ASCII hyphen, an
    en dash and an em dash (with or without spaces around them) as the same
    separator. Without this, spellings such as 'Genre-Artist' -- the label this
    function itself returns -- or ' Artist' fell through to 'Sampling'.

    Parameters
    ----------
    sim : object
        Raw cell value; normally a string, but any object is accepted.

    Returns
    -------
    str
        One of 'Artist', 'Genre', 'Genre-Artist', 'Sampling'.
    """
    if not isinstance(sim, str):
        # Missing values (NaN / None) are not strings; keep the original
        # catch-all behaviour for them.
        return 'Sampling'

    label = sim.strip()
    # Fold en/em dashes into a plain hyphen, then drop spaces around hyphens so
    # 'Genre - Artist', 'Genre–Artist' and 'Genre-Artist' all compare equal.
    for dash in ('–', '—'):
        label = label.replace(dash, '-')
    label = '-'.join(part.strip() for part in label.split('-'))

    if label in ('Artist', 'Genre', 'Genre-Artist'):
        return label
    return 'Sampling'

In [17]:
# Derive the coarse category label for every row from its raw `Similarity` value.
similarity_labels = df['Similarity']
df['category'] = similarity_labels.map(categorize)

In [18]:
# Count how many similar-song IDs each row lists: the cell is a comma-separated
# string, and only all-digit tokens are counted.
similar_ids_raw = df['Similar Songs (ID)'].fillna('')   # missing cell -> empty string
similar_id_tokens = similar_ids_raw.str.split(',')       # one list of substrings per row
df['num_similar'] = similar_id_tokens.apply(
    lambda tokens: len([token for token in tokens if token.strip().isdigit()])
)

In [19]:
# Per category: how many songs it has and the average number of similar songs
# listed per song.
category_aggs = {
    'num_songs': ('ID', 'count'),
    'avg_similar': ('num_similar', 'mean'),
}
rows_by_category = df.groupby('category')
summary = rows_by_category.agg(**category_aggs).reset_index()

print(summary)

       category  num_songs  avg_similar
0        Artist         57     2.052632
1         Genre         50     1.160000
2  Genre-Artist          1     1.000000
3      Sampling         43     1.232558


# Getting audio embedding coordinates

In [None]:
# No install step is needed here: `ace_tools` is not imported by any cell shown
# in this notebook (the cells below use only re, base64, numpy and pandas), and
# this cell was never executed.
# NOTE(review): if a cell outside this view does need the package, install it
# from a dedicated cell at the top of the notebook with a pinned version via
# the `%pip` magic, rather than a bare, unpinned `pip install` mid-analysis.

In [21]:
import re
import base64
import numpy as np
import pandas as pd

In [24]:
# Read the saved cluster plot page as text; undecodable bytes are dropped
# (errors='ignore') rather than raising.
# NOTE(review): absolute, machine-specific path.
cluster_plot_path = '/Users/aliciayang/Downloads/cluster_plot.html'
with open(cluster_plot_path, mode='r', encoding='utf-8', errors='ignore') as plot_file:
    html = plot_file.read()

In [25]:
# Collect the body of every inline JavaScript <script> tag and keep the last
# one, which is the script the following cells parse for trace data.
inline_script_pattern = re.compile(r'<script type="text/javascript">(.*?)</script>', re.DOTALL)
scripts = inline_script_pattern.findall(html)
script_fig = scripts[len(scripts) - 1] if scripts else scripts[-1]

In [26]:
# Pull out, in document order, every "name" value and every base64-packed
# float32 ("f4") x / y payload found in the figure script.
name_pattern = re.compile(r'"name":"([^"]+)"')
x_payload_pattern = re.compile(r'"x":\{"dtype":"f4","bdata":"([^"]+)"\}')
y_payload_pattern = re.compile(r'"y":\{"dtype":"f4","bdata":"([^"]+)"\}')

cluster_names = name_pattern.findall(script_fig)
x_b64 = x_payload_pattern.findall(script_fig)
y_b64 = y_payload_pattern.findall(script_fig)

In [28]:

def decode_js_base64(s: str) -> bytes:
    """Decode a base64 payload that was embedded in a JavaScript/JSON string.

    The text is cleaned up before decoding:

    1. ``\\uXXXX`` escapes of ASCII characters are turned back into the
       character itself (e.g. ``\\u002F`` -> ``/``, ``\\u002B`` -> ``+``,
       ``\\u003D`` -> ``=``), in either hex-digit case. Handling every ASCII
       escape matters because the lenient decoder below would otherwise drop
       the backslash and treat the remaining ``uXXXX`` as base64 data.
    2. All whitespace (including newlines) is removed.
    3. ``=`` padding is appended so the length is a multiple of four.

    Parameters
    ----------
    s : str
        Base64 text as it appears inside the script source.

    Returns
    -------
    bytes
        The decoded raw bytes.

    Raises
    ------
    binascii.Error
        If the cleaned-up text is still not validly padded base64.
    """
    def _unescape(match):
        code_point = int(match.group(1), 16)
        # Base64 text is pure ASCII; leave any non-ASCII escape untouched.
        return chr(code_point) if code_point < 0x80 else match.group(0)

    # undoing js unicode escapes (/, +, = and any other ASCII character)
    s = re.sub(r'\\u([0-9a-fA-F]{4})', _unescape, s)
    # stripping whitespace and newlines
    s = re.sub(r'\s+', '', s)
    # padding to mult of 4
    s += '=' * ((-len(s)) % 4)
    # decoding
    return base64.b64decode(s)

# Build one row per plotted point: for every (name, x payload, y payload)
# triple, decode both base64 buffers into 32-bit floats and pair them up.
# zip() stops at the shortest input, so surplus names / payloads / coordinates
# are silently ignored.
#
# '<f4' pins the byte order instead of relying on the host's native order
# (np.float32). Assumes the payload is little-endian -- TODO confirm against
# the tool that wrote the HTML.
rows = []
for name, xb, yb in zip(cluster_names, x_b64, y_b64):
    xs = np.frombuffer(decode_js_base64(xb), dtype='<f4')
    ys = np.frombuffer(decode_js_base64(yb), dtype='<f4')
    rows.extend(
        {'cluster': name, 'x': float(x_val), 'y': float(y_val)}
        for x_val, y_val in zip(xs, ys)
    )

# NOTE: this rebinds `df`; the DataFrame loaded from the CSV at the top of the
# notebook is no longer reachable under that name after this cell runs.
df = pd.DataFrame(rows)
print(df.head())

  cluster         x         y
0       2  1.635875 -2.488665
1       2 -1.201906 -2.737567
2       2  5.040506 -3.950674
3       2  5.308413 -2.543144
4       2 -0.663470 -4.130695


In [31]:
# Per-cluster size, centroid and coordinate spread of the decoded points.
# A cluster with a single point has no sample standard deviation, so its
# std_x / std_y come out as NaN.
stat_aggs = {
    'count': ('x', 'size'),
    'mean_x': ('x', 'mean'),
    'mean_y': ('y', 'mean'),
    'std_x': ('x', 'std'),
    'std_y': ('y', 'std'),
}
points_by_cluster = df.groupby('cluster')
stats = points_by_cluster.agg(**stat_aggs).reset_index()

In [33]:
# Render the per-cluster table without the index, floats at 4 decimal places.
stats_table = stats.to_string(index=False, float_format="%.4f")
print("cluster quantitative analysis", stats_table, sep="\n")

cluster quantitative analysis
cluster  count  mean_x  mean_y  std_x  std_y
      0     36 -3.8776  2.6695 2.6682 3.0375
      1     57  0.5133  2.2351 2.6326 2.4618
      2     31  1.6259 -3.1398 2.6269 1.3155
      3      1 -6.5964 -2.0742    NaN    NaN
      4     25  5.0885  6.9797 1.1706 1.6565
