In [1]:
!pip install scikit-learn
!pip install pyyaml
!pip install gensim
!pip install plotly
!pip install pandas
!pip install numpy

Collecting scikit-learn
  Downloading scikit_learn-1.6.1-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (18 kB)
Collecting numpy>=1.19.5 (from scikit-learn)
  Downloading numpy-2.2.2-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (62 kB)
Collecting scipy>=1.6.0 (from scikit-learn)
  Downloading scipy-1.15.1-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (61 kB)
Collecting joblib>=1.2.0 (from scikit-learn)
  Using cached joblib-1.4.2-py3-none-any.whl.metadata (5.4 kB)
Collecting threadpoolctl>=3.1.0 (from scikit-learn)
  Using cached threadpoolctl-3.5.0-py3-none-any.whl.metadata (13 kB)
Downloading scikit_learn-1.6.1-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (13.5 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m13.5/13.5 MB[0m [31m20.7 MB/s[0m eta [36m0:00:00[0ma [36m0:00:01[0m
[?25hUsing cached joblib-1.4.2-py3-none-any.whl (301 kB)
Downloading numpy-2.2.2-cp311-cp311-manylinux_2_1

In [1]:
# Let's just open all the .yaml files and copy the dictionary values to a new file.
import os
import glob
import yaml
import re
import numpy as np
import pandas as pd
import gensim.downloader as api
from sklearn.cluster import DBSCAN
from sklearn.metrics.pairwise import cosine_similarity
import plotly.express as px
from sklearn.preprocessing import normalize
from sklearn.decomposition import PCA
from sklearn.manifold import TSNE

In [2]:

# Step 1: Preprocessing - split names at lowercase -> uppercase boundaries
_LOWER_THEN_UPPER = re.compile(r'([a-z])([A-Z])')


def add_space_between_cases(text: str) -> str:
    """Insert a space wherever a lowercase ASCII letter is directly followed by an uppercase one.

    Example: ``"DrumsSnareUp"`` becomes ``"Drums Snare Up"``. A run of capitals
    is only separated from the lowercase letter in front of it
    (``"CymbalsOD"`` -> ``"Cymbals OD"``); text without such a boundary is
    returned unchanged.
    """
    return _LOWER_THEN_UPPER.sub(r'\1 \2', text)

# Step 2: Load pre-trained word vectors through gensim's downloader.
# The identifier below selects a small, 50-dimensional set, which keeps things
# fast; replace it with another identifier if a different trade-off is needed.
PRETRAINED_VECTORS_NAME = "glove-wiki-gigaword-50"
word2vec_model = api.load(PRETRAINED_VECTORS_NAME)  # 50-dimensional word vectors

# Step 3: Turn a name into a single vector by averaging its word vectors
def get_word2vec_vector(text: str, model) -> np.ndarray:
    """Return the mean word vector of the whitespace-separated tokens in ``text``.

    The text is lowercased and split on whitespace; tokens that are not in
    ``model`` are skipped. ``model`` must support ``token in model``,
    ``model[token]`` and expose ``vector_size``.

    Returns an array of length ``model.vector_size``. If none of the tokens is
    known to the model (or ``text`` is empty) the result is all zeros.
    """
    known_tokens = [token for token in text.lower().split() if token in model]

    # Accumulate into a zero-initialised buffer, one token at a time
    total = np.zeros((model.vector_size,))
    for token in known_tokens:
        total += model[token]

    if known_tokens:
        total /= len(known_tokens)  # sum -> mean
    return total



In [3]:
# Step 4: Read the list of names (one per line) that will be embedded
NAMES_FILE = "/home/soumya/cambridge-mt_scrapper/data/post-processing/unique_cleaned_trackname_metadata.txt"
with open(NAMES_FILE, "r") as f:
    # Strip surrounding whitespace (including the trailing newline) from every line
    names = [raw_line.strip() for raw_line in f]

# Preprocess: insert spaces at lowercase/uppercase boundaries
processed_names = list(map(add_space_between_cases, names))
print(processed_names[:15])



['Horn', 'Cymbals OD', 'Drums Snare Up', 'BASS CAB   CU  copperhead   BAE', 'Kick SFX', 'RIde', 'Saxophone Close Mic', 'Elec Piano SFX', 'Bass FX', 'Hi Hat Ride', 'Vocals Erica M', 'Cymbal Swells', 'Vocals Acoustic Guitar Proto', 'AC GUITAR', 'Backing Vox DT']


In [4]:

# Build one averaged word-vector embedding per preprocessed name
name_vectors = [get_word2vec_vector(name, word2vec_model) for name in processed_names]
embeddings = np.array(name_vectors)
print("Embeddings: ", embeddings)

# Step 5: Rescale the embeddings with sklearn's `normalize` (default settings).
# This can help DBSCAN, especially when cosine similarity is used.
embeddings_normalized = normalize(embeddings)


# fig.write_html("word2vec_clusters.html")


Embeddings:  [[-0.024018   -0.30223    -0.63186002 ... -0.23839    -0.51620001
  -0.29591   ]
 [-0.93992999 -0.52413101 -0.44327401 ... -0.26174551  0.22575501
  -0.32693002]
 [-0.44563799 -0.47118667  0.35877199 ... -0.64918666 -0.777698
  -0.38596332]
 ...
 [-0.42752665  0.10849666  0.04459867 ... -0.65335665 -0.58712767
  -0.24241332]
 [-0.26964    -0.48569    -0.21484999 ... -0.36733001 -0.1663
  -1.15509999]
 [-0.37262     0.40030668  0.98552    ... -0.50115333 -0.21622333
   0.03370667]]


In [5]:
# Step 6: Cluster the normalized embeddings with DBSCAN (cosine metric).
# eps and min_samples are the values to adjust if the clustering needs tuning.
dbscan_embedding_params = {"metric": 'cosine', "eps": 0.2, "min_samples": 2}
db = DBSCAN(**dbscan_embedding_params)
labels = db.fit_predict(embeddings_normalized)
print(labels)
# print("Labels: ", labels)


[-1  0  0 ...  0  0  0]


In [6]:

# Step 7: Dimensionality reduction (PCA or t-SNE).
# Both are used for comparison; this cell computes the 2-component PCA projection.
N_PCA_COMPONENTS = 2
pca = PCA(n_components=N_PCA_COMPONENTS)
pca_result = pca.fit_transform(embeddings_normalized)
print("PCA Result: ", pca_result)




PCA Result:  [[ 0.1005133  -0.2279326 ]
 [-0.52103845 -0.119463  ]
 [ 0.28447555 -0.35254506]
 ...
 [ 0.23201897 -0.32172467]
 [-0.11155117 -0.30218168]
 [ 0.04775788  0.25805419]]


In [18]:
!pip install ipykernel
!pip install nbformat
import nbformat
!pip install matplotlib




In [None]:
# Alternatively, use t-SNE for more complex relationships
import nbformat
import matplotlib.pyplot as plt

# t-SNE is stochastic: a fixed seed makes the layout identical on every
# re-run of the notebook.
TSNE_RANDOM_STATE = 42
tsne = TSNE(n_components=2, perplexity=2, random_state=TSNE_RANDOM_STATE)
tsne_result = tsne.fit_transform(embeddings_normalized)

# Plot the t-SNE projection; each point is colored by its entry in `labels`
tsne_fig, tsne_ax = plt.subplots(figsize=(10, 5))
tsne_points = tsne_ax.scatter(tsne_result[:, 0], tsne_result[:, 1], c=labels, cmap='viridis')
tsne_ax.set_title("t-SNE projection of name embeddings")
tsne_ax.set_xlabel("t-SNE 1")
tsne_ax.set_ylabel("t-SNE 2")
tsne_fig.colorbar(tsne_points, ax=tsne_ax, label="Cluster label")
plt.show()


In [9]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cluster import DBSCAN
from sklearn.metrics.pairwise import cosine_similarity

# Step 1: Represent every name as a TF-IDF vector (lowercasing enabled,
# English stop-word list applied).
# NOTE(review): the raw `names` are vectorized here, not `processed_names` -
# confirm that skipping the case-splitting step is intended.
tfidf_options = {"lowercase": True, "stop_words": 'english'}
vectorizer = TfidfVectorizer(**tfidf_options)
X = vectorizer.fit_transform(names)

In [11]:
# Inspect the TF-IDF matrix produced by `vectorizer.fit_transform(names)`
print("X: ", X)

X:    (0, 524)	1.0
  (1, 261)	1.0
  (2, 346)	1.0
  (3, 96)	0.3458261960662402
  (3, 178)	0.5676561527956507
  (3, 250)	0.33902990896343754
  (3, 242)	0.5284513599496383
  (3, 89)	0.40493038327991165
  (4, 550)	0.6268065447118077
  (4, 870)	0.7791749197108725
  (5, 826)	1.0
  (6, 856)	1.0
  (7, 405)	1.0
  (8, 115)	1.0
  (9, 520)	1.0
  (10, 1120)	0.5004546828819861
  (10, 413)	0.8657627333060085
  (11, 264)	1.0
  (12, 1120)	0.44215631897114405
  (12, 16)	0.5984664212871523
  (12, 476)	0.4108953977191229
  (12, 792)	0.5267833561521099
  (13, 476)	0.5783335121701128
  (13, 1)	0.8158004343593978
  (14, 75)	1.0
  :	:
  (1786, 785)	1.0
  (1787, 96)	0.5010157937088779
  (1787, 49)	0.5866430006851352
  (1787, 1025)	0.6362650109831622
  (1788, 832)	1.0
  (1789, 982)	1.0
  (1790, 550)	0.2693578806676052
  (1790, 1026)	0.40646674856893067
  (1790, 587)	0.40646674856893067
  (1790, 868)	0.4461000996554696
  (1790, 1077)	0.4461000996554696
  (1790, 790)	0.4461000996554696
  (1791, 96)	0.724229191678

In [18]:
# Step 2: Cluster the TF-IDF vectors with DBSCAN (cosine metric)
dbscan_tfidf_params = {"metric": 'cosine', "eps": 0.2, "min_samples": 3}
db = DBSCAN(**dbscan_tfidf_params)
labels = db.fit_predict(X)
print("Labels: ", labels)

Labels:  [-1 -1 -1 ... -1 -1  4]


In [19]:
# Step 3: Collect the names that share a cluster label.
# `labels[i]` is the cluster assigned to `names[i]`.
clusters = {}
for position, cluster_label in enumerate(labels):
    clusters.setdefault(cluster_label, []).append(names[position])

# Step 4: Show each cluster together with its member names
for cluster_id in clusters:
    print(f"Cluster {cluster_id}: {clusters[cluster_id]}")

Cluster -1: ['Horn', 'CymbalsOD', 'DrumsSnareUp', 'BASS CAB   CU  copperhead   BAE', 'Kick SFX', 'SaxophoneCloseMic', 'ElecPianoSFX', 'BassFX', 'HiHatRide', 'Vocals Erica M', 'CymbalSwells', 'BackingVoxDT', 'DRUM OH AR LANGEVIN.L', 'LESLIE BOTTOM M', 'ElecGtra Mic', 'DrumsTom', 'Saxophone', 'PercLoop', 'LeadVoxAccents', 'Vocals Bone M', 'SFXLoop', 'LeadVoxMic', 'Drumkit RoomMono', 'BassClarinet', 'AcGtrDI', 'ElecGtra Close', 'RoomMono', 'Bulldozer', 'BassHi', 'SampleBrass', 'DoubleBass', 'Hat SFX', 'ROOM STEREO L   C w TFUNK CK   Neve', 'GlassBell', 'Clavinet', 'MIDIStrings', 'GUITAR AMP    M', 'BackingVoxa', 'LeadVoxTapeEffect', 'ReversePiano', 'BackingVoxBr', 'DrumsSideR', 'FloorTom', 'OfficeAmbience', 'BEATBOX ALTO B AK SSLE', 'LeadVox AltTake', 'BASS   DI    API', 'Vocals BGV M', 'VoxSFX', 'PIANO AK LANGEVIN.L', 'UkeleleMic', 'Loops', 'ViolaSamples', 'VoxGuitar', 'VOX BG U', 'LeadVoxCha', 'LeadVoxClose', 'HornsDT', 'RoomLoFi', 'BassPedals', 'RunningAsiza', 'LeadVoxVs', 'NBATG   aco

In [23]:
from sklearn.cluster import AgglomerativeClustering
import scipy.cluster.hierarchy as sch

# Step 5: Agglomerative hierarchical clustering of the normalized embeddings.
# n_clusters=None means the number of clusters is not fixed in advance; a
# distance_threshold is supplied instead.
agg_clustering = AgglomerativeClustering(distance_threshold=0.5, n_clusters=None)
labels = agg_clustering.fit_predict(embeddings_normalized)
print("Labels: ", labels)

Labels:  [ 905   28  133 ... 1014  411  205]


In [25]:
# Ward linkage matrix of the normalized embeddings, plus a fixed depth value
Z = sch.linkage(embeddings_normalized, method='ward')
depth = 3

# Step 6: Collect the names that share a cluster label
# (`labels[i]` belongs to `names[i]`)
clusters = {}
for position, cluster_label in enumerate(labels):
    clusters.setdefault(cluster_label, []).append(names[position])
print("Clusters: ", clusters)

Clusters:  {905: ['Horn'], 28: ['CymbalsOD', 'CymbalsOD Overheads'], 133: ['DrumsSnareUp', 'DrumsSnareDown'], 162: ['BASS CAB   CU  copperhead   BAE', 'GUITAR   CU  copperhead   BAE'], 313: ['Kick SFX', 'KickSFX'], 567: ['RIde', 'Ride'], 215: ['SaxophoneCloseMic', 'SaxophoneFarMic', 'BassCloseMic'], 929: ['ElecPianoSFX'], 763: ['BassFX'], 93: ['HiHatRide', 'Loop HiHat'], 77: ['Vocals Erica M', 'Vocals Amy M', 'Vocals Leslie  Ela M'], 274: ['CymbalSwells', 'CymbalSwell'], 44: ['Vocals Acoustic Guitar Proto', 'Vocals Bass Proto', 'Vocals Keys Proto', 'Vocals Banjo Proto', 'Vocals Guitar Proto', 'Acoustic Guitar Proto'], 337: ['AC GUITAR', 'AcGuitar'], 1010: ['BackingVoxDT'], 354: ['DRUM OH AR LANGEVIN.L', 'DRUM OH AR LANGEVIN.R'], 243: ['LESLIE BOTTOM M', 'Keys Leslie Bottom M'], 443: ['ElecGtra Mic', 'ElecGtrb Mic', 'ElecGtrc Mic'], 1027: ['DrumsTom'], 228: ['Saxophone', 'BassClarinet', 'TromboneBass', 'BassTrombone'], 813: ['PercLoop'], 642: ['VOX AC GUITAR C GRACE', 'VOX AC GUITAR M G

In [None]:

# Step 6: Dendrogram of the Ward hierarchy, showing the hierarchical structure
plt.figure(figsize=(100, 30))
ward_linkage = sch.linkage(embeddings_normalized, method='ward')
sch.dendrogram(ward_linkage)
plt.title("Hierarchical Clustering Dendrogram")
plt.xlabel("Sample Index")
plt.ylabel("Distance")
plt.show()

# Step 7: Project the embeddings to two dimensions with PCA for visualization
from sklearn.decomposition import PCA

pca = PCA(n_components=2)
pca_result = pca.fit_transform(embeddings_normalized)

# Step 8: Interactive scatter plot of the clusters (Plotly).
# One row per name: its two PCA coordinates, cluster label and the name itself.
df = pd.DataFrame(pca_result, columns=["PC1", "PC2"]).assign(Cluster=labels, Name=names)

# Scatter colored by cluster, with each point labelled by its name
fig = px.scatter(
    df,
    x="PC1",
    y="PC2",
    color='Cluster',
    text='Name',
    title="Hierarchical Clustering of Names",
)
fig.update_traces(textposition='top center')

# Render the figure
fig.show()

In [26]:
# Print the complete `names` list for inspection
print(names)

['Horn', 'CymbalsOD', 'DrumsSnareUp', 'BASS CAB   CU  copperhead   BAE', 'Kick SFX', 'RIde', 'SaxophoneCloseMic', 'ElecPianoSFX', 'BassFX', 'HiHatRide', 'Vocals Erica M', 'CymbalSwells', 'Vocals Acoustic Guitar Proto', 'AC GUITAR', 'BackingVoxDT', 'DRUM OH AR LANGEVIN.L', 'LESLIE BOTTOM M', 'ElecGtra Mic', 'DrumsTom', 'Saxophone', 'PercLoop', 'VOX AC GUITAR C GRACE', 'LeadVoxAccents', 'Vocals Bone M', 'SFXLoop', 'LeadVoxMic', 'Drumkit RoomMono', 'BassClarinet', 'AcGtrDI', 'ElecGtra Close', 'RoomMono', 'Bulldozer', 'BassHi', 'SampleBrass', 'DoubleBass', 'Hat SFX', 'ROOM STEREO L   C w TFUNK CK   Neve', 'GlassBell', 'Clavinet', 'MIDIStrings', 'DRUM Snare Top M SH', 'GUITAR AMP    M', 'BackingVoxa', 'LeadVoxTapeEffect', 'ReversePiano', 'BackingVoxBr', 'DrumsSideR', 'FloorTom', 'OfficeAmbience', 'BEATBOX ALTO B AK SSLE', 'LeadVox AltTake', 'BASS   DI    API', 'Vocals BGV M', 'VoxSFX', 'PIANO AK LANGEVIN.L', 'UkeleleMic', 'Loops', 'ViolaSamples', 'VoxGuitar', 'VOX BG U', 'LeadVoxCha', 'Lead

In [57]:
# Keyword-based grouping of names into instrument families.
#
# Rules are evaluated top to bottom and the FIRST rule with a keyword that
# occurs as a substring of the lowercased name wins, so the order matters
# (e.g. "hihat" has to stay ahead of the "hi" / "hat" percussion keywords).
# NOTE(review): matching is by plain substring, so short keywords also hit
# inside longer words (e.g. "tom" occurs in "bottom") - confirm this is acceptable.
INSTRUMENT_RULES = [
    ("kick", ["kick"]),
    ("snare", ["snare"]),
    ("hihat", ["hihat"]),
    ("tom", ["tom"]),
    ("cymbal", ["cymbal"]),
    ("percussion", [
        "perc", "shaker", "kalimba", "clap", "snap", "crash", "hi", "hat",
        "djembe", "cowbell", "conga", "glockenspiel", "timpani", "congo",
        "ride", "beat", "overhead", "beatbox", "tambourine", "triangle",
        "maraca", "bongo", "cabasa", "guiro", "woodblock", "clave",
        "castanet", "agogo", "whistle", "bell", "chime", "gong",
    ]),
    ("drum", ["drum"]),
    ("bass", ["bass"]),
    ("synth", ["synth"]),
    ("keys", ["piano", "keys", "clavinet", "rhodes", "accordion"]),
    ("room", ["room"]),
    ("organ", ["organ"]),
    ("brass", ["brass", "trumpet", "trombone", "horn", "bagpipe", "tuba", "euphonium"]),
    ("woodwind", ["woodwind", "flute", "clarinet", "oboe", "saxophone", "sax", "bassoon", "alto", "soprano"]),
    ("vocal", ["vocal", "choir", "vox", "backing"]),
    ("string", ["string", "violin", "cello", "viola"]),
    ("fx", ["fx"]),
    # "ukelele" and "banjo" are separate keywords; a missing comma previously
    # fused them into the single, never-matching string "ukelelebanjo".
    ("guitar", ["guitar", "gtr", "ukulele", "ukelele", "banjo", "fiddle", "mandolin"]),
]


def classify_instrument(name):
    """Return the instrument group for ``name``.

    The name is lowercased and checked against ``INSTRUMENT_RULES`` in order;
    the group of the first rule with a matching keyword is returned, or
    ``"other"`` when no rule matches.
    """
    lowered = name.lower()
    for group, keywords in INSTRUMENT_RULES:
        if any(keyword in lowered for keyword in keywords):
            return group
    return "other"


# Lowercase all names and sort them into instrument groups
instrument_groups = {}
for idx, name in enumerate(names):
    name = name.lower()
    # `names` is lowercased in place, as before. The keyword tests now use the
    # lowercased value too, so a first run on mixed-case names classifies
    # correctly instead of only after the cell has been run twice.
    names[idx] = name
    instrument_groups.setdefault(classify_instrument(name), []).append(name)

# Print instrument groups and their counts
for key, value in instrument_groups.items():
    print(key, len(value))
    print(value)
    print("\n\n")


brass 58
['horn', 'samplebrass', 'hornsdt', 'trumpet', 'trumpetsamples', 'horns sax flute m', 'horns bone ela m', 'bagpipedrones', 'trumpet', 'horns trumpet ela m', 'horn trumpet ela m', 'miditrombone', 'frenchhorn', 'horns sax  ak', 'brassandreeds', 'horns bone proto', 'frenchhornsamples', 'trombones', 'flugelhorns', 'horns trumpet msh', 'trombone', 'tubas', 'brass', 'horns sax m', 'miditrombones', 'vocals bg trumpet m', 'horns bari m', 'trumpets', 'trombone a   ar    neve', 'trombone m api', 'miditrumpet', 'horns bone m', 'brassstaccatolo', 'horns bone wet m', 'brasscrescendo', 'horns tenor ar', 'horns sousaphone m', 'horns trumpet ela m', 'horns trumpet  ela m', 'vocals trumpet m', 'trumpet a   ar    neve', 'horns bone ar', 'frenchhorns', 'horns bone ela m', 'sd vox trumpet m', 'trombone b   ar    neve', 'horn tenor m', 'horns bari ar', 'tuba', 'bagpipechanter', 'trumpet b   ar    neve', 'horns sax proto', 'trumpet m api', 'trombone', 'horns tenor m', 'horns trumpet m', 'brassdrop',