## Setting up SVM

In [2]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC
from sklearn.metrics import classification_report, accuracy_score
import numpy as np
import matplotlib.pyplot as plt

In [7]:
labelPath = '../data/acousticbrainz-mediaeval_labels_part_aa'
dataPath = '../data/acousticbrainz-mediaeval-train/00/00eb9c5f-2fbe-4105-9ab3-3d83cf1e52e9.json'

label = pd.read_csv(labelPath, delimiter='\t')
data = pd.read_csv(dataPath, delimiter='\t')

  label = pd.read_csv(labelPath, delimiter='\t')


In [25]:
def get_value(d, keys):
    try:
        for key in keys:
            d = d[key]
        return d
    except (KeyError, TypeError):
        return np.nan

In [49]:
import os
import json
from glob import glob

# Path to your folder with JSON files
folder_path = '../data/acousticbrainz-mediaeval-train/00'

# Use glob to find all .json files
json_files = glob(os.path.join(folder_path, '*.json'))

# Read and store all JSON data
all_data = []

# List to hold each row of data
feature_rows = []

for file_path in json_files:
    with open(file_path, 'r') as file:
        data = json.load(file)
        # print(data)
        all_data.append(data)
    
    entry = {}

    entry['id'] = get_value(data, ['metadata', 'tags', 'musicbrainz_recordingid'])[0]

    # --- RHYTHM ---
    entry["bpm"] = get_value(data, ["rhythm", "bpm"])
    entry["beats_count"] = get_value(data, ["rhythm", "beats_count"])
    entry["danceability"] = get_value(data, ["rhythm", "danceability"])
    entry["onset_rate"] = get_value(data, ["rhythm", "onset_rate"])
    entry["bpm_first_peak"] = get_value(data, ["rhythm", "bpm_histogram_first_peak_bpm", "mean"])
    entry["bpm_second_peak"] = get_value(data, ["rhythm", "bpm_histogram_second_peak_bpm", "mean"])
    entry["bpm_first_peak_weight"] = get_value(data, ["rhythm", "bpm_histogram_first_peak_weight", "mean"])

    # --- TONAL / HARMONY ---
    entry["key_key"] = get_value(data, ["tonal", "key_key"])
    entry["key_scale"] = get_value(data, ["tonal", "key_scale"])
    entry["key_strength"] = get_value(data, ["tonal", "key_strength"])
    entry["chords_key"] = get_value(data, ["tonal", "chords_key"])
    entry["chords_scale"] = get_value(data, ["tonal", "chords_scale"])
    entry["chords_strength"] = get_value(data, ["tonal", "chords_strength"])
    entry["chords_changes_rate"] = get_value(data, ["tonal", "chords_changes_rate"])
    entry["hpcp_entropy"] = get_value(data, ["tonal", "hpcp_entropy"])

    # Get mean and var of HPCP vector
    hpcp = get_value(data, ["tonal", "hpcp"])
    if isinstance(hpcp, dict):
        entry["hpcp_mean"] = np.mean(hpcp["mean"]) if "mean" in hpcp else np.nan
        entry["hpcp_var"] = np.mean(hpcp["var"]) if "var" in hpcp else np.nan

    # --- LOWLEVEL / TIMBRE ---
    for feature in ["mfcc", "gfcc"]:
        coeffs = get_value(data, ["lowlevel", feature, "mean"])
        if isinstance(coeffs, dict):
            entry[f"{feature}_mean"] = np.mean(coeffs["mean"]) if "mean" in coeffs else np.nan
            entry[f"{feature}_var"] = np.mean(coeffs["var"]) if "var" in coeffs else np.nan

    # Spectral features
    spectral_features = [
        "spectral_centroid", "spectral_rolloff", "spectral_flux",
        "spectral_entropy", "zerocrossingrate"
    ]
    for feat in spectral_features:
        entry[feat] = get_value(data, ["lowlevel", feat, "mean"])

    entry['dynamic_complexity'] = get_value(data, ["lowlevel", "dynamic_complexity"])

    # Optional: average loudness, tuning info
    entry["average_loudness"] = get_value(data, ["lowlevel", "average_loudness"])
    entry["tuning_frequency"] = get_value(data, ["tonal", "tuning_frequency"])
    entry["tuning_equal_tempered_deviation"] = get_value(data, ["tonal", "tuning_equal_tempered_deviation"])

    # print(entry.keys())
    feature_rows.append(entry)

# Convert to DataFrame
df = pd.DataFrame(feature_rows)

# Preview
print(df.head())
# print(df['bpm_first_peak'][:10])

# Now all_data is a list of all the JSON contents
# print(all_data[:1][0].keys())
# print(all_data[:1][0]['tonal'].keys())
# print(all_data[:1][0]['metadata'].keys())
# print(all_data[:1][0]['rhythm'].keys())
# print(all_data[:1][0]['lowlevel'].keys())

                                     id         bpm  beats_count  \
0  00f33a4f-f080-44f1-8395-e11234047823  108.785461          429   
1  00f50cf5-585c-474d-8111-95b764953104   91.341766          245   
2  00ec41c0-b7ba-49d0-b55e-b5256eb7aa48  172.265350          607   
3  00f8978c-3fd2-4414-a96f-56327e1d035a  126.320808          365   
4  00ffd6b2-b809-43d0-b9e3-b0de014bb3b8  119.136162          570   

   danceability  onset_rate  bpm_first_peak  bpm_second_peak  \
0      0.910391    2.978464             108              115   
1      0.984062    3.386376              92              105   
2      1.063926    4.013623             172              167   
3      1.297712    3.767412             126              133   
4      1.182275    3.570038             120              115   

   bpm_first_peak_weight key_key key_scale  ...  hpcp_var spectral_centroid  \
0               0.072430       A     minor  ...  0.050783        890.708740   
1               0.729508       G     major  ... 

# the column names in the data file

In [9]:
print(data.keys())
print(label.keys())

print(data.head())

Index(['{"tonal": {"thpcp": [1, 0.629603326321, 0.383067965508, 0.316950440407, 0.343345165253, 0.519540309906, 0.672314703465, 0.549730956554, 0.373753219843, 0.321292728186, 0.375348091125, 0.621568977833, 0.77174693346, 0.556295514107, 0.429237008095, 0.436566978693, 0.304708749056, 0.19539925456, 0.173713847995, 0.219884097576, 0.396218538284, 0.50098156929, 0.449931651354, 0.421544730663, 0.3481336236, 0.345291048288, 0.702768564224, 0.954897522926, 0.666923820972, 0.363381117582, 0.378183037043, 0.519108712673, 0.602247774601, 0.583307504654, 0.618141174316, 0.934442341328], "hpcp": {"dmean2": [0.281801402569, 0.208695709705, 0.183336406946, 0.194276601076, 0.212391108274, 0.227089762688, 0.208987876773, 0.186438143253, 0.185076087713, 0.187985137105, 0.216272875667, 0.228151544929, 0.19593551755, 0.154567614198, 0.144274502993, 0.124987594783, 0.100733600557, 0.095040358603, 0.117083542049, 0.15368963778, 0.175540432334, 0.191182911396, 0.214235395193, 0.200715601444, 0.18380732

In [None]:
# print(label.head())
data['is_rnb'] = 0
for track in label:
    # check if genre contains 'rnb'
    for i, song in enumerate(label[track]):
        print(i,song)
        print(label[track])
        if 'rnb' in label[track][i].values:
            data['is_rnb'] = 1
    print(track)
    print(label[track])
    break

0 6eafad9e-3e4e-4af7-ad2c-dba94cfedecf
0         6eafad9e-3e4e-4af7-ad2c-dba94cfedecf
1         8eed6eed-24e7-4ac9-98dd-2e20502c1b13
2         a05966c4-aaf8-4508-ba23-3193b5067fc0
3         173dc6b4-5f76-4ec7-b86c-10f864187435
4         1a00a335-fead-46ec-8d4f-06e8341291ea
                          ...                 
459994    a3fafc72-fea1-4a4c-9619-fef1a0c43182
459995    3b3341c9-7d97-4470-b6cd-ab24e9edd619
459996    d0a24f69-51a2-48a1-8be6-c5f6ef3ad348
459997    32da73e1-8fed-44a4-91ec-d57ea5e6bb36
459998    73f65bf0-ecd7-4ec5-9441-24f9540ba2f6
Name: recordingmbid, Length: 459999, dtype: object


AttributeError: 'str' object has no attribute 'values'

In [None]:
# depending on how the data is set up we will need to figure out how to classify the genre
# keeping it general for now
data['is_rnb'] = data['genre'].apply(lambda x: 1 if x == 'R&B' else 0)

# once we decide which columns we want to drop from our features
dropCols = ['genre', 'is_rnb']
X = data.drop(columns=dropCols)

# what we want to predict
y = data['is_rnb']

# train-test split of 80% training, 20% testing
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Standardize the features (important for SVM)
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

# there are many kernal options, we can choose from 
# {‘linear’, ‘poly’, ‘rbf’, ‘sigmoid’, ‘precomputed’} or callable
svm_model = SVC(kernel='linear') 

# fit the model and predict
svm_model.fit(X_train, y_train)
y_pred = svm_model.predict(X_test)

# output results
print(f"Accuracy: {accuracy_score(y_test, y_pred)}")
print("Classification Report:")
print(classification_report(y_test, y_pred))

KeyError: 'genre'

# functions to plot the trained model with X and y

In [None]:
# this graphing of the model comes from Alex Thomo's SENG 474 class

def plot_points(features, labels):
    X = np.array(features)
    y = np.array(labels)
    spam = X[np.argwhere(y==1)]
    ham = X[np.argwhere(y==0)]
    plt.scatter([s[0][0] for s in spam],
                   [s[0][1] for s in spam],
                   s = 35,
                   color = 'cyan',
                   edgecolor = 'k',
                   marker = '^')
    plt.scatter([s[0][0] for s in ham],
                   [s[0][1] for s in ham],
                   s = 25,
                   color = 'red',
                   edgecolor = 'k',
                   marker = 's')
    plt.xlabel('x_1')
    plt.ylabel('x_2')
    plt.legend(['label 1','label 0'])

def plot_model(X, y, model):
    X = np.array(X)
    y = np.array(y)
    plot_step = 0.01
    x_min, x_max = X[:, 0].min() - 1, X[:, 0].max() + 1
    y_min, y_max = X[:, 1].min() - 1, X[:, 1].max() + 1
    xx, yy = np.meshgrid(np.arange(x_min, x_max, plot_step),
                         np.arange(y_min, y_max, plot_step))
    Z = model.predict(np.c_[xx.ravel(), yy.ravel()])
    Z = Z.reshape(xx.shape)
    plt.contour(xx, yy, Z,colors = 'k',linewidths = 3)
    plot_points(X, y)
    plt.contourf(xx, yy, Z, colors=['red', 'blue'], alpha=0.2, levels=range(-1,2))
    plt.show()