# Music genre classification with naive Bayes


In [1]:
import math
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from scipy.stats import norm
from sklearn import metrics

Below, we have helper functions for the required functions. 

In [2]:
# Calculates the probability density for a gaussian function
def gaussian(x, mu, sig):
    return (1/(sig*np.sqrt(2*math.pi))) * np.exp(-np.power(x-mu, 2) / (2*np.power(sig, 2)))

In [3]:
# Calculates the prior probability for all labels, 
# modified code form tutorial 3
def calc_prior(data):
    prior_p = {} # dictionary of prior probabilities

    labels = data.values[:, -1]
    n = len(labels)
    unique_labels, counts = np.unique(labels, return_counts=True)

    for i in range(len(unique_labels)):
        prior_p[unique_labels[i]] = (counts[i] / n)

    return prior_p

In [4]:
# Calculates the mean and standard deviation for each attribute given its label
def calc_distributions(data):
    avg_vals = data.groupby(['label']).mean()
    std_vals = data.groupby(['label']).std()

    return (avg_vals, std_vals)

In [5]:
# Calculates the posterior probabilities for all items in the data set
# modified from tutorial 3
def calc_posterior(data, prior_p, distributions):
    epsilon = 1e-8 # Value used instead of 0 when finding logs
    posterior_probs = []
    features_list = data.columns 

    for _, instance in data.iterrows():
        post_p = {}

        for label in prior_p:
            post_p[label] = np.log(prior_p[label])
            
            for feature in features_list:
                x = instance[feature]
                mu = distributions[0][feature][label]
                sig = distributions[1][feature][label]

                prob = gaussian(x, mu, sig)
                if(prob == 0): prob = epsilon

                post_p[label] += np.log(prob)
        
        posterior_probs.append(post_p)

    return posterior_probs

In [6]:
def find_tp_tn_fp_fn(actual, predicted):
    tp = 0 # true positive
    tn = 0 # true negative
    fp = 0 # false positive
    fn = 0 # false negative

    for i in range(len(actual)):
        if actual[i] == 'classical' and predicted[i] == 'classical':
            tp += 1
        elif actual[i] == 'pop' and predicted[i] == 'classical':
            fp += 1
        elif actual[i] == 'pop' and predicted[i] == 'pop':
            tn += 1
        elif actual[i] == 'classical' and predicted[i] == 'pop':
            fn += 1

    return tp, tn, fp, fn

The required functions:

In [7]:
# Prepares the data by reading it from a file and converting it into a useful format for training and testing
def preprocess(file):
    out = pd.read_csv(file)

    return out.iloc[: , 1:]

In [8]:
# This function should calculate prior probabilities and likelihoods from the training data and using
# them to build a naive Bayes model

def train(data):
    # Calculate prior probabilities
    prior_p = calc_prior(data)

    # Calculate distributions
    distributions = calc_distributions(data)

    return prior_p, distributions

In [9]:
# This function should predict classes for new items in a test dataset, 
# modified from tutorial 3

def predict(data, prior_p, distributions):
    # calculate posterior probabilities
    posterior_p = calc_posterior(data, prior_p, distributions)
    
    # find argmax for each instance
    argmax_labels = []
    for post_p in posterior_p:
        max_prob = float('-inf')
        max_label = None
        for label in post_p:
            if label in prior_p:
                prob = post_p[label]
                if prob > max_prob:
                    max_prob = prob
                    max_label = label
        if max_label is not None:
            argmax_labels.append(max_label)

    return argmax_labels

In [10]:
# This function should evaliate the prediction performance by comparing your model’s class outputs to ground
# truth labels
def evaluate(predictions, data):
    tp, tn, fp, fn = find_tp_tn_fp_fn(data.values[:, -1].tolist(), predictions)

    accuracy = (tp + tn) / (tp + fp + fn + tn)
    precision = tp / (tp + fp)
    recall = tp / (tp + fn)
    
    return accuracy, precision, recall

## Task 1. Pop vs. classical music classification

### Q1
Compute and report the accuracy, precision, and recall of your model (treat "classical" as the "positive" class).

In [11]:
# First, we preprocess the data
train_df = preprocess("music_data/pop_vs_classical_train.csv")
test_df = preprocess("music_data/pop_vs_classical_test.csv")

# Next, we train the model
prior_p, distr = train(train_df)

# Now, we predict labels for the test_data
predictions = predict(test_df.iloc[:,:-1], prior_p, distr)

# Now find our evaluation metrics
accuracy, precision, recall = evaluate(predictions, test_df)
print(f"Accuracy is: {accuracy}\nPrecision is: {precision}\nRecall is: {recall}")

Accuracy is: 0.9767441860465116
Precision is: 0.9523809523809523
Recall is: 1.0


### Q2
For each of the features X below, plot the probability density functions P(X|Class = pop) and P(X|Class = classical). If you had to classify pop vs. classical music using just one of these three features, which feature would you use and why? Refer to your plots to support your answer.
- spectral centroid mean
- harmony mean
- tempo

In [12]:
def make_pdf(distr, feature, title, filename, axis):
    plt.plot(axis, norm.pdf(axis, distr[0][feature]['classical'], distr[1][feature]['classical']), color='b')
    plt.plot(axis, norm.pdf(axis, distr[0][feature]['pop'], distr[1][feature]['pop']), color='r')
    plt.legend(["classical", "pop"], loc="upper right")
    plt.title(title)
    plt.savefig(filename)
    plt.close()
    return

In [13]:
# Construct pdf for spectral_centroid_mean
make_pdf(distr, 'spectral_centroid_mean', 
         "PDFs of Spectral Centroid Mean for Each Label", 
         'scm_pdf.png', np.arange(400, 4600, 0.1))

# Construct pdf for harmony_mean
make_pdf(distr, 'harmony_mean',
         "PDFs of Harmony Mean for Each Label",
         'hm_pdf.png', np.arange(-0.004, 0.012, 0.000001))

# Construct pdf for tempo
make_pdf(distr, 'tempo', 
         "PDFs of Tempo for Each Label", 
         'tempo_pdf.png', np.arange(65, 235, 0.001))

## Task 2. 10-way music genre classification

#### NOTE: you may develop codes or functions to help respond to the question here, but your formal answer must be submitted separately as a PDF.

In [14]:
# Predicts values using 0R
def zero_r_predictions(train, test):
    out = []

    prediction = train['label'].mode()
    for i in range(len(test)):
        out.append(prediction)

    return out

In [15]:
# Predicts values using one-attribute baseline
def one_attr_predictions(train_df, test_df):
    # Make a data frame to store attributes for one-attribute naive bayes and 
    # their error rates
    accuracy_df = pd.DataFrame(columns = ['accuracy', 'predictions'])

    # List of all attributes, besides label
    feature_list = train_df.iloc[: , :-1].columns

    # Iterate over each feature and construct Naive Bayes
    for feature in feature_list:
        # Train using one attribute
        prior_p, distr = train(train_df[[feature, 'label']])

        # Predict using one attribute model
        predictions = predict(test_df[[feature]], prior_p, distr)

        # Save accuracy rate
        accuracy_df.loc[feature] = [metrics.accuracy_score(test_df['label'].tolist(), predictions), predictions]

    # Find feature with highest accuracy
    best_feature = accuracy_df['accuracy'].idxmax()
    print(f"Feature used for one-attribute naive Bayes is {best_feature}\n")

    return accuracy_df.at[best_feature, 'predictions']

In [16]:
# First, we preprocess the data
train_df = preprocess("music_data/gztan_train.csv")
test_df = preprocess("music_data/gztan_train.csv")

# Next, we train the model
prior_p, distr = train(train_df)

# Now, we predict labels for the test_data
predictions = predict(test_df.iloc[:,:-1], prior_p, distr)

### Q3
Compare the performance of the full model to a 0R baseline and a one-attribute baseline. The one-attribute baseline should be the best possible naive Bayes model which uses only a prior and a single attribute. In your write-up, explain how you implemented the 0R and one-attribute baselines.

In [17]:
labels = test_df['label'].unique()

# Find actual labels of test values
true_labels = test_df.values[:, -1].tolist() 

# To prepare for cross-model performance over categories
model_predictions = [predictions, 
                    zero_r_predictions(train_df, test_df), 
                    one_attr_predictions(train_df, test_df)]
model_names = ["Naive Bayes", "Zero R Baseline", "One Attribute Baseline"]
cf_filenames = ["nb_cf.png", "zero_r_cf.png", "one_attr_cf.png"]

for i in range(3):
    # Evaluating accuracy of model
    accuracy = metrics.accuracy_score(true_labels, model_predictions[i])
    print(f"Overall accuracy score of {model_names[i]} is: {accuracy}")

    # Evaluating model based on performance over categories
    conf_mat = metrics.confusion_matrix(true_labels, model_predictions[i])

    # Plot confusion matrix
    conf_mat_display = metrics.ConfusionMatrixDisplay(conf_mat, 
                    display_labels = labels)
    conf_mat_display.plot()
    plt.xticks(rotation=90)
    plt.subplots_adjust(bottom=0.25)
    plt.title(f"{model_names[i]} Confusion Matrix")
    plt.savefig(cf_filenames[i])
    plt.close()

Feature used for one-attribute naive Bayes is chroma_stft_mean

Overall accuracy score of Naive Bayes is: 0.5875
Overall accuracy score of Zero R Baseline is: 0.1075
Overall accuracy score of One Attribute Baseline is: 0.28125
