In [None]:
# import packages
import os

import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline

In [None]:
# Load the mood data CSV into a DataFrame and show it as the cell output.
mood_csv_path = "data/mood_data.csv"
mood_data = pd.read_csv(mood_csv_path)
mood_data

In [None]:
# Bar chart of the number of songs per primary mood, saved to primary_dist.pdf.
primary_labels = ('Epic', 'Lighthearted', 'Energetic', 'Calm', 'Chill', 'Miscellaneous')

primary_groups = mood_data.groupby('primary')
# One count per distinct 'primary' value; groupby orders the groups by that value.
primary_counts = primary_groups.size()

# The tick labels are applied purely by position, so a missing or extra mood
# category would silently shift every label. Fail loudly instead.
# NOTE(review): this also assumes the sorted 'primary' values line up with the
# order of primary_labels - confirm against the data.
if len(primary_counts) != len(primary_labels):
    raise ValueError(
        "Expected %d primary mood groups but found %d: %s"
        % (len(primary_labels), len(primary_counts), list(primary_counts.index))
    )

plt.rc('font', size=14)
ax = primary_counts.plot.bar(figsize=(15, 10), color="orange", rot=0)
ax.set_ylabel('Number of Songs')
ax.set_xlabel('Primary Mood')
ax.set_title('Primary Mood Distribution')
ax.set_xticklabels(primary_labels)
plt.savefig("primary_dist.pdf")

In [None]:
# Bar chart of the number of songs per secondary mood, saved to secondary_dist.pdf.
secondary_labels = ('Epic', 'Lighthearted', 'Energetic', 'Calm', 'Chill', 'Miscellaneous')

secondary_groups = mood_data.groupby('secondary')
# One count per distinct 'secondary' value; groupby orders the groups by that value.
secondary_counts = secondary_groups.size()

# The tick labels are applied purely by position, so a missing or extra mood
# category would silently shift every label. Fail loudly instead.
# NOTE(review): this also assumes the sorted 'secondary' values line up with the
# order of secondary_labels - confirm against the data.
if len(secondary_counts) != len(secondary_labels):
    raise ValueError(
        "Expected %d secondary mood groups but found %d: %s"
        % (len(secondary_labels), len(secondary_counts), list(secondary_counts.index))
    )

plt.rc('font', size=14)
ax = secondary_counts.plot.bar(figsize=(15, 10), color="green", rot=0)
ax.set_ylabel('Number of Songs')
ax.set_xlabel('Secondary Mood')
ax.set_title('Secondary Mood Distribution')
ax.set_xticklabels(secondary_labels)
plt.savefig("secondary_dist.pdf")

In [None]:
# Rank artists by how many rows they have in mood_data and show the ten largest.
artist_groups = mood_data.groupby('artist')
# NOTE(review): this cell draws no figure, so the font setting has no visible
# effect here; it is kept because it changes global matplotlib state.
plt.rc('font', size=14)
songs_per_artist = artist_groups.size()
ranked_artists = songs_per_artist.sort_values(ascending=False)
top10 = ranked_artists.head(10)
top10

In [None]:
# Load the basic features and strip the song-information and mood-label columns,
# leaving only the columns that feed the correlation matrix.
features = pd.read_csv("data/features.csv")
song_info_cols = ['title', 'artist', 'primary', 'secondary']
features_corr = features.drop(song_info_cols, axis=1)
features_corr

In [None]:
# Plot the correlation matrix for the basic features and save it as a PDF.
corr = features_corr.corr()
plt.matshow(corr)
# Derive the tick labels from the matrix itself so they always match its columns
# in both number and order (a hand-written list silently goes stale when the
# CSV columns change). Underscores become spaces for readability.
labels = [str(name).replace('_', ' ') for name in corr.columns]
tick_positions = range(len(corr.columns))
plt.xticks(tick_positions, labels, rotation='vertical')
plt.yticks(tick_positions, labels)
plt.savefig("data/correlation_matrix.pdf", bbox_inches='tight')
plt.show()

In [None]:
# Load the engineered features and strip the song-information and mood-label
# columns, leaving only the columns that feed the correlation matrix.
eng_features = pd.read_csv("data/engineered_features.csv")
eng_song_info_cols = ['title', 'artist', 'primary', 'secondary']
eng_features_corr = eng_features.drop(eng_song_info_cols, axis=1)
eng_features_corr

In [None]:
# Plot the correlation matrix for the engineered features and save it as a PDF.
corr = eng_features_corr.corr()
plt.matshow(corr)
# Take the labels from the correlation matrix rather than from the input frame:
# the tick positions are computed from corr.columns, and the two can differ if
# .corr() leaves out a non-numeric column.
labels = corr.columns
tick_positions = range(len(corr.columns))
plt.xticks(tick_positions, labels, rotation='vertical')
plt.yticks(tick_positions, labels)
plt.savefig("data/eng_correlation_matrix.pdf", bbox_inches='tight')
plt.show()

In [None]:
# Read the basic features again (fresh copy) together with the neural network features.
features, nn_features = map(
    pd.read_csv,
    ("data/features.csv", "data/nn_features.csv"),
)

In [None]:
# Combine the neural network features with the basic features (side by side,
# aligned on the row index) so the mood labels sit next to the NN features.
combined = pd.concat([features, nn_features], axis=1)

# Sanity check: the 'song' column must equal "<title> - <artist>" on every row;
# otherwise the two files are not row-aligned.
song_check = combined['title'] + ' - ' + combined['artist']
if not combined['song'].equals(song_check):
    # Stop here instead of only printing: continuing would attach mood labels
    # to the wrong songs in every later cell.
    raise ValueError("Features DO NOT match.")

print("Features match.")
combined = combined.drop(columns=['song'])

In [None]:
# Drop repeating columns and song information.
# errors='ignore' keeps this cell re-runnable: on a second run the columns are
# already gone and a plain drop would raise KeyError.
combined = combined.drop(
    columns=['title', 'artist', 'zero_crossing_rate', 'energy_entropy', 'spectral_centroid'],
    errors='ignore',
)

# savefig does not create missing directories, so make sure the target exists.
os.makedirs("data/feature_dist", exist_ok=True)

# Generate boxplots of the distribution of each feature, grouped once by the
# primary mood and once by the secondary mood, for feature engineering.
mood_columns = ("primary", "secondary")
for col in combined.columns:
    if col in mood_columns:
        # The mood labels are the grouping keys, not features to plot.
        continue

    for mood in mood_columns:
        combined.boxplot(column=col, by=mood)
        fig = plt.gcf()
        filename = "data/feature_dist/" + mood + "_dist_" + col + ".pdf"
        plt.ylabel(col)
        plt.xlabel(mood + ' mood')
        plt.suptitle('')  # clear the figure-level title so only the title below is shown
        plt.title(col + ' grouped by ' + mood + ' mood')
        plt.savefig(filename, bbox_inches='tight')
        plt.show()
        # Release the figure explicitly; this loop creates two per feature.
        plt.close(fig)