# Music/Sentiments Research - EDA for single sentiment

In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
# Load the MFD predictions CSV. The path is specific to the notebook
# environment; point it at your own copy of the file if needed.
file_name = '/data/notebook_files/MFD_Predictions_May2023.csv'
dataset = (
    pd.read_csv(file_name)
    # 'Unnamed: 0' is presumably a row index saved with the CSV — verify.
    .drop(columns='Unnamed: 0')
    # The rest of the notebook refers to the label column as 'First_Label'.
    .rename(columns={'label': 'First_Label'})
)

In [3]:
# Load the full sample CSV. This rebinds both `file_name` and `dataset`, so
# whatever was previously stored under those names is replaced.
file_name = '/data/notebook_files/Fully_Sample_Dataset.csv'
dataset = (
    pd.read_csv(file_name)
    # 'Unnamed: 0' is presumably a row index saved with the CSV — verify.
    .drop(columns='Unnamed: 0')
    # The rest of the notebook refers to the label column as 'First_Label'.
    .rename(columns={'label': 'First_Label'})
)

In [4]:
# Peek at five randomly chosen rows of the loaded data.
dataset.sample(n=5)

In [5]:
# Number of rows per sentiment label.
dataset['First_Label'].value_counts()

In [6]:
# Audio-feature columns analysed in this notebook.
spotify_features = [
    'danceability', 'energy', 'key', 'loudness', 'mode', 'speechiness',
    'acousticness', 'instrumentalness', 'liveness', 'valence', 'tempo',
]

# Identifier columns followed by every audio-feature column (a separate list,
# not an alias of `spotify_features`).
eda_cols = ['song_id', 'group_id', 'user_id', *spotify_features]

## First, Let's see the differences between each sentiment:
Before diving into each sentiment's data and focusing on one of them, let's look for interesting insights in the differences between the sentiments,  
or in other words: **how do the features influence the sentiments?**

In [7]:
# Features to draw, one per panel of the 3x3 grid.
features = ['danceability', 'energy', 'key', 'loudness', 'speechiness',
            'acousticness', 'liveness', 'valence', 'tempo']

# Build the grid and flatten it so panels can be paired with features in order.
fig, axes = plt.subplots(3, 3, figsize=(20, 16), gridspec_kw={'wspace': 0.3, 'hspace': 0.4})
axes = axes.flatten()

# One point plot per feature: feature value on x, sentiment label on y.
for panel, feature_name in zip(axes, features):
    sns.pointplot(data=dataset, x=feature_name, y='First_Label', color='cornflowerblue', ax=panel)
    panel.set_title(feature_name.capitalize(), fontweight='bold', fontsize=14)
    # The panel title already names the feature, so the axis labels are blanked.
    panel.set_xlabel('')
    panel.set_ylabel('')
    for side in ('top', 'right'):
        panel.spines[side].set_visible(False)
    panel.grid(axis='x', linestyle='--', alpha=0.7)

# Overall title, then write the figure to disk.
fig.suptitle('Ranges plot for each Feature and Sentiment', fontsize=20, fontweight='bold')
fig.savefig('Sentiments_Ranges_plot.png')

In [16]:
# Flat list of feature names. The previous version wrapped the list in a
# second pair of brackets, producing a one-element list of lists, unlike the
# other `features` definitions in this notebook.
features = ['danceability', 'energy', 'key', 'loudness', 'speechiness',
            'acousticness', 'liveness', 'valence', 'tempo', 'instrumentalness']

In [17]:
import seaborn as sns
import matplotlib.pyplot as plt

# Feature to plot; change this name to draw the same chart for another column.
feature_name = 'instrumentalness'

# Per-sentiment mean of the feature, ascending. Its index fixes the row order
# of the plot and its values feed the connecting line below.
means = dataset.groupby('First_Label')[feature_name].mean().sort_values()

# Figure size and white background.
# NOTE(review): seaborn documents `set_style`'s `rc` as limited to style
# parameters, so the figsize entry passed to it is probably redundant — confirm.
figure_rc = {'figure.figsize': (15, 10)}
sns.set(rc=figure_rc)
sns.set_style(style='white', rc=figure_rc)

# Unjoined point per sentiment (mean with a 50% confidence interval),
# rows ordered by the group means computed above.
ax = sns.pointplot(
    data=dataset,
    x=feature_name,
    y='First_Label',
    order=means.index,
    estimator='mean',
    errorbar=('ci', 50),
    color='cornflowerblue',
    markers='.',
    join=False,
)
ax.grid(axis='x', linestyle='--', alpha=0.7)

# Dashed line through the per-sentiment means.
plt.plot(means.values, means.index, color='tomato', linestyle='--', marker='o', markersize=7)

# Vertical reference line at the mean of the feature over the whole dataset.
mean_value = dataset[feature_name].mean()
plt.axvline(x=mean_value, color='red', linestyle='--')

# Hide the top and right borders.
for side in ('top', 'right'):
    ax.spines[side].set_visible(False)

# Reduce the number of ticks on the x axis.
ax.locator_params(axis='x', nbins=7)

# The figure is exported without a title or axis labels, with large tick labels.
ax.set_title('')
ax.set_ylabel('')
ax.set_xlabel('')
ax.tick_params(axis='x', which='major', labelsize=27)
ax.tick_params(axis='y', which='major', labelsize=31)

# Save under a name derived from the feature, then display.
plt.savefig(f'{feature_name.capitalize()}_Ranges.png')
plt.show()

## Metrics Comparison Table for selected feature.
Just select the desired feature and see the differences between the mean, median, minimum and maximum values for each emotion.

In [19]:
# Emotions to compare, and the feature whose statistics are tabulated.
emotions = ['joy', 'neutral', 'sadness', 'surprise', 'fear', 'disgust', 'anger']
feature_name = 'danceability'

# One grouped aggregation instead of growing an empty frame row by row with
# `.loc`: the statistics come back as ordinary numeric columns.
compare_df = (
    dataset.groupby('First_Label')[feature_name]
    .agg(Mean='mean', Median='median', Minimum='min', Maximum='max')
    # Keep exactly the listed emotions; one with no rows gets a NaN row,
    # as it did when each subset was summarised individually.
    .reindex(emotions)
    # Drop the 'First_Label' index name so the table has an unnamed index.
    .rename_axis(None)
    .sort_values(by='Mean', ascending=False)
)

print(f'Metrics Comparison Table for {feature_name.capitalize()}:')

compare_df

## Now, we'll dive into 'Joy':
### We'll start by understanding the distribution of each feature, focusing on the 'joy' data.

In [23]:
# Sentiment under study, and its rows restricted to the audio-feature columns.
emotion = 'joy'
emotion_df = dataset.loc[dataset['First_Label'] == emotion, spotify_features]

In [24]:
# Summary statistics of every audio-feature column for the selected emotion.
emotion_df.loc[:, spotify_features].describe()

In [25]:
# Features to draw, one per panel of the 3x3 grid.
features = ['danceability', 'energy', 'key', 'loudness', 'speechiness',
            'acousticness', 'liveness', 'valence', 'tempo']

# Build the grid and flatten it so panels can be paired with features in order.
fig, axes = plt.subplots(3, 3, figsize=(20, 16), gridspec_kw={'wspace': 0.3, 'hspace': 0.4})
axes = axes.flatten()

# Density histogram with a KDE curve on top for each feature.
for panel, feature_name in zip(axes, features):
    sns.histplot(data=emotion_df, x=feature_name, color='cornflowerblue', stat='density', ax=panel)
    sns.kdeplot(data=emotion_df, x=feature_name, color='tomato', ax=panel)
    panel.set_title(feature_name.capitalize(), fontweight='bold', fontsize=14)
    panel.set_xlabel('')
    panel.set_ylabel('')
    for side in ('top', 'right'):
        panel.spines[side].set_visible(False)
    panel.grid(axis='y', linestyle='--', alpha=0.7)

    # Summary statistics, shown through a handle-less legend used as a text box.
    mean = emotion_df[feature_name].mean()
    median = emotion_df[feature_name].median()
    minimum = emotion_df[feature_name].min()
    maximum = emotion_df[feature_name].max()
    summary = (f'Mean: {mean:.3f}\nMedian: {median:.2f}\n'
               f'Min: {minimum:.3f}\nMax: {maximum:.3f}')

    # Only the loudness panel places its box on the left; all others use the right.
    legend_loc = 'upper left' if feature_name == 'loudness' else 'upper right'
    panel.legend([summary], loc=legend_loc, fontsize=11, handlelength=0)

# Delete any panels beyond the number of features (none when the grid is full).
for leftover in axes[len(features):]:
    fig.delaxes(leftover)

# Overall title, then write the figure to disk.
fig.suptitle(f'Density Distribution of Audio Features for {emotion}ful Music', fontsize=20, fontweight='bold')

plt.savefig(f'{emotion}ful_Density_Distribution.png')

### Focus on one Feature Density Distribution plot:

In [26]:
# Add summary statistics to plot
mean = emotion_df['danceability'].mean()
median = emotion_df['danceability'].median()
minimum = emotion_df['danceability'].min()
maximum = emotion_df['danceability'].max()

In [27]:
# Density plot of danceability for the selected emotion.
sns.set(rc={'figure.figsize': (11, 8)})
sns.set_style(style='white', rc={'figure.figsize': (11, 8)})

ax = sns.histplot(data=emotion_df, x="danceability", stat='density', kde=False, color='cornflowerblue')

# Place the annotations relative to the top of the y axis: mean/median near
# the top, min and max near the bottom at their own x positions.
y_top = ax.get_ylim()[1]
ax.text(mean + 0.1, y_top * 0.8, f"Mean: {mean:.3f}\n Median: {median:.2f}", ha='center', fontsize=11.5)
ax.text(minimum, y_top * 0.1, f"Min: {minimum:.3f}", ha='left', fontsize=11.5)
ax.text(maximum, y_top * 0.1, f"Max: {maximum:.3f}", ha='right', fontsize=11.5)

# KDE curve drawn over the histogram.
sns.kdeplot(data=emotion_df, x="danceability", color='red', ax=ax)

for side in ('top', 'right'):
    ax.spines[side].set_visible(False)
ax.set_title('Density Distribution of Danceability for Joyful Music', fontweight='bold', fontsize=20)
plt.savefig('Danceability_Density_Distribution_Joy.png')

### Now, the relations between each pair of features:

In [28]:
# Columns compared pairwise in the grid.
pair_features = ['danceability', 'energy', 'loudness', 'speechiness',
                 'acousticness', 'liveness', 'valence', 'tempo']

# Histograms on the diagonal, scatter plots everywhere else.
g = sns.PairGrid(emotion_df[pair_features])
g.map_diag(sns.histplot)
g.map_offdiag(sns.scatterplot)

# Save the current figure.
plt.savefig('Features_Pair_Grid.png')