In [6]:
# Load dependencies

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
import os

In [7]:
# Loading the CSV file 
file_path = '~/Desktop/project-3/Dataset/songs_normalize.csv'

# Load the dataset
songs_df = pd.read_csv(file_path)


In [8]:
# Cleaning up the Data - checking for missing values 
missing_values = songs_df.isnull().sum()
print("Missing Values:\n", missing_values)

# Drop rows with missing values 
songs_df_cleaned = songs_df.dropna()

# Verfiy if missing values have been dropped 
print("Missing Values After Cleanup:\n", songs_df_cleaned.isnull().sum())

Missing Values:
 artist              0
song                0
duration_ms         0
explicit            0
year                0
popularity          0
danceability        0
energy              0
key                 0
loudness            0
mode                0
speechiness         0
acousticness        0
instrumentalness    0
liveness            0
valence             0
tempo               0
genre               0
dtype: int64
Missing Values After Cleanup:
 artist              0
song                0
duration_ms         0
explicit            0
year                0
popularity          0
danceability        0
energy              0
key                 0
loudness            0
mode                0
speechiness         0
acousticness        0
instrumentalness    0
liveness            0
valence             0
tempo               0
genre               0
dtype: int64


In [9]:
# Standardizing Genere Names - Splitting multi-genre 
songs_df_cleaned['genre'] = songs_df_cleaned['genre'].apply(lambda x: x.split(','[0].strip()))
print(songs_df_cleaned['genre'].value_counts())

genre
[pop]                                        428
[hip hop,  pop]                              277
[hip hop,  pop,  R&B]                        244
[pop,  Dance/Electronic]                     221
[pop,  R&B]                                  178
[hip hop]                                    124
[hip hop,  pop,  Dance/Electronic]            78
[rock]                                        58
[rock,  pop]                                  43
[Dance/Electronic]                            41
[rock,  metal]                                38
[pop,  latin]                                 28
[pop,  rock]                                  26
[set()]                                       22
[hip hop,  Dance/Electronic]                  16
[latin]                                       15
[pop,  rock,  metal]                          14
[hip hop,  pop,  latin]                       14
[R&B]                                         13
[pop,  rock,  Dance/Electronic]               13
[country]     

In [10]:
# Standardize text in artist, song, and genre columns
songs_df_cleaned['artist'] = songs_df_cleaned['artist'].str.lower().str.strip()
songs_df_cleaned['song'] = songs_df_cleaned['song'].str.lower().str.strip()

In [12]:
# Normalize genre names by keeping only the first genre (if multiple genres are listed)
# This handles both cases: if the genre is a list or a string
songs_df_cleaned['genre'] = songs_df_cleaned['genre'].apply(lambda x: x[0] if isinstance(x, list) else x.split(',')[0].strip())

# Check the cleaned genre column
print(songs_df_cleaned['genre'].head())

0     pop
1    rock
2     pop
3    rock
4     pop
Name: genre, dtype: object


In [13]:
# Remove duplicate rows
songs_df_cleaned = songs_df_cleaned.drop_duplicates()

In [14]:
# Normalize text by converting artist and song names to lowercase and stripping extra spaces
songs_df_cleaned['artist'] = songs_df_cleaned['artist'].str.lower().str.strip()
songs_df_cleaned['song'] = songs_df_cleaned['song'].str.lower().str.strip()

In [15]:
# Convert duration from milliseconds to minutes
songs_df_cleaned['duration_minutes'] = songs_df_cleaned['duration_ms'] / 60000

In [16]:
# Check for any invalid years (e.g., negative values or far future years)
songs_df_cleaned = songs_df_cleaned[songs_df_cleaned['year'].between(1900, 2024)]

In [17]:
# Remove outliers beyond a certain threshold (for example, using 1.5 * IQR rule)
Q1 = songs_df_cleaned[['tempo', 'energy', 'loudness']].quantile(0.25)
Q3 = songs_df_cleaned[['tempo', 'energy', 'loudness']].quantile(0.75)
IQR = Q3 - Q1

# Define lower and upper bounds for acceptable values
lower_bound = Q1 - 1.5 * IQR
upper_bound = Q3 + 1.5 * IQR

# Remove rows that contain outliers
songs_df_cleaned = songs_df_cleaned[~((songs_df_cleaned[['tempo', 'energy', 'loudness']] < lower_bound) | 
                                      (songs_df_cleaned[['tempo', 'energy', 'loudness']] > upper_bound)).any(axis=1)]

In [18]:
# Display the cleaned data (first 5 rows and summary)
print(songs_df_cleaned.head())
print("\nSummary of the cleaned data:\n", songs_df_cleaned.describe())
print("\nRemaining missing values (should be 0 for all columns):\n", songs_df_cleaned.isnull().sum())
print(f"\nNumber of duplicates after cleaning: {songs_df_cleaned.duplicated().sum()}")

           artist                    song  duration_ms  explicit  year  \
0  britney spears  oops!...i did it again       211160     False  2000   
1       blink-182    all the small things       167066     False  1999   
2      faith hill                 breathe       250546     False  1999   
3        bon jovi            it's my life       224493     False  2000   
4          *nsync             bye bye bye       200560     False  2000   

   popularity  danceability  energy  key  loudness  mode  speechiness  \
0          77         0.751   0.834    1    -5.444     0       0.0437   
1          79         0.434   0.897    0    -4.918     1       0.0488   
2          66         0.529   0.496    7    -9.007     1       0.0290   
3          78         0.551   0.913    0    -4.063     0       0.0466   
4          65         0.614   0.928    8    -4.806     0       0.0516   

   acousticness  instrumentalness  liveness  valence    tempo genre  \
0        0.3000          0.000018    0.3550  

In [19]:
# Save the cleaned dataset to a new CSV file
songs_df_cleaned.to_csv('cleaned_songs_data.csv', index=False)

In [27]:
heading = "VISUALIZATION CODE STARTING HERE"

print(f"{heading:=^30}")

question = "What are the defining characteristics of the most popular songs?, specifically focusing on loudness and valence for the top and bottom 10% of popular songs"
print(f"{question:=^150}")

VISUALIZATION CODE STARTING HERE
What are the defining characteristics of the most popular songs?, specifically focusing on loudness and valence for the top and bottom 10% of popular songs


In [29]:
# Determine the top and bottom 10% of songs by popularity
top_10_percent = songs_df_cleaned['popularity'].quantile(0.9)
bottom_10_percent = songs_df_cleaned['popularity'].quantile(0.1)

In [30]:
top_songs = songs_df_cleaned[songs_df_cleaned['popularity'] >= top_10_percent]
bottom_songs = songs_df_cleaned[songs_df_cleaned['popularity'] <= bottom_10_percent]

In [31]:
# Plot for Top 10% songs
fig_top = px.scatter(top_songs, x='loudness', y='valence', 
                     color='popularity', 
                     title="Top 10% Most Popular Songs: Loudness vs Valence",
                     labels={'loudness':'Loudness (dB)', 'valence':'Valence (Positivity)'},
                     hover_name='song',
                     size='popularity',
                     color_continuous_scale=px.colors.sequential.Peach,
                     template='plotly_dark')

In [32]:
# Plot for Bottom 10% songs
fig_bottom = px.scatter(bottom_songs, x='loudness', y='valence', 
                        color='popularity', 
                        title="Bottom 10% Least Popular Songs: Loudness vs Valence",
                        labels={'loudness':'Loudness (dB)', 'valence':'Valence (Positivity)'},
                        hover_name='song',
                        size='popularity',
                        color_continuous_scale=px.colors.sequential.Peach,
                        template='plotly_dark')


In [33]:
# 3. Show the plots
fig_top.show()
fig_bottom.show()