In [1]:
# Load dependencies

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import os

In [8]:
# Loading the CSV file 
file_path = '~/Desktop/project-3/Dataset/songs_normalize.csv'

# Load the dataset
songs_df = pd.read_csv(file_path)


In [9]:
# Cleaning up the Data - checking for missing values 
missing_values = songs_df.isnull().sum()
print("Missing Values:\n", missing_values)

# Drop rows with missing values 
songs_df_cleaned = songs_df.dropna()

# Verfiy if missing values have been dropped 
print("Missing Values After Cleanup:\n", songs_df_cleaned.isnull().sum())

Missing Values:
 artist              0
song                0
duration_ms         0
explicit            0
year                0
popularity          0
danceability        0
energy              0
key                 0
loudness            0
mode                0
speechiness         0
acousticness        0
instrumentalness    0
liveness            0
valence             0
tempo               0
genre               0
dtype: int64
Missing Values After Cleanup:
 artist              0
song                0
duration_ms         0
explicit            0
year                0
popularity          0
danceability        0
energy              0
key                 0
loudness            0
mode                0
speechiness         0
acousticness        0
instrumentalness    0
liveness            0
valence             0
tempo               0
genre               0
dtype: int64


In [11]:
# Standardizing Genere Names - Splitting multi-genre 
songs_df_cleaned['genre'] = songs_df_cleaned['genre'].apply(lambda x: x.split(','[0].strip()))
print(songs_df_cleaned['genre'].value_counts())

genre
[pop]                                        428
[hip hop,  pop]                              277
[hip hop,  pop,  R&B]                        244
[pop,  Dance/Electronic]                     221
[pop,  R&B]                                  178
[hip hop]                                    124
[hip hop,  pop,  Dance/Electronic]            78
[rock]                                        58
[rock,  pop]                                  43
[Dance/Electronic]                            41
[rock,  metal]                                38
[pop,  latin]                                 28
[pop,  rock]                                  26
[set()]                                       22
[hip hop,  Dance/Electronic]                  16
[latin]                                       15
[pop,  rock,  metal]                          14
[hip hop,  pop,  latin]                       14
[R&B]                                         13
[pop,  rock,  Dance/Electronic]               13
[country]     

In [30]:
# Standardize text in artist, song, and genre columns
songs_df_cleaned['artist'] = songs_df_cleaned['artist'].str.lower().str.strip()
songs_df_cleaned['song'] = songs_df_cleaned['song'].str.lower().str.strip()

In [31]:
# Normalize genre names by keeping only the first genre (if multiple genres are listed)
songs_df_cleaned['genre'] = songs_df_cleaned['genre'].apply(lambda x: x.split(',')[0].strip())

In [32]:
# Remove duplicate rows
songs_df_cleaned = songs_df_cleaned.drop_duplicates()

In [19]:
# Normalize text by converting artist and song names to lowercase and stripping extra spaces
songs_df_cleaned['artist'] = songs_df_cleaned['artist'].str.lower().str.strip()
songs_df_cleaned['song'] = songs_df_cleaned['song'].str.lower().str.strip()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  songs_df_cleaned['artist'] = songs_df_cleaned['artist'].str.lower().str.strip()
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  songs_df_cleaned['song'] = songs_df_cleaned['song'].str.lower().str.strip()


In [21]:
# Convert duration from milliseconds to minutes
songs_df_cleaned['duration_minutes'] = songs_df_cleaned['duration_ms'] / 60000

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  songs_df_cleaned['duration_minutes'] = songs_df_cleaned['duration_ms'] / 60000


In [22]:
# Check for any invalid years (e.g., negative values or far future years)
songs_df_cleaned = songs_df_cleaned[songs_df_cleaned['year'].between(1900, 2024)]

In [23]:
# Remove outliers beyond a certain threshold (for example, using 1.5 * IQR rule)
Q1 = songs_df_cleaned[['tempo', 'energy', 'loudness']].quantile(0.25)
Q3 = songs_df_cleaned[['tempo', 'energy', 'loudness']].quantile(0.75)
IQR = Q3 - Q1

# Define lower and upper bounds for acceptable values
lower_bound = Q1 - 1.5 * IQR
upper_bound = Q3 + 1.5 * IQR

# Remove rows that contain outliers
songs_df_cleaned = songs_df_cleaned[~((songs_df_cleaned[['tempo', 'energy', 'loudness']] < lower_bound) | 
                                      (songs_df_cleaned[['tempo', 'energy', 'loudness']] > upper_bound)).any(axis=1)]

In [33]:
# Display the cleaned data (first 5 rows and summary)
print(songs_df_cleaned.head())
print("\nSummary of the cleaned data:\n", songs_df_cleaned.describe())
print("\nRemaining missing values (should be 0 for all columns):\n", songs_df_cleaned.isnull().sum())
print(f"\nNumber of duplicates after cleaning: {songs_df_cleaned.duplicated().sum()}")

           artist                    song  duration_ms  explicit  year  \
0  britney spears  oops!...i did it again       211160         0  2000   
1       blink-182    all the small things       167066         0  1999   
2      faith hill                 breathe       250546         0  1999   
3        bon jovi            it's my life       224493         0  2000   
4          *nsync             bye bye bye       200560         0  2000   

   popularity  danceability    energy  key  loudness  mode  speechiness  \
0          77      0.735225  0.763271    1  0.488807     0       0.0437   
1          79      0.360520  0.853659    0  0.545968     1       0.0488   
2          66      0.472813  0.278336    7  0.101608     1       0.0290   
3          78      0.498818  0.876614    0  0.638883     0       0.0466   
4          65      0.573286  0.898135    8  0.558140     0       0.0516   

   acousticness  instrumentalness  liveness   valence     tempo         genre  \
0        0.3000        

In [29]:
# Save the cleaned dataset to a new CSV file
songs_df_cleaned.to_csv('cleaned_songs_data.csv', index=False)