In [None]:
# Our team decided to use the Spotify tracks dataset.
# The question our group will explore is: "Can we predict a song's popularity from its musical features?"
# The main columns we will use are track_id, popularity, explicit, danceability, energy, loudness, liveness, tempo, and track_genre.
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns

# Load the raw Spotify tracks table. `df` is left untouched by everything
# below, so the cleaning steps can be re-run without re-reading the CSV.
df = pd.read_csv("hf://datasets/maharshipandya/spotify-tracks-dataset/dataset.csv")

# Check for missing values
print("Missing values before cleaning:\n", df.isnull().sum())

# Drop rows where critical features like track_id or track_name are missing.
# The explicit .copy() makes df_cleaned an independent frame, so later writes
# cannot raise SettingWithCopyWarning or leak back into `df`.
df_cleaned = df.dropna(subset=['track_id', 'track_name']).copy()

# Replacement value per column, computed on the rows that survived the drop:
# mean or median for numerical columns, most common value (mode) for
# categorical ones.
fill_values = {
    'popularity': df_cleaned['popularity'].mean(),
    'danceability': df_cleaned['danceability'].median(),
    'energy': df_cleaned['energy'].mean(),
    'loudness': df_cleaned['loudness'].median(),
    'liveness': df_cleaned['liveness'].mean(),
    'tempo': df_cleaned['tempo'].mean(),
    'track_genre': df_cleaned['track_genre'].mode()[0],
    'explicit': df_cleaned['explicit'].mode()[0],
}

# A dict passed to fillna fills each listed column with its own value and
# leaves every other column as it was.
df_cleaned = df_cleaned.fillna(value=fill_values)

# Report remaining missing values per column (columns not listed in
# fill_values are not filled and may still show gaps here).
print("Missing values after cleaning:\n", df_cleaned.isnull().sum())


: 

In [None]:
# More data cleaning

# CHECK FOR OUTLIERS
# IQR rule: a popularity value is flagged when it lies more than 1.5 * IQR
# below the first quartile or above the third quartile.
Q1 = df_cleaned['popularity'].quantile(0.25)
Q3 = df_cleaned['popularity'].quantile(0.75)
IQR = Q3 - Q1
lower_bound = Q1 - 1.5 * IQR
upper_bound = Q3 + 1.5 * IQR

outliers = df_cleaned[(df_cleaned['popularity'] < lower_bound) | (df_cleaned['popularity'] > upper_bound)]
# Preview a few flagged rows instead of printing the whole frame.
print(outliers.head())
print("Number of outliers in popularity:", outliers.shape[0])

# CHECK FOR DATA VALUES THAT ARE OUTSIDE OF THE POSSIBLE RANGE
# NOTE(review): bounds assumed from Spotify's published field definitions
# (popularity 0-100; danceability, energy and liveness 0-1) - confirm.
# This only reports counts; it does not modify df_cleaned.
valid_ranges = {
    'popularity': (0, 100),
    'danceability': (0.0, 1.0),
    'energy': (0.0, 1.0),
    'liveness': (0.0, 1.0),
}
for column, (low, high) in valid_ranges.items():
    # between() is inclusive on both ends; missing values count as out of range.
    out_of_range_count = (~df_cleaned[column].between(low, high)).sum()
    print(f"Values outside [{low}, {high}] in {column}:", out_of_range_count)

# CHECK FOR DUPLICATES
# Keep the first row for each distinct track_name.
# NOTE(review): this keys on track_name, not track_id, so different songs that
# share a title are collapsed to one row - confirm this is intended.
df_cleaned = df_cleaned.drop_duplicates(subset=['track_name'])

# Verify that duplicates have been removed (expected to print 0)
print("Number of duplicate track_name entries:", df_cleaned.duplicated(subset=['track_name']).sum())


In [None]:
# Descriptive statistics for the numeric columns of the cleaned data.
numeric_columns = df_cleaned.select_dtypes(include='number')
numeric_columns.describe()

In [None]:
# 1. Distribution of Popularity
# Histogram of popularity (20 bins) with a KDE curve, drawn on an explicit Axes.
fig, ax = plt.subplots(figsize=(10, 6))
sns.histplot(df_cleaned['popularity'], bins=20, kde=True, ax=ax)
ax.set_title('Distribution of Song Popularity')
ax.set_xlabel('Popularity')
ax.set_ylabel('Frequency')
plt.show()



In [None]:
# 2. Correlation heatmap to examine relationships between numerical features and popularity
# Columns entering the pairwise correlation (the target plus five numeric features).
heatmap_columns = ['popularity', 'danceability', 'energy', 'loudness', 'liveness', 'tempo']
correlation_matrix = df_cleaned[heatmap_columns].corr()

# Annotated heatmap of the correlation matrix on an explicit Axes.
fig, ax = plt.subplots(figsize=(10, 8))
sns.heatmap(correlation_matrix, annot=True, cmap='coolwarm', linewidths=0.5, ax=ax)
ax.set_title('Correlation Heatmap: Features vs Popularity')
plt.show()

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt
import pandas as pd

# Average popularity per genre, ordered from most to least popular.
df_genre_popularity = (
    df_cleaned
    .groupby('track_genre')['popularity']
    .mean()
    .reset_index()
    .sort_values(by='popularity', ascending=False)
)

# A tall figure leaves room for one horizontal bar per genre.
fig, ax = plt.subplots(figsize=(12, 20))

# Horizontal bars: genre on the y-axis, mean popularity on the x-axis.
sns.barplot(x='popularity', y='track_genre', data=df_genre_popularity, ax=ax)

# Title and axis labels
ax.set_title('Bar Plot of Track Genre vs Average Popularity')
ax.set_xlabel('Average Popularity')
ax.set_ylabel('Track Genre')

# Render the figure
plt.show()
