In [1]:
# import basic libraries
import pandas as pd
import numpy as np
import warnings
# Silence every warning for the rest of the session (no category filter is given).
# NOTE(review): this also hides deprecation warnings from the plotting/data libraries.
warnings.filterwarnings('ignore')
import matplotlib.pyplot as plt
import seaborn as sns
# Remove the column cap so wide frames are displayed without elided columns.
pd.set_option('display.max_columns',None)

In [None]:
# Read the CSV into a DataFrame and preview its first five rows.
# NOTE(review): the path is an absolute, machine-specific Windows location, so
# this cell only runs where that exact file exists.
csv_path = r"D:\Downloads\spotify-2023.csv"
data = pd.read_csv(csv_path, encoding="latin1")
data.head()

In [None]:
# List the column labels of the loaded frame.
data.columns

In [None]:
# Dataset size as a (rows, columns) tuple.
data.shape

In [None]:
# Per-column dtype and non-null count summary.
data.info()

In [None]:
# Fraction of missing values per column: null count divided by the row count.
null_counts = data.isna().sum()
null_counts.div(len(data))

In [None]:
# Replace every missing value, in every column, with 0.
data = data.fillna(0)

In [None]:
# Count of remaining missing values in each column.
data.isna().sum()

In [None]:
## Feature Engineering

In [None]:
# Parse 'streams' as numbers; entries that cannot be parsed become NaN,
# and the rows holding them are then removed.
streams_numeric = pd.to_numeric(data['streams'], errors='coerce')
data['streams'] = streams_numeric
data.dropna(subset=['streams'], inplace=True)

In [None]:
# Parse 'in_deezer_playlists' as numbers.
# Thousands separators are stripped first: pd.to_numeric cannot parse a value
# such as "1,234", so with errors='coerce' it would become NaN and the whole
# row would be dropped below even though the count itself is valid.
deezer_text = data['in_deezer_playlists'].astype(str).str.replace(',', '', regex=False)
data['in_deezer_playlists'] = pd.to_numeric(deezer_text, errors='coerce')
# Rows whose value is still not numeric after cleaning are removed.
data.dropna(subset=['in_deezer_playlists'], inplace=True)

In [None]:
# Parse 'in_shazam_charts' as numbers.
# Thousands separators are stripped first: pd.to_numeric cannot parse a value
# such as "1,234", so with errors='coerce' it would become NaN and the whole
# row would be dropped below even though the count itself is valid.
shazam_text = data['in_shazam_charts'].astype(str).str.replace(',', '', regex=False)
data['in_shazam_charts'] = pd.to_numeric(shazam_text, errors='coerce')
# Rows whose value is still not numeric after cleaning are removed.
data.dropna(subset=['in_shazam_charts'], inplace=True)

In [None]:

# Mapping dictionary for alphanumeric keys to numeric values
key_mapping = {'C': 0, 'C#': 1, 'D': 2, 'D#': 3, 'E': 4, 'F': 5, 'F#': 6, 'G': 7, 'G#': 8, 'A': 9, 'A#': 10, 'B': 11}

# Convert alphanumeric keys to numeric values.
# Series.map turns every value that is not a key of `key_mapping` into NaN;
# that includes any 0 placeholder written by the earlier fillna(0).
# The dtype guard keeps the cell safe to re-run: mapping an already-converted
# (numeric) column a second time would turn every value into NaN.
if not pd.api.types.is_numeric_dtype(data['key']):
    data['key'] = data['key'].map(key_mapping)

In [None]:
# Frequency of each distinct value in the 'mode' column.
data['mode'].value_counts()

In [None]:
# Numeric encoding for the 'mode' labels.
key_map={'Major':1.0,'Minor':0.0}
# Encode 'mode' as 1.0 (Major) / 0.0 (Minor); any other value becomes NaN.
# The dtype guard keeps the cell safe to re-run: mapping an already-encoded
# (numeric) column a second time would turn every value into NaN.
if not pd.api.types.is_numeric_dtype(data['mode']):
    data['mode'] = data['mode'].map(key_map)

In [None]:
# Cast every int64 column to float, one column at a time.
integer_columns = data.select_dtypes(include='int64').columns
for int_col in integer_columns:
    data[int_col] = data[int_col].astype(float)

In [None]:
# Show rows that repeat an earlier row in full (the first occurrence is not flagged).
duplicate_mask = data.duplicated()
data.loc[duplicate_mask]



In [None]:
## EDA

In [None]:
# Summary statistics of the dataset.
data.describe()

In [None]:
# Ten most-streamed tracks: order by 'streams' (largest first), keep the first ten.
streams_desc = data.sort_values(by='streams', ascending=False)
top_streams = streams_desc.head(10)
top_streams.loc[:, ['track_name', 'streams']]

In [None]:
# Horizontal bar chart of stream counts, one bar per track in `top_streams`.
top_streams.plot.barh(x='track_name', y='streams')
plt.title("Top 10 streamed songs")
plt.show()

In [None]:
# Ten tracks present in the most Spotify playlists (largest count first).
spotify_desc = data.sort_values(by='in_spotify_playlists', ascending=False)
sorted_spotify_playlist = spotify_desc.head(10)
sorted_spotify_playlist.loc[:, ['track_name', 'in_spotify_playlists']]

In [None]:
# Visualization: horizontal bars of Spotify playlist counts for the selected tracks.
sorted_spotify_playlist.plot.barh(x='track_name', y='in_spotify_playlists')
plt.title('Top 10 songs with highest playlist in spotify')
plt.show()

In [None]:
# Ten tracks present in the most Apple playlists, reduced to the two display columns.
apple_desc = data.sort_values(by='in_apple_playlists', ascending=False)
sorted_apple_playlist = apple_desc.head(10)[['track_name', 'in_apple_playlists']]
sorted_apple_playlist

In [None]:
# Visualization: horizontal bars of Apple playlist counts for the selected tracks.
sorted_apple_playlist.plot.barh(x='track_name', y='in_apple_playlists')
plt.title('Top 10 songs with highest playlist in Apple')
plt.show()

In [None]:
# Ten most frequent values of 'artist(s)_name', i.e. the artist credits with the most rows.
artist_counts = data['artist(s)_name'].value_counts()
top_artists = artist_counts.iloc[:10]
top_artists

In [None]:
# Vertical bar chart of the song counts held in `top_artists`.
plt.figure(figsize=(4, 4))
top_artists.plot.bar()
plt.ylabel('no. of songs')
plt.title("Top 10 Artists with most songs")
plt.show()

In [None]:
# Five release years with the most songs in the dataset.
plt.figure(figsize=(4, 4))
# Years are cast to int for display: the earlier int64 -> float conversion
# presumably left 'released_year' as float, which would label the ticks
# like "2022.0" — NOTE(review): confirm the column holds whole years only.
year_counts = data['released_year'].astype(int).value_counts().head()
year_counts.plot(kind='bar')
plt.title('Top 5 Released Years')
plt.xlabel('Year')
plt.ylabel('No. of Songs')
# Render explicitly, like the other matplotlib cells, instead of echoing a Text repr.
plt.show()

In [None]:
# Column names, for reference.
data.columns

In [None]:
# Playlist / chart presence columns for each platform, plus the stream count.
playlist_charts_colums = [
    'in_spotify_playlists', 'in_spotify_charts',
    'in_apple_playlists', 'in_apple_charts',
    'in_deezer_playlists', 'in_deezer_charts',
    'in_shazam_charts', 'streams',
]

# Pairwise Pearson correlation between those columns.
platform_metrics = data[playlist_charts_colums]
correlation_matrix = platform_metrics.corr(method='pearson')

# Annotated heatmap of the correlation matrix.
plt.figure(figsize=(7, 4))
sns.heatmap(
    correlation_matrix,
    annot=True,
    fmt="0.1g",
    cmap='inferno',
    center=0,
    linewidths=1,
    linecolor="Black",
)
plt.title('Correlation Heatmap: Cross-Platform Metrics')
plt.show()

In [None]:
# Histogram of each audio-feature column, laid out on a 3x3 grid, to eyeball
# how close each distribution is to a bell curve.
from plotly.subplots import make_subplots
import plotly.graph_objects as go

# Features in display order; the grid is filled left-to-right, top-to-bottom.
histogram_features = [
    'danceability_%', 'energy_%', 'instrumentalness_%',
    'speechiness_%', 'acousticness_%', 'liveness_%',
    'valence_%',
]
GRID_COLS = 3

fig = make_subplots(
    rows=3,
    cols=GRID_COLS,
    subplot_titles=tuple(f'<i>{feature_name}' for feature_name in histogram_features),
)
for position, feature_name in enumerate(histogram_features):
    grid_row, grid_col = divmod(position, GRID_COLS)
    fig.add_trace(
        go.Histogram(x=data[feature_name], name=feature_name),
        row=grid_row + 1,
        col=grid_col + 1,
    )

# Last expression returns the figure so the notebook renders it.
fig.update_layout(
    height=900,
    width=900,
    title_text='<b>Feature Distribution',
    template='plotly_dark',
    title_x=0.5,
)

In [None]:
## Applying regression plots to understand which song attributes contribute significantly to a song’s success

In [None]:
# Scatter of danceability against streams with a fitted regression line.
plt.figure(figsize=(5, 4))
dance_ax = sns.regplot(x='danceability_%', y='streams', data=data, color='c')
dance_ax.set_title('Danceability VS Streams')
plt.show()

In [None]:
# Scatter of valence against streams with a fitted regression line.
plt.figure(figsize=(5, 4))
valence_ax = sns.regplot(x='valence_%', y='streams', data=data, color='y')
valence_ax.set_title('valence_% VS Streams')
plt.show()

In [None]:
# Scatter of energy against streams with a fitted regression line.
plt.figure(figsize=(5, 4))
energy_ax = sns.regplot(x='energy_%', y='streams', data=data, color='g')
energy_ax.set_title('energy_% VS Streams')
plt.show()

In [None]:
# Audio features to compare, and the column used as the popularity measure.
audio_features_columns = ['danceability_%', 'energy_%', 'valence_%']
popularity_column = 'streams'

# Overlay one scatter series per feature on a single pair of axes;
# all three features share the x axis.
plt.figure(figsize=(12, 4))
for feature in audio_features_columns:
    legend_label = feature.replace('_', ' ').title()
    sns.scatterplot(
        data=data,
        x=feature,
        y=popularity_column,
        alpha=0.5,
        label=legend_label,
    )

plt.title('Relationship Between Audio Features and Song Popularity')
plt.xlabel('Audio Feature')
plt.ylabel('Number of Streams')
plt.legend()
plt.show()

In [None]:
# Audio-feature columns plus the stream count used as the success measure.
success__metrics_colums = ['danceability_%', 'energy_%', 'valence_%','acousticness_%', 'instrumentalness_%', 'liveness_%',  'speechiness_%','streams']

# Pairwise Pearson correlation between the audio features and streams.
correlation_matrix = data[success__metrics_colums].corr(method='pearson')

# Annotated heatmap of the correlation matrix.
plt.figure(figsize=(7, 4))
sns.heatmap(correlation_matrix, annot=True,fmt="0.1g",cmap='inferno', center=0,linewidths=1,linecolor="Black")
# Title names what this figure shows; the previous text was copied from the
# platform-metrics heatmap and mislabelled the plot.
plt.title('Correlation Heatmap: Audio Features vs Streams')
plt.show()

In [None]:
# Display the most recently computed correlation matrix as a table.
correlation_matrix