In [1]:
import numpy as np
import pandas as pd
import plotly.express as px
import plotly.graph_objects as go

from plotly.subplots import make_subplots

# 1. Read data and analyze it

In [2]:
input_df = pd.read_csv('/kaggle/input/most-streamed-spotify-songs-2024/Most Streamed Spotify Songs 2024.csv', encoding='ISO-8859-1')

In [49]:
input_df.head()

Unnamed: 0,Track,Album Name,Artist,Release Date,ISRC,All Time Rank,Track Score,Spotify Streams,Spotify Playlist Count,Spotify Playlist Reach,...,SiriusXM Spins,Deezer Playlist Count,Deezer Playlist Reach,Amazon Playlist Count,Pandora Streams,Pandora Track Stations,Soundcloud Streams,Shazam Counts,TIDAL Popularity,Explicit Track
0,MILLION DOLLAR BABY,Million Dollar Baby - Single,Tommy Richman,4/26/2024,QM24S2402528,1,725.4,390470936,30716,196631588,...,684,62.0,17598718,114.0,18004655,22931,4818457.0,2669262,,0
1,Not Like Us,Not Like Us,Kendrick Lamar,5/4/2024,USUG12400910,2,545.9,323703884,28113,174597137,...,3,67.0,10422430,111.0,7780028,28444,6623075.0,1118279,,1
2,i like the way you kiss me,I like the way you kiss me,Artemas,3/19/2024,QZJ842400387,3,538.4,601309283,54331,211607669,...,536,136.0,36321847,172.0,5022621,5639,7208651.0,5285340,,0
3,Flowers,Flowers - Single,Miley Cyrus,1/12/2023,USSM12209777,4,444.9,2031280633,269802,136569078,...,2182,264.0,24684248,210.0,190260277,203384,,11822942,,0
4,Houdini,Houdini,Eminem,5/31/2024,USUG12403398,5,423.3,107034922,7223,151469874,...,1,82.0,17660624,105.0,4493884,7006,207179.0,457017,,1


In [50]:
input_df.shape

(4600, 29)

In [26]:
input_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4600 entries, 0 to 4599
Data columns (total 29 columns):
 #   Column                      Non-Null Count  Dtype  
---  ------                      --------------  -----  
 0   Track                       4600 non-null   object 
 1   Album Name                  4600 non-null   object 
 2   Artist                      4595 non-null   object 
 3   Release Date                4600 non-null   object 
 4   ISRC                        4600 non-null   object 
 5   All Time Rank               4600 non-null   object 
 6   Track Score                 4600 non-null   float64
 7   Spotify Streams             4487 non-null   object 
 8   Spotify Playlist Count      4530 non-null   object 
 9   Spotify Playlist Reach      4528 non-null   object 
 10  Spotify Popularity          3796 non-null   float64
 11  YouTube Views               4292 non-null   object 
 12  YouTube Likes               4285 non-null   object 
 13  TikTok Posts                3427 

# 2. Clear data

Delete columns from datasets, which I will not use for visualization

In [4]:

columns_to_delete = input_df.columns[[4,27]]

df = input_df.drop(axis=1, labels=columns_to_delete)

df.shape

(4600, 27)

Fill columns, where data is null, with mean value of the column


In [5]:
df.loc[df.Artist.isnull(), 'Artist'] = 'Unknown'
df = df.ffill()

df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4600 entries, 0 to 4599
Data columns (total 27 columns):
 #   Column                      Non-Null Count  Dtype  
---  ------                      --------------  -----  
 0   Track                       4600 non-null   object 
 1   Album Name                  4600 non-null   object 
 2   Artist                      4600 non-null   object 
 3   Release Date                4600 non-null   object 
 4   All Time Rank               4600 non-null   object 
 5   Track Score                 4600 non-null   float64
 6   Spotify Streams             4600 non-null   object 
 7   Spotify Playlist Count      4600 non-null   object 
 8   Spotify Playlist Reach      4600 non-null   object 
 9   Spotify Popularity          4600 non-null   float64
 10  YouTube Views               4600 non-null   object 
 11  YouTube Likes               4600 non-null   object 
 12  TikTok Posts                4600 non-null   object 
 13  TikTok Likes                4600 

Convert All columns data types
and change dtypes for columns, where int is expected

In [6]:
df = df.convert_dtypes()

convert_to_int = [df.columns[4]] + list(df.columns[6:9]) + list(df.columns[10:16]) + \
                 list(df.columns[17:19]) + [df.columns[20]] + list(df.columns[22:26])



df.loc[:, convert_to_int] = df.loc[:, convert_to_int].apply(lambda x: x.str.replace(',', '',regex=False).astype(int))
df.iloc[:,3] = pd.to_datetime(df.iloc[:,3], format='%m/%d/%Y')

df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4600 entries, 0 to 4599
Data columns (total 27 columns):
 #   Column                      Non-Null Count  Dtype         
---  ------                      --------------  -----         
 0   Track                       4600 non-null   string        
 1   Album Name                  4600 non-null   string        
 2   Artist                      4600 non-null   string        
 3   Release Date                4600 non-null   datetime64[ns]
 4   All Time Rank               4600 non-null   int64         
 5   Track Score                 4600 non-null   Float64       
 6   Spotify Streams             4600 non-null   int64         
 7   Spotify Playlist Count      4600 non-null   int64         
 8   Spotify Playlist Reach      4600 non-null   int64         
 9   Spotify Popularity          4600 non-null   Int64         
 10  YouTube Views               4600 non-null   int64         
 11  YouTube Likes               4600 non-null   int64       

  df.loc[:, convert_to_int] = df.loc[:, convert_to_int].apply(lambda x: x.str.replace(',', '',regex=False).astype(int))
  df.loc[:, convert_to_int] = df.loc[:, convert_to_int].apply(lambda x: x.str.replace(',', '',regex=False).astype(int))
  df.loc[:, convert_to_int] = df.loc[:, convert_to_int].apply(lambda x: x.str.replace(',', '',regex=False).astype(int))
  df.loc[:, convert_to_int] = df.loc[:, convert_to_int].apply(lambda x: x.str.replace(',', '',regex=False).astype(int))
  df.loc[:, convert_to_int] = df.loc[:, convert_to_int].apply(lambda x: x.str.replace(',', '',regex=False).astype(int))
  df.loc[:, convert_to_int] = df.loc[:, convert_to_int].apply(lambda x: x.str.replace(',', '',regex=False).astype(int))
  df.loc[:, convert_to_int] = df.loc[:, convert_to_int].apply(lambda x: x.str.replace(',', '',regex=False).astype(int))
  df.loc[:, convert_to_int] = df.loc[:, convert_to_int].apply(lambda x: x.str.replace(',', '',regex=False).astype(int))
  df.loc[:, convert_to_int] = df.loc[:, 

# 2.Data Visualization

This plots represent infromation about the best artist, album, track


In [7]:
def get_best_ten_by_column(column_name):
    best_df = df.loc[:, [column_name, 'Track Score']].groupby([column_name]).sum().sort_values('Track Score', ascending=False).head(10)
    
    best_df = best_df.reset_index()
    
    return best_df


best_artists = get_best_ten_by_column('Artist')
best_albums = get_best_ten_by_column('Album Name')
best_tracks = df.iloc[:,[0,5]].head(10)

fig_artist = px.bar(best_artists,title='Top 10 Artist', x='Artist', y='Track Score', color='Track Score',
                    hover_data=['Artist'], labels={'Track Score': 'Score'}, 
                    color_continuous_scale='Cividis')


fig_album = px.bar(best_albums,title='Top 10 Albums', x='Album Name', y='Track Score', color='Track Score',
                   hover_data=['Album Name'], labels={'Track Score': 'Score'},
                   color_continuous_scale='Viridis')

fig_track = px.bar(best_tracks,title='Top 10 Tracks', x='Track', y='Track Score', color='Track Score',
                   hover_data=['Track'], labels={'Track Score': 'Score'},
                   color_continuous_scale='Plasma')

fig_artist.show()
fig_album.show()
fig_track.show()

Plot, which will shown the count of all tracks track score fo rtop 10 artist and shown their best tracks score

In [51]:
best_artists_tracks = pd.merge(df, best_artists, on='Artist', how='right')
best_artists_best_track = best_artists_tracks.loc[best_artists_tracks.groupby('Artist')['Track Score_x'].idxmax()]

fig = go.Figure()

fig.add_trace(
    go.Scatter(
        x=best_artists_best_track['Artist'],
        y=best_artists_best_track['Track Score_x'],
        mode='markers+lines',
        name='Best Track Score per Artist',
        marker=dict(color='blue', size=10),
        line=dict(dash='dash'),
        hovertext = best_artists_best_track['Track']
    )
)

fig.add_trace(
    go.Bar(
        x=best_artists['Artist'],
        y=best_artists['Track Score'],
        name='Overall Best Track Score per Artist',
        marker=dict(color='orange', opacity=0.7),
    )
)

fig.update_layout(
    title='Comparison of Best Tracks and Overall Best Scores per TOP 10 Artist',
    xaxis_title='Artist',
    yaxis_title='Track Score',
    barmode='group',
    template='plotly_white'
)

# Show the plot
fig.show()


In [9]:
max_track_by_year = df.groupby(df['Release Date'].dt.year)['Track Score'].max().reset_index()
min_track_by_year = df.groupby(df['Release Date'].dt.year)['Track Score'].min().reset_index()
mean_score_by_year = df.groupby(df['Release Date'].dt.year)['Track Score'].mean().reset_index()

fig = go.Figure()

fig.add_trace(
    go.Scatter(
        x=max_track_by_year['Release Date'],
        y=max_track_by_year['Track Score'],
        mode='lines+markers',
        name='Max Track Score by Year',
        line=dict(color='green', width=2),
        marker=dict(size=8)
    )
)

fig.add_trace(
    go.Scatter(
        x=min_track_by_year['Release Date'],
        y=min_track_by_year['Track Score'],
        mode='lines+markers',
        name='Min Track Score by Year',
        line=dict(color='red', width=2, dash='dash'),
        marker=dict(size=8)
    )
)

fig.add_trace(
    go.Scatter(
        x=mean_score_by_year['Release Date'],
        y=mean_score_by_year['Track Score'],
        mode='lines+markers',
        name='Mean Track Score by Year',
        line=dict(color='blue', width=2, dash='dot'),
        marker=dict(size=8)
    )
)

fig.update_layout(
    title='Max, Min, and Mean Track Scores by Year',
    xaxis_title='Year',
    yaxis_title='Track Score',
    template='plotly_white',
    legend=dict(orientation='h', x=0.5, y=-0.2, xanchor='center')
)

fig.show()


In [10]:
explicit_tracks_count_per_year = df[df['Explicit Track'] == 1].groupby(df['Release Date'].dt.year)['Explicit Track'].count().reset_index()

not_explicit_tracks_count_per_year = df[df['Explicit Track'] == 0].groupby(df['Release Date'].dt.year)['Explicit Track'].count().reset_index()

fig = go.Figure()

fig.add_trace(
    go.Bar(
        x=explicit_tracks_count_per_year['Release Date'],
        y=explicit_tracks_count_per_year['Explicit Track'],
        name='Explicit Tracks',
        marker=dict(color='red', opacity=0.7)
    )
)

fig.add_trace(
    go.Bar(
        x=not_explicit_tracks_count_per_year['Release Date'],
        y=not_explicit_tracks_count_per_year['Explicit Track'],
        name='Non-Explicit Tracks',
        marker=dict(color='blue', opacity=0.7)
    )
)

fig.update_layout(
    title='Explicit vs. Non-Explicit Tracks Count per Year',
    xaxis_title='Year',
    yaxis_title='Track Count',
    barmode='group',  # Display bars side by side
    template='plotly_white',
    legend=dict(orientation='h', x=0.5, y=-0.2, xanchor='center')
)

fig.show()


In [11]:
platform_streams_count = df.iloc[:,[6,10,14,17,18,22,24]].sum().sort_values(ascending=False)

fig = px.bar(platform_streams_count, 
             x=platform_streams_count.index, y=platform_streams_count.values,
            title='Streams count per Platform')
fig.show()

In [36]:
import plotly.graph_objects as go

# Assuming df is your DataFrame containing the required columns
platforms_track_score_and_stream_per_track = df.iloc[:, [0,2, 5, 6, 10, 14]]

# Create a list to store hover text information
hover_text = []
for index, row in platforms_track_score_and_stream_per_track.iterrows():
    hover_text.append(f"Track: {row['Track']}<br>Artist: {row['Artist']}")

fig = go.Figure()

fig.add_trace(go.Scatter(
    x=platforms_track_score_and_stream_per_track['Track Score'],
    y=platforms_track_score_and_stream_per_track['Spotify Streams'],
    mode="markers",
    name="Spotify Streams",
    marker=dict(color='green', opacity=0.7),
    hovertext=hover_text,  # Assign hover text from the list
    hoverinfo='text'  # Show only text info on hover
))

fig.add_trace(go.Scatter(
    x=platforms_track_score_and_stream_per_track['Track Score'],
    y=platforms_track_score_and_stream_per_track['YouTube Views'],
    mode="markers",
    name="YouTube Views",
    marker=dict(color='red', opacity=0.7),
    hovertext=hover_text,  # Assign hover text from the list
    hoverinfo='text'  # Show only text info on hover
))

fig.add_trace(go.Scatter(
    x=platforms_track_score_and_stream_per_track['Track Score'],
    y=platforms_track_score_and_stream_per_track['TikTok Views'],
    mode="markers",
    name="TikTok Views",
    marker=dict(color='black', opacity=0.7),
    hovertext=hover_text,  # Assign hover text from the list
    hoverinfo='text'  # Show only text info on hover
))

fig.update_layout(
    title="Track Scores vs. Streams/Videos",
    xaxis_title="Track Score",
    yaxis_title="Streams/Views",
    hovermode='closest'  # Display hover info closest to the point
)

fig.show()


In [46]:
spotify_streams_count_per_best_artist = best_artists_tracks.groupby('Artist')['Spotify Streams'].sum()
youtube_streams_count_per_best_artist = best_artists_tracks.groupby('Artist')['YouTube Views'].sum()
tiktok_streams_count_per_best_artist = best_artists_tracks.groupby('Artist')['TikTok Views'].sum()

fig = go.Figure()

# Spotify bar trace
fig.add_trace(
    go.Bar(
        x=spotify_streams_count_per_best_artist.index,
        y=spotify_streams_count_per_best_artist.values,
        name='Spotify',
        marker=dict(color='#1DB954', opacity=0.8),
        hoverinfo='text',
        hovertext=['Artist: {}<br>Spotify Streams: {}'.format(artist, value) 
                   for artist, value in zip(spotify_streams_count_per_best_artist.index, spotify_streams_count_per_best_artist.values)]
    )
)

# YouTube bar trace
fig.add_trace(
    go.Bar(
        x=youtube_streams_count_per_best_artist.index,
        y=youtube_streams_count_per_best_artist.values,
        name='YouTube',
        marker=dict(color='#FF0000', opacity=0.8),
        hoverinfo='text',
        hovertext=['Artist: {}<br>YouTube Views: {}'.format(artist, value) 
                   for artist, value in zip(youtube_streams_count_per_best_artist.index, youtube_streams_count_per_best_artist.values)]
    )
)

# TikTok bar trace
fig.add_trace(
    go.Bar(
        x=tiktok_streams_count_per_best_artist.index,
        y=tiktok_streams_count_per_best_artist.values,
        name='TikTok',
        marker=dict(color='#000000', opacity=0.8),
        hoverinfo='text',
        hovertext=['Artist: {}<br>TikTok Views: {}'.format(artist, value) 
                   for artist, value in zip(tiktok_streams_count_per_best_artist.index, tiktok_streams_count_per_best_artist.values)]
    )
)

# Update layout
fig.update_layout(
    title='Platform Streams/Views Count per Best Artist',
    xaxis_title='Artist',
    yaxis_title='Total Streams/Views',
    barmode='group',  # Group bars side by side
    template='plotly_white',
    xaxis_tickangle=-45,  # Rotate x-axis labels for better readability
    legend=dict(
        orientation='h',  # Horizontal legend
        yanchor="bottom",
        y=1.02,
        xanchor="center",
        x=0.5,
        title_text='Platforms'
    ),
    margin=dict(t=80, b=150)
)

fig.update_traces(
    hoverlabel=dict(
        bgcolor="white",
        font_size=12,
        font_family="Arial"
    )
)

fig.show()

In [48]:
# Grouping data
spotify_stream_count_per_year = df.groupby(df['Release Date'].dt.year)['Spotify Streams'].sum()
youtube_stream_count_per_year = df.groupby(df['Release Date'].dt.year)['YouTube Views'].sum()
tiktok_stream_count_per_year = df.groupby(df['Release Date'].dt.year)['TikTok Views'].sum()

# Creating the figure
fig = go.Figure()

# Spotify Streams
fig.add_trace(
    go.Scatter(
        x=spotify_stream_count_per_year.index,
        y=spotify_stream_count_per_year.values,
        mode='lines+markers',
        name='Spotify Streams',
        line=dict(color='green', width=3),  # Slightly thicker line for better visibility
        marker=dict(size=10)  # Larger marker size for emphasis
    )
)

# YouTube Views
fig.add_trace(
    go.Scatter(
        x=youtube_stream_count_per_year.index,
        y=youtube_stream_count_per_year.values,
        mode='lines+markers',
        name='YouTube Views',
        line=dict(color='red', width=3, dash='dash'),  # Consistent line thickness
        marker=dict(size=10)
    )
)

# TikTok Views
fig.add_trace(
    go.Scatter(
        x=tiktok_stream_count_per_year.index,
        y=tiktok_stream_count_per_year.values,
        mode='lines+markers',
        name='TikTok Views',
        line=dict(color='blue', width=3, dash='dot'),  # Consistent line thickness
        marker=dict(size=10)
    )
)

# Updating layout
fig.update_layout(
    title='Spotify, YouTube, and TikTok Streams by Year',  # Clear and precise title
    xaxis_title='Year',
    yaxis_title='Total Streams/Views',
    template='plotly_white',
    legend=dict(
        orientation='h',
        x=0.5,
        y=1.1,
        xanchor='center',
        yanchor='bottom',
        bgcolor='rgba(255, 255, 255, 0.8)',  # Transparent background for legend
        bordercolor='lightgrey',
        borderwidth=1
    ),
    margin=dict(l=50, r=50, t=100, b=50)  # Adjusted margins for better spacing
)

# Showing the figure
fig.show()