In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots
from plotly.offline import init_notebook_mode
import seaborn as sns
import datetime as dt
import warnings
# Silence every warning category for the rest of the session.
# NOTE(review): this blanket filter also hides deprecation notices from
# pandas/plotly/seaborn -- consider narrowing it to specific categories.
warnings.filterwarnings('ignore')
# Never truncate the column list when a wide DataFrame is displayed.
pd.set_option('display.max_columns',None)
# Initialise plotly's offline notebook rendering (connected=True asks plotly to
# load plotly.js from the web instead of embedding it in the notebook).
init_notebook_mode(connected=True)

# Data Analysis

In [None]:
# Location of the Spotify tracks CSV inside the Kaggle input directory.
DATASET_CSV = '/kaggle/input/-spotify-tracks-dataset/dataset.csv'

# Load the raw table and preview its first rows.
df = pd.read_csv(DATASET_CSV)
df.head()

In [None]:
# Remove the leftover 'Unnamed: 0' column that came in with the CSV.
# errors='ignore' keeps this cell idempotent: re-running it after the column
# has already been dropped no longer raises KeyError.
df = df.drop(columns='Unnamed: 0', errors='ignore')

In [None]:
# Print column names, dtypes, non-null counts and memory usage.
df.info()

In [None]:
# Number of missing values in each column.
df.isna().sum()

In [None]:
# Keep only rows that have an artist, an album name and a track name.
df = df.dropna(subset=['artists', 'album_name', 'track_name'])

In [None]:
# Per-column count of missing values still present in df.
df.isna().sum()

In [None]:
# Tally rows that repeat an earlier row exactly (True) versus first occurrences (False).
df.duplicated().value_counts()

In [None]:
# Discard rows that are exact copies of an earlier row.
df = df.drop_duplicates()

In [None]:
# (rows, columns) of the frame at this point in the notebook.
df.shape

In [None]:
# Summary statistics (count, mean, std, min, quartiles, max) via DataFrame.describe().
df.describe()

In [None]:
# Work on a copy so the integer encoding below never touches df.
cdf = df.copy()
label_columns = cdf.select_dtypes(include=['object', 'category']).columns
for col in label_columns:
    # Replace each text/categorical value by its integer category code so the
    # column can take part in the correlation matrix.
    cdf[col] = cdf[col].astype('category').cat.codes

# Pairwise correlation of every (now numeric) column, drawn as an annotated heatmap.
corr_matrix = cdf.corr()
fig = px.imshow(
    corr_matrix,
    text_auto=True,
    height=800,
    width=800,
    color_continuous_scale=px.colors.sequential.Blues,
    aspect='auto',
    title='<b>pairwise correlation of columns',
)
fig.update_layout(title_x=0.5)
fig.show()

In [None]:
# Track-level table: remove the genre label, then collapse the rows that
# became identical once the genre was gone.
top = df.drop(columns='track_genre').drop_duplicates()
top.head()

In [None]:
# Histograms of the first six track features, one per cell of a 3x2 grid.
# The subplot titles are generated from the same list that drives the traces,
# so the grid gets exactly one title per plotted feature (the original passed
# a dozen titles, two of them fused by a missing comma, for only six cells).
hist_features = ['popularity', 'duration_ms', 'danceability',
                 'energy', 'loudness', 'speechiness']
fig = make_subplots(
    rows=3,
    cols=2,
    subplot_titles=[f'<i>{feature}' for feature in hist_features],
)
for position, feature in enumerate(hist_features):
    # Fill the grid left-to-right, top-to-bottom (plotly rows/cols are 1-based).
    grid_row, grid_col = divmod(position, 2)
    fig.add_trace(
        go.Histogram(x=top[feature], name=feature),
        row=grid_row + 1,
        col=grid_col + 1,
    )
fig.update_layout(height=900, width=900, title_text='<b>Feature Distribution')
# Last expression of the cell: update_layout returns the figure, which the notebook displays.
fig.update_layout(template='plotly_dark', title_x=0.5)

In [None]:
# Histograms of the remaining five track features on a 3x2 grid (last cell empty).
# Plotted from `top` (one row per track) rather than `df`, where a track that
# belongs to several genres is repeated once per genre and would be counted
# multiple times; this matches the companion histogram grid, which also uses `top`.
more_hist_features = ['acousticness', 'instrumentalness', 'liveness',
                      'valence', 'tempo']
sfig = make_subplots(
    rows=3,
    cols=2,
    subplot_titles=[f'<i>{feature}' for feature in more_hist_features],
)
for position, feature in enumerate(more_hist_features):
    # Fill the grid left-to-right, top-to-bottom (plotly rows/cols are 1-based).
    grid_row, grid_col = divmod(position, 2)
    sfig.add_trace(
        go.Histogram(x=top[feature], name=feature),
        row=grid_row + 1,
        col=grid_col + 1,
    )
sfig.update_layout(height=900, width=900, title_text='<b>Feature Distribution')
# Last expression of the cell: update_layout returns the figure, which the notebook displays.
sfig.update_layout(template='plotly_dark', title_x=0.5)

In [None]:
# Total popularity per genre, highest first.
# Only the 'popularity' column is summed: a bare .sum() over the whole frame
# also "sums" the text columns (string concatenation on pandas 2.x, silently
# dropped on 1.x), which is wasteful and version-dependent. The chart only
# reads track_genre and popularity, so the picture is unchanged.
genre_popularity = (
    df.groupby('track_genre', as_index=False)['popularity']
      .sum()
      .sort_values(by='popularity', ascending=False)
)
fig = px.histogram(
    genre_popularity,
    x='track_genre',
    y='popularity',
    color_discrete_sequence=['pink'],
    template='plotly_dark',
    marginal='box',
    title='<b>Popular genres based on popularity</b>',
)
# Last expression of the cell: returns (and therefore displays) the figure.
fig.update_layout(title_x=0.5)

In [None]:
# Non-null counts per genre; the track_name count is used as "number of songs".
genre_counts = df.groupby('track_genre', as_index=False).count()
genre_counts = genre_counts.sort_values(by='track_name', ascending=False)

fig = px.histogram(
    genre_counts,
    x='track_genre',
    y='track_name',
    color_discrete_sequence=['pink'],
    template='plotly_dark',
    marginal='box',
    title='<b>Total songs based on genres</b>',
)
# Centre the title; as the cell's last expression this also displays the figure.
fig.update_layout(title_x=0.5)

In [None]:
# Number of rows per artist in the de-duplicated table, keeping the 50 largest.
songs_per_artist = top.groupby('artists', as_index=False).count()
most_prolific = songs_per_artist.sort_values(by='track_name', ascending=False).head(50)

# Bar per artist, labelled with the song count.
px.bar(
    most_prolific,
    x='artists',
    y='track_name',
    labels={'track_name': 'Total Songs'},
    width=1000,
    color_discrete_sequence=['lightblue'],
    text='track_name',
    title='<b> List of Songs Recorded by Each Singer',
)

In [None]:
# Summed popularity per artist, keeping the 30 highest totals.
# Only 'popularity' is aggregated: summing the whole frame would also try to
# "sum" the text columns (string concatenation on pandas 2.x, dropped on 1.x),
# which is slow and version-dependent. The chart reads only these two columns.
artist_totals = (
    top.groupby('artists', as_index=False)['popularity']
       .sum()
       .sort_values(by='popularity', ascending=False)
       .head(30)
)
px.bar(
    artist_totals,
    x='artists',
    y='popularity',
    color_discrete_sequence=['#1DB954'],
    template='plotly_dark',
    text='popularity',
    title='<b>Top 30 Popular Singers',
)

In [None]:
# The most popular tracks in the de-duplicated table.
# The title is derived from the same count used to select the rows, so the two
# cannot drift apart (the original plotted 25 tracks under a "Top 10" title).
top_track_count = 25
top_25 = top.nlargest(top_track_count, 'popularity')
fig = px.line(
    top_25,
    x='track_name',
    y='popularity',
    hover_data=['artists'],
    color_discrete_sequence=['pink'],
    markers=True,
    title=f'<b> Top {top_track_count} songs in Spotify',
)
fig.show()

In [None]:
# Summed popularity for every artist string.
artist_popularity = df.groupby('artists', as_index=False)['popularity'].sum()

# Artists whose summed popularity exceeds this cut-off are kept.
threshold = 430
is_popular = artist_popularity['popularity'] > threshold
popular_artists = artist_popularity[is_popular]

# Restrict df to those artists. Joining on the key column alone leaves df's own
# 'popularity' column untouched, so no suffixed duplicate has to be dropped or renamed.
pdf = df.merge(popular_artists[['artists']], on='artists', how='inner')
pdf.shape

In [None]:
# Hierarchy for the treemap: a constant root, then artist -> genre -> track.
treemap_levels = [px.Constant('Artists'), 'artists', 'track_genre', 'track_name']

# Tile area is proportional to popularity.
fig = px.treemap(
    pdf,
    path=treemap_levels,
    values='popularity',
    title='<b>TreeMap of Artists Playlist',
)
fig.update_traces(root_color='lightgreen')
fig.update_layout(title_x=0.5)
fig.show()

In [None]:
# Row counts for each value of the 'explicit' flag, largest group first.
explicit_counts = top.groupby('explicit', as_index=False).count()
explicit_counts = explicit_counts.sort_values(by='track_name', ascending=False)

# Donut chart of the two groups.
fig = px.pie(
    explicit_counts,
    names='explicit',
    values='track_name',
    labels={'track_name': 'Total songs'},
    hole=.6,
    color_discrete_sequence=['#237f52', '#e32636'],
    template='plotly_dark',
    title='<b>Songs having explicit content',
)
# Centre the title; as the cell's last expression this also displays the figure.
fig.update_layout(title_x=0.5)

In [None]:
# Popularity distribution for each value of the 'explicit' flag.
box_options = dict(
    x='explicit',
    y='popularity',
    color='explicit',
    template='plotly_dark',
    color_discrete_sequence=['cyan', 'magenta'],
    title='<b>popularity based on explicit content',
)
px.box(top, **box_options)

In [None]:
# Scatter of danceability against popularity on a subsample of tracks.
# A seeded random sample replaces head(5700): the first rows in file order are
# not a representative subset (and would be badly skewed if the file is grouped,
# e.g. by genre). random_state keeps the plot reproducible; min() guards against
# frames shorter than the requested sample size.
scatter_sample = top.sample(n=min(5700, len(top)), random_state=42)
px.scatter(
    scatter_sample,
    x='danceability',
    y='popularity',
    color='danceability',
    color_continuous_scale=px.colors.sequential.Plasma,
    template='plotly_dark',
    title='<b>Danceability Versus Popularity',
)

In [None]:
# Scatter of speechiness against popularity on a subsample of tracks.
# A seeded random sample replaces head(5700): the first rows in file order are
# not a representative subset (and would be badly skewed if the file is grouped,
# e.g. by genre). random_state keeps the plot reproducible; min() guards against
# frames shorter than the requested sample size.
scatter_sample = top.sample(n=min(5700, len(top)), random_state=42)
px.scatter(
    scatter_sample,
    x='speechiness',
    y='popularity',
    color='speechiness',
    color_continuous_scale=px.colors.sequential.Plasma,
    template='plotly_dark',
    title='<b> Speechiness Versus Popularity',
)

In [None]:
# Scatter of energy against danceability on a subsample of tracks.
# A seeded random sample replaces head(5700): the first rows in file order are
# not a representative subset (and would be badly skewed if the file is grouped,
# e.g. by genre). random_state keeps the plot reproducible; min() guards against
# frames shorter than the requested sample size.
scatter_sample = top.sample(n=min(5700, len(top)), random_state=42)
px.scatter(
    scatter_sample,
    x='energy',
    y='danceability',
    color='danceability',
    color_continuous_scale=px.colors.sequential.Plotly3,
    template='plotly_dark',
    title='<b>Energy Versus Danceability',
)

In [None]:
# Energy against loudness for every row of the de-duplicated table.
scatter_options = dict(
    x='energy',
    y='loudness',
    color_discrete_sequence=['lightgreen'],
    template='plotly_dark',
    title='<b>Energy versus Loudness correlation',
)
px.scatter(top, **scatter_options)

In [None]:
# Regression plot of popularity against acousticness on a subsample of tracks.
# - Title typo fixed ("accoustic" -> "acousticness").
# - Explicit figure/axes instead of the implicit pyplot state.
# - A seeded random sample replaces head(5700), which only sees the first rows
#   in file order and is not a representative subset; min() guards short frames.
reg_sample = top.sample(n=min(5700, len(top)), random_state=42)
fig, ax = plt.subplots(figsize=(10, 8))
sns.regplot(data=reg_sample, x="acousticness", y="popularity", color="c", ax=ax)
ax.set_title("popularity vs acousticness corr")
# Render the figure without echoing a matplotlib object repr as cell output.
plt.show()