# This notebook goes through some of the exploratory data analysis of the Kaggle Spotify dataset with Python.

## Aim: To analyze how Genre relates to the rest of the data for the Top 50 Spotify Songs – 2019

In [None]:
#Import all the relevant dependencies and libraries

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import random as rnd
from scipy import stats

# Import Dependencies
%matplotlib inline

from sklearn.linear_model import TheilSenRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from sklearn.metrics import mean_absolute_error
from xgboost import XGBRegressor
from sklearn.linear_model import LinearRegression
from sklearn import metrics

# Import Visualization 
import matplotlib.pyplot as plt
import missingno as msno
import seaborn as sns
import plotly.express as px
import mplcyberpunk
import plotly.graph_objects as go
from wordcloud import WordCloud
from collections import Counter

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os

# Recursively print the full path of every file found under the Kaggle input directory.
for folder, _, names_in_folder in os.walk('/kaggle/input'):
    for file_name in names_in_folder:
        full_path = os.path.join(folder, file_name)
        print(full_path)

# Suppress every warning for the rest of the session so that library notices
# do not clutter the cell outputs.
# NOTE(review): this blanket filter also hides deprecation warnings that
# announce upcoming breaking changes in pandas / seaborn.
import warnings
warnings.filterwarnings('ignore')

**The "Top 50 songs listened to on Spotify in 2019" dataset contains the artist name, the genre, and 11 other variables for each track.**
*     Let us load the data into a pandas DataFrame, reading the CSV file with the 'ISO-8859-1' encoding.

In [None]:
# Load the Top 50 CSV into a DataFrame.
# The file is read as ISO-8859-1 and its first column is used as the row index.

filename = '/kaggle/input/top50spotify2019/top50.csv'
spotify_df = pd.read_csv(
    filename,
    index_col=0,
    encoding='ISO-8859-1',
)
spotify_df.head()

In [None]:
# Print the column names, non-null counts and dtypes of the loaded frame.
spotify_df.info() # let's explore the data types

In [None]:
# Summary statistics (count, mean, std, min, quartiles, max) of the numeric columns.
spotify_df.describe()

In [None]:
# Display floats with 2 decimal places.
# The fully qualified option name is required: on pandas >= 1.4 the bare
# 'precision' pattern also matches 'styler.format.precision' and raises
# OptionError, while 'display.precision' works on old and new versions alike.
pd.set_option('display.precision', 2)
spotify_df.describe()

In [None]:
# Finding the missing values
# Draw the missingno nullity matrix of the frame to spot missing values visually.
msno.matrix(spotify_df, figsize = (15,5))

* All columns are complete and no empty cells can be found.

In [None]:
# Summary (count, unique, top, freq) of the object-dtype (text) columns only.
spotify_df.describe(include = ['O'])

* All track values are unique across the dataset
* Artist names have several duplicates, which means several artists have more than one track.

**Analyzing and visualizing data by pivoting features**

In [None]:
# Understanding the correlation between variables.
# Pearson correlation is only defined for numeric data, so the numeric columns
# are selected explicitly: on pandas >= 2.0 DataFrame.corr() no longer drops
# text columns silently and raises on them instead.
plt.figure(figsize=(15,8))
sns.heatmap(spotify_df.select_dtypes(include='number').corr(), annot=True, square = True, cmap="YlGnBu", linewidths=1, fmt= '.2f')
plt.title('Pearson Correlation Matrix\n',fontsize=20);

In [None]:
# Heatmap restricted to the noticeable correlations: only coefficients with
# |r| >= 0.2 are shown, and the trivial self-correlations (r == 1) are hidden.
plt.figure(figsize=(15,8))
# Restrict to numeric columns first; DataFrame.corr() raises on text columns
# with pandas >= 2.0.
correl = spotify_df.select_dtypes(include='number').corr()
sns.heatmap(correl[((correl >= 0.2) | (correl <= -0.2)) & (correl != 1)], annot=True, linewidths=1, fmt= '.2f', square = True, cmap="YlGnBu")
plt.title('Configured Correlation coefficients between variables.\n',fontsize=20);

Slightly positive correlation can be found between Loudness and Energy.

![](http://)

In [None]:
# Histogram of the raw Popularity values, grouped into 10 bins.
popularity_values = spotify_df['Popularity']
plt.hist(popularity_values, bins=10)
plt.title('Popularity Chart.\n', fontsize=20);
plt.xlabel("Popularity", fontsize=15)
plt.ylabel('Count', fontsize=15)

In [None]:
# Analysing Genre vs Popularity:
# one kernel-density curve of Popularity per Genre, overlaid on a single facet.
# `fill=True` replaces the deprecated `shade=True` keyword of sns.kdeplot
# (deprecated since seaborn 0.11 and scheduled for removal).
(
    sns.FacetGrid(spotify_df, hue="Genre", aspect=2)
    .map(sns.kdeplot, 'Popularity', fill=True)
    .set(xlim=(60, spotify_df['Popularity'].max()))
    .add_legend()
)

plt.xlabel("Popularity", fontsize=15)
plt.title('Genre distrubution vs Popularity\n', fontsize=20)
# A KDE curve shows an estimated density, not a count/frequency.
plt.ylabel('Density', fontsize=15)

In [None]:
# Genre and number of songs in that genre, shown as a donut chart.
# The slices must be sized by the number of tracks per genre; sizing them by
# the 'Popularity' column would sum popularity scores instead of counting songs.
genre_counts = (
    spotify_df['Genre']
    .value_counts()
    .rename_axis('Genre')
    .reset_index(name='Songs')
)
fig = px.pie(genre_counts, values='Songs', names='Genre', hole=0.5)
fig.update_layout(annotations=[dict(text='Genre', font_size=20, showarrow=False)])
fig.show()

**Dance Pop and Pop are the two most frequent genres in the Top 50**

In [None]:
# Artists and number of songs by each artist in the dataset.
sns.set(font_scale = 1.2)
# `height` is the current name of the figure-size argument; the old `size`
# alias was deprecated in seaborn 0.9 and is rejected by recent releases.
ab = sns.catplot(y = "Artist.Name", kind = "count", edgecolor = ".1", data = spotify_df, aspect=1, height=10)
plt.ylabel('Artist Name', fontsize=15)
plt.xlabel("Number of songs", fontsize=15)
plt.title("Artists and Number of songs", fontsize=20);


**Ed Sheeran has the most songs in the Top 50**

In [None]:
# Word cloud of artist names: the more tracks an artist has in the Top 50,
# the larger the name is drawn.

# Despite its name, `allSongs` holds one artist name per track.
allSongs = spotify_df['Artist.Name'].tolist()
wc_dict = Counter(allSongs)

cloud_builder = WordCloud(
    width=1000,
    height=500,
    background_color='white',
    collocations=False,
)
wordcloud = cloud_builder.generate_from_frequencies(wc_dict)

plt.figure(figsize=(20, 10))
plt.imshow(wordcloud)
plt.axis('off');

In [None]:
# Correlation between Beats.Per.Minute and Popularity:
# scatter plot with a fitted linear regression line and marginal distributions.

# sns.jointplot builds its own figure, so no plt.figure() call is made here
# (it would only leave an empty extra figure in the output). `height` is the
# current name of the former `size` argument.
bpm_grid = sns.jointplot(
    x=spotify_df["Beats.Per.Minute"].values,
    y=spotify_df['Popularity'].values,
    height=10,
    kind="reg",
    color='Purple',
)
# Label the joint (central) axes explicitly; plt.xlabel/plt.ylabel act on the
# "current" axes, which after jointplot is not guaranteed to be the joint one.
bpm_grid.set_axis_labels("Beats.Per.Minute", 'Popularity', fontsize=15);

In [None]:
# Pairwise relationship plots of the frame's variables, colored by Popularity.
sns.pairplot(spotify_df, hue="Popularity")

In [None]:
# Relationship between Loudness and Danceability:
# one violin of Danceability per distinct loudness value.
sns.set(font_scale=1.2)
plt.figure(figsize=(15, 10))
sns.violinplot(
    data=spotify_df,
    x='Loudness..dB..',
    y='Danceability',
    palette="Set1",
)
plt.title("\n Danceability Vs Loudness\n", fontsize=20);
plt.xlabel('Loudness in dB', fontsize=15)
plt.ylabel('Danceability index', fontsize=15);

In [None]:
# Horizontal bar chart of the 10 most popular tracks, styled with mplcyberpunk.
plt.style.use('cyberpunk')
plt.figure(figsize=(8, 10))

# Rank all tracks by Popularity (highest first) and keep the first ten rows.
ranked_by_popularity = spotify_df.sort_values(by=['Popularity'], ascending=False)
top10 = ranked_by_popularity.head(10)

ax = sns.barplot(x='Popularity', y='Track.Name', data=top10);
mplcyberpunk.add_glow_effects(ax)

plt.title("\n Top 10 songs based on Popularity \n", fontsize=20);
plt.xlabel('Popularity', fontsize=15)
plt.ylabel('Track Name', fontsize=15);

**Bad Guy by Billie Eilish (genre: Electropop) is the most popular track**

In [None]:
# Danceability of every track, one point per track, colored by Popularity.
plt.style.use('cyberpunk')
plt.figure(figsize=(25, 5))
sns.pointplot(
    x='Track.Name',
    y='Danceability',
    hue='Popularity',
    data=spotify_df,
    palette='inferno',
);

# Track names are long, so the tick labels are rotated to stay readable.
plt.xticks(rotation=90)
plt.title("\n Analysing Danceability of each track \n", fontsize=20);
plt.xlabel('Track Name', fontsize=15)
plt.ylabel('Danceability', fontsize=15)
plt.legend(loc='lower right')
mplcyberpunk.add_glow_effects()

In [None]:
# 3D scatter of Energy (x), Loudness (y) and Acousticness (z).
# Each marker is one track, colored by its Popularity; hovering shows the track name.
popularity_markers = dict(
    color=spotify_df['Popularity'],
    colorbar_title='Popularity',
    colorscale='aggrnyl',
)
energy_trace = go.Scatter3d(
    x=spotify_df['Energy'],
    y=spotify_df['Loudness..dB..'],
    z=spotify_df['Acousticness..'],
    text=spotify_df['Track.Name'],  # hover label
    mode='markers',
    marker=popularity_markers,
)
fig = go.Figure(data=[energy_trace])

# Figure size, title and axis captions
axis_captions = dict(
    xaxis={'title': 'Energy'},
    yaxis={'title': 'Loudness'},
    zaxis={'title': 'Acousticness'},
)
fig.update_layout(
    width=800,
    height=800,
    title='Correlation between Energy, Acousticness and Loudness of the song',
    scene=axis_captions,
)

fig.show()

In [None]:
# 3D scatter of Danceability (x), Liveness (y) and Valence (z).
# Each marker is one track, colored by its Popularity; hovering shows the track name.
popularity_markers = dict(
    color=spotify_df['Popularity'],
    colorbar_title='Popularity',
    colorscale='agsunset',
)
dance_trace = go.Scatter3d(
    x=spotify_df['Danceability'],
    y=spotify_df['Liveness'],
    z=spotify_df['Valence.'],
    text=spotify_df['Track.Name'],  # hover label
    mode='markers',
    marker=popularity_markers,
)
fig = go.Figure(data=[dance_trace])

# Figure size, title and axis captions
axis_captions = dict(
    xaxis={'title': 'Danceability'},
    yaxis={'title': 'Liveness'},
    zaxis={'title': 'Valence'},
)
fig.update_layout(
    width=800,
    height=800,
    title='Correlation between Danceability, Liveness and Valence of the song',
    scene=axis_captions,
)

fig.show()

In [None]:
# Does the length of a song vary with its Speechiness?
# Histogram over Speechiness with Length as the aggregated value, colored per artist.
fig = px.histogram(
    spotify_df,
    x="Speechiness.",
    y="Length.",
    color='Artist.Name',
    opacity=1,
    title='Length of the song Vs Speechiness',
)
fig.show()

# Predicting popularity of the songs

In [None]:
# Keep only the numeric columns by removing the three text columns.
text_columns = ['Track.Name', 'Artist.Name', 'Genre']
Sfy_df_pre = spotify_df.drop(columns=text_columns)
Sfy_df_pre.head()

In [None]:
# Predictive modelling with outliers kept in the data.
# Select the target by name rather than by position, so the split stays correct
# even if the column order of the CSV changes.
X = Sfy_df_pre.drop(columns='Popularity')  # all rows, every column except the target
y = Sfy_df_pre['Popularity']               # target: Popularity values

In [None]:
# Hold out 20% of the rows as a test set; the shuffle is seeded for reproducibility.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    shuffle=True,
    random_state=42,
)

**Theil-Sen Estimator: robust multivariate regression model.**

The algorithm calculates least square solutions on subsets with size n_subsamples of the samples in X. Any value of n_subsamples between the number of features and samples leads to an estimator with a compromise between robustness and efficiency. Since the number of least square solutions is “n_samples choose n_subsamples”, it can be extremely large and can therefore be limited with max_subpopulation. If this limit is reached, the subsets are chosen randomly. In a final step, the spatial median (or L1 median) is calculated of all least square solutions.

In [None]:
# Using the Theil-Sen Regression model.
# When the number of possible subsets exceeds max_subpopulation the estimator
# samples subsets at random, so the seed is fixed to make the fit, and the
# metrics printed below, reproducible across runs.
TSReg = TheilSenRegressor(random_state=42)  # Define model
TSReg.fit(X_train, y_train)                 # Fit model
y_pred = TSReg.predict(X_test)              # Get predictions
print('\nOrginal Value \t Predicted Value')
for (orgVal, predVal) in zip(y_test, y_pred):
    print(f"\t{orgVal}\t\t{predVal:.2f}")

# Checking the accuracy of the Theil-Sen Regression model
# (MSE is computed once and reused for the RMSE).
mse = metrics.mean_squared_error(y_test, y_pred)
print('\nMean Absolute Error:', metrics.mean_absolute_error(y_test, y_pred))
print('Mean Squared Error:', mse)
print('Root Mean Squared Error:', np.sqrt(mse))

In [None]:
# Theil-Sen Regression model error analysis:
# predicted values (x) against the held-out true values (y).
plt.style.use('dark_background')
error_line_style = dict(
    color='Red',
    linestyle='dashdot',
    marker='p',
    markerfacecolor='blue',
    markersize=15,
)
plt.figure(figsize=(10, 10))
plt.plot(y_pred, y_test, **error_line_style)
plt.title('Theil-Sen Regression model Error analysis\n', fontsize=20)
plt.xlabel('Predicted values', fontsize=15)
plt.ylabel('Test values', fontsize=15)

In [None]:
# Gradient-boosted trees (XGBoost) with default hyper-parameters.

XGBReg = XGBRegressor()
XGBReg.fit(X_train, y_train)
y_pred = XGBReg.predict(X_test)

# Side-by-side listing of true and predicted popularity for the test rows
print('Orginal Value \t Predicted Value')
for actual, predicted in zip(y_test, y_pred):
    print(f"\t{actual}\t\t{predicted:.2f}")

# Error metrics of the XGB Regression model on the test set
mae = metrics.mean_absolute_error(y_test, y_pred)
mse = metrics.mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
print('\nMean Absolute Error:', mae)
print('Mean Squared Error:', mse)
print('Root Mean Squared Error:', rmse)

In [None]:
# XGB Regression model error analysis:
# predicted values (x) against the held-out true values (y).

error_line_style = dict(
    color='Red',
    linestyle='dashdot',
    marker='p',
    markerfacecolor='blue',
    markersize=15,
)
plt.figure(figsize=(10, 10))
plt.plot(y_pred, y_test, **error_line_style)
plt.title('XGB Regression model Error analysis\n', fontsize=20)
plt.xlabel('Predicted values', fontsize=15)
plt.ylabel('Test values', fontsize=15)

**Ordinary least squares Linear Regression.**

LinearRegression fits a linear model with coefficients w = (w1, …, wp) to minimize the residual sum of squares between the observed targets in the dataset, and the targets predicted by the linear approximation.

In [None]:
# Ordinary least squares Linear Regression with default settings.

LinReg = LinearRegression()      # Define model
LinReg.fit(X_train, y_train)     # Fit model
y_pred = LinReg.predict(X_test)  # Get predictions

# Side-by-side listing of true and predicted popularity for the test rows
print('Orginal Value \t Predicted Value')
for actual, predicted in zip(y_test, y_pred):
    print(f"\t{actual}\t\t{predicted:.2f}")

# Error metrics of the Linear Regression model on the test set
mae = metrics.mean_absolute_error(y_test, y_pred)
mse = metrics.mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
print('\nMean Absolute Error:', mae)
print('Mean Squared Error:', mse)
print('Root Mean Squared Error:', rmse)

In [None]:
# Linear Regression model error analysis:
# predicted values (x) against the held-out true values (y).

error_line_style = dict(
    color='Red',
    linestyle='dashdot',
    marker='p',
    markerfacecolor='blue',
    markersize=15,
)
plt.figure(figsize=(10, 10))
plt.plot(y_pred, y_test, **error_line_style)
plt.title('Linear Regression model Error analysis\n', fontsize=20)
plt.xlabel('Predicted values', fontsize=15)
plt.ylabel('Test values', fontsize=15)

**Reference**

https://www.kaggle.com/arpita28/analysis-of-spotify-trends

https://www.kaggle.com/duttasd28/spotify-theil-sen-cyberpunk-plotly


***If you like it Please Upvote!***

**Thanks!**