An introductory EDA and Linear regression performed over the dataset.

In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load in 

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib
import matplotlib.pyplot as plt
import seaborn as sns
import missingno
from collections import Counter
from scipy import stats

from sklearn.linear_model import LinearRegression  
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
%matplotlib inline

# Input data files are available in the "../input/" directory.
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory tree and print the full path of each file found.
for root, _, names in os.walk('/kaggle/input'):
    for name in names:
        print(os.path.join(root, name))

# Any results you write to the current directory are saved as output.

Read the data 

In [None]:
# Location of the dataset CSV inside the Kaggle input directory.
filename = '/kaggle/input/top50spotify2019/top50.csv'

# Read with an explicit ISO-8859-1 encoding instead of pandas' default.
df = pd.read_csv(filename, encoding='ISO-8859-1')

# Last expression of the cell, so the loaded frame is rendered as a table.
df

In [None]:
# Print a summary of the frame: column names, dtypes and non-null counts.
df.info()

Convert the datatype of Track.Name , Artist.Name and Genre from object to category

In [None]:
# Re-type the three text columns as pandas categoricals, in place on df.
for col in ('Track.Name', 'Artist.Name', 'Genre'):
    df[col] = df[col].astype('category')

In [None]:
# Descriptive statistics for df; as the cell's last expression the table is rendered.
df.describe()

Used missingno to visualize missing data

In [None]:
# Bar chart of data completeness per column, drawn with missingno.
missingno.bar(df)  # original author's observation: no missing values

Categorical data representation

In [None]:
def categorical_eda(df):
    """Print and plot a quick overview of the categorical columns of ``df``.

    For the columns whose dtype is ``category`` this prints the number of
    unique values per column, then draws one seaborn count plot per column
    (x tick labels rotated 90 degrees) and shows each figure.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to inspect; only its ``category``-dtype columns are used.
    """
    categorical = df.select_dtypes(include=['category'])

    print("To check: Unique count of non-numeric data")
    print(categorical.nunique())

    # One count plot per categorical column.
    for col in categorical.columns:
        grid = sns.catplot(x=col, kind="count", data=df, height=10)
        grid.set_xticklabels(rotation=90)
        plt.show()


categorical_eda(df)

Finding the number of songs as per each Artist 

In [None]:
# Number of tracks per artist (value_counts sorts by count, descending).
Artist_count = df["Artist.Name"].value_counts()
print(Artist_count, "\n")

# Tracks listed under each artist.
# Both columns are categorical at this point, and value_counts() on a
# categorical column reports every category, including those with a zero
# count, so the previous groupby(...)['Track.Name'].value_counts() listed
# every track under every artist. Grouping on both keys with observed=True
# keeps only the (artist, track) pairs that actually occur in the data.
Song_count = df.groupby(['Artist.Name', 'Track.Name'], observed=True).size()
print(Song_count)


In [None]:
# Count of tracks per artist as a bar chart.
fig, ax = plt.subplots(figsize=(20, 5))

# Name the column via x= and the frame via data= instead of passing the Series
# positionally: the meaning of countplot's first positional argument differs
# between seaborn versions, so the keyword form is the portable one.
sns.countplot(x='Artist.Name', data=df, palette="Set3", ax=ax)

# Rotate the artist names so that they do not overlap.
ax.tick_params(axis='x', labelrotation=45)
plt.show()
    

In [None]:
## Inspect the shape of each variable's distribution and check for skewness ##
axes = df.hist(figsize=(10, 10))

# Shrink every subplot title to font size 10.
for ax in axes.ravel():
    ax.title.set_size(10)

plt.tight_layout()
plt.show()

Normalize the skewness of the data using box cox transformation 

In [None]:
## Reduce the skewness of an attribute with a Box-Cox transform: Danceability ##

# Distribution before the transform.
fig = sns.distplot(df['Danceability'], bins=10, kde=True, kde_kws={"color": "k", "lw": 2, "label": "KDE"})
plt.tight_layout()
plt.show()

# stats.boxcox is documented for 1-D input, so select the column as a Series
# (df['col']) rather than as a one-column frame (df[['col']]), whose .values
# is an (n, 1) array.
transform_val1 = df['Danceability'].values

# boxcox returns (transformed_data, fitted_lambda); only the data is needed.
transform_df1 = stats.boxcox(transform_val1)[0]
print("Transformed Danceability")
fig = sns.distplot(transform_df1, bins=10, kde=True, kde_kws={"color": "k", "lw": 2, "label": "KDE"})
plt.show()

In [None]:
# Distribution of Popularity before the transform.
sns.distplot(df['Popularity'], bins=10, kde=True, kde_kws={"color": "k", "lw": 2, "label": "KDE"})
plt.show()

# 1-D array of the Popularity column (stats.boxcox is documented for 1-D input).
transform_val2 = df['Popularity'].values

# Transform the Popularity values themselves. The earlier version passed
# transform_val1 (the Danceability array from the previous cell), so the plot
# labelled "Transformed Popularity" actually showed transformed Danceability.
# boxcox returns (transformed_data, fitted_lambda); only the data is needed.
transform_df2 = stats.boxcox(transform_val2)[0]
print("Transformed Popularity")
fig = sns.distplot(transform_df2, bins=10, kde=True, kde_kws={"color": "k", "lw": 2, "label": "KDE"})
plt.show()


In [None]:
# Distribution of Liveness before the transform.
sns.distplot(df['Liveness'], bins=10, kde=True, kde_kws={"color": "k", "lw": 2, "label": "KDE"})
plt.tight_layout()
plt.show()

# stats.boxcox is documented for 1-D input, so select the column as a Series
# (df['col']) rather than as a one-column frame (df[['col']]), whose .values
# is an (n, 1) array.
transform_val3 = df['Liveness'].values

# boxcox returns (transformed_data, fitted_lambda); only the data is needed.
transform_df3 = stats.boxcox(transform_val3)[0]
print("Transformed Liveness")
sns.distplot(transform_df3, bins=10, kde=True, kde_kws={"color": "k", "lw": 2, "label": "KDE"})
plt.tight_layout()
plt.show()

Finding the correlation between the attributes in the dataset

In [None]:
# Keep only the numeric columns for the correlation matrix.
# Track.Name, Artist.Name and Genre were converted from object to category
# earlier in the notebook, so exclude="object" would no longer filter them out
# and they would be handed to corr(); selecting by include="number" is robust
# to that conversion.
numeric_df = df.select_dtypes(include="number")

# Pairwise correlation between the numeric attributes.
corr_numeric = numeric_df.corr()
sns.heatmap(corr_numeric, cbar=True, cmap="RdBu_r")
plt.title("Correlation Matrix", fontsize=16)
plt.show()

Analyzing how different attributes affect the popularity of a song

In [None]:
# Relationship between Popularity (x axis) and Loudness (y axis):
# scatter plot with a fitted regression line.

fig, ax = plt.subplots(figsize=(7, 7))
sns.regplot(x='Popularity', y='Loudness..dB..', data=df, color='teal', ax=ax)

In [None]:
# Relationship between Popularity (x axis) and Speechiness (y axis):
# scatter plot with a fitted regression line.
# Original author's note: linear positive relation.

fig, ax = plt.subplots(figsize=(7, 7))
sns.regplot(x='Popularity', y='Speechiness.', data=df, color='orange', ax=ax)

In [None]:
# Relationship between Popularity (x axis) and Energy (y axis):
# scatter plot with a fitted regression line.

fig, ax = plt.subplots(figsize=(7, 7))
sns.regplot(x='Popularity', y='Energy', data=df, color='purple', ax=ax)

In [None]:
# Relationship between Popularity (x axis) and Danceability (y axis):
# scatter plot with a fitted regression line.

fig, ax = plt.subplots(figsize=(7, 7))
sns.regplot(x='Popularity', y='Danceability', data=df, color='red', ax=ax)

In [None]:
# Favourite / most popular artists: bar plot of Popularity per artist,
# with the bars ordered by each artist's mean Popularity (highest first).

fig, ax = plt.subplots(figsize=(9, 10))

# Mean Popularity per artist, sorted descending; used only to order the bars.
mean_popularity = df.groupby("Artist.Name")['Popularity'].mean()
result = mean_popularity.reset_index().sort_values('Popularity', ascending=False)

sns.barplot(x='Popularity', y='Artist.Name', data=df, order=result['Artist.Name'], ax=ax)

Performing Linear Regression over the dataset and calculating the performance/accuracy of the model using Root mean Square Error 

In [None]:
# Linear regression
# dependent variable   : Popularity
# independent variables: Loudness_dB, Danceability, Liveness, Energy, Acousticness, Speechiness
# (mean_squared_error, train_test_split and LinearRegression are imported in
# the notebook's first cell.)

# Fixed seed so the train/test split, and therefore the reported RMSE, is the
# same on every run of the notebook.
RANDOM_STATE = 42

feature_columns = ['Loudness..dB..', 'Danceability', 'Liveness', 'Energy', 'Acousticness..', 'Speechiness.']
x = df.loc[:, feature_columns].values

# Selecting with a one-element list keeps y two-dimensional (one column), so
# the actual and predicted values print in the same column layout.
y = df.loc[:, ['Popularity']].values

# Hold out 40% of the rows for testing.
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.40, random_state=RANDOM_STATE)

model = LinearRegression()
model.fit(x_train, y_train)

y_pred = model.predict(x_test)

print("Actual\n" , y_test)
print("\nPredicted\n", y_pred)

# Root mean squared error on the held-out rows, as a measure of model error.
rmse_test = mean_squared_error(y_test, y_pred) ** (0.5)
print('\nRMSE on test dataset : ', rmse_test)


In [None]:
## Scatter plot of the predicted values (x axis) against the true test values (y axis)

fig, ax = plt.subplots(figsize=(7, 7))
ax.scatter(y_pred, y_test, color='black')
ax.set_title('Error analysis')
ax.set_ylabel('Test values ')
ax.set_xlabel('Predicted values')