# Spotify Data Visualisation

In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

#Library for mathematical computation
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import scipy.stats as stats #Statistics

#Library for Data Visualization
import seaborn as sns
import matplotlib.pyplot as plt
from matplotlib.pyplot import figure

%matplotlib inline

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Print the full path of every file found anywhere under the Kaggle input directory.
input_file_paths = (
    os.path.join(root, name)
    for root, _, names in os.walk('/kaggle/input')
    for name in names
)
for path in input_file_paths:
    print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# Read the Spotify chart CSV into `df` and show it as the cell output.
spotify_csv = "/kaggle/input/spotify-top-200-charts-20202021/spotify_dataset.csv"
df = pd.read_csv(spotify_csv)
df

# Let's first analyze genres and find the most popular ones

In [None]:

# Turn the stringified genre lists (e.g. "['dance pop', 'pop']") into one row per genre.
#
# Fixes over the previous version of this cell:
#   * no chained-indexing assignment (df["Genre"][mask] = ...), which pandas may apply
#     to a temporary copy instead of `df`;
#   * missing / blank / "[]" cells become NaN rather than the int 0 or the text "nan";
#   * the list punctuation is removed with an explicit regex, so the result does not
#     depend on the pandas version's default for `regex`;
#   * whitespace around each genre is dropped, so "pop" and " pop" count as one genre.
genre_text = (
    df["Genre"]
    .fillna("")                                # keep real NaN from becoming the string "nan"
    .astype(str)
    .str.replace(r"[\[\]']", "", regex=True)   # strip the characters [ ] '
    .str.strip()
)
# Cells with nothing left (blank or "[]") carry no genre: mark them as missing.
genre_text = genre_text.mask(genre_text == "")

# Split on commas together with any surrounding whitespace (a multi-character pattern
# is treated as a regular expression by Series.str.split), then emit one row per genre.
# Rows without a genre stay as a single row whose Genre is NaN.
df = df.assign(Genre=genre_text.str.split(r"\s*,\s*")).explode("Genre")

df

In [None]:
# Pie chart of the 25 genres that occur most often in the exploded frame.
# seaborn and pyplot are already imported in the notebook's first cell,
# so they are not imported again here.
fig, ax = plt.subplots(figsize=(10, 10))
top_genres = df["Genre"].value_counts().head(25)  # value_counts() sorts by frequency, descending
top_genres.plot(kind="pie", ax=ax)
ax.set_ylabel("")  # blank the y-axis label so only the wedge labels remain
ax.set_title("Top 25 most popular genres")
plt.show()

Now let's analyze the column 'Number of Times Charted' and find out which artists charted most often.
To do that we need to go back to our initial dataset, because in the previous part we divided rows based on genres.
So we load the initial dataset again.

In [None]:
# Re-read the CSV so `df` is back to the raw file contents
# (the genre step replaced it with an exploded, one-row-per-genre frame).
df = pd.read_csv(
    "/kaggle/input/spotify-top-200-charts-20202021/spotify_dataset.csv"
)

In [None]:
# Total 'Number of Times Charted' per artist, largest total first.
#
# Only that one column is aggregated. A bare groupby('Artist').sum() would also try to
# "sum" every other column, including text ones: recent pandas versions concatenate the
# strings, older ones silently drop those columns. The result has exactly two columns,
# 'Artist' and 'Number of Times Charted', with a fresh 0..n-1 index in sorted order.
df_numbercharted = (
    df.groupby('Artist')['Number of Times Charted']
    .sum()
    .sort_values(ascending=False)
    .reset_index()
)
df_numbercharted

In [None]:

# Bar chart of the first ten rows of df_numbercharted,
# i.e. the artists with the largest summed 'Number of Times Charted'.
top10_artists = df_numbercharted.head(10)
fig, ax = plt.subplots(figsize=(20, 8))
ax.set_title("Top 10 Artists with Highest Numbers of Times Charted")
sns.barplot(data=top10_artists, x='Artist', y='Number of Times Charted', ax=ax)

In [None]:
# Clean the frame and convert the measurement columns to numeric dtypes.

# Represent every gap the same way: NaN cells and single-space cells both become ''.
df = df.fillna('').replace(' ', '')

# Remove the commas from 'Streams' (presumably thousands separators) so it can be parsed.
df['Streams'] = df['Streams'].str.replace(',', '')

# Columns to convert; each one is passed through pd.to_numeric as a whole.
numeric_columns = [
    'Highest Charting Position', 'Number of Times Charted', 'Streams',
    'Popularity', 'Danceability', 'Energy', 'Loudness', 'Speechiness',
    'Acousticness', 'Liveness', 'Tempo', 'Duration (ms)', 'Valence',
]
df[numeric_columns] = df[numeric_columns].apply(pd.to_numeric)

In [None]:
# Add a 'Release Year' column holding the year part of 'Release Date',
# so the year can take part in the correlation analysis.
release_dates = pd.DatetimeIndex(df['Release Date'])
df['Release Year'] = release_dates.year

In [None]:
# Correlation heatmap of the numeric columns.
#
# The numeric columns are selected explicitly before calling corr(): on pandas >= 2.0,
# DataFrame.corr() no longer skips text columns and raises when it meets one.
# select_dtypes(include="number") is available on older pandas versions as well.
fig, ax = plt.subplots(figsize=(14, 10))
numeric_df = df.select_dtypes(include="number")
sns.heatmap(numeric_df.corr(), annot=True, fmt=".1f", ax=ax)
plt.show()

Among the interesting observations is that acousticness has a strong negative correlation with both energy and loudness, 
which is logical, because acoustic music is often quiet and requires careful listening

Unfortunately, no strong correlation with release year is noticeable, but we can still have a look at 
how danceability has changed over the years

In [None]:
# Line plot of 'Danceability' against 'Release Year' for the cleaned frame.
fig, ax = plt.subplots(figsize=(15, 5))
ax.set_title('Danceability over the course of the century')
sns.lineplot(data=df, x="Release Year", y="Danceability", ax=ax)

Another thing I would like to check is how 'Number of Times Charted' is distributed across release years. Maybe some years are well covered in this data and some are not. We do have information for other years; however, it is most probably incomplete and not representative.

In [None]:
# Total 'Number of Times Charted' per release year, largest total first.
#
# Only that one column is aggregated. A bare groupby('Release Year').sum() would also
# try to "sum" the text columns (song names, artists, ...), which recent pandas versions
# do by concatenating strings. The result has two columns, 'Release Year' and
# 'Number of Times Charted', with a fresh 0..n-1 index in sorted order.
dfyear = (
    df.groupby('Release Year')['Number of Times Charted']
    .sum()
    .sort_values(ascending=False)
    .reset_index()
)

In [None]:

# Bar chart of the first ten rows of dfyear,
# i.e. the release years with the largest summed 'Number of Times Charted'.
top10_years = dfyear.head(10)
fig, ax = plt.subplots(figsize=(20, 8))
ax.set_title("Years with Highest Numbers of Times Charted")
sns.barplot(data=top10_years, x='Release Year', y='Number of Times Charted', ax=ax)

We see that most data comes from 2020, and since 2021 is ongoing we have a bit less data for this year

Finally, I decided to have a look at popularity. 
I look at popularity by artist to be able to compare the results with the previous graph based on times charted

In [None]:
# Bar chart of the 15 artists with the largest summed 'Popularity'.

# Set the background colour BEFORE creating the figure; rcParams only affect
# figures created after the change.
plt.rcParams['figure.facecolor'] = 'white'
fig, axis = plt.subplots(figsize=(14, 7))

artistsbypop = df.groupby("Artist")['Popularity'].sum().sort_values(ascending=False)[:15]

# x/y must be passed by keyword: seaborn >= 0.12 rejects positional data arguments
# to barplot (earlier releases only warned about them).
sns.barplot(x=artistsbypop.index, y=artistsbypop.values, palette='cubehelix', ax=axis)
axis.set_title('Top 15 Artists by Popularity')
axis.set_ylabel('Popularity')
axis.set_xlabel('Artists')
plt.xticks(rotation = 65)  # tilt the artist names so they do not overlap
plt.show()

We see that this chart has different artists; however, some of them appear in both charts, like Justin Bieber, The Weeknd etc. 
Hope this helps someone! Thank you!