# Importing necessary libraries

In [None]:
import pandas as pd
import seaborn as sn
import numpy as np
import matplotlib.pyplot as plt

# Importing CSV

In [None]:
# Load the dataset into a DataFrame. The encoding is passed explicitly;
# presumably the file is not valid UTF-8 -- TODO confirm against the raw CSV.
data = pd.read_csv('/kaggle/input/top-spotify-songs-from-20102019-by-year/top10s.csv', encoding='ISO-8859-1')
# Last expression of the cell: the notebook renders the first rows as a preview.
data.head()

# Getting some info about the dataset

In [None]:
# Print a summary of the frame (columns, non-null counts, dtypes).
data.info()

In [None]:
# Number of non-missing values in each column.
data.count()

To avoid possible problems later on, we rename the column **top genre** to **top_genre** and remove the **Unnamed: 0** column, which corresponds to the index. Finally, since the dataset has no NaN values, it is ready for analysis.

In [None]:
# Give the genre column an identifier-friendly name and discard the
# 'Unnamed: 0' column, then show the resulting column labels.
data = (
    data
    .rename(columns={'top genre': 'top_genre'})
    .drop(columns='Unnamed: 0')
)
print(data.columns)

# Dataset analysis

## Top 10 Artists from 2010 to 2019

In [None]:
# Count the songs of every artist and keep the ten most frequent ones.
# The result columns are named explicitly -- 'index' holds the artist name and
# 'artist' holds the number of songs -- because the default labels produced by
# value_counts().reset_index() changed in pandas 2.0, and the cells that
# consume `artists` look the columns up by exactly these names.
artists = (
    data['artist']
    .value_counts()
    .head(10)
    .rename_axis('index')
    .reset_index(name='artist')
)
print(artists)

In [None]:
# Bar chart of the `artists` table: the 'index' column (artist names) on the
# x axis and the 'artist' column (song counts) on the y axis.
plt.figure(figsize=(15,10))
sn.barplot(x='index',y='artist', data=artists)
plt.title("Number of Musics on top score by the 10 Top Artists")

In [None]:
# For each artist listed in `artists`, draw one line showing how many of their
# songs appear in the dataset in each year from 2010 to 2019.
plt.figure(figsize=(20,10))
years = list(range(2010, 2020))
for name in artists['index']:
    # The artist mask does not depend on the year, so build it once per artist.
    is_artist = data['artist'] == name
    # Both conditions are combined into a single mask with `&`. Chaining
    # data[mask_a][mask_b] would apply a full-length mask to an already
    # filtered frame, which makes pandas reindex the mask and emit a warning.
    songs_per_year = [
        int((is_artist & (data['year'] == year)).sum()) for year in years
    ]
    # Label each line as it is drawn so the legend is built from the lines
    # themselves rather than from a separately ordered list of names.
    sn.lineplot(x=years, y=songs_per_year, label=name)
plt.legend()
plt.title("Evolution of each Top 10 Artists throught the target Years")

In [None]:
# Pie chart of the ten most frequent artists; slices are annotated with
# whole-number percentages (autopct='%1.0f%%').
data['artist'].value_counts().head(10).plot.pie(figsize=(8,8), autopct='%1.0f%%')

Looking at the previous graphs, we see that Justin Bieber, even though he is among the top 10 artists with 16 songs, only got that placement thanks to a huge spike between 2014 and 2016; the rest of the time he had between 0 and 2 popular songs. Similarly, Lady Gaga is also a good target for analysis: after starting strong in 2010-2011, with 3 to 5 hugely successful songs, she lost popularity until 2018-2019, when she had a large increase in popular songs. This makes sense, since at the end of 2018 she became known for a song in the movie "A Star is Born":

In [None]:
# Rows for Lady Gaga's songs in 2018. The two conditions are combined into one
# boolean mask with `&` instead of chaining two [] lookups, so the mask always
# lines up with the frame it filters.
data[(data['artist'] == 'Lady Gaga') & (data['year'] == 2018)]

To look at every artist's performance through the years, please run the following code and download the image it generates.

In [None]:
# Draw one line per artist (songs per year, 2010-2019) on a single tall figure
# and save it to "artists.png".
# Keep the figure handle returned by plt.figure so saving does not depend on
# the loop below having run at least once.
fig = plt.figure(figsize=(20,40))
years = list(range(2010, 2020))
for name in data['artist'].unique():
    # The artist mask does not depend on the year, so build it once per artist.
    is_artist = data['artist'] == name
    # Single combined mask with `&`; chaining data[mask_a][mask_b] would apply
    # a full-length mask to an already filtered frame and trigger a reindex
    # warning from pandas.
    songs_per_year = [
        int((is_artist & (data['year'] == year)).sum()) for year in years
    ]
    sn.lineplot(x=years, y=songs_per_year, label=name)
fig.savefig("artists.png")

## Top 10 Genres from 2010 to 2019

In [None]:
# Count the songs of every genre and keep the ten most frequent ones.
# The result columns are named explicitly -- 'index' holds the genre name and
# 'top_genre' holds the number of songs -- because the default labels produced
# by value_counts().reset_index() changed in pandas 2.0, and the cells that
# consume `genres` look the columns up by exactly these names.
genres = (
    data['top_genre']
    .value_counts()
    .head(10)
    .rename_axis('index')
    .reset_index(name='top_genre')
)

In [None]:
# Bar chart of the `genres` table: the 'index' column (genre names) on the
# x axis and the 'top_genre' column (song counts) on the y axis.
plt.figure(figsize=(23,10))
sn.barplot(x='index',y='top_genre', data=genres)

In [None]:
# Pie chart of the ten most frequent genres; slices are annotated with
# whole-number percentages (autopct='%1.0f%%').
data['top_genre'].value_counts().head(10).plot.pie(figsize=(8,8), autopct='%1.0f%%')

In [None]:
# For each genre listed in `genres`, draw one line showing how many songs of
# that genre appear in the dataset in each year from 2010 to 2019.
plt.figure(figsize=(20,10))
years = list(range(2010, 2020))
for genre in genres['index']:
    # The genre mask does not depend on the year, so build it once per genre.
    is_genre = data['top_genre'] == genre
    # Single combined mask with `&`; chaining data[mask_a][mask_b] would apply
    # a full-length mask to an already filtered frame and trigger a reindex
    # warning from pandas.
    songs_per_year = [
        int((is_genre & (data['year'] == year)).sum()) for year in years
    ]
    # Label each line as it is drawn so the legend is built from the lines
    # themselves rather than from a separately ordered list of names.
    sn.lineplot(x=years, y=songs_per_year, label=genre)
plt.legend()

Here we can clearly see how popular pop music is with this generation. If we looked back at the 80s and 90s, would we see a big difference?
**Type of songs per top artist**

## Relation of the music attributes with each year's mode


In [None]:
def mean_of(col):
    """Return the yearly means of column `col` for 2010 through 2019.

    Reads the module-level `data` frame. The result is a list with one entry
    per year, in chronological order; each entry is ``np.mean`` of the `col`
    values of the rows whose 'year' equals that year.
    """
    return [
        np.mean(data.loc[data['year'] == year, col])
        for year in range(2010, 2020)
    ]

In [None]:
# Plot the per-year mean of every song attribute on one chart, one line per
# attribute, drawn in the same order as the legend entries.
plt.figure(figsize=(15,5))
years = list(range(2010,2020))
attributes = ['bpm','nrgy','dnce','dB','live','val','dur','acous','spch']
for attribute in attributes:
    res = mean_of(attribute)
    sn.lineplot(x=years, y=res)
plt.legend(attributes)
plt.title('(mean) Music Attributes per Year')

Looking at the mean value of each attribute of the top songs, we conclude that few variables changed over the years, except 'dur' (the duration of the song), which drops continuously over the years, especially between 2013 and 2019.

In [None]:
# One line per year (2010-2019) of the 'val' column, plotted against the
# rows' index on a shared figure.
plt.figure(figsize=(15,5))
for year in range(2010, 2020):
    data.loc[data['year'] == year, 'val'].plot.line()

In [None]:
# One line per year (2010-2019) of the 'nrgy' column, plotted against the
# rows' index on a shared figure.
plt.figure(figsize=(15,5))
for year in range(2010, 2020):
    data.loc[data['year'] == year, 'nrgy'].plot.line()

In [None]:
# For every year (2010-2019) draw the 'live' column and then the 'pop' column
# of that year's rows on a shared figure.
plt.figure(figsize=(15,5))
for year in range(2010, 2020):
    songs_of_year = data[data['year'] == year]
    songs_of_year['live'].plot.line()
    songs_of_year['pop'].plot.line()

In [None]:
# Print the rows with the highest and the lowest 'pop' value.
# Series.max()/min() are used instead of the Python builtins: they are
# vectorized and skip missing values, whereas builtin max()/min() iterate the
# column element by element and give order-dependent results if a NaN occurs.
print(data[data['pop'] == data['pop'].max()])
print(data[data['pop'] == data['pop'].min()])

Popularity and liveness don't appear to be correlated, but the former appears to carry more useful information about the dataset.
Looking into it, we can see some inconsistency in its meaning in the data. Even though, for each year, the top songs are ordered by their popularity (how much of a hit they were), a song can have a popularity of zero and still be among the top songs. If so, can we predict whether a song will be in the top ranking using popularity?

## Extra curious information

### Top 15 songs by Liveness

In [None]:
# The 15 rows with the highest 'live' values, largest first.
data.sort_values(by=['live'], ascending=False).head(15)

### Top 15 songs by Popularity

In [None]:
# The 15 rows with the highest 'pop' values, largest first.
data.sort_values(by=['pop'], ascending=False).head(15)

2019 was the year with the most popular songs, almost all of them being pop.

### Top 15 longest songs

In [None]:
# The 15 rows with the highest 'dur' values, largest first.
data.sort_values(by=['dur'], ascending=False).head(15)

### Top 15 songs by Acousticness

In [None]:
# The 15 rows with the highest 'acous' values, largest first.
data.sort_values(by=['acous'], ascending=False).head(15)