EDA - year and artists

In [1]:
# import libraries

import os
import numpy as np
import pandas as pd

import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
import plotly.graph_objs as go

Load data

In [3]:
# Load the per-year CSV from the processed-data folder into `year_data`.
year_data = pd.read_csv('../data/processed/year_data.csv')

In [5]:
# Read the per-year CSV into `year_data` and display its first rows
# (DataFrame.head() shows 5 rows by default).
year_data = pd.read_csv('../data/processed/year_data.csv')
year_data.head()

Unnamed: 0,year,popularity,duration_ms,acousticness,danceability,energy,instrumentalness,key,liveness,loudness,mode,speechiness,tempo,valence
0,1922,0.057971,165202.184783,0.892674,0.541207,0.329083,0.328275,4.833333,0.246547,-14.073036,0.73913,0.258699,110.133703,0.572387
1,1923,1.575342,156975.914764,0.859965,0.637332,0.266977,0.157659,3.651446,0.225396,-16.351921,0.797565,0.552072,109.552648,0.671967
2,1924,0.612954,189111.306477,0.866266,0.593344,0.356725,0.339628,4.723539,0.203447,-13.290367,0.693523,0.375208,119.457894,0.554935
3,1925,1.414175,189370.960133,0.91217,0.617391,0.263749,0.275384,5.069767,0.255157,-14.977595,0.76412,0.305693,113.90105,0.635196
4,1926,1.938776,165946.540464,0.785739,0.622113,0.263075,0.323416,5.746657,0.211447,-15.929906,0.705841,0.356952,113.386792,0.539702


In [6]:
def plot_popularity_by_year(data):
    """
    Draw an interactive line chart of popularity against year.

    Parameters:
    data (pd.DataFrame): Frame providing the 'year' and 'popularity' columns.
    """
    chart_options = {
        'x': 'year',
        'y': 'popularity',
        'title': 'Popularity of Songs by Year',
    }
    figure = px.line(data, **chart_options)
    figure.show()

In [7]:
# Render the popularity-by-year line chart for the per-year data.
plot_popularity_by_year(year_data)

In [11]:
def plot_audio_features_by_year(data, audio_feature):
    """
    Plots one or more audio features of songs by year as a line chart.

    Parameters:
    data (pd.DataFrame): DataFrame containing a 'year' column and the
        requested feature column(s).
    audio_feature (str or list of str): Column name, or list of column
        names, to draw on the y axis. A list produces one line per column.
    """
    # Titled like the other plotting helpers in this notebook so the chart
    # is identifiable when exported or viewed on its own.
    fig = px.line(data, x='year', y=audio_feature,
                  title='Audio Features by Year')
    fig.show()

In [8]:
# Column-name groups used to select subsets of the dataset.
# Track-level metadata column names.
track_features = ['name', 'artists', 'year', 'release_date','popularity', 'duration_ms', 'explicit', 'id']
# Audio feature column names that are plotted together on one chart.
# NOTE(review): presumably these all share a 0-1 scale - confirm against the data.
audio_features_normalized = ['acousticness', 'danceability', 'energy', 'instrumentalness', 'liveness', 'speechiness', 'valence']
# Remaining audio feature column names, kept apart from the list above.
# NOTE(review): presumably because their scales differ - confirm.
audio_features_not_normalized = ['key', 'loudness', 'mode', 'tempo']

In [12]:
# Plot every column listed in audio_features_normalized against year on one chart.
plot_audio_features_by_year(year_data, audio_features_normalized)

In [13]:
def plot_loudness_changes_by_year(data):
    """
    Plots the loudness of songs by year as a scatter chart.

    Marker colour encodes loudness and marker size encodes popularity.

    Parameters:
    data (pd.DataFrame): DataFrame containing 'year', 'loudness' and
        'popularity' columns.
    """
    # Plot the frame that was passed in; the previous version silently read
    # the notebook-level `year_data` and ignored its argument.
    fig = px.scatter(data, x='year', y='loudness', color='loudness',
                     size='popularity', title='Loudness Changes Over Years',
                     labels={'loudness': 'Loudness'})
    fig.show()

In [15]:
# Render the loudness-by-year scatter chart for the per-year data.
plot_loudness_changes_by_year(year_data)

In [16]:
def plot_tempo_changes_by_year(data):
    """
    Plots the tempo of songs by year as a scatter chart.

    Marker colour encodes tempo and marker size encodes popularity.

    Parameters:
    data (pd.DataFrame): DataFrame containing 'year', 'tempo' and
        'popularity' columns.
    """
    # Plot the frame that was passed in; the previous version silently read
    # the notebook-level `year_data` and ignored its argument.
    fig = px.scatter(data, x='year', y='tempo', color='tempo',
                     size='popularity', title='Tempo Changes Over Years',
                     labels={'tempo': 'Tempo'})
    fig.show()

In [17]:
# Render the tempo-by-year scatter chart for the per-year data.
plot_tempo_changes_by_year(year_data)

In [18]:
def plot_key_changes_by_year(data):
    """
    Plots the key of songs by year as a scatter chart.

    Marker colour encodes key and marker size encodes popularity.

    Parameters:
    data (pd.DataFrame): DataFrame containing 'year', 'key' and
        'popularity' columns.
    """
    # Plot the frame that was passed in; the previous version silently read
    # the notebook-level `year_data` and ignored its argument.
    fig = px.scatter(data, x='year', y='key', color='key',
                     size='popularity', title='Key Changes Over Years',
                     labels={'key': 'Key'})
    fig.show()

In [19]:
# Render the key-by-year scatter chart for the per-year data.
plot_key_changes_by_year(year_data)

In [62]:
# Load tracks.csv into `data`; the cells below clean it and plot from it.
data = pd.read_csv('../data/tracks.csv')

In [63]:
import ast


def _format_artists(raw):
    """Turn a stringified list such as "['A', 'B']" into "A, B"."""
    try:
        names = ast.literal_eval(raw)
    except (ValueError, SyntaxError):
        names = None
    if isinstance(names, (list, tuple)):
        # Parsing the literal keeps apostrophes that belong to a name and
        # removes whichever quote style was used to delimit it.
        return ', '.join(str(name) for name in names)
    # Not a parseable list literal: fall back to stripping the brackets and
    # single quotes by hand.
    return raw[1:-1].replace("'", "")


# Convert 'release_date' to year
data['year'] = pd.DatetimeIndex(data['release_date']).year

# Drop unnecessary columns
data = data.drop(['id', 'id_artists', 'release_date', 'mode', 'key'], axis=1)

# Drop duplicates (same song name by the same artists)
data = data.drop_duplicates(subset=['name', 'artists'])

# Drop rows with missing values
data = data.dropna()

# Convert 'artists' from a stringified list to a comma-separated string
data['artists'] = data['artists'].apply(_format_artists)

# Keep a single row (the first encountered) per artist and year
data = data.drop_duplicates(subset=['artists', 'year'])
data.head()


Unnamed: 0,name,popularity,duration_ms,explicit,artists,danceability,energy,loudness,speechiness,acousticness,instrumentalness,liveness,valence,tempo,time_signature,year
0,Carve,6,126903,0,Uli,0.645,0.445,-13.338,0.451,0.674,0.744,0.151,0.127,104.851,3,1922
1,Capítulo 2.16 - Banquero Anarquista,0,98200,0,Fernando Pessoa,0.695,0.263,-22.136,0.957,0.797,0.0,0.148,0.655,102.009,1,1922
2,Vivo para Quererte - Remasterizado,0,181640,0,Ignacio Corsini,0.434,0.177,-21.18,0.0512,0.994,0.0218,0.212,0.457,130.418,5,1922
4,Lady of the Evening,0,163080,0,Dick Haymes,0.402,0.158,-16.9,0.039,0.989,0.13,0.311,0.196,103.22,4,1922
6,La Butte Rouge,0,134467,0,Francis Marty,0.51,0.355,-12.833,0.124,0.965,0.0,0.155,0.727,85.754,5,1922


In [64]:
# Song Analysis: Top Songs by Popularity
def plot_top_songs_by_popularity(data, n):
    """
    Show a horizontal bar chart of the n most popular songs.

    Parameters:
    data (pd.DataFrame): Frame providing the 'name' and 'popularity' columns.
    n (int): How many songs to include.
    """
    ranked = data.sort_values('popularity', ascending=False)
    chart_title = 'Top {} Songs by Popularity'.format(n)
    axis_labels = {'name': 'Song Name', 'popularity': 'Popularity'}
    figure = px.bar(
        ranked.head(n),
        x='popularity',
        y='name',
        color='popularity',
        orientation='h',
        title=chart_title,
        labels=axis_labels,
    )
    figure.show()

In [65]:
# Chart the 15 most popular songs in the cleaned track data.
plot_top_songs_by_popularity(data, 15)

In [66]:
# Song Analysis: Top Songs by Year
def plot_top_songs_by_year(data, year, n):
    """
    Show a horizontal bar chart of the n most popular songs of one year.

    Parameters:
    data (pd.DataFrame): Frame providing 'name', 'year' and 'popularity'.
    year (int): Year whose songs are ranked.
    n (int): How many songs to include.
    """
    in_year = data[data['year'] == year]
    ranked = in_year.sort_values('popularity', ascending=False)
    chart_title = 'Top {} Songs of {}'.format(n, year)
    axis_labels = {'name': 'Song Name', 'popularity': 'Popularity'}
    figure = px.bar(
        ranked.head(n),
        x='popularity',
        y='name',
        color='popularity',
        orientation='h',
        title=chart_title,
        labels=axis_labels,
    )
    figure.show()

In [67]:
# Chart the 15 most popular songs whose year is 2017.
plot_top_songs_by_year(data, 2017, 15)

In [68]:
# Song Analysis: Top Artists by Popularity
def plot_top_artists_by_popularity(data, n):
    """
    Plots the top n artists by popularity.

    Each artist is ranked by their most popular row in `data` and appears
    at most once in the chart.

    Parameters:
    data (pd.DataFrame): DataFrame containing the song data, with
        'artists' and 'popularity' columns.
    n (int): Number of artists to plot.
    """
    ranked = data.sort_values('popularity', ascending=False)
    # Keep only each artist's best row. Without this an artist with several
    # rows would take more than one of the n slots, and their bars would be
    # stacked on a single category.
    top_artists = ranked.drop_duplicates(subset='artists').head(n)
    fig = px.bar(top_artists, x='popularity', y='artists', color='popularity',
                 orientation='h',
                 title='Top {} Artists by Popularity'.format(n),
                 labels={'artists': 'Artist', 'popularity': 'Popularity'})
    fig.show()

In [69]:
# Chart the 15 top-ranked artists in the cleaned track data.
plot_top_artists_by_popularity(data, 15)

In [70]:
# Song Analysis: Top Artists by Year
def plot_top_artists_by_year(data, year, n):
    """
    Plots the top n artists by year.

    Each artist is ranked by their most popular row for the given year and
    appears at most once in the chart.

    Parameters:
    data (pd.DataFrame): DataFrame containing the song data, with
        'artists', 'year' and 'popularity' columns.
    year (int): Year to plot.
    n (int): Number of artists to plot.
    """
    ranked = data[data['year'] == year].sort_values('popularity', ascending=False)
    # Keep only each artist's best row for the year, so an artist with
    # several rows neither takes more than one of the n slots nor gets
    # stacked bars.
    top_artists = ranked.drop_duplicates(subset='artists').head(n)
    fig = px.bar(top_artists, x='popularity', y='artists', color='popularity',
                 orientation='h',
                 title='Top {} Artists of {}'.format(n, year),
                 labels={'artists': 'Artist', 'popularity': 'Popularity'})
    fig.show()

In [71]:
# Chart the 15 top-ranked artists whose rows have year 2017.
plot_top_artists_by_year(data, 2017, 15)

In [86]:
# For every year, find the artist with the highest mean popularity:
# average popularity per (artist, year), order by year then popularity
# (both descending), and keep the first row of each year.
artist_popularity_year = (
    data.groupby(['artists', 'year'])['popularity']
    .mean()
    .reset_index()
    .sort_values(by=['year', 'popularity'], ascending=False)
    .groupby('year')
    .head(1)
)

# Preview the 15 most recent years.
artist_popularity_year.head(15)

Unnamed: 0,artists,year,popularity
88594,"Justin Bieber, Daniel Caesar, Giveon",2021,100.0
90180,Kali Uchis,2020,97.0
44925,Doja Cat,2019,94.0
22275,"Billie Eilish, Khalid",2018,89.0
75683,Imagine Dragons,2017,88.0
176720,Travis Scott,2016,88.0
171164,The Neighbourhood,2015,87.0
16471,Avicii,2014,86.0
13694,Arctic Monkeys,2013,87.0
26357,Bruno Mars,2012,85.0
