# Projeto 2 - Ciência dos Dados
## Classificando músicas no Spotify
### Rodrigo Paoliello de Medeiros

O objetivo deste projeto é classificar gêneros musicais com base nas ***audio features*** disponibilizadas pela API do Spotify, para que o programa consiga recomendar músicas parecidas com as de uma playlist.

Para isso, o primeiro passo foi baixar um dataframe e limpá-lo, mantendo as 100 músicas mais populares de cada década, dos anos 1950 aos anos 2010.

In [1]:
%%capture

# Install spotipy, which the next cell imports
!pip install spotipy

In [2]:
import spotipy
from spotipy import util
from spotipy.oauth2 import SpotifyClientCredentials
import math
import os.path
import pandas as pd
import matplotlib.pyplot as plt
import json
from random import shuffle
import seaborn as sns
import numpy as np
import re

In [3]:
# Build the Spotify client used by the rest of the notebook.
# The credentials are read from the SPOTIPY_CLIENT_ID / SPOTIPY_CLIENT_SECRET
# environment variables instead of being committed with the notebook; a
# missing variable fails right here with a KeyError naming it.
# NOTE(review): the id/secret pair that used to be hard-coded on this line is
# already public and should be revoked in the Spotify developer dashboard.
client_credentials_manager = SpotifyClientCredentials(
    client_id=os.environ['SPOTIPY_CLIENT_ID'],
    client_secret=os.environ['SPOTIPY_CLIENT_SECRET'],
)
sp = spotipy.Spotify(client_credentials_manager=client_credentials_manager)

In [4]:
# Load the tracks CSV, discard the columns that are not used, put the
# remaining ones in a fixed order and replace every run of non-word
# characters in the artists field with a single space.
colunas_descartadas = ['explicit', 'id', 'key', 'mode', 'release_date', 'loudness']
colunas_mantidas = [
    'artists', 'name', 'year', 'popularity', 'acousticness', 'danceability',
    'energy', 'instrumentalness', 'liveness', 'speechiness', 'valence',
    'tempo', 'duration_ms',
]
nao_palavra = re.compile(r'\W+')

data = pd.read_csv('data.csv').drop(columns=colunas_descartadas)[colunas_mantidas]
data['artists'] = data['artists'].map(lambda texto: nao_palavra.sub(' ', texto))

In [5]:
# Split the dataset by decade (1950s through 2010s) and keep the 100 most
# popular songs of each decade.


def _top_da_decada(df, inicio, n=100):
    """Return the `n` most popular rows of `df` released in one decade.

    The decade spans `inicio` through `inicio + 9`, both ends inclusive, so
    the first year of the decade (1950, 1960, ...) is part of it.

    Args:
        df: DataFrame with 'year' and 'popularity' columns.
        inicio: First year of the decade, e.g. 1950.
        n: Maximum number of rows to keep.

    Returns:
        Up to `n` rows of `df`, sorted by descending 'popularity'.
    """
    # Series.between is inclusive on both sides by default.
    na_decada = df['year'].between(inicio, inicio + 9)
    return df[na_decada].sort_values(by=['popularity'], ascending=False).head(n)


data_50s = _top_da_decada(data, 1950)
data_60s = _top_da_decada(data, 1960)
data_70s = _top_da_decada(data, 1970)
data_80s = _top_da_decada(data, 1980)
data_90s = _top_da_decada(data, 1990)
data_00s = _top_da_decada(data, 2000)
data_10s = _top_da_decada(data, 2010)


In [6]:
# Stack the per-decade selections into a single dataframe, oldest decade
# first. Each row keeps the index label it had in `data`.
data_1 = pd.concat([data_50s, data_60s, data_70s, data_80s, data_90s, data_00s, data_10s])

data_1

Unnamed: 0,artists,name,year,popularity,acousticness,danceability,energy,instrumentalness,liveness,speechiness,valence,tempo,duration_ms
81840,Chuck Berry,Johnny B. Goode,1959,74,0.7410,0.534,0.803,0.000061,0.3070,0.0743,0.969,167.983,161560
81741,Elvis Presley,Jailhouse Rock,1958,73,0.4100,0.647,0.582,0.000002,0.0715,0.0755,0.915,167.396,146480
81640,Ella Fitzgerald Louis Armstrong,Dream A Little Dream Of Me - Single Version,1957,71,0.9130,0.443,0.104,0.000000,0.1910,0.1010,0.394,76.497,185160
81743,Elvis Presley,Hound Dog,1958,69,0.7330,0.357,0.756,0.005050,0.7600,0.0621,0.950,174.797,136027
81740,Frank Sinatra,Come Fly With Me - Remastered,1958,69,0.8450,0.574,0.338,0.000000,0.1650,0.0420,0.493,67.008,199093
...,...,...,...,...,...,...,...,...,...,...,...,...,...
87775,Panic At The Disco,High Hopes,2018,84,0.1930,0.579,0.904,0.000000,0.0640,0.0618,0.681,82.014,190947
97660,The Chainsmokers ILLENIUM Lennon Stella,Takeaway,2019,84,0.1260,0.528,0.511,0.000000,0.1010,0.0324,0.351,100.100,209880
161372,Alan Walker Ava Max,"Alone, Pt. II",2019,84,0.5360,0.670,0.668,0.000000,0.1330,0.0422,0.608,88.033,179053
87777,Drake,God's Plan,2018,84,0.0332,0.754,0.449,0.000083,0.5520,0.1090,0.357,77.169,198973


Agora que o dataframe com as 700 músicas está pronto, será feita a categorização dos gêneros musicais dessas músicas. Os gêneros utilizados serão os dos artistas, não os das músicas em si, pois a API do Spotify ainda não classifica cada música em um gênero.

In [7]:
# Kept from the original cell: adds an empty 'genre' column to data_1.
# Nothing in this cell reads it; the fetched genres go to the 'genres'
# column of data_generos.
data_1['genre'] = ''

colunas_features = [
    'artists', 'name', 'year', 'popularity', 'acousticness', 'danceability',
    'energy', 'instrumentalness', 'liveness', 'speechiness', 'valence',
    'tempo', 'duration_ms',
]

# Artist string -> genres string, so an artist with several songs in data_1
# is only looked up once.
generos_por_artista = {}


def _generos_do_artista(artista):
    """Return the space-joined Spotify genres for `artista`.

    Runs a search for 'artist:<artista>', takes the first artist credited on
    the first hit and joins that artist's genres with single spaces.

    Args:
        artista: Artist string exactly as stored in the 'artists' column.

    Returns:
        The genres joined by spaces, or '' when the search has no hits or the
        artist has no genres.
    """
    if artista not in generos_por_artista:
        resultados = sp.search(q='artist:' + artista)
        faixas = resultados['tracks']['items']
        if faixas:
            # Index the response directly instead of parsing its repr().
            id_artista = faixas[0]['artists'][0]['id']
            generos = ' '.join(sp.artist(id_artista)['genres'])
        else:
            # No hits: record empty genres rather than aborting the whole run.
            generos = ''
        generos_por_artista[artista] = generos
    return generos_por_artista[artista]


# Build the result in one step: copy the feature columns with a fresh 0..n-1
# index, trim the text columns, then attach the genres of each row's artist.
data_generos = data_1[colunas_features].reset_index(drop=True)
data_generos['artists'] = data_generos['artists'].str.strip()
data_generos['name'] = data_generos['name'].str.strip()
data_generos['genres'] = data_generos['artists'].map(_generos_do_artista)

data_generos
#data_generos.to_csv('data_generos.csv', encoding='utf-8', index=False)
    
    