# Projeto 2 - Ciência dos Dados
## Classificando músicas no Spotify
### Rodrigo Paoliello de Medeiros

O objetivo deste projeto é classificar gêneros musicais com base nas ***audio features*** disponibilizadas pela API do Spotify, para que o programa consiga acertar o gênero das músicas.

Para isso, o primeiro passo foi baixar um dataframe e limpá-lo, mantendo as 200 músicas mais populares de cada década, dos anos 1950 aos anos 2010.

In [11]:
%%capture

#Instalando o spotipy
!pip install spotipy

In [12]:
import spotipy
from spotipy.oauth2 import SpotifyClientCredentials
import os.path
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import re

In [13]:
# Spotify API credentials. The SPOTIPY_CLIENT_ID / SPOTIPY_CLIENT_SECRET
# environment variables take precedence, so secrets do not have to live in
# the notebook.
# NOTE(review): the fallback literals below are credentials committed to the
# source; they should be rotated and removed once the environment variables
# are configured.
client_credentials_manager = SpotifyClientCredentials(
    client_id=os.environ.get('SPOTIPY_CLIENT_ID', '9230b3a140aa47d5976323e954fdb105'),
    client_secret=os.environ.get('SPOTIPY_CLIENT_SECRET', '49ea33f127304b3cb533030a400ce18c'),
)

# Client object used by the later cells for every Spotify Web API request.
sp = spotipy.Spotify(client_credentials_manager=client_credentials_manager)

In [14]:
# Load the raw track table and discard the columns that are not used.
data = pd.read_csv('data.csv').drop(
    ['explicit', 'id', 'key', 'mode', 'release_date'],
    axis=1,
)

# Keep only the identification columns and the audio features, in a fixed order.
data = data[[
    'artists', 'name', 'year', 'popularity',
    'acousticness', 'loudness', 'danceability', 'energy',
    'instrumentalness', 'liveness', 'speechiness', 'valence',
    'tempo', 'duration_ms',
]]

# Collapse every run of non-word characters in the artist field into a
# single space.
data['artists'] = data['artists'].map(lambda nomes: re.sub(r'\W+', ' ', nomes))

In [15]:
# Split the dataset by decade (1950s through 2010s) and keep the 200 most
# popular tracks of each decade.


def _mais_populares_da_decada(df, inicio, quantidade=200):
    """Return the most popular tracks of the decade that starts at `inicio`.

    Parameters:
        df: dataframe with at least the 'year' and 'popularity' columns.
        inicio: first year of the decade (e.g. 1950); the decade covers
            `inicio` through `inicio + 9`, both ends included.
        quantidade: maximum number of rows to keep.

    Returns:
        The rows of `df` in that decade, sorted by descending 'popularity'
        and truncated to the first `quantidade` rows.
    """
    # Inclusive on both ends, so the first year of the decade (1950, 1960,
    # ...) is part of it.
    na_decada = df['year'].between(inicio, inicio + 9)
    ordenado = df[na_decada].sort_values(by=['popularity'], ascending=False)
    return ordenado.head(quantidade)


data_50s = _mais_populares_da_decada(data, 1950)
data_60s = _mais_populares_da_decada(data, 1960)
data_70s = _mais_populares_da_decada(data, 1970)
data_80s = _mais_populares_da_decada(data, 1980)
data_90s = _mais_populares_da_decada(data, 1990)
data_00s = _mais_populares_da_decada(data, 2000)
data_10s = _mais_populares_da_decada(data, 2010)


In [16]:
# Stack the per-decade selections into a single dataframe, oldest decade
# first. The original row labels of each selection are kept.
data_1 = pd.concat([data_50s, data_60s, data_70s, data_80s, data_90s, data_00s, data_10s])

# Bare expression so the notebook renders the combined dataframe.
data_1

Unnamed: 0,artists,name,year,popularity,acousticness,loudness,danceability,energy,instrumentalness,liveness,speechiness,valence,tempo,duration_ms
81840,Chuck Berry,Johnny B. Goode,1959,74,0.7410,-9.129,0.534,0.803,0.000061,0.3070,0.0743,0.969,167.983,161560
81741,Elvis Presley,Jailhouse Rock,1958,73,0.4100,-9.538,0.647,0.582,0.000002,0.0715,0.0755,0.915,167.396,146480
81640,Ella Fitzgerald Louis Armstrong,Dream A Little Dream Of Me - Single Version,1957,71,0.9130,-17.042,0.443,0.104,0.000000,0.1910,0.1010,0.394,76.497,185160
81743,Elvis Presley,Hound Dog,1958,69,0.7330,-8.492,0.357,0.756,0.005050,0.7600,0.0621,0.950,174.797,136027
81740,Frank Sinatra,Come Fly With Me - Remastered,1958,69,0.8450,-11.376,0.574,0.338,0.000000,0.1650,0.0420,0.493,67.008,199093
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
97568,Anne Marie,2002,2018,82,0.0372,-2.881,0.697,0.683,0.000000,0.1370,0.1170,0.603,96.133,186987
87795,Zedd Maren Morris Grey,The Middle,2018,82,0.1710,-3.061,0.753,0.657,0.000000,0.1120,0.0449,0.437,107.010,184732
87796,Cardi B Bad Bunny J Balvin,I Like It,2018,82,0.0990,-3.998,0.816,0.726,0.000000,0.3720,0.1290,0.650,136.048,253390
87797,FINNEAS,Let's Fall in Love for the Night,2018,82,0.8020,-7.941,0.737,0.408,0.000000,0.1710,0.1040,0.374,127.921,190348


Agora que o dataframe com as 1400 músicas (200 por década) está pronto, será feita a categorização dos gêneros musicais dessas músicas. Os gêneros utilizados serão os dos artistas, não os das músicas em si, pois a API do Spotify ainda não possui a funcionalidade de classificar cada música em um gênero.

In [17]:
# Empty placeholder column on the source frame. It is not filled in by this
# cell: the genres fetched below are stored in `data_generos` instead.
data_1['genre'] = ''

# Column order of the output table. Note that 'loudness' sits in a different
# position than in data_1, and that 'genres' is appended at the end.
colunas_generos = ['artists','name','year','popularity','acousticness','danceability','energy','instrumentalness','liveness','loudness','speechiness','valence','tempo','duration_ms','genres']


def _generos_do_artista(artista, cache):
    """Return the space-joined Spotify genres for the artist string `artista`.

    The artist is resolved by running a search for 'artist:<artista>' and
    taking the first artist credited on the first hit; that artist's genre
    list is then fetched and joined with single spaces.

    Parameters:
        artista: artist text used to build the search query.
        cache: dict mapping artist text to an already-resolved genre string;
            it is updated in place so each distinct artist text triggers the
            API calls only once.

    Returns:
        The joined genre string, or '' when the search returns no hits.
    """
    if artista not in cache:
        resultados = sp.search(q='artist:' + artista)
        faixas = resultados['tracks']['items']
        if faixas:
            # Read the id straight from the response structure instead of
            # parsing the textual repr of the artist list.
            id_artista = faixas[0]['artists'][0]['id']
            cache[artista] = ' '.join(sp.artist(id_artista)['genres'])
        else:
            # No hit: record an empty genre rather than aborting the whole
            # run with an IndexError.
            cache[artista] = ''
    return cache[artista]


generos_por_artista = {}
linhas = []

for _, row in data_1.iterrows():
    artists = row['artists'].strip()

    # Text fields are stripped; the numeric fields between 'name' and
    # 'genres' are copied as they are, in output order.
    linha = [artists, row['name'].strip()]
    linha.extend(row[coluna] for coluna in colunas_generos[2:-1])
    linha.append(_generos_do_artista(artists, generos_por_artista))

    linhas.append(linha)

# Build the frame once from the collected rows instead of growing it row by
# row.
data_generos = pd.DataFrame(linhas, columns=colunas_generos)

data_generos.to_csv('data_generos.csv', encoding='utf-8', index=False)
    
    