# 1. Imports

In [55]:
import pandas as pd
import plotly.express as px
import spotipy
import string
import unidecode
import warnings
from datetime import datetime
from spotipy.oauth2 import SpotifyOAuth
from datetime import datetime
from config import (
    SPOTIPY_CLIENT_ID,
    SPOTIPY_CLIENT_SECRET,
    SPOTIPY_REDIRECT_URI,
    PLAYLIST_ID
)

warnings.filterwarnings('ignore')
pd.set_option('display.max_columns', None)
pd.set_option('display.float_format', lambda x: '%.2f' % x)

# 2. Consuming _spotipy_ Data

In [2]:
# Creating authentication object
sp = spotipy.Spotify(
    auth_manager=SpotifyOAuth(
        client_id=SPOTIPY_CLIENT_ID,
        client_secret=SPOTIPY_CLIENT_SECRET,
        redirect_uri=SPOTIPY_REDIRECT_URI,
        scope="playlist-read-private",
    )
)

# Obtaining platlist information
playlist = sp.playlist(PLAYLIST_ID)

print(f'Playlist name: {playlist["name"]}')
music_data = []
for idx, track in enumerate(playlist["tracks"]["items"], start=1):
    track_id = track["track"]["id"]

    # Audio attributes
    track_info = sp.audio_features([track_id])

    # Getting music attributes
    track_details = sp.track(track_id)
    artist = track_details["artists"][0]["name"]
    genre = track_details["genres"] if "genres" in track_details else None
    popularity = track_details["popularity"]

    track_features = track_info[0] if track_info else None

    # Concatenating music infomation
    music_data.append(
        {
            **{
                "track_name": track["track"]["name"],
                "artist": artist,
                "genre": genre,
                "release_date": track_details['album']['release_date'],
                "popularity": popularity,
            },
            **track_features,
        }
    )
data = pd.DataFrame(music_data)

Playlist name: Playlist - Trap


# 3. Data Prep & Feature Engineering

In [17]:
data = pd.DataFrame(music_data)

In [18]:
data.tail()

Unnamed: 0,track_name,artist,genre,release_date,popularity,danceability,energy,key,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo,type,id,uri,track_href,analysis_url,duration_ms,time_signature
95,BEM MELHOR,MC Cabelinho,,2023-05-04,78,0.5,0.59,10,-5.43,0,0.38,0.61,0.0,0.12,0.57,70.94,audio_features,7aOBf2xYbux3bo1fwISo6b,spotify:track:7aOBf2xYbux3bo1fwISo6b,https://api.spotify.com/v1/tracks/7aOBf2xYbux3...,https://api.spotify.com/v1/audio-analysis/7aOB...,317340,4
96,FOGO E GASOLINA,MC Cabelinho,,2022-12-20,79,0.53,0.63,10,-6.38,0,0.11,0.18,0.0,0.13,0.51,141.56,audio_features,52EYWdjiulHaehpexuowyh,spotify:track:52EYWdjiulHaehpexuowyh,https://api.spotify.com/v1/tracks/52EYWdjiulHa...,https://api.spotify.com/v1/audio-analysis/52EY...,181768,4
97,Tropa do Sábio,Mc Poze do Rodo,,2022-11-10,54,0.64,0.59,1,-6.06,0,0.29,0.38,0.0,0.11,0.43,146.47,audio_features,75JStwKgd9AMAy4q1SEZjq,spotify:track:75JStwKgd9AMAy4q1SEZjq,https://api.spotify.com/v1/tracks/75JStwKgd9AM...,https://api.spotify.com/v1/audio-analysis/75JS...,146939,4
98,Segredo,KayBlack,,2023-03-31,83,0.8,0.49,7,-6.79,0,0.18,0.33,0.0,0.11,0.66,86.27,audio_features,6hu4HvpNzrW2RIEEAZC3vD,spotify:track:6hu4HvpNzrW2RIEEAZC3vD,https://api.spotify.com/v1/tracks/6hu4HvpNzrW2...,https://api.spotify.com/v1/audio-analysis/6hu4...,104145,3
99,Sal e Pimenta,KayBlack,,2023-03-30,75,0.61,0.53,9,-6.86,0,0.41,0.04,0.0,0.1,0.28,118.15,audio_features,6BsKZXjKIrGWMK1uLWfJYN,spotify:track:6BsKZXjKIrGWMK1uLWfJYN,https://api.spotify.com/v1/tracks/6BsKZXjKIrGW...,https://api.spotify.com/v1/audio-analysis/6BsK...,206452,5


- In this dataframe, we have some columns that we can remove. For example: `uri`, `track_href`, so on...

In [19]:
selected_cols = [
    'id',
    'track_name',
    'artist',
    'genre',
    'release_date',
    'popularity',
    'danceability',
    'energy',
    'key',
    'loudness',
    'mode',
    'speechiness',
    'acousticness',
    'instrumentalness',
    'liveness',
    'valence',
    'tempo',
    'duration_ms',
    'time_signature'
]
data = data.filter(selected_cols)

In [20]:
data.tail()

Unnamed: 0,id,track_name,artist,genre,release_date,popularity,danceability,energy,key,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo,duration_ms,time_signature
95,7aOBf2xYbux3bo1fwISo6b,BEM MELHOR,MC Cabelinho,,2023-05-04,78,0.5,0.59,10,-5.43,0,0.38,0.61,0.0,0.12,0.57,70.94,317340,4
96,52EYWdjiulHaehpexuowyh,FOGO E GASOLINA,MC Cabelinho,,2022-12-20,79,0.53,0.63,10,-6.38,0,0.11,0.18,0.0,0.13,0.51,141.56,181768,4
97,75JStwKgd9AMAy4q1SEZjq,Tropa do Sábio,Mc Poze do Rodo,,2022-11-10,54,0.64,0.59,1,-6.06,0,0.29,0.38,0.0,0.11,0.43,146.47,146939,4
98,6hu4HvpNzrW2RIEEAZC3vD,Segredo,KayBlack,,2023-03-31,83,0.8,0.49,7,-6.79,0,0.18,0.33,0.0,0.11,0.66,86.27,104145,3
99,6BsKZXjKIrGWMK1uLWfJYN,Sal e Pimenta,KayBlack,,2023-03-30,75,0.61,0.53,9,-6.86,0,0.41,0.04,0.0,0.1,0.28,118.15,206452,5


For my playlist, the column `genre` is completly empty.

In [21]:
del data['genre']

In [31]:
data = data.astype(
    {
        'id': 'category',
        'track_name': 'str',
        'artist': 'category',
        'release_date': 'datetime64[ns]',
        'popularity': 'int',
        'danceability': 'float16',
        'energy': 'float16',
        'key': 'int',
        'loudness': 'float16',
        'mode': 'int',
        'speechiness': 'float16',
        'acousticness': 'float16',
        'instrumentalness': 'float16',
        'liveness': 'float16',
        'valence': 'float16',
        'tempo': 'float16',
        'duration_ms': 'int',
        'time_signature': 'int'
    }
)

In [29]:
data.describe().T

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
popularity,100.0,61.23,18.56,0.0,55.0,66.0,72.25,90.0
danceability,100.0,0.73,0.12,0.4,0.65,0.75,0.84,0.94
energy,100.0,0.59,0.11,0.33,0.52,0.6,0.67,0.83
key,100.0,5.88,3.4,0.0,3.0,6.0,9.0,11.0
loudness,100.0,-6.77,2.17,-13.26,-7.56,-6.33,-5.17,-2.84
mode,100.0,0.3,0.46,0.0,0.0,0.0,1.0,1.0
speechiness,100.0,0.18,0.13,0.03,0.08,0.15,0.27,0.64
acousticness,100.0,0.25,0.19,0.0,0.08,0.19,0.37,0.81
instrumentalness,100.0,0.0,0.0,0.0,0.0,0.0,0.0,0.01
liveness,100.0,0.15,0.08,0.04,0.1,0.12,0.18,0.46


> I will consider the track_name as text because music titles can provide valuable context about the song. This additional information can be highly beneficial for our model.


We can leverage the music names to extract valuable context. For instance, let's take the songs 'x1' and 'x6,' both featuring references to cars
in their lyrics. By thoroughly analyzing and comparing their titles, we can potentially uncover common themes and connections between
these compositions. However, it's essential to acknowledge that in certain cases, such information might not yield any meaningful connections.

In [51]:
def remove_punctuation(texto):
    '''Remove strings punctuations, like: (!, ?, -)'''
    return texto.translate(str.maketrans('', '', string.punctuation))


def remove_accentuation(texto):
    '''Remove strings accentuation, like: (^, ~, ')'''
    return unidecode.unidecode(texto)

data['processed_track_name'] = data['track_name'].apply(lambda x: remove_accentuation(remove_punctuation(x.lower())))

In [49]:
data['processed_track_name'].unique()

array(['rei lacoste', 'fragrancia  remix', 'tiffany', 'montblanc',
       'drip da roca', 'groupies', 'saturno', 'futuro quadro',
       'prada louis', 'ok baby', 'louis v menina linda', 'lv',
       'anota placa', 'a cara do crime nos incomoda',
       'eu fiz o jogo virar', 'vida louca', 'do crime ao funk',
       'aonde eu sou cria', 'glockada', 'bag de grife', 'bendito',
       'malandro chique', 'tapa', 'saque  remix', 'blessed',
       'mlk sonhador', 'lagrimas de crocodilo', 'mustang preto', 'tango',
       'manha', 'dia azul', 'paypal', 'invejoso', 'jacare que dorme',
       'rj mais que atlanta', 'freio da blazer', 'tipo gringa',
       'isso que e bom', 'fim de semana no rio', 'quer voar', 'sem do',
       'gorilla roxo', 'pitbull', '777666', 'maquina do tempo', 'antes',
       'm4', 'cogulandia', 'f f m', 'vampiro', 'aulas e cursos',
       'na onda do gin', 'lobo', 'me sinto abencoado', 'vestido da fendi',
       'cpx ta tega', 'bandido nao danca bb', 'balazul',
       'jov

In [75]:
today = datetime.strptime('2023-07-31', "%Y-%m-%d")  # Fixing the actual date
data['diff_release'] = data['release_date'].apply(lambda x: (today - x).days)

> The idea of calculating the number of days since a song's release is that some songs, which did not gain significant popularity initially, tend to be forgotten over time. On the other hand, songs that were successful at the time of their release are more likely to be remembered even after some time has passed.

In [76]:
data.head()

Unnamed: 0,id,track_name,artist,release_date,popularity,danceability,energy,key,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo,duration_ms,time_signature,processed_track_name,diff_release
0,1jwEnLN5hMOwotwXfchUaf,Rei Lacoste,MD Chefe,2021-08-04,56,0.9,0.34,9,-12.38,1,0.27,0.14,0.0,0.09,0.49,119.94,152000,4,rei lacoste,726
1,76E9YSvgci2hPaNFy3XZu1,Fragrância - Remix,MD Chefe,2021-08-04,55,0.74,0.62,7,-6.11,0,0.27,0.5,0.0,0.2,0.53,111.88,171428,4,fragrancia remix,726
2,0NvVVP3Q0oSbfBUZ48vCHs,Tiffany,MD Chefe,2021-08-04,59,0.75,0.46,8,-7.89,1,0.05,0.28,0.0,0.17,0.48,126.0,152381,4,tiffany,726
3,4FIvGmOwod42VFIpO0huBO,Montblanc,MD Chefe,2021-08-04,48,0.72,0.62,1,-8.01,1,0.14,0.44,0.0,0.39,0.44,119.88,144005,4,montblanc,726
4,0LxmE3Bnk30dfFJ7LQt30T,Drip da Roça,Reid,2020-04-17,58,0.86,0.35,3,-11.25,0,0.29,0.18,0.0,0.11,0.52,119.94,424000,4,drip da roca,1200


### Next steps:
- Preparation of the track_name column
- Extraction of context from the track_name column
- Development of new features
- Implementation of a Recommender using Reinforcement Learning