# DATA PREPARATION

In [164]:
# Import libraries
import os

import numpy as np
import pandas as pd
import spotipy
import spotipy.util as util

In [165]:
# Authentication details
# The client id and secret are read from the environment so that real credentials
# never have to be typed into (or saved with) the notebook. When a variable is not
# set the value falls back to the empty string that used to be hard-coded here.
my_username = "oso41"
chosen_scope = "user-library-read"
my_client_id = os.environ.get("SPOTIPY_CLIENT_ID", "")
my_client_secret = os.environ.get("SPOTIPY_CLIENT_SECRET", "")

In [166]:
# Token creation
# Ask spotipy for a user token using the username, scope and client credentials
# defined in the authentication cell, with a loopback redirect URI.
# NOTE(review): this presumably starts an interactive authorisation step the first
# time it runs - confirm before executing the notebook unattended.
token = util.prompt_for_user_token(username=my_username,
                                   scope=chosen_scope,
                                   client_id=my_client_id,
                                   client_secret=my_client_secret,
                                   redirect_uri="http://127.0.0.1/")

In [167]:
# Spotipy object
# Client used for every API call in the rest of the notebook, authenticated with `token`
sp = spotipy.Spotify(auth=token)

### CREATION OF GOOD TRACKS DATAFRAME

In [200]:
# Accumulators for the playlist download: one list per field, appended to together
# so that position k in every list refers to the same track
artist_name, track_name, popularity, track_id = [], [], [], []

In [201]:
# Obtain data (artist name, track name, track id, popularity) from playlist containing liked songs.
# The playlist is read in pages of 100 entries. 8000 is only an upper bound on the offset:
# the loop stops at the first empty page instead of requesting all the remaining ones.
# NOTE: the four lists are appended to, so re-create them before running this cell again.
for offset in range(0,8000,100):
    results = sp.user_playlist_tracks(my_username,playlist_id="113sBEwMtYziAis9d3CmzJ",limit=100, offset=offset)
    items = results['items']
    if not items:
        # Past the end of the playlist, nothing more to fetch
        break
    for item in items:
        track = item['track']
        if track is None:
            # The API can return an entry without track data; skip it instead of
            # failing on the lookups below
            continue
        artists = track['artists']
        # A missing artist is stored as None so the row is removed by the later dropna()
        artist_name.append(artists[0]['name'] if artists else None)
        track_name.append(track['name'])
        track_id.append(track['id'])
        popularity.append(track['popularity'])

In [203]:
# Create dataframe of tracks: build one labelled row per field, then flip it so that
# every track becomes a row and the field labels become the column names
field_names = ['artist_name','track_name','track_id','popularity']
df_tracks = pd.DataFrame([artist_name,track_name,track_id,popularity], index=field_names).T

In [208]:
# Preview the first rows of the liked-tracks dataframe
df_tracks.head()

Unnamed: 0,artist_name,track_name,track_id,popularity
0,Kyle Landry,Howl's Moving Castle Theme,3JTjiMAPVMfwjzPiG5R3hK,54
1,Kyle Landry,Shigatsu - Otouto Mitai Na Sonzai Piano,5tr1bCo0XRU3ZciXahlTKm,47
2,Kyle Landry,"Passion (From ""Kingdom Hearts II"") [Piano Solo]",2xFBrGSZ4mSURd8p5IU4ed,47
3,Kyle Landry,Dango Daikazoku,7c6mInVLlpnaYD55R72FDX,45
4,Kyle Landry,Interstellar - First Step,0kznmyiS92b9v1rkn9WDqV,41


In [209]:
# Some tracks appear more than once in the playlist. Keep a single row per
# (artist name, track name) pair; the first occurrence is the one that is kept.
df_tracks = df_tracks.drop_duplicates(subset=['artist_name','track_name'])

In [213]:
# Discard every row that has a missing value in any column
df_tracks = df_tracks.dropna()

In [215]:
# Label column: every song in this dataframe is one I liked, so 'like' is 1 for all rows
df_tracks = df_tracks.assign(like=1)

In [216]:
# Preview the first rows again, now including the 'like' column
df_tracks.head()

Unnamed: 0,artist_name,track_name,track_id,popularity,like
0,Kyle Landry,Howl's Moving Castle Theme,3JTjiMAPVMfwjzPiG5R3hK,54,1
1,Kyle Landry,Shigatsu - Otouto Mitai Na Sonzai Piano,5tr1bCo0XRU3ZciXahlTKm,47,1
2,Kyle Landry,"Passion (From ""Kingdom Hearts II"") [Piano Solo]",2xFBrGSZ4mSURd8p5IU4ed,47,1
3,Kyle Landry,Dango Daikazoku,7c6mInVLlpnaYD55R72FDX,45,1
4,Kyle Landry,Interstellar - First Step,0kznmyiS92b9v1rkn9WDqV,41,1


In [217]:
# Create empty list to store songs features (one dict per track)
rows = []

# Retrieve audio features for every track, 50 ids per request.
# The ids are taken as a plain list so that the batches are cut purely by position,
# independent of the gaps left in the dataframe index by the earlier row removals.
track_ids = df_tracks['track_id'].tolist()
for start in range(0, len(track_ids), 50):
    feature_results = sp.audio_features(track_ids[start:start+50])
    # The response can hold None for a track without audio features; such entries
    # are left out because a None among the dicts breaks the dataframe construction
    rows.extend(features for features in feature_results if features is not None)

In [218]:
# Create dataframe from the previously retrieved data: one row per feature dict,
# one column per key
df_audio_features = pd.DataFrame(rows)

In [219]:
# Preview the first rows of the raw audio-features dataframe
df_audio_features.head()

Unnamed: 0,acousticness,analysis_url,danceability,duration_ms,energy,id,instrumentalness,key,liveness,loudness,mode,speechiness,tempo,time_signature,track_href,type,uri,valence
0,0.94,https://api.spotify.com/v1/audio-analysis/3JTj...,0.322,350348,0.144,3JTjiMAPVMfwjzPiG5R3hK,0.917,7,0.0744,-17.448,0,0.0384,142.818,3,https://api.spotify.com/v1/tracks/3JTjiMAPVMfw...,audio_features,spotify:track:3JTjiMAPVMfwjzPiG5R3hK,0.0471
1,0.967,https://api.spotify.com/v1/audio-analysis/5tr1...,0.368,302336,0.124,5tr1bCo0XRU3ZciXahlTKm,0.803,3,0.182,-21.851,1,0.0513,138.91,4,https://api.spotify.com/v1/tracks/5tr1bCo0XRU3...,audio_features,spotify:track:5tr1bCo0XRU3ZciXahlTKm,0.104
2,0.964,https://api.spotify.com/v1/audio-analysis/2xFB...,0.406,274656,0.13,2xFBrGSZ4mSURd8p5IU4ed,0.87,1,0.0964,-20.32,0,0.0515,116.005,4,https://api.spotify.com/v1/tracks/2xFBrGSZ4mSU...,audio_features,spotify:track:2xFBrGSZ4mSURd8p5IU4ed,0.0975
3,0.976,https://api.spotify.com/v1/audio-analysis/7c6m...,0.384,310974,0.04,7c6mInVLlpnaYD55R72FDX,0.896,8,0.0943,-23.524,1,0.0357,95.854,4,https://api.spotify.com/v1/tracks/7c6mInVLlpna...,audio_features,spotify:track:7c6mInVLlpnaYD55R72FDX,0.1
4,0.961,https://api.spotify.com/v1/audio-analysis/0kzn...,0.232,508622,0.0651,0kznmyiS92b9v1rkn9WDqV,0.93,9,0.0607,-21.866,0,0.0387,74.75,3,https://api.spotify.com/v1/tracks/0kznmyiS92b9...,audio_features,spotify:track:0kznmyiS92b9v1rkn9WDqV,0.0372


In [220]:
# Remove the URL / type / URI columns, which are not used in the rest of the notebook
df_audio_features = df_audio_features.drop(columns=['analysis_url','track_href','type','uri'])

In [221]:
# Preview the audio features after removing the unused columns
df_audio_features.head()

Unnamed: 0,acousticness,danceability,duration_ms,energy,id,instrumentalness,key,liveness,loudness,mode,speechiness,tempo,time_signature,valence
0,0.94,0.322,350348,0.144,3JTjiMAPVMfwjzPiG5R3hK,0.917,7,0.0744,-17.448,0,0.0384,142.818,3,0.0471
1,0.967,0.368,302336,0.124,5tr1bCo0XRU3ZciXahlTKm,0.803,3,0.182,-21.851,1,0.0513,138.91,4,0.104
2,0.964,0.406,274656,0.13,2xFBrGSZ4mSURd8p5IU4ed,0.87,1,0.0964,-20.32,0,0.0515,116.005,4,0.0975
3,0.976,0.384,310974,0.04,7c6mInVLlpnaYD55R72FDX,0.896,8,0.0943,-23.524,1,0.0357,95.854,4,0.1
4,0.961,0.232,508622,0.0651,0kznmyiS92b9v1rkn9WDqV,0.93,9,0.0607,-21.866,0,0.0387,74.75,3,0.0372


In [222]:
# Rename 'id' to 'track_id' so the key column has the same name as in the tracks
# dataframe (needed for the merge)
df_audio_features = df_audio_features.rename(columns={'id':'track_id'})

In [223]:
# Join track metadata with audio features on 'track_id'; the inner join keeps only
# tracks that are present in both dataframes
df = df_tracks.merge(df_audio_features, how='inner', on='track_id')

In [224]:
# Preview the merged liked-tracks dataframe
df.head()

Unnamed: 0,artist_name,track_name,track_id,popularity,like,acousticness,danceability,duration_ms,energy,instrumentalness,key,liveness,loudness,mode,speechiness,tempo,time_signature,valence
0,Kyle Landry,Howl's Moving Castle Theme,3JTjiMAPVMfwjzPiG5R3hK,54,1,0.94,0.322,350348,0.144,0.917,7,0.0744,-17.448,0,0.0384,142.818,3,0.0471
1,Kyle Landry,Shigatsu - Otouto Mitai Na Sonzai Piano,5tr1bCo0XRU3ZciXahlTKm,47,1,0.967,0.368,302336,0.124,0.803,3,0.182,-21.851,1,0.0513,138.91,4,0.104
2,Kyle Landry,"Passion (From ""Kingdom Hearts II"") [Piano Solo]",2xFBrGSZ4mSURd8p5IU4ed,47,1,0.964,0.406,274656,0.13,0.87,1,0.0964,-20.32,0,0.0515,116.005,4,0.0975
3,Kyle Landry,Dango Daikazoku,7c6mInVLlpnaYD55R72FDX,45,1,0.976,0.384,310974,0.04,0.896,8,0.0943,-23.524,1,0.0357,95.854,4,0.1
4,Kyle Landry,Interstellar - First Step,0kznmyiS92b9v1rkn9WDqV,41,1,0.961,0.232,508622,0.0651,0.93,9,0.0607,-21.866,0,0.0387,74.75,3,0.0372


In [226]:
# Save the merged liked-tracks dataframe as 'good_tracks.csv' (path relative to the working directory)
# NOTE(review): index=False is not passed, so the row index is presumably written as an
# extra first column - confirm that readers of this file expect it
df.to_csv('good_tracks.csv')

### CREATION OF BAD TRACKS DATAFRAME

In [228]:
# REPEAT PREVIOUS PROCESS
# Start again from empty accumulators, this time for the playlist of disliked songs

artist_name, track_name, popularity, track_id = [], [], [], []

In [229]:
# Obtain data (artist name, track name, track id, popularity) from the playlist of disliked songs.
# The playlist is read in pages of 100 entries. 2000 is only an upper bound on the offset:
# the loop stops at the first empty page instead of requesting all the remaining ones.
# NOTE: the four lists are appended to, so re-create them before running this cell again.
for offset in range(0,2000,100):
    results = sp.user_playlist_tracks(my_username,playlist_id="2POKTRd37hqKGQS7hvV9Qo",limit=100, offset=offset)
    items = results['items']
    if not items:
        # Past the end of the playlist, nothing more to fetch
        break
    for item in items:
        track = item['track']
        if track is None:
            # The API can return an entry without track data; skip it instead of
            # failing on the lookups below
            continue
        artists = track['artists']
        # A missing artist is stored as None so the row is removed by the later dropna()
        artist_name.append(artists[0]['name'] if artists else None)
        track_name.append(track['name'])
        track_id.append(track['id'])
        popularity.append(track['popularity'])

In [230]:
# Assemble the disliked-tracks dataframe: one row per field first, then flip it so
# that every track becomes a row
df_bad_tracks = pd.DataFrame([artist_name,track_name,track_id,popularity]).T

In [231]:
# Name the columns, in the same order in which the lists were passed to the constructor
df_bad_tracks.columns = ['artist_name','track_name','track_id','popularity']

In [233]:
# Keep a single row per (artist name, track name) pair
df_bad_tracks = df_bad_tracks.drop_duplicates(subset=['artist_name','track_name'])

In [234]:
# Discard every row that has a missing value in any column
df_bad_tracks = df_bad_tracks.dropna()

In [235]:
# Label column: these are songs I didn't like, so 'like' is 0 for all rows
df_bad_tracks = df_bad_tracks.assign(like=0)

In [236]:
# Preview the first rows of the disliked-tracks dataframe
df_bad_tracks.head()

Unnamed: 0,artist_name,track_name,track_id,popularity,like
0,Rkomi,Mon Cheri (feat. Sfera Ebbasta),66UkKyxN5YbJl9z7YamQOR,76,0
1,Priestess,Chef (feat. Madman),4QSLqqTFzsXR8MWfJeI3zq,58,0
2,Capo Plaza,Billets (feat. Ninho),2aD5ESGyEhXnXdoOsZvsR0,73,0
3,Sfera Ebbasta,Mademoiselle,2aPnpV1hT7l3eP10uSGK6c,81,0
4,MamboLosco,BINGO,2fXOb3Dr8di7hbcKFbYNPt,69,0


In [240]:
# Start from an empty list of feature dicts (one dict per disliked track)
rows = []

# Retrieve audio features for every disliked track, 50 ids per request.
# The ids are taken as a plain list so that the batches are cut purely by position,
# independent of the gaps left in the dataframe index by the earlier row removals.
track_ids = df_bad_tracks['track_id'].tolist()
for start in range(0, len(track_ids), 50):
    feature_results = sp.audio_features(track_ids[start:start+50])
    # The response can hold None for a track without audio features; such entries
    # are left out because a None among the dicts breaks the dataframe construction
    rows.extend(features for features in feature_results if features is not None)

In [242]:
# One row per feature dict, one column per key
df_bad_audio_features = pd.DataFrame(rows)

In [243]:
# Preview the first rows of the raw audio features of the disliked tracks
df_bad_audio_features.head()

Unnamed: 0,acousticness,analysis_url,danceability,duration_ms,energy,id,instrumentalness,key,liveness,loudness,mode,speechiness,tempo,time_signature,track_href,type,uri,valence
0,0.316,https://api.spotify.com/v1/audio-analysis/66Uk...,0.887,181867,0.667,66UkKyxN5YbJl9z7YamQOR,0.0,6,0.119,-5.681,0,0.0484,130.032,4,https://api.spotify.com/v1/tracks/66UkKyxN5YbJ...,audio_features,spotify:track:66UkKyxN5YbJl9z7YamQOR,0.493
1,0.277,https://api.spotify.com/v1/audio-analysis/4QSL...,0.944,176239,0.661,4QSLqqTFzsXR8MWfJeI3zq,0.0,4,0.284,-5.643,0,0.0556,133.114,4,https://api.spotify.com/v1/tracks/4QSLqqTFzsXR...,audio_features,spotify:track:4QSLqqTFzsXR8MWfJeI3zq,0.963
2,0.298,https://api.spotify.com/v1/audio-analysis/2aD5...,0.732,198537,0.791,2aD5ESGyEhXnXdoOsZvsR0,0.0,1,0.175,-5.642,1,0.191,123.101,4,https://api.spotify.com/v1/tracks/2aD5ESGyEhXn...,audio_features,spotify:track:2aD5ESGyEhXnXdoOsZvsR0,0.609
3,0.0671,https://api.spotify.com/v1/audio-analysis/2aPn...,0.794,189344,0.702,2aPnpV1hT7l3eP10uSGK6c,0.0,6,0.155,-5.656,0,0.0346,121.938,4,https://api.spotify.com/v1/tracks/2aPnpV1hT7l3...,audio_features,spotify:track:2aPnpV1hT7l3eP10uSGK6c,0.535
4,0.158,https://api.spotify.com/v1/audio-analysis/2fXO...,0.796,150444,0.643,2fXOb3Dr8di7hbcKFbYNPt,0.0,2,0.0561,-5.235,0,0.0354,128.057,4,https://api.spotify.com/v1/tracks/2fXOb3Dr8di7...,audio_features,spotify:track:2fXOb3Dr8di7hbcKFbYNPt,0.609


In [245]:
# Remove the URL / type / URI columns, which are not used in the rest of the notebook
df_bad_audio_features = df_bad_audio_features.drop(columns=['analysis_url','track_href','type','uri'])

In [246]:
# Rename 'id' to 'track_id' so the key column matches the one in df_bad_tracks
df_bad_audio_features = df_bad_audio_features.rename(columns={'id':'track_id'})

In [247]:
# Join track metadata with audio features on 'track_id'; the inner join keeps only
# tracks that are present in both dataframes
df_bad = df_bad_tracks.merge(df_bad_audio_features, how='inner', on='track_id')

In [249]:
# Preview the merged disliked-tracks dataframe
df_bad.head()

Unnamed: 0,artist_name,track_name,track_id,popularity,like,acousticness,danceability,duration_ms,energy,instrumentalness,key,liveness,loudness,mode,speechiness,tempo,time_signature,valence
0,Rkomi,Mon Cheri (feat. Sfera Ebbasta),66UkKyxN5YbJl9z7YamQOR,76,0,0.316,0.887,181867,0.667,0.0,6,0.119,-5.681,0,0.0484,130.032,4,0.493
1,Priestess,Chef (feat. Madman),4QSLqqTFzsXR8MWfJeI3zq,58,0,0.277,0.944,176239,0.661,0.0,4,0.284,-5.643,0,0.0556,133.114,4,0.963
2,Capo Plaza,Billets (feat. Ninho),2aD5ESGyEhXnXdoOsZvsR0,73,0,0.298,0.732,198537,0.791,0.0,1,0.175,-5.642,1,0.191,123.101,4,0.609
3,Sfera Ebbasta,Mademoiselle,2aPnpV1hT7l3eP10uSGK6c,81,0,0.0671,0.794,189344,0.702,0.0,6,0.155,-5.656,0,0.0346,121.938,4,0.535
4,MamboLosco,BINGO,2fXOb3Dr8di7hbcKFbYNPt,69,0,0.158,0.796,150444,0.643,0.0,2,0.0561,-5.235,0,0.0354,128.057,4,0.609


In [273]:
# Save the merged disliked-tracks dataframe as 'bad_tracks.csv' (path relative to the working directory)
# NOTE(review): index=False is not passed, so the row index is presumably written as an
# extra first column - confirm that readers of this file expect it
df_bad.to_csv('bad_tracks.csv')