# Librerías

In [1]:
from sklearn.preprocessing import LabelEncoder
import pandas as pd

# Importar datos

In [2]:
# Read the two raw CSV inputs from the local data directory.
# The session log keeps pandas' default positional index; the track-features
# table uses its first CSV column as the index (index_col=0).
sessions = pd.read_csv('./data/log_mini.csv')
track_features = pd.read_csv(
    './data/tf_mini.csv',
    index_col=0,
)

# Renombrar columnas

In [3]:
# Give both tables the column names used by the rest of the notebook:
# 'release_year' becomes 'year' in the track features, and the session log
# gets 'user_id' / 'track_id' in place of 'session_id' / 'track_id_clean'.
track_features = track_features.rename(columns={'release_year': 'year'})
sessions = sessions.rename(
    columns={
        'session_id': 'user_id',
        'track_id_clean': 'track_id',
    }
)

# Eliminar columnas

De `track_features` conservaremos solamente las columnas que puedan ser obtenidas desde la API de Spotify:

In [4]:
# Keep only the feature columns listed below (per the notebook text, the ones
# that can be obtained from the Spotify API).
# The explicit .copy() makes the result an independent DataFrame: later cells
# assign new values to several of these columns, and without the copy pandas
# may treat the selection as a slice of the original frame and emit a
# SettingWithCopyWarning on those assignments.
track_features = track_features[[
    'duration', 'year', 'us_popularity_estimate', 'acousticness', 'danceability', 'energy', 'instrumentalness', 'key',
    'liveness', 'loudness', 'mode', 'speechiness', 'tempo', 'time_signature', 'valence'
]].copy()

# Quick sanity report: size of the reduced table and total count of missing values.
print('Dataset Track Features, sin columnas no obtenibles de la API de Spotify:')
print('- Filas:', track_features.shape[0])
print('- Columnas:', track_features.shape[1])
print('- Datos nulos:', track_features.isna().sum().sum())
display(track_features.head(3))

Dataset Track Features, sin columnas no obtenibles de la API de Spotify:
- Filas: 50704
- Columnas: 15
- Datos nulos: 0


Unnamed: 0_level_0,duration,year,us_popularity_estimate,acousticness,danceability,energy,instrumentalness,key,liveness,loudness,mode,speechiness,tempo,time_signature,valence
track_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1
t_a540e552-16d4-42f8-a185-232bd650ea7d,109.706673,1950,99.975414,0.45804,0.399767,0.817709,3.254327e-06,0,0.132124,-11.238,major,0.079985,166.287003,4,0.935512
t_67965da0-132b-4b1e-8a69-0ef99b32287c,187.693329,1950,99.96943,0.916272,0.491235,0.154258,8.344854e-12,0,0.163281,-13.706,major,0.083877,95.261002,3,0.359675
t_0614ecd3-a7d5-40a1-816e-156d5872a467,160.839996,1951,99.602549,0.812884,0.491625,0.358813,2.927475e-10,0,0.090115,-10.522,minor,0.038777,105.185997,4,0.726769


De `sessions` solo queremos las interacciones usuario-ítem, así que conservaremos solo `user_id` (la columna `session_id` original, ya renombrada) y `track_id`:

In [5]:
# Reduce the session log to the two columns that describe a user-item
# interaction; every other column is discarded.
sessions = sessions.loc[:, ['user_id', 'track_id']]

# Quick sanity report: size of the reduced table and total count of missing values.
print('Dataset Sessions:')
print('- Filas:', len(sessions.index))
print('- Columnas:', len(sessions.columns))
print('- Datos nulos:', sessions.isna().to_numpy().sum())
display(sessions.head(3))

Dataset Sessions:
- Filas: 167880
- Columnas: 2
- Datos nulos: 0


Unnamed: 0,user_id,track_id
0,0_00006f66-33e5-4de7-a324-2d18e439fc1e,t_0479f24c-27d2-46d6-a00c-7ec928f2b539
1,0_00006f66-33e5-4de7-a324-2d18e439fc1e,t_9099cd7b-c238-47b7-9381-f23f2c1d1043
2,0_00006f66-33e5-4de7-a324-2d18e439fc1e,t_fc5df5ba-5396-49a7-8b29-35d0d28249e0


# Normalizar columnas

In [6]:
# Rescale the features that are not already on a 0-1 scale.
#
# .assign builds a new frame with the listed columns replaced (column order is
# preserved), instead of writing through `.loc[:, col] = ...`. That in-place
# form tries to keep each column's existing dtype, which is awkward for the
# integer 'year' column receiving float values.
# Series.max() / Series.min() are the vectorised reductions; the builtin
# max()/min() iterate element by element in Python and give order-dependent
# results if a NaN is present.
#
# NOTE: this cell is not idempotent. Running it twice rescales the already
# scaled values again (e.g. 'year' would be divided by 2022 a second time), so
# re-run the notebook from the column-selection cell if it needs repeating.
track_features = track_features.assign(
    # Divide by the longest duration in the dataset.
    duration=lambda df: df['duration'] / df['duration'].max(),
    # Fixed divisor; presumably the most recent possible release year when the
    # notebook was written. TODO confirm and move to a named constant.
    year=lambda df: df['year'] / 2022,
    # Divide by 100; the displayed sample values are just under 100.
    us_popularity_estimate=lambda df: df['us_popularity_estimate'] / 100,
    # Divide by the fastest tempo in the dataset.
    tempo=lambda df: df['tempo'] / df['tempo'].max(),
    # Divide by the minimum loudness. The displayed values are negative, so
    # this also flips the sign and the quietest track maps to 1.
    loudness=lambda df: df['loudness'] / df['loudness'].min(),
)
track_features.head(3)

Unnamed: 0_level_0,duration,year,us_popularity_estimate,acousticness,danceability,energy,instrumentalness,key,liveness,loudness,mode,speechiness,tempo,time_signature,valence
track_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1
t_a540e552-16d4-42f8-a185-232bd650ea7d,0.061365,0.964392,0.999754,0.45804,0.399767,0.817709,3.254327e-06,0,0.132124,0.1873,major,0.079985,0.760082,4,0.935512
t_67965da0-132b-4b1e-8a69-0ef99b32287c,0.104988,0.964392,0.999694,0.916272,0.491235,0.154258,8.344854e-12,0,0.163281,0.228433,major,0.083877,0.435429,3,0.359675
t_0614ecd3-a7d5-40a1-816e-156d5872a467,0.089967,0.964886,0.996025,0.812884,0.491625,0.358813,2.927475e-10,0,0.090115,0.175367,minor,0.038777,0.480795,4,0.726769


# Transformar columnas categóricas a columnas numéricas

In [7]:
# Encode the categorical 'mode' column as integer labels.
# LabelEncoder assigns codes in sorted order of the distinct values, so with
# the values shown above 'major' becomes 0 and 'minor' becomes 1.
#
# The column is replaced through .assign rather than `.loc[:, 'mode'] = ...`:
# on recent pandas versions the .loc form writes into the existing object
# column, so the integers would be stored with dtype object and the column
# would not actually become numeric.
#
# NOTE: re-running this cell re-encodes the already encoded integers; that
# leaves the values unchanged only because they are already 0..n-1 codes.
track_features = track_features.assign(
    mode=LabelEncoder().fit_transform(track_features['mode'])
)
track_features.head(3)

Unnamed: 0_level_0,duration,year,us_popularity_estimate,acousticness,danceability,energy,instrumentalness,key,liveness,loudness,mode,speechiness,tempo,time_signature,valence
track_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1
t_a540e552-16d4-42f8-a185-232bd650ea7d,0.061365,0.964392,0.999754,0.45804,0.399767,0.817709,3.254327e-06,0,0.132124,0.1873,0,0.079985,0.760082,4,0.935512
t_67965da0-132b-4b1e-8a69-0ef99b32287c,0.104988,0.964392,0.999694,0.916272,0.491235,0.154258,8.344854e-12,0,0.163281,0.228433,0,0.083877,0.435429,3,0.359675
t_0614ecd3-a7d5-40a1-816e-156d5872a467,0.089967,0.964886,0.996025,0.812884,0.491625,0.358813,2.927475e-10,0,0.090115,0.175367,1,0.038777,0.480795,4,0.726769


# Guardar datos procesados

In [8]:
# Write both processed tables next to the raw data.
# The session log is saved without its positional index, while the
# track-features table keeps its index as the first CSV column
# (index=True is the to_csv default, spelled out here for contrast).
sessions.to_csv(
    './data/processed_sessions.csv',
    index=False,
)
track_features.to_csv(
    './data/processed_track_features.csv',
    index=True,
)