# Spotify Recommendation Engine

### Installing and importing dependencies

In [None]:
!pip install spotipy

In [None]:
# Auxiliary packages from standard library
import ast
import itertools
import json
import sys

# Basic data science packages
import pandas as pd
import numpy as np

# Machine learning and data visualization tools
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.preprocessing import MinMaxScaler
import matplotlib.pyplot as plt

# Wrappers for the Spotify API
import spotipy
from spotipy.oauth2 import SpotifyClientCredentials
from spotipy.oauth2 import SpotifyOAuth
import spotipy.util as util

## Data exploration & feature engineering

In [None]:
# Single place to change when the notebook is run outside of Kaggle.
DATA_DIR = '/kaggle/input/spotify-dataset-19212020-160k-tracks'

# Track-level table (artists, name, id, release_date, popularity, audio features).
tracks_df = pd.read_csv(f'{DATA_DIR}/data.csv')
# Table that carries the `genres` column; it is joined to the tracks through `artists`.
genre_df = pd.read_csv(f'{DATA_DIR}/data_w_genres.csv')

In [None]:
tracks_df.head()

In [None]:
genre_df.head()

In [None]:
genre_df.dtypes

The `genres` feature is actually a string made to look like a list. This can be verified by indexing the first element of the first row's value: the result is a single character rather than a whole genre name.

In [None]:
genre_df['genres'].values[0][0]

To work with the list of genres as intended, we must convert the string to a list of strings.

In [None]:
def string_to_list(input_string):
    """Convert the text representation of a list into a real list of strings.

    The CSV stores list-valued columns as text such as ``"['hip hop', 'rock']"``.
    The text is parsed as a Python literal, so items that contain ``", "`` or
    that are wrapped in double quotes (because they contain an apostrophe) are
    recovered intact. Spaces inside every item are then replaced by underscores.

    Parameters
    ----------
    input_string : str
        Text of a bracketed, comma-separated list of quoted strings.

    Returns
    -------
    list of str
        The parsed items with spaces replaced by ``_``. An empty list is
        returned for ``"[]"`` and for values that are not strings (e.g. NaN).
    """
    # Missing values reach this function as floats (NaN); treat them as "no items".
    if not isinstance(input_string, str):
        return []

    try:
        parsed = ast.literal_eval(input_string.strip())
    except (ValueError, SyntaxError, TypeError):
        parsed = None

    if isinstance(parsed, (list, tuple)):
        return [str(item).replace(' ', '_') for item in parsed]

    # Fallback for text that is not a valid Python literal: split manually.
    if len(input_string) == 2:  # Skip empty lists
        return []

    inner = input_string[1:-1]  # Cut off the brackets
    # For each item: normalise spaces, then cut off the surrounding quotes.
    return [item.replace(' ', '_')[1:-1] for item in inner.split(', ')]


# Parse every genre string into a list of genre names.
genre_df['genres_upd'] = genre_df['genres'].map(string_to_list)
# Indexing the first row now yields its first list item instead of a single character.
genre_df['genres_upd'].iloc[0][0]

Similarly, we must convert the `artists` feature of the tracks dataset to a list of names.

In [None]:
# Parse every artists string into a list of artist names.
tracks_df['artists_upd'] = tracks_df['artists'].map(string_to_list)
# First artist of the first track.
tracks_df['artists_upd'].iloc[0][0]

There are also duplicate songs from the same artist that must be removed.

In [None]:
def concatenate_artist_song(row):
    """Build the de-duplication key of a track: first artist followed by title.

    Parameters
    ----------
    row : mapping
        Must provide ``'artists_upd'`` (list of artist names) and ``'name'``
        (track title).

    Returns
    -------
    str
        First artist name immediately followed by the track title. When the
        artist list is empty, only the title is returned.
    """
    artists = row['artists_upd']
    # string_to_list returns an empty list for "[]", so guard the [0] lookup.
    artist = artists[0] if isinstance(artists, (list, tuple)) and artists else ''
    song = row['name']

    # Formatting (instead of `+`) also copes with a title that is not a string.
    return f"{artist}{song}"
    

# Row count before de-duplication; only used to report how many rows were dropped.
original_df_lenght = len(tracks_df)

# Key every track by first artist + title, then keep the first row of each key.
tracks_df['artists_song'] = tracks_df.apply(concatenate_artist_song, axis=1)
tracks_df = tracks_df.drop_duplicates(subset='artists_song')

removed_duplicates = original_df_lenght - len(tracks_df)

print(f"Removed {removed_duplicates} duplicates from the dataset")

Then, it's necessary to explode the artist feature so it can be merged with the genre dataset.

In [None]:
# One row per (track, artist) pair so every artist can be looked up on its own.
artists_exploded = tracks_df[['artists_upd', 'id']].explode('artists_upd')

# `artists_upd` was produced by string_to_list, which replaces spaces with
# underscores. Joining it directly against `genre_df['artists']` would therefore
# only match names that contain no space (assuming that column holds the plain
# artist names -- TODO confirm against the CSV). Both sides are joined on a
# temporary key that applies the same normalisation, then the key is dropped so
# the resulting columns are unchanged.
artists_exploded_merged = (
    artists_exploded
    .assign(_artist_key=artists_exploded['artists_upd'].str.replace(' ', '_', regex=False))
    .merge(
        genre_df.assign(_artist_key=genre_df['artists'].str.replace(' ', '_', regex=False)),
        how='left',
        on='_artist_key',
    )
    .drop(columns='_artist_key')
)

# Remove rows without genre
artists_exploded_merged_nonnull = artists_exploded_merged[artists_exploded_merged['genres_upd'].notna()]

In [None]:
def consolidate_list(genres):
    """Merge several genre lists into one list of unique genres.

    Parameters
    ----------
    genres : iterable of iterables of str
        One genre list per artist of a track.

    Returns
    -------
    list of str
        Every distinct genre exactly once, in sorted order. Sorting makes the
        result identical between runs; the iteration order of a set of strings
        is not.
    """
    # Flatten the per-artist lists and drop duplicates in one step.
    unique_genres = set(itertools.chain.from_iterable(genres))

    return sorted(unique_genres)

# Gather, per track id, the genre lists of all of its artists (a list of lists).
artists_genres_consolidated = (
    artists_exploded_merged_nonnull
    .groupby('id')['genres_upd']
    .apply(list)
    .reset_index()
)

# Collapse each list of lists into a single list of unique genres.
artists_genres_consolidated['consolidates_genre_lists'] = (
    artists_genres_consolidated['genres_upd'].map(consolidate_list)
)

artists_genres_consolidated.head()

In [None]:
# Attach the consolidated genre list to every track. The left join keeps tracks
# that have no genre information; their genre list is left null.
tracks_df = pd.merge(
    tracks_df,
    artists_genres_consolidated[['id', 'consolidates_genre_lists']],
    how='left',
    on='id',
)
tracks_df.head()

Now that we have explored and merged the datasets, it's time to start the **feature engineering** process.

First, we are going to normalize the `float64` features.

In [None]:
def normalize_floats(df, weight=0.2):
    """Min-max scale every ``float64`` column of ``df`` and apply a weight.

    Parameters
    ----------
    df : pandas.DataFrame
        Source frame; only its ``float64`` columns are used.
    weight : float, default 0.2
        Factor the scaled values are multiplied by, i.e. the relative
        importance of these features in the combined feature set.

    Returns
    -------
    pandas.DataFrame
        The ``float64`` columns of ``df``, each rescaled by ``MinMaxScaler``
        and multiplied by ``weight``, on a fresh 0..n-1 index.
    """
    # The index is reset so the result lines up positionally with the other
    # feature frames when they are concatenated column-wise.
    floats = df.select_dtypes(include='float64').reset_index(drop=True)

    scaled_values = MinMaxScaler().fit_transform(floats)

    return pd.DataFrame(scaled_values, columns=floats.columns) * weight

floats_scaled = normalize_floats(tracks_df)

floats_scaled.head()

Then we need to one-hot encode (OHE) the year and popularity features.

In [None]:
def get_year(release_date):
    """Return the part of a release-date string that precedes the first '-'.

    A value without any '-' is returned unchanged.
    """
    return release_date.partition('-')[0]

# Store the release year (as text) taken from the release date.
tracks_df['year'] = tracks_df['release_date'].map(get_year)
tracks_df['year'].head()

In [None]:
# Reduce popularity to its tens bucket (e.g. 57 -> 5) via floor division.
tracks_df['popularity_red'] = tracks_df['popularity'] // 10

tracks_df['popularity_red'].unique()

In [None]:
def ohe_prep(df, column, new_name):
    """One-hot encode ``df[column]``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame that holds the column to encode.
    column : str
        Name of the column to encode.
    new_name : str
        Prefix of the generated columns, which are named ``"<new_name>|<value>"``.

    Returns
    -------
    pandas.DataFrame
        One indicator column per distinct value, on a fresh 0..n-1 index.
    """
    dummies = pd.get_dummies(df[column])
    renamed = dummies.add_prefix(f"{new_name}|")

    return renamed.reset_index(drop=True)

# Each one-hot block is multiplied by a weight that sets its influence on the
# combined feature set.
year_ohe = 0.5 * ohe_prep(tracks_df, 'year', 'year')
popularity_ohe = 0.15 * ohe_prep(tracks_df, 'popularity_red', 'pop')

# Place both blocks side by side.
ohe_features = pd.concat((year_ohe, popularity_ohe), axis='columns')
ohe_features.columns

Finally, we are going to create TF-IDF features off of the artist genres.

TF-IDF works like a weighted one-hot encoding: the weight of each genre is scaled down the more often that genre appears in the dataset. This way, an uncommon genre has a bigger impact on the recommended tracks than a common one.

In [None]:
def tfidf_feature(df):
    """Build TF-IDF features from the genre list of every row.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain a ``'consolidates_genre_lists'`` column whose values are
        lists of genre strings (empty lists are allowed, nulls are not).

    Returns
    -------
    pandas.DataFrame
        One ``"genre|<token>"`` column per genre token, one row per row of
        ``df``, on a fresh 0..n-1 index.
    """
    # Genres are joined with spaces, so whitespace is the only valid separator.
    # The default token pattern would additionally break a genre apart at
    # characters such as '-' or '&' and drop one-letter pieces.
    tfidf = TfidfVectorizer(token_pattern=r'\S+')
    documents = df['consolidates_genre_lists'].apply(
        lambda genres: " ".join(genres)
    )
    tfidf_matrix = tfidf.fit_transform(documents)

    # get_feature_names() was removed in scikit-learn 1.2; use its replacement
    # when available and fall back to the old name on older versions.
    if hasattr(tfidf, 'get_feature_names_out'):
        feature_names = tfidf.get_feature_names_out()
    else:
        feature_names = tfidf.get_feature_names()

    # NOTE: toarray() turns the sparse matrix into a dense rows x genres array.
    tfidf_df = pd.DataFrame(
        tfidf_matrix.toarray(),
        columns=[f"genre|{feature}" for feature in feature_names],
    )

    return tfidf_df

# TF-IDF can't handle NULL values, so anything that is not a list becomes an empty list
tracks_df['consolidates_genre_lists'] = tracks_df['consolidates_genre_lists'].map(
    lambda value: value if isinstance(value, list) else []
)

# NOTE(review): this rebinds `genre_df`, which until now held the table read
# from data_w_genres.csv. Cells above that use that table (e.g. the merge on
# `artists`) will no longer work if re-run after this cell.
genre_df = tfidf_feature(tracks_df)
genre_df.head()

Now that we have all the features we can combine them in a single dataset.

In [None]:
# Put the TF-IDF, scaled-float and one-hot blocks side by side. All three were
# built on a fresh 0..n-1 index, so rows line up by position.
complete_feature_set = pd.concat(
    [genre_df, floats_scaled, ohe_features],
    axis='columns',
)
# The ids are assigned as a plain array so they are matched by position, not by index label.
complete_feature_set['id'] = tracks_df['id'].to_numpy()

complete_feature_set.head()