In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file in it.
for root_dir, _subdirs, file_names in os.walk('/kaggle/input'):
    for file_name in file_names:
        print(os.path.join(root_dir, file_name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

# **Introduction**

Hello there, this is a (semi-)quick look at how one can use [this Spotify audio features dataset I created](https://www.kaggle.com/siropo/spotify-multigenre-playlists-data) to train a basic recommender system for your own music library! 
While I won't provide access to my private music library (for now...), I'll be leaving the code I used to scrape my personal music data (and the global playlist data) in the appendix section at the end of this notebook, so be sure to give it a look if you'd like to try this with your own track list!

# Importing the dataset(s)

In [None]:
# General music data from various genres, gathered from official Spotify
# playlists available in my region. Each genre CSV is read in turn and the
# resulting frames are unpacked into one DataFrame per genre.
blues_df, rock_df, metal_df, pop_df, indie_df, alt_df, hiphop_df = (
    pd.read_csv(genre_csv)
    for genre_csv in (
        "../input/spotify-multigenre-playlists-data/blues_music_data.csv",
        "../input/spotify-multigenre-playlists-data/rock_music_data.csv",
        "../input/spotify-multigenre-playlists-data/metal_music_data.csv",
        "../input/spotify-multigenre-playlists-data/pop_music_data.csv",
        "../input/spotify-multigenre-playlists-data/indie_alt_music_data.csv",
        "../input/spotify-multigenre-playlists-data/alternative_music_data.csv",
        "../input/spotify-multigenre-playlists-data/hiphop_music_data.csv",
    )
)

# Personal data gathered from my own music library; its 'type' column is
# dropped right after loading.
my_df = pd.read_csv(
    "../input/my-spotify-music-library/train_music_data.csv"
).drop(columns='type')

In [None]:
#useful imports
from matplotlib import pyplot as plt
import seaborn as sns; sns.set()

# **Some insights** - Data Analysis

One of the very first things I wanted to do with my data was to check which of the bands I like are the loudest/hypest/most energetic, because that's my most listened-to "genre" of music.

In [None]:
# Top 20 artists in my library ranked by the mean "energy" of their tracks.
# Artists with 10 or fewer saved songs (about one LP's worth) are left out:
# with so few tracks the mean of an audio feature has high variance.
MIN_TRACKS = 10

# One pass over the library: track count and mean energy per artist.
artist_energy = my_df.groupby('Artist Name')['energy'].agg(n_tracks='size', energy='mean')

# Last expression of the cell, so the resulting one-column table is displayed.
(
    artist_energy.loc[artist_energy['n_tracks'] > MIN_TRACKS, ['energy']]
    .sort_values(by='energy', ascending=False)
    .head(20)
)

Well, if you know any of these bands, you are probably not surprised, as all of these artists usually play pretty [energetic](https://www.youtube.com/watch?v=tuK6n2Lkza0&ab_channel=Jet) stuff.

Now, let's visualize my library's tendencies using some audio features - specifically the "energy" and "danceability" features - to create a neat 2-D scatter plot. The red cross marks the audio features of the "average popular song", as a frame of reference:

In [None]:
# Set a larger default figure size for every matplotlib figure created after this cell.
plt.rcParams["figure.figsize"] = (12,10)

In [None]:
# Mean energy and danceability of the pop playlist: the "average popular song"
# that is drawn as a red cross for reference.
average_noise = pop_df['energy'].mean()
average_danceability = pop_df['danceability'].mean()

# One point per track of my library, danceability on x and energy on y.
fig, ax = plt.subplots()
ax.scatter(my_df['danceability'], my_df['energy'], alpha=0.75)
ax.axhline(y=average_noise, color='r')
ax.axvline(x=average_danceability, color='r')
ax.set_title("Energy as a function of Danceability - from my music library")
ax.set_xlabel("Danceability")
ax.set_ylabel("Energy")
plt.show()

We can see from this visualization that I usually favor tracks with very high energy and pretty low danceability (compared to your average "popular song"). 
We can use this insight later on to draw some conclusions about my general music taste...

In [None]:
'''#reset default fig size
plt.rcParams['figure.figsize'] = plt.rcParamsDefault['figure.figsize']'''

Next, I'm gonna take my data and compare the "average song" from my library to those from different genres:

In [None]:
my_df.columns

In [None]:
# Mean of each audio feature, drawn as one line per genre plus my own library.
feature_columns = ['danceability','energy','loudness','speechiness','acousticness','valence']
for genre_frame in (pop_df, rock_df, alt_df, indie_df, hiphop_df, metal_df, my_df):
    genre_frame[feature_columns].mean().plot(legend=True)

# Labels are listed in the same order the lines were drawn above.
plt.legend(["pop","rock","alternative","indie","hiphop","metal","my library"])
plt.title("Different genres across different audio features")


While discerning minute differences between the genres might be difficult, what we CAN clearly see is that songs from my library tend to be much louder than their peers, while also topping the energy ranks and coming in last at danceability. Furthermore, we can clearly see that my songs' features are very **very** similar to those of the metal genre - and that very much surprised me, as I barely listen to any metal at all.

Some other cute stuff I've noticed:
- the hiphop genre leads the pack in terms of speechiness, which makes sense overall.
- metal (and rock, to some extent) tend to have lower danceability and acousticness - not surprising considering the abundance of electric guitars and that sweet distortion.
- indie seems to be the 'chillest' genre overall, with the lowest loudness score and a low energy score as well.

Lets remove the "noise" and focus solely on my library and the metal library: 

In [None]:
# Same mean-per-feature plot as before, restricted to the metal playlist and my library.
compared_features = ['danceability','energy','loudness','speechiness','acousticness','valence']
for compared_frame in (metal_df, my_df):
    compared_frame[compared_features].mean().plot(legend=True)

# Labels are listed in the same order the lines were drawn above.
plt.legend(["metal","my library"])
plt.title("Different genres across different audio features")

...And now we can see the differences more clearly, including the changes in energy, danceability, acousticness and valence ("happiness measure") across the two groups. 

However, it is a bit foreboding how my musical taste is feature-similar to a genre I generally do not like all that much...

Now let's try to check for some audio features dependencies using a heatmap correlation matrix: 

In [None]:
plt.figure(figsize = (16, 6))

# Stack every genre playlist and my own library into one frame with a single
# flat concat (pd.concat accepts the whole list at once), then remove rows
# that are exact duplicates.
full_dataset = pd.concat(
    [blues_df, rock_df, metal_df, indie_df, alt_df, pop_df, hiphop_df, my_df]
).drop_duplicates()

# Numeric columns whose pairwise correlations are shown in the heatmap.
correlation_columns = ['Popularity','danceability',
       'energy', 'key', 'loudness', 'mode', 'speechiness', 'acousticness',
       'instrumentalness', 'liveness', 'valence', 'tempo']
sns.heatmap(full_dataset[correlation_columns].corr(),annot=True)

The two clearest dependencies are valence-danceability and loudness-energy. The former makes sense to me, since happy songs are more fun to dance to, and the latter further supports the phenomenon we witnessed when comparing the cross-genre audio features, where we noticed that the loudest genres (and my library) tend to be more energetic as well.

Similarly, we detect a clear negative correlation between acousticness and both energy and loudness.

# **Building some models**

So, I started this little project with a pretty obvious goal - design a track recommender system, using my music library as a train set.
Being a data science student, the first thing that came to mind was to build a binary classification model - using a train set filled with songs I like **and** dislike. While this approach may require more data than other, more realistic/feasible approaches (that may operate more smoothly without a list of disliked songs), it's also relatively simple and intuitive to anyone who has tried basic ML before. I will update this notebook when I have the time to try out some other approaches I had in mind.

# **Approach 1 - Binary Classification**
Assuming I have my own library AND a list of songs I don't like, I can use regular ML/NN algorithms to try and predict which songs from the remainder of Spotify's featured tracks I may like the most:

In [None]:
def del_common_rows(df1,df2,decider):
    """
    Remove from df1 every row whose `decider` value belongs to a row shared with df2.

    Shared rows are found with an inner merge of df2 and df1 on all the columns
    the two frames have in common. The `decider` values of those shared rows are
    then used as a block-list: every df1 row carrying one of these values is dropped.

    :param df1: DataFrame to filter
    :param df2: DataFrame whose rows should not remain in df1
    :param decider: name of the column used to identify the rows to drop
    :returns a DataFrame with the surviving rows of df1 (original index labels kept)
    """
    shared_rows = df2.merge(df1, how='inner')
    blocked_values = shared_rows[decider].tolist()
    keep_mask = ~df1[decider].isin(blocked_values)
    return df1.loc[keep_mask]

To make things simpler the first time around, and considering my general taste revolves around rock music, I automatically marked every "rocky" track (tracks from rock/blues/metal/alt/indie) as liked, and automatically disliked every pop/hiphop track that's in my dataset.

In [None]:
# Dislike pile: every pop / hiphop track, minus the ones that also appear in my
# library (those are tracks I actually like).
pop_dis_df = del_common_rows(pop_df,my_df,"Track Name")
hiphop_dis_df = del_common_rows(hiphop_df,my_df,"Track Name")
dislikes = pd.concat([pop_dis_df, hiphop_dis_df]).assign(like=0)

# Like pile: every "rocky" playlist, minus the tracks already in my library.
# .assign returns a new frame, so the label is never written into the .loc
# slice that del_common_rows returns (which raises SettingWithCopyWarning).
likes = pd.concat([blues_df, rock_df, metal_df, indie_df, alt_df])
likes = del_common_rows(likes,my_df,"Track Name").assign(like=1)

# Everything in my own library is a liked track.
my_df['like'] = 1

In [None]:
# The playlist of origin is not used as a feature in this approach, so it is
# removed from both piles.
likes, dislikes = (pile.drop(columns='Playlist') for pile in (likes, dislikes))

In [None]:
from sklearn.model_selection import train_test_split

# Split the dislike pile into a training share and a share that goes into the
# test pool. The seed is fixed (same value the models use) so that the split,
# and therefore the recommendations, are identical on every run of the notebook.
dislikes_train,dislikes_test = train_test_split(dislikes, random_state=1)

In [None]:
# Training set: my whole library stacked on top of the training share of the
# dislike pile, with exact duplicate rows removed.
train = pd.concat([my_df, dislikes_train])
train = train.drop_duplicates()

In [None]:

# Test pool: the like pile plus the held-out share of the dislike pile, with
# exact duplicate rows removed. The label column is kept aside in true_labels
# and removed from the frame that is later fed to the models.
labeled_test = pd.concat([likes, dislikes_test]).drop_duplicates()
true_labels = labeled_test['like']
test = labeled_test.drop(columns='like')

In [None]:
#Importing some ML packages
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_validate,GridSearchCV
from sklearn.ensemble import RandomForestClassifier,GradientBoostingClassifier
from tensorflow import keras
from tensorflow.keras import layers

#Tried using keras NN, but the results were not impressive enough overall

In [None]:
# Columns that are removed before modelling (names, ids, URLs and a few
# features that are not used as model inputs).
to_drop = [
    'Artist Name', 'Track Name', 'Genres', 'key', 'mode', 'id', 'uri',
    'track_href', 'analysis_url', 'duration_ms', 'time_signature',
]

# Target vector and feature matrix for training; the label itself must not
# remain among the features.
y = train['like']
X = train.drop(columns=to_drop + ['like']).copy()

# The test frame no longer has the label column, so only to_drop is removed.
X_test = test.drop(columns=to_drop).copy()

In [None]:
#using grid CV to determine best randomforest parameters
'''param_grid = {
    'criterion': ['gini','entropy'],
    'max_depth': [1,2,3,4,10,15],
     'min_samples_leaf': [3, 5,10,20,30],
     'min_samples_split': [4, 8, 10, 12],
     'n_estimators': [3,5,10,15]
}

rf = RandomForestClassifier(random_state=1)

grid_search_forest = GridSearchCV(estimator = rf, param_grid = param_grid, 
                          cv = 5, n_jobs = -1, verbose = 2)

grid_search_forest.fit(X,y)
grid_search_forest.best_params_'''

In [None]:
# Applying same logic for gboost hyperparameters.
'''param_grid = {
    'loss': ['deviance','exponential'],
    'learning_rate': [0.05,0.1,0.15,0.175,0.2],
     'min_samples_leaf': [3, 5,10,20,30],
     'min_samples_split': [4, 8, 10, 12],
     'n_estimators': [10,50,100]
}

gb = GradientBoostingClassifier(random_state=1)

grid_search_gboost = GridSearchCV(estimator = gb, param_grid = param_grid, 
                          cv = 5, n_jobs = -1, verbose = 2)
grid_search_gboost.fit(X,y)
grid_search_gboost.best_params_'''

In [None]:
log_model = LogisticRegression(max_iter=10000)

# Parameters found using Grid-CV

forest_model = RandomForestClassifier(
    criterion='entropy',
    max_depth=15,
    min_samples_leaf=3,
    min_samples_split=12,
    n_estimators=15,
    random_state=1,
)

# The grid search selected loss='deviance', which is the estimator's default
# loss. Recent scikit-learn releases renamed it to 'log_loss' and reject the
# old spelling, so the argument is left out: the default gives the same model
# on both old and new releases.
gboost_model = GradientBoostingClassifier(
    learning_rate=0.1,
    min_samples_leaf=30,
    min_samples_split=4,
    n_estimators=100,
    random_state=1,
)


In [None]:
# Cross-validate the random forest on the training set (cv=10) and report the mean fold score.
cv_Err_forest = cross_validate(forest_model, X, y, cv=10)
forest_cv_mean = cv_Err_forest["test_score"].mean()
print("Average Cross Validation score on the training set with Random Forests is " + str(forest_cv_mean))

In [None]:
# Cross-validate gradient boosting on the training set (cv=10) and report the mean fold score.
cv_Err_gboost = cross_validate(gboost_model, X, y, cv=10)
gboost_cv_mean = cv_Err_gboost["test_score"].mean()
print("Average Cross Validation score on the training set with Gradient Boosting is " + str(gboost_cv_mean))

In [None]:
# Cross-validate logistic regression on the training set (cv=10) and report the mean fold score.
cv_Err_log = cross_validate(log_model, X, y, cv=10)
log_cv_mean = cv_Err_log["test_score"].mean()
print("Average Cross Validation score on the training set with Logistic Regression is " + str(log_cv_mean))

(Seems like Gboost gives us the best result so far, however the differences between the models are relatively small, so I decided to use the 3 of them and check which model gives the best predictions overall).

In [None]:
# Fit all three classifiers on the full training set.
for estimator in (log_model, forest_model):
    estimator.fit(X, y)
# Kept as the last expression so the cell still displays the fitted gradient boosting model.
gboost_model.fit(X, y)

In [None]:
# Class probabilities for every track of the test pool, one array per model.
predictions_forest, predictions_gboost, predictions_log = (
    fitted_model.predict_proba(X_test)
    for fitted_model in (forest_model, gboost_model, log_model)
)

# Evaluating results

Now that we have our results, let us see how the models did:

In [None]:
def get_top_k_results(predictions,model,k,data=None,positive_label=1):
    """
    Return the k candidate tracks with the highest predicted "like" probability.

    :param predictions: 2-D array of class probabilities (one row per candidate,
        one column per class), as returned by model.predict_proba
    :param model: fitted classifier; its classes_ attribute tells which column
        of predictions belongs to which label
    :param k: number of rows to return
    :param data: DataFrame whose rows line up positionally with predictions.
        Defaults to the notebook-level `test` frame, so existing three-argument
        calls keep working
    :param positive_label: label whose probability is used for the ranking
    :returns a DataFrame with the k best rows of data, most probable first,
        keeping the original index labels and column dtypes
    :raises ValueError: if positive_label is not one of model.classes_, or if
        predictions and data do not have the same number of rows
    """
    if data is None:
        data = test  # notebook-level candidate pool used by the existing callers

    scores = np.asarray(predictions)
    if len(scores) != len(data):
        raise ValueError(
            "predictions has %d rows but data has %d" % (len(scores), len(data))
        )

    # Locate the probability column of the positive class instead of assuming
    # it is always the second one.
    class_labels = list(model.classes_)
    if positive_label not in class_labels:
        raise ValueError("label %r is not in model.classes_" % (positive_label,))
    positive_column = class_labels.index(positive_label)

    # Ascending argsort reversed -> positions ordered from most to least probable.
    ranked_positions = np.argsort(scores[:, positive_column])[::-1]

    # A single positional selection keeps dtypes and avoids rebuilding the
    # frame one row at a time.
    return data.iloc[ranked_positions[:k]]


In [None]:
# Top-15 recommendations according to each of the three models.
log_top_k, forest_top_k, gboost_top_k = (
    get_top_k_results(probabilities, fitted_model, 15)
    for probabilities, fitted_model in (
        (predictions_log, log_model),
        (predictions_forest, forest_model),
        (predictions_gboost, gboost_model),
    )
)

In [None]:
log_top_k 

In [None]:
forest_top_k

In [None]:
gboost_top_k

Well, it seems like the audio features' similarity between my library and the metal track list proved itself, because apparently I'm a metalhead now, with most of the top-15 recommended songs being from the metal genre. While there are some offerings from other genres such as alt, blues and funk rock, most of it is the usual deathcore stuff I apparently enjoy so much...

**Take 2 - dropping the metal dataframe from the 'liked' group**

Lesson learned - quickly running the exact same code as take 1, but putting the entirety of the metal dataset in the dislike pile this time. Not only do I not like metal all that much, its similarities to what I **do** like seemed to confuse the models a whole lot. 
Let's see how well we fare now.

In [None]:
# Take 2: same pipeline as take 1, except that the metal playlist moves from
# the like pile to the dislike pile.

# Dislike pile: pop / hiphop tracks that are not in my library, plus the whole
# metal playlist.
# NOTE(review): metal tracks that are also in my library are not filtered out
# here, so such a track would appear in train with both labels - confirm this
# is intended.
pop_dis_df = del_common_rows(pop_df,my_df,"Track Name")
hiphop_dis_df = del_common_rows(hiphop_df,my_df,"Track Name")
dislikes = (
    pd.concat([pop_dis_df, metal_df, hiphop_dis_df])
    .assign(like=0)
    .drop(columns='Playlist')
)

# Like pile: the remaining "rocky" playlists, minus tracks already in my library.
# .assign returns a new frame, so no label is written into the slice returned
# by del_common_rows (which raises SettingWithCopyWarning).
likes = pd.concat([blues_df, rock_df, indie_df, alt_df])
likes = (
    del_common_rows(likes,my_df,"Track Name")
    .assign(like=1)
    .drop(columns='Playlist')
)
my_df['like'] = 1

# The dislike pile changed, so it has to be split again. Without this line the
# stale dislikes_train / dislikes_test from take 1 would be reused and the
# metal tracks would never reach the training data.
dislikes_train,dislikes_test = train_test_split(dislikes, random_state=1)

train = pd.concat([my_df,dislikes_train]).drop_duplicates()
test = pd.concat([likes,dislikes_test]).drop_duplicates()
true_labels = test['like']
test = test.drop(columns='like')

# Columns removed before modelling; the label must not stay among the features.
to_drop=['Artist Name','Track Name','Genres','key','mode','id','uri','track_href','analysis_url','duration_ms','time_signature']
y = train['like']
X = train.drop(columns=to_drop + ['like']).copy()
X_test = test.drop(columns=to_drop).copy()

log_model = LogisticRegression(max_iter=10000)

# Parameters found using Grid-CV

forest_model = RandomForestClassifier(
    criterion='entropy',
    max_depth=15,
    min_samples_leaf=3,
    min_samples_split=12,
    n_estimators=15,
    random_state=1,
)

# loss='deviance' (the value picked by the grid search) is the default loss;
# recent scikit-learn releases renamed it to 'log_loss' and reject the old
# spelling, so the argument is omitted to stay valid on both.
gboost_model = GradientBoostingClassifier(
    learning_rate=0.1,
    min_samples_leaf=30,
    min_samples_split=4,
    n_estimators=100,
    random_state=1,
)

log_model.fit(X,y)
forest_model.fit(X,y)
gboost_model.fit(X,y)

predictions_forest = forest_model.predict_proba(X_test)
predictions_gboost = gboost_model.predict_proba(X_test)
predictions_log = log_model.predict_proba(X_test)

log_top_k = get_top_k_results(predictions_log,log_model,15)
forest_top_k = get_top_k_results(predictions_forest,forest_model,15)
gboost_top_k = get_top_k_results(predictions_gboost,gboost_model,15)

In [None]:
log_top_k

In [None]:
forest_top_k

In [None]:
gboost_top_k

Far less metal, that's for sure!

In [None]:
# Saving the final recommendations: one CSV per model, without the index column.
for recommendations, output_file in (
    (log_top_k, "log_recommendations_v1.csv"),
    (forest_top_k, "forest_recommendations_v1.csv"),
    (gboost_top_k, "gboost_recommendations_v1.csv"),
):
    recommendations.to_csv(output_file, index=False)

# **Summing up, and closing thoughts (for now)**

So, this model is pretty basic overall, since it relied solely on numeric variables and didn't include any feature engineering beforehand, thus we can probably improve it by doing stuff like:
- Encoding the 'Genres' feature to a group of binary variables, to help our model differentiate genres
- Introducing a "score" metric to boost tracks that come from the same Spotify playlist as one of your liked songs from the train set
- Using more complex models, such as neural networks using keras and TF.

However, this seems like a good starting point.
Do check if you can improve your own recommendations with more complex models though.

That's it from me, thanks for sticking around! I would love to answer any questions that pop up (and accept any criticism)!

# **Appendix- code to get your own music data, playlist-wise**

Before we dive into the code, here are a couple of things you should know:
- This code is written in python, which uses the spotipy library to access Spotify's data through an **App** that uses your own Spotify user credentials. 
- Here's a [good tutorial](https://towardsdatascience.com/get-your-spotify-streaming-history-with-python-d5a208bbcbd3#:~:text=Getting%20the%20data,but%20it's%20usually%20much%20faster) to get you started with creating your app at the Spotify developers website. Didn't follow all of it, since some of the spotipy code is deprecated, but this guide was great at helping me set up my Spotify developers account and app.
- Once you have an app running, you should plug your client ID and client secret into this code and be good to go! Be sure to check [Spotipy's documentation](https://spotipy.readthedocs.io/en/2.18.0/) if there's something unclear.
- This code accesses your music library **Playlist wise**, so be sure any song you want to actually access is in one of your playlists.
- Would recommend running this with a debugger if confused, as the Json structure of this is quite trippy, but your mileage may vary.

Helper function:

In [None]:
'''def create_music_dataset(playlists, sp,k):
    """
    Helper function
    :param playlists- playlists to draw tracks from
    :param sp - spotipy object to manage stuff
    :k - constant to determine how many songs I want to scrape from each playlist.
    100 songs per k
    :returns a song list along with their audio features
    """
    track_art_names = []
    while playlists:
        for i, playlist in enumerate(playlists['items']):
            print("%4d %s" % (i + 1 + playlists['offset'], playlist['name']))
            for j in range(k):
                playlist_tracks = sp.playlist_items(playlist['id'], offset=j * 100)
                items = playlist_tracks['items']
                for item in items:
                    track = item['track']
                    try:
                        artist_name = track['artists'][0]['name']
                        artist_id = track['artists'][0]['id']
                        track_name = track['name']
                        orig_playlist = playlist['name']
                        track_art_names.append((artist_name, track_name, artist_id,orig_playlist))
                    except:
                        print("Playlist Error")
                        continue
        if playlists['next']:
            playlists = sp.next(playlists)['playlists']
        else:
            playlists = None

    ids = []
    song_list = []
    cols = []
    unique_tracks = list(set(track_art_names))
    for j, data_tup in enumerate(unique_tracks):
        try:  # Avoid stoppin
            track_feats = sp.search(q='artist:' + data_tup[0] + ' track:' + data_tup[1], type='track')
            track_id = track_feats['tracks']['items'][0]['id']
            track_pop = sp.track(track_id)['popularity']
            artist = sp.artist(data_tup[2])
            features = sp.audio_features(track_id)
            del features[0]['type']
        except:  # The track_feats search query returns an empty statement sometimes
            print("err")
            continue
        ids.append(track_id)
        artist_genre = artist['genres']
        if not cols:  # First iteration
            cols = ['Artist Name', 'Track Name', 'Popularity', 'Genres','Playlist']
            cols.extend(features[0].keys())
        song_row = [data_tup[0], data_tup[1], track_pop, artist_genre,data_tup[3]]
        try:  # somehow, some songs have no features
            song_row.extend(features[0].values())
        except:
            continue
        print("Artist = ", data_tup[0], " Song = ", data_tup[1], " Id = ", track_id, " Iter = ", j)
        song_list.append(song_row)

    return song_list, cols'''

The base script - **for scraping data from your library** 

In [None]:
'''import spotipy
from spotipy.oauth2 import SpotifyOAuth
import pandas as pd

def main():
    SPOTIPY_CLIENT_ID = '<YOUR CLIENT ID HERE>'
    SPOTIPY_CLIENT_SECRET = '<YOUR CLIENT SECRET HERE>'
    SPOTIPY_REDIRECT_URI = '<YOUR REDIRECT ID HERE>'
    SCOPE = "user-library-read"

    sp = spotipy.Spotify(auth_manager=SpotifyOAuth(scope=SCOPE, client_id=SPOTIPY_CLIENT_ID,
                                                   client_secret=SPOTIPY_CLIENT_SECRET,
                                                   redirect_uri=SPOTIPY_REDIRECT_URI))
    playlists = sp.current_user_playlists()
    song_list, cols = create_music_dataset(playlists, sp,2)
    df = pd.DataFrame(song_list, columns=cols)
    df.to_csv("train_music_data.csv", header=cols)'''

### 