From d3e615ae42ccb10d73dfa4492b404a8bc1a6bd21 Mon Sep 17 00:00:00 2001 From: sqrx-mckl Date: Mon, 23 Mar 2020 01:50:33 +0100 Subject: [PATCH] WIP: some debug up until genre class I added too a new way to connect to the OAuth API https://github.com/plamere/spotipy/issues/263 --- lib_spotify_app/adapter_spotipy_api.py | 30 ++- lib_spotify_app/facade_enrich_artist_genre.py | 185 +++++++++++++----- lib_spotify_app/util.py | 15 +- spotify_app_script.py | 34 +++- 4 files changed, 199 insertions(+), 65 deletions(-) diff --git a/lib_spotify_app/adapter_spotipy_api.py b/lib_spotify_app/adapter_spotipy_api.py index 3183975..508c5ce 100644 --- a/lib_spotify_app/adapter_spotipy_api.py +++ b/lib_spotify_app/adapter_spotipy_api.py @@ -47,10 +47,10 @@ def __init__(self, credential_fp:Path, scope:List[str], cache_path:Path): self.sp:spotipy.Spotify = None - def refresh_token(self): - pass - def _get_token(self): + NotImplementedError('''deprecated method + refer to https://github.com/plamere/spotipy/issues/263''') + # token need to be refreshed try: self.token_code = spotipy.util.prompt_for_user_token( @@ -70,9 +70,27 @@ def _get_token(self): with open(self.credential_fp, 'w') as file: json.dump(self.credential, file) + def open_session(self): - self._get_token() - self.sp = spotipy.Spotify(auth=self.credential['token']['code']) + """ + Open a session with OAuth + refer to + """ + + ## Depreciated + # self._get_token() + # self.sp = spotipy.Spotify(auth=self.credential['token']['code']) + + # new method as per + + self.sp=spotipy.Spotify( + auth_manager=spotipy.SpotifyOAuth( + client_id=self.credential["client_id"], + client_secret=self.credential["client_secret"], + redirect_uri="http://localhost/", + scope=self.scope, + cache_path=self.cache_path) + ) def query_liked_songs(self, tracks_count:int=-1, limit:int=50): result_liked_songs = None @@ -94,7 +112,7 @@ def query_liked_songs(self, tracks_count:int=-1, limit:int=50): offset = offset + limit # check condition if there is one - if tracks_count > 0 and offset > tracks_count: + if tracks_count > 0 and offset >= tracks_count: break return result_liked_songs diff --git a/lib_spotify_app/facade_enrich_artist_genre.py b/lib_spotify_app/facade_enrich_artist_genre.py index 424857b..348cc0c 100644 --- a/lib_spotify_app/facade_enrich_artist_genre.py +++ b/lib_spotify_app/facade_enrich_artist_genre.py @@ -3,6 +3,7 @@ import spotipy import pandas as pd from typing import Dict, List +import seaborn as sns from sklearn.preprocessing import MultiLabelBinarizer from scipy.spatial.distance import yule @@ -16,38 +17,49 @@ from .util import _enrich_by_feature class facade_enrich_artist_genre: + """ + Class which handle the artists data and most particurarly the genre information from the artist. + https://spotipy.readthedocs.io/en/2.9.0/#spotipy.client.Spotify.recommendation_genre_seeds + + This class take care of the processing of the genres: + * cleaning the "useless" genre (_indie) + * remove outliers + * cluster into "super_genre" + * name the "super_genre" + * enrich initial data with the newly added "super_genre" + + Clustering is done by 2 algorithms: + * DBSCAN with OPTICS to detect the outliers + * hierarchical clustering to create the "super-genre" + + The metric used for the genre combination is the Yule distance: + https://docs.scipy.org/doc/scipy/reference/generated/scipy.spatial.distance.yule.html + It is a binary array disambleance distance. It uses the amount of time a True is encountered at the same index for both arrays + + Parameters + ---------- + method : str + method for Hierarchical Clustering, see scipy.cluster.hierarchy.linkage + by default 'weighted' + feature : pd.DataFrame + feature from artist request on Spotify in a DataFrame + df_genre : pd.DataFrane + encoded genre with as rows the artists, and columns the genre from those artists + _artist : pd.Series + artist data to be transformed + _mlb : MultiLabelBinarizer + used to encode the artist genre + """ + def __init__(self, artists:pd.Series): - """ - Class which handle the artists data and most particurarly the genre information from the artist. - https://spotipy.readthedocs.io/en/2.9.0/#spotipy.client.Spotify.recommendation_genre_seeds - - This class take care of the processing of the genres: - * cleaning the "useless" genre (_indie) - * remove outliers - * cluster into "super_genre" - * name the "super_genre" - * enrich initial data with the newly added "super_genre" - - Clustering is done by 2 algorithms: - * DBSCAN with OPTICS to detect the outliers - * hierarchical clustering to create the "super-genre" - - The metric used for the genre combination is the Yule distance: - https://docs.scipy.org/doc/scipy/reference/generated/scipy.spatial.distance.yule.html - It is a binary array disambleance distance. It uses the amount of time a True is encountered at the same index for both arrays - - Parameters - ---------- - artists : pd.Series - artists data to be transformed - """ - self._artists = artists - self._mlb = MultiLabelBinarizer() + self._artist:pd.Series = artists + self._mlb:MultiLabelBinarizer = MultiLabelBinarizer() + self.method:str = 'weighted' @property - def genre(self)->pd.Series: + def genre(self)->pd.Index: """ Retrieve the genres from the initial data @@ -59,44 +71,123 @@ def genre(self)->pd.Series: return self.df_genre.columns - def enrich_artists(self, sp:spotipy.Spotify)->pd.DataFrame: - df = _enrich_by_feature(self._artists, - w=50, - f=sp.artists) - + def request_artist_features(self, sp:spotipy.Spotify): + self.feature:pd.DataFrame = _enrich_by_feature(self._artist, + w=50, + f=sp.artists) + # DataFrame with genre as a column and each row is an artist - genre_col = f'{self._artists.name}.enrich.genres' self.df_genre = pd.DataFrame( - self._mlb.fit_transform(df[genre_col]), + self._mlb.fit_transform(self.feature['genres']), columns=self._mlb.classes_, - index=self._artists.index + index=self._artist.index ) - - return df + def clean_useless_genre(self): """ spotify contains strange genre such as "alabama_indie" which are not useful for our purpose. As such this method get rids of all of them """ mask = self.df_genre.columns.str.contains(r'genre_\w+ indie') - self.df_genre = self.df_genre.loc[:,~mask] + self.df_genre:pd.DataFrame = self.df_genre.loc[:,~mask] + + + def cluster_genre(self): + """ + Clustering is done by 2 algorithms: + * DBSCAN with OPTICS to detect the outliers + * hierarchical clustering to create the "super-genre" + + The metric used for the genre combination is the Yule distance: + https://docs.scipy.org/doc/scipy/reference/generated/scipy.spatial.distance.yule.html + It is a binary array disambleance distance. It uses the amount of time a True is encountered at the same index for both arrays + """ + + from sklearn.cluster import OPTICS + + df_genre = self.df_genre.transpose() + self.cl_optics = OPTICS( + max_eps=1, + min_samples=2,# to avoid lonely genre, goal is to have super-genre + metric='yule', + cluster_method='dbscan', + algorithm='brute',# only valid algorithm for this metric + n_jobs=-2 + ) + self.cl_optics = self.cl_optics.fit_predict(df_genre) - def cluster_genre(self, method:str='average'): - self.linkage = linkage(self.df_genre, - method=method, - metric='yule') + # remove the outlier cluster + self.cl_linkage = linkage(df_genre, + method=self.method, + metric='yule') + + def plot_clustermap(self): + plt.figure() + g = sns.clustermap(df_genre, + row_linkage=self.cl_linkage, + col_cluster=False, + yticklabels=True) + g.fig.suptitle(self.method) + + def plot_dendrogram(self): dendrogram( - self.linkage, + self.cl_linkage, orientation='left', labels=self.genre.to_list() ) + def create_super_genre(self): - import re from itertools import chain - from collections import Counter - from statistics import mode - pass \ No newline at end of file + + + pass + + + def enrich_artist_genre(self, df:pd.DataFrame)->pd.DataFrame: + pass + + +@staticmethod +def _split_word(x:str)->List[str]: + """ + split a string as a list of words + + Arguments: + x {str} -- string to split + + Returns: + List[str] -- list of words (as strngs) + """ + import re + + if x is np.nan: + return [] + x = x.replace('genre_', '') + return re.findall(r"[\w']+", x) + + +@staticmethod +def _robust_str_mode(x:List[str], sep:str='-')->str: + """ + robust method to calculate the mode of a list of string, returns a concatenate string in case of equal number of counter + + Arguments: + x {List[str]} -- list of string to calcualte the mode + sep {str} -- string used to concatenate mutliple string in case of draw + + Returns: + [str] -- mode word + """ + from statistics import mode + + try: + return mode(x) + except: + vc = pd.Series(x).value_counts() + x_to_join = vc.index[vc == vc.max()].values + return '-'.join(x_to_join) + diff --git a/lib_spotify_app/util.py b/lib_spotify_app/util.py index 57a0d15..2f8d732 100644 --- a/lib_spotify_app/util.py +++ b/lib_spotify_app/util.py @@ -51,7 +51,7 @@ def json_list2dict(d:Dict)->Dict: return d -def normalize_request(request)->pd.DataFrame: +def normalize_request(_request)->pd.DataFrame: """ transform the output of a request into a DataFrame @@ -67,8 +67,14 @@ def normalize_request(request)->pd.DataFrame: """ # some request gives back a strange dict with key the name of the # request and values the lists output - if isinstance(request, dict): - request = list(request.values())[0] + if isinstance(_request, dict) and 'items' in _request.keys(): + request = _request['items'] + elif isinstance(_request, dict) \ + and len(_request.keys()) == 1 \ + and isinstance(_request[list(_request.keys())[0]], list): + request = _request[list(_request.keys())[0]] + else: + request = _request # if there is multilple request inside the request (like a list). The # output is a list, else is a dict @@ -134,4 +140,5 @@ def enrich_df_by_feature(df:pd.DataFrame, col:str, f, w:int)->pd.DataFrame: df_enriched = _enrich_by_feature(df[col], f=f, w=w) df_enriched = df_enriched.add_prefix(f'{col}.') - return df.join(df_enriched, on=col) \ No newline at end of file + return df.join(df_enriched, on=col) + \ No newline at end of file diff --git a/spotify_app_script.py b/spotify_app_script.py index 4a61979..b61755b 100644 --- a/spotify_app_script.py +++ b/spotify_app_script.py @@ -15,7 +15,9 @@ import lib_spotify_app as lib import pandas as pd +pd.reset_option() pd.set_option('max_columns', None) +pd.reset_option('max_rows') from typing import Dict, List, Union from copy import deepcopy @@ -68,22 +70,38 @@ sharey=True, sharex=False) +#%% standard data enrichment + +def data_enrich(df:pd.DataFrame, col:str='id')->pd.DataFrame: + return lib.enrich_df_by_feature(df, + col=col, + f=sp_adapter.sp.audio_features, + w=100) + + # %% df_toptracks = lib.normalize_request( sp_adapter.sp.current_user_top_tracks(limit=50) -) -df_toptracks = lib.enrich_df_by_feature(df_toptracks, - col='id', - f=sp_adapter.sp.audio_features, - w=100) +).pipe(data_enrich, col='id') #%% -df_likedsong = sp_adapter.query_liked_songs(tracks_count=200) +df_likedsong = lib.normalize_request( + sp_adapter.query_liked_songs(tracks_count=200) +).pipe(data_enrich, col='track.id') # %% -enrich_genre = lib.facade_enrich_artist_genre( - artists=df_toptracks +genre_toptracks = lib.facade_enrich_artist_genre( + artists=df_toptracks['artists.0.id'] ) + + +# %% + +genre_toptracks.request_artist_features(sp_adapter.sp) +genre_toptracks.clean_useless_genre() +genre_toptracks.cluster_genre() + +# %%