WIP: some debugging up to the genre class

I also added a new way to connect to the OAuth API (see spotipy-dev/spotipy#263)
sqrx-mckl · Mar 23, 2020 · d3e615a · d3e615a
1 parent 9fb70f8
commit d3e615a
Show file tree

Hide file tree

Showing 4 changed files with 199 additions and 65 deletions.
diff --git a/lib_spotify_app/adapter_spotipy_api.py b/lib_spotify_app/adapter_spotipy_api.py
@@ -47,10 +47,10 @@ def __init__(self, credential_fp:Path, scope:List[str], cache_path:Path):
         self.sp:spotipy.Spotify = None
 
 
-    def refresh_token(self):
-        pass
-
     def _get_token(self):
+        NotImplementedError('''deprecated method
+        refer to https://github.com/plamere/spotipy/issues/263''')
+
         # token need to be refreshed
         try:
             self.token_code = spotipy.util.prompt_for_user_token(
@@ -70,9 +70,27 @@ def _get_token(self):
         with open(self.credential_fp, 'w') as file:
             json.dump(self.credential, file)
 
+
     def open_session(self):
-        self._get_token()
-        self.sp = spotipy.Spotify(auth=self.credential['token']['code'])
+        """
+        Create the spotipy client in ``self.sp``, authenticated through a SpotifyOAuth auth manager.
+        Client id/secret are read from ``self.credential``; see <https://github.com/plamere/spotipy/issues/263>
+        """
+
+        ## Deprecated
+        # self._get_token()
+        # self.sp = spotipy.Spotify(auth=self.credential['token']['code'])
+
+        # new method as per <https://github.com/plamere/spotipy/issues/263>
+
+        self.sp=spotipy.Spotify(
+            auth_manager=spotipy.SpotifyOAuth(
+                client_id=self.credential["client_id"],
+                client_secret=self.credential["client_secret"],
+                redirect_uri="http://localhost/",
+                scope=self.scope,
+                cache_path=self.cache_path)
+        )
 
     def query_liked_songs(self, tracks_count:int=-1, limit:int=50):
         result_liked_songs = None
@@ -94,7 +112,7 @@ def query_liked_songs(self, tracks_count:int=-1, limit:int=50):
             offset = offset + limit
 
             # check condition if there is one
-            if tracks_count > 0 and offset > tracks_count:
+            if tracks_count > 0 and offset >= tracks_count:
                 break
 
         return result_liked_songs

diff --git a/lib_spotify_app/facade_enrich_artist_genre.py b/lib_spotify_app/facade_enrich_artist_genre.py
@@ -3,6 +3,7 @@
 import spotipy
 import pandas as pd
 from typing import Dict, List
+import seaborn as sns
 
 from sklearn.preprocessing import MultiLabelBinarizer
 from scipy.spatial.distance import yule
@@ -16,38 +17,49 @@
 from .util import _enrich_by_feature
 
 class facade_enrich_artist_genre:
+    """
+    Class which handles the artists data and most particularly the genre information from the artist.
+    https://spotipy.readthedocs.io/en/2.9.0/#spotipy.client.Spotify.recommendation_genre_seeds
+
+    This class take care of the processing of the genres:
+        * cleaning the "useless" genre (<country_name>_indie)
+        * remove outliers
+        * cluster into "super_genre"
+        * name the "super_genre"
+        * enrich initial data with the newly added "super_genre"
+
+    Clustering is done by 2 algorithms:
+        * DBSCAN with OPTICS to detect the outliers
+        * hierarchical clustering to create the "super-genre"
+
+    The metric used for the genre combination is the Yule distance:
+    https://docs.scipy.org/doc/scipy/reference/generated/scipy.spatial.distance.yule.html
+    It is a dissimilarity distance between binary arrays. It uses the number of times a True is encountered at the same index in both arrays
+
+    Parameters
+    ----------
+    method : str
+        method for Hierarchical Clustering, see scipy.cluster.hierarchy.linkage
+        by default 'weighted'
+    feature : pd.DataFrame
+        feature from artist request on Spotify in a DataFrame
+    df_genre : pd.DataFrame
+        encoded genre with as rows the artists, and columns the genre from those artists
+    _artist : pd.Series
+        artist data to be transformed
+    _mlb : MultiLabelBinarizer
+        used to encode the artist genre        
+    """
+
 
     def __init__(self, artists:pd.Series):
-        """
-        Class which handle the artists data and most particurarly the genre information from the artist.
-        https://spotipy.readthedocs.io/en/2.9.0/#spotipy.client.Spotify.recommendation_genre_seeds
-
-        This class take care of the processing of the genres:
-            * cleaning the "useless" genre (<country_name>_indie)
-            * remove outliers
-            * cluster into "super_genre"
-            * name the "super_genre"
-            * enrich initial data with the newly added "super_genre"
-
-        Clustering is done by 2 algorithms:
-            * DBSCAN with OPTICS to detect the outliers
-            * hierarchical clustering to create the "super-genre"
-
-        The metric used for the genre combination is the Yule distance:
-        https://docs.scipy.org/doc/scipy/reference/generated/scipy.spatial.distance.yule.html
-        It is a binary array disambleance distance. It uses the amount of time a True is encountered at the same index for both arrays
-        
-        Parameters
-        ----------
-        artists : pd.Series
-            artists data to be transformed
-        """
-        self._artists = artists
-        self._mlb = MultiLabelBinarizer()
+        self._artist:pd.Series = artists
+        self._mlb:MultiLabelBinarizer = MultiLabelBinarizer()
+        self.method:str = 'weighted'
 
 
     @property
-    def genre(self)->pd.Series:
+    def genre(self)->pd.Index:
         """
         Retrieve the genres from the initial data
         
@@ -59,44 +71,123 @@ def genre(self)->pd.Series:
         return self.df_genre.columns
 
 
-    def enrich_artists(self, sp:spotipy.Spotify)->pd.DataFrame:
-        df = _enrich_by_feature(self._artists,
-                               w=50,
-                               f=sp.artists)
-
+    def request_artist_features(self, sp:spotipy.Spotify):
+        self.feature:pd.DataFrame = _enrich_by_feature(self._artist,
+                                                       w=50,
+                                                       f=sp.artists)
+        
         # DataFrame with genre as a column and each row is an artist
-        genre_col = f'{self._artists.name}.enrich.genres'
         self.df_genre = pd.DataFrame(
-            self._mlb.fit_transform(df[genre_col]),
+            self._mlb.fit_transform(self.feature['genres']),
             columns=self._mlb.classes_,
-            index=self._artists.index
+            index=self._artist.index
         )
-
-        return df
+
 
     def clean_useless_genre(self):
         """
         spotify contains strange genre such as "alabama_indie" which are not useful for our purpose. As such this method get rids of all of them
         """
         mask = self.df_genre.columns.str.contains(r'genre_\w+ indie')
-        self.df_genre = self.df_genre.loc[:,~mask]
+        self.df_genre:pd.DataFrame = self.df_genre.loc[:,~mask]
+
+
+    def cluster_genre(self):
+        """
+        Cluster the genres (rows of the transposed ``df_genre``) with 2 algorithms:
+        * DBSCAN with OPTICS to detect the outliers
+        * hierarchical clustering to create the "super-genre"
+
+        The metric used for the genre combination is the Yule distance:
+        https://docs.scipy.org/doc/scipy/reference/generated/scipy.spatial.distance.yule.html
+        It is a dissimilarity distance between binary arrays. It uses the number of times a True is encountered at the same index in both arrays
+        """
+
+        from sklearn.cluster import OPTICS
+
+        df_genre = self.df_genre.transpose()
 
+        self.cl_optics = OPTICS(
+            max_eps=1,
+            min_samples=2,# to avoid lonely genre, goal is to have super-genre
+            metric='yule',
+            cluster_method='dbscan',
+            algorithm='brute',# only valid algorithm for this metric
+            n_jobs=-2
+        )
+        self.cl_optics = self.cl_optics.fit_predict(df_genre)  # the estimator is replaced by its fit_predict output
 
-    def cluster_genre(self, method:str='average'):
-        self.linkage = linkage(self.df_genre,
-                               method=method,
-                               metric='yule')
+        # NOTE(review): outliers are not removed yet, the linkage is built on every genre
+        self.cl_linkage = linkage(df_genre,
+                            method=self.method,
+                            metric='yule')
 
+
+    def plot_clustermap(self):
+        """Draw a seaborn clustermap of the genre matrix, rows ordered by ``self.cl_linkage``."""
+        plt.figure()
+        # transpose so that rows are genres, like the matrix cluster_genre gave to linkage
+        g = sns.clustermap(self.df_genre.transpose(), row_linkage=self.cl_linkage,
+                           col_cluster=False, yticklabels=True)
+        g.fig.suptitle(self.method)
+
+
     def plot_dendrogram(self):
         dendrogram(
-            self.linkage,
+            self.cl_linkage,
             orientation='left',
             labels=self.genre.to_list()
         )
 
+
     def create_super_genre(self):
-        import re
         from itertools import chain
-        from collections import Counter
-        from statistics import mode
-        pass
+
+
+        pass
+
+
+    def enrich_artist_genre(self, df:pd.DataFrame)->pd.DataFrame:
+        pass
+
+
+# plain module-level helper: a bare ``staticmethod`` object is not callable before Python 3.10
+def _split_word(x:str)->List[str]:
+    """
+    Split a genre label into the list of words it contains.
+    
+    Arguments:
+        x {str} -- string to split; a non-string (e.g. a NaN cell) gives []
+    
+    Returns:
+        List[str] -- words of ``x`` once every 'genre_' occurrence is removed
+    """
+    import re
+
+    if not isinstance(x, str):  # NaN is a float; ``is np.nan`` misses other NaN objects
+        return []
+    x = x.replace('genre_', '')
+    return re.findall(r"[\w']+", x)
+
+
+# plain module-level helper: a bare ``staticmethod`` object is not callable before Python 3.10
+def _robust_str_mode(x:List[str], sep:str='-')->str:
+    """
+    Return the most frequent string of ``x``; tied strings are joined with ``sep``.
+    
+    Arguments:
+        x {List[str]} -- list of strings to calculate the mode of
+        sep {str} -- string used to concatenate multiple strings in case of a draw
+    
+    Returns:
+        [str] -- mode word, or '' when ``x`` is empty
+    """
+    from collections import Counter
+
+    # statistics.mode stopped raising on ties in Python 3.8, so count explicitly
+    counts = Counter(x)
+    if not counts:
+        return ''
+    top = max(counts.values())
+    return sep.join(word for word, count in counts.items() if count == top)
+
diff --git a/lib_spotify_app/util.py b/lib_spotify_app/util.py
@@ -51,7 +51,7 @@ def json_list2dict(d:Dict)->Dict:
     return d
 
 
-def normalize_request(request)->pd.DataFrame:
+def normalize_request(_request)->pd.DataFrame:
     """
     transform the output of a request into a DataFrame
     
@@ -67,8 +67,14 @@ def normalize_request(request)->pd.DataFrame:
     """
     # some request gives back a strange dict with key the name of the
     # request and values the lists output
-    if isinstance(request, dict):
-        request = list(request.values())[0]
+    if isinstance(_request, dict) and 'items' in _request.keys():
+        request = _request['items']
+    elif isinstance(_request, dict) \
+        and len(_request.keys()) == 1 \
+        and isinstance(_request[list(_request.keys())[0]], list):
+        request = _request[list(_request.keys())[0]]
+    else:
+        request = _request
 
     # if there is multilple request inside the request (like a list). The 
     # output is a list, else is a dict
@@ -134,4 +140,5 @@ def enrich_df_by_feature(df:pd.DataFrame, col:str, f, w:int)->pd.DataFrame:
     df_enriched = _enrich_by_feature(df[col], f=f, w=w)
     df_enriched = df_enriched.add_prefix(f'{col}.')
 
-    return df.join(df_enriched, on=col)
+    return df.join(df_enriched, on=col)
+
diff --git a/spotify_app_script.py b/spotify_app_script.py
@@ -15,7 +15,9 @@
 import lib_spotify_app as lib
 
 import pandas as pd
+pd.reset_option('all')  # a pattern is mandatory; 'all' restores every option to its default
 pd.set_option('max_columns', None)
+pd.reset_option('max_rows')
 
 from typing import Dict, List, Union
 from copy import deepcopy
@@ -68,22 +70,38 @@
           sharey=True,
           sharex=False)
 
+#%% standard data enrichment
+
+def data_enrich(df:pd.DataFrame, col:str='id')->pd.DataFrame:
+    """Enrich ``df`` through ``lib.enrich_df_by_feature`` using the Spotify audio features of ``col``."""
+    # the request function is the bound ``audio_features`` method of the open session
+    audio_features = sp_adapter.sp.audio_features
+    return lib.enrich_df_by_feature(df, col=col, f=audio_features, w=100)
+
+
 # %%
 
 df_toptracks = lib.normalize_request(
     sp_adapter.sp.current_user_top_tracks(limit=50)
-)
-df_toptracks = lib.enrich_df_by_feature(df_toptracks,
-                                        col='id',
-                                        f=sp_adapter.sp.audio_features,
-                                        w=100)
+).pipe(data_enrich, col='id')
 
 #%%
 
-df_likedsong = sp_adapter.query_liked_songs(tracks_count=200)
+df_likedsong = lib.normalize_request(
+    sp_adapter.query_liked_songs(tracks_count=200)
+).pipe(data_enrich, col='track.id')
 
 # %%
 
-enrich_genre = lib.facade_enrich_artist_genre(
-    artists=df_toptracks
+genre_toptracks = lib.facade_enrich_artist_genre(
+    artists=df_toptracks['artists.0.id']
 )
+
+
+# %%
+
+genre_toptracks.request_artist_features(sp_adapter.sp)
+genre_toptracks.clean_useless_genre()
+genre_toptracks.cluster_genre()
+
+# %%