Skip to content

Commit

Permalink
WIP: some debug up until genre class
Browse files Browse the repository at this point in the history
I also added a new way to connect to the OAuth API
spotipy-dev/spotipy#263
  • Loading branch information
sqrx-mckl committed Mar 23, 2020
1 parent 9fb70f8 commit d3e615a
Show file tree
Hide file tree
Showing 4 changed files with 199 additions and 65 deletions.
30 changes: 24 additions & 6 deletions lib_spotify_app/adapter_spotipy_api.py
Expand Up @@ -47,10 +47,10 @@ def __init__(self, credential_fp:Path, scope:List[str], cache_path:Path):
self.sp:spotipy.Spotify = None


def refresh_token(self):
pass

def _get_token(self):
NotImplementedError('''deprecated method
refer to https://github.com/plamere/spotipy/issues/263''')

# token need to be refreshed
try:
self.token_code = spotipy.util.prompt_for_user_token(
Expand All @@ -70,9 +70,27 @@ def _get_token(self):
with open(self.credential_fp, 'w') as file:
json.dump(self.credential, file)


def open_session(self):
self._get_token()
self.sp = spotipy.Spotify(auth=self.credential['token']['code'])
"""
Open a session with OAuth
refer to <https://github.com/plamere/spotipy/issues/263>
"""

## Deprecated
# self._get_token()
# self.sp = spotipy.Spotify(auth=self.credential['token']['code'])

# new method as per <https://github.com/plamere/spotipy/issues/263>

self.sp=spotipy.Spotify(
auth_manager=spotipy.SpotifyOAuth(
client_id=self.credential["client_id"],
client_secret=self.credential["client_secret"],
redirect_uri="http://localhost/",
scope=self.scope,
cache_path=self.cache_path)
)

def query_liked_songs(self, tracks_count:int=-1, limit:int=50):
result_liked_songs = None
Expand All @@ -94,7 +112,7 @@ def query_liked_songs(self, tracks_count:int=-1, limit:int=50):
offset = offset + limit

# check condition if there is one
if tracks_count > 0 and offset > tracks_count:
if tracks_count > 0 and offset >= tracks_count:
break

return result_liked_songs
Expand Down
185 changes: 138 additions & 47 deletions lib_spotify_app/facade_enrich_artist_genre.py
Expand Up @@ -3,6 +3,7 @@
import spotipy
import pandas as pd
from typing import Dict, List
import seaborn as sns

from sklearn.preprocessing import MultiLabelBinarizer
from scipy.spatial.distance import yule
Expand All @@ -16,38 +17,49 @@
from .util import _enrich_by_feature

class facade_enrich_artist_genre:
"""
Class which handles the artists data and, most particularly, the genre information from the artist.
https://spotipy.readthedocs.io/en/2.9.0/#spotipy.client.Spotify.recommendation_genre_seeds
This class take care of the processing of the genres:
* cleaning the "useless" genre (<country_name>_indie)
* remove outliers
* cluster into "super_genre"
* name the "super_genre"
* enrich initial data with the newly added "super_genre"
Clustering is done by 2 algorithms:
* DBSCAN with OPTICS to detect the outliers
* hierarchical clustering to create the "super-genre"
The metric used for the genre combination is the Yule distance:
https://docs.scipy.org/doc/scipy/reference/generated/scipy.spatial.distance.yule.html
It is a dissimilarity distance between binary arrays. It uses the number of times a True is encountered at the same index in both arrays
Parameters
----------
method : str
method for Hierarchical Clustering, see scipy.cluster.hierarchy.linkage
by default 'weighted'
feature : pd.DataFrame
feature from artist request on Spotify in a DataFrame
df_genre : pd.DataFrame
encoded genre with as rows the artists, and columns the genre from those artists
_artist : pd.Series
artist data to be transformed
_mlb : MultiLabelBinarizer
used to encode the artist genre
"""


def __init__(self, artists:pd.Series):
"""
Class which handle the artists data and most particurarly the genre information from the artist.
https://spotipy.readthedocs.io/en/2.9.0/#spotipy.client.Spotify.recommendation_genre_seeds
This class take care of the processing of the genres:
* cleaning the "useless" genre (<country_name>_indie)
* remove outliers
* cluster into "super_genre"
* name the "super_genre"
* enrich initial data with the newly added "super_genre"
Clustering is done by 2 algorithms:
* DBSCAN with OPTICS to detect the outliers
* hierarchical clustering to create the "super-genre"
The metric used for the genre combination is the Yule distance:
https://docs.scipy.org/doc/scipy/reference/generated/scipy.spatial.distance.yule.html
It is a binary array disambleance distance. It uses the amount of time a True is encountered at the same index for both arrays
Parameters
----------
artists : pd.Series
artists data to be transformed
"""
self._artists = artists
self._mlb = MultiLabelBinarizer()
self._artist:pd.Series = artists
self._mlb:MultiLabelBinarizer = MultiLabelBinarizer()
self.method:str = 'weighted'


@property
def genre(self)->pd.Series:
def genre(self)->pd.Index:
"""
Retrieve the genres from the initial data
Expand All @@ -59,44 +71,123 @@ def genre(self)->pd.Series:
return self.df_genre.columns


def enrich_artists(self, sp:spotipy.Spotify)->pd.DataFrame:
df = _enrich_by_feature(self._artists,
w=50,
f=sp.artists)

def request_artist_features(self, sp:spotipy.Spotify):
self.feature:pd.DataFrame = _enrich_by_feature(self._artist,
w=50,
f=sp.artists)
# DataFrame with genre as a column and each row is an artist
genre_col = f'{self._artists.name}.enrich.genres'
self.df_genre = pd.DataFrame(
self._mlb.fit_transform(df[genre_col]),
self._mlb.fit_transform(self.feature['genres']),
columns=self._mlb.classes_,
index=self._artists.index
index=self._artist.index
)

return df


def clean_useless_genre(self):
"""
Spotify contains strange genres such as "alabama_indie" which are not useful for our purpose. As such, this method gets rid of all of them
"""
mask = self.df_genre.columns.str.contains(r'genre_\w+ indie')
self.df_genre = self.df_genre.loc[:,~mask]
self.df_genre:pd.DataFrame = self.df_genre.loc[:,~mask]


def cluster_genre(self):
"""
Clustering is done by 2 algorithms:
* DBSCAN with OPTICS to detect the outliers
* hierarchical clustering to create the "super-genre"
The metric used for the genre combination is the Yule distance:
https://docs.scipy.org/doc/scipy/reference/generated/scipy.spatial.distance.yule.html
It is a dissimilarity distance between binary arrays. It uses the number of times a True is encountered at the same index in both arrays
"""

from sklearn.cluster import OPTICS

df_genre = self.df_genre.transpose()

self.cl_optics = OPTICS(
max_eps=1,
min_samples=2,# to avoid lonely genre, goal is to have super-genre
metric='yule',
cluster_method='dbscan',
algorithm='brute',# only valid algorithm for this metric
n_jobs=-2
)
self.cl_optics = self.cl_optics.fit_predict(df_genre)

def cluster_genre(self, method:str='average'):
self.linkage = linkage(self.df_genre,
method=method,
metric='yule')
# remove the outlier cluster
self.cl_linkage = linkage(df_genre,
method=self.method,
metric='yule')


def plot_clustermap(self):
    """
    Plot the encoded genres as a heatmap ordered by the hierarchical clustering.

    `cluster_genre` must have been called beforehand so that
    `self.cl_linkage` is available. The figure title is the linkage method.
    """
    # the linkage is computed on the transposed table (one row per genre),
    # so the heatmap has to be drawn from the same transposed table
    df_genre = self.df_genre.transpose()

    # clustermap builds its own figure, there is no need to open one before
    g = sns.clustermap(df_genre,
                       row_linkage=self.cl_linkage,
                       col_cluster=False,
                       yticklabels=True)
    g.fig.suptitle(self.method)


def plot_dendrogram(self):
dendrogram(
self.linkage,
self.cl_linkage,
orientation='left',
labels=self.genre.to_list()
)


def create_super_genre(self):
import re
from itertools import chain
from collections import Counter
from statistics import mode
pass


pass


def enrich_artist_genre(self, df:pd.DataFrame)->pd.DataFrame:
pass


@staticmethod
def _split_word(x:str)->List[str]:
    """
    split a string as a list of words

    Arguments:
        x {str} -- string to split, a missing value (NaN, None) is tolerated
    Returns:
        List[str] -- list of words (as strings), empty for a missing value
    """
    import re

    # a missing value can be np.nan but also float('nan') or None; an identity
    # test against np.nan only catches the first one, so test the type instead
    if not isinstance(x, str):
        return []
    # remove every 'genre_' marker, otherwise it would stay glued to the
    # following word ('_' is matched by \w)
    x = x.replace('genre_', '')
    return re.findall(r"[\w']+", x)


@staticmethod
def _robust_str_mode(x:List[str], sep:str='-')->str:
    """
    robust method to calculate the mode of a list of strings, returns a
    concatenated string when several strings share the highest count

    Arguments:
        x {List[str]} -- list of strings to calculate the mode of
        sep {str} -- string used to concatenate multiple strings in case of a draw
    Returns:
        [str] -- mode word, '' when the list is empty
    """
    from collections import Counter

    # statistics.mode only raises on a draw before Python 3.8, so the draw is
    # detected explicitly instead of relying on an exception
    counts = Counter(x)
    if not counts:
        return ''
    highest = max(counts.values())
    # Counter keeps the insertion order: tied words are joined in the order
    # they first appear in x
    return sep.join(word for word, n in counts.items() if n == highest)

15 changes: 11 additions & 4 deletions lib_spotify_app/util.py
Expand Up @@ -51,7 +51,7 @@ def json_list2dict(d:Dict)->Dict:
return d


def normalize_request(request)->pd.DataFrame:
def normalize_request(_request)->pd.DataFrame:
"""
transform the output of a request into a DataFrame
Expand All @@ -67,8 +67,14 @@ def normalize_request(request)->pd.DataFrame:
"""
# some requests give back a dict whose single key is the name of the
# request and whose value is the list of outputs
if isinstance(request, dict):
request = list(request.values())[0]
if isinstance(_request, dict) and 'items' in _request.keys():
request = _request['items']
elif isinstance(_request, dict) \
and len(_request.keys()) == 1 \
and isinstance(_request[list(_request.keys())[0]], list):
request = _request[list(_request.keys())[0]]
else:
request = _request

# if there are multiple requests inside the request (like a list), the
# output is a list; otherwise it is a dict
Expand Down Expand Up @@ -134,4 +140,5 @@ def enrich_df_by_feature(df:pd.DataFrame, col:str, f, w:int)->pd.DataFrame:
df_enriched = _enrich_by_feature(df[col], f=f, w=w)
df_enriched = df_enriched.add_prefix(f'{col}.')

return df.join(df_enriched, on=col)
return df.join(df_enriched, on=col)

34 changes: 26 additions & 8 deletions spotify_app_script.py
Expand Up @@ -15,7 +15,9 @@
import lib_spotify_app as lib

import pandas as pd
pd.reset_option()
pd.set_option('max_columns', None)
pd.reset_option('max_rows')

from typing import Dict, List, Union
from copy import deepcopy
Expand Down Expand Up @@ -68,22 +70,38 @@
sharey=True,
sharex=False)

#%% standard data enrichment

def data_enrich(df:pd.DataFrame, col:str='id')->pd.DataFrame:
    """
    Enrich `df` through `lib.enrich_df_by_feature`, using the Spotify
    `audio_features` request of the session held by `sp_adapter`.

    Parameters
    ----------
    df : pd.DataFrame
        table to enrich
    col : str
        name of the column forwarded to `lib.enrich_df_by_feature`,
        by default 'id'
    """
    fetch_audio_features = sp_adapter.sp.audio_features
    enriched = lib.enrich_df_by_feature(
        df,
        col=col,
        f=fetch_audio_features,
        w=100,
    )
    return enriched


# %%

df_toptracks = lib.normalize_request(
sp_adapter.sp.current_user_top_tracks(limit=50)
)
df_toptracks = lib.enrich_df_by_feature(df_toptracks,
col='id',
f=sp_adapter.sp.audio_features,
w=100)
).pipe(data_enrich, col='id')

#%%

df_likedsong = sp_adapter.query_liked_songs(tracks_count=200)
df_likedsong = lib.normalize_request(
sp_adapter.query_liked_songs(tracks_count=200)
).pipe(data_enrich, col='track.id')

# %%

enrich_genre = lib.facade_enrich_artist_genre(
artists=df_toptracks
genre_toptracks = lib.facade_enrich_artist_genre(
artists=df_toptracks['artists.0.id']
)


# %%

genre_toptracks.request_artist_features(sp_adapter.sp)
genre_toptracks.clean_useless_genre()
genre_toptracks.cluster_genre()

# %%

0 comments on commit d3e615a

Please sign in to comment.