In [531]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from scipy.spatial.distance import cdist
import warnings

warnings.simplefilter(action='ignore', category=FutureWarning)
import random

warnings.filterwarnings("ignore")
from sklearn.cluster import KMeans

In [532]:
data = pd.read_csv(r"datasets/data_by_artist.csv")
data.head()

Unnamed: 0,mode,count,acousticness,artists,danceability,duration_ms,energy,instrumentalness,liveness,loudness,speechiness,tempo,valence,popularity,key
0,1,9,0.590111,"""Cats"" 1981 Original London Cast",0.467222,250318.555556,0.394003,0.0114,0.290833,-14.448,0.210389,117.518111,0.3895,38.333333,5
1,1,26,0.862538,"""Cats"" 1983 Broadway Cast",0.441731,287280.0,0.406808,0.081158,0.315215,-10.69,0.176212,103.044154,0.268865,30.576923,5
2,1,7,0.856571,"""Fiddler On The Roof” Motion Picture Chorus",0.348286,328920.0,0.286571,0.024593,0.325786,-15.230714,0.118514,77.375857,0.354857,34.857143,0
3,1,27,0.884926,"""Fiddler On The Roof” Motion Picture Orchestra",0.425074,262890.962963,0.24577,0.073587,0.275481,-15.63937,0.1232,88.66763,0.37203,34.851852,0
4,1,7,0.510714,"""Joseph And The Amazing Technicolor Dreamcoat""...",0.467143,270436.142857,0.488286,0.0094,0.195,-10.236714,0.098543,122.835857,0.482286,43.0,5


In [533]:
data_for_songs = pd.read_csv(r"datasets/data.csv")
data_for_songs.head()

Unnamed: 0,valence,year,acousticness,artists,danceability,duration_ms,energy,explicit,id,instrumentalness,key,liveness,loudness,mode,name,popularity,release_date,speechiness,tempo
0,0.0594,1921,0.982,"['Sergei Rachmaninoff', 'James Levine', 'Berli...",0.279,831667,0.211,0,4BJqT0PrAfrxzMOxytFOIz,0.878,10,0.665,-20.096,1,"Piano Concerto No. 3 in D Minor, Op. 30: III. ...",4,1921,0.0366,80.954
1,0.963,1921,0.732,['Dennis Day'],0.819,180533,0.341,0,7xPhfUan2yNtyFG0cUWkt8,0.0,7,0.16,-12.441,1,Clancy Lowered the Boom,5,1921,0.415,60.936
2,0.0394,1921,0.961,['KHP Kridhamardawa Karaton Ngayogyakarta Hadi...,0.328,500062,0.166,0,1o6I8BglA6ylDMrIELygv1,0.913,3,0.101,-14.85,1,Gati Bali,5,1921,0.0339,110.339
3,0.165,1921,0.967,['Frank Parker'],0.275,210000,0.309,0,3ftBPsC5vPBKxYSee08FDH,2.8e-05,5,0.381,-9.316,1,Danny Boy,3,1921,0.0354,100.109
4,0.253,1921,0.957,['Phil Regan'],0.418,166693,0.193,0,4d6HGyGT8e121BsdKmw9v6,2e-06,3,0.229,-10.096,1,When Irish Eyes Are Smiling,2,1921,0.038,101.665


In [534]:
artist_data = data[['valence', 'artists', 'count', 'popularity']]
artist_data.to_csv('artist_data.csv')

In [535]:
def PCA_algorithm(data):
    num_datatypes = data.select_dtypes(np.number)

    pca_pipeline = Pipeline([('scaler', StandardScaler()), ('PCA', PCA(n_components=2))])
    song_embedding = pca_pipeline.fit_transform(num_datatypes)
    return song_embedding

In [536]:
def KMeans_with_PCA_algorithm(song_embedding):
    kmeans_pca = KMeans(n_clusters=50, init='k-means++', random_state=42)
    label = kmeans_pca.fit(song_embedding)
    centroids = kmeans_pca.cluster_centers_
    return label, centroids

In [537]:
def filter_based_on_segment(df_segm_pca_kmeans, segment_val):
    #get list of songs with above song id
    #print(df_segm_pca_kmeans[df_segm_pca_kmeans['Segment K-means PCA']==segment_val].values)

    filtered_data_per_segment = df_segm_pca_kmeans[df_segm_pca_kmeans['Segment K-means PCA'] == segment_val][
        ['artists', 'valence', 'com1', 'com2', 'Segment K-means PCA']]

    #save to csv file
    filtered_data_per_segment.to_csv('filtered_data_per_segment_for_artists.csv')
    return filtered_data_per_segment

In [538]:
def filter_based_on_cluster_centroid(kmeans_pca, filtered_data_per_segment, segment_val, tracks):
    #calculate distance based on cluster centroid
    scaled_data = kmeans_pca.transform(filtered_data_per_segment[['com1', 'com2']])
    scaled_song_center = kmeans_pca.transform(kmeans_pca.cluster_centers_[segment_val].reshape(1, -1))
    distances = cdist(scaled_song_center, scaled_data, 'cosine')

    #sort based on distance
    index = list(np.argsort(distances)[0])
    rec_songs = filtered_data_per_segment.iloc[index]

    #recommend based on segment data
    # rec_songs = rec_songs[~rec_songs['name'].isin(filtered_data_per_segment['name'])]
    return rec_songs

In [539]:
p = 0.02  # to randomly select 1% of the rows
df_playlist_spotify = pd.read_csv(r"datasets/spotify_dataset.csv", error_bad_lines=False, warn_bad_lines=False,
                                  skiprows=lambda i: i > 0 and random.random() > p)
df_playlist_spotify.head()

Unnamed: 0,user_id,"""artistname""","""trackname""","""playlistname"""
0,9cc0cfd4d7d7885102480dd99e7a90d6,Bruce Springsteen,Wrecking Ball,IOW 2012
1,07f0fc3be95dcd878966b1f9572ff670,2080,Live Die 5 Continues,2080
2,07f0fc3be95dcd878966b1f9572ff670,C418,Door,C418
3,07f0fc3be95dcd878966b1f9572ff670,C418,Haggstrom,C418
4,07f0fc3be95dcd878966b1f9572ff670,Bonobo,Animals,Chill out


In [540]:
df_playlist_spotify.columns = df_playlist_spotify.columns.str.replace('"', '')
df_playlist_spotify.columns = df_playlist_spotify.columns.str.replace('name', '')
df_playlist_spotify.columns = df_playlist_spotify.columns.str.replace(' ', '')
df_playlist_spotify.columns

Index(['user_id', 'artist', 'track', 'playlist'], dtype='object')

In [541]:
df_playlist = df_playlist_spotify.groupby('artist').filter(lambda x: len(x) >= 50)
print(df_playlist)

                                 user_id                artist  \
0       9cc0cfd4d7d7885102480dd99e7a90d6     Bruce Springsteen   
4       07f0fc3be95dcd878966b1f9572ff670                Bonobo   
6       07f0fc3be95dcd878966b1f9572ff670             Daft Punk   
7       07f0fc3be95dcd878966b1f9572ff670             Daft Punk   
8       07f0fc3be95dcd878966b1f9572ff670           The Prodigy   
...                                  ...                   ...   
256654  488ead471813ae982119153f792e4a3e  Two Door Cinema Club   
256655  488ead471813ae982119153f792e4a3e           Wiz Khalifa   
256656  488ead471813ae982119153f792e4a3e     Foster The People   
256663  488ead471813ae982119153f792e4a3e            Katy Perry   
256667  2302bf9c64dc63d88a750215ed187f2c            Pink Floyd   

                                   track          playlist  
0                          Wrecking Ball          IOW 2012  
4                                Animals         Chill out  
6                       

In [542]:
df_playlist = df_playlist[df_playlist.groupby('user_id').artist.transform('nunique') >= 10]
print(df_playlist)

                                 user_id           artist  \
4       07f0fc3be95dcd878966b1f9572ff670           Bonobo   
6       07f0fc3be95dcd878966b1f9572ff670        Daft Punk   
7       07f0fc3be95dcd878966b1f9572ff670        Daft Punk   
8       07f0fc3be95dcd878966b1f9572ff670      The Prodigy   
9       07f0fc3be95dcd878966b1f9572ff670  Crystal Castles   
...                                  ...              ...   
256542  aedf684d88f71c448add86ee4873278e          Incubus   
256549  aedf684d88f71c448add86ee4873278e    NEEDTOBREATHE   
256550  aedf684d88f71c448add86ee4873278e    NEEDTOBREATHE   
256551  aedf684d88f71c448add86ee4873278e        Relient K   
256554  aedf684d88f71c448add86ee4873278e    NEEDTOBREATHE   

                                  track     playlist  
4                               Animals    Chill out  
6                               Da Funk   Daft Punk   
7                             Rectifier   Daft Punk   
8                   Run With The Wolves      El

In [543]:
size = lambda x: len(x)
df_freq = df_playlist.groupby(['user_id', 'artist']).agg('size').reset_index().rename(columns={0: 'freq'})[
    ['user_id', 'artist', 'freq']].sort_values(['freq'], ascending=False)
df_freq.head()

Unnamed: 0,user_id,artist,freq
6748,26b51e580277e131f87e4c7ee4c0887a,Vitamin String Quartet,68
355,014e695cc6df96011b90a5beb3206012,Ilaiyaraaja,49
11017,414050deadb38aafd8d4ad22ca634055,Vitamin String Quartet,46
14544,5711b8cd3a6219fe01728cd9b97be46c,Johnny Cash,40
30238,b1d4116e7cf150ae7d77413620f5f571,Wolfgang Amadeus Mozart,38


In [544]:
df_artist = pd.DataFrame(df_freq["artist"].unique())
df_artist = df_artist.reset_index()
df_artist = df_artist.rename(columns={'index': 'artist_id', 0: 'artist'})
df_artist.head()

Unnamed: 0,artist_id,artist
0,0,Vitamin String Quartet
1,1,Ilaiyaraaja
2,2,Johnny Cash
3,3,Wolfgang Amadeus Mozart
4,4,Jamey Aebersold Play-A-Long


In [545]:
def GetInPut(user):
    inputArtist = pd.DataFrame(user)
    #Filtering out the movies by title
    Id = df_artist[df_artist['artist'].isin(inputArtist['artist'].tolist())]
    #Then merging it so we can get the movieId. It's implicitly merging it by title.
    inputArtist = pd.merge(Id, inputArtist)
    #Dropping information we won't use from the input dataframe
    #inputArtist = inputArtist.drop('year', 1)
    return inputArtist

In [546]:
def get_song_from_artist(artist, n_per_artist= 1):
    list_of_artists_songs = []
    for i in range(len(data_for_songs)):
        if artist in data_for_songs.loc[i, "artists"]:
            list_of_artists_songs.append(data_for_songs.loc[i, ["id", "artists", "name", "popularity"]])

    sorted_list_of_artists_songs = sorted(list_of_artists_songs, key=lambda d: d['popularity'], reverse=True)
    sorted_list_of_songs_only = np.array(sorted_list_of_artists_songs)
    unique_songs = np.unique(sorted_list_of_songs_only[:, 2])
    return unique_songs[:n_per_artist]

In [547]:
from sklearn.decomposition import PCA

def recommend_artists(artist, n=10):
    """
    Recommends songs based on a list of previous songs that a user has listened to.
    """
    song_embedding = PCA_algorithm(artist_data)
    kmeans_pca, centroids = KMeans_with_PCA_algorithm(song_embedding)

    df_segm_pca_kmeans = pd.concat([artist_data.reset_index(drop=True), pd.DataFrame(song_embedding)], axis=1)
    df_segm_pca_kmeans.columns.values[-2:] = ['com1', 'com2']
    df_segm_pca_kmeans['Segment K-means PCA'] = kmeans_pca.labels_

    #get segment value of artist
    # artist = 'Frank Sinatra'
    segment_val = df_segm_pca_kmeans[df_segm_pca_kmeans['artists'] == artist]['Segment K-means PCA'].values[0]

    filtered_data_per_segment = filter_based_on_segment(df_segm_pca_kmeans, segment_val)

    rec_artists = filter_based_on_cluster_centroid(kmeans_pca, filtered_data_per_segment, segment_val, artist_data)
    # print(rec_artists)

    #recommend top n artists
    return rec_artists.head(n)['artists'].tolist()

In [548]:
def recommend_artist_and_songs(artist, n_songs=10):
    # get recommended artists
    recommended_artists = pd.DataFrame({
            'artist' : recommend_artists(artist, n_songs)
        })
    # print(recommended_artists)
    recommend_songs_list = pd.DataFrame(columns = ['artist', 'songs'])
    # artist_count = len(recommended_artists)
    # song_per_artist = n_songs > artist_count and (n_songs/artist_count) or 1

    # get songs per artist
    for i in range(len(recommended_artists)):
        artist = recommended_artists.loc[i, "artist"]
        recommend_song = pd.DataFrame({
            'artist' : artist,
            'songs' :get_song_from_artist(artist, 1)
        })
        recommend_songs_list = recommend_songs_list.append(recommend_song, ignore_index = True)
    return  recommend_songs_list

In [549]:
user = [
    {'artist': 'Ella Fitzgerald', 'freq': 40},
    {'artist': 'Frank Sinatra', 'freq': 10},
    {'artist': 'Lil Wayne', 'freq': 3},
    {'artist': "The Rolling Stones", 'freq': 5},
    {'artist': 'Louis Armstrong', 'freq': 5}
]

In [550]:
inputArtist = GetInPut(user).sort_values('freq', ascending=False)
# print(inputArtist)

In [551]:
# pass favorite artist name and no of recommended artist/song you want to listen
recommended_artist_and_songs = recommend_artist_and_songs('Ella Fitzgerald',10)
# this list contains n recommended artist with 1 songs per artist
print(recommended_artist_and_songs)

                    artist                                              songs
0          Ella Fitzgerald                                         'Deed I Do
1          Lata Mangeshkar                                  Aa Ab Laut Chalen
2              Dean Martin                                    'Til I Find You
3           The Beach Boys  "Cassius" Love Vs. "Sonny" Wilson - Remastered...
4       The Rolling Stones  (I Can't Get No) Satisfaction - (Original Sing...
5     Ludwig van Beethoven  12 Variations on "Ein Mädchen oder Weibchen", ...
6                Bob Dylan  (Quinn the Eskimo) The Mighty Quinn - Live at ...
7            Fleetwood Mac                            Albatross - 2018 Master
8  Wolfgang Amadeus Mozart                                          Adagio II
9             Irina Salkow              Kapitel 1 - Der Page und die Herzogin
