In [1]:
import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import linear_kernel
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from scipy.spatial.distance import cdist

In [2]:
def PCA_algorithm(data):
    """Project the numeric columns of ``data`` onto two principal components.

    Only columns selected by ``select_dtypes(np.number)`` are used. They are
    standardised with ``StandardScaler`` and then reduced with
    ``PCA(n_components=2)`` inside a single scikit-learn ``Pipeline``.

    Returns the output of the pipeline's ``fit_transform`` (one row per input
    row, two components per row).
    """
    numeric_features = data.select_dtypes(np.number)
    steps = [
        ('scaler', StandardScaler()),
        ('PCA', PCA(n_components=2)),
    ]
    return Pipeline(steps).fit_transform(numeric_features)

In [3]:
def KMeans_with_PCA_algorithm(song_embedding, n_clusters):
    """Cluster an embedding with k-means.

    A ``KMeans`` model (k-means++ initialisation, ``random_state=42``) is fit
    on ``song_embedding``.

    Returns a 2-tuple ``(fitted_model, centroids)``: the value returned by
    ``KMeans.fit`` and the model's ``cluster_centers_``. Callers read the
    per-row cluster labels from ``fitted_model.labels_``.
    """
    model = KMeans(n_clusters=n_clusters, init='k-means++', random_state=42)
    fitted_model = model.fit(song_embedding)
    return fitted_model, model.cluster_centers_

In [4]:
def filter_based_on_segment(df_segm_pca_kmeans, segment_val, field1, field2):
    """Select the rows belonging to one cluster segment.

    Rows whose 'Segment K-means PCA' value equals ``segment_val`` are kept,
    restricted to the columns ``field1``, ``field2``, 'com1', 'com2' and
    'Segment K-means PCA' (in that order).

    Side effect: the selection is also written to
    'filtered_data_per_segment.csv' in the current working directory.

    Returns the selected DataFrame.
    """
    segment_column = 'Segment K-means PCA'
    wanted_columns = [field1, field2, 'com1', 'com2', segment_column]

    in_segment = df_segm_pca_kmeans[segment_column] == segment_val
    filtered_data_per_segment = df_segm_pca_kmeans[in_segment][wanted_columns]

    # Persist a copy of the selection alongside returning it.
    filtered_data_per_segment.to_csv('filtered_data_per_segment.csv')
    return filtered_data_per_segment

In [5]:
def filter_based_on_cluster_centroid(kmeans_pca, filtered_data_per_segment, segment_val, tracks):
    """Order a segment's rows by cosine distance to the segment's centroid.

    The rows' ('com1', 'com2') coordinates and the centroid
    ``kmeans_pca.cluster_centers_[segment_val]`` are both passed through
    ``kmeans_pca.transform``; cosine distance is then measured between the
    transformed centroid and each transformed row.
    NOTE(review): for scikit-learn's KMeans, ``transform`` presumably yields
    distances to every cluster centre rather than a rescaling -- confirm this
    is the intended space for the comparison.

    ``tracks`` is accepted but not used.

    Returns ``filtered_data_per_segment`` reordered so that the smallest
    distance comes first.
    """
    coordinates = filtered_data_per_segment[['com1', 'com2']]
    transformed_rows = kmeans_pca.transform(coordinates)

    centroid = kmeans_pca.cluster_centers_[segment_val].reshape(1, -1)
    transformed_centroid = kmeans_pca.transform(centroid)

    # cdist gives a (1, n_rows) matrix; its only row holds the distances.
    distance_matrix = cdist(transformed_centroid, transformed_rows, 'cosine')
    nearest_first = list(np.argsort(distance_matrix)[0])

    return filtered_data_per_segment.iloc[nearest_first]

In [6]:
def GetInPut(user):
    """Join the user's artist input with the matching rows of ``df_artist``.

    ``user`` is converted with ``pd.DataFrame`` and must yield an 'artist'
    column. Rows of the module-level ``df_artist`` whose 'artist' value
    appears in that column are selected and merged with the input using
    ``pd.merge`` defaults (i.e. on the columns the two frames share).

    Returns the merged DataFrame.
    """
    user_artists = pd.DataFrame(user)
    requested_names = user_artists['artist'].tolist()

    # Keep only the catalogue rows for artists the user supplied.
    known_rows = df_artist[df_artist['artist'].isin(requested_names)]

    return pd.merge(known_rows, user_artists)

In [7]:
def get_song_from_artist(artist, n_per_artist=1, songs=None):
    """Return the names of an artist's most popular songs.

    Parameters
    ----------
    artist : str
        Artist to look for. A row matches when ``artist in row['artists']``
        is true (substring match for string cells, element match for
        list-like cells).
    n_per_artist : int, default 1
        Maximum number of distinct song names to return.
    songs : pandas.DataFrame, optional
        Table with at least 'artists', 'name' and 'popularity' columns.
        Defaults to the module-level ``data_for_songs``.

    Returns
    -------
    numpy.ndarray
        Distinct song names ordered from most to least popular, at most
        ``n_per_artist`` long. Empty when the artist has no songs.
    """
    if songs is None:
        songs = data_for_songs

    def _credits_artist(credited):
        # Cells that do not support ``in`` (e.g. NaN) simply do not match.
        try:
            return artist in credited
        except TypeError:
            return False

    # A boolean mask avoids assuming the frame has a 0..n-1 index.
    is_by_artist = songs["artists"].map(_credits_artist).astype(bool)
    by_artist = songs[is_by_artist]

    # Stable sort keeps the original row order among equally popular songs.
    ranked = by_artist.sort_values("popularity", ascending=False, kind="mergesort")

    # drop_duplicates keeps the first (most popular) occurrence of each name
    # and, unlike np.unique, does not re-sort the names alphabetically.
    distinct_names = ranked["name"].drop_duplicates()
    return distinct_names.iloc[:n_per_artist].to_numpy()

In [8]:
def recommend_artists(artist, n=10, n_clusters=50):
    """Recommend artists that fall in the same k-means segment as ``artist``.

    The module-level ``artist_data`` is embedded with ``PCA_algorithm``,
    clustered with ``KMeans_with_PCA_algorithm`` and the segment containing
    ``artist`` is ranked by ``filter_based_on_cluster_centroid``.

    Parameters
    ----------
    artist : str
        Value to look up in the 'artists' column of ``artist_data``.
    n : int, default 10
        Maximum number of artists to return.
    n_clusters : int, default 50
        Number of k-means clusters to fit.

    Returns
    -------
    list
        Up to ``n`` values from the 'artists' column of the ranked segment.

    Raises
    ------
    IndexError
        If ``artist`` does not occur in ``artist_data['artists']``.
    """
    song_embedding = PCA_algorithm(artist_data)
    kmeans_pca, _ = KMeans_with_PCA_algorithm(song_embedding, n_clusters)

    # Name the component columns at construction time; assigning through
    # ``columns.values`` mutates the Index buffer in place, which is not
    # supported and fails when that buffer is read-only.
    components = pd.DataFrame(song_embedding, columns=['com1', 'com2'])
    df_segm_pca_kmeans = pd.concat(
        [artist_data.reset_index(drop=True), components], axis=1
    )
    df_segm_pca_kmeans['Segment K-means PCA'] = kmeans_pca.labels_

    # Find the segment the requested artist was assigned to.
    is_requested = df_segm_pca_kmeans['artists'] == artist
    artist_segments = df_segm_pca_kmeans.loc[is_requested, 'Segment K-means PCA']
    if artist_segments.empty:
        raise IndexError("artist %r not found in artist_data" % (artist,))
    segment_val = artist_segments.values[0]

    filtered_data_per_segment = filter_based_on_segment(
        df_segm_pca_kmeans, segment_val, 'artists', 'valence'
    )
    rec_artists = filter_based_on_cluster_centroid(
        kmeans_pca, filtered_data_per_segment, segment_val, artist_data
    )

    # Recommend the top n artists of the ranked segment.
    return rec_artists.head(n)['artists'].tolist()

In [9]:
def recommend_artist_and_songs(artist, n_songs=10):
    """Recommend artists similar to ``artist`` together with one song each.

    Parameters
    ----------
    artist : str
        Artist passed to ``recommend_artists``.
    n_songs : int, default 10
        Number of artists requested from ``recommend_artists``; one song is
        requested per recommended artist via ``get_song_from_artist``.

    Returns
    -------
    pandas.DataFrame
        Columns 'artist' and 'songs', one row per (artist, song) pair, in
        the order the artists were recommended.
    """
    # Collect plain records and build the frame once: DataFrame.append was
    # removed in pandas 2.0, and growing a frame inside a loop is quadratic.
    records = []
    for recommended_artist in recommend_artists(artist, n_songs):
        for song_name in get_song_from_artist(recommended_artist, 1):
            records.append({'artist': recommended_artist, 'songs': song_name})

    return pd.DataFrame(records, columns=['artist', 'songs'])

In [10]:
def get_song_data(song, spotify_data):
    """Look up a song's row in ``spotify_data`` by name.

    ``song`` is a mapping with a 'name' key. The first row of
    ``spotify_data`` whose 'name' equals ``song['name']`` is returned. When
    no row matches (``IndexError`` from taking the first row), the lookup
    is delegated to ``find_song(song['name'])``.
    """
    try:
        name_matches = spotify_data['name'] == song['name']
        first_match = spotify_data[name_matches].iloc[0]
    except IndexError:
        # No row with that name: fall back to the external lookup.
        return find_song(song['name'])
    return first_match