In [14]:
!pip install import-ipynb
!pip install seaborn
!pip install circlify



In [15]:
import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import linear_kernel
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from scipy.spatial.distance import cdist

In [16]:
def PCA_algorithm(data):
    """Embed the numeric columns of ``data`` in two principal components.

    Non-numeric columns are ignored. The remaining columns are standardised
    with ``StandardScaler`` and then reduced by a 2-component ``PCA``; both
    steps are fitted on ``data`` itself.

    Parameters
    ----------
    data : pandas.DataFrame
        Table to embed; only columns selected by ``select_dtypes(np.number)``
        are used.

    Returns
    -------
    The result of the pipeline's ``fit_transform`` (two components per row).
    """
    numeric_only = data.select_dtypes(np.number)
    steps = [
        ('scaler', StandardScaler()),
        ('PCA', PCA(n_components=2)),
    ]
    return Pipeline(steps).fit_transform(numeric_only)

In [17]:
def KMeans_with_PCA_algorithm(song_embedding, n_clusters):
    """Fit k-means (k-means++ init, ``random_state=42``) on an embedding.

    Parameters
    ----------
    song_embedding :
        Data handed to ``KMeans.fit``.
    n_clusters : int
        Number of clusters to fit.

    Returns
    -------
    tuple
        ``(fitted_model, centroids)``. The first element is whatever
        ``KMeans.fit`` returns (callers in this file read ``.labels_`` from
        it), not an array of labels; the second is ``cluster_centers_``.
    """
    estimator = KMeans(
        n_clusters=n_clusters,
        init='k-means++',
        random_state=42,
    )
    fitted_model = estimator.fit(song_embedding)
    return fitted_model, estimator.cluster_centers_

In [18]:
def filter_based_on_segment(df_segm_pca_kmeans, segment_val, field1, field2):
    """Select the rows of one k-means segment and save them to CSV.

    Parameters
    ----------
    df_segm_pca_kmeans : pandas.DataFrame
        Table containing ``'com1'``, ``'com2'``, ``'Segment K-means PCA'`` and
        the two columns named by ``field1`` / ``field2``.
    segment_val :
        Value of ``'Segment K-means PCA'`` identifying the wanted segment.
    field1, field2 : str
        Names of two extra columns to keep alongside the components.

    Returns
    -------
    pandas.DataFrame
        Rows of the requested segment restricted to
        ``[field1, field2, 'com1', 'com2', 'Segment K-means PCA']``.

    Side effects
    ------------
    Writes the result to ``datasets/filtered_data_per_segment.csv`` (relative
    to the current working directory), creating ``datasets/`` if needed.
    """
    import os

    output_path = 'datasets/filtered_data_per_segment.csv'
    kept_columns = [field1, field2, 'com1', 'com2', 'Segment K-means PCA']
    in_segment = df_segm_pca_kmeans['Segment K-means PCA'] == segment_val

    # One .loc selection instead of chained indexing (df[mask][cols]).
    filtered_data_per_segment = df_segm_pca_kmeans.loc[in_segment, kept_columns]

    # to_csv does not create missing parent directories, so make sure the
    # target folder exists before writing.
    os.makedirs(os.path.dirname(output_path), exist_ok=True)
    filtered_data_per_segment.to_csv(output_path)
    return filtered_data_per_segment

In [19]:
def filter_based_on_cluster_centroid(kmeans_pca, filtered_data_per_segment, segment_val, tracks):
    """Reorder a segment's rows by cosine distance to that segment's centroid.

    Both the rows' ``('com1', 'com2')`` values and the centroid
    ``kmeans_pca.cluster_centers_[segment_val]`` are passed through
    ``kmeans_pca.transform`` before being compared.
    NOTE(review): for scikit-learn's KMeans, ``transform`` maps into
    cluster-distance space; confirm that is the intended comparison space.

    Parameters
    ----------
    kmeans_pca :
        Fitted model exposing ``transform`` and ``cluster_centers_``.
    filtered_data_per_segment : pandas.DataFrame
        Rows to rank; must contain ``'com1'`` and ``'com2'``.
    segment_val : int
        Index of the centroid to rank against.
    tracks :
        Unused; kept so existing callers keep working.

    Returns
    -------
    pandas.DataFrame
        ``filtered_data_per_segment`` with rows ordered from the smallest to
        the largest cosine distance.
    """
    component_columns = ['com1', 'com2']

    songs_transformed = kmeans_pca.transform(filtered_data_per_segment[component_columns])
    centroid_row = kmeans_pca.cluster_centers_[segment_val].reshape(1, -1)
    centroid_transformed = kmeans_pca.transform(centroid_row)

    # The centroid is a single row, so only row 0 of the distance matrix
    # (centroid vs. every song) is needed.
    cosine_to_centroid = cdist(centroid_transformed, songs_transformed, 'cosine')
    nearest_first = np.argsort(cosine_to_centroid)[0].tolist()

    return filtered_data_per_segment.iloc[nearest_first]

In [20]:
def GetInPut(user, df_artist):
    """Attach the user's input to the matching rows of ``df_artist``.

    Parameters
    ----------
    user :
        Anything ``pd.DataFrame`` accepts that yields an ``'artist'`` column
        (for example a list of dicts).
    df_artist : pandas.DataFrame
        Table with an ``'artist'`` column.

    Returns
    -------
    pandas.DataFrame
        Rows of ``df_artist`` whose ``'artist'`` occurs in the user input,
        merged with the user input via ``pd.merge`` with default arguments.
    """
    user_frame = pd.DataFrame(user)
    requested_artists = user_frame['artist'].tolist()

    # Keep only the rows for artists the user actually mentioned.
    known_rows = df_artist[df_artist['artist'].isin(requested_artists)]

    # No explicit keys are given, so pd.merge joins on the columns the two
    # frames have in common.
    return pd.merge(known_rows, user_frame)

In [21]:
def get_song_from_artist(artist, n_per_artist= 1, data_path=r"datasets/data.csv"):
    """Return the names of an artist's most popular songs.

    The previous implementation sorted by popularity and then called
    ``np.unique``, which re-sorts alphabetically, so it returned the
    alphabetically first names instead of the most popular ones. It also
    raised ``IndexError`` when the artist had no songs. Both are fixed here.

    Parameters
    ----------
    artist : str
        Text searched for (plain substring match) in the ``"artists"`` column.
    n_per_artist : int, default 1
        Maximum number of song names to return.
    data_path : str, default "datasets/data.csv"
        CSV file with at least ``"artists"``, ``"name"`` and ``"popularity"``
        columns.

    Returns
    -------
    numpy.ndarray
        Distinct song names ordered from most to least popular, truncated to
        ``n_per_artist`` entries; empty when nothing matches.
    """
    data_for_songs = pd.read_csv(data_path)

    # Literal substring test (regex=False); rows with a missing "artists"
    # value never match.
    is_by_artist = data_for_songs["artists"].str.contains(artist, regex=False, na=False)

    ranked = (
        data_for_songs.loc[is_by_artist, ["name", "popularity"]]
        # Stable sort keeps file order among equally popular songs.
        .sort_values("popularity", ascending=False, kind="stable")
        # Keep each name once, at its highest popularity.
        .drop_duplicates(subset="name", keep="first")
    )
    return ranked["name"].iloc[:n_per_artist].to_numpy()


In [22]:
def get_song_data(song, spotify_data):
    """
    Look up a song's row in ``spotify_data`` by its name.

    ``song`` is a mapping with a ``'name'`` key. The first row of
    ``spotify_data`` whose ``'name'`` equals ``song['name']`` is returned.
    When no row matches, the lookup is delegated to
    ``find_song(song['name'])`` and its result is returned instead.
    """
    name_matches = spotify_data[spotify_data['name'] == song['name']]

    # No local row for this name: fall back to the external lookup.
    if name_matches.empty:
        return find_song(song['name'])

    return name_matches.iloc[0]

In [23]:
def find_song(name):
    """Fetch one track's metadata and audio features from the Spotify Web API.

    Credentials are no longer embedded in the source. ``SpotifyClientCredentials``
    is created without arguments so spotipy reads them from the
    ``SPOTIPY_CLIENT_ID`` and ``SPOTIPY_CLIENT_SECRET`` environment variables,
    which must be set before calling this function.

    Parameters
    ----------
    name : str
        Track name to search for.

    Returns
    -------
    pandas.DataFrame or None
        A one-row frame with ``name``, ``explicit``, ``duration_ms`` and
        ``popularity`` plus every key of the track's audio features (when
        available), or ``None`` when the search returns no track.
    """
    import spotipy
    from spotipy.oauth2 import SpotifyClientCredentials

    sp = spotipy.Spotify(auth_manager=SpotifyClientCredentials())

    results = sp.search(q= 'track: {}'.format(name), limit=1)
    items = results['tracks']['items']
    if not items:
        return None

    track = items[0]
    audio_features = sp.audio_features(track['id'])[0]

    song_data = {
        'name': [name],
        'explicit': [int(track['explicit'])],
        'duration_ms': [track['duration_ms']],
        'popularity': [track['popularity']],
    }

    # The per-track entry may be empty (presumably when Spotify has no
    # analysis for the track); skip it rather than fail on `.items()`.
    # Audio-feature keys overwrite same-named keys set above, as before.
    if audio_features:
        song_data.update(audio_features)

    return pd.DataFrame(song_data)


In [24]:
def circlify_vizualization(df_val):
    """Draw a packed-circle chart titled 'Recommendation' for the given names.

    The names in ``df_val`` are paired positionally with ten fixed weights
    (1000 down to 1), so ``df_val`` has to line up with those ten values.
    The pairs are sorted by ascending weight, packed into a unit enclosure
    with ``circlify`` and drawn as coloured circles labelled with the names.

    Parameters
    ----------
    df_val :
        Names to display, used as the ``'Name'`` column.

    Returns
    -------
    None
        The figure is shown with ``plt.show()``.
    """
    import circlify
    import matplotlib.pyplot as plt
    import pandas as pd

    weights = [1000, 500, 400, 200, 80, 50, 40, 25, 12, 1]
    palette = ['#ffc4c4','#a48465','#ffdead','#e7ce8c','#d5c9b6',
               '#ffc4c4','#e9e3ce','#ffdead','#e7ce8c','#d5c9b6']

    table = pd.DataFrame({'Name': df_val, 'Value': weights}).sort_values('Value')

    unit_enclosure = circlify.Circle(x=0, y=0, r=1)
    circles = circlify.circlify(table['Value'].tolist(),
                                target_enclosure=unit_enclosure)

    fig, ax = plt.subplots(figsize=(10, 9))
    ax.set_title('Recommendation')
    ax.axis('off')

    # Symmetric axis limits wide enough to contain every circle.
    extent = max(max(abs(c.x), abs(c.y)) + c.r for c in circles)
    ax.set_xlim(-extent, extent)
    ax.set_ylim(-extent, extent)

    for (x, y, r), label, color in zip(circles, table['Name'], palette):
        ax.add_patch(plt.Circle((x, y), r, linewidth=2, color=color))
        ax.annotate(label, (x, y), va='center', ha='center', size=12)
    plt.show()

In [25]:
def clusters_visualization_kmeans_with_pca(tracks, n_cluster=3):
    """Scatter-plot the k-means clusters of the PCA embedding of ``tracks``.

    The embedding comes from ``PCA_algorithm`` and the clustering from
    ``KMeans_with_PCA_algorithm``; points are coloured by cluster label.

    Parameters
    ----------
    tracks : pandas.DataFrame
        Song table passed to ``PCA_algorithm``.
    n_cluster : int, default 3
        Number of k-means clusters.

    Returns
    -------
    None
        The figure is shown with ``plt.show()``.
    """
    import seaborn as sns
    from matplotlib import pyplot as plt

    song_embedding = PCA_algorithm(tracks)
    kmeans_pca, _ = KMeans_with_PCA_algorithm(song_embedding, n_cluster)

    # Name the component columns when the frame is built, rather than writing
    # into `columns.values` afterwards (an in-place edit pandas does not
    # guarantee to honour).
    components = pd.DataFrame(song_embedding, columns=['com1', 'com2'])
    df_segm_pca_kmeans = pd.concat([tracks.reset_index(drop=True), components], axis=1)
    df_segm_pca_kmeans['Segment K-means PCA'] = kmeans_pca.labels_

    plt.figure(figsize = (10,10))
    # x and y are passed by keyword: recent seaborn releases reject them as
    # positional arguments.
    sns.scatterplot(x=df_segm_pca_kmeans['com1'], y=df_segm_pca_kmeans['com2'],
                    hue=kmeans_pca.labels_,
                    palette='Set1',
                    s=100, alpha=0.2).set_title('KMeans Clusters with PCA', fontsize=15)
    plt.legend()
    plt.ylabel('PC2')
    plt.xlabel('PC1')
    plt.show()

In [26]:
def clusters_visualization_kmeans_only(tracks, n_cluster=3):
    """Cluster seven audio/metadata columns with k-means and plot the result.

    The columns ``valence, year, acousticness, danceability, duration_ms,
    energy, popularity`` are reduced to two principal components, clustered
    with k-means, and drawn as one scatter series per cluster.

    Parameters
    ----------
    tracks : pandas.DataFrame
        Song table containing the seven columns listed above.
    n_cluster : int, default 3
        Number of k-means clusters.

    Returns
    -------
    None
        The figure is shown with ``plt.show()``.
    """
    from matplotlib import pyplot as plt

    feature_columns = ['valence', 'year', 'acousticness', 'danceability',
                       'duration_ms', 'energy', 'popularity']
    data = tracks[feature_columns]

    # NOTE(review): unlike PCA_algorithm, the features are not standardised
    # before PCA here; confirm whether that is intentional.
    projected = PCA(2).fit_transform(data)

    # Fixed seed (the same value KMeans_with_PCA_algorithm uses) so repeated
    # runs produce the same clusters.
    kmeans = KMeans(n_clusters=n_cluster, random_state=42)
    label = kmeans.fit_predict(projected)

    # One scatter series per cluster so each gets its own legend entry.
    for i in np.unique(label):
        in_cluster = label == i
        plt.scatter(projected[in_cluster, 0], projected[in_cluster, 1], label=i)
    plt.legend()
    plt.ylabel('PC2')
    plt.xlabel('PC1')
    plt.title("KMeans only clusters")
    plt.show()