In [170]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import linear_kernel
import pandas as pd
import os
import numpy as np

In [171]:
tracks = pd.read_csv(r"datasets/data.csv")

In [172]:
tfidf = TfidfVectorizer( stop_words='english')

#Replace NaN with an empty string
tracks['name'] = tracks['name'].fillna('')

tracks = tracks.head(25000)

#Construct the required TF-IDF matrix by fitting and transforming the data
tfidf_matrix = tfidf.fit_transform(tracks['name'])

#Output the shape of tfidf_matrix
tfidf_matrix.shape


(25000, 15701)

In [173]:
cosine_sim = linear_kernel(tfidf_matrix, tfidf_matrix)

In [174]:

indices = pd.Series(tracks.index, index=tracks['name']).drop_duplicates()


In [175]:
def get_recommendations(title, cosine_sim=cosine_sim):
    idx = indices[title]

    sim_scores = list(enumerate(cosine_sim[idx]))
    sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)
    sim_scores = sim_scores[1:11]

    track_indices = [i[0] for i in sim_scores]
    return tracks['name'].iloc[track_indices].tolist()

In [176]:
get_recommendations("I Might Fall Back On You")

['Fall Back Down',
 'Fall For You',
 'I Might Fall Back On You',
 'When I Fall In Love',
 'When I Fall In Love',
 'When I Fall In Love',
 'When I Fall In Love',
 'If I Ever Fall In Love',
 'If I Ever Fall In Love',
 'I Could Fall In Love']

In [177]:
from collections import defaultdict
from scipy.spatial.distance import cdist
import difflib
from sklearn.pipeline import Pipeline 
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans

In [178]:
tracks.to_csv('tracks_with_cluster.csv')

In [179]:
import spotipy
from spotipy.oauth2 import SpotifyClientCredentials
from collections import defaultdict

sp = spotipy.Spotify(auth_manager=SpotifyClientCredentials(client_id="677a46ed628944af94b5cdb96e3e25ee",
                                                           client_secret="dfc3a58da7e34b009001ff1794592060"))

def find_song(name):
    song_data = defaultdict()
    results = sp.search(q= 'track: {}'.format(name), limit=1)
    if results['tracks']['items'] == []:
        return None

    results = results['tracks']['items'][0]
    track_id = results['id']
    audio_features = sp.audio_features(track_id)[0]

    song_data['name'] = [name]
    song_data['explicit'] = [int(results['explicit'])]
    song_data['duration_ms'] = [results['duration_ms']]
    song_data['popularity'] = [results['popularity']]

    for key, value in audio_features.items():
        song_data[key] = value

    return pd.DataFrame(song_data)


In [180]:
def get_song_data(song, spotify_data):
    
    """
    Gets the song data for a specific song. The song argument takes the form of a dictionary with 
    key-value pairs for the name and release year of the song.
    """
    
    try:
        song_data = spotify_data[(spotify_data['name'] == song['name']) ].iloc[0]
        return song_data
    
    except IndexError:
        return find_song(song['name'])

In [181]:
def PCA_algorithm(tracks):
    num_datatypes = tracks.select_dtypes(np.number)      

    pca_pipeline = Pipeline([('scaler', StandardScaler()), ('PCA', PCA(n_components=2))])
    song_embedding = pca_pipeline.fit_transform(num_datatypes)
    return song_embedding

In [182]:
def KMeans_with_PCA_algorithm(song_embedding):
    kmeans_pca = KMeans(n_clusters = 3, init = 'k-means++', random_state=42)    
    return kmeans_pca.fit(song_embedding)

In [183]:
def filter_based_on_segment(df_segm_pca_kmeans,segment_val):    
    #get list of songs with above song id
    #print(df_segm_pca_kmeans[df_segm_pca_kmeans['Segment K-means PCA']==segment_val].values)
    
    filtered_data_per_segment = df_segm_pca_kmeans[df_segm_pca_kmeans['Segment K-means PCA']==segment_val][['id', 'name','com1','com2','Segment K-means PCA']]

    #save to csv file
    filtered_data_per_segment.to_csv('filtered_data_per_segment.csv')
    return filtered_data_per_segment

In [184]:
def filter_based_on_cluster_centroid(kmeans_pca,filtered_data_per_segment,segment_val,tracks):
    #calculate distance based on cluster centroid
    scaled_data = kmeans_pca.transform( filtered_data_per_segment[['com1','com2']])
    scaled_song_center = kmeans_pca.transform(kmeans_pca.cluster_centers_[segment_val].reshape(1, -1))
    distances = cdist(scaled_song_center, scaled_data, 'cosine')

    #sort based on distance
    index = list(np.argsort(distances)[0])
    rec_songs = tracks.iloc[index]
    
    #recommend based on segment data
    rec_songs = rec_songs[~rec_songs['name'].isin(filtered_data_per_segment['name'])]
    return rec_songs

In [185]:
from sklearn.decomposition import PCA

def recommend_songs(song_id, n_songs=10):
  
    """
    Recommends songs based on a list of previous songs that a user has listened to.
    """
    song_embedding = PCA_algorithm(tracks)
    
    kmeans_pca = KMeans_with_PCA_algorithm(song_embedding)

    #get Segment K-means PCA from song_id argument
    df_segm_pca_kmeans = pd.concat([tracks.reset_index(drop=True), pd.DataFrame(song_embedding)],axis=1)
    df_segm_pca_kmeans.columns.values[-2:] = ['com1','com2']
    df_segm_pca_kmeans['Segment K-means PCA'] = kmeans_pca.labels_

    #get segment value of song_id
    segment_val = df_segm_pca_kmeans[df_segm_pca_kmeans['id'] == song_id]['Segment K-means PCA'].values[0]
    
    filtered_data_per_segment = filter_based_on_segment(df_segm_pca_kmeans,segment_val)   

    rec_songs = filter_based_on_cluster_centroid(kmeans_pca,filtered_data_per_segment,segment_val,tracks)

    #recommend top n songs
    return rec_songs.head(n_songs)['name'].tolist()

In [186]:
recommend_songs("3w3cxwYuR7ThpE8KVSys5x")



X has feature names, but KMeans was fitted without feature names



['Danza cubana no.3: Serenata melancolica',
 'Lento, ma non troppo',
 'Riase - Remasterizado',
 'Daahoud',
 'Katha Chhilo Aaj Raate',
 "I've Got A Crush On You",
 'If Dreams Come True - Live',
 'Enta Fakerny',
 'Autumn Leaves - 1999 / Digital Remaster',
 'Fugue in G Minor, BWV 1000 (arr. A. Segovia)']

In [187]:
# import plotly.express as px
# fig = px.scatter(projection, x='x', y='y', color='cluster', hover_data=['x', 'y', 'title'])
# fig.show()

In [188]:
# df = pd.read_csv(r"C:/Users/shris/Desktop/profiles.csv")
# print(df.gender)
# df.columns = ['userid', 'gender', 'age', 'country']

In [189]:
# df.head()

In [190]:
# songs = df.gender.tolist()
# output = {}
# outputs = []
# for song in songs:
#     k = recommend_songs(song)
#     output.update( {song : k})


In [191]:
# output

In [192]:
# df1 = pd.DataFrame.from_dict(output, orient='index')

In [193]:
# df1.to_csv('content_based.csv')