In [1]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import linear_kernel
import pandas as pd
import os
import numpy as np

In [2]:
os.getcwd()

'/Users/bipinkarki/Desktop/ML-ABKS-Music-Recommender-System'

In [3]:
tracks = pd.read_csv(r"Notebooks/datasets/tracks.csv")

In [4]:
tracks.tail(30)

Unnamed: 0,track-id,track-name,genre,artist-id,artist-name,rating,popularity,instruments
9970,9971,Track 9971,Rock,806,Artist 806,1.666964,8,"['Guitar', 'Violin', 'Vocals']"
9971,9972,Track 9972,Latin,685,Artist 685,2.773753,6,"['Saxophone', 'Cello', 'Guitar']"
9972,9973,Track 9973,Jazz,123,Artist 123,1.541703,6,"['Guitar', 'Violin', 'Saxophone', 'Vocals', 'B..."
9973,9974,Track 9974,Pop,890,Artist 890,2.597888,6,"['Trumpet', 'Guitar', 'Piano', 'Vocals', 'Saxo..."
9974,9975,Track 9975,Country,316,Artist 316,0.945645,3,"['Trumpet', 'Piano', 'Bass', 'Cello', 'Saxopho..."
9975,9976,Track 9976,Classical,804,Artist 804,0.954592,6,"['Trumpet', 'Bass', 'Drums', 'Saxophone', 'Cel..."
9976,9977,Track 9977,Hip Hop,523,Artist 523,4.699004,2,['Drums']
9977,9978,Track 9978,Jazz,534,Artist 534,3.383778,1,"['Piano', 'Cello', 'Vocals', 'Violin']"
9978,9979,Track 9979,Pop,871,Artist 871,2.776193,2,"['Saxophone', 'Guitar', 'Trumpet']"
9979,9980,Track 9980,Pop,580,Artist 580,4.15291,1,"['Vocals', 'Piano', 'Trumpet', 'Guitar']"


In [5]:
tfidf = TfidfVectorizer( stop_words='english')

#Replace NaN with an empty string
tracks['track-name'] = tracks['track-name'].fillna('')

tracks = tracks.head(25000)

#Construct the required TF-IDF matrix by fitting and transforming the data
tfidf_matrix = tfidf.fit_transform(tracks['track-name'])

#Output the shape of tfidf_matrix
tfidf_matrix.shape


(10000, 9992)

In [6]:
cosine_sim = linear_kernel(tfidf_matrix, tfidf_matrix)

In [7]:

indices = pd.Series(tracks.index, index=tracks['track-name']).drop_duplicates()


In [8]:
def get_recommendations(title, cosine_sim=cosine_sim):
    idx = indices[title]

    sim_scores = list(enumerate(cosine_sim[idx]))
    sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)
    sim_scores = sim_scores[1:11]

    track_indices = [i[0] for i in sim_scores]
    print(tracks[title].iloc[track_indices].tolist())
    return tracks['track-id'].iloc[track_indices].tolist()

In [9]:
get_recommendations("Track 9998")

KeyError: 'Track 9998'

In [None]:
# Returns the list top 3 elements or entire list; whichever is more.
def get_list(x):
    if isinstance(x, list):
        names = [i['track-name'] for i in x]
        #Check if more than 3 elements exist. If yes, return only first three. If no, return entire list.
        if len(names) > 3:
            names = names[:3]
        return names

    #Return empty list in case of missing/malformed data
    return []


In [None]:
def clean_data(x):
    if isinstance(x, list):
        return [str.lower(i.replace(" ", "")) for i in x]
    else:
        #Check if director exists. If not, return empty string
        if isinstance(x, str):
            return str.lower(x.replace(" ", ""))
        else:
            return ''

In [None]:
features = ['artist-name','popularity', 'genre', 'instruments']
#features = ['artists']
for feature in features:
    tracks[feature] = tracks[feature].apply(clean_data)
#     tracks[feature] = tracks[feature].apply(eval)

In [None]:
tracks.head(3)

In [None]:
def cos_sim(a,b):
    dot_product = np.dot(a,b)
    norm_a = linalg.norm(a)
    norm_b = linalg.norm(b)
    return dot_product/(norm_a*norm_b)

In [None]:
https://towardsdatascience.com/how-to-build-an-amazing-music-recommendation-system-4cce2719a572

In [None]:
from collections import defaultdict
from scipy.spatial.distance import cdist
import difflib
from sklearn.pipeline import Pipeline 
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans

In [None]:
song_cluster_pipeline = Pipeline([('scaler', StandardScaler()), 
                                  ('kmeans', KMeans(n_clusters=20, 
                                   verbose=2))],verbose=True)
X = tracks.select_dtypes(np.number)
number_cols = list(X.columns)
song_cluster_pipeline.fit(X)
song_cluster_labels = song_cluster_pipeline.predict(X)
tracks['cluster_label'] = song_cluster_labels

In [None]:
tracks.to_csv('tracks_with_cluster.csv')

In [None]:
tracks.head()

In [None]:
from sklearn.decomposition import PCA

pca_pipeline = Pipeline([('scaler', StandardScaler()), ('PCA', PCA(n_components=2))])
song_embedding = pca_pipeline.fit_transform(X)
projection = pd.DataFrame(columns=['x', 'y'], data=song_embedding)
projection['title'] = tracks['track-name']
projection['cluster'] = tracks['cluster_label']

In [None]:
import plotly.express as px
fig = px.scatter(projection, x='x', y='y', color='cluster', hover_data=['x', 'y', 'title'])
fig.show()

In [None]:
number_cols = ['time_signature','duration_ms','popularity','valence', 'acousticness', 'danceability',  'energy', 'instrumentalness', 'explicit','key', 'liveness', 'loudness', 'mode', 'speechiness', 'tempo']

In [None]:
def get_song_data(song, spotify_data):
    
    """
    Gets the song data for a specific song. The song argument takes the form of a dictionary with 
    key-value pairs for the name and release year of the song.
    """
    
    try:
        song_data = spotify_data[(spotify_data['track-name'] == song['track-name']) ].iloc[0]
        return song_data
    
    except IndexError:
        return find_song(song['track-name'])

In [None]:
def get_mean_vector(song_list, spotify_data):
  
    """
    Gets the mean vector for a list of songs.
    """
    
    song_vectors = []
    
    for song in song_list:
        song_data = get_song_data(song, spotify_data)
        if song_data is None:
            print('Warning: {} does not exist in Spotify or in database'.format(song['track-name']))
            continue
        song_vector = song_data[number_cols].values
        song_vectors.append(song_vector)  
    
    song_matrix = np.array(list(song_vectors))
    return np.mean(song_matrix, axis=0)

In [None]:
def flatten_dict_list(dict_list):
   
    """
    Utility function for flattening a list of dictionaries.
    """
    
    flattened_dict = defaultdict()
    for key in dict_list[0].keys():
        flattened_dict[key] = []
    
    for dictionary in dict_list:
        for key, value in dictionary.items():
            flattened_dict[key].append(value)
            
    return flattened_dict

In [None]:
def recommend_songs(song_id, n_songs=10):
  
    """
    Recommends songs based on a list of previous songs that a user has listened to.
    """
    spotify_data = tracks
    song_name = tracks.loc[tracks.get("track-id") == song_id, 'track-name']
    print(song_name)
    song_list = [{'track-name': song_name}]
    print(song_list)
    song_dict = flatten_dict_list(song_list)
    
    song_center = get_mean_vector(song_list, spotify_data)
    scaler = song_cluster_pipeline.steps[0][1]
    print(scaler)
    scaled_data = scaler.transform(spotify_data[number_cols])
    scaled_song_center = scaler.transform(song_center.reshape(1, -1))
    distances = cdist(scaled_song_center, scaled_data, 'cosine')
    index = list(np.argsort(distances)[:, :n_songs][0])
    
    rec_songs = spotify_data.iloc[index]
    rec_songs = rec_songs[~rec_songs['track-name'].isin(song_dict['track-name'])]
    return rec_songs['id'].tolist()

In [None]:
recommend_songs('9975')

In [None]:
df = pd.read_csv(r"C:/Users/shris/Desktop/profiles.csv")
print(df.gender)
df.columns = ['userid', 'gender', 'age', 'country']

In [None]:
df.head()

In [None]:
songs = df.gender.tolist()
output = {}
outputs = []
for song in songs:
    k = recommend_songs(song)
    output.update( {song : k})


In [None]:
output

In [None]:
df1 = pd.DataFrame.from_dict(output, orient='index')

In [None]:
df1.to_csv('content_based.csv')