<a href="https://colab.research.google.com/github/prishanmu/Music_Recommender/blob/master/Spotify_Music_Recommender.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Import Statements

In [0]:
import pandas as pd
import numpy as np

# Import & Process Data

source: https://www.kaggle.com/yamaerenay/spotify-dataset-19212020-160k-tracks

In [3]:
df = pd.read_csv('data.csv')
df.drop(columns = ['Unnamed: 0', 'duration_ms', 'key', 'mode', 'year', 'popularity'], inplace=True)
df.head()

Unnamed: 0,acousticness,artists,danceability,energy,explicit,id,instrumentalness,liveness,loudness,name,release_date,speechiness,tempo,valence
0,0.732,['Dennis Day'],0.819,0.341,0,7xPhfUan2yNtyFG0cUWkt8,0.0,0.16,-12.441,Clancy Lowered the Boom,1921,0.415,60.936,0.963
1,0.982,"['Sergei Rachmaninoff', 'James Levine', 'Berli...",0.279,0.211,0,4BJqT0PrAfrxzMOxytFOIz,0.878,0.665,-20.096,"Piano Concerto No. 3 in D Minor, Op. 30: III. ...",1921,0.0366,80.954,0.0594
2,0.996,['John McCormack'],0.518,0.203,0,5uNZnElqOS3W4fRmRYPk4T,0.0,0.115,-10.589,The Wearing of the Green,1921,0.0615,66.221,0.406
3,0.982,"['Sergei Rachmaninoff', 'James Levine', 'Berli...",0.279,0.211,0,1SCWBjhk5WmXPxhDduD3HM,0.878,0.665,-20.096,"Piano Concerto No. 3 in D Minor, Op. 30: III. ...",1921,0.0366,80.954,0.0594
4,0.957,['Phil Regan'],0.418,0.193,0,4d6HGyGT8e121BsdKmw9v6,2e-06,0.229,-10.096,When Irish Eyes Are Smiling,1921,0.038,101.665,0.253


In [4]:
df_artist = pd.read_csv('data_w_genres.csv')
df_artist.drop(columns = ['Unnamed: 0', 'duration_ms', 'key', 'mode', 'count', 'popularity'], inplace = True)
df_artist.head()

Unnamed: 0,artists,acousticness,danceability,energy,instrumentalness,liveness,loudness,speechiness,tempo,valence,genres
0,Francisco Canaro,0.983072,0.654711,0.292622,0.490675,0.201118,-11.733373,0.111007,123.608786,0.746469,"['tango', 'vintage tango']"
1,Frédéric Chopin,0.989961,0.340087,0.106874,0.876899,0.155677,-22.575578,0.042913,90.977772,0.203644,"['classical', 'early romantic era', 'polish cl..."
2,Ludwig van Beethoven,0.955019,0.340157,0.153176,0.69077,0.164078,-20.107704,0.05555,104.833536,0.260255,"['classical', 'classical era', 'early romantic..."
3,Wolfgang Amadeus Mozart,0.962084,0.353895,0.138348,0.514837,0.187091,-20.214154,0.067756,108.59492,0.332855,"['classical', 'classical era']"
4,Johann Sebastian Bach,0.958405,0.354224,0.201932,0.744322,0.165837,-20.936518,0.048844,106.551869,0.570526,"['baroque', 'classical', 'early music', 'germa..."


In [5]:
# normalize columns
from sklearn.preprocessing import MinMaxScaler

scaler = MinMaxScaler()

df_no_artist_no_genre = pd.DataFrame(scaler.fit_transform(df_artist[['acousticness', 'danceability', 'energy', 'instrumentalness', 'liveness', 'loudness', 'speechiness', 'tempo', 'valence']]))

artist_genre = df_artist[['artists', 'genres']]

df_artists = pd.merge(df_no_artist_no_genre, artist_genre, left_index=True, right_index=True)
df_artists.columns = ['acousticness', 'danceability', 'energy', 'instrumentalness', 'liveness', 'loudness', 'speechiness', 'tempo', 'valence', 'artists', 'genres']
df_artists.head()

Unnamed: 0,acousticness,danceability,energy,instrumentalness,liveness,loudness,speechiness,tempo,valence,artists,genres
0,0.987268,0.693733,0.295613,0.502484,0.173374,0.711058,0.115434,0.657573,0.769714,Francisco Canaro,"['tango', 'vintage tango']"
1,0.994186,0.360357,0.107953,0.898002,0.123383,0.409608,0.044625,0.483983,0.209985,Frédéric Chopin,"['classical', 'early romantic era', 'polish cl..."
2,0.959094,0.360432,0.154732,0.707393,0.132625,0.478223,0.057765,0.557692,0.268359,Ludwig van Beethoven,"['classical', 'classical era', 'early romantic..."
3,0.96619,0.374988,0.139752,0.527227,0.157942,0.475264,0.070458,0.577702,0.34322,Wolfgang Amadeus Mozart,"['classical', 'classical era']"
4,0.962495,0.375337,0.20399,0.762235,0.13456,0.45518,0.050792,0.566833,0.588293,Johann Sebastian Bach,"['baroque', 'classical', 'early music', 'germa..."


# GOAL: get a list of similar artists, with genre specified

source: https://www.kaggle.com/florianheiny/spotify-artist-recommendation/notebook

In [0]:
def similar_artist(df, artist): 

  # similarity matrix

  df['similarity'] = np.linalg.norm(df[['acousticness', 'danceability', 'energy', 'instrumentalness', 'liveness', 'loudness', 'speechiness', 'tempo', 'valence']] - df[df['artists'] == artist][['acousticness', 'danceability', 'energy', 'instrumentalness','liveness', 'loudness', 'speechiness', 'tempo', 'valence']].to_numpy(), axis=1)
  df = df.sort_values('similarity')
  df['similarity'] = 1- df['similarity'] / df['similarity'].max()

  similar_artist_list = list(zip(df['artists'].to_numpy()[1:], df['similarity'].to_numpy()[1:]))

  return similar_artist_list


In [0]:
def artist_rec(df, artist_rating_matrix, genre): 
    # filter out by genre

  for line in df.values.tolist():
    df_genre = []
    lst = line[10]
    if genre in lst:
      df_genre.append(line)
  df_genre = pd.DataFrame(df_genre)

    # similar artist dictionary

  dict_similar = {}
  for artist, weight in artist_rating_matrix.items():
        dict_similar[artist] = similar_artist(artist)
  artists_all = []
  for artist, similar_artists in dict_similar.items():
    artists_all.append(list(similar_artists.keys()))
    artists_unique = np.unique(artists_all).tolist()
    artists_dict = {artist: 0 for artist in artists_unique}
  for artist, similar_artists in dict_similar.items():
    for similar_artist, score in similar_artists.items():
      artists_dict[similar_artist] += artists[artist] * score

  return list({k: v for k, v in sorted(artists_dict.items(), key=lambda item: item[1], reverse=True) if k not in artists}.keys())[0:num]


In [12]:
#test

artist_dict = {'Kehlani': 8, 'Lady Gaga': 5}

artist_rec(df = df_artist, artist_rating_matrix = artist_dict, genre = 'pop')

TypeError: ignored