# Capstone Project

## *Recommender System*

#### Table of Contents

* [Recommender Functions](#topic-1)
* [Try it without Popularity](#topic-2)

### Import Libraries & Read in Data

In [1]:
## standard imports 
import pandas as pd 
import numpy as np
import re
## visualizations
import matplotlib.pyplot as plt
import seaborn as sns

import spotipy
from spotipy.oauth2 import SpotifyClientCredentials
from sklearn.metrics.pairwise import linear_kernel, cosine_similarity

### Spotify credentials - must be set in the local environment before running.
### No client id / secret is passed here, so spotipy has to find them itself
### (presumably the SPOTIPY_CLIENT_ID / SPOTIPY_CLIENT_SECRET environment variables - confirm in the spotipy docs).
### `sp` is the shared API client that display_playlist() uses to look up track metadata.
auth_manager = SpotifyClientCredentials()
sp = spotipy.Spotify(auth_manager=auth_manager)

## options
# pd.options.display.max_rows = 4000
# pd.options.display.max_columns = 100
# pd.set_option('max_colwidth', 100)

In [2]:
### read in data
### df:    library of tracks with their Spotify audio features (candidates to recommend from)
### track: the seed track (WAP) that recommendations are generated for
df = pd.read_csv('../data/hiphop_clustering.csv')
track = pd.read_csv('../data/WAP.csv')

In [3]:
df.head(2)

Unnamed: 0,artist_name,track_id,popularity,acousticness,danceability,energy,instrumentalness,liveness,loudness,speechiness,tempo,valence
0,J. Cole,2JvzF1RMd7lE3KmFlsyZD8,0.96,0.149,0.837,0.364,0.0,0.271,-11.713,0.276,123.984,0.463
1,Meek Mill,2IRZnDFmlqMuOrYOLnZZyc,0.95,0.259,0.889,0.496,0.0,0.252,-6.365,0.0905,86.003,0.544


In [4]:
### 'key' and 'mode' are not columns of the library dataframe (df), so drop them from the seed track.
### Rebinding (instead of inplace=True) with errors='ignore' keeps this cell safe to re-run:
### a second execution no longer raises KeyError because the columns are already gone.
track = track.drop(columns=['key', 'mode'], errors='ignore')

In [5]:
track

Unnamed: 0,artist_name,track_id,popularity,danceability,energy,loudness,speechiness,acousticness,instrumentalness,liveness,valence,tempo
0,Cardi B,4Oun2ylbjFKMPTiaSbbCih,0.97,0.935,0.454,-7.509,0.375,0.0194,0,0.0824,0.357,133.073


In [6]:
# pd.concat([df, track], ignore_index=True)

## Recommender Functions <a class="anchor" id="topic-1"></a>
<hr/>

In [47]:
def track_reccommender(df, track, include_pop=True, n_recs=15):
    """Rank tracks by cosine similarity to a seed track.

    Parameters
    ----------
    df : pandas.DataFrame
        Library of candidate tracks. Must contain 'track_id' and the feature
        columns listed in `features` below.
    track : pandas.DataFrame
        The seed track; only its first row is used as the query.
    include_pop : bool, default True
        When False, 'popularity' is left out of the feature vector.
    n_recs : int, default 15
        Number of rows to return.

    Returns
    -------
    pandas.DataFrame
        The `n_recs` most similar rows of `df` + `track` (the seed track itself
        is part of the pool, so it normally comes back with a score of ~1.0),
        with an added 'score' column, sorted by score in descending order.
        The inputs are not modified.
    """
    ### Create X data: library rows first, seed track appended after them
    data = pd.concat([df, track], ignore_index=True)
    ### desired features for model (may change later)
    features = ['popularity', 'acousticness', 'danceability', 'energy', 'speechiness', 'valence']
    if not include_pop:
        features.remove('popularity')
    X = data[features]

    ### ignore_index=True renumbers the rows 0..n-1, so the first row of `track`
    ### always sits right after the last row of `df`. Locating it by position
    ### (rather than by track_id) still works when the seed track's ID is
    ### already present in the library.
    track_index = len(df)

    ### only the seed track's row of the similarity matrix is needed, so compute
    ### that single row instead of the full n x n matrix
    seed_scores = cosine_similarity(X.iloc[[track_index]], X)[0]
    similarity_scores = pd.Series(seed_scores, index=data.index)

    ### top matches, highest score first
    top_scores = similarity_scores.nlargest(n_recs)

    ### create dataframe of recommended tracks; 'score' is aligned on the row index
    recc_tracks_df = data.loc[top_scores.index].copy()
    recc_tracks_df['score'] = top_scores

    return recc_tracks_df

def make_track_URIs(track_ids):
    """Return the Spotify track URI for every ID in `track_ids`, in order.

    Spotify needs the text 'spotify:track:' in front of each track ID.
    """
    return ['spotify:track:' + one_id for one_id in track_ids]

def create_playlist_file(track_ids, name):
    """Write the Spotify URIs for `track_ids` to ../playlists/playlist_<name>.txt.

    One URI per line, each terminated by a newline. An existing file with the
    same name is overwritten. The target folder is created if it is missing.

    Parameters
    ----------
    track_ids : iterable of str
        Spotify track IDs (without the 'spotify:track:' prefix).
    name : str
        Suffix used in the output file name.

    Returns
    -------
    None
    """
    import os  # local import: the notebook's import cell does not bring in os

    track_URIs = make_track_URIs(track_ids)
    playlist_path = fr'../playlists/playlist_{name}.txt'
    ### make sure the output folder exists so open() does not fail on a fresh checkout
    os.makedirs(os.path.dirname(playlist_path), exist_ok=True)
    ### the context manager closes the file even if writing raises
    with open(playlist_path, 'w') as playlist:
        playlist.writelines('%s\n' % uri for uri in track_URIs)

def display_playlist(playlist_tracks):
    """Look up title, artist and album for each track and return them as a table.

    Parameters
    ----------
    playlist_tracks : iterable of str
        Spotify track URIs / IDs, in playlist order.

    Returns
    -------
    pandas.DataFrame
        Columns 'Title', 'Artist' (first listed artist only) and 'Album', one
        row per input track, indexed from 1. A track that Spotify does not
        return (null entry in the response) gets a row of missing values so
        the numbering stays aligned with the input.
    """
    ### Spotify's "several tracks" endpoint accepts at most 50 IDs per request
    max_ids_per_request = 50

    playlist_tracks = list(playlist_tracks)
    playlist_info = []
    ### query in batches; an empty playlist makes no API call at all
    for start in range(0, len(playlist_tracks), max_ids_per_request):
        batch = playlist_tracks[start:start + max_ids_per_request]
        for track_info in sp.tracks(batch)['tracks']:
            if track_info is None:
                ### unknown / unavailable track: keep its position with an empty row
                playlist_info.append([None, None, None])
                continue
            playlist_info.append([
                track_info['name'],
                track_info['artists'][0]['name'],
                track_info['album']['name'],
            ])

    playlist_df = pd.DataFrame(playlist_info, columns=['Title', 'Artist', 'Album'])
    ### start index at 1
    playlist_df.index = np.arange(1, len(playlist_df) + 1)
    return playlist_df

In [48]:
results = track_reccommender(df, track, include_pop=True)
results

Unnamed: 0,artist_name,track_id,popularity,acousticness,danceability,energy,instrumentalness,liveness,loudness,speechiness,tempo,valence,score
9295,Cardi B,4Oun2ylbjFKMPTiaSbbCih,0.97,0.0194,0.935,0.454,0.0,0.0824,-7.509,0.375,133.073,0.357,1.0
48,21 Savage,2fQrGHiQOvpL9UgPvtYy6G,0.84,0.0151,0.884,0.346,7e-06,0.0871,-8.228,0.351,75.016,0.376,0.997328
155,Future,0VgkVdmE4gld66l8iyGjgx,0.82,0.0102,0.833,0.434,0.0219,0.165,-8.795,0.431,150.062,0.281,0.99625
1389,XXXTENTACION,5b6zjsBrZCqe6RFycLz2tc,0.67,0.0762,0.606,0.349,0.0711,0.1,-10.032,0.198,180.055,0.239,0.995345
194,Drake,4Kz4RdRCceaA9VgTqBhBfa,0.75,0.000107,0.766,0.442,6.1e-05,0.111,-8.558,0.356,201.8,0.39,0.994348
243,21 Savage,2wOXxtHZgRkkrkEbKLzzqs,0.77,0.000244,0.885,0.52,1e-06,0.454,-8.353,0.359,84.022,0.306,0.993051
448,Future,4VMiNOpnjRwSCwmHaUfOMM,0.72,0.0117,0.73,0.465,0.0,0.28,-6.29,0.278,148.028,0.367,0.992899
166,ScHoolboy Q,4LmAnpjlhWTahvRkYR8xJa,0.77,0.0201,0.768,0.471,0.0,0.268,-8.406,0.259,131.023,0.405,0.992471
1553,Fat Nick,3Bo3lNVQTtZYMCoItX8dKW,0.68,0.00096,0.723,0.459,0.0,0.11,-10.259,0.239,139.972,0.313,0.992404
1310,Future,6DB2KOEwHnjkgEnBt5SdeJ,0.71,0.0111,0.727,0.431,0.0166,0.506,-12.007,0.395,150.005,0.223,0.992364


In [49]:
results.columns

Index(['artist_name', 'track_id', 'popularity', 'acousticness', 'danceability',
       'energy', 'instrumentalness', 'liveness', 'loudness', 'speechiness',
       'tempo', 'valence', 'score'],
      dtype='object')

In [51]:
### reorder the columns for display: artist and similarity score first,
### then the audio features, with the track_id last
cols = ['artist_name','score', 'popularity', 'danceability',
       'energy', 'valence', 'speechiness', 'acousticness', 'instrumentalness', 'liveness', 'loudness','tempo',
       'track_id']
results = results[cols]
results

Unnamed: 0,artist_name,score,popularity,danceability,energy,valence,speechiness,acousticness,instrumentalness,liveness,loudness,tempo,track_id
9295,Cardi B,1.0,0.97,0.935,0.454,0.357,0.375,0.0194,0.0,0.0824,-7.509,133.073,4Oun2ylbjFKMPTiaSbbCih
48,21 Savage,0.997328,0.84,0.884,0.346,0.376,0.351,0.0151,7e-06,0.0871,-8.228,75.016,2fQrGHiQOvpL9UgPvtYy6G
155,Future,0.99625,0.82,0.833,0.434,0.281,0.431,0.0102,0.0219,0.165,-8.795,150.062,0VgkVdmE4gld66l8iyGjgx
1389,XXXTENTACION,0.995345,0.67,0.606,0.349,0.239,0.198,0.0762,0.0711,0.1,-10.032,180.055,5b6zjsBrZCqe6RFycLz2tc
194,Drake,0.994348,0.75,0.766,0.442,0.39,0.356,0.000107,6.1e-05,0.111,-8.558,201.8,4Kz4RdRCceaA9VgTqBhBfa
243,21 Savage,0.993051,0.77,0.885,0.52,0.306,0.359,0.000244,1e-06,0.454,-8.353,84.022,2wOXxtHZgRkkrkEbKLzzqs
448,Future,0.992899,0.72,0.73,0.465,0.367,0.278,0.0117,0.0,0.28,-6.29,148.028,4VMiNOpnjRwSCwmHaUfOMM
166,ScHoolboy Q,0.992471,0.77,0.768,0.471,0.405,0.259,0.0201,0.0,0.268,-8.406,131.023,4LmAnpjlhWTahvRkYR8xJa
1553,Fat Nick,0.992404,0.68,0.723,0.459,0.313,0.239,0.00096,0.0,0.11,-10.259,139.972,3Bo3lNVQTtZYMCoItX8dKW
1310,Future,0.992364,0.71,0.727,0.431,0.223,0.395,0.0111,0.0166,0.506,-12.007,150.005,6DB2KOEwHnjkgEnBt5SdeJ


In [52]:
make_track_URIs(results['track_id'])

['spotify:track:4Oun2ylbjFKMPTiaSbbCih',
 'spotify:track:2fQrGHiQOvpL9UgPvtYy6G',
 'spotify:track:0VgkVdmE4gld66l8iyGjgx',
 'spotify:track:5b6zjsBrZCqe6RFycLz2tc',
 'spotify:track:4Kz4RdRCceaA9VgTqBhBfa',
 'spotify:track:2wOXxtHZgRkkrkEbKLzzqs',
 'spotify:track:4VMiNOpnjRwSCwmHaUfOMM',
 'spotify:track:4LmAnpjlhWTahvRkYR8xJa',
 'spotify:track:3Bo3lNVQTtZYMCoItX8dKW',
 'spotify:track:6DB2KOEwHnjkgEnBt5SdeJ',
 'spotify:track:2Grb4G6t9VIqo6moKUloom',
 'spotify:track:2b4SSorCTQ2VzmllaeWuuT',
 'spotify:track:3mvYQKm8h6M5K5h0nVPY9S',
 'spotify:track:2bjwRfXMk4uRgOD9IBYl9h',
 'spotify:track:4JsKaTag0OHXibnqYcDfIA']

In [53]:
create_playlist_file(results['track_id'], 'cosine-sim')

In [54]:
display_playlist(make_track_URIs(results['track_id']))

Unnamed: 0,Title,Artist,Album
1,WAP (feat. Megan Thee Stallion),Cardi B,WAP (feat. Megan Thee Stallion)
2,Bank Account,21 Savage,Issa Album
3,Mask Off,Future,FUTURE
4,schizophrenia,XXXTENTACION,?
5,The Motto,Drake,Take Care (Deluxe)
6,1.5,21 Savage,i am > i was
7,Realer N Realer,Future,Future & Juice WRLD Present... WRLD ON DRUGS
8,X (with 2 Chainz & Saudi),ScHoolboy Q,Black Panther The Album Music From And Inspire...
9,P.S Fuck You Cunt (feat. Lil Peep),Fat Nick,When the Lean Runs Out
10,Mask Off (feat. Kendrick Lamar) - Remix,Future,Mask Off (feat. Kendrick Lamar) [Remix]


## Try it without Popularity <a class="anchor" id="topic-2"></a>
<hr/>

In [12]:
results_nopop = track_reccommender(df, track, include_pop=False)

In [13]:
results_nopop

Unnamed: 0,artist_name,track_id,popularity,acousticness,danceability,energy,instrumentalness,liveness,loudness,speechiness,tempo,valence
3113,The Sugarhill Gang,0FWhGmPVxLI6jOVF0wjALa,0.56,0.465,0.9,0.845,0.0,0.0724,-7.306,0.257,111.187,0.914
3191,Ice Cube,4r13PlX7hKjNxdyq5ukR5A,0.63,0.000917,0.946,0.857,0.0,0.0525,-4.498,0.159,99.88,0.882
3627,$uicideBoy$,6OB7Xp69uSWA4vDwcKgEDA,0.6,0.119,0.905,0.817,0.0,0.15,-4.26,0.397,140.083,0.8
4543,Noname,1XiKRU9Dg7QPlGRSWbfXgo,0.57,0.091,0.891,0.827,2e-06,0.221,-6.139,0.351,100.987,0.861
4749,G-Eazy,6uz0z5oMeC8QWfTHeR5qlF,0.56,0.241,0.838,0.781,0.0,0.105,-8.249,0.45,135.941,0.913
4854,Lil Pump,6ox6GMJt3WP2QghksDa6UO,0.6,0.0263,0.917,0.76,0.0,0.0976,-3.29,0.38,136.014,0.813
4922,Key Glock,3RHOFe6m1g0nCZ7CClVKn0,0.54,0.00134,0.935,0.772,2.8e-05,0.11,-8.773,0.354,139.999,0.875
5873,Young Dolph,0xSLJxGbnhA6FHtar3CEmc,0.51,0.106,0.866,0.947,0.0,0.631,-5.365,0.272,132.013,0.813
6969,2 LIVE CREW,0ioFIQXbTUaliQpsJyX4zN,0.49,0.00107,0.895,0.9,0.000661,0.239,-8.905,0.195,131.671,0.896
7329,EPMD,0f1yzIIGD8k32DNVeEn6jb,0.5,0.089,0.882,0.814,0.0,0.16,-5.214,0.424,99.689,0.854


In [14]:
display_playlist(make_track_URIs(results_nopop['track_id']))

Unnamed: 0,Title,Artist,Album
1,Rapper's Delight,The Sugarhill Gang,Rhino Hi-Five: The Sugarhill Gang
2,You Can Do It - Feat. Mack 10 And Ms Toi,Ice Cube,War & Peace Vol. 2 (The Peace Disc)
3,HUNG UP ON THE COME UP,$uicideBoy$,HUNG UP ON THE COME UP
4,Blaxploitation,Noname,Room 25
5,Loaded (Bonus Track) [feat. DJ Carnage],G-Eazy,Must Be Nice
6,Smoke My Dope (feat. Smokepurpp),Lil Pump,Lil Pump
7,Dope,Key Glock,Glockoma
8,What's the Deal,Young Dolph,Thinking Out Loud
9,Hoochie Mama,2 LIVE CREW,Goes To the Movies: Decade of Hits
10,You Gots To Chill,EPMD,Strictly Business


In [39]:

### Toy sanity check of cosine_similarity on a small 3x3 array.
### cosine_similarity(Y, Y) returns a 3x3 matrix with 1.0 on the diagonal;
### row i (e.g. s[1]) holds the similarity of Y[i] to every row of Y,
### which is how the recommender reads off the scores for a single track.
Y = np.array([
    [1,2,3],
    [6,4,5],
    [9,7,8]
]
)

s = cosine_similarity(Y, Y)
s

array([[1.        , 0.88326011, 0.90184723],
       [0.88326011, 1.        , 0.99819089],
       [0.90184723, 0.99819089, 1.        ]])

In [38]:
s[1]

array([0.88326011, 1.        , 0.99819089])