# Model v3


---
**Note**

We could construct a model which determines if a user will like a certain playlist based on comparing their playlists and liked playlists to the audio features of our playlist. Train model on users playlists and likes, classify playlist as 0 or 1 if the user will like the playlist.

I believe our final model will look something like this: https://developer.spotify.com/console/get-recommendations/

In [1]:
import numpy as np
import pandas as pd
from scipy.stats import ttest_ind
pd.set_option('display.max_columns', 25)

from sklearn.linear_model import LinearRegression
from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import LogisticRegressionCV
from sklearn.linear_model import LassoCV

from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import cross_val_score
from sklearn.metrics import accuracy_score
from sklearn.model_selection import KFold
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.decomposition import PCA
from sklearn import metrics
from sklearn.metrics import confusion_matrix

import matplotlib
import matplotlib.pyplot as plt
%matplotlib inline

import statsmodels.api as sm
from statsmodels.api import OLS

import seaborn as sns
sns.set()

import spotipy
from spotipy.oauth2 import SpotifyClientCredentials
import time
import requests

## Initialize API with keys

In [2]:
# https://stackoverflow.com/questions/30557409/spotify-api-post-call-response-415
def initiate_api():    
    client_id = "9cd3dd2ea2cf492ca28ab0247a79d781"
    client_secret = "11c972ad002843e9be5ecc31f022dd6e"
    grant_type = 'client_credentials'
    body_params = {'grant_type' : grant_type}
    url = 'https://accounts.spotify.com/api/token'
    response = requests.post(url, data=body_params, auth = (client_id, client_secret)) 
    client_credentials_manager = SpotifyClientCredentials(client_id, client_secret)
    return spotipy.Spotify(client_credentials_manager=client_credentials_manager)
sp = initiate_api()

### Read in pkl file of songs collected from first 30K playlists in dataset

In [3]:
songs = pd.read_pickle("pickles/songs_30k_dropped.pkl")
songs.head()

Unnamed: 0,index,acousticness,analysis_url,danceability,duration_ms,energy,id,instrumentalness,key,liveness,loudness,mode,speechiness,tempo,time_signature,track_href,type,uri,valence
53,1,3.2e-05,https://api.spotify.com/v1/audio-analysis/1MYY...,0.343,70294,0.975,1MYYt7h6amcrauCOoso3Gx,0.991,1,0.0515,-2.502,0,0.135,119.883,4,https://api.spotify.com/v1/tracks/1MYYt7h6amcr...,audio_features,spotify:track:1MYYt7h6amcrauCOoso3Gx,0.133
54,2,2.3e-05,https://api.spotify.com/v1/audio-analysis/3x2m...,0.414,65306,0.959,3x2mJ2bjCIU70NrH49CtYR,0.933,7,0.368,-4.299,0,0.0378,145.911,4,https://api.spotify.com/v1/tracks/3x2mJ2bjCIU7...,audio_features,spotify:track:3x2mJ2bjCIU70NrH49CtYR,0.589
55,3,0.0534,https://api.spotify.com/v1/audio-analysis/1Pm3...,0.522,108532,0.205,1Pm3fq1SC6lUlNVBGZi3Em,0.235,2,0.0985,-7.986,1,0.0376,103.868,4,https://api.spotify.com/v1/tracks/1Pm3fq1SC6lU...,audio_features,spotify:track:1Pm3fq1SC6lUlNVBGZi3Em,0.353
59,7,0.201,https://api.spotify.com/v1/audio-analysis/7dkb...,0.493,226000,0.969,7dkbEHIMLoeuG4zXGmzhEH,0.655,2,0.247,-3.282,0,0.0463,170.581,4,https://api.spotify.com/v1/tracks/7dkbEHIMLoeu...,audio_features,spotify:track:7dkbEHIMLoeuG4zXGmzhEH,0.34
88,36,0.00774,https://api.spotify.com/v1/audio-analysis/0hBb...,0.293,658987,0.787,0hBby0yygBY1u3m6tSpZgC,0.496,5,0.392,-8.841,1,0.0438,165.307,4,https://api.spotify.com/v1/tracks/0hBby0yygBY1...,audio_features,spotify:track:0hBby0yygBY1u3m6tSpZgC,0.138


In [4]:
songs.shape

(199252, 19)

### This function calculates the distance in our KNN Model

In [7]:
def distance(songs, seed_song):
    '''
    inputs: 
        idkey : song ID of seed song
        songs : Dataframe of songs to choose from
    outputs:
        dist : pandas Series containing calculated 'distance' from songs in the input dataframe to the seed song
    '''
    dist = np.sqrt((np.subtract(songs.acousticness, seed_song.acousticness))**2
                 + (np.subtract(songs.danceability, seed_song.danceability))**2
                 + (np.subtract(songs.energy, seed_song.energy))**2
                 + (np.subtract(songs.instrumentalness, seed_song.instrumentalness))**2
                 + (np.subtract(songs.liveness, seed_song.liveness))**2
                 + (np.subtract(songs.speechiness, seed_song.speechiness))**2
                 + (np.subtract(songs.tempo, seed_song.tempo))**2)
        
    return dist

### Enter song ID to seed the playlist from

In [8]:
# spotify:track:1ghlpxVfPbFH2jenrv9vVw
# spotify:track:0AJUX8BRUehB6RHPZUOoYS
# spotify:track:5IVuqXILoxVWvWEPm82Jxr
# spotify:track:5IVuqXILoxVWvWEPm82Jxr
# spotify:track:550rQQCGkrTzvp4SfpOPzx
# spotify:track:2c37Gkpu75l3kvh1FUZrHV
spotify:track:7AJIHT8hK423KPQZtvwEkM
spotify:track:7N3PAbqfTjSEU1edb2tY8j

SyntaxError: invalid syntax (<ipython-input-8-1c599e3a7647>, line 7)

In [9]:
song_id = '5cYA45RVGI6F4f06gtWjsd' # Crazy in Love by Beyonce


In [10]:
seed_song = pd.DataFrame(sp.audio_features(song_id))
seed_song

Unnamed: 0,danceability,energy,key,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo,type,id,uri,track_href,analysis_url,duration_ms,time_signature
0,0.349,0.166,4,-14.957,1,0.0395,0.903,0.783,0.11,0.0895,97.972,audio_features,5cYA45RVGI6F4f06gtWjsd,spotify:track:5cYA45RVGI6F4f06gtWjsd,https://api.spotify.com/v1/tracks/5cYA45RVGI6F...,https://api.spotify.com/v1/audio-analysis/5cYA...,206960,4


### Get playlist from seed song

In [11]:
print("Seed Song: " + sp.track(song_id)['name'])

Seed Song: Concerto For Violin And Strings In E, Op.8, No.1, R.269 "La Primavera": 1. Allegro


### Final Baseline Playlist Generator

In [12]:
def playlist_generator(songs, seed_song, n):
    topn_dist = distance(songs, seed_song).sort_values()[0:n]
    topn = songs.iloc[topn_dist.index]
    return topn

def playlist_printer(playlist): 
    for ref in playlist['id']:
        track = sp.track(ref)
        print(track['name'])
        print('by')
        for artist in track['artists']:
            print(artist['name'])
        print('-----------------------------------')

In [13]:
playlist = playlist_generator(songs, seed_song, 25)
playlist_printer(playlist)

Lose Control
by
Missy Elliott
Ciara
Fatman Scoop
-----------------------------------
Toxic
by
Britney Spears
-----------------------------------
Crazy In Love (feat. Jay-Z)
by
Beyoncé
JAY Z
-----------------------------------
Rock Your Body
by
Justin Timberlake
-----------------------------------
It Wasn't Me
by
Shaggy
Rik Rok
-----------------------------------
Yeah!
by
Usher
Lil Jon
Ludacris
-----------------------------------
My Boo
by
Usher
Alicia Keys
-----------------------------------
Buttons
by
The Pussycat Dolls
-----------------------------------
Say My Name
by
Destiny's Child
-----------------------------------
Hey Ya!
by
OutKast
-----------------------------------
Promiscuous
by
Nelly Furtado
Timbaland
-----------------------------------
Right Where You Want Me - Radio Edit Version
by
Jesse McCartney
-----------------------------------
Beautiful Soul
by
Jesse McCartney
-----------------------------------
Leavin'
by
Jesse McCartney
-----------------------------------
Me & U


### Baseline Model 2 

Instead of simply taking the audio features as they are, let's scale tempo to be on the same scale as our other normalized predictors

In [None]:
songs2 = songs.copy()
songs2 = songs2.reset_index()
cols = ["acousticness", "danceability", "energy", "instrumentalness", "liveness", "speechiness", "tempo"]

def scaler(cols, df_old, df_new):
    df_new = df_old.copy()
    x_scaler = MinMaxScaler().fit(df_old["tempo"].values.reshape(-1,1))
    x_scaled = x_scaler.transform(df_old["tempo"].values.reshape(-1,1))
    df_new["tempo"] = x_scaled
    return(df_new, x_scaler)

songs2, x_scaler = scaler(cols, songs, songs2)