# Visualizations

In [23]:
# imports

import numpy as np
from math import pi
import pandas as pd
import seaborn as sns
from annoy import AnnoyIndex
import matplotlib.pyplot as plt
from sklearn import preprocessing

In [24]:
df = pd.read_csv('/Users/flanuer/Downloads/Lambda/Course_material/misc_datasets/non_dup_200k_song_aud_feat.csv')
drop_col = ['Unnamed: 0']
df = df.drop(drop_col, axis=1)

In [25]:
df.head(3)

Unnamed: 0,danceability,energy,key,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo,id,duration_ms,time_signature
0,0.746,0.765,6,-4.41,0,0.0993,0.0112,0.0,0.0936,0.737,114.044,0v1x6rN6JHRapa03JElljE,199054,4
1,0.935,0.454,1,-7.509,1,0.375,0.0194,0.0,0.0824,0.357,133.073,4Oun2ylbjFKMPTiaSbbCih,187541,4
2,0.548,0.816,0,-4.209,1,0.0465,0.122,0.0,0.335,0.557,95.39,6UelLqGlWMcVH1E5c4H7lY,174000,4


In [26]:
df.describe(percentiles=[.25, .5, .75, .90, .95])

Unnamed: 0,danceability,energy,key,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo,duration_ms,time_signature
count,210124.0,210124.0,210124.0,210124.0,210124.0,210124.0,210124.0,210124.0,210124.0,210124.0,210124.0,210124.0,210124.0
mean,0.567127,0.550861,5.243942,-10.453061,0.627225,0.12702,0.367884,0.211113,0.194406,0.434407,118.455018,231524.2,3.858212
std,0.189523,0.275391,3.590865,6.846359,0.483544,0.180621,0.357377,0.35113,0.167107,0.257878,30.919423,179375.9,0.562426
min,0.0,0.0,0.0,-60.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3203.0,0.0
25%,0.447,0.342,2.0,-12.904,0.0,0.0378,0.0327,0.0,0.0974,0.221,95.038,170064.5,4.0
50%,0.591,0.585,5.0,-8.262,1.0,0.0531,0.232,0.000192,0.124,0.413,119.96,207726.0,4.0
75%,0.708,0.777,8.0,-5.748,1.0,0.12,0.713,0.327,0.237,0.631,138.001,251760.0,4.0
90%,0.797,0.901,10.0,-4.223,1.0,0.317,0.944,0.887,0.383,0.808,160.034,323827.0,4.0
95%,0.84,0.946,11.0,-3.459,1.0,0.475,0.983,0.926,0.573,0.89,173.48925,410093.0,4.0
max,0.996,1.0,11.0,1.806,1.0,0.969,0.996,1.0,0.999,1.0,249.983,5925082.0,5.0


### Cleaning the Data
Normalizing Def: Rescaling numeric attributes to range between 0 - 1

In [27]:
df.shape

(210124, 14)

In [28]:
def reduce_mem_usage(df, verbose=True):
    """ Function iterates through all the columns of a dataframe and modify the data type
        to reduce memory usage.
        Credit to: https://www.kaggle.com/gemartin/load-data-reduce-memory-usage
        Parameters
        ----------
        df : Pandas DataFrame
        verbose: (True) by default, prints out before and after memory usage
        Returns
        -------
        df : Reduced Memory Pandas DataFrame
    """

    if verbose:
        start_mem = df.memory_usage().sum() / 1024**2
        print('Memory usage of dataframe is {:.2f} MB'.format(start_mem))

    for col in df.columns:
        col_type = df[col].dtype

        if col_type != object:
            c_min = df[col].min()
            c_max = df[col].max()
            if str(col_type)[:3] == 'int':
                if c_min > np.iinfo(np.int8).min and c_max < np.iinfo(np.int8).max:
                    df[col] = df[col].astype(np.int8)
                elif c_min > np.iinfo(np.int16).min and c_max < np.iinfo(np.int16).max:
                    df[col] = df[col].astype(np.int16)
                elif c_min > np.iinfo(np.int32).min and c_max < np.iinfo(np.int32).max:
                    df[col] = df[col].astype(np.int32)
                elif c_min > np.iinfo(np.int64).min and c_max < np.iinfo(np.int64).max:
                    df[col] = df[col].astype(np.int64)
            else:
                if c_min > np.finfo(np.float16).min and c_max < np.finfo(np.float16).max:
                    df[col] = df[col].astype(np.float16)
                elif c_min > np.finfo(np.float32).min and c_max < np.finfo(np.float32).max:
                    df[col] = df[col].astype(np.float32)
                else:
                    df[col] = df[col].astype(np.float64)
    if verbose:
        end_mem = df.memory_usage().sum() / 1024**2
        print('Memory usage after optimization is: {:.2f} MB'.format(end_mem))
        print('Decreased by {:.1f}%'.format(
            100 * (start_mem - end_mem) / start_mem))

    return df

In [29]:
df = reduce_mem_usage(df)
df.shape

Memory usage of dataframe is 22.44 MB
Memory usage after optimization is: 6.61 MB
Decreased by 70.5%


(210124, 14)

In [30]:
# checking for duplicated and NA values in ID column

df['id'].duplicated().any(), df.isna().any(), df.isnull().any()

(False,
 danceability        False
 energy              False
 key                 False
 loudness            False
 mode                False
 speechiness         False
 acousticness        False
 instrumentalness    False
 liveness            False
 valence             False
 tempo               False
 id                  False
 duration_ms         False
 time_signature      False
 dtype: bool,
 danceability        False
 energy              False
 key                 False
 loudness            False
 mode                False
 speechiness         False
 acousticness        False
 instrumentalness    False
 liveness            False
 valence             False
 tempo               False
 id                  False
 duration_ms         False
 time_signature      False
 dtype: bool)

In [31]:
# Removing high speechiness values because they are typically vocal tracks of poems etc
# .32 was determined by the pd.DataFrame.describe() method; retains 90% of data

df = df[df['speechiness'] < .32]
df.shape

(189334, 14)

## Approximate Nearest Neighbors (ANN) Model without dimensionality reduction

In [13]:
normalized_X[0]

array([ 0.97757577,  0.74299462,  0.21746881,  0.85387916, -1.30526954,
        0.35009147, -0.96966725, -0.63694419, -0.58619537,  1.16306124,
       -0.15088629, -0.20253793,  0.24950356])

In [14]:
# Applying ANNOY with different distance metric

# metrics list
metrics = ['angular', 'euclidean', 'manhattan', 'hamming', 'dot']

# Using normalized_X because it is in array form and normalize
for metric in metrics:
    f = len(features)
    t = AnnoyIndex(f, metric)
    for i in range(len(normalized_X)):
        t.add_item(i, normalized_X[i])
    t.build(30)
    t.save(f'test_{metric}.ann')

# f = len(features)
# t = AnnoyIndex(f, 'angular')

# index maping items; keep track of index mapping
# for i in range(len(X)):
#     t.add_item(i, X_array[i])

# t.build(20)
# t.save('test.ann')

In [15]:
metric_list = []
for metric in metrics:
    u = AnnoyIndex(f, metric)
    u.load(f'test_{metric}.ann')
    globals()["test_" + metric] = u.get_nns_by_item(0, 20, include_distances=True, search_k=15)

In [16]:
for metric in metrics:
    globals()['song_list_' + metric] = []
    for song in globals()['test_' + metric][0]:
        globals()['song_list_' + metric].append(df.id.iloc[song])

### ANNOY with dimensionality reduction

In [18]:
# categorical encoding of IDs

danceability        float16
energy              float16
key                    int8
loudness            float16
mode                   int8
speechiness         float16
acousticness        float16
instrumentalness    float16
liveness            float16
valence             float16
tempo               float16
id                   object
duration_ms           int32
time_signature         int8
dtype: object

In [17]:
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler

In [60]:
pca = PCA()
pca.fit(normalized_X)
pca_data = pca.transform(normalized_X)

In [62]:
pca_data.shape

(189334, 13)

In [65]:
loading_scores = pd.Series(pca.components_[0], index=features)
sorted_loading_scores = loading_scores.abs().sort_values(ascending=False)
top = sorted_loading_scores[:].index.values
print(loading_scores[top])

tempo               0.995083
loudness           -0.090250
key                 0.030430
time_signature      0.025736
mode                0.004030
valence             0.003601
danceability        0.003565
energy              0.003044
instrumentalness    0.002994
acousticness        0.002952
liveness            0.001815
duration_ms        -0.001634
speechiness         0.000823
dtype: float64


In [67]:
drop_cols = [
    'mode',
    'valence',
    'danceability',
    'energy',
    'instrumentalness',
    'acousticness',
    'liveness',
    'duration_ms',
    'speechiness']
df = df.drop(drop_cols, axis=1)

In [68]:
features = ['tempo', 'loudness', 'key', 'time_signature']
X = df[features]

In [69]:
normalized_X = preprocessing.normalize(X)

In [78]:
# Applying ANNOY with different distance metric after PCA

# metrics list
metrics = ['angular', 'euclidean', 'manhattan', 'hamming', 'dot']

# Using normalized_X because it is in array form and normalize
for metric in metrics:
    f = len(features)
    t = AnnoyIndex(f, metric)
    for i in range(len(normalized_X)):
        t.add_item(i, normalized_X[i])
    t.build(30)
    t.save(f'test_{metric}_pca.ann')

In [80]:
metric_list = []
for metric in metrics:
    u = AnnoyIndex(f, metric)
    u.load(f'test_{metric}_pca.ann')
    v = u.get_item_vector(0)
    globals()["test_" + metric + "_pca"] = u.get_nns_by_vector(v, 10, search_k=100)

for metric in metrics:
    globals()['pca_song_list_' + metric] = []
    for song in globals()['test_' + metric + '_pca']:
        globals()['pca_song_list_' + metric].append(df.id.iloc[song])

In [81]:
pca_song_list_angular

['0v1x6rN6JHRapa03JElljE',
 '1Rham8xtHYhHxp5RCySnMk',
 '51trdVTGB18HFEsb8dEzq1',
 '10JbxMr5YV6nEM1YtorHES',
 '4eRfCcicjpjoCaYWylRU7m',
 '4oPqTEZApxZCx6ILs4xHwu',
 '5r6IhrGntQ9UUcAG1MNT6F',
 '1oiUGWvAAxFDt7Q45xOjzg',
 '0JumfyG7GUshFjxouEXwhw',
 '2zMqWoeZJlJ7cg7LHjeSuw']

# User Statistics
- Obtain user library
    - Track Audio Features
    - Artists
    - Playlists
    - Track Names
    - Genres
        - Can be gathered via artist
    - All time top songs/artist

In [49]:
from os import getenv
import spotipy
from dotenv import load_dotenv
from spotipy.oauth2 import SpotifyOAuth

In [50]:
load_dotenv()
client_secret = getenv('SPOTIFY_CLIENT_SECRET')
client_id = getenv('SPOTIFY_CLIENT_ID')
user_id = 'Agustinvargas'

# change for deplpoyment
uri = getenv('uri')

# Scopes: User top track; creates playlist
scope = getenv('SCOPE')
cache_path = '../../token_cache/'

spot_cc = spotipy.oauth2.SpotifyOAuth(
                                username=user_id,
                                client_id=client_id,
                                client_secret=client_secret,
                                scope=scope,
                                redirect_uri=uri,
                                show_dialog=True
                                )


spot_session = spotipy.Spotify(oauth_manager=spot_cc)

### Creating a playlist from the ANNOY generated nearest neighbor songs

In [96]:
# spot_session.user_playlist_create(user='37t3cvb5u3o97hin4bsj40abw', name='ANNOY TEST')

# spot_session.user_playlist_add_tracks(user='37t3cvb5u3o97hin4bsj40abw', playlist_id='3NMt8PDHIVkYTyzVNusGM7', tracks=pca_song_list_manhattan)

# spot_session.user_playlist_remove_all_occurrences_of_tracks(user='37t3cvb5u3o97hin4bsj40abw', playlist_id='3NMt8PDHIVkYTyzVNusGM7', tracks=pca_song_list_manhattan)

{'snapshot_id': 'MTMsNzVmNDYwZTk3OTNmYWEzZGNhNzNkNzMwMjdiMDM0ZjEzOTMyYjYzYg=='}

### Track Audio Features

In [12]:
def track_audio_feat(user, all_tracks=None, top_tracks=None, term=None):
    """Generate DataFrame from user music library.
    
    Args:
        'user': Spotify user ID
        'all_tracks': if True, gathers all the track IDs for a
        given user's library
        'top_tracks': if True, gathers the user's top track IDs
        'term': The term should be specified; it allows to identify the
        term over which data is being acquired.  short_term refers to
        7 days; medium_term refers to 6 months; long_term is all time
    Returns:
        DataFrame containing the user's music tastes
    """
    artist_lst = []
    offset = 0
    for _ in range(2):
        top_artists = spot_session.current_user_top_artists()
    
    return None

In [111]:
trak = spot_session.track(track_id='0NeJjNlprGfZpeX2LQuN6c')
trak['name']

'River'

In [112]:
genres_col = []
for track_id in song_ids:
    track_info = spot_session.track(track_id)
    artist = spot_session.artist(track_info['artists'][0]['id'])
    genres_col.append(artist['genres'])
genres_col

[['contemporary country',
  'country',
  'country dawn',
  'country pop',
  'country road',
  'modern country rock',
  'roots americana',
  'texas country'],
 ['argentine hip hop', 'trap argentino', 'trap latino'],
 ['alternative americana',
  'alternative country',
  'ectofolk',
  'folk',
  'lilith',
  'new americana'],
 ['black thrash',
  'crossover thrash',
  'death metal',
  'metal',
  'mexican metal',
  'new wave of thrash metal',
  'portland metal',
  'thrash metal'],
 ['alternative metal',
  'french death metal',
  'french metal',
  'metal',
  'nu metal',
  'progressive groove metal'],
 ['classic swedish pop',
  'danspunk',
  'svensk progg',
  'swedish alternative rock',
  'swedish prog'],
 ['alternative emo',
  'anthem emo',
  'bubblegrunge',
  'emo',
  'indie punk',
  'indie rock',
  'midwest emo'],
 [],
 ['shimmer pop'],
 ['dutch trance', 'edm', 'pop dance', 'progressive house', 'trance'],
 ['japanese city pop'],
 ['bay area hip hop',
  'cali rap',
  'hyphy',
  'oakland hip h

In [135]:
# spot_session.user_playlist_create(user='', name='Annoy_test', public=False)

In [136]:
# spot_session.user_playlist_add_tracks(user='', playlist_id='', tracks=)

In [126]:
df = pd.read_csv('/Users/flanuer/Downloads/Lambda/Course_material/misc_datasets/kdtree_suggestions.csv')
drop_col = ['Unnamed: 0']
df = df.drop(drop_col, axis=1)

In [131]:
song_ids_2 = df.iloc[1].to_list()