# Project 4: Hackathon

## *Template Notebook*

In this notebook:

* [Topic 1](#topic-1)
* [Topic 2](#topic-2)

#### Import Libraries & Read in Data

In [2]:
## standard imports 
import pandas as pd 
import numpy as np
import re
## visualizations
import matplotlib.pyplot as plt
import seaborn as sns
## preprocessing
from sklearn.metrics.pairwise import linear_kernel, cosine_similarity

## options
import sklearn
pd.options.display.max_rows = 4000
pd.options.display.max_columns = 100
pd.set_option('max_colwidth', 100)

In [3]:
### read in data
data = pd.read_csv('../data/hiphop_clustering.csv')

In [4]:
data.head(2)

Unnamed: 0,artist_name,track_id,popularity,acousticness,danceability,energy,instrumentalness,liveness,loudness,speechiness,tempo,valence
0,J. Cole,2JvzF1RMd7lE3KmFlsyZD8,0.96,0.149,0.837,0.364,0.0,0.271,-11.713,0.276,123.984,0.463
1,Meek Mill,2IRZnDFmlqMuOrYOLnZZyc,0.95,0.259,0.889,0.496,0.0,0.252,-6.365,0.0905,86.003,0.544


In [10]:
### Select data
features = ['popularity', 'acousticness', 'danceability', 'energy', 'speechiness', 'valence']
X = data[features]
X.head()

Unnamed: 0,popularity,acousticness,danceability,energy,speechiness,valence
0,0.96,0.149,0.837,0.364,0.276,0.463
1,0.95,0.259,0.889,0.496,0.0905,0.544
2,0.93,0.0395,0.837,0.636,0.086,0.274
3,0.88,0.00195,0.942,0.383,0.565,0.38
4,0.92,0.194,0.729,0.625,0.0315,0.261


## Topic 1 <a class="anchor" id="topic-1"></a>
<hr/>

In [6]:
sim_matrix = linear_kernel(X, X)

In [7]:
sim_matrix

array([[2.067411  , 2.152078  , 1.9813565 , ..., 1.3363441 , 1.4134089 ,
        1.9373827 ],
       [2.152078  , 2.31004425, 2.1101185 , ..., 1.4821333 , 1.53026345,
        2.1273897 ],
       [1.9813565 , 2.1101185 , 2.05399725, ..., 1.38584035, 1.4068773 ,
        1.95470685],
       ...,
       [1.3363441 , 1.4821333 , 1.38584035, ..., 1.20867301, 0.86161092,
        1.68924455],
       [1.4134089 , 1.53026345, 1.4068773 , ..., 0.86161092, 1.2430687 ,
        1.296205  ],
       [1.9373827 , 2.1273897 , 1.95470685, ..., 1.68924455, 1.296205  ,
        2.49323969]])

In [8]:
mapping = pd.Series(data.index, index=data['track_id'])

## Topic 2 <a class="anchor" id="topic-2"></a>
<hr/>

In [70]:
def recc_tracks(track_features):
    track_index = mapping[track_features]
    sim_score = list(enumerate(sim_matrix[track_index]))
    
    sim_score = sorted(sim_score, key=lambda x: x[1], reverse=True)
    
    ## Get top 15 tracks
    sim_score_15 = sim_score[:15]
    
    #return track ids using mapping series
    track_indices = [i[0] for i in sim_score_15]
    
#     reccs = pd.concat(data['track_id'].loc[track_indices], data['artist_name'], axis=0)
    return data['track_id'].loc[track_indices]

In [11]:
# WAP = [97, 0.0194, 0.935, 0.454, 0.375, 0.357]

In [71]:
# r_tracks = pd.DataFrame(recc_tracks('2JvzF1RMd7lE3KmFlsyZD8'))
# r_tracks
r_tracks = recc_tracks('2JvzF1RMd7lE3KmFlsyZD8')
r_tracks

185     5ByAIlEEnxYdvpnezg7HTX
134     6FyRXC8tJUh863JCkyWqtk
263     7Ie9W94M7OjPoZVV216Xus
480     04KTF78FFg8sOHC1BADqbY
10      0JP9xo3adEtGSdUEISiszL
192     3m2aDCqn8eT5ElXDUiboGU
3216    6CWMm2mJOrKyDH4W1X5d06
324     44nlOJLjLqwNgYcT277V25
78      1B4gJRq61xTs6r1O0Uq2iY
1027    4CstQ0SUkl0YkoeZkIZlIx
356     6U9OkV9oa8kN8LyBGf3wvJ
3024    4yZf3ui3Jc8dx8MMVZtHWE
235     6p8NuHm8uCGnn2Dtbtf7zE
54      21kOVEG3bDCVphKhXL8XmQ
212     2O8gSQ9z52tZSLJnuzjkcG
Name: track_id, dtype: object

In [82]:
# pd.concat([r_tracks, data], axis=)
rec_tracks_df = data[data['track_id'].isin(r_tracks.values)]
# rec_artists = pd.concat([data['artist_name'], r_tracks], axis=1)
rec_tracks_df

Unnamed: 0,artist_name,track_id,popularity,acousticness,danceability,energy,instrumentalness,liveness,loudness,speechiness,tempo,valence
10,XXXTENTACION,0JP9xo3adEtGSdUEISiszL,0.91,0.556,0.921,0.537,0.00404,0.102,-5.723,0.0804,128.009,0.711
54,Comethazine,21kOVEG3bDCVphKhXL8XmQ,0.82,0.221,0.831,0.795,0.0,0.291,-6.186,0.264,160.063,0.744
78,Takeoff,1B4gJRq61xTs6r1O0Uq2iY,0.79,0.525,0.921,0.607,1e-06,0.123,-7.397,0.209,140.071,0.753
134,Paulo Londra,6FyRXC8tJUh863JCkyWqtk,0.95,0.323,0.767,0.709,0.0,0.0676,-4.47,0.336,171.993,0.72
185,The Notorious B.I.G.,5ByAIlEEnxYdvpnezg7HTX,0.78,0.472,0.889,0.816,0.0,0.204,-4.67,0.247,96.056,0.777
192,Lil Yachty,3m2aDCqn8eT5ElXDUiboGU,0.76,0.105,0.95,0.489,0.0,0.411,-8.411,0.462,130.021,0.919
212,G-Eazy,2O8gSQ9z52tZSLJnuzjkcG,0.76,0.0729,0.843,0.806,0.0,0.758,-5.156,0.367,97.858,0.821
235,Migos,6p8NuHm8uCGnn2Dtbtf7zE,0.78,0.307,0.92,0.674,0.0,0.104,-5.662,0.264,141.967,0.741
263,Eminem,7Ie9W94M7OjPoZVV216Xus,0.79,0.529,0.855,0.954,0.0,0.205,-1.19,0.264,114.635,0.668
324,Bass Santana,44nlOJLjLqwNgYcT277V25,0.75,0.747,0.936,0.514,0.0,0.11,-8.908,0.271,129.997,0.812


In [67]:
rec_artists['artist_name'].shape

(9295,)

In [74]:
r_track_ids = r_tracks.values
r_track_uris = []
for track_id in r_track_ids:
    uri = 'spotify:track:'+ track_id
    print(uri)
    r_track_uris.append(uri)

spotify:track:5ByAIlEEnxYdvpnezg7HTX
spotify:track:6FyRXC8tJUh863JCkyWqtk
spotify:track:7Ie9W94M7OjPoZVV216Xus
spotify:track:04KTF78FFg8sOHC1BADqbY
spotify:track:0JP9xo3adEtGSdUEISiszL
spotify:track:3m2aDCqn8eT5ElXDUiboGU
spotify:track:6CWMm2mJOrKyDH4W1X5d06
spotify:track:44nlOJLjLqwNgYcT277V25
spotify:track:1B4gJRq61xTs6r1O0Uq2iY
spotify:track:4CstQ0SUkl0YkoeZkIZlIx
spotify:track:6U9OkV9oa8kN8LyBGf3wvJ
spotify:track:4yZf3ui3Jc8dx8MMVZtHWE
spotify:track:6p8NuHm8uCGnn2Dtbtf7zE
spotify:track:21kOVEG3bDCVphKhXL8XmQ
spotify:track:2O8gSQ9z52tZSLJnuzjkcG


In [73]:
len(r_track_uris)

15

In [None]:
spotify:track:2JvzF1RMd7lE3KmFlsyZD8