# Content-Based Recommendation

Purpose of this jupyter notebook is to develop a recommendation model based on the data set we have on Billboard Hot 100 music between 1950 t0 2018.

The approach is to use both NLP techniques (e.g. Word2Vec) and the unsupervised learning algorithm Nearest Neighbors to generate songs similar to each popular track in the dataset.

In [3]:
import os
import sys
import json
from json.decoder import JSONDecodeError
import webbrowser
import spotipy
import spotipy.util as util
from pprint import pprint

# visualization libraries
import seaborn as sns  
import matplotlib.pyplot as plt
from matplotlib import style
% matplotlib inline

import pandas as pd
import numpy as np

import fuzzyset

from collections import Counter

from os import path
from PIL import Image
from wordcloud import WordCloud, STOPWORDS, ImageColorGenerator

In [4]:
# load dataframe, without audio features, but no missing lyrics
df_final_set = pd.read_pickle("data/billboard_tracks_1950_2018_FINAL_v2.pkl")

## prep stopwords and list of lyrics

In [5]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.stop_words import ENGLISH_STOP_WORDS
from gensim.models import Word2Vec

stop_words = ENGLISH_STOP_WORDS | {'?','!','.',',',':',';','oh oh','just',"I'm",'oh','it','Cause','na','la'}

lyrics_list = list(df_final_set['lyrics']) # list of all 4000+ track lyrics
lyrics_list = [i.replace('\n',' ') for i in lyrics_list] # replace line breaks with whitespace

lyrics_texts = [[word for word in document.lower().split() if word not in stop_words]
        for document in lyrics_list]

# Word2Vec

In [None]:
# train Word2Vec model on lyrics w/o stop words
w2v_lyrics_model = Word2Vec(lyrics_texts, size=100, window=5, min_count=1, workers=4, sg=1)

### Word Embeddings are useful because:

- We can measure the semantic similarity between two words
- We can use these word vectors as features for various NLP supervised learning tasks (such as classification, sentiment analysis).

### some helpful Word2Vec functions below:

In [None]:
# print 7 items
list(w2v_lyrics_model.wv.vocab.items())[:7]

# print vectorized array of a specific word
print (w2v_lyrics_model['hi'])

# top 8 most similar word to 'hi'
w2v_lyrics_model.most_similar('hi' ,topn=8)

# similarity between two words
w2v_lyrics_model.similarity('hi','bye')

# Compute cosine_similarity
w2v_lyrics_model.n_similarity(['hi', 'bye'], ['hi', 'lie'])

### helper function to turn each song into a 100-dimensional vector

In [None]:
def avg_feature_vector(song, model, num_features):
    
    index2word_set= set(model.wv.index2word)
    
    words = song.split()
    feature_vec = np.zeros((num_features, ), dtype='float32')
    n_words = 0
    
    for word in words:
        if word in index2word_set:
            n_words += 1
            feature_vec = np.add(feature_vec, model[word])
    if (n_words > 0):
        feature_vec = np.divide(feature_vec, n_words)
    return feature_vec

In [None]:
#Initialize a list to store song vectors
import numpy as np
lyrics_vecs =[]

# For each song - split words in lyrics and get the full vector (each song has 100 features)
for song in lyrics_list:
    vector = avg_feature_vector(song, model=w2v_lyrics_model, num_features=100)
    lyrics_vecs.append(vector)

In [None]:
lyrics_vecs[0]

### with 100-dimension vectors for all songs, fit into a NEAREST NEIGHBOR model that finds similar songs based on cosine similarities

In [None]:
from sklearn.neighbors import NearestNeighbors

# turn list of lyric vectors into a numpy array
lyrics_array = np.asarray(lyrics_vecs)

# fit lyrics array into Nearest Neighbor model
nn = NearestNeighbors(n_neighbors=5, metric='cosine', algorithm='brute',n_jobs = -1)
nn.fit(lyrics_array)

##Reshaping the vecs to fit in model
lyrics_vec_T = []
for vec in lyrics_array:
    new_vec = vec.reshape(1,100)
    lyrics_vec_T.append(new_vec)

#List of closest neighbor of a given item, we are extracting the 1st index for each neighbor
closest_song_ex = []
for vec in lyrics_vec_T:
    close_five = nn.kneighbors(X=vec)[1]
    closest_song_ex.append(close_five)

## PICKLE nearest neighbor list

In [None]:
import pickle

with open('data/nn_list.pkl', 'wb') as f:
    pickle.dump(closest_song_ex, f)

## OPEN nearest neighbor list

In [17]:
import pickle

with open('data/nn_list.pkl', 'rb') as f:
    closest_song_ex = pickle.load(f)

### Similar tracks for the first five songs of the data set

In [29]:
closest_song_ex[0:5]

[array([[   0, 1809, 4216, 2015, 3013]]),
 array([[   1, 2620, 2520, 2029, 1232]]),
 array([[  14,    2, 4229, 1379, 1344]]),
 array([[   3, 1520,   12, 2758, 1264]]),
 array([[1958, 4145, 2433, 2299,    4]])]

In [21]:
closest_song_ex[0][0][1:] # four songs most similar to track index 0

array([1809, 4216, 2015, 3013])

In [28]:
# pull url for an album (example)
df_final_set.iloc[[4215,8,1130,4191],[14]].values[0][0]

'https://i.scdn.co/image/602102500b9cebde7559a5f9c16daaaef2846440'

In [32]:
# pull track album url + track preview url (columns 14 + 17 of DataFrame df_final_set)
df_final_set.iloc[[4292,8,1130,4191],[14,17]]

Unnamed: 0,track_album_cover_art,track_preview_url
4292,https://i.scdn.co/image/5026fddc6c0b69e931ae99...,https://p.scdn.co/mp3-preview/6f984c9c6e43eccb...
8,https://i.scdn.co/image/a350bd5b964f06fcf299d2...,
1130,https://i.scdn.co/image/7af24a365c97ee0e55213a...,
4191,https://i.scdn.co/image/6b743d95f5ea98fc26c1af...,


# Use below space / code to work on Flask App
- Refer to separate python file 'music_app.py' - functional code there

## obtain user inputs and start searching for main artist info

In [65]:
# user inputs
artist = 'Adele'
track = 'Hello'

In [66]:
# pull features for song searched by user
results = df_final_set[(df_final_set['artist']==artist) & (df_final_set['title']==track)] \
[['track_album_cover_art','track_preview_url','artist','title']].values

In [67]:
# assign features to variables
track_cover = results[0][0]
track_preview = results[0][1]
track_artist = results[0][2]
track_title = results[0][3]

In [68]:
# create temporary list with song features of interest
temp_list = []
temp_list.append(track_cover)
temp_list.append(track_preview)
temp_list.append(track_artist)
temp_list.append(track_title)

In [70]:
# print out results
temp_list

['https://i.scdn.co/image/602102500b9cebde7559a5f9c16daaaef2846440',
 'https://p.scdn.co/mp3-preview/0b90429fd554bad6785faa2b8931d613db4a0ee4?cid=39cc01d4b4a544ebad4e813e7190e606',
 'Adele',
 'Hello']

## use previously-created Nearest Neighbor list 'closest_song_ex' to find similar songs

In [46]:
artist = 'a-ha'
track = 'Take On Me'

In [47]:
# get song index of main song user searched
song_index = df_final_set[(df_final_set['artist']==artist) & (df_final_set['title']==track)].index[0]

In [62]:
# find similar songs
close_songs = closest_song_ex[song_index][0][1:]

In [50]:
# print out song index of all similar songs
for i in close_songs:
    print(i)

1385
2384
206
2626


In [51]:
temp_list = []
for i in close_songs:

    track_details = df_final_set.iloc[i,][['track_album_cover_art','track_preview_url','artist','title','year', 'agg_genre','num_words','num_syllables','difficult_words','f_k_grade','sentiment_compound','tempo', 'acousticness','instrumentalness','loudness','speechiness','valence']]
    
    temp_list.append(track_details[4])
    

In [52]:
temp_list

[1980, 1993, 1959, 1997]

# helper function to look up details for each similar song
- Function created using code from above

In [58]:
def find_artists(artist,track):
    
    final_dict = {}
    temp_list = []
    counter = 0
    
    # first get details for main song
    results = df_final_set[(df_final_set['artist']==artist) & (df_final_set['title']==track)] \
    [['track_album_cover_art','track_preview_url','artist','title']].values
    
    # store main song details into dictionary
    track_cover = results[0][0]
    track_preview = results[0][1]
    track_artist = results[0][2]
    track_title = results[0][3]
    
    temp_list.append(track_cover)
    temp_list.append(track_preview)
    temp_list.append(track_artist)
    temp_list.append(track_title)
    
    final_dict[str(counter)] = temp_list
    counter+=1
    
    # get song index of main song user searched
    song_index = df_final_set[(df_final_set['artist']==artist) & (df_final_set['title']==track)].index[0]
    
    # find similar songs with nearest neighbor
    close_songs = closest_song_ex[song_index][0][1:]
    
    # iterate through each similar track and gather details
    for i in close_songs:
        temp_list = []
        track_details = df_final_set.iloc[i,][['track_album_cover_art','track_preview_url','artist','title']]
        temp_list.append(track_details[0])
        temp_list.append(track_details[1])
        temp_list.append(track_details[2])
        temp_list.append(track_details[3])
        final_dict[str(counter)] = temp_list
        counter += 1
    
    return final_dict

In [59]:
artist = 'a-ha'
track = 'Take On Me'

In [60]:
rec_songs_list = find_artists(artist,track)

In [61]:
rec_songs_list

{'0': ['https://i.scdn.co/image/c41ae5b513a39d1c616c4c279ebe19981a3092f3',
  'https://p.scdn.co/mp3-preview/0f980d0fac59f77123d0272b78bce97f1374d9e9?cid=39cc01d4b4a544ebad4e813e7190e606',
  'a-ha',
  'Take On Me'],
 '1': ['https://i.scdn.co/image/3606e581d251b1591ad9ac1bca34c4aecff72016',
  None,
  'Blondie',
  'Call Me'],
 '2': ['https://i.scdn.co/image/9d31ca65b1360fd319d5f52cd7f15b0992b08c23',
  'https://p.scdn.co/mp3-preview/5002e02aca86fdd65d4b6d9b0098001e44b3000a?cid=39cc01d4b4a544ebad4e813e7190e606',
  'Michael Jackson',
  'Will You Be There'],
 '3': ['https://i.scdn.co/image/7452533c56b1407b8ec9c25a85c22c2ad566044e',
  'https://p.scdn.co/mp3-preview/542f104f37b855b572e37995d467cc73b7cdf1b5?cid=39cc01d4b4a544ebad4e813e7190e606',
  'Jackie Wilson',
  'Lonely Teardrops'],
 '4': ['https://i.scdn.co/image/c8347eed321b9927f38ab270992132c8f6461561',
  'https://p.scdn.co/mp3-preview/77bbf1e3bc27bfe6339579f71fc0b612391aa5e3?cid=39cc01d4b4a544ebad4e813e7190e606',
  'Mark Morrison',
  'Re