In [None]:
#General############################################
import pandas as pd
import numpy as np
import re
import pprint
from time import sleep
from allfunctions import *
import matplotlib.pyplot as plt
%matplotlib inline
from matplotlib.pyplot import table

#WebScraping########################################
from bs4 import BeautifulSoup
import requests

#Spotify############################################
import spotipy
from spotipy.oauth2 import SpotifyClientCredentials
from Credentials import Client_ID, Client_Secret
import json

#Machine Learning###################################
from sklearn import datasets 
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score
import pickle


------------------------------------------

# Objective Day 1


Create a function that scrapes the Billboard Hot 100 chart and builds a local dataframe of the songs, including:

* Song’s name

* Song’s artist

# Objective Day 2

Create a second function to retrieve the audio features of a given song.

Create a third function to update the internal database with the song features using the previous function and to add this information to the dataframe 


* Song’s url/ uri

* Song’s audio_features


Recommendation: create a function to extend the internal database with songs of your choice (for example, from playlists). The more songs you have in the database, the better (you will see why tomorrow).


# Objective Day 3

* Fit the standard scaler to scale the audio features of each song using both dataframes.

* Save the freshly trained standard scaler with pickle

* Get the user’s song audio features

* Apply the trained scaler to transform the user’s song audio features

Create a function to fit the K-Means clustering method using all the songs contained in both datasets (Hot 100 and Spotify).

Remember to:
- Do some research on the optimal K-value for the K-means
- Save the final K-means model with pickle
- Use the trained K-means model to predict the cluster of each song in the internal databases and add this information to the internal databases



# Objective Day 4

Create a function to ask the user:

- Song title
- Song artist

Create a function to search the user song in the Spotify API to get the audio features

Create a function to retrieve the standard scaler previously saved and to scale the song’s audio features

Create a function to retrieve the K-Means model and predict which cluster the song belongs to.

Create a function to retrieve one or more songs at random from the corresponding database that belong to the same cluster as the user’s song and are not the user’s song itself.


------------------------------------------

# Webscraping Hot 100

In [None]:
# Billboard Hot 100 chart page that the cells below download and parse.
url = "https://www.billboard.com/charts/hot-100"

# Spotify Web API client. Client_ID and Client_Secret are imported from the
# local Credentials module, so no secret is written in this notebook.
client_credentials_manager = SpotifyClientCredentials(client_id=Client_ID, client_secret=Client_Secret)
sp = spotipy.Spotify(client_credentials_manager=client_credentials_manager)

In [None]:
# Download the chart page.
# - timeout: without one, requests waits indefinitely on a stalled connection.
# - raise_for_status(): turns an HTTP error (403/404/5xx) into an exception
#   here, instead of letting an error page be parsed into empty song lists
#   further down.
response = requests.get(url, timeout=30)
response.raise_for_status()

In [None]:
# Parse the downloaded page body into a BeautifulSoup tree using the
# "html.parser" backend.
soup = BeautifulSoup(response.text, "html.parser")

In [None]:
# Song titles from the Hot 100 chart: for every chart-entry <span>, take the
# text of its first "song" <span>. The list follows the page order.
top_100_song_titles = [
    entry.select("span.chart-element__information__song.text--truncate.color--primary")[0].get_text()
    for entry in soup.select("span.chart-element__information")
]

In [None]:
# Artist names from the Hot 100 chart: for every chart-entry <span>, take the
# text of its first "artist" <span>. Same order as the titles list.
top_100_song_artists = [
    entry.select("span.chart-element__information__artist.text--truncate.color--secondary")[0].get_text()
    for entry in soup.select("span.chart-element__information")
]

In [None]:
# Column-oriented dictionary: one key per future dataframe column, each
# holding the full list scraped above.
top_100_artists_titles_dict = dict(
    artist_Names=top_100_song_artists,
    song_Names=top_100_song_titles,
)

In [None]:
# Convert the {"artist_Names": [...], "song_Names": [...]} dictionary into a dataframe.
# create_dataframe_from_dictionary is not defined in this notebook; presumably it
# comes from the `allfunctions` star-import -- TODO confirm what it does.
top_100_artists_titles_df = create_dataframe_from_dictionary(top_100_artists_titles_dict)

## Audio features from one song

In [None]:
# Look up the song "Butter". The next cell indexes the result as
# song["tracks"]["items"][0]["uri"], so a nested dictionary with a list of
# track items is expected. The helper is not defined in this notebook.
song = get_all_information_about_song("Butter")

Now we get the audiofeatures of our chosen song.

In [None]:
# Take the URI of the first search hit and request its audio features.
# The call result is indexed with [0], i.e. the entry for that single URI.
audio_features = sp.audio_features(song["tracks"]["items"][0]["uri"])[0]
# Evaluate `audio_features` on its own line to inspect the feature dictionary.

In [None]:
# Wrap every feature value in a one-element list so the dictionary can be
# passed to pd.DataFrame as a single-row table.
audio_features_new = {
    feature_name: [feature_value]
    for feature_name, feature_value in audio_features.items()
}

In [None]:
# One-row dataframe with the audio features of the chosen song. It is only
# displayed as the cell output; it is not stored in a variable.
pd.DataFrame(audio_features_new) 

--------------------------------

## Code

### Top 100 Songs

Getting all the song names from our Hot Songs 100 and making a dataframe with uri. 

In [None]:
# Spotify URI for every Hot 100 title, in chart order
# (one get_uri_of_song call per title).
top_100_uris = [get_uri_of_song(title) for title in top_100_song_titles]

In [None]:
# Fetch the audio features for the Hot 100 URIs. The helper is not defined in
# this notebook; presumably it returns one feature dictionary per URI -- TODO confirm.
top_100_audio_features = get_audio_features_from_list_of_uris(top_100_uris)

In [None]:
# Reshape the per-song audio features into a dictionary that pd.DataFrame
# accepts, then build the Hot 100 feature dataframe from it.
# Presumably the helper returns {feature_name: [value per song]} -- TODO confirm.
top_100_dict_audio_features = create_dict_with_audiofeatures_for_all_songs(top_100_audio_features)
df_top_100 = pd.DataFrame(top_100_dict_audio_features)

### Random playlists

Getting the information from a random playlist.

In [None]:
# Spotify playlist id (the original author notes it holds around 5000 songs).
playlist_id = "4ehqP8QHaIPdHMsssoB4y2" 

# Fetch the playlist's tracks. Later cells index the result as
# playlist[i]["track"][...], so it is a sequence of playlist-item dictionaries.
# The first argument is presumably the playlist owner's user name -- TODO confirm.
playlist =  get_playlist_tracks("spotify", playlist_id) 

In [None]:
# URI of the first track in the playlist.
# A playlist item is a dictionary whose URI sits at ["track"]["uri"] (the loop
# further down reads it the same way). get_uri_of_song is called with a song
# *title* elsewhere in this notebook, so passing it a playlist item was a
# type mix-up; index the item directly instead.
playlist[0]["track"]["uri"]

In [None]:
# Build three parallel lists from the playlist items, all in playlist order:
#   playlist_uri          - the track URI (used to request audio features)
#   playlist_artist_names - the name of the first listed artist
#   playlist_song_names   - the track title
playlist_uri = [item["track"]["uri"] for item in playlist]
playlist_artist_names = [item["track"]["artists"][0]["name"] for item in playlist]
playlist_song_names = [item["track"]["name"] for item in playlist]

In [None]:
# Fetch the audio features for every track URI collected from the playlist.
playlist_audio_features = get_audio_features_from_list_of_uris(playlist_uri)

In [None]:
# Reshape the per-song audio features into a dictionary suitable for
# pd.DataFrame (same helper as used for the Hot 100 above).
playlist_dict_audiofeature = create_dict_with_audiofeatures_for_all_songs(playlist_audio_features)

In [None]:
# Dataframe holding the audio features of the playlist songs.
df_playlist = pd.DataFrame(playlist_dict_audiofeature)

In [None]:
# Single-column dataframe with the artist name of every playlist song.
df_artist_names = pd.DataFrame({"Artist_Names": playlist_artist_names})

In [None]:
# Single-column dataframe with the title of every playlist song.
df_song_names = pd.DataFrame({"Song_Names": playlist_song_names})

In [None]:
# Put song titles and artist names side by side (column-wise concat, axis=1).
x = pd.concat([df_song_names, df_artist_names], axis=1)

In [None]:
# Names plus audio features for the whole playlist. The result is only
# displayed as the cell output; it is not stored in a variable.
pd.concat([x, df_playlist], axis=1)

Getting the Dataframe from any random playlist by id using the function.

In [None]:
# Spotify playlist ids used to build the internal song database.
# One id per line; .split() turns each block into a plain list of strings.
playlist_id_list1 = """
    37i9dQZF1DXcBWIGoYBM5M
    37i9dQZEVXbMDoHDwVN2tF
    37i9dQZF1DX0XUsuxWHRQd
    37i9dQZF1DX10zKzsJ2jva
    37i9dQZF1DWY7IeIP1cdjF
    37i9dQZF1DWWMOmoXKqHTD
    37i9dQZF1DX4o1oenSJRJd
    37i9dQZF1DWXRqgorJj26U
    37i9dQZF1DX4UtSsGT1Sbe
    37i9dQZF1DX76Wlfdnj7AP
    37i9dQZF1DX4WYpdgoIcn6
""".split()

playlist_id_list2 = """
    68wQGN02SQNnDEc58HJfvn
    1iKrRpeXJbcxIDoixPYHXj
    6MMT0aj5QfhbG3mmaMxgt8
    1XSkjkrJjTAqksykUWD32Y
    6ibR5GQ3F2xsLQ1X4h7MnT
    37i9dQZF1E4slNaEJk0JW0
    6sLDvdiSbl9PxuvF8FD72B
""".split()
                     

In [None]:
# Build one dataframe from all playlists in the first id list.
# mega_dataframe_from_playlist_id_list is not defined in this notebook; a later
# cell drops "Song_Names", "Artist_Names" and audio-feature metadata columns
# from the result, so it is expected to contain those columns.
df_mega1 = mega_dataframe_from_playlist_id_list(playlist_id_list1)   


In [None]:
# Build one dataframe from all playlists in the second id list
# (same helper and expected columns as for the first list).
df_mega2 = mega_dataframe_from_playlist_id_list(playlist_id_list2)

## Preparation

In [None]:
# Stack the two playlist dataframes row-wise into the internal song database.
# NOTE(review): concat keeps each input's own index labels here, so labels may
# repeat in `data`; pass ignore_index=True if label-based lookups are needed.
data = pd.concat([df_mega1, df_mega2])
# Independent copy to clean, so `data` itself keeps every original row and column.
data_copy = data.copy()

In [None]:
# `data2` is a second name for the Hot 100 dataframe (same object as
# df_top_100, no copy is made).
data2 = df_top_100
# Independent copy to clean, so the Hot 100 dataframe itself is left untouched.
data_copy2 = data2.copy()

In [None]:
# Number of missing values per column in the playlist database.
# (len(df.isnull()) only gives the number of rows, whether or not anything is
# missing, so it cannot reveal nulls.)
data_copy.isnull().sum()

In [None]:
# Number of missing values per column in the Hot 100 dataframe.
# (len(df.isnull()) only gives the number of rows, whether or not anything is
# missing, so it cannot reveal nulls.)
data_copy2.isnull().sum()

In [None]:
#data_copy.dropna()

In [None]:
# Drop every playlist row that has at least one missing value. dropna()
# returns a new dataframe; data_copy is not modified.
df_copy = data_copy.dropna()

In [None]:
# Drop every Hot 100 row that has at least one missing value. dropna()
# returns a new dataframe; data_copy2 is not modified.
df_copy2 = data_copy2.dropna()

In [None]:
# Inspect the column names of the cleaned playlist dataframe.
df_copy.columns

In [None]:
# Inspect the column names of the cleaned Hot 100 dataframe.
df_copy2.columns

In [None]:
# Second name for the cleaned playlist dataframe. This is plain name binding:
# `numerical` and `df_copy` refer to the same object, no copy is made.
numerical = df_copy

In [None]:
# Second name for the cleaned Hot 100 dataframe. This is plain name binding:
# `numerical2` and `df_copy2` refer to the same object, no copy is made.
numerical2 = df_copy2

In [None]:
#numerical

In [None]:
# Keep only the audio-feature columns of the playlist songs for scaling and
# clustering. drop() returns a new dataframe, so:
#   * the cell can be re-run without a KeyError (an in-place drop fails the
#     second time because the columns are already gone), and
#   * df_copy keeps its name/id columns instead of being changed through the
#     shared reference.
numerical = df_copy.drop(
    columns=[
        "Song_Names",
        "Artist_Names",
        "type",
        "id",
        "uri",
        "track_href",
        "analysis_url",
        "time_signature",
    ]
)

In [None]:
# Keep only the audio-feature columns of the Hot 100 songs for scaling and
# clustering. drop() returns a new dataframe, so the cell can be re-run
# without a KeyError and df_copy2 is not changed through the shared reference.
numerical2 = df_copy2.drop(
    columns=["type", "id", "uri", "track_href", "analysis_url", "time_signature"]
)

**top 100 and internal database together**

## Clustering the scaled df with K-Means

In [None]:
# Standardise the playlist features before clustering. The frame is built in
# this cell because no earlier cell defines `numerical_scaled_df`; without it
# the fit below raises NameError on a fresh kernel. A later cell rebuilds the
# same frame and keeps the fitted scaler.
numerical_scaled_df = pd.DataFrame(
    StandardScaler().fit_transform(numerical),
    columns=numerical.columns,
)

# Baseline K-Means with 8 clusters; random_state fixes the initialisation so
# the result is reproducible.
kmeans = KMeans(n_clusters=8, random_state=1234)
kmeans.fit(numerical_scaled_df)

In [None]:
# Scan k = 2..20 on the scaled playlist features. For every k the fitted model
# is pickled to ../datasets/ and its inertia and silhouette score are recorded
# for the elbow / silhouette plots.
K = range(2, 21)
inertia = []
silhouette = []

for k in K:
    print(f"Training a K-Means model with {k} neighbours! ")
    print()
    kmeans = KMeans(n_clusters=k, random_state=1234).fit(numerical_scaled_df)
    inertia.append(kmeans.inertia_)
    filename = f"../datasets/kmeans_{k}.pickle"
    with open(filename, "wb") as f:
        pickle.dump(kmeans, f)
    silhouette.append(
        silhouette_score(numerical_scaled_df, kmeans.predict(numerical_scaled_df))
    )

In [None]:
# Fit a scaler on the playlist features, keep both the scaled array and a
# labelled dataframe view of it, and predict a cluster for every playlist song
# with the K-Means model currently bound to `kmeans`.
scaler = StandardScaler().fit(numerical)
numerical_scaled = scaler.transform(numerical)
numerical_scaled_df = pd.DataFrame(data=numerical_scaled, columns=numerical.columns)
display(numerical_scaled_df.head())
data_cluster = kmeans.predict(numerical_scaled)

In [None]:
# Scale the Hot 100 features with the scaler that was fitted on the playlist
# features in the previous cell, and do NOT refit it here. `kmeans` was trained
# on playlist features in that scaling, so a scaler refitted on the Hot 100
# alone would shift and stretch these songs into a different feature space and
# make the predicted clusters meaningless. Columns are put in the scaler's
# training order first.
numerical_scaled2 = scaler.transform(numerical2[numerical.columns])
numerical_scaled_df2 = pd.DataFrame(numerical_scaled2, columns=numerical.columns)
display(numerical_scaled_df2.head())
data2_cluster = kmeans.predict(numerical_scaled2)

In [None]:
# Combined feature table: playlist rows first, Hot 100 rows after them.
numerical3 = pd.concat([numerical, numerical2])

# Refit the scaler on the combined table; from here on `scaler` is the one
# trained on both datasets.
scaler = StandardScaler().fit(numerical3)
numerical_scaled3 = scaler.transform(numerical3)
numerical_scaled_df3 = pd.DataFrame(data=numerical_scaled3, columns=numerical3.columns)
display(numerical_scaled_df3.head())

In [None]:
# Scan k = 2..20 on the scaled combined (playlist + Hot 100) features. For
# every k the fitted model is pickled to ../datasets2/ and its inertia and
# silhouette score are recorded for the plots below.
K = range(2, 21)
inertia = []
silhouette = []

for k in K:
    print(f"Training a K-Means model with {k} neighbours! ")
    print()
    kmeans = KMeans(n_clusters=k, random_state=1234).fit(numerical_scaled_df3)
    inertia.append(kmeans.inertia_)
    filename = f"../datasets2/kmeans_{k}.pickle"
    with open(filename, "wb") as f:
        pickle.dump(kmeans, f)
    silhouette.append(
        silhouette_score(numerical_scaled_df3, kmeans.predict(numerical_scaled_df3))
    )

In [None]:

# Silhouette score for every k tried in the scan above, one tick per k.
fig, ax = plt.subplots(figsize=(16, 8))
ax.plot(K, silhouette, 'bx-')
ax.set_xlabel('k')
ax.set_ylabel('silhouette score')
ax.set_xticks(np.arange(min(K), max(K)+1, 1.0))
ax.set_title('Silhouette Method showing the optimal k')

In [None]:
# Inertia for every k tried in the scan above, one tick per k.
fig, ax = plt.subplots(figsize=(16, 8))
ax.plot(K, inertia, 'bx-')
ax.set_xlabel('k')
ax.set_ylabel('inertia')
ax.set_xticks(np.arange(min(K), max(K)+1, 1.0))
ax.set_title('Elbow Method showing the optimal k')

## Objective 2

Save the freshly trained standard scaler with pickle

In [None]:
# Persist the scaler. In a top-to-bottom run `scaler` is the one fitted on the
# combined playlist + Hot 100 features.
with open("../datasets/scaler.pickle", "wb") as f:
    pickle.dump(scaler, f)

# After the k-scan loop, `kmeans` refers to the LAST model trained (k=20), so
# dumping it as kmeans_4.pickle would store a 20-cluster model under a
# 4-cluster name. Fit the 4-cluster model explicitly on the combined scaled
# features and save that one.
kmeans = KMeans(n_clusters=4, random_state=1234).fit(numerical_scaled_df3)
with open("../datasets/kmeans_4.pickle", "wb") as f:
    pickle.dump(kmeans, f)

In [None]:
def load(filepath = "../data/kmeans.pickle"):
    """Load and return a pickled object (e.g. a fitted scaler or K-Means model).

    Parameters
    ----------
    filepath : str
        Path of the pickle file to read.

    Returns
    -------
    object or None
        The unpickled object, or None (after printing "File not found!") when
        the file does not exist.

    Notes
    -----
    pickle can execute arbitrary code while loading; only use this on files
    produced by this project.
    """
    try:
        # Open the path that was passed in. Reading a global named `filename`
        # here made every call return whatever file that global last pointed
        # at, regardless of the argument.
        with open(filepath, "rb") as f:
            return pickle.load(f)
    except FileNotFoundError:
        print("File not found!")
        return None

In [None]:
# Read the pickled scaler back from disk and display it as the cell output
# to check what was stored.
scaler2 = load("../datasets/scaler.pickle")
scaler2

Apply the trained scaler to transform the user’s song audio features

In [None]:
# Fetch the audio features for a fixed example title ("Garden").
# A later cell reads the result as song_features[0][0].items(), so a nested
# sequence whose innermost element is a feature dictionary is expected.
# user_song_audio_featuers (sic) is not defined in this notebook.
song_features = user_song_audio_featuers("Garden")

In [None]:
# Ask the user for the title of the song they want a recommendation for.
song_name = input("Please input the song name: ")

In [None]:
# Recommend a song from the same K-Means cluster as the song the user typed in.

# --- 1. Describe the user's song ---------------------------------------------
# Look the features up for `song_name` (the title entered above) so the
# recommendation is based on the user's song, not on a fixed example title.
song_features = user_song_audio_featuers(song_name)
song_dict = {"Song_Name": [song_name], "Artist_Name": [get_artists_name_from_song(song_name)]}
song_features_dict = { key: [value] for key, value in song_features[0][0].items() }
song_features_df = pd.DataFrame(song_features_dict)
song_df = pd.concat([pd.DataFrame(song_dict), song_features_df], axis=1)
user_song_id = song_df["id"].iloc[0]  # scalar id, used for membership tests below

# --- 2. Scale it and predict its cluster -------------------------------------
# Load the model from the location this notebook saves it to (a machine-specific
# absolute-looking path would not exist for anyone else).
kmeans = load("../datasets/kmeans_4.pickle")
if kmeans is None:
    raise FileNotFoundError(
        "../datasets/kmeans_4.pickle is missing - run the cell that saves the K-Means model first."
    )
song_numerical = song_df[numerical3.columns]
song_scaled_numpy = scaler.transform(song_numerical)
song_scaled_df = pd.DataFrame(song_scaled_numpy, columns=numerical3.columns)
user_cluster = kmeans.predict(song_scaled_df)[0]

# --- 3. Label every song of the internal databases ---------------------------
clusters = kmeans.predict(numerical_scaled_df3)

# numerical3 was built as concat([numerical, numerical2]): the playlist rows
# come FIRST and the Hot 100 rows after them, so the label array is split in
# that order. The features were computed on the rows that survived dropna(),
# so the same rows are selected here; otherwise the label slices would not line
# up with the dataframes (assign raises on a length mismatch).
n_playlist = len(numerical)
data = data.drop(columns="Cluster", errors="ignore").dropna().assign(Cluster=clusters[:n_playlist])
data2 = data2.drop(columns="Cluster", errors="ignore").dropna().assign(Cluster=clusters[n_playlist:])

# --- 4. Pick a recommendation ------------------------------------------------
# Songs found in the Hot 100 get a Hot 100 recommendation, all others one from
# the playlist database. The user's own song is excluded from the candidates.
song_in_hot100 = user_song_id in set(data2["id"])
pool = data2 if song_in_hot100 else data
candidates = pool[(pool["Cluster"] == user_cluster) & (pool["id"] != user_song_id)]

if candidates.empty:
    # DataFrame.sample() raises on an empty frame, so report it instead.
    print("Sorry, there is no other song in the same cluster to recommend.")
else:
    recomendation = candidates.sample()
    if {"Song_Names", "Artist_Names"} <= set(recomendation.columns):
        recommended_title = recomendation["Song_Names"].values[0]
        recommended_artist = recomendation["Artist_Names"].values[0]
    else:
        # The Hot 100 dataframe is built from audio features only and has no
        # name columns, so resolve title and artist through the track id.
        track = sp.track(recomendation["id"].values[0])
        recommended_title = track["name"]
        recommended_artist = track["artists"][0]["name"]

    if song_in_hot100:
        print("Maybe you will also like the song {} from artist {}".format(recommended_title, recommended_artist))
    else:
        print("Try this one. Maybe you will like the song {} from artist {}".format(recommended_title, recommended_artist))