In [None]:
# Import necessary libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import spotipy
import datapane as dp
import chart_studio.plotly as py
import chart_studio
import os

# Setting up Plotly (Chart Studio) credentials.
# Both values are read from environment variables so that no secret is stored
# in the notebook itself; a missing variable raises KeyError on these lines.
username = os.environ['PLOTLY_USERNAME']
api_key = os.environ['PLOTLY_API_KEY']
chart_studio.tools.set_credentials_file(username=username, api_key=api_key)

# Adjusting matplotlib font size (applies to every matplotlib figure created
# afterwards) and asking IPython to render figures inline.
plt.rcParams.update({'font.size': 22})
%matplotlib inline

In [None]:
# Read data from CSV files (paths are relative to the notebook's working directory)
spotify_data = pd.read_csv('./data/data.csv.zip')
genre_data = pd.read_csv('./data/data_by_genres.csv')
data_by_year = pd.read_csv('./data/data_by_year.csv')

# Print a schema summary of each dataset; `.info()` prints as a side effect
spotify_data.info()
genre_data.info()
data_by_year.info()

# A cell only renders the value of its final expression, so the preview of the
# first rows goes last; placed mid-cell its result would be silently discarded.
spotify_data.head(10)

In [None]:
## Exploratory Data Analysis
# Distribution of song popularity: density-normalised histogram with a KDE
# overlay. `sns.distplot` is deprecated in seaborn; `histplot` with `kde=True`
# and `stat='density'` is its documented replacement for this kind of figure.
sns.histplot(spotify_data['popularity'], kde=True, stat='density')

In [None]:
# Map a year onto the label of the decade that contains it
def get_decade(year):
    """Return the decade label for `year`, e.g. 1987 -> '1980s'.

    The year is divided by ten, truncated to an integer and multiplied back,
    which yields the first year of the decade; an 's' suffix is appended.
    """
    return '{}s'.format(10 * int(year / 10))

# Apply the 'get_decade' function to create a new 'decade' column in spotify_data
spotify_data['decade'] = spotify_data['year'].apply(get_decade)

# Create a count plot for the 'decade' column.
# The Series is passed explicitly as `x=`: in recent seaborn releases the first
# positional parameter of `countplot` is `data`, so a positional Series is no
# longer interpreted as the variable to count.
sns.set(rc={'figure.figsize':(11, 6)})
sns.countplot(x=spotify_data['decade'])

import plotly.express as px

In [None]:
# Columns of data_by_year that are drawn together, one line per feature
sound_features = [
    'acousticness',
    'danceability',
    'energy',
    'instrumentalness',
    'liveness',
    'valence',
]

# Interactive Plotly line chart of every feature above against the year
fig = px.line(data_frame=data_by_year, x='year', y=sound_features)
fig.show()

# Wrap the figure in a Datapane report and publish it publicly
report = dp.Report(dp.Plot(fig))
report.publish(
    name='music_over_time',
    open=True,
    visibility='PUBLIC',
)


In [None]:
# Interactive Plotly line chart of the 'tempo' column against the year
fig = px.line(data_frame=data_by_year, x='year', y='tempo')
fig.show()

# Wrap the figure in a Datapane report and publish it publicly
report = dp.Report(dp.Plot(fig))
report.publish(
    name='music_tempo_over_time',
    open=True,
    visibility='PUBLIC',
)


In [None]:
# Keep the ten rows of genre_data with the highest 'popularity'
top10_genres = genre_data.nlargest(n=10, columns='popularity')

# Grouped Plotly bar chart: one group per genre, one bar per audio feature
fig = px.bar(
    data_frame=top10_genres,
    x='genres',
    y=['valence', 'energy', 'danceability', 'acousticness'],
    barmode='group',
)
fig.show()

# Wrap the figure in a Datapane report and publish it publicly
report = dp.Report(dp.Plot(fig))
report.publish(
    name='sound_of_different_genres',
    open=True,
    visibility='PUBLIC',
)


In [None]:
# Import necessary libraries for clustering
from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline

# Create a clustering pipeline with StandardScaler and KMeans.
# `n_jobs` is no longer passed: KMeans deprecated that parameter and current
# scikit-learn releases reject it with a TypeError. `random_state` is fixed so
# the cluster assignment is reproducible across kernel restarts.
cluster_pipeline = Pipeline([
    ('scaler', StandardScaler()),
    ('kmeans', KMeans(n_clusters=10, random_state=42)),
])

# Select numeric columns from genre_data. The 'cluster' column written at the
# end of this cell is dropped first (if present) so that re-running the cell
# does not feed the previous labels back in as a feature.
X = genre_data.drop(columns=['cluster'], errors='ignore').select_dtypes(np.number)

# Fit the clustering pipeline to the data and create a 'cluster' column in genre_data
cluster_pipeline.fit(X)
genre_data['cluster'] = cluster_pipeline.predict(X)

# Import TSNE for dimensionality reduction
from sklearn.manifold import TSNE

# Two-step pipeline: standardise the features, then embed them in 2-D with t-SNE
tsne_pipeline = Pipeline([
    ('scaler', StandardScaler()),
    ('tsne', TSNE(n_components=2, verbose=2)),
])

# Run the pipeline on X to get one 2-D point per row
genre_embedding = tsne_pipeline.fit_transform(X)

# Collect the coordinates together with each row's genre name and cluster label
projection = pd.DataFrame(genre_embedding, columns=['x', 'y']).assign(
    genres=genre_data['genres'],
    cluster=genre_data['cluster'],
)

# Plotly scatter of the 2-D genre embedding, coloured by cluster label;
# hovering a point shows its coordinates and genre name
fig = px.scatter(
    data_frame=projection,
    x='x',
    y='y',
    color='cluster',
    hover_data=['x', 'y', 'genres'],
)
fig.show()

# Wrap the figure in a Datapane report and publish it publicly
report = dp.Report(dp.Plot(fig))
report.publish(
    name='clustering_genres',
    open=True,
    visibility='PUBLIC',
)


In [None]:
# Create a pipeline for clustering songs.
# `n_jobs` is no longer passed: KMeans deprecated that parameter and current
# scikit-learn releases reject it with a TypeError. `random_state` is fixed so
# the cluster assignment is reproducible across kernel restarts.
song_cluster_pipeline = Pipeline([('scaler', StandardScaler()),
                                  ('kmeans', KMeans(n_clusters=20,
                                                    verbose=2,
                                                    random_state=42))], verbose=True)

# Select numeric columns from spotify_data. The 'cluster_label' column written
# at the end of this cell is dropped first (if present) so that re-running the
# cell does not feed the previous labels back in as a feature.
X = spotify_data.drop(columns=['cluster_label'], errors='ignore').select_dtypes(np.number)

# Get the column names of numeric columns
number_cols = list(X.columns)

# Fit the song clustering pipeline to the data
song_cluster_pipeline.fit(X)

# Predict cluster labels for songs and add them as a new column in spotify_data
song_cluster_labels = song_cluster_pipeline.predict(X)
spotify_data['cluster_label'] = song_cluster_labels

# Import PCA for dimensionality reduction
from sklearn.decomposition import PCA

# Two-step pipeline: standardise the features, then project them onto the
# first two principal components
pca_pipeline = Pipeline([
    ('scaler', StandardScaler()),
    ('PCA', PCA(n_components=2)),
])

# Run the pipeline on X to get one 2-D point per song
song_embedding = pca_pipeline.fit_transform(X)

# Collect the coordinates together with each song's title and cluster label
projection = pd.DataFrame(song_embedding, columns=['x', 'y']).assign(
    title=spotify_data['name'],
    cluster=spotify_data['cluster_label'],
)

# Plotly scatter of the 2-D song embedding, coloured by cluster label;
# hovering a point shows its coordinates and song title
fig = px.scatter(
    data_frame=projection,
    x='x',
    y='y',
    color='cluster',
    hover_data=['x', 'y', 'title'],
)
fig.show()

# Wrap the figure in a Datapane report and publish it publicly
report = dp.Report(dp.Plot(fig))
report.publish(
    name='clustering_songs',
    open=True,
    visibility='PUBLIC',
)


In [None]:
## Building a Content-Based Recommender System
from spotipy.oauth2 import SpotifyClientCredentials
from collections import defaultdict

# Build the Spotipy client using the client-credentials flow; the id and secret
# are read from the environment (KeyError if either variable is missing)
sp = spotipy.Spotify(
    auth_manager=SpotifyClientCredentials(
        client_id=os.environ["SPOTIFY_CLIENT_ID"],
        client_secret=os.environ["SPOTIFY_CLIENT_SECRET"],
    )
)

# Function to find a song by name and year
def find_song(name, year):
    """Look a track up on Spotify and return its data as a one-row DataFrame.

    Parameters
    ----------
    name : str
        Track title to search for.
    year : int
        Release year used to narrow the search.

    Returns
    -------
    pandas.DataFrame or None
        One row holding `name`, `year`, `explicit`, `duration_ms`,
        `popularity` plus every key of the track's audio-features record.
        None when the search has no hit or no audio features are available.
    """
    # Spotify field filters are written `field:value` with no space after the
    # colon; with a space the words are treated as plain keywords instead.
    results = sp.search(q='track:{} year:{}'.format(name, year), limit=1)
    items = results['tracks']['items']
    if not items:
        return None

    track = items[0]

    # audio_features returns a list with one entry per requested id; guard
    # against an empty list or a None entry before iterating over it.
    features = sp.audio_features(track['id'])
    audio_features = features[0] if features else None
    if audio_features is None:
        return None

    # List-valued columns fix the frame at one row; the scalar audio features
    # added below are broadcast to that single row by the DataFrame constructor.
    song_data = {
        'name': [name],
        'year': [year],
        'explicit': [int(track['explicit'])],
        'duration_ms': [track['duration_ms']],
        'popularity': [track['popularity']],
    }
    song_data.update(audio_features)

    return pd.DataFrame(song_data)

from collections import defaultdict
from sklearn.metrics import euclidean_distances
from scipy.spatial.distance import cdist
import difflib

# Numeric feature columns, in the order they are selected from spotify_data
# and from individual song records by the recommender functions below
number_cols = [
    'valence',
    'year',
    'acousticness',
    'danceability',
    'duration_ms',
    'energy',
    'explicit',
    'instrumentalness',
    'key',
    'liveness',
    'loudness',
    'mode',
    'popularity',
    'speechiness',
    'tempo',
]

# Fetch one song's record, preferring the local dataset over the Spotify API
def get_song_data(song, spotify_data):
    """Return the record for `song`, a dict with 'name' and 'year' keys.

    The first row of `spotify_data` whose name and year both match is returned
    as a Series. When no row matches, the lookup is delegated to `find_song`
    and its result is returned unchanged.
    """
    same_name = spotify_data['name'] == song['name']
    same_year = spotify_data['year'] == song['year']
    matches = spotify_data[same_name & same_year]

    if len(matches) == 0:
        return find_song(song['name'], song['year'])
    return matches.iloc[0]


# Function to calculate the mean vector of a list of songs
def get_mean_vector(song_list, spotify_data):
    """Return the element-wise mean of the feature vectors of `song_list`.

    Parameters
    ----------
    song_list : list of dict
        Seed songs, each with 'name' and 'year' keys.
    spotify_data : pandas.DataFrame
        Local dataset searched first by `get_song_data`.

    Returns
    -------
    numpy.ndarray
        1-D float array with one entry per column in `number_cols`.

    Raises
    ------
    ValueError
        If none of the songs could be resolved, since a mean over zero
        vectors is undefined.
    """
    song_vectors = []
    for song in song_list:
        song_data = get_song_data(song, spotify_data)
        if song_data is None:
            print('Warning: {} does not exist in Spotify or in database'.format(song['name']))
            continue
        # get_song_data yields a Series for rows found locally but a one-row
        # DataFrame when it falls back to find_song; flatten both to a 1-D
        # float vector so the rows can be stacked into a regular matrix.
        song_vector = np.asarray(song_data[number_cols].values, dtype=float).ravel()
        song_vectors.append(song_vector)

    if not song_vectors:
        raise ValueError('None of the given songs exist in Spotify or in database')

    song_matrix = np.vstack(song_vectors)
    return np.mean(song_matrix, axis=0)

# Function to flatten a list of dictionaries into a single dictionary
def flatten_dict_list(dict_list):
    """Merge a list of dicts into one dict mapping each key to a list of values.

    Values are appended in the order the dictionaries appear. An empty input
    yields an empty mapping, and a key that first appears in a later dictionary
    simply starts its own list.

    Parameters
    ----------
    dict_list : list of dict

    Returns
    -------
    collections.defaultdict
        Key -> list of values. The default factory is cleared before
        returning, so looking up a missing key raises KeyError as with a
        plain dict.
    """
    flattened_dict = defaultdict(list)
    for dictionary in dict_list:
        for key, value in dictionary.items():
            flattened_dict[key].append(value)

    # Disable auto-creation of entries so callers see ordinary dict semantics.
    flattened_dict.default_factory = None
    return flattened_dict


#Function to recommend songs based on a list of seed songs
def recommend_songs( song_list, spotify_data, n_songs=10):
    """Recommend the songs closest to the centre of the given seed songs.

    The seed songs' feature vectors are averaged, scaled with the scaler of
    `song_cluster_pipeline`, and compared by cosine distance against every
    row of `spotify_data`.

    Parameters
    ----------
    song_list : list of dict
        Seed songs, each with 'name' and 'year' keys.
    spotify_data : pandas.DataFrame
        Candidate songs; must contain the `number_cols` columns plus
        'name', 'year' and 'artists'.
    n_songs : int, default 10
        Maximum number of recommendations to return.

    Returns
    -------
    list of dict
        Up to `n_songs` records with 'name', 'year' and 'artists', nearest
        first, excluding any song whose name equals a seed song's name.
    """
    metadata_cols = ['name', 'year', 'artists']
    song_dict = flatten_dict_list(song_list)

    song_center = get_mean_vector(song_list, spotify_data)
    scaler = song_cluster_pipeline.steps[0][1]
    scaled_data = scaler.transform(spotify_data[number_cols])
    scaled_song_center = scaler.transform(song_center.reshape(1, -1))
    # cdist returns a (1, n_rows) matrix; keep the single row of distances.
    distances = cdist(scaled_song_center, scaled_data, 'cosine')[0]

    # Rank every row first and drop the seed songs before truncating, so that
    # the seeds (usually the nearest rows) do not use up the n_songs slots.
    ranked = np.argsort(distances)
    is_seed = spotify_data['name'].isin(song_dict['name']).values
    nearest = ranked[~is_seed[ranked]][:n_songs]

    rec_songs = spotify_data.iloc[nearest]
    return rec_songs[metadata_cols].to_dict(orient='records')

# Example 1: recommendations for a list of seed songs from 1991-1993
first_recommendations = recommend_songs([{'name': 'Come As You Are', 'year':1991},
                                         {'name': 'Smells Like Teen Spirit', 'year': 1991},
                                         {'name': 'Lithium', 'year': 1992},
                                         {'name': 'All Apologies', 'year': 1993},
                                         {'name': 'Stay Away', 'year': 1993}],  spotify_data)

# Example 2: recommendations for a second list of seed songs
second_recommendations = recommend_songs([{'name':'Beat It', 'year': 1982},
                                          {'name': 'Billie Jean', 'year': 1988},
                                          {'name': 'Thriller', 'year': 1982}], spotify_data)

# A call that omits `spotify_data` raises TypeError and aborts the cell, so
# only complete calls are kept. Both results are the final expression because
# a cell renders nothing but its last value.
first_recommendations, second_recommendations