This notebook is an example of using PySpark to explore big data, exploring the Spotify API, and building a deep-embedding recommendation system. It also contains some good examples of SQL queries for EDA.

In [None]:
%%capture
!pip install pyspark
import numpy as np 
import pandas as pd
import os
import seaborn as sns
import plotly.express as px
# these 2 lines fix a sporadic loading error in plotly
from plotly.offline import plot, iplot, init_notebook_mode
init_notebook_mode(connected=True)
sns.set_style('darkgrid')
# pyspark
from pyspark import SparkConf, SparkContext
from pyspark.sql import SparkSession, SQLContext

from pyspark.sql.types import *
import pyspark.sql.functions as F
from pyspark.sql.functions import udf, col

# from pyspark.ml.regression import LinearRegression
# from pyspark.mllib.evaluation import RegressionMetrics

# from pyspark.ml.tuning import ParamGridBuilder, CrossValidator, CrossValidatorModel
# from pyspark.ml.feature import VectorAssembler, StandardScaler
# from pyspark.ml.evaluation import RegressionEvaluator

from sklearn.pipeline import Pipeline
# from sklearn.manifold import TSNE
# from sklearn.decomposition import PCA

CSV_FILE= '/kaggle/input/spotify-huge-database-daily-charts-over-3-years/Final database.csv'
# CSV_FILE= '/kaggle/input/spotify-huge-database-daily-charts-over-3-years/Database to calculate popularity.csv'
# df = pd.read_csv(CSV_FILE)
# df.head()

From SparkByExample:
> A spark session unifies all the different contexts, and you can access all the different contexts by invoking them on the spark session object. A Spark “driver” is an application that creates a SparkContext for executing one or more jobs in the Spark cluster. It allows your Spark/PySpark application to access Spark Cluster with the help of Resource Manager.
> 
> When you create a SparkSession object, SparkContext is also created and can be retrieved using spark.sparkContext. SparkContext will be created only once for an application; even if you try to create another SparkContext, it still returns existing SparkContext.

In [None]:
# Build the notebook's Spark session on a local[2] master; getOrCreate hands back
# an already-running session instead of starting a second one.
spark = (
    SparkSession.builder
    .master("local[2]")
    .appName("Spotify-Huge-Dataset")
    .getOrCreate()  # .enableHiveSupport()
)
spark

In [None]:
# Keep a handle on the SparkContext that backs `spark`, and wrap it in a SQLContext.
# NOTE(review): the visible cells only query through `spark.sql`; confirm that
# `sqlContext` is still needed before keeping it.
sc = spark.sparkContext
sqlContext = SQLContext(sc)

Note that I'm fixing the types of the numerical features after loading the DataFrame. This is much slower than defining the schema before loading the data into a Spark DataFrame. I'll come back and define the schema explicitly later when I have some free time.

In [None]:
# Load the CSV with its header row. No schema is supplied to the reader, so the
# date and numerical columns are converted right after loading.
df = spark.read.option("header", True).csv(CSV_FILE)
df = df.withColumn("Release_date", F.to_date("Release_date", "yyyy-MM-dd"))

# Columns to cast to float. Each name appears exactly once: repeated names would
# be selected twice by the later cells that build their column list from this one.
numerical_features = ['danceability', 'energy', 'instrumentalness', 'valence', 'liveliness', 'speechiness',
                      'acoustics', 'tempo', 'duration_ms', 'time_signature', 'Days_since_release', 'n_words']
for c in numerical_features:
    df = df.withColumn(c, df[c].cast("float"))

# DataFrame.drop returns a new DataFrame, so the result must be assigned back.
# No collect() here: it would pull the whole dataset to the driver for nothing.
cols_to_drop = ['syuzhet_norm', 'bing_norm', 'afinn_norm', 'nrc_norm', 'syuzhet', 'bing']
df = df.drop(*cols_to_drop)
df.printSchema()
# df.show(n=1, truncate=False, vertical=True)

# EDA
Some good SQL queries, Plotly figures, and examples of using pyspark to filter results from a large dataset.

Most popular artist, all countries. Each tally represents a song on a given day (during the last 3 years) that was one of the 200 most played songs on that day. An artist can have multiple songs per day, and the same song can be counted again on subsequent days.

In [None]:
# Every row is one song that made the daily top-200 most played list on Spotify
# at some point in the 3 years covered, so a row count per artist is a tally of
# chart appearances. Keep the ten largest tallies and bring them into pandas.
result_df = (
    df.groupBy("Artist")
      .count()
      .orderBy("count", ascending=False)
      .limit(10)
      .toPandas()
)
px.bar(result_df, y='Artist', x='count', title='Most Prolific Artists')

In [None]:
# Seaborn rendering of the same bar chart (plotly figures sometimes don't show up
# in the published notebook).
ax = sns.barplot(data=result_df, y='Artist', x='count')
ax.set_title('Most Prolific Artists');

In [None]:
# Expose `df` to Spark SQL under the name `df_table`.
# createOrReplaceTempView is the supported replacement for the deprecated
# registerTempTable, and re-running this cell simply replaces the view.
df.createOrReplaceTempView("df_table")

In [None]:
# Most popular artists (by sum of popularity of their songs) in the USA.
# The ranking uses the same SUM that is displayed; ordering by AVG instead would
# produce a top 10 that does not match the totals shown next to it.
res = spark.sql('''
    SELECT Artist, ROUND(SUM(Popularity), 2) AS Total_Popularity
    FROM df_table
    WHERE USA == 1
    GROUP BY Artist
    ORDER BY SUM(Popularity) DESC
    LIMIT 10
''')
res.show(10, truncate=False)

In [None]:
# Count the distinct artists that appear in a 10% random sample of the rows
(
    df.sample(fraction=.1)
      .select('Artist')
      .distinct()
      .count()
)

In [None]:
# Keep only the rows whose release year is 1939 and show up to five distinct songs
released_in_1939 = F.year(F.col('Release_date')) == 1939
(
    df.where(released_in_1939)
      .select('Title', 'Artist', 'Release_date', 'Genre')
      .distinct()
      .show(5, truncate=False)
)

### Most Popular Song per Decade
First with a nested query and using pandas to drop duplicates. Then optimized with [scalar-aggregate reduction](https://www.stevenmoseley.com/blog/tech/high-performance-sql-correlated-scalar-aggregate-reduction-queries)

In [None]:
# Most popular USA song per decade, using a nested query plus pandas de-duplication.
#
# - The subquery `temp` finds the highest popularity per decade among USA rows.
# - The join matches on BOTH the decade and that maximum. Joining on popularity
#   alone lets a row match the maximum of a different decade.
# - The outer rows get the same USA filter as the subquery.
# - Popularity is cast to DOUBLE so MAX compares numbers; the column is not among
#   the ones cast to float at load time.
# - The decade is FLOOR(year / 10) * 10, so 1995 belongs to 1990 (rounding to the
#   nearest ten would put it in 2000).
res = spark.sql('''
    SELECT
        songs.Decade,
        ROUND(songs.Popularity, 2) AS Popularity,
        songs.Title,
        songs.Artist
    FROM (
        SELECT
            FLOOR(YEAR(Release_date) / 10) * 10 AS Decade,
            CAST(Popularity AS DOUBLE) AS Popularity,
            Title,
            Artist
        FROM df_table
        WHERE Release_date IS NOT NULL
            AND USA == 1
    ) AS songs
    INNER JOIN (
        SELECT
            FLOOR(YEAR(Release_date) / 10) * 10 AS Decade,
            MAX(CAST(Popularity AS DOUBLE)) AS mp
        FROM df_table
        WHERE Release_date IS NOT NULL
            AND USA == 1
        GROUP BY FLOOR(YEAR(Release_date) / 10) * 10
    ) AS temp
    ON temp.Decade = songs.Decade
        AND temp.mp = songs.Popularity
    ORDER BY songs.Decade ASC
''')
# The same song can hold the maximum on several days, so keep one row per decade.
res.toPandas().drop_duplicates(subset='Decade', keep="last")

In [None]:
# highly optimized version of the above query via scalar-aggregate-reduction
spark.sql('SELECT \
              ROUND(Year(Release_date), -1) as Decade, \
              ROUND(Max(Popularity), 2) as Popularity, \
              SUBSTRING(MAX(CONCAT(LPAD(Popularity, 11, 0), Title)), 12) AS Title, \
              SUBSTRING(MAX(CONCAT(LPAD(Popularity, 11, 0), Artist)), 12) AS Artist \
          FROM df_table \
              WHERE ROUND(Year(Release_date), -1) IS NOT NULL \
                  AND USA == 1 \
          GROUP BY Decade \
          ORDER BY Decade ASC \
          ').show()

### Most popular Genre per decade

In [None]:
 # Most popular genres overall: count the rows of df_table per Genre and show the
 # five genres with the highest tally.
spark.sql('SELECT \
              Genre, COUNT(*) AS Tally \
          FROM df_table \
          GROUP BY Genre \
          ORDER BY Tally DESC \
          ').show(5)

In [None]:
# Most frequent genre per decade.
#
# - The inner query counts rows per (Decade, Genre).
# - ROW_NUMBER ranks the genres inside each decade by that count, and only rank 1
#   is kept. dropDuplicates after a global sort does not guarantee that the first
#   row of each decade survives.
# - The decade is FLOOR(year / 10) * 10, so 1995 belongs to 1990.
# - `res` holds the resulting DataFrame; .show() is called separately because it
#   returns None.
res = spark.sql('''
    SELECT Decade, Genre, counts
    FROM (
        SELECT
            Decade,
            Genre,
            counts,
            ROW_NUMBER() OVER (PARTITION BY Decade ORDER BY counts DESC) AS genre_rank
        FROM (
            SELECT
                FLOOR(YEAR(Release_date) / 10) * 10 AS Decade,
                Genre,
                COUNT(Genre) AS counts
            FROM df_table
            WHERE Release_date IS NOT NULL
            GROUP BY FLOOR(YEAR(Release_date) / 10) * 10, Genre
        ) AS genre_counts
    ) AS ranked_genres
    WHERE genre_rank = 1
    ORDER BY Decade
''')
res.show()
# res.toPandas().drop_duplicates(subset='Decade', keep="first")

## Let's see how music changed over the decades

In [None]:
# Average of each sound feature per release decade, computed on a seeded 20% sample.
sound_features = ['danceability', 'energy', 'instrumentalness', 'valence', 'liveliness', 'speechiness', 'acoustics']
col_names = ['Decade', *sound_features]

# FLOOR(year / 10) * 10, so 1995 belongs to 1990 (rounding would put it in 2000).
decade = (F.floor(F.year(F.col('Release_date')) / 10) * 10).alias('Decade')

# Every aggregate is aliased explicitly. The dict form of agg() does not guarantee
# the order of its output columns, so renaming them by position could attach a
# feature name to another feature's mean.
df_music_features = (
    df.sample(fraction=.2, seed=42)
      .groupBy(decade)
      .agg(*[F.mean(feature).alias(feature) for feature in sound_features])
      .orderBy('Decade')
      .toPandas()
      .dropna(axis=0)  # drops the group of rows without a parsable release date
)
fig = px.line(df_music_features, x='Decade', y=sound_features, title='Song Characteristics Over the Decades')
fig.show()

In [None]:
# same as above but with seaborn. (sometimes plotly doesn't show up in the published notebok)
# Reshape to long form (one row per decade/feature pair) so seaborn can draw one
# line per feature.
features_long = pd.melt(df_music_features, ['Decade'])
ax = sns.lineplot(data=features_long, x='Decade', y='value', hue='variable')
ax.set_title('Song Characteristics Over the Decades');

# Let's check out the spotify API

[currently based off this](https://www.kaggle.com/vatsalmavani/music-recommendation-system-using-spotify-dataset). We can extract more song information than is provided by the dataset by interacting with the Spotify API. Using this, we can get features like song length using `spotipy.audio_features()`

In [None]:
from kaggle_secrets import UserSecretsClient

# Read the Spotify API credentials from the Kaggle secrets store so they never
# appear in the notebook itself.
user_secrets = UserSecretsClient()
SPOTIFY_CLIENT_ID, SPOTIFY_CLIENT_SECRET = (
    user_secrets.get_secret(secret_name)
    for secret_name in ("SPOTIFY_CLIENT_ID", "SPOTIFY_CLIENT_SECRET")
)

In [None]:
%%capture
!pip install spotipy

In [None]:
import spotipy
from spotipy.oauth2 import SpotifyClientCredentials
from collections import defaultdict

# Authenticate with the client-credentials flow and create the API client used
# by the cells below.
credentials_manager = SpotifyClientCredentials(
    client_id=SPOTIFY_CLIENT_ID,
    client_secret=SPOTIFY_CLIENT_SECRET,
)
sp = spotipy.Spotify(auth_manager=credentials_manager)

In [None]:
# Search for a specific song title, then drill into the returned JSON:
# first track hit -> its album entry.
search_response = sp.search(q='track: smells like teen spirit')
search_response['tracks']['items'][0]['album']

In [None]:
def find_song(name, year):
    """Look up one song on Spotify and return its details as a one-row DataFrame.

    Parameters
    ----------
    name : str
        Track title to search for.
    year : int or str
        Release year used to narrow the search.

    Returns
    -------
    pandas.DataFrame or None
        One row holding name, year, explicit, duration_ms and popularity, plus
        every key of the track's audio-features payload when one is available.
        None when the search returns no track.
    """
    matches = sp.search(q=f'track: {name} year: {year}', limit=1)['tracks']['items']
    if not matches:
        return None

    track = matches[0]
    song_data = {
        'name': [name],
        'year': [year],
        'explicit': [int(track['explicit'])],
        'duration_ms': [track['duration_ms']],
        'popularity': [track['popularity']],
    }

    # Guard against a missing payload (empty list or a None entry); calling
    # .items() on it would raise instead of returning the metadata we do have.
    features = sp.audio_features(track['id'])
    audio_features = features[0] if features else None
    if audio_features:
        # Scalars are broadcast against the one-element lists above.
        song_data.update(audio_features)

    return pd.DataFrame(song_data)

## Compare song similarities in preparation for making recommendations

There are a few possible approaches for comparing song similarities. One is to use just the continuous, numerical variables (things like danceability, energy, etc.) and apply PCA, k-means, or some other technique to reduce dimensionality. If you're only considering the song features (continuous variables), you could create a feature vector per song and use the cosine similarity to find the most similar-sounding song, taking into account the numerical features and the one-hot-encoded countries.

Some options: 
- [Non-linear PCA (NLPCA) With CATPCA](https://pubmed.ncbi.nlm.nih.gov/22176263/)
- [Factor Analysis of Mixed Data (FAMD)](https://github.com/MaxHalford/Prince#factor-analysis-of-mixed-data-famd)

Alternatively, we can create an embedding, where we map all the songs into an n-dimensional feature space and then look for the most similar vectors in this space (probably with k-NN), which gives us the k most similar songs.

## First let's do recommendations via the cosine similarity of song feature vectors

In [None]:
def sample_genre_songs(genre_column, fraction=0.1, seed=42):
    """Return a pandas sample of the songs flagged with one genre.

    Selects Title, Artist and the numerical feature columns from `df_table` for
    the rows where `genre_column` equals 1, samples the result, drops rows that
    contain nulls and converts what is left to pandas.

    Parameters
    ----------
    genre_column : str
        Name of the genre flag column in `df_table` (e.g. 'k-pop', 'rap').
    fraction : float
        Fraction of the matching rows to sample.
    seed : int
        Sampling seed, so the rows picked by index in later cells stay the same
        between runs.
    """
    # dict.fromkeys keeps the first occurrence of each name, so a feature listed
    # twice is still selected only once.
    feature_columns = ', '.join(dict.fromkeys(numerical_features))
    query = f'SELECT Title, Artist, {feature_columns} FROM df_table WHERE `{genre_column}` = 1'
    # don't do this, it's better to sample before querying
    return (
        spark.sql(query)
             .sample(fraction=fraction, seed=seed)
             .dropna()
             .toPandas()
    )


df_kpop_songs = sample_genre_songs('k-pop')
df_rap_songs = sample_genre_songs('rap')

df_rap_songs.head()

In [None]:
df_kpop_songs.head()

In [None]:
# it might be better to use a normalized cosine similarity instead of scaling first and then doing it.
from scipy import spatial
from sklearn.preprocessing import Normalizer
from sklearn.preprocessing import MinMaxScaler
scaler = MinMaxScaler()
transformer = Normalizer()

# Feature matrices without the leading Title and Artist columns
kpop_features = df_kpop_songs.iloc[:, 2:].to_numpy()
rap_features = df_rap_songs.iloc[:, 2:].to_numpy()

# Fit the scaler ONCE on both genres. Refitting per genre maps each genre onto
# its own min/max range, so a rap vector and a k-pop vector would not be on the
# same scale and their cosine similarity would not be comparable.
scaler.fit(np.vstack([kpop_features, rap_features]))
scaled_kpop_df = scaler.transform(kpop_features)
scaled_rap_df = scaler.transform(rap_features)


# cos similarity of a rap and a k-pop song
song1 = np.array(scaled_rap_df[1])
song2 = np.array(scaled_kpop_df[2])
result = 1 - spatial.distance.cosine(song1, song2)
result

In [None]:
# Cosine similarity between two rap songs (rows 1 and 10 of the scaled rap matrix)
song1, song2 = (np.array(scaled_rap_df[row]) for row in (1, 10))
result = 1 - spatial.distance.cosine(song1, song2)
result

## And now with embeddings

(Still working on this, but keeping some of the boiler plate in this version in case someone wants it)

In [None]:
# Train on a 10% random sample of the dataset only: numerical features plus Title,
# with incomplete rows removed before the conversion to pandas.
songs_sample = df.sample(fraction=.1).select(*numerical_features, 'Title').dropna()
songs_pd_df = songs_sample.toPandas()

# pop() removes Title from the frame, leaving only the numerical features behind
songs_labels = songs_pd_df.pop('Title')
song_ids = songs_pd_df.index.to_numpy()
# float32 so that tensorflow can auto-convert the array to tensors
songs_arr = songs_pd_df.to_numpy().astype('float32')
songs_pd_df.head()

In [None]:
# Shape of the float32 feature matrix built from the sampled songs
songs_arr.shape

In [None]:
# Shape of the array holding the pandas row index of each sampled song
song_ids.shape

Still working on this. 

In [None]:
# from tensorflow import keras
# from tensorflow.keras import layers
# # from keras.layers import Dense , Flatten ,Embedding,Input

# EMBEDDING_SIZE = 10
# NUM_SONGS, ROW_COUNT = songs_arr.shape[0] + 1, songs_arr.shape[0] + 1

# model = keras.Sequential([
#         layers.Embedding(input_dim=NUM_SONGS, 
#                          output_dim=EMBEDDING_SIZE, 
#                          input_length=16), # , input_length=ROW_COUNT
#         layers.GlobalAveragePooling1D(),
#         layers.Dense(24, activation='relu'),
#         layers.Dense(6, activation='softmax')
#                         ])

# model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy']) #'sparse_categorical_crossentropy'
# num_epochs = 10
# history = model.fit(songs_arr, song_ids,  epochs=num_epochs,
#                     validation_split=.2,  verbose=2)

# def plot_graphs(history, string):
#   plt.plot(history.history[string])
#   plt.plot(history.history['val_'+string])
#   plt.xlabel("Epochs")
#   plt.ylabel(string)
#   plt.legend([string, 'val_'+string])
#   plt.show()
  
# plot_graphs(history, "accuracy")
# plot_graphs(history, "loss")