In [69]:
import numpy as np
import pandas as pd

import spotipy
from spotipy.oauth2 import SpotifyClientCredentials
from spotipy.oauth2 import SpotifyOAuth
import spotipy.util as util

import requests

In [70]:
import pandas as pd
import numpy as np
import json
import re 
import sys
import itertools
from collections import defaultdict
import csv

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.preprocessing import MinMaxScaler
import matplotlib.pyplot as plt


import spotipy
from spotipy.oauth2 import SpotifyClientCredentials
from spotipy.oauth2 import SpotifyOAuth
import spotipy.util as util

import warnings
warnings.filterwarnings("ignore")

In [71]:
%matplotlib inline

# Getting the Data

We are going to utilize the Open Opus API to retrieve our metadata 

In [72]:
# composer_response = requests.get("https://api.openopus.org/composer/list/pop.json")
# print(composer_response.status_code)

This call gets a list of popular composers. Let's take a look.

In [73]:
# print(composer_response.json())

Only run the following once to retrieve Open Opus entire dataset:

In [74]:
# response = requests.get("https://api.openopus.org/work/dump.json")
# print(response.status_code)

In [75]:
# json_data = response.json()
# with open('data.json', 'w') as f:
#     json.dump(json_data, f)

### Let's take a look at the types of metadata

### Attributes for each Composer:
- ID
- Name
- Complete Name
- Birth
- Death
- Epoch (period): Medieval, Renaissance, Baroque, Classical, Early Romantic, Romantic, Late Romantic, 20th Century, Post-War, 21st Century
- Portrait (link to jpeg)
- Popular (0 or 1)
- Recommended (0 or 1)

### Attributes for each work:
- Title
- Subtitle
- Searchterms (mostly left empty)
- Popular (0 or 1)
- Genre: Chamber, Keyboard, Orchestral, Stage, Vocal

### Observations:
- Currently the data is at composer level, we want it to be sorted by work
- There are hardly any numerical values that can be used to compare works, we will have to create one hot encodings for attributes like genre, epoch (period), and composer
- We don't have a lot of important metadata with numerical values to compare works
    - Some types of metadata spotify holds about pop songs can also apply here:
        - duration
        - energy
        - key
        - liveliness
        - loudness
        - mode (major or minor)
        - tempo
        - time signature
- There are many different recordings on Spotify of the same work. Each recording has its own unique set of metadata. Spotify does not necessarily have this information readily available; rather, it can be obtained from the title, album, or artist.
- Other types of metadata specific to classical music that would be helpful to have:
    - composer
    - movements
    - instrumentation (right now genre is a decent representation of this, but it would be nice to have solo, or small ensemble instrumentation)
    - performer


The Open Opus dataset gives us important information about composers and works, however we will need Spotify's api in order to get track specific information. 

### Data reconfiguration:
- configure data by work instead of by composer

In [76]:
class Composer(object):
    """Plain record holding the composer-level metadata fields.

    Attributes:
        name (str): short name of the composer
        complete_name (str): full name of the composer
        epoch (str): period label of the composer
        popularity: popularity flag of the composer
        recommended: recommended flag of the composer
    """

    # Typed class-level defaults. The previous form (`name: ""`) only stored
    # the literal as an *annotation*; it created no attribute and no type.
    name: str = ""
    complete_name: str = ""
    epoch: str = ""
    popularity: int = 0
    recommended: int = 0

    # The class "constructor" - It's actually an initializer
    def __init__(self, name, complete_name, epoch, popularity, recommended):
        """Store the given values on the instance, shadowing the class defaults."""
        self.name = name
        self.complete_name = complete_name
        self.epoch = epoch
        self.popularity = popularity
        self.recommended = recommended

In [77]:
# Read the locally cached dump back in.
with open('data.json', 'r') as file:
    json_data = json.load(file)

# Composer-level table, exported exactly as it appears in the dump.
composer_df = pd.DataFrame(json_data['composers'])
composer_df.to_csv('composer_data.csv', index=False)

# (output column, source key) pairs. Their order fixes the column order of work_df.
work_fields = (
    ('title', 'title'),
    ('subtitle', 'subtitle'),
    ('searchterms', 'searchterms'),
    ('popularity', 'popular'),
    ('recommended', 'recommended'),
    ('genre', 'genre'),
)
composer_fields = (
    ('composer', 'complete_name'),
    ('epoch', 'epoch'),
    ('composer_popularity', 'popular'),
    ('composer_recommended', 'recommended'),
)

# Flatten to one row per work, repeating the owning composer's fields on each row.
work_dict = defaultdict(list)
for composer in json_data['composers']:
    for work in composer['works']:
        for column, key in work_fields:
            work_dict[column].append(work[key])
        for column, key in composer_fields:
            work_dict[column].append(composer[key])

work_df = pd.DataFrame(work_dict)
work_df.to_csv('works_data.csv', index=False)

## Spotify Parse

The data we have is now sorted by work. However Spotify classifies songs by track and tracks are often specific movements within a work. Since the end goal is to play recommended songs on spotify, we are going to parse through all the tracks within the classical music genre on spotify and classify them within the work dataset we currently have.

In [78]:
# client id and secret for my application
client_id = '675300f7838a4a01af58e05419beae7d'

# store secret id in txt file name client_secret_file
with open('client_secret_file.txt', 'r') as file:
    # strip() drops the trailing newline (and any stray whitespace) that text
    # files usually end with; otherwise it would become part of the secret.
    client_secret = file.read().strip()


In [79]:
scope = 'user-library-read'

# The username is taken from the first command-line argument; without one,
# print the usage line and stop.
# NOTE(review): inside a notebook kernel, sys.argv presumably holds the kernel
# launcher's own arguments rather than a username - confirm before relying on it.
if len(sys.argv) <= 1:
    print("Usage: %s username" % (sys.argv[0],))
    sys.exit()
username = sys.argv[1]

In [80]:
# App-level (client credentials) authentication, built from the id/secret loaded earlier.
auth_manager = SpotifyClientCredentials(
    client_id=client_id,
    client_secret=client_secret,
)
sp = spotipy.Spotify(auth_manager=auth_manager)

In [81]:
# The first positional parameter of prompt_for_user_token is the username, so
# the scope has to be passed by keyword; passed positionally it was taken as
# the username and no scope was requested at all.
token = util.prompt_for_user_token(
    scope=scope,
    client_id=client_id,
    client_secret=client_secret,
    redirect_uri='http://localhost:8881',
)

In [82]:
# Rebind `sp` to a client that authenticates with the user token obtained above;
# this replaces the client-credentials client created earlier under the same name.
sp = spotipy.Spotify(auth=token)

The search function of the spotify API can only return 50 items at a time. So we are going to search for each work in our dataset and take the most popular 50 tracks.

In [83]:
# Disabled draft of the per-work Spotify search, kept as a bare string so it never runs.
# NOTE(review): `word_df` and `year` are not defined in the cells shown here
# (the works table is called `work_df`), and `curr_work` / `curr_comp` are not
# used in the query - the draft needs rework before it can be enabled.
'''
for index, row in word_df.iterrows():
    curr_work = row['title']
    curr_comp = row['composer']
    tracks = sp.search(q='genre:classical' + '&year:' + year, type='album',market='GB',limit=50, offset=0)
'''

"\nfor index, row in word_df.iterrows():\n    curr_work = row['title']\n    curr_comp = row['composer']\n    tracks = sp.search(q='genre:classical' + '&year:' + year, type='album',market='GB',limit=50, offset=0)\n"

Ok jk I might not be able to parse spotify data like that ??? since a recommendation engine is kinda machine learning/Ai

# Feature Engineering
1. Normalize float variables
2. Create OHE variables
3. Create TF-IDF features off of composers

In [84]:
work_df.tail()

Unnamed: 0,title,subtitle,searchterms,popularity,recommended,genre,composer,epoch,composer_popularity,composer_recommended
24970,"Waldgespräch, ballad for soprano, 2 horns, har...",,,0,0,Vocal,Alexander von Zemlinsky,Late Romantic,0,
24971,"Waltz Songs on Tuscan Folk Lyrics, for voice a...",,,0,0,Vocal,Alexander von Zemlinsky,Late Romantic,0,
24972,"Waltz Songs on Tuscan Folk Lyrics, for voice a...",,,0,0,Vocal,Alexander von Zemlinsky,Late Romantic,0,
24973,"Wandl' ich im Wald des Abends, for voice and p...",,,0,0,Vocal,Alexander von Zemlinsky,Late Romantic,0,
24974,Zwölf Lieder op.27,,,0,0,Vocal,Alexander von Zemlinsky,Late Romantic,0,


### One Hot Encodings:
We want one hot encodings for genre, epoch, and composer

In [85]:
#simple function to create OHE features
#this gets passed later on
def ohe(df, column): 
    """ 
    Build one hot encoded indicator columns for a single column of df.

    Parameters: 
        df (pandas dataframe): dataframe holding the column to encode
        column (str): name of the column to be encoded
        
    Returns: 
        pandas dataframe with one indicator column per distinct value, each
        named "<column> | <value>", on a fresh 0..n-1 index
    """
    
    dummies = pd.get_dummies(df[column])
    # Prefix every indicator with the source column so names stay unique
    # once several encoded frames are concatenated side by side.
    dummies.columns = [column + " | " + str(label) for label in dummies.columns]
    return dummies.reset_index(drop=True)

In [86]:
# Term Frequency Inverse Document Frequency
# function to apply TF-IDF to emphasize metadata terms based on how often they show up in our dataset
def tfidf(df, col_name):
    """ 
    Compute TF-IDF features for one text column of df.

    Parameters: 
        df (pandas dataframe): Work Dataframe
        col_name (str): name of the column we want to apply TF-IDF to 
        
    Returns: 
        tfidf_df: dataframe with one column per term, named "<col_name>|<term>",
        on a fresh 0..n-1 index
    """
    
    vectorizer = TfidfVectorizer()
    tfidf_matrix = vectorizer.fit_transform(df[col_name])

    # get_feature_names() was removed from recent scikit-learn releases in
    # favour of get_feature_names_out(); fall back only on old versions.
    if hasattr(vectorizer, "get_feature_names_out"):
        terms = vectorizer.get_feature_names_out()
    else:
        terms = vectorizer.get_feature_names()

    tfidf_df = pd.DataFrame(tfidf_matrix.toarray())
    tfidf_df.columns = [col_name + "|" + term for term in terms]
    
    return tfidf_df.reset_index(drop=True)

In [87]:
# One feature frame per attribute: TF-IDF over composer names, one hot for the rest.
composer_df = tfidf(work_df, 'composer')
genre_df = ohe(work_df, 'genre')
epoch_df = ohe(work_df, 'epoch')

# Stack the frames column-wise, then append the titles as the final column.
complete_feature_set = pd.concat(
    [composer_df, genre_df, epoch_df],
    axis=1,
).assign(title=work_df['title'].values)

complete_feature_set.to_csv('output.csv', encoding='utf-8', index=False)

complete_feature_set.head()

Unnamed: 0,composer|aaron,composer|adams,composer|adès,composer|alban,composer|albert,composer|alberto,composer|albinoni,composer|albéniz,composer|alessandro,composer|alexander,...,epoch | 21st Century,epoch | Baroque,epoch | Classical,epoch | Early Romantic,epoch | Late Romantic,epoch | Medieval,epoch | Post-War,epoch | Renaissance,epoch | Romantic,title
0,0.0,0.847827,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1,0,0,0,0,0,0,0,0,Berceuse élégiaque
1,0.0,0.847827,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1,0,0,0,0,0,0,0,0,Chamber Symphony
2,0.0,0.847827,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1,0,0,0,0,0,0,0,0,China Gates
3,0.0,0.847827,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1,0,0,0,0,0,0,0,0,Doctor Atomic
4,0.0,0.847827,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1,0,0,0,0,0,0,0,0,Doctor Atomic Symphony


# Connect to Spotify API

We're going to utilize spotify's api to generate recommendations from a single track.

The works in our data set mostly correspond to Spotify albums. However, there are many different recordings of each work, and each recording has its own set of metadata, including conductor, performer, etc. We will address this problem later on.

In [88]:
#gather playlist names and images. 
#images aren't going to be used until I start building a UI
id_name = {}      # playlist name -> playlist id
list_photo = {}   # playlist id -> cover image url (None when there is no cover)

# The endpoint is paged, so follow the paging links until every playlist
# of the current user has been seen.
playlist_page = sp.current_user_playlists()
while playlist_page:
    for i in playlist_page['items']:
        playlist_id = i['uri'].split(':')[2]
        id_name[i['name']] = playlist_id
        # A playlist without cover art has an empty (or missing) image list;
        # record None instead of failing on images[0].
        images = i.get('images') or []
        list_photo[playlist_id] = images[0]['url'] if images else None
    playlist_page = sp.next(playlist_page) if playlist_page.get('next') else None

In [89]:
id_name

{'Dreamcatcher pt 1': '3M5mCGWpsn4hN7T86JQqov',
 'girlie pop': '5aA9fkKwGCmZWdejTg98oK',
 'indie pop🌿': '3ZlJfhlVuJAczMsorudHu4',
 'Dreamcatcher': '0vGOltDDy6p9YNGj1gKOkm',
 'listening quiz study': '4fdOumYLOFEc9CpCUQ4F9A',
 'ayo ': '6yMc44CkwCHYKFXvwH0ztz',
 'can you please…': '4PMlFvtDP8ft5n8wgaOjaC',
 'basic bitch': '7gFFpS8Foz4irsTKEgJLiF',
 'kpop_curr': '5SHu3A3UgWmFiX3JJy41hC',
 '2 0 2 0': '69WUd0gOEyy9PPsq7PQnzk',
 'fuck yea, gurl': '7aI0te4fpWWXBgqo5Uuk10',
 'current mood': '5diY01VNUInpqszwcHtYYU',
 'ehhh': '2lb1Xlv0VWqqjjEDE7twpo',
 'yeet': '5aHAQoiCoJ0QTy1upFIBNl',
 'dark academia sxn': '29N9MvfWr2eYslnghjGTHM',
 'keira knightley can step on me': '1QD0NXK6piPqvomIJqbwWH',
 'rail me, draco malfoy': '57aj9WZQu0pWW6s6vvmX2P',
 'she a classy bitch': '3NwVo9Qp2rbxcAd0uLSCMk',
 'musicalz': '08m66ynzFjOUQ6Yrh1q03O',
 'disney hoe anthems': '4KodsOGzr3Eu9l8ozOaYhw',
 'main character shit': '3Mz2CI3DJ2pJzgft7QoKHP',
 'action movie heroine': '6Jx50M0tSSwagBSsClpHcK',
 'in a galaxy far 

I want to take my playlist 'she a classy bitch' and create recommendations based off the songs in that playlist.

In [90]:
def create_necessary_outputs(playlist_name,id_dic, df):
    """ 
    Pull the tracks of a playlist and match each one to a work title in df.

    Parameters: 
        playlist_name (str): name of the playlist you'd like to pull from the spotify API
        id_dic (dic): dictionary that maps playlist_name to playlist_id
        df (pandas dataframe): dataframe with our work data (needs a 'title' column)
        
    Returns: 
        playlist: dataframe with columns artist, name, id, url, date_added and
        closest_match (the df title most similar to the track name), sorted
        from most to least recently added

    Raises:
        KeyError: if playlist_name is not a key of id_dic
    """
    
    columns = ['artist', 'name', 'id', 'url', 'date_added']

    # Collect plain records first and build the frame once, instead of
    # growing it cell by cell with .loc.
    records = []
    page = sp.playlist(id_dic[playlist_name])['tracks']
    while page:
        for item in page['items']:
            track = item.get('track')
            # Skip entries whose track is missing or has no name to match on.
            if not track or not track.get('name'):
                continue

            artists = track.get('artists') or [{}]
            images = (track.get('album') or {}).get('images') or []
            # Prefer the second image, as before; fall back to the first,
            # or to None when the album has no artwork at all.
            if len(images) > 1:
                url = images[1]['url']
            elif images:
                url = images[0]['url']
            else:
                url = None

            records.append({
                'artist': artists[0].get('name'),
                'name': track['name'],
                'id': track.get('id'),
                'url': url,
                'date_added': item.get('added_at'),
            })
        # The track list is paged; keep going until there is no next page.
        page = sp.next(page) if page.get('next') else None

    playlist = pd.DataFrame(records, columns=columns)
    playlist['date_added'] = pd.to_datetime(playlist['date_added'])

    # Nothing to match: return the empty frame with the full set of columns.
    if playlist.empty:
        playlist['closest_match'] = pd.Series(dtype=object)
        return playlist

    # Next I need to match the tracks in my playlist to works in our dataset 
    playlist_names = playlist['name'].tolist()
    df_titles = df['title'].tolist()

    # Fit one TF-IDF vocabulary over both sets of strings so that track names
    # and work titles live in the same vector space.
    vectorizer = TfidfVectorizer()
    vectorizer.fit(playlist_names + df_titles)
    playlist_vectors = vectorizer.transform(playlist_names)
    df_vectors = vectorizer.transform(df_titles)

    # Find the closest match for each playlist song. argmax always yields an
    # index, so a track sharing no terms with any title still gets the first one.
    closest_match_indices = cosine_similarity(playlist_vectors, df_vectors).argmax(axis=1)
    playlist['closest_match'] = [df_titles[i] for i in closest_match_indices]

    return playlist.sort_values('date_added', ascending=False)

In [91]:
classical_playlist = create_necessary_outputs('she a classy bitch', id_name, work_df)
classical_playlist.head()

Unnamed: 0,artist,name,id,url,date_added,closest_match
57,Modest Mussorgsky,Pictures At An Exhibition: Promenade - II,1pDCv7N2OZ7oGcjH5ntjW9,https://i.scdn.co/image/ab67616d00001e02b43552...,2022-11-15 20:38:29+00:00,Pictures at an Exhibition
56,Gustav Holst,"The Planets, Op. 32: 4. Jupiter, the Bringer o...",59Id4KrBWiizuq53doxWtp,https://i.scdn.co/image/ab67616d00001e024c43a2...,2022-06-05 21:33:57+00:00,"The Planets, suite for orchestra and female ch..."
55,Georges Bizet,Habanera,3G1pA6OJSNdbkbMmw6Hpwm,https://i.scdn.co/image/ab67616d00001e0297bbc0...,2021-06-27 05:04:55+00:00,"Pièce en forme de Habanera, for violin and piano"
54,Sergei Prokofiev,"Romeo and Juliet, Op. 64 / Act 1: Dance Of The...",2LiWNkeUOAeibGxJKxmjsD,https://i.scdn.co/image/ab67616d00001e028af0e6...,2021-06-24 20:53:09+00:00,"Romeo and Juliet, op. 64"
53,London Philharmonic Orchestra,4th Movement: Allegro Con Fuocho,2iafV5jvEV5vj7WInmsJI6,https://i.scdn.co/image/ab67616d00001e023a819b...,2021-03-03 06:26:09+00:00,Movement


# Create Playlist Vector

We are going to create a vector that represents my entire playlist and then compare it to every work not in the playlist.

In [148]:
def generate_playlist_feature(complete_feature_set, playlist_df):
    """ 
    Summarize a user's playlist into a single vector

    Parameters: 
        complete_feature_set (pandas dataframe): feature columns for every work plus a 'title' column
        playlist_df (pandas dataframe): playlist dataframe with a 'closest_match' column of work titles
        
    Returns: 
        (pandas series): column-wise sum of the feature rows whose title is matched by the playlist
        complete_feature_set_nonplaylist (pandas dataframe): the remaining rows, 'title' column included

    Side effects:
        writes the matched rows to 'playlist_features.csv'
    """
    # Compute the membership mask once and use it for both halves of the split.
    in_playlist = complete_feature_set['title'].isin(playlist_df['closest_match'].values)
    complete_feature_set_playlist = complete_feature_set[in_playlist]
    complete_feature_set_nonplaylist = complete_feature_set[~in_playlist]
    
    complete_feature_set_playlist.to_csv('playlist_features.csv', index=False)

    # Drop the title by name rather than by position, so the result does not
    # depend on 'title' happening to be the last column.
    playlist_feature_set_final = complete_feature_set_playlist.drop(columns=['title'])
    
    return playlist_feature_set_final.sum(axis = 0), complete_feature_set_nonplaylist

In [149]:
complete_feature_set_playlist_vector, complete_feature_set_nonplaylist = generate_playlist_feature(complete_feature_set, classical_playlist)

In [150]:
complete_feature_set_playlist_vector.shape

(422,)

In [151]:
complete_feature_set_nonplaylist.shape

(24921, 423)

# Generate Recommendations

Now it's time to generate the actual recommendations! To do that we will find the cosine similarity between the playlist vector and the vectors for every other work in our dataset. 

In [152]:
def generate_playlist_recos(df, features, nonplaylist_features, n=10):
    """ 
    Rank the works outside the playlist by similarity to the playlist vector.

    Parameters: 
        df (pandas dataframe): work dataframe with a 'title' column
        features (pandas series): summarized playlist feature
        nonplaylist_features (pandas dataframe): feature set (plus 'title') of works that are not in the selected playlist
        n (int): number of recommendations to return (default 10)
        
    Returns: 
        pandas dataframe: the n rows of df with the highest cosine similarity,
        with the score in a new 'sim' column, best first

    Raises:
        ValueError: if the rows selected from df do not line up one-to-one
        with the rows of nonplaylist_features
    """
    
    # .copy() gives an independent frame, so adding 'sim' below does not
    # write into a filtered view of the caller's dataframe.
    candidate_titles = nonplaylist_features['title'].values
    non_playlist_df = df[df['title'].isin(candidate_titles)].copy()

    similarities = cosine_similarity(
        nonplaylist_features.drop('title', axis=1).values,
        features.values.reshape(1, -1),
    )[:,0]

    # Scores are attached by position, so both frames must have the same rows.
    if len(similarities) != len(non_playlist_df):
        raise ValueError(
            "nonplaylist_features has %d rows but %d rows of df share their titles"
            % (len(similarities), len(non_playlist_df))
        )

    non_playlist_df['sim'] = similarities
    
    return non_playlist_df.sort_values('sim',ascending = False).head(n)

In [153]:
top10 = generate_playlist_recos(work_df, complete_feature_set_playlist_vector, complete_feature_set_nonplaylist)
top10

Unnamed: 0,title,subtitle,searchterms,popularity,recommended,genre,composer,epoch,composer_popularity,composer_recommended,sim
7271,"Dayful of Song, suite",,,0,0,Orchestral,George Gershwin,20th Century,1,1,0.666824
7307,"Porgy and Bess, suite from the opera",,,0,0,Orchestral,George Gershwin,20th Century,1,1,0.666824
7314,"Second Rhapsody, ""Rhapsody in Rivets""",,"second rhapsody, rhapsody no. 2",0,1,Orchestral,George Gershwin,20th Century,1,1,0.666824
7270,Cuban Overture,,,0,1,Orchestral,George Gershwin,20th Century,1,1,0.666824
7308,Porgy and Bess: A Symphonic Picture,,,0,0,Orchestral,George Gershwin,20th Century,1,1,0.666824
7304,Piano Concerto in F,,,0,1,Orchestral,George Gershwin,20th Century,1,1,0.666824
7269,Catfish Row: Symphonic Suite from Porgy and Bess,,,0,0,Orchestral,George Gershwin,20th Century,1,1,0.666824
5135,Music for Movies,,,0,0,Orchestral,Aaron Copland,20th Century,0,1,0.653491
5136,Music for Radio: Saga of the Prairies,,,0,0,Orchestral,Aaron Copland,20th Century,0,1,0.653491
5137,"Music for the Theatre, suite for small orchestra",,,0,0,Orchestral,Aaron Copland,20th Century,0,1,0.653491
