In [14]:
import numpy as np
import pandas as pd

import spotipy
from spotipy.oauth2 import SpotifyClientCredentials
from spotipy.oauth2 import SpotifyOAuth
import spotipy.util as util

import requests

In [15]:
import pandas as pd
import numpy as np
import json
import re 
import sys
import itertools
from collections import defaultdict
import csv

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.preprocessing import MinMaxScaler
import matplotlib.pyplot as plt


import spotipy
from spotipy.oauth2 import SpotifyClientCredentials
from spotipy.oauth2 import SpotifyOAuth
import spotipy.util as util

import warnings
warnings.filterwarnings("ignore")

In [16]:
%matplotlib inline

# Getting the Data

We will use the Open Opus API to retrieve our metadata.

In [17]:
# composer_response = requests.get("https://api.openopus.org/composer/list/pop.json")
# print(composer_response.status_code)

This call gets a list of popular composers. Let's take a look.

In [18]:
# print(composer_response.json())

Run the following only once to retrieve the entire Open Opus dataset:

In [19]:
# response = requests.get("https://api.openopus.org/work/dump.json")
# print(response.status_code)

In [20]:
# json_data = response.json()
# with open('data.json', 'w') as f:
#     json.dump(json_data, f)

### Let's take a look at the types of metadata

### Attributes for each Composer:
- ID
- Name
- Complete Name
- Birth
- Death
- Epoch (period): Medieval, Renaissance, Baroque, Classical, Early Romantic, Romantic, Late Romantic, 20th Century, Post-War, 21st Century
- Portrait (link to jpeg)
- Popular (0 or 1)
- Recommended (0 or 1)

### Attributes for each work:
- Title
- Subtitle
- Searchterms (mostly left empty)
- Popular (0 or 1)
- Genre: Chamber, Keyboard, Orchestral, Stage, Vocal

### Observations:
- Currently the data is organized at the composer level; we want it organized by work
- There are hardly any numerical values that can be used to compare works, so we will have to create one-hot encodings for attributes like genre, epoch (period), and composer
- We don't have a lot of important metadata with numerical values to compare works
    - Some types of metadata spotify holds about pop songs can also apply here:
        - duration
        - energy
        - key
        - liveness
        - loudness
        - mode (major or minor)
        - tempo
        - time signature
- There are many different recordings on Spotify of the same work. Each recording has its own unique set of metadata. Spotify does not necessarily have this information readily available; rather, the information can be obtained from the title, album, or artist.
- Other types of metadata specific to classical music that would be helpful to have:
    - composer
    - movements
    - instrumentation (right now genre is a decent representation of this, but it would be nice to have solo, or small ensemble instrumentation)
    - performer


The Open Opus dataset gives us important information about composers and works; however, we will need Spotify's API in order to get track-specific information.

### Spotify API parse:

### Data reconfiguration:
- configure data by work instead of by composer

In [21]:
class Composer:
    """Simple record of the composer-level metadata used in this notebook.

    Attributes:
        name: short composer name.
        complete_name: full composer name.
        epoch: musical period the composer belongs to.
        popularity: popularity flag (described as 0 or 1 in the notes above).
        recommended: recommended flag (described as 0 or 1 in the notes above).

    The annotations are documentation only; the constructor stores whatever
    values it is given without validating or converting them.
    """

    # Real type annotations. The previous form (`name: ""`, `popularity: 0`)
    # annotated the fields with *values*, which creates no defaults and makes
    # typing.get_type_hints() fail on the empty forward reference.
    name: str
    complete_name: str
    epoch: str
    popularity: int
    recommended: int

    def __init__(self, name, complete_name, epoch, popularity, recommended):
        """Store the five metadata fields on the instance unchanged."""
        self.name = name
        self.complete_name = complete_name
        self.epoch = epoch
        self.popularity = popularity
        self.recommended = recommended

In [22]:
# Load the Open Opus dump from the local data.json file.
with open('data.json', 'r') as file:
    json_data = json.load(file)

# Composer-level table: one row per entry of json_data['composers'].
composer_df = pd.DataFrame(json_data['composers'])
composer_df.to_csv('composer_data.csv', index=False)

# (output column, source key) pairs, listed in the column order of the
# work-level table. Work fields come first, then the owning composer's fields.
WORK_FIELDS = (
    ('title', 'title'),
    ('subtitle', 'subtitle'),
    ('searchterms', 'searchterms'),
    ('popularity', 'popular'),
    ('recommended', 'recommended'),
    ('genre', 'genre'),
)
COMPOSER_FIELDS = (
    ('composer', 'complete_name'),
    ('epoch', 'epoch'),
    ('composer_popularity', 'popular'),
    ('composer_recommended', 'recommended'),
)

# Flatten composer -> works into parallel column lists, one entry per work.
work_dict = defaultdict(list)
for composer in json_data['composers']:
    for work in composer['works']:
        for column, key in WORK_FIELDS:
            work_dict[column].append(work[key])
        for column, key in COMPOSER_FIELDS:
            work_dict[column].append(composer[key])

# Work-level table: one row per work, with its composer's attributes repeated.
work_df = pd.DataFrame(work_dict)
work_df.to_csv('works_data.csv', index=False)

# Feature Engineering
1. Normalize float variables
2. Create OHE variables
3. Create TF-IDF features off of composers

In [28]:
# Preview the last rows of the work-level table.
work_df.tail()

Unnamed: 0,title,subtitle,searchterms,popularity,recommended,genre,composer,epoch,composer_popularity,composer_recommended
24970,"Waldgespräch, ballad for soprano, 2 horns, har...",,,0,0,Vocal,Alexander von Zemlinsky,Late Romantic,0,
24971,"Waltz Songs on Tuscan Folk Lyrics, for voice a...",,,0,0,Vocal,Alexander von Zemlinsky,Late Romantic,0,
24972,"Waltz Songs on Tuscan Folk Lyrics, for voice a...",,,0,0,Vocal,Alexander von Zemlinsky,Late Romantic,0,
24973,"Wandl' ich im Wald des Abends, for voice and p...",,,0,0,Vocal,Alexander von Zemlinsky,Late Romantic,0,
24974,Zwölf Lieder op.27,,,0,0,Vocal,Alexander von Zemlinsky,Late Romantic,0,


### One Hot Encodings:
We want one-hot encodings for genre and epoch. Composer is encoded separately with TF-IDF (see below).

In [38]:
#simple function to create OHE features
#this gets passed later on
def ohe(df, column):
    """
    One-hot encode a single column of a dataframe.

    Parameters:
        df (pandas dataframe): dataframe containing the column to encode
        column (str): name of the column to be processed

    Returns:
        pandas dataframe with one indicator column per distinct value of
        ``df[column]``, each named "<column> | <value>", on a fresh
        0..n-1 index
    """
    dummies = pd.get_dummies(df[column])
    # Prefix every indicator with its source column name.
    dummies.columns = [column + " | " + str(level) for level in dummies.columns]
    return dummies.reset_index(drop=True)

In [35]:
# Term Frequency Inverse Document Frequency
# function to apply TF-IDF to emphasize metadata terms based on how often they show up in our dataset
def tfidf(df, col_name):
    """
    Build TF-IDF features from one text column of a dataframe.

    A TfidfVectorizer with default settings is fitted on ``df[col_name]`` and
    the resulting matrix is returned as a dense dataframe.

    Parameters:
        df (pandas dataframe): Work Dataframe
        col_name (str): name of the column we want to apply TF-IDF to

    Returns:
        tfidf_df: dataframe of TF-IDF weights, one row per input row, with
            columns named "<col_name>|<term>" and a default 0..n-1 index
    """
    # Named `vectorizer` so the local no longer shadows this function's name.
    vectorizer = TfidfVectorizer()
    tfidf_matrix = vectorizer.fit_transform(df[col_name])

    # get_feature_names() was removed in scikit-learn 1.2. Prefer its
    # replacement and fall back only on older releases that lack it.
    if hasattr(vectorizer, "get_feature_names_out"):
        terms = vectorizer.get_feature_names_out()
    else:
        terms = vectorizer.get_feature_names()

    # The frame is built from a plain array, so it already carries a default
    # RangeIndex and needs no reset_index.
    tfidf_df = pd.DataFrame(tfidf_matrix.toarray())
    tfidf_df.columns = [col_name + "|" + str(term) for term in terms]

    return tfidf_df

In [42]:
# Composer names become TF-IDF columns; genre and epoch become one-hot columns.
composer_df = tfidf(work_df, 'composer')
genre_df, epoch_df = [ohe(work_df, categorical) for categorical in ('genre', 'epoch')]

# Join the three feature blocks column-wise, then add the work titles as the last column.
final = pd.concat([composer_df, genre_df, epoch_df], axis=1).assign(
    title=work_df['title'].values
)

final.head()

Unnamed: 0,composer|aaron,composer|adams,composer|adès,composer|alban,composer|albert,composer|alberto,composer|albinoni,composer|albéniz,composer|alessandro,composer|alexander,...,epoch | 21st Century,epoch | Baroque,epoch | Classical,epoch | Early Romantic,epoch | Late Romantic,epoch | Medieval,epoch | Post-War,epoch | Renaissance,epoch | Romantic,title
0,0.0,0.847827,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1,0,0,0,0,0,0,0,0,Berceuse élégiaque
1,0.0,0.847827,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1,0,0,0,0,0,0,0,0,Chamber Symphony
2,0.0,0.847827,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1,0,0,0,0,0,0,0,0,China Gates
3,0.0,0.847827,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1,0,0,0,0,0,0,0,0,Doctor Atomic
4,0.0,0.847827,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1,0,0,0,0,0,0,0,0,Doctor Atomic Symphony


# Connect to Spotify API