# Echo Nest Data Scrape
Collect songs from several distinct genres to train a model that can predict whether two songs are similar.

In [1]:
from pyechonest import playlist
import pandas as pd
import numpy as np
from pyechonest.util import EchoNestAPIError
import time
import json, urllib2

In [2]:
# Use genres that are very different from one another so the model can identify
# what distinguishes songs across genres. Each entry is passed as the `genres`
# argument of a genre-radio playlist request in the scraping cell below.
genres = ['rap', 'country', 'metal', 'baroque', 'jazz']

In [3]:
# THINGS FROM DETAILED ANALYSIS TO INCLUDE IN MODEL:
# tempo std dev across sections
# loudness_max average and std dev across segments
# loudness_max_time average and std dev across segments
# difference between loudness_max and loudness_start average and std dev across segments
# minimum loudness (smallest loudness_start across segments)
# dynamic range across track
# pitches std dev across segments
# timbre average and std dev across segments
# average confidence across beats
def extract_details(detail):
    """Reduce a detailed track analysis to a flat dict of summary features.

    Parameters
    ----------
    detail : dict
        Parsed analysis document. It must provide:
        - 'sections': iterable of mappings, each normally holding 'tempo';
        - 'beats': iterable of mappings holding 'confidence';
        - 'segments': iterable of mappings holding 'loudness_max',
          'loudness_max_time', 'loudness_start', 'pitches' and 'timbre',
          where 'pitches' and 'timbre' have at least 12 components each.

    Returns
    -------
    dict
        Maps feature name to a numeric value: 10 scalar loudness/tempo/beat
        statistics plus, for each of the 12 components, the pitch standard
        deviation, the timbre standard deviation and the timbre average.

    Notes
    -----
    A section without a readable 'tempo' contributes NaN, and np.std
    propagates NaN, so 'Tempo Std Dev' is NaN whenever any section lacks a
    tempo. Missing keys on beats or segments raise KeyError.
    """
    n_components = 12  # pitch/timbre components exported per segment

    # looking across sections
    section_tempo = []
    for section in detail['sections']:
        try:
            section_tempo.append(section['tempo'])
        except (KeyError, TypeError):
            # Only "no usable tempo" is tolerated; anything else (including
            # KeyboardInterrupt) must propagate instead of being swallowed.
            section_tempo.append(np.nan)

    # looking across beats
    beat_confidence = [beat['confidence'] for beat in detail['beats']]

    # looking across segments
    segments = detail['segments']
    loudness_max = np.array([seg['loudness_max'] for seg in segments])
    loudness_max_time = np.array([seg['loudness_max_time'] for seg in segments])
    loudness_start = np.array([seg['loudness_start'] for seg in segments])
    # rise from the start of each segment to its loudest point
    loudness_diff = loudness_max - loudness_start

    # per-component statistics: axis 0 runs over segments
    std_pitches = np.std([seg['pitches'] for seg in segments], axis=0)
    timbre = [seg['timbre'] for seg in segments]
    average_timbre = np.mean(timbre, axis=0)
    std_timbre = np.std(timbre, axis=0)

    features = {
        'Tempo Std Dev': np.std(section_tempo),
        'Max Loudness Average': np.mean(loudness_max),
        'Max Loudness Std Dev': np.std(loudness_max),
        'Minimum Loudness': np.min(loudness_start),
        'Dynamic Range': np.max(loudness_max) - np.min(loudness_start),
        'Max Loudness Time Average': np.mean(loudness_max_time),
        'Max Loudness Time Std Dev': np.std(loudness_max_time),
        'Loudness Diff Average': np.mean(loudness_diff),
        'Loudness Diff Std Dev': np.std(loudness_diff),
        'Beat Confidence Average': np.mean(beat_confidence),
    }
    for i in range(n_components):
        features['Pitch %d Std Dev' % i] = std_pitches[i]
        features['Timbre %d Std Dev' % i] = std_timbre[i]
        features['Timbre %d Average' % i] = average_timbre[i]
    return features
        

In [5]:
from contextlib import closing

# Songs requested per genre; also drives the progress percentage below.
songs_per_genre = 100
total_songs = len(genres) * songs_per_genre

# One row per song (indexed by the song object); columns are the audio-summary
# fields, the features from extract_details, and the genre label.
df = pd.DataFrame()
for genre_idx, genre in enumerate(genres):
    try:
        songs = playlist.static(type='genre-radio', results=songs_per_genre, genres=genre, variety=0)
    except EchoNestAPIError:
        time.sleep(60)  # wait out echo nest rate limit
        # single retry; a second failure propagates
        songs = playlist.static(type='genre-radio', results=songs_per_genre, genres=genre, variety=0)
    for song_idx, song in enumerate(songs):
        summary = song.audio_summary
        for key in summary:
            df.loc[song, key] = summary[key]

        ## FIX DETAILED ANALYSIS LOADING
        # urllib2 responses are not context managers on Python 2, so wrap the
        # response in closing() to release the connection after parsing.
        with closing(urllib2.urlopen(summary['analysis_url'])) as response:
            detail = json.load(response)
        extraction = extract_details(detail)
        for key in extraction:
            df.loc[song, key] = extraction[key]

        df.loc[song, 'genre'] = genre
        # enumerate() supplies the positions directly instead of re-scanning
        # the lists with .index() for every song.
        songs_done = genre_idx * songs_per_genre + song_idx + 1
        print('Progress: ' + str(songs_done * 100.0 / total_songs) + '%')

Progress: 0.2%
Progress: 0.4%
Progress: 0.6%
Progress: 0.8%
Progress: 1.0%
Progress: 1.2%
Progress: 1.4%
Progress: 1.6%
Progress: 1.8%
Progress: 2.0%
Progress: 2.2%
Progress: 2.4%
Progress: 2.6%
Progress: 2.8%
Progress: 3.0%
Progress: 3.2%
Progress: 3.4%
Progress: 3.6%
Progress: 3.8%
Progress: 4.0%
Progress: 4.2%
Progress: 4.4%
Progress: 4.6%
Progress: 4.8%
Progress: 5.0%
Progress: 5.2%
Progress: 5.4%
Progress: 5.6%
Progress: 5.8%
Progress: 6.0%
Progress: 6.2%
Progress: 6.4%
Progress: 6.6%
Progress: 6.8%
Progress: 7.0%
Progress: 7.2%
Progress: 7.4%
Progress: 7.6%
Progress: 7.8%
Progress: 8.0%
Progress: 8.2%
Progress: 8.4%
Progress: 8.6%
Progress: 8.8%
Progress: 9.0%
Progress: 9.2%
Progress: 9.4%
Progress: 9.6%
Progress: 9.8%
Progress: 10.0%
Progress: 10.2%
Progress: 10.4%
Progress: 10.6%
Progress: 10.8%
Progress: 11.0%
Progress: 11.2%
Progress: 11.4%
Progress: 11.6%
Progress: 11.8%
Progress: 12.0%
Progress: 12.2%
Progress: 12.4%
Progress: 12.6%
Progress: 12.8%
Progress: 13.0%
Progress:

In [6]:
# Flag every song whose time signature is anything other than 4
# (1.0 = non-standard, 0.0 = standard). A single vectorised comparison replaces
# the per-row loop and its repeated index-membership tests.
is_non_standard = df.time_signature != 4
non_standard = df[is_non_standard].index  # kept: labels of the flagged songs
# Cast to float so the column holds 0.0/1.0, the values that cell-by-cell
# assignment into a brand-new column yields.
df['Non-Standard Time Signature'] = is_non_standard.astype(float)

In [7]:
# Remove columns that are presumably not wanted as model features: the analysis
# URL, the audio checksum, the musical key, and the raw time signature (already
# reduced to the 'Non-Standard Time Signature' flag in the previous cell).
df = df.drop(['analysis_url', 'audio_md5', 'key', 'time_signature'], axis=1)

In [12]:
# save song info: pickle the DataFrame to 'song-info.pkl' in the working directory
df.to_pickle('song-info.pkl')

In [19]:
# load song info: restore the DataFrame previously written with df.to_pickle
# NOTE: unpickling can execute arbitrary code, so only load a file you created yourself
df = pd.read_pickle('song-info.pkl')

In [None]:
# TODO: binary and categorical variables (mode, 'Non-Standard Time Signature', genre) must be treated differently from the continuous features