# MSD - tutorial_1

In [1]:
import sys
# Record which interpreter produced the outputs stored in this notebook.
ver_info = sys.version
banner_template = "This jupyter notebook was executed with:\n{0}"
print(banner_template.format(ver_info))


This jupyter notebook was executed with:
2.7.14 |Anaconda, Inc.| (default, Nov  8 2017, 13:40:45) [MSC v.1500 64 bit (AMD64)]


In [2]:
# usual imports 
import datetime
import fnmatch
import glob
import os
import sqlite3
import sys
import time

import numpy as np

Change the paths below to point at your local copy of the dataset and of the MSD code.  

In [3]:
# Path to the Million Song Dataset subset (uncompressed), relative to the
# directory the notebook is run from.
# CHANGE IT TO YOUR LOCAL CONFIGURATION
msd_subset_path = '../MillionSongSubset'
# per-song .h5 files live under 'data'
msd_subset_data_path = os.path.join(msd_subset_path, 'data')
# the SQLite databases shipped with the subset live under 'AdditionalFiles'
msd_subset_addf_path = os.path.join(msd_subset_path, 'AdditionalFiles')
print(msd_subset_path)
assert os.path.isdir(msd_subset_path), 'wrong path'  # sanity check

# Path to the Million Song Dataset code.
# CHANGE IT TO YOUR LOCAL CONFIGURATION
msd_code_path = '../MSongsDB'
print(msd_code_path)
assert os.path.isdir(msd_code_path), 'wrong path'  # sanity check

# We add the MSD code directory to Python's search path so it can be imported.
# Ubuntu: you can instead set the environment variable PYTHONPATH in your
# .bashrc file so you do not have to run these lines.
# The membership test keeps the cell idempotent: re-running it must not
# append the same entry to sys.path again.
msd_python_src_path = os.path.join(msd_code_path, 'PythonSrc')
if msd_python_src_path not in sys.path:
    sys.path.append(msd_python_src_path)


../MillionSongSubset
../MSongsDB


In [4]:
# imports specific to the MSD
import hdf5_getters as GETTERS


In [5]:
# Helper used by the timing cells: turns two timestamps (in seconds)
# into a human-readable elapsed-time string.
def strtimedelta(starttime,stoptime):
    """
    Format the lag between two timestamps.
    INPUT
       starttime  - start of the interval, in seconds
       stoptime   - end of the interval, in seconds
    RETURN
       str() of the corresponding datetime.timedelta, e.g. '0:01:30'
    """
    elapsed = datetime.timedelta(seconds=stoptime - starttime)
    return str(elapsed)

### Function to iterate over the files 

In [6]:
# we define this very useful function to iterate the files
def apply_to_all_files(basedir,func=lambda x: x,ext='.h5'):
    """
    From a base directory, go through all subdirectories,
    find all files with the given extension, apply the
    given function 'func' to all of them.
    If no 'func' is passed, we do nothing except counting.

    File names are matched against the pattern '*' + ext with fnmatch,
    so directory names are never interpreted as glob patterns (a folder
    called 'A[1]' is walked like any other). As with a shell glob, names
    starting with '.' are skipped. Only regular directory entries reported
    as files by os.walk are considered; a directory whose name ends with
    'ext' is not passed to 'func'.
    Directories and files are visited in sorted order so repeated runs
    call 'func' in the same sequence.
    INPUT
       basedir  - base directory of the dataset
       func     - function to apply to all filenames (called with the
                  path joined to its directory); return value is ignored
       ext      - extension, .h5 by default
    RETURN
       number of files
    """
    pattern = '*' + ext
    cnt = 0
    # iterate over all files in all subdirectories
    for root, dirs, files in os.walk(basedir):
        # sorting 'dirs' in place makes os.walk descend deterministically
        dirs.sort()
        matches = [os.path.join(root, name)
                   for name in sorted(files)
                   if not name.startswith('.')
                   and fnmatch.fnmatch(name, pattern)]
        # count files
        cnt += len(matches)
        # apply function to all files
        for path in matches:
            func(path)
    return cnt


In [7]:
# With no 'func' given, apply_to_all_files only counts the matching files.
n_song_files = apply_to_all_files(msd_subset_data_path)
print('number of song files: %s' % n_song_files)


number of song files: 9999


## How to use the simple h5_getter

In [8]:
# Build the example song path from the configured dataset location instead of
# repeating the literal '../MillionSongSubset/data' prefix here.
example_song_path = os.path.join(msd_subset_data_path,
                                 'A', 'A', 'A', 'TRAAAAW128F429D538.h5')
h5 = GETTERS.open_h5_file_read(example_song_path)
try:
    x = GETTERS.get_title(h5)
finally:
    # close the file even if the getter raises
    h5.close()

print(x)
print(type(x))

AttributeError: 'module' object has no attribute 'openFile'

In [None]:
# Build the example song path from the configured dataset location instead of
# repeating the literal '../MillionSongSubset/data' prefix here.
example_song_path = os.path.join(msd_subset_data_path,
                                 'A', 'A', 'A', 'TRAAAAW128F429D538.h5')
h5 = GETTERS.open_h5_file_read(example_song_path)
try:
    x = GETTERS.get_beats_start(h5)
finally:
    # close the file even if the getter raises
    h5.close()

# 'x' is indexed and sized like a 1-D numpy array below; presumably it holds
# beat start times in seconds -- TODO confirm against hdf5_getters.
size = x.size

if size > 1:
    # consecutive differences x[i] - x[i-1]; there are size-1 of them
    beat_intervals = np.diff(x)
    total = beat_intervals.sum()
    # average over the number of intervals (size - 1), not the number of beats
    print(total / float(size - 1))
else:
    total = 0
    print('fewer than two beats: no interval to average')


### Get artist name
+ `all_artist_names` is a set of strings
+ `func_to_get_artist_name(filename)` works on a single file
+ Use `apply_to_all_files()` to apply to all 


In [None]:
# let's now get all artist names in a set(). One nice property:
# if we enter many times the same artist, only one will be kept.
all_artist_names = set()

# we define the function to apply to all files
def func_to_get_artist_name(filename):
    """
    Add the artist name stored in one song file to 'all_artist_names'.
    This function does 3 simple things:
    - open the song file
    - get the artist name and put it in the set
    - close the file (also when reading the name fails)
    INPUT
       filename  - path of the song file, passed to GETTERS.open_h5_file_read
    """
    h5 = GETTERS.open_h5_file_read(filename)
    try:
        all_artist_names.add( GETTERS.get_artist_name(h5) )
    finally:
        # always release the handle: this runs once per file in the dataset
        h5.close()

In [None]:
# one danceability value per song file, in the order the files are visited
all_dance = []

# we define the function to apply to all files
def func_to_get_all_dance(filename):
    """
    Append the danceability value stored in one song file to 'all_dance'.
    This function does 3 simple things:
    - open the song file
    - get the danceability and append it to the list
    - close the file (also when reading the value fails)
    INPUT
       filename  - path of the song file, passed to GETTERS.open_h5_file_read
    """
    h5 = GETTERS.open_h5_file_read(filename)
    try:
        all_dance.append( GETTERS.get_danceability(h5) )
    finally:
        # always release the handle: this runs once per file in the dataset
        h5.close()

In [None]:
# Run func_to_get_artist_name over every song file and report how long
# the full pass takes.
t1 = time.time()
apply_to_all_files(msd_subset_data_path, func=func_to_get_artist_name)
t2 = time.time()
elapsed_str = strtimedelta(t1, t2)
print('all artist names extracted in: %s' % elapsed_str)


In [None]:
# let's apply func_to_get_all_dance to all files
# we'll also measure how long it takes
t1 = time.time()
apply_to_all_files(msd_subset_data_path,func=func_to_get_all_dance)
t2 = time.time()
# this pass collects danceability values, so the message says so
print('all danceability scores extracted in: %s' % strtimedelta(t1, t2))


In [None]:
# Order the collected danceability values in place (ascending), then show them.
all_dance[:] = sorted(all_dance)
print(all_dance)

#### cool.
Now `all_artist_names` is populated.  
Let's look at the first 10.  

In [None]:
# let's see some of the content of 'all_artist_names'
print('found %s unique artist names' % len(all_artist_names))
# Convert the set to a list once (not once per printed name) and slice it,
# so fewer than 10 names simply prints what is there.
for artist_name in list(all_artist_names)[:10]:
    print(artist_name)


## sqlite method

In [None]:
# Walking every song file takes too long, and the work of listing artist
# names has already been done. Let's redo the same task using an SQLite
# database. We connect to the provided database: subset_track_metadata.db
conn = sqlite3.connect(os.path.join(msd_subset_addf_path,
                                    'subset_track_metadata.db'))
try:
    # we build the SQL query
    q = "SELECT DISTINCT artist_name FROM songs"
    # we query the database
    t1 = time.time()
    res = conn.execute(q)
    all_artist_names_sqlite = res.fetchall()
    t2 = time.time()
finally:
    # we close the connection to the database, even if the query failed
    conn.close()
print('all artist names extracted (SQLite) in: %s' % strtimedelta(t1, t2))
# let's see some of the content (each row is a 1-tuple holding the name)
for row in all_artist_names_sqlite[:5]:
    print(row[0])



#### thats a lot faster
Now let's find out which artist has the most songs in the dataset.  
First query...  
+ Note that `artist_id` is what we want, because one artist can appear under several `artist_name` values (e.g. "... feat. Lil Wayne").  
 

In [None]:
# what we want to work with is artist ID, not artist names. Some artists
# have many names, usually because the song is "featuring someone else"
conn = sqlite3.connect(os.path.join(msd_subset_addf_path,
                                    'subset_track_metadata.db'))
try:
    q = "SELECT DISTINCT artist_id FROM songs"
    res = conn.execute(q)
    # each row is a 1-tuple; build a real list so the ids can be indexed later
    all_artist_ids = [row[0] for row in res.fetchall()]
finally:
    # close the connection even if the query failed
    conn.close()


In [None]:
# The Echo Nest artist ids look like:
# (slicing instead of indexing, so fewer than 4 ids does not raise)
for artist_id in all_artist_ids[:4]:
    print(artist_id)


In [None]:
# Next step: count how many song files belong to each artist, first by
# iterating over the dataset. Start from a dictionary that maps every
# known artist id to a count of zero.
files_per_artist = dict.fromkeys(all_artist_ids, 0)


In [None]:
# we prepare the function to check artist id in each file
def func_to_count_artist_id(filename):
    """
    Count one song file towards its artist in 'files_per_artist'.
    This function does 3 simple things:
    - open the song file
    - get artist ID and increment that artist's counter
    - close the file (also when reading the ID fails)
    An artist ID that is not yet a key of 'files_per_artist' starts at 0
    instead of raising KeyError.
    INPUT
       filename  - path of the song file, passed to GETTERS.open_h5_file_read
    """
    h5 = GETTERS.open_h5_file_read(filename)
    try:
        artist_id = GETTERS.get_artist_id(h5)
        files_per_artist[artist_id] = files_per_artist.get(artist_id, 0) + 1
    finally:
        # always release the handle: this runs once per file in the dataset
        h5.close()



In [None]:
# Reset every counter first so that re-running this cell does not add a
# second pass on top of the previous counts.
files_per_artist = dict.fromkeys(files_per_artist, 0)

# we apply this function to all files
apply_to_all_files(msd_subset_data_path,func=func_to_count_artist_id)

# the most popular artist (with the most songs) is the key with the largest
# count; max() finds it in one pass and, like the sort it replaces, returns
# the first such key on ties
most_pop_aid = max(files_per_artist, key=files_per_artist.__getitem__)
print('%s has %s songs.' % (most_pop_aid, files_per_artist[most_pop_aid]))



In [None]:
# of course, it is more fun to have the name(s) of this artist
# let's get it using SQLite
conn = sqlite3.connect(os.path.join(msd_subset_addf_path,
                                    'subset_track_metadata.db'))
try:
    # The artist id is passed as a bound parameter ('?') rather than pasted
    # into the SQL text, so quotes inside the value cannot alter the query.
    q = "SELECT DISTINCT artist_name FROM songs WHERE artist_id=?"
    q_params = (most_pop_aid,)
    res = conn.execute(q, q_params)
    # each row is a 1-tuple holding one name
    pop_artist_names = [row[0] for row in res.fetchall()]
finally:
    # close the connection even if the query failed
    conn.close()
print('SQL query: %s  (parameters: %r)' % (q, q_params))
print('name(s) of the most popular artist: %s' % (pop_artist_names,))



In [None]:
# let's redo all this work in SQLite in a few seconds
t1 = time.time()
conn = sqlite3.connect(os.path.join(msd_subset_addf_path,
                                    'subset_track_metadata.db'))
try:
    # one row per artist_id: (artist_id, artist_name, number of tracks)
    q = "SELECT DISTINCT artist_id, artist_name, Count(track_id) FROM songs"
    q += " GROUP BY artist_id"
    res = conn.execute(q)
    pop_artists = res.fetchall()
finally:
    # close the connection even if the query failed
    conn.close()
t2 = time.time()
print('found most popular artist in %s' % strtimedelta(t1, t2))
# the row with the largest track count (third column); max() returns the
# same row the descending sort used to put first
print(max(pop_artists, key=lambda row: row[2]))

In [None]:
# let's now get all artist hotttnesss values in a set(). One nice property:
# if we enter many times the same value, only one will be kept.
all_hotness = set()

# we define the function to apply to all files
def func_to_get_artist_hotttnesss(filename):
    """
    Add the artist hotttnesss stored in one song file to 'all_hotness'.
    This function does 3 simple things:
    - open the song file
    - get the artist hotttnesss and put it in the set
    - close the file (also when reading the value fails)
    INPUT
       filename  - path of the song file, passed to GETTERS.open_h5_file_read
    """
    h5 = GETTERS.open_h5_file_read(filename)
    try:
        all_hotness.add( GETTERS.get_artist_hotttnesss(h5) )
    finally:
        # always release the handle: this runs once per file in the dataset
        h5.close()

# Backward-compatible alias: this cell has always bound the collector to the
# name 'func_to_get_artist_name'.
# NOTE(review): that name is also used by the artist-name collector defined
# earlier in the notebook, so after this cell runs the name refers to the
# hotttnesss collector. Prefer 'func_to_get_artist_hotttnesss' in new code
# and drop this alias once nothing relies on it.
func_to_get_artist_name = func_to_get_artist_hotttnesss

In [None]:
import pprint
# Shared pretty-printer for query results. It is bound to both 'pp' and 'p'
# so that either spelling works in the cells that print with it.
pp = pprint.PrettyPrinter(indent=4)
p = pp


In [None]:
# Fetch every row of the 'songs' table to see which columns the database
# provides. We connect to the provided database: subset_track_metadata.db
conn = sqlite3.connect(os.path.join(msd_subset_addf_path,
                                    'subset_track_metadata.db'))
try:
    # we build the SQL query
    q = "SELECT * FROM songs"
    # we query the database
    t1 = time.time()
    res = conn.execute(q)
    all_artist_hot_sqlite = res.fetchall()
    t2 = time.time()
finally:
    # we close the connection to the database, even if the query failed
    conn.close()
print('all song rows extracted (SQLite) in: %s' % strtimedelta(t1, t2))
# let's see some of the content: only the first row, if there is one.
# 'p' is the PrettyPrinter instance created in the pprint cell.
for row in all_artist_hot_sqlite[:1]:
    p.pprint(row)



In [None]:
# Names in the hdf5_getters module whose first three characters are 'get'.
list(name for name in GETTERS.__dict__.keys() if name[:3] == 'get')

In [None]:
# List up to 100 songs whose artist_familiarity is 0, from the database
# file '../testing01.db'.
conn = sqlite3.connect('../testing01.db')
try:
    # we build the SQL query
    q = ("SELECT artist_name, track_7digitalid"
         " FROM songs WHERE artist_familiarity=0 LIMIT 100")
    # we query the database
    t1 = time.time()
    res = conn.execute(q)
    x = res.fetchall()
    t2 = time.time()
finally:
    # we close the connection to the database, even if the query failed
    conn.close()
print('time: %s' % strtimedelta(t1, t2))

# 'p' is the PrettyPrinter instance created in the pprint cell
p.pprint(x)


## How to connect to the swagmaster metadata db

In [None]:
# Count the songs with a strictly positive song_hotttnesss in the database
# file '../swagmaster.db'.
conn = sqlite3.connect('../swagmaster.db')
try:
    # we build the SQL query
    q = "SELECT count(*) FROM songs WHERE song_hotttnesss>0"
    # we query the database
    t1 = time.time()
    res = conn.execute(q)
    x = res.fetchall()
    t2 = time.time()
finally:
    # we close the connection to the database, even if the query failed
    conn.close()
print('time: %s' % strtimedelta(t1, t2))

# 'p' is the PrettyPrinter instance created in the pprint cell
p.pprint(x)


In [None]:
# NOTE(review): the cell just above already closes 'conn' after its query, so
# when the cells run top to bottom this is a second close of the same
# connection -- presumably a leftover safety net for an interrupted run;
# confirm whether this cell can be deleted.
conn.close()

In [None]:
# Count the rows of the 'songs' table in the database file
# '../track_metadata.db'.
conn = sqlite3.connect('../track_metadata.db')
try:
    # we build the SQL query
    q = "SELECT count(*) FROM songs limit 1"
    # we query the database
    t1 = time.time()
    res = conn.execute(q)
    x = res.fetchall()
    t2 = time.time()
finally:
    # we close the connection to the database, even if the query failed
    conn.close()
print('time: %s' % strtimedelta(t1, t2))

# 'p' is the PrettyPrinter instance created in the pprint cell
p.pprint(x)
