In [9]:
import os
import sys
import time
import glob
import tables
import scipy.io as sio
import pandas as pd
import hdf5_getters
from pandas import read_hdf

In [10]:
def get_all_files(basedir,ext='.h5') :
    """
    From a root directory, go through all subdirectories
    and find all files with the given extension.
    PARAM
        basedir - root directory to walk
        ext     - filename suffix to match ('.h5' by default); it is
                  appended to a '*' wildcard, so glob syntax inside
                  ext itself is still honored
    RETURN
        list of absolute paths, in the order the directories were walked
    """
    allfiles = []
    for root, _dirs, _files in os.walk(basedir):
        # Escape the directory part so that characters such as '[' or '*'
        # in a folder name are matched literally instead of being read as
        # glob syntax (which silently skipped every file in such folders).
        pattern = os.path.join(glob.escape(root), '*' + ext)
        allfiles.extend(os.path.abspath(match) for match in glob.glob(pattern))
    return allfiles

In [11]:
def transfer(h5path,matpath=None,force=False):
    """
    Read every field of an HDF5 song file (.h5) into a dictionary.
    One entry is produced per get_* function found in hdf5_getters
    (get_num_songs excepted), keyed by the getter name without its
    'get_' prefix.
    If there are more than one song in the HDF5 file, each
    field name gets a number appended: 1, 2, 3, ...., numfiles
    PARAM
        h5path  - path to the HDF5 song file
        matpath - path to the companion matfile, same as HDF5 path
                  with a '.mat' extension by default. Nothing is
                  written to it here; it is only checked for existence.
        force   - if True, proceed even when matpath already exists
    RETURN
        dict mapping field names to their values, plus a
        'transfer_note' entry, or False if a sanity check failed
        (missing file, wrong extension, or matpath exists without force).
        Errors raised while opening or reading the HDF5 file propagate
        to the caller; the file is closed first.
    NOTE
        All the data has to be loaded in memory! be careful
        if one file contains tons of songs!
    """
    # sanity checks
    if not os.path.isfile(h5path):
        print ('path to HF5 files does not exist:',h5path)
        return False
    if os.path.splitext(h5path)[1] != '.h5':
        print ('expecting a .h5 extension for file:',h5path)
        return False
    # check matfile
    if matpath is None:
        matpath = os.path.splitext(h5path)[0] + '.mat'
    if os.path.exists(matpath) and not force:
        print('matfile',matpath,'already exists (delete or force):')
        return False
    # get all getters! we assume that all we need is in hdf5_getters.py
    # further assume that they have the form get_blablabla and that's the
    # only thing that has that form
    getters = [name for name in vars(hdf5_getters) if name.startswith('get_')]
    getters.remove("get_num_songs") # special case: takes no song index
    matdata = {'transfer_note':'transferred on '+time.ctime()+' from file: '+h5path}
    # open h5 file
    h5 = hdf5_getters.open_h5_file_read(h5path)
    try:
        # counting the songs can fail on a damaged file, so it sits inside
        # the try block to guarantee the handle is closed in that case too
        nSongs = hdf5_getters.get_num_songs(h5)
        # iterate over songs
        for songidx in range(nSongs):
            # field names are only numbered when the file holds several songs
            suffix = str(songidx+1) if nSongs > 1 else ''
            # iterate over getter
            for getter in getters:
                matdata[getter[4:] + suffix] = getattr(hdf5_getters, getter)(h5,songidx)
    except MemoryError:
        print('out of memory while reading songs from:',h5path)
        raise
    finally:
        # close h5
        h5.close()
    return matdata

In [12]:
# Root of the Million Song Subset. Set the MSS_DATA_DIR environment variable
# to point elsewhere instead of editing this user-specific default.
DATA_DIR = os.environ.get('MSS_DATA_DIR', '/home/rahul/Documents/MillionSongSubset/data/')
h5s = get_all_files(DATA_DIR)

In [13]:
# Empty frame that will hold one row per song file.
df = pd.DataFrame()

In [14]:
# Collect one record per song file and build the frame once at the end.
# Growing a DataFrame row by row copies it on every iteration, and
# DataFrame.append no longer exists in pandas 2.0.
records = []
for h5_file in h5s:
    song = transfer(h5_file)
    if song is False:
        # transfer() has already printed why this file was rejected;
        # skip it rather than adding a junk row to the table
        continue
    records.append(song)
# Sorting the labels keeps the alphabetical column order that the row-wise
# append used to produce, so downstream column positions stay the same.
df = pd.DataFrame(records).sort_index(axis=1)

In [16]:
# Preview the first rows of the assembled song table.
df.head()

Unnamed: 0,analysis_sample_rate,artist_7digitalid,artist_familiarity,artist_hotttnesss,artist_id,artist_latitude,artist_location,artist_longitude,artist_mbid,artist_mbtags,...,tatums_confidence,tatums_start,tempo,time_signature,time_signature_confidence,title,track_7digitalid,track_id,transfer_note,year
0,22050.0,111885.0,0.497942,0.341073,b'AR63RWN1187FB3E89A',34.56039,"b'Marvell, AR'",-90.91387,b'2b520be3-0bfe-4904-b372-d04b8cc161ec',[b'blues'],...,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.06406, 0.3642, 0.66434, 0.96449, 1.28001, 1...",99.701,1.0,0.089,b'Kindhearted Woman Blues',4114967.0,b'TRBDLIB128F9338167',transferred on Wed Jun 5 20:07:24 2019 from f...,1998.0
1,22050.0,7764.0,0.563749,0.339532,b'ARB78VW1187B98F81C',,b'',,b'8347fcf6-d073-4fa4-99b8-5f545fee39de',[],...,"[0.558, 0.597, 0.636, 0.649, 0.664, 0.684, 0.7...","[0.10031, 0.27974, 0.45917, 0.6386, 0.81804, 0...",168.121,4.0,0.896,b'Watchie Watchie',2018545.0,b'TRBDLXL128F4239ED3',transferred on Wed Jun 5 20:07:24 2019 from f...,0.0
2,22050.0,12829.0,0.645368,0.423022,b'ARHG53Q1187B9ACBFC',32.67828,b'Georgia',-83.22295,b'c76bbcf8-eea2-4342-b34a-1813adbd1de6',[],...,"[0.316, 0.28, 0.253, 0.23, 0.211, 0.167, 0.154...","[0.05937, 0.23972, 0.42454, 0.60757, 0.79239, ...",111.67,3.0,1.0,b'Dying To Stay',2463165.0,b'TRBDLRT128F4296B2F',transferred on Wed Jun 5 20:07:24 2019 from f...,0.0
3,22050.0,5172.0,0.672681,0.476389,b'ARGBR4I1187B99D073',47.60356,"b'Seattle, WA'",-122.32944,b'e675295a-1efe-4247-aa3b-53b78d0cdffc',[b'rock and indie'],...,"[0.711, 0.601, 0.517, 0.437, 0.376, 0.329, 0.2...","[0.05553, 0.28659, 0.51766, 0.73954, 0.9557, 1...",131.642,3.0,0.687,b'Beneath The Valley Of The Underdog (Album Ve...,6914201.0,b'TRBDLXE128F9310E95',transferred on Wed Jun 5 20:07:24 2019 from f...,1998.0
4,22050.0,287806.0,0.450586,0.33682,b'ARJZQ321187FB5AAA1',,b'',,b'29e24d21-a213-4cfc-8115-54be1c43174e',[],...,"[0.357, 0.306, 0.26, 0.222, 0.19, 0.174, 0.147...","[0.19427, 0.36373, 0.53405, 0.70352, 0.87298, ...",176.121,4.0,1.0,b'Take Five - Live',4147666.0,b'TRBDLAT128F92CB320',transferred on Wed Jun 5 20:07:24 2019 from f...,0.0


In [18]:
# (rows, columns) of the assembled song table.
df.shape

(10000, 55)

In [19]:
# Column labels, i.e. the names of the extracted fields.
df.columns

Index(['analysis_sample_rate', 'artist_7digitalid', 'artist_familiarity',
       'artist_hotttnesss', 'artist_id', 'artist_latitude', 'artist_location',
       'artist_longitude', 'artist_mbid', 'artist_mbtags',
       'artist_mbtags_count', 'artist_name', 'artist_playmeid', 'artist_terms',
       'artist_terms_freq', 'artist_terms_weight', 'audio_md5',
       'bars_confidence', 'bars_start', 'beats_confidence', 'beats_start',
       'danceability', 'duration', 'end_of_fade_in', 'energy', 'key',
       'key_confidence', 'loudness', 'mode', 'mode_confidence', 'release',
       'release_7digitalid', 'sections_confidence', 'sections_start',
       'segments_confidence', 'segments_loudness_max',
       'segments_loudness_max_time', 'segments_loudness_start',
       'segments_pitches', 'segments_start', 'segments_timbre',
       'similar_artists', 'song_hotttnesss', 'song_id', 'start_of_fade_out',
       'tatums_confidence', 'tatums_start', 'tempo', 'time_signature',
       'time_signature_confidence', 'title', 'track_7digitalid', 'track_id',
       'transfer_note', 'year'],
      dtype='object')

In [20]:
# Persist the table as CSV in the current working directory.
# NOTE(review): with the default index=True the row index is written as an
# unnamed first column, and array-valued cells are stored as their string
# repr, which NumPy may abbreviate with '...' for long arrays -- confirm the
# file round-trips before relying on it as the only copy of the data.
df.to_csv('mss.csv')