In [1]:
import h5py
import pandas as pd

# Function to extract features from a single HDF5 file
def extract_features(file_path):
    """Read a handful of per-song fields from one HDF5 song file.

    Parameters
    ----------
    file_path : str or path-like
        Location of the HDF5 file to read.

    Returns
    -------
    dict
        Keys ``'bpm'``, ``'loudness'``, ``'year'`` and ``'title'``. Each value
        is the first entry of the matching field of the file's ``songs``
        dataset (``analysis``, ``musicbrainz`` and ``metadata`` groups), returned
        exactly as h5py yields it -- no decoding or casting is done here.

    Notes
    -----
    Errors raised by h5py (e.g. unreadable file, missing group or field)
    propagate to the caller.
    """
    features = {}
    # The context manager closes the file on exit, even if a lookup below
    # raises, so no explicit close() call is needed.
    with h5py.File(file_path, 'r') as f:
        analysis_songs = f['analysis']['songs']
        features['bpm'] = analysis_songs['tempo'][0]
        features['loudness'] = analysis_songs['loudness'][0]
        features['year'] = f['musicbrainz']['songs']['year'][0]
        features['title'] = f['metadata']['songs']['title'][0]  # can be used for title analysis
    return features

In [2]:
import os

# Per-song feature dicts are accumulated here, one entry per HDF5 file
song_features = list()

# Root folder of the Million Song Subset dataset; it is not included in the git repository
dataset_dir = '../MillionSongSubset/'

# Walk a directory tree and lazily produce the path of every HDF5 file in it
def iterate_hdf5_files(directory):
    """Yield the path of each ``.h5`` file found anywhere beneath ``directory``.

    Each path is the containing folder (as reported by ``os.walk``) joined
    with the file name; paths are produced in ``os.walk`` order.
    """
    for folder, _subfolders, filenames in os.walk(directory):
        # keep only the HDF5 files of the current folder
        yield from (
            os.path.join(folder, name)
            for name in filenames
            if name.endswith('.h5')
        )


# iterate over all of the files in the directory
for hdf5_file in iterate_hdf5_files(dataset_dir):
    # Extract features from the file and add them to the list
    features = extract_features(hdf5_file)
    song_features.append(features)

# Fail early with a clear message; otherwise an empty dataset directory only
# surfaces later as a KeyError on the missing 'title' column.
if not song_features:
    raise FileNotFoundError(f"No .h5 files found under {dataset_dir!r}")

# Converting the list of per-song dicts to a DataFrame
df = pd.DataFrame(song_features)

# Convert title from bytes to string. Bytes are decoded explicitly rather than
# relying on astype(str) to do so implicitly; values that are already text are
# passed through str() unchanged.
df['title'] = df['title'].map(
    lambda title: title.decode('utf-8') if isinstance(title, bytes) else str(title)
)

# Make sure the output directory exists before writing into it
os.makedirs('../data', exist_ok=True)

# Creating hdf5 and csv files
df.to_hdf('../data/aggregated_song_features.h5', key='data', mode='w', format='table', data_columns=True)
df.to_csv('../data/aggregated_song_features.csv', index=False)

# Print the DataFrame to verify the data
print(df)

          bpm  loudness  year                             title
0      92.198   -11.197     0                  I Didn't Mean To
1     121.274    -9.843  1969                         Soul Deep
2     100.070    -9.689     0                   Amor De Cabaret
3     119.293    -9.013  1982                   Something Girls
4     129.738    -4.501  2007                    Face the Ashes
...       ...       ...   ...                               ...
9995  140.185    -8.087  1998                    The Hanged Man
9996   77.072   -14.517  1998  The Wonderful World Of The Young
9997  118.123   -12.087     0                   Sentimental Man
9998  137.663   -12.574     0                 Zydeco In D-Minor
9999  150.575    -5.324  2005                    Shattered Life

[10000 rows x 4 columns]


In [3]:
# Open the created HDF5 file; the context manager closes it even if the
# check below raises, so the handle is never leaked.
with h5py.File('../data/aggregated_song_features.h5', 'r') as f:
    # Print the keys to make sure the data is there
    print(f.keys())

<KeysViewHDF5 ['data']>
