In [4]:
from pathlib import Path
import tempfile
import tarfile
import zstandard  
import urllib.request

import os
from glob import iglob

import json
from collections import defaultdict
import numpy as np
import pandas as pd

In [5]:
for x in range(0,1):
    urllib.request.urlretrieve(f"https://data.metabrainz.org/pub/musicbrainz/acousticbrainz/dumps/acousticbrainz-highlevel-json-20220623/acousticbrainz-highlevel-json-20220623-{x}.tar.zst"
                           ,f"/Users/ryanlnewbury/Downloads/acousticbrainz-highlevel-json-20220623-{x}.tar.zst")

In [6]:
#extract zst files function
# from https://gist.github.com/scivision/ad241e9cf0474e267240e196d7545eca
def extract_zst(archive: Path, out_path: Path):
    """extract .zst file
    works on Windows, Linux, MacOS, etc.
    
    Parameters
    ----------
    archive: pathlib.Path or str
      .zst file to extract
    out_path: pathlib.Path or str
      directory to extract files and directories to
    """

    archive = Path(archive).expanduser()
    out_path = Path(out_path).expanduser().resolve()
    # need .resolve() in case intermediate relative dir doesn't exist

    dctx = zstandard.ZstdDecompressor()

    with tempfile.TemporaryFile(suffix=".tar") as ofh:
        with archive.open("rb") as ifh:
            dctx.copy_stream(ifh, ofh)
        ofh.seek(0)
        with tarfile.open(fileobj=ofh) as z:
            z.extractall(out_path)

In [8]:
#local variables to file path, set as needed
for x in range(0,1):
    path_to_zst = f"/Users/ryanlnewbury/Downloads/acousticbrainz-highlevel-json-20220623-{x}.tar.zst"
    path_to_zst_extract = "/Users/ryanlnewbury/Downloads/highlevel"
    extract_zst(path_to_zst,
                path_to_zst_extract)

In [9]:
#creates directory route for every file in database
local_path = "/Users/ryanlnewbury/Downloads/highlevel/acousticbrainz-highlevel-json-20220623/highlevel"
rootdir_glob = local_path + "/**/*"
# This will return absolute paths
file_list = [f for f in iglob(rootdir_glob, recursive=True) if os.path.isfile(f)]

In [10]:
#list of directories
file_list[:20]

['/Users/ryanlnewbury/Downloads/highlevel/acousticbrainz-highlevel-json-20220623/highlevel/61/9/619f63d9-9303-431b-b413-1681b49ae1f7-0.json',
 '/Users/ryanlnewbury/Downloads/highlevel/acousticbrainz-highlevel-json-20220623/highlevel/61/9/619f60fa-b680-4735-a635-fc0f03715227-0.json',
 '/Users/ryanlnewbury/Downloads/highlevel/acousticbrainz-highlevel-json-20220623/highlevel/61/9/619632d3-d916-49ad-8ddb-a27fcd0c2bad-1.json',
 '/Users/ryanlnewbury/Downloads/highlevel/acousticbrainz-highlevel-json-20220623/highlevel/61/9/6192144b-0984-45bb-978a-9bdaa6fa13e9-0.json',
 '/Users/ryanlnewbury/Downloads/highlevel/acousticbrainz-highlevel-json-20220623/highlevel/61/9/61978cc9-fcd1-4493-8ac7-a523d7853a52-2.json',
 '/Users/ryanlnewbury/Downloads/highlevel/acousticbrainz-highlevel-json-20220623/highlevel/61/9/619ee3de-edef-4370-9373-02d90b1ccbb7-3.json',
 '/Users/ryanlnewbury/Downloads/highlevel/acousticbrainz-highlevel-json-20220623/highlevel/61/9/6198c470-0f53-4043-8c96-cc81bee7ae40-0.json',
 '/Use

In [11]:
#example json
f = open(file_list[0])
data = json.load(f)
data

{'highlevel': {'danceability': {'all': {'danceable': 0.180879756808,
    'not_danceable': 0.819120228291},
   'probability': 0.819120228291,
   'value': 'not_danceable',
   'version': {'essentia': '2.1-beta1',
    'essentia_build_sha': '8e24b98b71ad84f3024c7541412f02124a26d327',
    'essentia_git_sha': 'v2.1_beta1-228-g260734a',
    'extractor': 'music 1.0',
    'gaia': '2.4-dev',
    'gaia_git_sha': '857329b',
    'models_essentia_git_sha': 'v2.1_beta1'}},
  'gender': {'all': {'female': 0.345933228731, 'male': 0.654066741467},
   'probability': 0.654066741467,
   'value': 'male',
   'version': {'essentia': '2.1-beta1',
    'essentia_build_sha': '8e24b98b71ad84f3024c7541412f02124a26d327',
    'essentia_git_sha': 'v2.1_beta1-228-g260734a',
    'extractor': 'music 1.0',
    'gaia': '2.4-dev',
    'gaia_git_sha': '857329b',
    'models_essentia_git_sha': 'v2.1_beta1'}},
  'genre_dortmund': {'all': {'alternative': 0.017582796514,
    'blues': 0.00265991478227,
    'electronic': 0.967449188

In [12]:
data['metadata']['version'].keys()

dict_keys(['highlevel', 'lowlevel'])

In [19]:
#extraxts data for every file in directories
#caution this will take a while
dic = defaultdict(list)
for d in file_list:
        f = open(d)
        data = json.load(f)
        #doesn't collect data if no metadata
        if data.get('metadata') == None:
            continue
        #gets mbid from file title
        temp = d.split('/')[-1].split('-')
        temp.pop(-1);
        id1 = '-'.join(temp)
        #deduplicates mbids by only allowing one id into dictionary
        if id1 in dic['id']:
            continue
        else:
            dic['id'].append(id1)
        #appends new information to dicts
        dic['danceability'].append(data.get('highlevel').get('danceability').get('all').get('danceable'))
        dic['gender_male'].append(data.get('highlevel').get('gender').get('all').get('male'))
        dic['alternative'].append(data.get('highlevel').get('genre_dortmund').get('all').get('alternative'))
        dic['blues'].append(data.get('highlevel').get('genre_dortmund').get('all').get('blues'))
        dic['electronic'].append(data.get('highlevel').get('genre_dortmund').get('all').get('electronic'))
        dic['folkcountry'].append(data.get('highlevel').get('genre_dortmund').get('all').get('folkcountry'))
        dic['funksoulrnb'].append(data.get('highlevel').get('genre_dortmund').get('all').get('funksoulrnb'))
        dic['jazz'].append(data.get('highlevel').get('genre_dortmund').get('all').get('jazz'))
        dic['pop'].append(data.get('highlevel').get('genre_dortmund').get('all').get('pop'))
        dic['raphiphop'].append(data.get('highlevel').get('genre_dortmund').get('all').get('raphiphop'))
        dic['rock'].append(data.get('highlevel').get('genre_dortmund').get('all').get('rock'))
        dic['genre'].append(data.get('highlevel').get('genre_dortmund').get('value'))
        dic['acoustic'].append(data.get('highlevel').get('mood_acoustic').get('all').get('acoustic'))
        dic['aggressive'].append(data.get('highlevel').get('mood_aggressive').get('all').get('aggressive'))
        dic['mood_electronic'].append(data.get('highlevel').get('mood_electronic').get('all').get('electronic'))
        dic['happy'].append(data.get('highlevel').get('mood_happy').get('all').get('happy'))
        dic['party'].append(data.get('highlevel').get('mood_party').get('all').get('party'))
        dic['relaxed'].append(data.get('highlevel').get('mood_relaxed').get('all').get('relaxed'))
        dic['sad'].append(data.get('highlevel').get('mood_sad').get('all').get('sad'))
        dic['mood_mirex_1'].append(data.get('highlevel').get('moods_mirex').get('all').get('Cluster1'))
        dic['mood_mirex_2'].append(data.get('highlevel').get('moods_mirex').get('all').get('Cluster2'))
        dic['mood_mirex_3'].append(data.get('highlevel').get('moods_mirex').get('all').get('Cluster3'))
        dic['mood_mirex_4'].append(data.get('highlevel').get('moods_mirex').get('all').get('Cluster4'))
        dic['mood_mirex_5'].append(data.get('highlevel').get('moods_mirex').get('all').get('Cluster5'))
        dic['timbre_bright'].append(data.get('highlevel').get('timbre').get('all').get('bright'))
        dic['tonal'].append(data.get('highlevel').get('tonal_atonal').get('all').get('tonal'))
        dic['instrumental'].append(data.get('highlevel').get('voice_instrumental').get('all').get('instrumental'))
        dic['bit_rate'].append(data.get('metadata').get('audio_properties').get('bit_rate'))
        dic['codec'].append(data.get('metadata').get('audio_properties').get('codec'))
        dic['length'].append(data.get('metadata').get('audio_properties').get('length'))
        dic['lossless'].append(data.get('metadata').get('audio_properties').get('lossless'))
        dic['replay_gain'].append(data.get('metadata').get('audio_properties').get('replay_gain'))
        dic['true_genre'].append(data.get('metadata').get('tags').get('genre'))

        #same with metadata, but metadata in lists so needs an except if the list is empty
        try:
            dic['artist'].append(data.get('metadata').get('tags').get('artist')[0])
        except:
            dic['artist'].append(data.get('metadata').get('tags').get('artist'))       
        try:
            dic['album'].append(data.get('metadata').get('tags').get('album')[0])
        except:
            dic['album'].append(data.get('metadata').get('tags').get('album'))
        try:
            dic['bpm'].append(data.get('metadata').get('tags').get('bpm')[0])
        except:
            dic['bpm'].append(data.get('metadata').get('tags').get('bpm'))
        try:
            dic['year'].append(int(data.get('metadata').get('tags').get('date')[0].split('-')[0]))
        except:
            dic['year'].append(data.get('metadata').get('tags').get('date'))
        try:
            dic['date'].append(data.get('metadata').get('tags').get('date')[0])     
        except:
            dic['date'].append(data.get('metadata').get('tags').get('date'))
        try:
            dic['label'].append(data.get('metadata').get('tags').get('label')[0])
        except:
            dic['label'].append(data.get('metadata').get('tags').get('label'))
        try:
            dic['song'].append(data.get('metadata').get('tags').get('title')[0])
        except:
            dic['song'].append(data.get('metadata').get('tags').get('title'))
        try:
            dic['artistsort'].append(data.get('metadata').get('tags').get('artistsort')[0])
        except:
            dic['artistsort'].append(data.get('metadata').get('tags').get('artistsort'))

In [20]:
#creates dataframe from dict created
data = pd.DataFrame(dic)

In [21]:
len(data)

665

In [22]:
#number of rows with a value in year
data['year'].count()

643

In [23]:
#remove null years
data = data[data['year'].isnull() == False]

In [24]:
len(data)

643

In [25]:
data.head(10)

Unnamed: 0,id,danceability,gender_male,alternative,blues,electronic,folkcountry,funksoulrnb,jazz,pop,...,replay_gain,true_genre,artist,album,bpm,year,date,label,song,artistsort
0,619f63d9-9303-431b-b413-1681b49ae1f7,0.1808798,0.654067,0.0175828,0.002659915,0.967449,0.007424,0.0002948905,0.001406,0.0004379611,...,-14.42832,[Source],Kings of Convenience,Riot on an Empty Street,,2004,2004,,I'd Rather Dance With You,Kings of Convenience
1,619f60fa-b680-4735-a635-fc0f03715227,0.8901892,0.5,0.1129063,0.2394386,0.256372,0.148527,0.02319941,0.039718,0.0421632,...,-9.880486,[Classic Rock; Rock; Pop],The Beatles,Rubber Soul,,1987,1987-04-30,Parlophone,What Goes On,"Beatles, The"
2,619632d3-d916-49ad-8ddb-a27fcd0c2bad,3.000001e-14,0.377873,0.001878294,0.0001457041,0.997529,9.3e-05,4.051082e-06,0.000217,1.495819e-05,...,-5.929924,[Hard Rock],Page & Plant,Walking Into Clarksdale,,1998,1998,Atlantic,Upon a Golden Horse,
3,6192144b-0984-45bb-978a-9bdaa6fa13e9,0.3648081,0.888691,0.02261784,0.001403403,0.968951,0.002974,0.0002257532,0.000455,0.0004141238,...,-14.038452,,Grand Funk Railroad,Thirty Years of Funk 1969-1999: The Anthology,,1999,1999-06-29,Capitol Records,Crossfire,Grand Funk Railroad
4,61978cc9-fcd1-4493-8ac7-a523d7853a52,3.000001e-14,0.377873,0.0559427,0.05800022,0.638596,0.04292,0.004798885,0.139283,0.009367774,...,-8.160597,[Alternative],Neko Case,Fox Confessor Brings the Flood,,2006,2006,ANTI-,A Widow's Toast,"Case, Neko"
5,619ee3de-edef-4370-9373-02d90b1ccbb7,0.2860451,0.905335,0.1313779,0.1859132,0.170054,0.189101,0.01693741,0.024191,0.04611452,...,-7.218739,[Rock],INXS,Kick,,1987,1987-10-20,Atlantic,Never Tear Us Apart,INXS
6,6198c470-0f53-4043-8c96-cc81bee7ae40,0.9140623,0.758206,0.0009705611,0.0001010405,0.998723,9.6e-05,5.69687e-07,3e-05,5.99918e-06,...,-13.423643,[EBM],mind.in.a.box,Crossroads,,2008,2008-03-04,,Stalkers,
7,619d535a-b2b4-4e16-804c-e256f60a0267,3.000001e-14,0.377873,0.003390888,0.0005241779,0.992979,0.00038,6.315875e-05,0.001871,9.144389e-05,...,-7.973265,[Rock],SCYCS,Megahits 99: Die Erste,,1998,1998,EMI Electrola,Next November,SCYCS
8,619aa0ff-588d-4764-a7b8-25d5ec3f287f,0.04072887,0.743319,4.779294e-09,5.206847e-09,0.999974,1e-06,1.464147e-07,2.2e-05,5.81779e-08,...,-3.961552,[Soundtrack],James Newton Howard,The Fugitive,,1993,1993,La-La Land Records,Helicopter Chase / The Sewer (Synth Demos),"Howard, James Newton"
9,6192fab3-88fe-4255-8079-f5c515391ce3,3.000001e-14,0.377873,0.0007478595,0.0002796236,0.996223,0.000335,2.666766e-05,0.001979,3.585146e-05,...,-0.613031,[Modern Folk - Singer / Songwriter],"Crosby, Stills, Nash & Young",4 Way Street,0.0,1992,1992-06-23,Atlantic,Right Between the Eyes,"Crosby, Stills, Nash & Young"


In [33]:
for x in ['lowlevel','tonal','rhythm']:  
    urllib.request.urlretrieve(f"https://data.metabrainz.org/pub/musicbrainz/acousticbrainz/dumps/acousticbrainz-lowlevel-features-20220623/acousticbrainz-lowlevel-features-20220623-{x}.tar.zst"
                           ,f"/Users/ryanlnewbury/Downloads/acousticbrainz-lowlevel-features-20220623-{x}.tar.zst")
    path_to_zst = f"/Users/ryanlnewbury/Downloads/acousticbrainz-lowlevel-features-20220623-{x}.tar.zst"
    path_to_zst_extract = "/Users/ryanlnewbury/Downloads"
    extract_zst(path_to_zst,
                path_to_zst_extract)

In [36]:
#loads features extracted from acousticbrainz lowlevel features, from 3 different csvs on their website
lowlevel = pd.read_csv("/Users/ryanlnewbury/Downloads/acousticbrainz-lowlevel-features-20220623/acousticbrainz-lowlevel-features-20220623-lowlevel.csv")
rhythm = pd.read_csv("/Users/ryanlnewbury/Downloads/acousticbrainz-lowlevel-features-20220623/acousticbrainz-lowlevel-features-20220623-rhythm.csv")
tonal = pd.read_csv("/Users/ryanlnewbury/Downloads/acousticbrainz-lowlevel-features-20220623/acousticbrainz-lowlevel-features-20220623-tonal.csv")

In [37]:
lowlevel.head(10)

Unnamed: 0,mbid,submission_offset,average_loudness,dynamic_complexity,mfcc_zero_mean
0,0e11c0fd-a1da-4b88-a438-7ef55c5809ec,0,0.70328,5.612967,-722.370972
1,7fef22bd-76aa-4803-b56b-93a5d6e70662,0,0.657434,5.046805,-690.498535
2,71c0e054-b700-4fd2-a35b-95c7afc566cb,0,0.228847,5.314451,-706.800476
3,2d1201cf-59bb-4ffa-9f52-f5b3afa13346,0,0.328406,4.47598,-721.950439
4,96685213-a25c-4678-9a13-abd9ec81cf35,0,0.57007,9.305593,-784.208496
5,73b01cea-2dad-4fc2-9e61-02a31477c1b1,0,0.168545,9.074841,-792.64502
6,7c278a16-ae04-460c-88ea-39155cadcd09,0,0.854816,2.146111,-652.554626
7,19084069-642f-465f-9127-f71bcd800a05,0,0.030876,3.55784,-749.831482
8,70fda1f4-c0cf-4bb5-b25e-79b5e921b198,0,0.925762,2.719668,-646.503784
9,da39a905-7b84-4e2a-bbcf-74de3d6ebd03,0,0.475052,5.929364,-682.161255


In [38]:
rhythm.head(10)

Unnamed: 0,mbid,submission_offset,bpm,bpm_histogram_first_peak_bpm_mean,bpm_histogram_first_peak_bpm_median,bpm_histogram_second_peak_bpm_mean,bpm_histogram_second_peak_bpm_median,danceability,onset_rate
0,0e11c0fd-a1da-4b88-a438-7ef55c5809ec,0,120.763885,120.0,120.0,133.0,133.0,0.996203,2.867577
1,7fef22bd-76aa-4803-b56b-93a5d6e70662,0,125.956993,126.0,126.0,136.0,136.0,1.131311,3.568778
2,71c0e054-b700-4fd2-a35b-95c7afc566cb,0,132.617203,133.0,133.0,140.0,140.0,0.915099,2.858371
3,2d1201cf-59bb-4ffa-9f52-f5b3afa13346,0,144.318924,144.0,144.0,152.0,152.0,0.972823,2.395773
4,96685213-a25c-4678-9a13-abd9ec81cf35,0,128.347702,129.0,129.0,120.0,120.0,1.102882,2.419718
5,73b01cea-2dad-4fc2-9e61-02a31477c1b1,0,120.360603,120.0,120.0,115.0,115.0,0.974217,1.876432
6,7c278a16-ae04-460c-88ea-39155cadcd09,0,151.575623,152.0,152.0,157.0,157.0,1.139013,3.394924
7,19084069-642f-465f-9127-f71bcd800a05,0,82.642754,82.0,82.0,94.0,94.0,0.872103,5.799162
8,70fda1f4-c0cf-4bb5-b25e-79b5e921b198,0,129.358032,129.0,129.0,123.0,123.0,1.155036,3.361048
9,da39a905-7b84-4e2a-bbcf-74de3d6ebd03,0,91.101822,91.0,91.0,172.0,172.0,1.073015,2.519788


In [39]:
tonal.head(10)

Unnamed: 0,mbid,submission_offset,key_key,key_scale,tuning_frequency,tuning_equal_tempered_deviation
0,0e11c0fd-a1da-4b88-a438-7ef55c5809ec,0,A,major,434.193115,0.141634
1,7fef22bd-76aa-4803-b56b-93a5d6e70662,0,A,major,434.193115,0.177662
2,71c0e054-b700-4fd2-a35b-95c7afc566cb,0,G,major,434.193115,0.234276
3,2d1201cf-59bb-4ffa-9f52-f5b3afa13346,0,D,major,434.193115,0.219335
4,96685213-a25c-4678-9a13-abd9ec81cf35,0,A,minor,434.193115,0.164615
5,73b01cea-2dad-4fc2-9e61-02a31477c1b1,0,G,minor,442.54892,0.0
6,7c278a16-ae04-460c-88ea-39155cadcd09,0,G,major,434.193115,0.224572
7,19084069-642f-465f-9127-f71bcd800a05,0,D,minor,434.193115,0.195417
8,70fda1f4-c0cf-4bb5-b25e-79b5e921b198,0,E,major,445.112549,0.095341
9,da39a905-7b84-4e2a-bbcf-74de3d6ebd03,0,A,major,444.598633,0.032828


In [40]:
#length of the 3 csv files
len(rhythm)

29460584

In [41]:
len(tonal)

29460584

In [42]:
len(lowlevel)

29460584

In [43]:
#number of unique ids in csvs
rhythm['mbid'].nunique()

7564215

In [44]:
tonal['mbid'].nunique()

7564215

In [45]:
lowlevel['mbid'].nunique()

7564215

In [46]:
#dropping duplicates from csvs
rhythm = rhythm.drop_duplicates(subset = ['mbid'], ignore_index = True)

In [47]:
len(rhythm)

7564215

In [48]:
tonal = tonal.drop_duplicates(subset = ['mbid'], ignore_index = True)

In [49]:
len(tonal)

7564215

In [50]:
lowlevel = lowlevel.drop_duplicates(subset = ['mbid'], ignore_index = True)

In [51]:
len(lowlevel)

7564215

In [52]:
#joining all data into one dataframe
data1 = data.set_index('id').join(rhythm.set_index('mbid'),how = 'left',rsuffix = '_2')

In [53]:
data2 = data1.join(tonal.set_index('mbid'),how = 'left', rsuffix = '_2')

In [54]:
data3 = data2.join(lowlevel.set_index('mbid'),how = 'left', rsuffix = '_2')

In [56]:
data3.head(10)

Unnamed: 0_level_0,danceability,gender_male,alternative,blues,electronic,folkcountry,funksoulrnb,jazz,pop,raphiphop,...,onset_rate,submission_offset_2,key_key,key_scale,tuning_frequency,tuning_equal_tempered_deviation,submission_offset_2,average_loudness,dynamic_complexity,mfcc_zero_mean
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
619f63d9-9303-431b-b413-1681b49ae1f7,0.1808798,0.654067,0.0175828,0.002659915,0.967449,0.007424,0.0002948905,0.001406,0.0004379611,0.0001125043,...,4.326175,0,D#,major,440.508575,0.020504,0,0.927267,3.180683,-682.701538
619f60fa-b680-4735-a635-fc0f03715227,0.8901892,0.5,0.1129063,0.2394386,0.256372,0.148527,0.02319941,0.039718,0.0421632,0.01680163,...,5.457075,0,A,minor,434.193115,0.208812,0,0.905077,3.672358,-654.275208
619632d3-d916-49ad-8ddb-a27fcd0c2bad,3.000001e-14,0.377873,0.001878294,0.0001457041,0.997529,9.3e-05,4.051082e-06,0.000217,1.495819e-05,5.780928e-06,...,2.130402,0,A#,minor,434.193115,0.187084,0,0.757508,2.402322,-658.484436
6192144b-0984-45bb-978a-9bdaa6fa13e9,0.3648081,0.888691,0.02261784,0.001403403,0.968951,0.002974,0.0002257532,0.000455,0.0004141238,0.0001731058,...,3.034324,0,D,major,434.193115,0.201451,0,0.934868,2.966868,-638.99054
61978cc9-fcd1-4493-8ac7-a523d7853a52,3.000001e-14,0.377873,0.0559427,0.05800022,0.638596,0.04292,0.004798885,0.139283,0.009367774,0.006285998,...,2.792825,0,F,minor,447.174194,0.229774,0,0.639869,5.109163,-762.092285
619ee3de-edef-4370-9373-02d90b1ccbb7,0.2860451,0.905335,0.1313779,0.1859132,0.170054,0.189101,0.01693741,0.024191,0.04611452,0.01146711,...,2.844084,0,F,major,438.224518,0.086532,0,0.687216,5.789335,-678.900818
6198c470-0f53-4043-8c96-cc81bee7ae40,0.9140623,0.758206,0.0009705611,0.0001010405,0.998723,9.6e-05,5.69687e-07,3e-05,5.99918e-06,4.613626e-07,...,4.612633,0,B,minor,440.0,0.012354,0,0.8562,4.853683,-677.775269
619d535a-b2b4-4e16-804c-e256f60a0267,3.000001e-14,0.377873,0.003390888,0.0005241779,0.992979,0.00038,6.315875e-05,0.001871,9.144389e-05,3.622894e-05,...,2.691104,0,F,minor,434.193115,0.188965,0,0.619785,6.166994,-697.542725
619aa0ff-588d-4764-a7b8-25d5ec3f287f,0.04072887,0.743319,4.779294e-09,5.206847e-09,0.999974,1e-06,1.464147e-07,2.2e-05,5.81779e-08,1.89042e-08,...,2.228352,0,G,major,442.804626,0.072889,0,0.104274,6.579127,-749.649902
6192fab3-88fe-4255-8079-f5c515391ce3,3.000001e-14,0.377873,0.0007478595,0.0002796236,0.996223,0.000335,2.666766e-05,0.001979,3.585146e-05,8.066453e-06,...,2.371305,0,F,minor,434.193115,0.188651,0,0.011282,7.22609,-767.877686


In [57]:
data3.to_parquet('data.parquet')

ValueError: Duplicate column names found: ['danceability', 'gender_male', 'alternative', 'blues', 'electronic', 'folkcountry', 'funksoulrnb', 'jazz', 'pop', 'raphiphop', 'rock', 'genre', 'acoustic', 'aggressive', 'mood_electronic', 'happy', 'party', 'relaxed', 'sad', 'mood_mirex_1', 'mood_mirex_2', 'mood_mirex_3', 'mood_mirex_4', 'mood_mirex_5', 'timbre_bright', 'tonal', 'instrumental', 'bit_rate', 'codec', 'length', 'lossless', 'replay_gain', 'true_genre', 'artist', 'album', 'bpm', 'year', 'date', 'label', 'song', 'artistsort', 'submission_offset', 'bpm_2', 'bpm_histogram_first_peak_bpm_mean', 'bpm_histogram_first_peak_bpm_median', 'bpm_histogram_second_peak_bpm_mean', 'bpm_histogram_second_peak_bpm_median', 'danceability_2', 'onset_rate', 'submission_offset_2', 'key_key', 'key_scale', 'tuning_frequency', 'tuning_equal_tempered_deviation', 'submission_offset_2', 'average_loudness', 'dynamic_complexity', 'mfcc_zero_mean']