In [1]:
import pandas as pd
import boto3 as aws
from botocore.exceptions import ClientError
from pathos.pools import ProcessPool
from toolz.functoolz import compose_left
import re
import os

In [2]:
# --- Local dataset locations (absolute, machine-specific paths) ---
# CSV of per-track metadata read by get_track_info().
FMA_METADATA_PATH = "/Volumes/LittleJim/Datasets/fma_metadata/raw_tracks.csv"
# Root directory that locate_path() searches for the MP3 files.
FMA_DATA_PATH = "/Volumes/LittleJim/Datasets/fma_large/"

# --- AWS upload target ---
# Named credentials profile used to build the boto3 session.
AWS_PROFILE = "music-full-access"
# Destination bucket and key prefix ("folder") for uploaded MP3s.
S3_BUCKET_NAME = "ali-embedding-data"
S3_FOLDER = "fma-previews"

In [3]:
def get_license_types(license_url):
    """Extract the license component codes from a license URL.

    The URL is searched for a ``.org/l/<codes>/`` segment, where ``<codes>``
    is a dash-separated run of lowercase letters (e.g. ``by-nc-sa``). Each
    non-empty code becomes one element of the returned tuple.

    If no such segment yields any code, the URL is checked for the substring
    ``zero`` and ``('zero',)`` is returned when it is present.

    Args:
        license_url: The URL to inspect. Non-string values (for example NaN
            from a missing CSV cell) are treated as "no license information".

    Returns:
        A tuple of license code strings, ``('zero',)``, or an empty tuple
        when nothing could be recognised.
    """
    if not isinstance(license_url, str):
        return tuple()

    match = re.search(r'\.org\/l\/([a-z\-]*)\/', license_url)
    if match:
        # The character class may match an empty segment (".org/l//") or
        # leave stray dashes; splitting those would produce '' tokens and a
        # non-empty tuple such as ('',), which callers that test
        # len(result) == 0 would mistake for a recognised license.
        parts = tuple(part for part in match.group(1).split('-') if part)
        if parts:
            return parts

    if re.search(r'zero', license_url):
        return ('zero',)
    return tuple()

In [4]:
def get_track_info(csv_path):
    """Load track metadata and keep only tracks with an acceptable license.

    Reads the CSV at ``csv_path``, keeps a fixed subset of columns, and adds
    a ``licenses`` column computed by applying ``get_license_types`` to
    ``license_image_file``.

    Rows are then dropped when their ``licenses`` tuple contains ``'nc'`` or
    ``'nd'``, or is empty (no license could be recognised).

    Args:
        csv_path: Path to the metadata CSV file.

    Returns:
        A DataFrame with the selected columns plus ``licenses``, restricted
        to the rows that passed the license filter.
    """
    wanted_columns = [
        'track_id',
        'album_title',
        'artist_name',
        'track_url',
        'track_listens',
        'license_image_file',
        'track_title',
        'license_parent_id',
        'track_genres',
    ]

    def is_acceptable(license_parts):
        # Keep only recognised licenses that carry neither the 'nc' nor the
        # 'nd' component.
        return (
            'nc' not in license_parts
            and 'nd' not in license_parts
            and len(license_parts) != 0
        )

    tracks = pd.read_csv(csv_path)[wanted_columns]
    tracks = tracks.assign(
        licenses=tracks['license_image_file'].map(get_license_types)
    )
    return tracks[tracks['licenses'].map(is_acceptable)]

In [5]:
# License-filtered track metadata; its track_id column drives the upload
# below and the final merge of uploaded tracks.
md = get_track_info(FMA_METADATA_PATH)

In [6]:
# S3 client built from the named credentials profile; upload_file() reads
# this module-level client.
# NOTE(review): upload_file() is reached from worker processes started by
# the ProcessPool cell — confirm that sharing a client created in the parent
# process is safe for this pool/start method, or create one per worker.
s3 = aws.Session(profile_name=AWS_PROFILE).client('s3')

In [7]:
def locate_path(track_id, data_path=None):
    """Return the on-disk path of a track's MP3, or None if it is missing.

    The file is expected at ``<data_path>/<folder>/<file>`` where ``folder``
    is ``track_id // 1000`` zero-padded to 3 digits and ``file`` is the
    track id zero-padded to 6 digits with an ``.mp3`` extension
    (e.g. track 12345 -> ``012/012345.mp3``).

    Args:
        track_id: Integer track identifier.
        data_path: Root directory to search. Defaults to the module-level
            ``FMA_DATA_PATH`` when omitted, so existing callers are
            unaffected.

    Returns:
        The path as a string if it exists on disk, otherwise None.
    """
    # Resolve the default at call time so the constant can be changed after
    # this function has been defined.
    root = FMA_DATA_PATH if data_path is None else data_path
    folder = str(track_id // 1000).zfill(3)
    file_name = f"{str(track_id).zfill(6)}.mp3"
    path = os.path.join(root, folder, file_name)
    return path if os.path.exists(path) else None

In [8]:
def upload_file(file_name, bucket, object_name):
    """Upload a local file to S3 using the module-level ``s3`` client.

    Args:
        file_name: Path of the local file to upload.
        bucket: Name of the destination bucket.
        object_name: Key to store the object under.

    Returns:
        True if the upload call completed, False if it raised a
        ``ClientError`` or ``S3UploadFailedError`` (the error is printed).
    """
    # Imported locally so this block does not depend on changes to the
    # notebook's import cell; boto3 is already a dependency of the notebook.
    from boto3.exceptions import S3UploadFailedError

    try:
        s3.upload_file(file_name, bucket, object_name)
    # boto3's managed transfer re-raises client failures as
    # S3UploadFailedError, which is not a ClientError subclass; catch both
    # so a failed upload is reported via the return value instead of
    # propagating out of the worker.
    except (ClientError, S3UploadFailedError) as e:
        print(e)
        return False
    return True

In [9]:
def track_id_to_object(track_id):
    """Build the S3 object key for a track.

    The key is the ``S3_FOLDER`` prefix, a slash, then the track id followed
    by the literal suffix ``FMA.mp3``.
    """
    object_key = f"{S3_FOLDER}/{track_id}FMA.mp3"
    return object_key

In [10]:
def upload_mp3(track_id):
    """Upload one track's MP3 to S3 and report whether it was uploaded.

    Args:
        track_id: Integer track identifier.

    Returns:
        ``track_id`` if the file was found locally and the upload succeeded;
        None if the file is missing or the upload failed.
    """
    path = locate_path(track_id)
    if path is None:
        return None

    object_name = track_id_to_object(track_id)
    # Only report the track as uploaded when upload_file() says it was;
    # otherwise failed uploads would be written to the "uploaded" CSVs.
    if not upload_file(path, S3_BUCKET_NAME, object_name):
        return None
    return track_id

In [11]:
def get_all_track_ids(id_series):
    """Lazily yield every value of ``id_series`` in iteration order."""
    yield from id_series

In [12]:
# Map upload_mp3 over every track id in md using a pool of 4 worker nodes.
# NOTE(review): `results` is not iterated in this cell; the next cell
# consumes it. uimap is presumably pathos' lazy, unordered map, in which
# case the uploads may only finish while that later cell runs and the result
# order need not follow md.track_id — confirm against the pathos docs, along
# with whether the pool remains usable after the `with` block exits.
with ProcessPool(nodes=4) as P:
    gtr = get_all_track_ids(md.track_id)
    results = P.uimap(upload_mp3, gtr)

In [13]:
# upload_mp3 returns None for tracks it did not upload; keep only real ids.
results = [track_id for track_id in results if track_id is not None]

# Persist the bare list of uploaded ids.
results_series = pd.Series(data=results, name='track_id')
results_series.to_csv('uploaded_FMA_track_ids.csv')

# Restrict the metadata to uploaded tracks (default merge on the shared
# track_id column) and persist it as well.
uploaded_md = pd.merge(md, results_series)
uploaded_md.to_csv('uploaded_FMA_metadata.csv')