In [1]:
"""Build initial dataset and write it to S3."""
import os
import time

import pandas as pd

from playlist_selection.downloading import YouTubeDownloader
from playlist_selection.parsing import SpotifyParser

# Load the track list: every non-blank line is split on tabs and each field is
# stripped of surrounding whitespace; the fields map to the columns below.
# The file is opened as UTF-8 explicitly because titles contain non-ASCII
# text and the platform default encoding is not guaranteed to decode it.
# NOTE(review): assumes the file is stored as UTF-8 -- confirm.
with open("../data/tracks.txt", encoding="utf-8") as fin:
    tracks = [
        [field.strip() for field in line.split("\t")]
        for line in fin
        if line.strip()  # skip blank lines (e.g. a trailing newline at EOF)
    ]

tracks_df = pd.DataFrame(tracks, columns=["genre", "name", "artist"])

In [2]:
# Preview five randomly chosen rows of the track table.
tracks_df.sample(n=5)

Unnamed: 0,genre,name,artist
26181,reggaeton,LA DROGA,Bad Bunny
16566,indie,Crossfire,Brandon Flowers
3229,breakbeat,Believe,The Chemical Brothers
20493,mandopop,夜色,Teresa Teng
15610,house,Blackwater - radio edit 128 strings mix,Octave One


In [4]:
# Credentials are read from the environment so that secrets never have to be
# typed into (and saved with) the notebook. An unset variable falls back to
# "", the placeholder value that used to be hardcoded here.
CLIENT_ID = os.environ.get("SPOTIFY_CLIENT_ID", "")
CLIENT_SECRET = os.environ.get("SPOTIFY_CLIENT_SECRET", "")
AWS_ACCESS_KEY_ID = os.environ.get("AWS_ACCESS_KEY_ID", "")
AWS_SECRET_ACCESS_KEY = os.environ.get("AWS_SECRET_ACCESS_KEY", "")

# Object storage endpoint and target bucket.
HOST = "storage.yandexcloud.net"
SCHEMA = "https"
BUCKET_NAME = "hse-project-playlist-selection"

# Client used below to collect track metadata and upload it to the bucket.
parser = SpotifyParser(
    client_id=CLIENT_ID,
    client_secret=CLIENT_SECRET,
)

# Client used below to download audio and save it to the bucket.
downloader = YouTubeDownloader()

In [5]:
# Pause inserted between two consecutive genres, in seconds.
# NOTE(review): presumably throttles requests to the external services -- confirm.
PAUSE_BETWEEN_GENRES_SEC = 60 * 10

# Process the dataset one genre at a time: collect metadata for the genre's
# tracks, upload the metadata, then download and store the audio.
for genre_idx, (genre, genre_tracks) in enumerate(tracks_df.groupby("genre")):
    # Sleep only *between* genres: pausing before every genre except the
    # first gives the same spacing without a pointless wait after the last.
    if genre_idx > 0:
        time.sleep(PAUSE_BETWEEN_GENRES_SEC)

    # The same prefix is passed to both the metadata upload and the audio download.
    prefix = f"tracks/{genre}"

    # [[name, artist], ...] pairs for this genre.
    song_list = genre_tracks[["name", "artist"]].values.tolist()
    # Drop the None entries that parse() may return.
    tracks_meta = [meta for meta in parser.parse(song_list) if meta is not None]

    parser.load_to_s3(
        schema=SCHEMA,
        host=HOST,
        bucket_name=BUCKET_NAME,
        tracks_meta=tracks_meta,
        aws_access_key_id=AWS_ACCESS_KEY_ID,
        aws_secret_access_key=AWS_SECRET_ACCESS_KEY,
        prefix=prefix,
    )

    # Download using the parsed track name and the first entry of artist_name.
    # NOTE(review): assumes artist_name is a non-empty sequence -- confirm.
    list_to_download = [
        (meta.track_name, meta.artist_name[0]) for meta in tracks_meta
    ]
    downloader.download_and_save_audio(
        song_list=list_to_download,
        schema=SCHEMA,
        host=HOST,
        bucket_name=BUCKET_NAME,
        prefix=prefix,
        aws_access_key_id=AWS_ACCESS_KEY_ID,
        aws_secret_access_key=AWS_SECRET_ACCESS_KEY,
    )

[2023-10-30 20:52:26] {parser.py:199} INFO - collecting meta for track:"Signifying Blues - Extended Version" artist:"Bo Diddley"
[2023-10-30 20:52:26] {parser.py:199} INFO - collecting meta for track:"Interstate" artist:"Chris Proctor"
[2023-10-30 20:52:26] {parser.py:199} INFO - collecting meta for track:"Jessie, Let Me Wipe Your Feet" artist:"Rory Block"
[2023-10-30 20:52:26] {parser.py:199} INFO - collecting meta for track:"WHITE LIGHT WHITE HEAT" artist:"Masaharu Fukuyama"
[2023-10-30 20:52:26] {parser.py:199} INFO - collecting meta for track:"Short Haired Woman" artist:"Lightnin' Hopkins"
[2023-10-30 20:52:26] {parser.py:199} INFO - collecting meta for track:"I'm Sorry" artist:"Bo Diddley"
[2023-10-30 20:52:26] {parser.py:199} INFO - collecting meta for track:"Goin' To Dallas To See My Pony Run" artist:"Lightnin' Hopkins"
[2023-10-30 20:52:26] {parser.py:199} INFO - collecting meta for track:"Death Bells" artist:"Lightnin' Hopkins"
[2023-10-30 20:52:27] {parser.py:205} INFO - no s