This notebook downloads albums from https://allflac.com

Instructions:


1.   Edit the "Parameters"
2.   Always run "Parameters" and "1-time Setup" for every new runtime
3.   Run "Step 1" until it's done
4.   Run "Step 2" a few times until there are no errors. If you are prompted to open a Google URL, click it, log in, copy the verification code, and paste it here
5.   Run "Step 3" repeatedly until it's done. If you are prompted to open a Google URL, click it, log in, copy the verification code, and paste it here
(Google Colab limits each run to about 12 hours. When it disconnects, reconnect, re-run "Parameters" and "1-time Setup", then run Step 3 again; it will pick up where it left off)


*Estimated 1 month to completely download everything*


In [0]:
#@title Parameters
#@markdown Download from https://allflac.com

# Passed to dask as num_workers below, and used as the partition count when
# Steps 2 and 3 split their work with dd.from_pandas(..., npartitions=WORKERS).
WORKERS = 16 #@param {type:"number"}
# Root folder for finished albums; Step 3 joins <artist>/<album> onto it.
# The path sits under the /gdrive mount point that Step 3 mounts.
SAVELOCATION = '/gdrive/Shared drives/Team Drive/Downloads' #@param {type:"string"}
# Base URL used to build listing-page links and to resolve relative album/file links.
SITEURL='https://allflac.com'

import dask
# NOTE(review): 'work_stealing' is set here as a top-level config key; whether
# the scheduler actually used by .compute() reads that key is not visible in
# this notebook — confirm it has the intended effect.
dask.config.set(work_stealing=False)
# Store the worker count in dask's configuration for later .compute() calls.
dask.config.set(num_workers=WORKERS)

In [0]:
#@title 1-time Setup

from IPython.display import clear_output
from google.colab import auth, drive
import google.auth


# Authenticate this Colab runtime with the user's Google account, then fetch
# the default credentials. `credentials` is passed to every Spread(...) call
# in Steps 1-3; the second value returned by google.auth.default() is unused.
auth.authenticate_user()
credentials, _ = google.auth.default()

# Install gspread-pandas (imported by Steps 1-3), then clear the cell output
# so the pip log does not stay on screen.
!pip install gspread-pandas
clear_output()

In [0]:
#@title Step 1 - Generate page number

import pandas as pd
import requests
from bs4 import BeautifulSoup
from gspread_pandas import Spread


def getnumpage():
  """Return the number of the last album-listing page on the site.

  Fetches ``<SITEURL>/albums/artist`` and parses the text of the last link
  matched by the ``.filter_offset_pages a`` selector as an integer.

  Returns:
    int: the last page number, or 0 when the page could not be fetched or
    parsed. The failure is printed so that a 0 is not mistaken for "the site
    has no pages".
  """
  import traceback  # local import: the Step 1 cell does not import traceback

  try:
    r = requests.get('%s/albums/artist'%(SITEURL))
    soup = BeautifulSoup(r.text, 'html5lib')
    lastpage=soup.select('.filter_offset_pages a')[-1].get_text()
    return int(lastpage)
  except Exception:
    # `Exception` rather than a bare except, so interrupting the kernel
    # (KeyboardInterrupt) is not swallowed and turned into "0 pages".
    traceback.print_exc()
    return 0

# Open (creating if needed) the 'allflac_com2' spreadsheet and its
# 'albums_page' worksheet, which holds one row per listing page.
spread=Spread('allflac_com2', 'albums_page', create_spread=True, create_sheet=True, creds=credentials)
# Remove the 'Sheet1' tab (presumably the default tab of a freshly created
# spreadsheet — confirm).
spread.delete_sheet('Sheet1')

numpages=getnumpage()
if numpages<1:
  # getnumpage() returns 0 when the site could not be read. Writing an empty
  # table with replace=True would wipe the existing page list and its 'done'
  # marks, so leave the sheet untouched and ask for a re-run instead.
  print('Could not determine the number of pages; sheet left unchanged. Run Step 1 again.')
else:
  # One listing URL per page, pages numbered from 1 to numpages inclusive.
  urls=['%s/albums/artist?Albums_offset_page=%d'%(SITEURL, i)
        for i in range(1, numpages+1)]
  df=pd.DataFrame(urls, columns=['link'])
  # Empty 'done' column; Step 2 writes "Y" here once a page has been scraped.
  df['done']=''

  spread.df_to_sheet(df, index=False, replace=True)


In [0]:
#@title Step 2 - Get Albums
#@markdown (will take a few hours; run it a few times until it finishes with no errors)

import pandas as pd
import dask.dataframe as dd
import requests
from bs4 import BeautifulSoup
from gspread_pandas import Spread
from urllib.parse import urljoin
from tqdm import tqdm
import traceback


def cleanup(str):
  """Drop everything from the last apostrophe onwards and trim whitespace.

  The text is cut only when something precedes the last apostrophe; a string
  with no apostrophe, or whose only apostrophe is the first character, is
  returned whole (trimmed).
  """
  # rpartition leaves `head` empty both when there is no apostrophe and when
  # the last one is at index 0 — the two cases where the text must be kept.
  head, _, _ = str.rpartition("'")
  kept = head if head else str
  return kept.strip()


def getalbums(url):
  """Scrape one listing page and record its albums in the spreadsheet.

  Every link matched by ``.albums a`` becomes a row of [artist, title, link]
  appended to the 'albums' worksheet. Afterwards every cell of the currently
  open worksheet equal to ``url`` gets "Y" written to column 2 of its row
  (the 'done' column of 'albums_page').

  A page is only marked done when it loaded successfully and yielded at least
  one album. An HTTP error page or an empty scrape is reported and left
  unmarked, so the next run of Step 2 retries it.

  Args:
    url: listing-page URL taken from the 'link' column.

  Uses the module-level ``spread``, ``pbar`` and ``SITEURL``. Errors are
  printed rather than raised so that one bad page does not stop the run.
  """
  try:
    r = requests.get(url)
    # Without this, an error response would parse to zero albums and the
    # page would still be marked as done.
    r.raise_for_status()
    soup = BeautifulSoup(r.text, 'html5lib')
    links=soup.select(".albums a")
    albums=[[l.img.next_sibling.strip(), cleanup(l.span.text), urljoin(SITEURL,l['href'])] for l in links]
    if not albums:
      raise ValueError('no albums found on %s'%(url))
    spread.spread.values_append('albums!A1:C1', {'valueInputOption':'RAW'}, {'values':albums})
    for cell in spread.sheet.findall(url):
      spread.sheet.update_cell(cell.row, 2, "Y")
  except Exception:
    traceback.print_exc()
  # Counts pages processed in this run, whether or not they succeeded.
  pbar.update(1)


# Load the page list written by Step 1 and keep only pages not yet marked done.
spread=Spread('allflac_com2', 'albums_page', creds=credentials)
df1=spread.sheet_to_df(index=None)
# NOTE(review): keep=False drops *every* row whose link occurs more than once,
# not just the extra copies — confirm that is intended.
df1.drop_duplicates('link', keep=False, inplace=True) 
df2=df1[df1['done']=='']

# Make sure the 'albums' worksheet exists and has its header row, then switch
# back to 'albums_page': getalbums() marks pages done through spread.sheet,
# i.e. whichever worksheet is currently open.
spread.open_sheet('albums', create=True)
spread.update_cells('A1', 'D1', ['artist', 'title', 'link', 'done'])
spread.open_sheet('albums_page')

# Progress bar over all pages, pre-advanced by the pages already done.
pbar = tqdm(total=len(df1), ncols=80)
pbar.update(len(df1)-len(df2))

# Split the pending pages into WORKERS partitions and scrape each link.
dd1=dd.from_pandas(df2, npartitions=WORKERS)
dd1['link'].apply(getalbums, meta='object').compute()

# NOTE(review): row_count looks like the worksheet's grid size rather than the
# number of filled rows — confirm the printed count is the real album count.
print("\n\nFound %d albums"%(spread.find_sheet('albums').row_count-1))

# Auto-fit the width of every column (index 0 up to col_count) on the
# currently open worksheet.
spread.spread.batch_update({
    "requests": [
      {
        "autoResizeDimensions": {
          "dimensions": {
            "sheetId": spread.sheet.id,
            "dimension": "COLUMNS",
            "startIndex": 0,
            "endIndex": spread.sheet.col_count
          }
        }
      }
    ]
  })
print('Done')

In [0]:
#@title Step 3 - Download Albums
#@markdown (will take very long, maybe a few days; run it a few times until it is finished)
#@markdown (Google Colab will stop every 12 hours; just restart and run it again,)
#@markdown (the program will pick up where it left off)


import os
import subprocess
import urllib.request
import shutil
from urllib.parse import urljoin
from tqdm import tqdm
import traceback

# import mutagen
import requests
# import wget
from bs4 import BeautifulSoup

import dask.dataframe as dd
from gspread_pandas import Spread
from google.colab import auth, drive

# Mount Google Drive at /gdrive; SAVELOCATION (set in "Parameters") points to
# a folder below this mount point.
drive.mount('/gdrive')


def safefilename(name):
  """Turn ``name`` into text usable as a file or folder name.

  The value is converted with ``str()``, each of the characters
  ``\\ / : * ? " < > |`` is replaced by an underscore, surrounding whitespace
  is removed, and finally any leading/trailing dots and spaces are removed.
  """
  forbidden = r'\/:*?"<>|'
  table = str.maketrans(forbidden, '_' * len(forbidden))
  replaced = str(name).translate(table)
  trimmed = replaced.strip()
  return trimmed.strip('. ')


def download_file(url, loc, file_name=None, alt_name=None):
  """Download ``url`` into directory ``loc`` unless the file is already there.

  Args:
    url: address to fetch with ``urllib.request.urlopen``.
    loc: destination directory; created if missing.
    file_name: name to save under. When omitted, the name announced in the
      response headers is used (re-decoded from latin-1 to UTF-8).
    alt_name: fallback name when ``file_name`` is omitted and the response
      headers yield no usable name.

  Only the base name of the chosen file name is used, so the file always
  lands directly inside ``loc``.

  The data is first written to ``<name>.part`` and renamed once complete.
  An interrupted transfer therefore never leaves a truncated file under the
  final name, which the "already exists" check would otherwise treat as
  finished on every later run.

  Errors are printed, not raised; the function always returns None.
  """
  part = None
  try:
    # With a caller-supplied name the target is known up front, so an
    # existing file can be skipped without opening the URL at all.
    if file_name and os.path.exists(os.path.join(loc, os.path.basename(file_name))):
      return
    with urllib.request.urlopen(url) as response:
      if not file_name:
        try:
          file_name = response.info().get_filename().encode('latin-1').decode('utf-8')
        except Exception:
          # No filename header (get_filename() gave None) or undecodable name.
          file_name = alt_name
      saveto = os.path.join(loc, os.path.basename(file_name))
      if os.path.exists(saveto):
        return
      os.makedirs(loc, exist_ok=True)
      part = saveto + '.part'
      with open(part, 'wb') as f:
        shutil.copyfileobj(response, f, 16*1024*1024)
      os.replace(part, saveto)
      part = None  # renamed successfully; nothing left to clean up
  except Exception:
    traceback.print_exc()
  finally:
    # Remove the leftover of a failed transfer.
    if part is not None and os.path.exists(part):
      try:
        os.remove(part)
      except OSError:
        pass


def download(artist, album, url, songs):
  """Download every track of one album into SAVELOCATION/artist/album.

  Args:
    artist: folder name for the artist (already made filename-safe).
    album: folder name for the album (already made filename-safe).
    url: URL prefix the per-track identifier is appended to.
    songs: sequence of ``(title, marker)`` pairs. ``marker`` is either a file
      identifier, or ``"<identifier>#<start time>"`` when the track is a
      segment of a larger file.

  Tracks without ``#`` are downloaded straight into the album folder.
  For tracks with ``#`` the source file is downloaded to a scratch folder
  under ``Downloads/`` and ffmpeg cuts the segment out, starting at the
  track's own start time and ending at the next track's start time (or the
  end of the file when there is no next start time). The scratch folder is
  removed afterwards.
  """
  loc = os.path.join(SAVELOCATION, artist, album)
  tmploc = os.path.join("Downloads", artist, album)

  for i, (title, marker) in enumerate(songs):
    source = marker.split('#')[0]
    url2 = urljoin(SITEURL, url + source)

    if '#' not in marker:
      download_file(url2, loc, alt_name='%02d. %s.flac'%(i+1, safefilename(title)))
      continue

    os.makedirs(loc, exist_ok=True)
    tmpfile = source+'.flac'
    download_file(url2, tmploc, tmpfile)
    starttime = marker.split('#')[1]
    target = os.path.join(loc, '%02d. %s.flac'%(i+1, safefilename(title)))
    # The arguments go to ffmpeg as a list, with no shell in between, so the
    # title must be passed verbatim: quotes or backslash escapes added here
    # would end up inside the stored tag.
    params = ['ffmpeg',
              '-hide_banner', '-y',
              '-i', os.path.join(tmploc, tmpfile),
              '-metadata', 'title='+title,
              '-metadata', 'track='+str(i+1),
              '-ss', starttime]
    # The segment ends where the next track starts, if it has a start time.
    nextmarker = songs[i+1][1] if i+1 < len(songs) else ''
    if '#' in nextmarker:
      params.extend(['-to', nextmarker.split('#')[1]])
    params.append(target)
    out=subprocess.run(params, stderr=subprocess.PIPE, stdout=subprocess.PIPE)
    if out.returncode != 0:
      # Surface the failure instead of discarding ffmpeg's output.
      print('ffmpeg failed (exit %d) for %s\n%s'%(
          out.returncode, target, out.stderr.decode('utf-8', 'replace')[-500:]))
  shutil.rmtree(tmploc, ignore_errors=True)
    

def downloadAlbum(row):
  """Download one album described by a row of the 'albums' worksheet.

  Args:
    row: mapping with the keys 'link', 'title' and 'artist'.

  Fetches the album page, reads the file URL prefix from the ``#mp3_player``
  element and the track list from ``tbody.fp_list``, downloads the tracks,
  and then writes "Y" to column 4 of every row of the open worksheet that
  contains the album link. The progress bar advances only for albums that
  completed.

  Missing pages are skipped silently. Any other failure is printed and the
  album is left unmarked, so a later run retries it instead of one bad album
  aborting the whole batch.

  Uses the module-level ``spread`` and ``pbar``.
  """
  link = row['link']
  try:
    page = requests.get(link)
    # Test the page text itself: `"..." in soup` only compares against the
    # parsed document's direct child nodes and never matches page content.
    if page.status_code == 404 or "Page not found" in page.text:
      return
    soup = BeautifulSoup(page.text, 'html5lib')

    # get album name and make it safer
    album = safefilename(row['title'])
    artist = safefilename(row['artist'])

    # getting variables
    url = soup.find('div', {'id': 'mp3_player'})['data-url'].replace('player', 'flac')

    playlist = soup.find('tbody', {'class': 'fp_list'}).find_all('tr')
    # The last ':' of each data-id becomes '.', e.g. "a:b:c" -> "a:b.c".
    songs = [[x.text, '.'.join(x.find('a')['data-id'].rsplit(':', 1))] for x in playlist]
    download(artist, album, url, songs)
    for cell in spread.sheet.findall(link):
      spread.sheet.update_cell(cell.row, 4, "Y")
  except Exception:
    traceback.print_exc()
    return
  pbar.update(1)
         

# Load the album list produced by Step 2.
spread=Spread('allflac_com2', 'albums', creds=credentials)
df_albums=spread.sheet_to_df(index=None)
# Step 2 can append a page's albums twice when it is retried. keep='first'
# collapses such repeats to a single row; keep=False would drop every copy
# and those albums would never be downloaded.
df_albums=df_albums.drop_duplicates('link', keep='first')
# Albums still to download: rows whose 'done' cell is empty.
df2=df_albums[df_albums['done']=='']

# Progress bar over all albums, pre-advanced by the albums already done.
pbar = tqdm(total=len(df_albums), ncols=80)
pbar.update(len(df_albums)-len(df2))

# Split the pending albums into WORKERS partitions and download row by row.
dd1=dd.from_pandas(df2, npartitions=WORKERS)
dd1.apply(downloadAlbum, meta='object', axis=1).compute()


print("\n\nDone")