In [0]:
#
# CREDIT: https://stackoverflow.com/a/39225039
#

import requests

def progress_bar(some_iter):
    """Wrap ``some_iter`` in a tqdm progress bar when tqdm is available.

    If the ``tqdm`` package cannot be found, the iterable is handed back
    unchanged, so callers can always iterate over the result.
    """
    try:
        import tqdm as tqdm_module
        wrapped = tqdm_module.tqdm(some_iter)
    except ModuleNotFoundError:
        # tqdm is optional: fall back to the plain iterable.
        wrapped = some_iter
    return wrapped

def download_file_from_google_drive(id, destination):
  """Download a publicly shared Google Drive file to a local path.

  The file is requested from the ``uc?export=download`` endpoint. If the
  first response carries a cookie whose name starts with
  ``download_warning``, the cookie's value is sent back as the ``confirm``
  parameter in a second request, and that second response is the one saved.

  Parameters
  ----------
  id : str
      Google Drive file id (the ``id`` query parameter of the share link).
  destination : str
      Local path the downloaded bytes are written to (opened in "wb" mode).

  Raises
  ------
  requests.HTTPError
      If the response to be saved has an error (4xx/5xx) status. Without
      this check an error page would be silently written to ``destination``
      as if it were the requested file.
  """
  print("Trying to fetch {}".format(destination))

  def get_confirm_token(response):
    """Return the value of the first ``download_warning*`` cookie, or None."""
    for key, value in response.cookies.items():
      if key.startswith('download_warning'):
        return value

    return None

  def save_response_content(response, destination):
    """Stream the response body to ``destination`` in fixed-size chunks."""
    CHUNK_SIZE = 32768

    with open(destination, "wb") as f:
      for chunk in progress_bar(response.iter_content(CHUNK_SIZE)):
        if chunk: # filter out keep-alive new chunks
          f.write(chunk)

  URL = "https://docs.google.com/uc?export=download"

  # The context manager releases the session's pooled connections even if
  # a request or the file write fails.
  with requests.Session() as session:
    response = session.get(URL, params = { 'id' : id }, stream = True)
    token = get_confirm_token(response)

    if token:
      # The first (streamed, unread) response is being discarded; close it
      # so its connection is returned before issuing the confirmed request.
      response.close()
      params = { 'id' : id, 'confirm' : token }
      response = session.get(URL, params = params, stream = True)

    try:
      # Fail loudly before creating the destination file on an HTTP error.
      response.raise_for_status()
      save_response_content(response, destination)
    finally:
      response.close()

In [0]:
# Fetch the archive from its public Google Drive share link:
# https://drive.google.com/open?id=1ofOXCLTpJAjfK07RuqputX-xx_11xsuz
download_file_from_google_drive(
    id="1ofOXCLTpJAjfK07RuqputX-xx_11xsuz",
    destination="./Spotify.zip",
)

Trying to fetch ./Spotify.zip


149661it [02:04, 1201.46it/s]


In [0]:
# Unzip the archive with the system `unzip` tool. Because of an issue with
# the original zip file, Python's zipfile module is unable to process it.
# -o overwrites existing files without prompting; -q suppresses normal output.

!unzip -o -q Spotify.zip

  (attempting to process anyway)
file #1:  bad zipfile offset (local header sig):  4294967296
  (attempting to re-compensate)
file #882:  bad zipfile offset (local header sig):  868912
  (attempting to re-compensate)


In [0]:
# Let's load one of the extracted CSV files and explore its header.
import pandas as pd

songsDF = pd.read_csv("./Songs/songs1.csv")

# Last expression of the cell: displays the column names of this file.
songsDF.columns

Index(['pid', 'pos', 'artist_name', 'track_uri', 'artist_uri', 'track_name',
       'album_uri', 'duration_ms', 'album_name'],
      dtype='object')

In [0]:
# Install fastparquet.
# Parquet is a column-oriented storage format that can handle a huge
# amount of data, even if it does not fit in memory.
!pip install fastparquet

Collecting fastparquet
[?25l  Downloading https://files.pythonhosted.org/packages/58/49/dccb790fa17ab3fbf84a6b848050083c7a1899e9586000e34e3e4fbf5538/fastparquet-0.3.2.tar.gz (151kB)
[K     |████████████████████████████████| 153kB 3.5MB/s 
Collecting thrift>=0.11.0
[?25l  Downloading https://files.pythonhosted.org/packages/c6/b4/510617906f8e0c5660e7d96fbc5585113f83ad547a3989b80297ac72a74c/thrift-0.11.0.tar.gz (52kB)
[K     |████████████████████████████████| 61kB 7.9MB/s 
Building wheels for collected packages: fastparquet, thrift
  Building wheel for fastparquet (setup.py) ... [?25l[?25hdone
  Created wheel for fastparquet: filename=fastparquet-0.3.2-cp36-cp36m-linux_x86_64.whl size=253216 sha256=99d254d0a8a11a64ca75f1df0ca500e6b0fa5ea76c9b45de109b899dcb4056b3
  Stored in directory: /root/.cache/pip/wheels/b9/36/13/01416a760ddcab0eb8281ec9c9ffcbed945c9b831647c8b904
  Building wheel for thrift (setup.py) ... [?25l[?25hdone
  Created wheel for thrift: filename=thrift-0.11.0-cp36-c

In [0]:
import os

from dask import dataframe as dd
from dask.dataframe import read_csv, read_parquet, to_csv, to_parquet

# Build one lazy dask dataframe spanning every CSV file under ./Songs
ddf = read_csv(os.path.join("./Songs", "*.csv"))

In [0]:
# Write the dask dataframe out as a Parquet dataset,
# using engine='fastparquet' since dask supports that engine.
ddf.to_parquet('./Spotify.parquet', engine='fastparquet')

In [0]:
# Mount Google Drive at /content/drive (prompts for an authorization code).
from google.colab import drive
drive.mount('/content/drive')

Go to this URL in a browser: https://accounts.google.com/o/oauth2/auth?client_id=947318989803-6bn6qk8qdgf4n4g3pfee6491hc0brc4i.apps.googleusercontent.com&redirect_uri=urn%3Aietf%3Awg%3Aoauth%3A2.0%3Aoob&scope=email%20https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fdocs.test%20https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fdrive%20https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fdrive.photos.readonly%20https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fpeopleapi.readonly&response_type=code

Enter your authorization code:
··········
Mounted at /content/drive


In [0]:
# Copy the contents of the Parquet directory to Google Drive.
# NOTE(review): the relative ./drive path assumes the working directory is
# /content (where Drive was mounted) -- confirm before running elsewhere.
!cp -a ./Spotify.parquet/. './drive/My Drive/109A/project/data/'

In [0]:
# Basic check of the Parquet output:
# read it back into a dask dataframe.
ddfP = read_parquet('./Spotify.parquet', engine='fastparquet')

In [0]:
# Count the number of entries for each artist (compute() runs the lazy graph).
ddfP.groupby('artist_name')['artist_name'].count().compute()

artist_name
!llmind                                         130
"Weird Al" Yankovic                            4569
$teven Cannon                                   302
$uicideBoy$                                   13662
'In The Heights' Original Broadway Company      614
                                              ...  
The Brotherhood                                   1
Thelman Houston                                   1
Trip                                              1
Widelows                                          1
xmas songs                                        1
Name: artist_name, Length: 287739, dtype: int64

# Create a Parquet file containing all unique songs

In [0]:
# Build the table of unique songs: keep only the descriptive columns, drop
# rows with missing values, then keep one row per track_uri.
# dropna() returns a new dataframe rather than modifying in place, so its
# result must be kept (the bare `ddf_u.dropna()` call had no effect).
# It runs before drop_duplicates so that a track is not lost merely because
# the first row seen for its track_uri happened to be incomplete.
ddf_u = (
    ddf[['artist_name', 'track_name', 'album_name', 'track_uri', 'artist_uri', 'album_uri']]
    .dropna()
    .drop_duplicates(subset='track_uri')
    .repartition(npartitions=20)  # write the result as 20 partitions
)
ddf_u.to_parquet('./Spotify_Unique_Songs.parquet', engine='fastparquet')

In [0]:
# Copy the contents of the unique-songs Parquet directory to Google Drive.
!cp -a ./Spotify_Unique_Songs.parquet/. './drive/My Drive/109A/project/Spotify_Unique_Songs/'