In [65]:
# from distributed import Client
# client = Client()
import pandas as pd
# import modin.pandas as pd
import os
import json
from pathlib import Path
from typing import Optional
from urllib.request import urlretrieve
from tqdm import tqdm
from zipfile import ZipFile
from google.cloud import bigquery
from prefect import task, flow
from prefect_gcp.cloud_storage import GcsBucket
# Show every column when a DataFrame is displayed (no column truncation).
pd.set_option("display.max_columns", None)
# Confirmation message printed once the setup statements above have run.
print("Setup Complete")

Setup Complete


In [66]:
# Seq 1-Define a function to convert the downloaded JSON file to a data frame
def read_df(file: str) -> pd.DataFrame:
    """Load a JSON file and flatten it into a single-level data frame.

    The file is decoded with ``json.load``. Two payload shapes are handled:

    * the decoded payload is itself a JSON *string* (a double-encoded file):
      it is parsed with ``pd.read_json`` and converted to records;
    * the decoded payload is already a list/dict of records: it is passed to
      ``pd.json_normalize`` directly.

    In both cases nested objects are flattened with ``_`` as the separator
    between key levels (e.g. ``{"meta": {"genre": ...}}`` -> ``meta_genre``).

    Args:
        file: Path of the JSON file to read.

    Returns:
        A data frame with one column per flattened key.

    Raises:
        OSError: If the file cannot be opened.
        json.JSONDecodeError: If the file content is not valid JSON.
    """
    # Imported locally so this function does not depend on a new file-level import.
    from io import StringIO

    # Read as UTF-8 explicitly rather than relying on the platform default encoding.
    with open(file, encoding="utf-8") as data_file:
        data = json.load(data_file)

    if isinstance(data, str):
        # Wrap the text in a buffer: passing a literal JSON string to
        # pd.read_json is deprecated and may be mistaken for a file path.
        records = pd.read_json(StringIO(data)).to_dict("records")
    else:
        # Already-decoded records; pd.read_json cannot take these.
        records = data
    return pd.json_normalize(records, sep="_")
    
# file = "/Users/reneboygarcia/Library/CloudStorage/GoogleDrive-reneboygarcia@gmail.com/My Drive/Personal/Data Science Notebook/Data Engineering-Zoomcamp/week_7_capstone_project/albums-json/albums-full-info-10.json"
# df = read_df(file)
# df.head()

In [67]:
# Seq 2-Define a function to tweak the data frame
def tweak_df(df: pd.DataFrame) -> pd.DataFrame:
    """Report the size of ``df`` and hand it back untouched.

    Currently a pass-through hook: it prints the number of rows and returns
    the very same data frame object it received.

    Args:
        df: Data frame to inspect.

    Returns:
        The input data frame (same object, not a copy).
    """
    row_count, _ = df.shape
    print(f"Number of rows: {row_count}")
    return df

In [68]:
# Seq 3-Define a function to write the data frame to a local parquet file and return its path (the same path is used in GCS)
def write_local(df: pd.DataFrame, filename: str) -> Path:
    """Write ``df`` as a snappy-compressed parquet file under ``bandcamp/``.

    The target keeps any directory part of ``filename`` and swaps only the
    final extension for ``.parquet``; e.g. ``albums-json/info-10.json`` becomes
    ``bandcamp/albums-json/info-10.parquet``.

    Args:
        df: Data frame to serialise.
        filename: Name (or relative path) of the source file the frame came from.

    Returns:
        The parquet path. Writing is best-effort: if an ``OSError`` occurs the
        error is printed and the path is still returned, so the file may not
        exist.
    """
    directory = Path("bandcamp")
    source = Path(filename)
    # Use parent/stem rather than splitting on the first ".", which would
    # truncate paths whose directories contain dots (e.g. "v1.2/info.json").
    path_name = directory / source.parent / f"{source.stem}.parquet"
    try:
        # exist_ok: a pre-existing directory must not abort the write;
        # parents: also create sub-directories carried over from `filename`.
        path_name.parent.mkdir(parents=True, exist_ok=True)
        df.to_parquet(path_name, compression="snappy", index=False)
    except OSError as error:
        print(error)
    return path_name

In [69]:
# Seq 4-Define a function to upload local file to GCS Bucket
def write_to_gcs(path: Path) -> None:
    """Upload the file at ``path`` to the configured GCS bucket.

    Loads the ``prefect-gcs-block-bandcamp`` block and uploads the local file,
    using the same ``path`` value as the destination inside the bucket.

    Args:
        path: Local file to upload; also passed as the destination path.
    """
    bucket = GcsBucket.load("prefect-gcs-block-bandcamp")
    bucket.upload_from_path(from_path=path, to_path=path)
    print("Hooray, we uploaded a huge file in GCS")

In [70]:
# Seq 5-Delete local file and its directory
def duduplicate(path: Path) -> None:
    """Remove the local file at ``path`` and then its parent directory.

    The file is unlinked first; the parent directory of the resolved path is
    then removed with ``rmdir``. Any ``OSError`` raised along the way is
    reported on stdout instead of being propagated.

    Args:
        path: Local file to delete.
    """
    try:
        path.unlink()
        parent_dir = path.resolve().parent
        parent_dir.rmdir()
        print("Successfully deleted directory and its files")
    except OSError as error:
        print(f"Unable to find directory: {error}")

In [71]:
# Define ETL from web to gcs:
def etl_web_to_gcs(file: str) -> None:
    """Run the per-file pipeline: read, tweak, write parquet, upload, clean up.

    Args:
        file: Path of the JSON file to process. It is also passed to
            ``write_local`` to derive the parquet file name.
    """
    # Seq 1 - Read the file into a data frame
    df = read_df(file)
    # Seq 2 - Tweak the data frame
    df_ = tweak_df(df)
    # Seq 3 - Write the *tweaked* frame to parquet. The raw frame was written
    # before, which would silently discard anything tweak_df changes.
    path_file = write_local(df_, file)
    # Seq 4 - Upload the local parquet file to the GCS bucket
    write_to_gcs(path_file)
    # Seq 5 - Remove the local copy now that it has been uploaded
    duduplicate(path_file)

In [79]:
# Define download progress hook
def download_progress_hook(block_num, block_size, total_size, progress_bar=None):
    """Report download progress; usable as ``urlretrieve``'s ``reporthook``.

    ``urlretrieve`` invokes the hook with only ``(block_num, block_size,
    total_size)`` and ignores the return value, so the bar cannot be threaded
    through the ``progress_bar`` argument. When no bar is passed, the active
    bar is therefore kept on the function itself and reused for every block of
    the same download; a fresh bar is started whenever ``block_num`` is 0.

    Args:
        block_num: Number of blocks transferred so far.
        block_size: Size of one block in bytes.
        total_size: Total size in bytes; a negative value means "unknown".
        progress_bar: Optional bar to update instead of the shared one. It
            must expose ``n``, ``update(delta)`` and ``close()``.

    Returns:
        The progress bar that was updated.
    """
    size_known = total_size >= 0

    # Identity check (not truthiness) so a caller-supplied bar is always used.
    if progress_bar is None:
        progress_bar = getattr(download_progress_hook, "_active_bar", None)
        if progress_bar is None or block_num == 0:
            if progress_bar is not None:
                # A previous download never reached its end; tidy it up.
                progress_bar.close()
            progress_bar = tqdm(
                total=total_size if size_known else None,
                unit="B",
                unit_scale=True,
            )
            download_progress_hook._active_bar = progress_bar

    downloaded = block_num * block_size
    if size_known:
        # The last block is usually partial; never report more than the total.
        downloaded = min(downloaded, total_size)
    progress_bar.update(downloaded - progress_bar.n)

    # With an unknown size there is no completion signal, so the bar is left
    # open and closed when the next download starts.
    if size_known and downloaded >= total_size:
        progress_bar.close()
        if getattr(download_progress_hook, "_active_bar", None) is progress_bar:
            download_progress_hook._active_bar = None
    return progress_bar

In [73]:
# Seq 0 -Download file folder from web
def fetch_data(url: str):
    """Download ``url`` into the working directory and unpack it if it is a zip.

    The local file name is the last path segment of the URL with any query
    string removed. Progress is reported through ``download_progress_hook``.

    Args:
        url: Location of the file to download.

    Returns:
        For a ``.zip`` download: the directory path shared by every archive
        member, relative to the working directory the archive is extracted
        into (an empty string if the members share no directory or the archive
        is empty). Otherwise: the ``(local_filename, headers)`` tuple returned
        by ``urlretrieve``.
    """
    folder_name = url.split("/")[-1].split("?")[0]
    file_folder = urlretrieve(url, folder_name, reporthook=download_progress_hook)
    if folder_name.endswith(".zip"):
        # Context manager so the archive handle is always closed.
        with ZipFile(folder_name) as zip_file:
            members = [name.strip("/") for name in zip_file.namelist()]
            zip_file.extractall()
        members = [name for name in members if name]
        # commonpath compares whole path components; commonprefix compares
        # characters and would turn "a/x-1.json" + "a/x-10.json" into "a/x-1".
        folder_name_ = os.path.commonpath(members) if members else ""
        print("Download Complete..extracted zip file")
        print(f"Extracted folder path: {folder_name_}")
        return folder_name_
    print("Download Complete..")
    return file_folder

In [74]:
# Define a parent ETL to download the files
# Module-level placeholder kept from the original notebook; nothing in this
# cell reads it.
progress_bar = None


def elt_parent_web_gcs(
    dataset_url: str = "https://www.dropbox.com/s/a1kl5e35j4o53mz/bandcamp-items-json.zip?dl=1",
    max_files: Optional[int] = 1,
) -> None:
    """Download a dataset archive and run ``etl_web_to_gcs`` on its JSON files.

    Args:
        dataset_url: Archive to download; defaults to the bandcamp items zip.
        max_files: Upper bound (non-negative) on how many ``.json`` files are
            processed. Defaults to 1; pass ``None`` to process every file.
    """
    # Seq 0 - Download and extract the archive
    file_folder = fetch_data(dataset_url)

    print("Running etl_web_to_gcs...this will take sometime..grab some coffee or tea")
    # Filter *before* applying the cap and sort for a stable order: os.listdir
    # returns entries in arbitrary order, so capping first could select a
    # non-JSON entry and upload nothing at all.
    json_files = sorted(
        name for name in os.listdir(file_folder) if name.endswith(".json")
    )
    for file in json_files[:max_files]:
        file_path = os.path.join(file_folder, file)
        print(f"Running: {file}")
        etl_web_to_gcs(file_path)
        print(f"Done uploading {file} to GCS")
    print("All files are Uploaded")

In [None]:
# dataset_url = "https://www.dropbox.com/s/a1kl5e35j4o53mz/bandcamp-items-json.zip?dl=1"
# Fetch the sales archive, then print the name of every JSON file found in
# the folder that fetch_data reports.
dataset_url_2 = "https://www.dropbox.com/s/wd38q80el16i19q/1000000-bandcamp-sales.zip?dl=1"
file_folder = fetch_data(dataset_url_2)
for file in os.listdir(file_folder):
    if not file.endswith(".json"):
        continue
    print(file)

In [76]:
# print(os.listdir(file_folder))
# print(os.listdir("albums-json")[0])

In [None]:
# Entry point: run the parent ETL only when this code executes as the main
# module, not when it is imported.
if __name__ == "__main__":
    elt_parent_web_gcs()