#### Folder Structure

- dataset
  - raw
      - csv : contains unzipped csv files
      - zip : contains the original zip file from kaggle
      - staging : the Kaggle archive unzips into nested .csv.zip files, so they are extracted here first and then unzipped into csv
  - data_warehouse
      - contains cleaned and transformed tables in parquet

In [1]:
import os
import zipfile

#### Define or create dataset directories

In [5]:
def make_dir(root: str, directory_name: str) -> str:
    """Create ``directory_name`` beneath ``root`` and return its full path.

    ``directory_name`` may itself contain several path components; any
    missing intermediate directories are created. A directory that already
    exists is left untouched and does not raise.
    """
    target = os.path.join(root, directory_name)
    os.makedirs(target, exist_ok=True)
    return target

In [24]:
# Root of the dataset tree, created under the current working directory.
dataset_dir = make_dir(os.getcwd(), "dataset")

# The three raw sub-directories share the same "raw" parent, so build them
# in one pass (csv, then zip, then staging).
raw_csv_dir, raw_zip_dir, raw_staging_dir = (
    make_dir(dataset_dir, os.path.join("raw", leaf))
    for leaf in ("csv", "zip", "staging")
)

# Destination for the cleaned and transformed tables.
data_warehouse_dir = make_dir(dataset_dir, "data_warehouse")

In [13]:
# Full path of the Kaggle archive inside the raw zip directory.
zip_data = os.path.join(raw_zip_dir,"springleaf-marketing-response.zip")

#### Unzip files

In [20]:
def unzip_file(source_path: str,
               destination_path: str,
               drop_source_zip=False):
    """Extract every member of a zip archive into a directory.

    Args:
        source_path: Path of the zip archive to read.
        destination_path: Directory the archive members are extracted into.
        drop_source_zip: When true, delete the archive at ``source_path``
            once extraction has finished.
    """
    with zipfile.ZipFile(source_path, mode='r') as archive:
        archive.extractall(path=destination_path)

    if not drop_source_zip:
        return
    # Only reached after the archive has been closed by the ``with`` block.
    os.remove(source_path)

In [21]:
def clean_raw_csv_directory(csv_directory: str):
    """Delete every entry in ``csv_directory`` whose name ends with '.csv'.

    Entries with any other suffix are left in place. Only the directory's
    immediate contents are examined.
    """
    csv_names = [
        entry for entry in os.listdir(csv_directory) if entry.endswith('.csv')
    ]
    for entry in csv_names:
        os.remove(os.path.join(csv_directory, entry))

In [25]:
# Remove CSV files left in the raw csv directory by a previous run.
clean_raw_csv_directory(raw_csv_dir)

# Extract the top-level archive into the staging directory first.
unzip_file(source_path=zip_data, destination_path=raw_staging_dir)

# Move the staged content into the raw csv directory: nested archives are
# extracted there (and then deleted from staging), plain CSV files are moved.
for file_name in os.listdir(raw_staging_dir):
    if file_name.endswith('.csv.zip'):
        source_path = os.path.join(raw_staging_dir, file_name)
        unzip_file(source_path, raw_csv_dir, drop_source_zip=True)
        continue
    if file_name.endswith('.csv'):
        os.rename(
            os.path.join(raw_staging_dir, file_name),
            os.path.join(raw_csv_dir, file_name),
        )