In [1]:
import os
import zipfile

from pyspark.sql.types import *
from pyspark.sql.functions import broadcast, lit

In [2]:
%run ./0-Config

In [3]:
%run ./0-Functions

In [4]:
# Field separator passed as the "delimiter" option when the dataset_1 aggregate CSV is read.
delimiter_dataset_1_aggregate_data = ","

In [5]:
# Explicit schema for the dataset_1 aggregate CSV: a nullable integer "id"
# followed by six nullable string columns, in file order.
schema_dataset_1_aggregate = StructType(
  [StructField("id", IntegerType(), True)]
  + [
    StructField(column_name, StringType(), True)
    for column_name in (
      "useful_column_1",
      "useful_column_2",
      "useful_column_3",
      "useless_column_1",
      "useless_column_2",
      "useless_column_3",
    )
  ]
)

In [6]:
# Resolve the locations used to stage the archive:
#   - the zip file under adls2uri_raw,
#   - the working folder the zip is copied into,
#   - the zip's own path inside that working folder.
file_path_dataset_1_aggregate_data_initial = adls2uri_raw + "dataset_1/zip1.zip"
dataset_1_aggregate_data_path_local_folder = '/Shared/dataset_1/'
dataset_1_aggregate_data_path_local = dataset_1_aggregate_data_path_local_folder + 'zip1.zip'

# Echo each resolved path so it is visible in the cell output.
print(f"file_path_dataset_1_aggregate_data_initial = {file_path_dataset_1_aggregate_data_initial}")
print(f"dataset_1_aggregate_data_path_local_folder = {dataset_1_aggregate_data_path_local_folder}")
print(f"dataset_1_aggregate_data_path_local = {dataset_1_aggregate_data_path_local}")

In [7]:
# Copy the zip file to the local Databricks file system, since it has to be unzipped
# there to get to the individual file it contains.
# NOTE(review): the third argument is presumably the `recurse` flag of dbutils.fs.cp — confirm.

dbutils.fs.cp(file_path_dataset_1_aggregate_data_initial, dataset_1_aggregate_data_path_local, False)

In [8]:
# Show the contents of the working folder after the copy.
display(dbutils.fs.ls(dataset_1_aggregate_data_path_local_folder))

In [9]:
# Shell command to get detailed info about the zip file
# %sh
# zipinfo -v '/dbfs/Shared/dataset_1/zip1.zip'

In [10]:
# Testing shell command to unzip the data file, to compare output to Python zipfile
# We'll keep using the Python approach so we can use dynamic file names etc.

# %sh
# unzip '/dbfs/Shared/dataset_1/zip1.zip' -d '/dbfs/Shared/dataset_1/zip1/'

In [11]:
# Extract the copied archive into the same working folder.
# The paths are prefixed with '/dbfs' so the standard-library zipfile module can
# reach them as ordinary local file paths.
# The `with` block closes the archive even when extraction raises, so no file
# handle is leaked on failure.
with zipfile.ZipFile('/dbfs' + dataset_1_aggregate_data_path_local) as zf:
  # For investigation, list out the uncompressed file sizes of zipped file(s)
  # uncompress_size = sum((file.file_size for file in zf.infolist()))
  # print(uncompress_size)

  zf.extractall(path = '/dbfs' + dataset_1_aggregate_data_path_local_folder)

In [12]:
# Show the contents of the working folder again, now that the archive has been extracted.
display(dbutils.fs.ls(dataset_1_aggregate_data_path_local_folder))

In [13]:
# For investigation purposes, get detailed filesystem info for each file

# for f in os.scandir('/dbfs' + dataset_1_aggregate_data_path_local_folder):
#   print(f.name + " | " + str(f.stat()))

In [14]:
# Path of the CSV file that is expected in the working folder after extraction.
file_path_dataset_1_aggregate_data = f"{dataset_1_aggregate_data_path_local_folder}zip1.csv"
print(f"file_path_dataset_1_aggregate_data = {file_path_dataset_1_aggregate_data}")

In [15]:
# Read the extracted CSV into a DataFrame using the explicit schema, with the
# header option enabled and the configured delimiter.
df_dataset_1_aggregate_data = (
  spark.read
  .format("csv")
  .schema(schema_dataset_1_aggregate)
  .option("header", "true")
  .option("delimiter", delimiter_dataset_1_aggregate_data)
  .load(file_path_dataset_1_aggregate_data)
)

In [16]:
# Row count of the loaded data, shown as the cell output.
df_dataset_1_aggregate_data.count()

In [17]:
# Replace the source column names, which contain illegal characters and have
# other problems, with clean identifiers.
# NOTE(review): the DataFrame is read with an explicit schema that already names
# these columns "id" and "useful_column_1", so these renames may have nothing
# left to do — confirm whether this cell is still needed.
df_dataset_1_aggregate_data = (
  df_dataset_1_aggregate_data
  .withColumnRenamed("ID", "id")
  .withColumnRenamed("Useful Column 1 (with bad name)", "useful_column_1")
)

In [18]:
# EDA: display the output of DataFrame.describe() for the loaded data.

display(df_dataset_1_aggregate_data.describe())

In [19]:
# Render the DataFrame itself for visual inspection.
display(df_dataset_1_aggregate_data)

In [20]:
# Destination folder for the Parquet output, under adls2uri_staging1.
# NOTE(review): the 2019/06/01 path segment is hard-coded — confirm whether it should be derived per run.
target_path_dataset_1_aggregate_data = adls2uri_staging1 + "Shared/data/2019/06/01/"

In [21]:
# Persist to Parquet at the target path, after coalescing the DataFrame to 8 partitions.
# NOTE(review): no save mode is set, so with Spark's default mode this write is
# expected to fail if the target path already exists — confirm that is intended for re-runs.

df_dataset_1_aggregate_data.coalesce(8).write.parquet(target_path_dataset_1_aggregate_data)

In [22]:
# Run the cleanup helper against the Parquet output folder.
# The argument is the target path variable this notebook defines; the earlier
# camelCase spelling referred to a name that is never assigned here.
# NOTE(review): CleanupSparkJobFiles is not defined in this notebook (presumably it
# comes from ./0-Functions); judging by its name it removes Spark job bookkeeping
# files — confirm there.
CleanupSparkJobFiles(target_path_dataset_1_aggregate_data)

In [23]:
# Delete the DBFS local folder where we copied and unzipped the data file.
# NOTE(review): the second argument is presumably the `recurse` flag of dbutils.fs.rm — confirm.

dbutils.fs.rm(dataset_1_aggregate_data_path_local_folder, True)