# From Delta Lake to Amazon SageMaker

## 1 - Data Exploration and Visualization

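In this notebook, we will start a Spark session with Delta Lake support, query a Delta table stored in Amazon S3 using Spark SQL, and profile the resulting data with pandas-profiling.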

In [1]:
# Show which version of the SageMaker Python SDK this kernel has installed
import sagemaker
sagemaker.__version__

'2.107.0'

In [2]:
# Import the PySpark entry points, plus NumPy and pandas
# (the Spark session itself is built in a later cell)
from pyspark import SparkContext, SparkConf
from pyspark.sql import SparkSession, SQLContext
import numpy as np
import pandas as pd

In [3]:
# Extra JARs for Spark, given as Maven coordinates (groupId:artifactId:version)
pkg_list = [
    "io.delta:delta-core_2.12:1.1.0",
    "org.apache.hadoop:hadoop-aws:3.2.2",
]

# Join the coordinates into one comma-separated string
packages = ",".join(pkg_list)
print(f'packages: {packages}')

packages: io.delta:delta-core_2.12:1.1.0,org.apache.hadoop:hadoop-aws:3.2.2


In [4]:
# Configure the Spark session builder, then create (or reuse) the session.
# Note: we use the `ContainerCredentialsProvider` to give us access to underlying IAM role permissions
# NOTE(review): Spark's documented route for Hadoop options is the `spark.hadoop.` prefix;
# the un-prefixed key below is kept as-is — confirm it reaches the S3A connector in your setup.
session_builder = (
    SparkSession.builder
    .appName("PySparkApp")
    # Maven coordinates of the Delta Lake and hadoop-aws JARs
    .config("spark.jars.packages", packages)
    # Register the Delta Lake SQL extension and catalog
    .config("spark.sql.extensions", "io.delta.sql.DeltaSparkSessionExtension")
    .config("spark.sql.catalog.spark_catalog", "org.apache.spark.sql.delta.catalog.DeltaCatalog")
    # Credentials provider used for s3a:// access
    .config("fs.s3a.aws.credentials.provider", 'com.amazonaws.auth.ContainerCredentialsProvider')
)
spark = session_builder.getOrCreate()

sc = spark.sparkContext

print(f'Spark version: {sc.version}')



:: loading settings :: url = jar:file:/opt/conda/lib/python3.8/site-packages/pyspark/jars/ivy-2.5.0.jar!/org/apache/ivy/core/settings/ivysettings.xml


Ivy Default Cache set to: /root/.ivy2/cache
The jars for the packages stored in: /root/.ivy2/jars
io.delta#delta-core_2.12 added as a dependency
org.apache.hadoop#hadoop-aws added as a dependency
:: resolving dependencies :: org.apache.spark#spark-submit-parent-24c10868-db4e-47e7-88f8-33b9875e84c5;1.0
	confs: [default]
	found io.delta#delta-core_2.12;1.1.0 in central
	found org.antlr#antlr4-runtime;4.8 in central
	found org.codehaus.jackson#jackson-core-asl;1.9.13 in central
	found org.apache.hadoop#hadoop-aws;3.2.2 in central
	found com.amazonaws#aws-java-sdk-bundle;1.11.563 in central
:: resolution report :: resolve 699ms :: artifacts dl 62ms
	:: modules in use:
	com.amazonaws#aws-java-sdk-bundle;1.11.563 from central in [default]
	io.delta#delta-core_2.12;1.1.0 from central in [default]
	org.antlr#antlr4-runtime;4.8 from central in [default]
	org.apache.hadoop#hadoop-aws;3.2.2 from central in [default]
	org.codehaus.jackson#jackson-core-asl;1.9.13 from central in [default]
	--------

Spark version: 3.2.0


In [6]:
import boto3
import sagemaker

# SageMaker session: provides the default S3 bucket (used for saving
# processing job outputs) and the region the session is bound to
sm_session = sagemaker.Session()
bucket = sm_session.default_bucket()
region = sm_session.boto_region_name

# Low-level SageMaker client and the execution role of this environment
sm_client = boto3.client('sagemaker')
iam_role = sagemaker.get_execution_role()

print(f'Default bucket: {bucket}')

Default bucket: sagemaker-eu-west-1-889960878219


In [7]:
# s3a:// URI of the Delta table inside the default bucket
s3a_delta_table_uri = f's3a://{bucket}/delta_to_sagemaker/delta_format/'
print(s3a_delta_table_uri)

s3a://sagemaker-eu-west-1-889960878219/delta_to_sagemaker/delta_format/


In [8]:
# Build the query: read every column of the Delta table at the S3 path
# (path-based `delta.` table reference), sorted by the `medv` column
sql_cmd = f'SELECT * FROM delta.`{s3a_delta_table_uri}` ORDER BY medv'
print('SQL command: ' + sql_cmd)

SQL command: SELECT * FROM delta.`s3a://sagemaker-eu-west-1-889960878219/delta_to_sagemaker/delta_format/` ORDER BY medv


In [9]:
# Execute SQL command which returns dataframe

sql_results = spark.sql(sql_cmd)
# Confirm the result type (a Spark DataFrame, not a pandas one)
print(type(sql_results))

# Preview the first 10 rows of the result
sql_results.show(10)

22/08/31 11:56:27 WARN MetricsConfig: Cannot locate configuration: tried hadoop-metrics2-s3a-file-system.properties,hadoop-metrics2.properties
                                                                                

<class 'pyspark.sql.dataframe.DataFrame'>


[Stage 8:>                                                          (0 + 1) / 1]

+-------+---+-----+----+-----+-----+----+------+---+---+-------+------+-----+----+
|   crim| zn|indus|chas|  nox|   rm| age|   dis|rad|tax|ptratio|     b|lstat|medv|
+-------+---+-----+----+-----+-----+----+------+---+---+-------+------+-----+----+
|14.3337|  0| 18.1|   0|  0.7| 4.88| 100|1.5895| 24|666|   20.2|372.92|30.62|10.2|
|12.2472|  0| 18.1|   0|0.584|5.837|59.7|1.9976| 24|666|   20.2| 24.65|15.69|10.2|
|17.8667|  0| 18.1|   0|0.671|6.223| 100|1.3861| 24|666|   20.2|393.74|21.78|10.2|
|88.9762|  0| 18.1|   0|0.671|6.968|91.9|1.4165| 24|666|   20.2| 396.9|17.21|10.4|
|25.9406|  0| 18.1|   0|0.679|5.304|89.1|1.6475| 24|666|   20.2|127.36|26.64|10.4|
|22.0511|  0| 18.1|   0| 0.74|5.818|92.4|1.8662| 24|666|   20.2|391.45|22.11|10.5|
|24.3938|  0| 18.1|   0|  0.7|4.652| 100|1.4672| 24|666|   20.2| 396.9|28.28|10.5|
|12.8023|  0| 18.1|   0| 0.74|5.854|96.6|1.8956| 24|666|   20.2|240.52|23.79|10.8|
|15.8744|  0| 18.1|   0|0.671|6.545|99.1|1.5192| 24|666|   20.2| 396.9|21.08|10.9|
|37.

                                                                                

In [13]:
!pip install pandas-profiling

Collecting pandas-profiling
  Using cached pandas_profiling-3.2.0-py2.py3-none-any.whl (262 kB)
Collecting phik>=0.11.1
  Using cached phik-0.12.2-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (696 kB)
Collecting visions[type_image_path]==0.7.4
  Using cached visions-0.7.4-py3-none-any.whl (102 kB)
Collecting missingno>=0.4.2
  Using cached missingno-0.5.1-py3-none-any.whl (8.7 kB)
Collecting multimethod>=1.4
  Using cached multimethod-1.8-py3-none-any.whl (9.8 kB)
Collecting pydantic>=1.8.1
  Using cached pydantic-1.10.1-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (13.5 MB)
Collecting imagehash
  Using cached ImageHash-4.2.1-py2.py3-none-any.whl
Installing collected packages: pydantic, multimethod, imagehash, visions, phik, missingno, pandas-profiling
Successfully installed imagehash-4.2.1 missingno-0.5.1 multimethod-1.8 pandas-profiling-3.2.0 phik-0.12.2 pydantic-1.10.1 visions-0.7.4
[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip 

In [None]:
from pandas_profiling import ProfileReport

# Convert the Spark query result to a pandas DataFrame, which is what ProfileReport is given
df = sql_results.toPandas()
# Build the profiling report; the bare `report` expression renders it as the cell output
report = ProfileReport(df)
report

                                                                                

Summarize dataset:   0%|          | 0/5 [00:00<?, ?it/s]