In [1]:
#
# Copyright 2021 Rovio Entertainment Corporation
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#

# PRE-REQUISITES

Set these shell variables first; the commands below refer to them:

        AWS_PROFILE=smoke
        JAR_BUCKET=<REPLACE THIS>

## IF TESTING PYTHON CHANGES MANUALLY

3. Build a zip of the python wrapper:

        cd python \
          && zip --exclude='*.pyc' --exclude='*__pycache__*' --exclude='*~' --exclude='.pytest_cache' \
            -FSr ../target/rovio_ingest.zip rovio_ingest ; cd ..

4. Copy the zip to s3:

        aws s3 --profile $AWS_PROFILE cp \
          target/rovio_ingest.zip \
          s3://$JAR_BUCKET/tmp/juho/druid/python/rovio_ingest.zip

Then flip the `spark.submit.pyFiles` toggle in the `spark_conf` cell below from `False` to `True` so that the uploaded zip is used,
and skip the cell that calls `install_pypi_package`.

## IF TESTING JAR CHANGES MANUALLY:

1. Build the package (shaded jar) on command line:

        mvn package -DskipTests

2. A) Copy the shaded jar to s3:

        aws s3 --profile $AWS_PROFILE cp \
          target/rovio-ingest-1.0.6_spark_3.0.1-SNAPSHOT.jar \
          s3://$JAR_BUCKET/tmp/juho/druid/jars/rovio-ingest-1.0.6_spark_3.0.1-SNAPSHOT.jar

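2. B) Or copy the plain (non-shaded) jar to s3 (note that the `spark_conf` cell below points at the shaded jar, so adjust the file name there if you use this one):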

        aws s3 --profile $AWS_PROFILE cp \
          target/original-rovio-ingest-1.0.6_spark_3.0.1-SNAPSHOT.jar \
          s3://$JAR_BUCKET/tmp/juho/druid/jars/original-rovio-ingest-1.0.6_spark_3.0.1-SNAPSHOT.jar

Then flip the `spark.jars` toggle in the `spark_conf` cell below from `False` to `True` so that the uploaded jar is used instead of the published Maven package.

In [2]:
%load_ext sparkmagic.magics

In [3]:
# AWS profile name used for the local boto3 session (same value as AWS_PROFILE in the pre-requisites).
ENV = "smoke"
# S3 key prefix under which the manually built zip / jar are looked up in the packages bucket.
PREFIX = "tmp/juho/"

In [4]:
import boto3

# SSM client for the local kernel, created from the AWS profile named by ENV.
ssm_client = boto3.session.Session(profile_name=ENV).client(service_name="ssm")

# secrets can be added at
# https://console.aws.amazon.com/systems-manager/parameters/?region=us-east-1
def get_param(secret_name: str) -> str:
    """Return the value of the SSM parameter ``/dataengineering/<secret_name>``.

    Args:
        secret_name: Parameter path relative to the ``/dataengineering/`` prefix.

    Returns:
        The ``Value`` field of the parameter returned by SSM.

    Any error raised by ``ssm_client.get_parameter`` (for example a missing
    parameter) propagates to the caller.
    """
    response = ssm_client.get_parameter(
        Name="/dataengineering/" + secret_name,
        # Decrypt SecureString parameters; SSM ignores this flag for String and
        # StringList parameters, so plain parameters behave exactly as before.
        WithDecryption=True,
    )
    return response["Parameter"]["Value"]

In [5]:
import json
from IPython import get_ipython

def set_spark_config(conf_dict):
    """Apply ``conf_dict`` as the sparkmagic session configuration.

    The dict is serialised to JSON and handed to the ``%%spark config`` cell
    magic as its cell body.
    """
    shell = get_ipython()
    config_json = json.dumps(conf_dict)
    shell.run_cell_magic('spark', 'config', config_json)

def create_spark_session_with_host(host):
    """Start a Python session through the ``%spark add`` line magic.

    The endpoint passed to the magic is ``http://<host>:8998``.
    """
    shell = get_ipython()
    magic_args = 'add -l python -u http://{}:8998'.format(host)
    shell.run_line_magic('spark', magic_args)

In [14]:
packages_bucket = get_param("rovio-ingest/packages_bucket")

# Toggles for this cell; invert a boolean to switch the corresponding behaviour.
# Assuming AWS EMR: add the native virtualenv settings for PySpark.
USE_EMR_VIRTUALENV = True
# Test with a manually built & copied zip instead of published package from PyPI.
USE_MANUAL_PYTHON_ZIP = False
# Test with a manually built & copied jar instead of published package from maven.
USE_MANUAL_JAR = False

spark_conf = {
  "conf": {
    "spark.hadoop.mapreduce.input.fileinputformat.input.dir.recursive": "true",
    "spark.sql.hive.caseSensitiveInferenceMode": "NEVER_INFER",
    "spark.pyspark.python": "python3",
    "spark.sql.session.timeZone": "UTC",
    # alternative if using a snapshot version
#    "spark.jars.repositories": "https://s01.oss.sonatype.org/content/repositories/snapshots",
#    "spark.jars.packages": "com.rovio.ingest:rovio-ingest:1.0.6_spark_3.0.1-SNAPSHOT"
    "spark.jars.packages": "com.rovio.ingest:rovio-ingest:1.0.5_spark_3.0.1"
  }
}

if USE_EMR_VIRTUALENV:
    # "spark.pyspark.python" is already set to "python3" in the base config above,
    # so only the virtualenv-specific keys are added here.
    spark_conf["conf"]["spark.pyspark.virtualenv.enabled"] = "true"
    spark_conf["conf"]["spark.pyspark.virtualenv.type"] = "native"
    spark_conf["conf"]["spark.pyspark.virtualenv.bin.path"] = "/usr/bin/virtualenv"

if USE_MANUAL_PYTHON_ZIP:
    spark_conf["conf"]["spark.submit.pyFiles"] = \
        f"s3://{packages_bucket}/{PREFIX}druid/python/rovio_ingest.zip"

if USE_MANUAL_JAR:
    spark_conf["conf"]["spark.jars"] = \
        f"s3://{packages_bucket}/{PREFIX}druid/jars/rovio-ingest-1.0.6_spark_3.0.1-SNAPSHOT.jar"
    # The manually copied jar replaces the package that would be resolved from maven.
    del spark_conf["conf"]["spark.jars.packages"]

set_spark_config(spark_conf)
create_spark_session_with_host(get_param("spark3/shared/host"))

# to debug problems in session creation, see livy session logs at http://{host}:8998/ui

Starting Spark application


ID,YARN Application ID,Kind,State,Spark UI,Driver log,Current session?
7,application_1673361870104_0145,pyspark,idle,Link,Link,✔


FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

SparkSession available as 'spark'.


In [15]:
%%spark

# This extension is provided by AWS EMR.
# Uncomment the next line to list the Python packages available to the session:
# spark.sparkContext.list_packages()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

In [8]:
%%spark

# This extension is provided by AWS EMR.
# If not on EMR:
#    A) install the module with pip on the cluster before creating the spark session
#    B) build a zip & use with spark.submit.pyFiles
# NOTE: skip this cell when testing a manually built zip via spark.submit.pyFiles.
# Use the latest stable release from PyPI (unpinned, so the installed version changes over time).
spark.sparkContext.install_pypi_package("rovio-ingest")
# Use a specific version to install a pre-release from PyPI.
#spark.sparkContext.install_pypi_package("rovio-ingest==0.0.1.dev14")

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

Collecting rovio-ingest
  Downloading https://files.pythonhosted.org/packages/e2/94/c268caf284b71b3a26f61b8f92abad35a89a18efff66f77d3424cda1ab7c/rovio_ingest-1.0.1-py3-none-any.whl
Collecting pyspark<4.0.0,>=3.0.0 (from rovio-ingest)
Collecting py4j==0.10.9 (from pyspark<4.0.0,>=3.0.0->rovio-ingest)
  Using cached https://files.pythonhosted.org/packages/9e/b6/6a4fb90cd235dc8e265a6a2067f2a2c99f0d91787f06aca4bcf7c23f3f80/py4j-0.10.9-py2.py3-none-any.whl
Installing collected packages: py4j, pyspark, rovio-ingest
Successfully installed py4j-0.10.9 pyspark-3.1.1 rovio-ingest-1.0.1

In [16]:
%%spark

import boto3

# SSM client for the %%spark session: no AWS profile is passed here, only an explicit region.
ssm_client = boto3.session.Session(region_name="us-east-1").client(service_name="ssm")

def get_param(secret_name: str) -> str:
    """Return the value of the SSM parameter ``/dataengineering/<secret_name>``.

    Args:
        secret_name: Parameter path relative to the ``/dataengineering/`` prefix.

    Returns:
        The ``Value`` field of the parameter returned by SSM.

    Any error raised by ``ssm_client.get_parameter`` (for example a missing
    parameter) propagates to the caller.
    """
    response = ssm_client.get_parameter(
        Name="/dataengineering/" + secret_name,
        # Decrypt SecureString parameters (e.g. a database password); SSM ignores
        # this flag for String and StringList parameters, so plain parameters
        # behave exactly as before.
        WithDecryption=True,
    )
    return response["Parameter"]["Value"]

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

In [23]:
%%spark
from datetime import datetime
from pyspark.sql import functions as f, types as t, SparkSession

# Re-bind the session that is already available as `spark`, only to attach a type annotation.
spark: SparkSession = spark

# Two days of sample metrics for a single app.
schema = 'dau:BIGINT, revenue:DOUBLE, app_id:STRING, event_date:TIMESTAMP'
df = spark.createDataFrame(
    data=[
        [5, 30.0, 'testclient', datetime(2019, 10, 1)],
        [2, 15.0, 'testclient', datetime(2019, 10, 2)],
    ],
    schema=schema,
)
df.show()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

+---+-------+----------+-------------------+
|dau|revenue|    app_id|         event_date|
+---+-------+----------+-------------------+
|  5|   30.0|testclient|2019-10-01 00:00:00|
|  2|   15.0|testclient|2019-10-02 00:00:00|
+---+-------+----------+-------------------+

In [24]:
%%spark

from py4j.java_gateway import java_import
from rovio_ingest import DRUID_SOURCE
from rovio_ingest.extensions.dataframe_extension import ConfKeys, add_dataframe_druid_extension

# fix df.explain on EMR 6
# (imports org.apache.spark.sql.api.python.* into the JVM view reached through the Spark context)
java_import(spark._sc._jvm, "org.apache.spark.sql.api.python.*")

# Presumably registers the Druid helper methods (such as repartition_by_druid_segment_size,
# used below) on DataFrame — TODO confirm against rovio_ingest.
add_dataframe_druid_extension()

# Prepare the frame for ingestion, using event_date as the time column and DAY segment granularity.
df_prepared = df.repartition_by_druid_segment_size('event_date', segment_granularity='DAY')
# True requests the extended output, which starts from the parsed logical plan.
df_prepared.explain(True)

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

== Parsed Logical Plan ==
Project [dau#99L, revenue#100, app_id#101, event_date#102, __PARTITION_TIME__#130, __PARTITION_NUM__#147]
+- RepartitionByExpression [__PARTITION_TIME__#130, __PARTITION_NUM__#147]
   +- Project [dau#99L, revenue#100, app_id#101, event_date#102, __PARTITION_TIME__#130, __num_rows__#140, cast((cast((__num_rows__#140 - 1) as double) / cast(5000000 as double)) as int) AS __PARTITION_NUM__#147]
      +- Project [dau#99L, revenue#100, app_id#101, event_date#102, __PARTITION_TIME__#130, __num_rows__#140]
         +- Project [dau#99L, revenue#100, app_id#101, event_date#102, __PARTITION_TIME__#130, __num_rows__#140, __num_rows__#140]
            +- Window [row_number() windowspecdefinition(__PARTITION_TIME__#130, __PARTITION_TIME__#130 ASC NULLS FIRST, specifiedwindowframe(RowFrame, unboundedpreceding$(), currentrow$())) AS __num_rows__#140], [__PARTITION_TIME__#130], [__PARTITION_TIME__#130 ASC NULLS FIRST]
               +- Project [dau#99L, revenue#100, app_id#101

In [25]:
%%spark
# The prepared frame carries two extra columns: __PARTITION_TIME__ and __PARTITION_NUM__.
df_prepared.printSchema()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

root
 |-- dau: long (nullable = true)
 |-- revenue: double (nullable = true)
 |-- app_id: string (nullable = true)
 |-- event_date: timestamp (nullable = true)
 |-- __PARTITION_TIME__: timestamp (nullable = true)
 |-- __PARTITION_NUM__: integer (nullable = true)

In [26]:
%%spark
# Peek at the prepared rows, including the added partition columns.
df_prepared.show()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

+---+-------+----------+-------------------+-------------------+-----------------+
|dau|revenue|    app_id|         event_date| __PARTITION_TIME__|__PARTITION_NUM__|
+---+-------+----------+-------------------+-------------------+-----------------+
|  5|   30.0|testclient|2019-10-01 00:00:00|2019-10-01 00:00:00|                0|
|  2|   15.0|testclient|2019-10-02 00:00:00|2019-10-02 00:00:00|                0|
+---+-------+----------+-------------------+-------------------+-----------------+

In [27]:
%%spark

DATA_SOURCE_NAME = "rovio_ingest_test_juho"

# Overwrite the Druid datasource with the prepared frame.
# The metadata DB connection details and the deep storage bucket are read through get_param.
(
    df_prepared.write
    .mode("overwrite")
    .format(DRUID_SOURCE)
    .option(ConfKeys.DATA_SOURCE, DATA_SOURCE_NAME)
    .option(ConfKeys.TIME_COLUMN, "event_date")
    .option(ConfKeys.METADATA_DB_URI, get_param("druid/metadata_db/uri"))
    .option(ConfKeys.METADATA_DB_USERNAME, get_param("druid/metadata_db/username"))
    .option(ConfKeys.METADATA_DB_PASSWORD, get_param("druid/metadata_db/password"))
    .option(ConfKeys.DEEP_STORAGE_S3_BUCKET, get_param("druid/deep_storage/bucket"))
    .option(ConfKeys.DEEP_STORAGE_S3_BASE_KEY, "druid/segments")
    .save()
)

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

To list the written data you can run:

    aws s3 --profile smoke ls --recursive \
      s3://{druid-deep-storage-bucket}/druid/segments/rovio_ingest_test_juho/

You should see something like:

    2023-02-05 23:22:57       1058 druid/segments/rovio_ingest_test_juho/2019-10-01T00:00:00.000Z_2019-10-02T00:00:00.000Z/2023-02-05T21:22:52.965Z/0/index.zip
    2023-02-05 23:22:57       1056 druid/segments/rovio_ingest_test_juho/2019-10-02T00:00:00.000Z_2019-10-03T00:00:00.000Z/2023-02-05T21:22:52.965Z/0/index.zip

Then query the datasource with Druid SQL (e.g. over JDBC); the expected rows are shown below the query:

    SELECT * FROM rovio_ingest_test_juho LIMIT 10;

    __time	app_id	dau	revenue
    2019-10-01 00:00:00	testclient	5	30
    2019-10-02 00:00:00	testclient	2	15

In [28]:
# Presumably tears down the session(s) created by this notebook — confirm with the sparkmagic `%spark?` help.
%spark cleanup