In [1]:
#
# Copyright 2021 Rovio Entertainment Corporation
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#

PRE-REQUISITES:

1. Build the package (shaded jar) on the command line:

        mvn package -DskipTests

2. A) Copy the shaded jar to S3 (the notebook uses this jar by default):

        AWS_PROFILE=smoke
        JAR_BUCKET=<REPLACE THIS>
        aws s3 --profile $AWS_PROFILE cp \
          target/rovio-ingest-1.0.0_spark_3.0.1-SNAPSHOT.jar \
          s3://$JAR_BUCKET/tmp/juho/druid/jars/rovio-ingest-1.0.0_spark_3.0.1-SNAPSHOT.jar

2. B) Alternatively, copy the plain jar to S3 (only needed for the plain jar + maven deps setup):

        aws s3 --profile $AWS_PROFILE cp \
          target/original-rovio-ingest-1.0.0_spark_3.0.1-SNAPSHOT.jar \
          s3://$JAR_BUCKET/tmp/juho/druid/jars/original-rovio-ingest-1.0.0_spark_3.0.1-SNAPSHOT.jar

3. Build a zip of the python wrapper:

        cd python \
          && zip --exclude='*.pyc' --exclude='*__pycache__*' --exclude='*~' --exclude='.pytest_cache' \
            -FSr ../target/rovio_ingest.zip rovio_ingest ; cd ..

4. Copy the zip to S3:

        aws s3 --profile $AWS_PROFILE cp \
          target/rovio_ingest.zip \
          s3://$JAR_BUCKET/tmp/juho/druid/python/rovio_ingest.zip

In [2]:
%load_ext sparkmagic.magics

In [3]:
# AWS named profile used for the local boto3 session; it has the same value
# as the AWS_PROFILE used in the pre-requisite upload commands.
ENV = "smoke"
# Key prefix under which the jar and the python zip are looked up in the packages bucket.
# It is concatenated directly into the S3 paths, so it must end with "/".
PREFIX = "tmp/juho/"

In [4]:
import boto3

# SSM client for the local (notebook-side) kernel, created from the named AWS profile in ENV.
ssm_client = boto3.session.Session(profile_name=ENV).client(service_name="ssm")

# secrets can be added at
# https://console.aws.amazon.com/systems-manager/parameters/?region=us-east-1
def get_param(secret_name: str) -> str:
    """Fetch a parameter value from the AWS SSM Parameter Store.

    Args:
        secret_name: Parameter path relative to the "/dataengineering/" namespace,
            e.g. "rovio-ingest/packages_bucket".

    Returns:
        The value of the parameter as a string.

    Raises:
        Whatever the boto3 SSM client raises when the lookup fails
        (e.g. the parameter does not exist or access is denied).
    """
    # WithDecryption=True makes SecureString parameters come back as plaintext
    # instead of ciphertext; the flag is ignored for String / StringList parameters.
    response = ssm_client.get_parameter(
        Name="/dataengineering/" + secret_name,
        WithDecryption=True,
    )
    return response["Parameter"]["Value"]

In [5]:
import json
from IPython import get_ipython

def set_spark_config(conf_dict):
    """Apply ``conf_dict`` as the sparkmagic session configuration.

    The dict is serialized to JSON and handed to the ``%%spark config``
    cell magic as its cell body.
    """
    shell = get_ipython()
    config_json = json.dumps(conf_dict)
    shell.run_cell_magic('spark', 'config', config_json)

def create_spark_session_with_host(host):
    """Start a python session through the ``%spark add`` line magic.

    The endpoint passed to the magic is ``http://<host>:8998``.
    """
    shell = get_ipython()
    endpoint_url = f"http://{host}:8998"
    shell.run_line_magic('spark', f"add -l python -u {endpoint_url}")

In [6]:
# Which rovio-ingest artifact to put on the Spark classpath:
#   True  -> A) the shaded jar only (the safe bet)
#   False -> B) the plain jar + maven deps (used to work with Spark 2, probably broken now)
USE_SHADED_JAR = True

# Version suffix shared by the shaded and the plain jar file names
JAR_VERSION = "1.0.0_spark_3.0.1-SNAPSHOT"

packages_bucket = get_param("rovio-ingest/packages_bucket")
# Common S3 root of the jar and python zip referenced below
artifacts_root = f"s3://{packages_bucket}/{PREFIX}druid"

spark_conf = {
  "conf": {
    "spark.hadoop.mapreduce.input.fileinputformat.input.dir.recursive": "true",
    "spark.sql.hive.caseSensitiveInferenceMode": "NEVER_INFER",
    "spark.pyspark.python": "python3",
    "spark.sql.session.timeZone": "UTC",
    "spark.submit.pyFiles": f"{artifacts_root}/python/rovio_ingest.zip",
  }
}

if USE_SHADED_JAR:
  # A) if using the shaded jar, only this jar:
  spark_conf["conf"]["spark.jars"] = f"{artifacts_root}/jars/rovio-ingest-{JAR_VERSION}.jar"
else:
  # NOTE!! The safe bet against all the hacks below is to just build the shaded jar and use it ^

  # The conf below used to work with Spark 2, but is probably broken now

  # B) anyway, if using the plain jar + maven deps:

  # required to avoid guava version conflicts
  # (with shaded jar there would be no such problem)
  spark_conf["conf"]["spark.executor.userClassPathFirst"] = "true"
  spark_conf["conf"]["spark.driver.userClassPathFirst"] = "true"

  # manually specified maven dependencies as in pom.xml to be able to use a plain (not uber) jar
  # (these are not going to be needed eventually. instead only: "com.rovio:rovio-ingest:1.0")
  maven_deps = [
    "org.apache.druid:druid-server:0.13.0-incubating",
    "org.apache.druid.extensions:druid-s3-extensions:0.13.0-incubating",
    "org.apache.druid.extensions:mysql-metadata-storage:0.13.0-incubating",
    "mysql:mysql-connector-java:5.1.38",
  ]

  maven_excludes = [
    # UNSAFE! But seems to work as a workaround for now.
    # Without this exclusion druid ingestion fails on EMR
    #
    # The error from Livy Session log is:
    # 20/04/13 07:38:14 ERROR SparkContext: Error initializing SparkContext.
    #   java.io.FileNotFoundException:
    #   File file:/var/lib/livy/.ivy2/jars/io.netty_netty-transport-native-epoll-4.1.29.Final.jar
    #   does not exist
    #
    # On EMR instance there is actually this:
    # /var/lib/livy/.ivy2/jars/io.netty_netty-transport-native-epoll-4.1.29.Final-linux-x86_64.jar
    #
    # -> So there is a Classifier in there ("linux-x86_64"), which spark.jars.packages doesn't
    # support! Discussed at least in these issues:
    #   https://issues.apache.org/jira/browse/SPARK-20075
    #   https://issues.apache.org/jira/browse/SPARK-24287
    #
    # -> So we just exclude it and hope that the druid modules that rovio-ingest depends on don't
    # actually need to import anything from it. Seems like we were lucky for now \o/
    "io.netty:netty-transport-native-epoll",
  ]

  # for testing, include the plain rovio-ingest like this:
  # not going to be needed eventually when using "com.rovio:rovio-ingest:1.0" in spark.jars.packages:
  spark_conf["conf"]["spark.jars"] = \
    f"{artifacts_root}/jars/original-rovio-ingest-{JAR_VERSION}.jar"

  spark_conf["conf"]["spark.jars.packages"] = ",".join(maven_deps)
  spark_conf["conf"]["spark.jars.excludes"] = ",".join(maven_excludes)
  # add https://clojars.org/repo/ to avoid "module not found: org.hyperic#sigar;1.6.5.132" on EMR
  spark_conf["conf"]["spark.jars.repositories"] = "https://clojars.org/repo/"

# The configuration must be registered before the session is created.
set_spark_config(spark_conf)
create_spark_session_with_host(get_param("spark3/shared/host"))

# to debug problems in session creation, see livy session logs at http://{host}:8998/ui

Starting Spark application


ID,YARN Application ID,Kind,State,Spark UI,Driver log,Current session?
1,application_1617264445938_0029,pyspark,idle,Link,Link,✔


FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

SparkSession available as 'spark'.


In [7]:
%%spark

import boto3

# This cell is sent to the remote Spark session (%%spark), so the SSM client is created again there.
# No named profile is passed here; only the region is pinned explicitly.
ssm_client = boto3.session.Session(region_name="us-east-1").client(service_name="ssm")

def get_param(secret_name: str) -> str:
    """Fetch a parameter value from the AWS SSM Parameter Store.

    Args:
        secret_name: Parameter path relative to the "/dataengineering/" namespace,
            e.g. "druid/metadata_db/uri".

    Returns:
        The value of the parameter as a string.

    Raises:
        Whatever the boto3 SSM client raises when the lookup fails
        (e.g. the parameter does not exist or access is denied).
    """
    # WithDecryption=True makes SecureString parameters (such as a database password)
    # come back as plaintext instead of ciphertext; the flag is ignored for
    # String / StringList parameters.
    response = ssm_client.get_parameter(
        Name="/dataengineering/" + secret_name,
        WithDecryption=True,
    )
    return response["Parameter"]["Value"]

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

In [8]:
%%spark
from datetime import datetime
from pyspark.sql import functions as f, types as t, SparkSession

# Re-bind the session object already available in the remote session, only to annotate its type.
spark: SparkSession = spark

# Two days of sample metrics for a single app, one row per day.
schema = 'dau:BIGINT, revenue:DOUBLE, app_id:STRING, event_date:TIMESTAMP'
sample_rows = [
    [5, 30.0, 'testclient', datetime(2018, 10, 1)],
    [2, 15.0, 'testclient', datetime(2018, 10, 2)],
]
df = spark.createDataFrame(sample_rows, schema)
df.show()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

+---+-------+----------+-------------------+
|dau|revenue|    app_id|         event_date|
+---+-------+----------+-------------------+
|  5|   30.0|testclient|2018-10-01 00:00:00|
|  2|   15.0|testclient|2018-10-02 00:00:00|
+---+-------+----------+-------------------+

In [16]:
%%spark

from py4j.java_gateway import java_import
from rovio_ingest import DRUID_SOURCE
from rovio_ingest.extensions.dataframe_extension import ConfKeys, add_dataframe_druid_extension

# fix df.explain on EMR 6: import org.apache.spark.sql.api.python.* into the
# py4j JVM view before explain() is called further down
java_import(spark._sc._jvm, "org.apache.spark.sql.api.python.*")

# NOTE(review): presumably this is what attaches repartition_by_druid_segment_size
# to DataFrame (it is called as a DataFrame method right below) - confirm in rovio_ingest
add_dataframe_druid_extension()

# Prepare the frame for ingestion with 'event_date' as the time column and DAY segments.
# Judging by the printed plan and schema, this adds the __PARTITION_TIME__ and
# __PARTITION_NUM__ columns and repartitions by them.
df_prepared = df.repartition_by_druid_segment_size('event_date', segment_granularity='DAY')
# True = extended output (logical plans in addition to the physical plan)
df_prepared.explain(True)

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

== Parsed Logical Plan ==
Project [dau#0L, revenue#1, app_id#2, event_date#3, __PARTITION_TIME__#158, __PARTITION_NUM__#172]
+- RepartitionByExpression [__PARTITION_TIME__#158, __PARTITION_NUM__#172], 200
   +- Project [dau#0L, revenue#1, app_id#2, event_date#3, __PARTITION_TIME__#158, __num_rows__#165, cast((cast((__num_rows__#165 - 1) as double) / cast(5000000 as double)) as int) AS __PARTITION_NUM__#172]
      +- Project [dau#0L, revenue#1, app_id#2, event_date#3, __PARTITION_TIME__#158, __num_rows__#165]
         +- Project [dau#0L, revenue#1, app_id#2, event_date#3, __PARTITION_TIME__#158, __num_rows__#165, __num_rows__#165]
            +- Window [row_number() windowspecdefinition(__PARTITION_TIME__#158, __PARTITION_TIME__#158 ASC NULLS FIRST, specifiedwindowframe(RowFrame, unboundedpreceding$(), currentrow$())) AS __num_rows__#165], [__PARTITION_TIME__#158], [__PARTITION_TIME__#158 ASC NULLS FIRST]
               +- Project [dau#0L, revenue#1, app_id#2, event_date#3, __PARTITION_

In [10]:
%%spark
# Check the schema of the prepared frame, including the added __PARTITION_TIME__ / __PARTITION_NUM__ columns
df_prepared.printSchema()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

root
 |-- dau: long (nullable = true)
 |-- revenue: double (nullable = true)
 |-- app_id: string (nullable = true)
 |-- event_date: timestamp (nullable = true)
 |-- __PARTITION_TIME__: timestamp (nullable = true)
 |-- __PARTITION_NUM__: integer (nullable = true)

In [14]:
%%spark
# Preview the prepared rows (the sample frame has only two rows)
df_prepared.show()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

+---+-------+----------+-------------------+-------------------+-----------------+
|dau|revenue|    app_id|         event_date| __PARTITION_TIME__|__PARTITION_NUM__|
+---+-------+----------+-------------------+-------------------+-----------------+
|  2|   15.0|testclient|2018-10-02 00:00:00|2018-10-02 00:00:00|                0|
|  5|   30.0|testclient|2018-10-01 00:00:00|2018-10-01 00:00:00|                0|
+---+-------+----------+-------------------+-------------------+-----------------+

In [15]:
%%spark

DATA_SOURCE_NAME = "rovio_ingest_test_juho"

# Write the prepared frame through the Druid data source in "overwrite" mode.
# The metadata DB connection details and the deep storage bucket are read from SSM.
(
    df_prepared.write
    .mode("overwrite")
    .format(DRUID_SOURCE)
    .option(ConfKeys.DATA_SOURCE, DATA_SOURCE_NAME)
    .option(ConfKeys.TIME_COLUMN, "event_date")
    .option(ConfKeys.METADATA_DB_URI, get_param("druid/metadata_db/uri"))
    .option(ConfKeys.METADATA_DB_USERNAME, get_param("druid/metadata_db/username"))
    .option(ConfKeys.METADATA_DB_PASSWORD, get_param("druid/metadata_db/password"))
    .option(ConfKeys.DEEP_STORAGE_S3_BUCKET, get_param("druid/deep_storage/bucket"))
    .option(ConfKeys.DEEP_STORAGE_S3_BASE_KEY, "druid/segments")
    .save()
)

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

To list the written data you can run:

    aws s3 --profile smoke ls --recursive \
      s3://{druid-deep-storage-bucket}/druid/segments/rovio_ingest_test_juho/

You should see output similar to:

    2020-04-12 16:12:06        591 druid/segments/rovio_ingest_test_juho/2018-10-01T00:00:00.000Z_2018-10-02T00:00:00.000Z/2020-04-12T13:11:43.778Z/0/descriptor.json
    2020-04-12 16:12:06       1055 druid/segments/rovio_ingest_test_juho/2018-10-01T00:00:00.000Z_2018-10-02T00:00:00.000Z/2020-04-12T13:11:43.778Z/0/index.zip
    2020-04-12 16:12:06        591 druid/segments/rovio_ingest_test_juho/2018-10-02T00:00:00.000Z_2018-10-03T00:00:00.000Z/2020-04-12T13:11:43.778Z/0/descriptor.json
    2020-04-12 16:12:06       1052 druid/segments/rovio_ingest_test_juho/2018-10-02T00:00:00.000Z_2018-10-03T00:00:00.000Z/2020-04-12T13:11:43.778Z/0/index.zip

Then run this query in druid-sql (JDBC); the expected result is shown below it:

    SELECT * FROM rovio_ingest_test_juho LIMIT 10;

    __time	app_id	dau	revenue
    2018-10-01 00:00:00	testclient	5	30
    2018-10-02 00:00:00	testclient	2	15

In [17]:
%spark cleanup