{ "cells": [ { "cell_type": "code", "execution_count": null, "id": "b3335539-b1f0-4d82-ba2e-707589396180", "metadata": {}, "outputs": [], "source": [ "!pip install sparksql-magic" ] }, { "cell_type": "code", "execution_count": 1, "id": "2655eb9a-c43d-4dcf-ab15-14defc98f4b7", "metadata": {}, "outputs": [], "source": [ "%load_ext sparksql_magic\n", "\n", "import os\n", "from pyspark.sql import SparkSession" ] }, { "cell_type": "code", "execution_count": 2, "id": "878dc5f1-5841-45a7-9a88-232b0fe90593", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "total 0\n", "drwxrwsrwx. 3 root 1000740000 102 Jun 20 04:45 .\n", "drwxr-xr-x. 3 root root 25 Jun 20 04:46 ..\n", "drwxr-sr-x. 2 root 1000740000 48 Jun 20 04:45 ..2023_06_20_04_45_48.2374258031\n", "lrwxrwxrwx. 1 root 1000740000 20 Jun 20 04:45 core-site.xml -> ..data/core-site.xml\n", "lrwxrwxrwx. 1 root 1000740000 32 Jun 20 04:45 ..data -> ..2023_06_20_04_45_48.2374258031\n", "lrwxrwxrwx. 1 root 1000740000 20 Jun 20 04:45 hdfs-site.xml -> ..data/hdfs-site.xml\n", "\n", "/stackable/hdfs-config\n", "Welcome to\n", " ____ __\n", " / __/__ ___ _____/ /__\n", " _\\ \\/ _ \\/ _ `/ __/ '_/\n", " /___/ .__/\\_,_/_/ /_/\\_\\ version 3.3.0\n", " /_/\n", " \n", "Using Scala version 2.12.15, OpenJDK 64-Bit Server VM, 17.0.4\n", "Branch HEAD\n", "Compiled by user ubuntu on 2022-06-09T19:58:58Z\n", "Revision f74867bddfbcdd4d08076db36851e88b15e66556\n", "Url https://github.com/apache/spark\n", "Type --help for more information.\n" ] } ], "source": [ "!ls -la /stackable/hdfs-config\n", "!echo\n", "!echo $HADOOP_CONF_DIR\n", "!pyspark --version" ] }, { "cell_type": "code", "execution_count": 3, "id": "560bab9a-2106-4d73-8003-d1f1b6e4112d", "metadata": {}, "outputs": [], "source": [ "spark = (SparkSession\n", " .builder\n", " .master(f'k8s://https://{os.environ[\"KUBERNETES_SERVICE_HOST\"]}:{os.environ[\"KUBERNETES_SERVICE_PORT\"]}')\n", " .config(\"spark.executor.instances\", \"2\")\n", " .config(\"spark.kubernetes.namespace\", open(\"/var/run/secrets/kubernetes.io/serviceaccount/namespace\", \"r\").read())\n", " .config(\"spark.kubernetes.container.image\", \"apache/spark:v3.3.0\")\n", " .config(\"spark.driver.port\", \"2222\")\n", " .config(\"spark.driver.blockManager.port\", \"7777\")\n", " .config(\"spark.kubernetes.driver.pod.name\", os.environ[\"HOSTNAME\"]) # For Pod garbage collection (owner reference)\n", " .config(\"spark.jars.packages\", \"org.apache.iceberg:iceberg-spark-runtime-3.3_2.12:1.3.0\")\n", " #.config(\"spark.driver.extraJavaOptions\", \"-Divy.cache.dir=/tmp -Divy.home=/tmp\")\n", " #.config(\"spark.executor.extraJavaOptions\", \"-Divy.cache.dir=/tmp -Divy.home=/tmp\")\n", " .config(\"spark.sql.catalog.lakehouse\", \"org.apache.iceberg.spark.SparkCatalog\")\n", " .config(\"spark.sql.catalog.lakehouse.type\", \"hive\")\n", " .config(\"spark.sql.catalog.lakehouse.uri\", \"thrift://hive-iceberg:9083\")\n", " .appName(\"spark-job\")\n", " .getOrCreate()\n", " )" ] }, { "cell_type": "code", "execution_count": 4, "id": "bed1a4ad-4e69-4f4f-a694-bd1d94d02cad", "metadata": {}, "outputs": [ { "data": { "text/html": [ "
namespace
default
testdata
" ], "text/plain": [ "" ] }, "execution_count": 4, "metadata": {}, "output_type": "execute_result" } ], "source": [ "%%sparksql\n", "show schemas in lakehouse" ] }, { "cell_type": "code", "execution_count": 5, "id": "c65d6c75-6fc1-4ca9-bcd0-4f6c7959f373", "metadata": {}, "outputs": [ { "data": { "text/html": [ "
namespacetableNameisTemporary
" ], "text/plain": [ "" ] }, "execution_count": 5, "metadata": {}, "output_type": "execute_result" } ], "source": [ "%%sparksql\n", "show tables in lakehouse.default" ] }, { "cell_type": "code", "execution_count": 6, "id": "aea19cdd-0060-4b69-9bb7-e2e378dba0c0", "metadata": {}, "outputs": [], "source": [ "df_trips = spark.read.option(\"header\", \"true\").csv(\"hdfs://hdfs/data/raw/\")\n", "df_trips.createOrReplaceTempView(\"trips\")" ] }, { "cell_type": "code", "execution_count": 7, "id": "05ab8d0a-f603-4c7a-acd2-666d39269cd2", "metadata": {}, "outputs": [ { "data": { "text/html": [ "
ride_idrideable_typestarted_atended_atstart_station_namestart_station_idend_station_nameend_station_idstart_latstart_lngend_latend_lngmember_casual
A847FADBBC638E45docked_bike2020-04-26 17:45:142020-04-26 18:12:03Eckhart Park86Lincoln Ave & Diversey Pkwy15241.8964-87.66141.9322-87.6586member
5405B80E996FF60Ddocked_bike2020-04-17 17:08:542020-04-17 17:17:03Drake Ave & Fullerton Ave503Kosciuszko Park49941.9244-87.715441.9306-87.7238member
5DD24A79A4E006F4docked_bike2020-04-01 17:54:132020-04-01 18:08:36McClurg Ct & Erie St142Indiana Ave & Roosevelt Rd25541.8945-87.617941.8679-87.623member
" ], "text/plain": [ "" ] }, "execution_count": 7, "metadata": {}, "output_type": "execute_result" } ], "source": [ "%%sparksql\n", "select * from trips limit 3" ] }, { "cell_type": "code", "execution_count": 8, "id": "cef51094-d503-4314-80cf-32356d2f4853", "metadata": {}, "outputs": [], "source": [ "df_trips.write.mode(\"overwrite\").parquet(\"hdfs://hdfs/data/processed/\")" ] }, { "cell_type": "code", "execution_count": 9, "id": "d3fe2555-353e-43ea-9b96-d66c7c3b139b", "metadata": {}, "outputs": [ { "data": { "text/html": [ "
start_station_namecount(1)
Clark St & Elm St850
Dearborn St & Erie St730
Desplaines St & Kinzie St720
" ], "text/plain": [ "" ] }, "execution_count": 9, "metadata": {}, "output_type": "execute_result" } ], "source": [ "%%sparksql\n", "select start_station_name, count(*) from trips group by 1 order by 2 desc limit 3" ] }, { "cell_type": "code", "execution_count": 11, "id": "e9d6eca4-e16a-4752-bfc6-63f708780f30", "metadata": {}, "outputs": [ { "data": { "text/html": [ "
" ], "text/plain": [ "" ] }, "execution_count": 11, "metadata": {}, "output_type": "execute_result" } ], "source": [ "%%sparksql\n", "CREATE SCHEMA IF NOT EXISTS lakehouse.testdata LOCATION 'hdfs://hdfs/lakehouse';" ] }, { "cell_type": "code", "execution_count": 15, "id": "98fca8a5-798e-43df-a5b6-8d0cb2654f52", "metadata": {}, "outputs": [ { "data": { "text/html": [ "
" ], "text/plain": [ "" ] }, "execution_count": 15, "metadata": {}, "output_type": "execute_result" } ], "source": [ "%%sparksql\n", "CREATE OR REPLACE TABLE lakehouse.testdata.trips\n", "USING iceberg\n", "PARTITIONED BY (days(started_at))\n", "AS SELECT\n", " ride_id,\n", " rideable_type,\n", " cast(started_at as timestamp) as started_at,\n", " cast(ended_at as timestamp) as ended_at\n", "FROM trips\n", "ORDER BY started_at" ] }, { "cell_type": "code", "execution_count": 16, "id": "ac5ba249-1351-4b77-a374-3da99243f98a", "metadata": {}, "outputs": [ { "data": { "text/html": [ "
made_current_atsnapshot_idparent_idis_current_ancestor
2023-06-20 07:40:36.1940001803673990793562830nullTrue
" ], "text/plain": [ "" ] }, "execution_count": 16, "metadata": {}, "output_type": "execute_result" } ], "source": [ "%%sparksql\n", "SELECT * FROM lakehouse.testdata.trips.history;" ] }, { "cell_type": "code", "execution_count": 22, "id": "a2fb3f04-b05b-47ef-92ad-88150aecfc11", "metadata": {}, "outputs": [ { "data": { "text/html": [ "
partitionspec_idrecord_countfile_countposition_delete_record_countposition_delete_file_countequality_delete_record_countequality_delete_file_count
Row(started_at_day=datetime.date(2020, 4, 1))0229410000
Row(started_at_day=datetime.date(2020, 4, 4))0228810000
Row(started_at_day=datetime.date(2020, 4, 5))0288710000
Row(started_at_day=datetime.date(2020, 4, 2))0260810000
Row(started_at_day=datetime.date(2020, 4, 3))0353210000
Row(started_at_day=datetime.date(2020, 4, 8))0236510000
Row(started_at_day=datetime.date(2020, 4, 9))0197210000
Row(started_at_day=datetime.date(2020, 4, 6))0242710000
Row(started_at_day=datetime.date(2020, 4, 7))0518510000
Row(started_at_day=datetime.date(2020, 4, 12))0367010000
Row(started_at_day=datetime.date(2020, 4, 13))0182010000
Row(started_at_day=datetime.date(2020, 4, 10))0272110000
Row(started_at_day=datetime.date(2020, 4, 11))0445810000
Row(started_at_day=datetime.date(2020, 4, 16))0227220000
Row(started_at_day=datetime.date(2020, 4, 17))0143510000
Row(started_at_day=datetime.date(2020, 4, 14))0160910000
Row(started_at_day=datetime.date(2020, 4, 15))0141910000
Row(started_at_day=datetime.date(2020, 4, 20))0390310000
Row(started_at_day=datetime.date(2020, 4, 21))0262710000
Row(started_at_day=datetime.date(2020, 4, 18))0511610000
Row(started_at_day=datetime.date(2020, 4, 19))0515810000
Row(started_at_day=datetime.date(2020, 4, 24))0228410000
Row(started_at_day=datetime.date(2020, 4, 25))0104810000
Row(started_at_day=datetime.date(2020, 4, 22))0199510000
Row(started_at_day=datetime.date(2020, 4, 23))0228410000
Row(started_at_day=datetime.date(2020, 4, 28))0339210000
Row(started_at_day=datetime.date(2020, 4, 29))064510000
Row(started_at_day=datetime.date(2020, 4, 26))0620010000
Row(started_at_day=datetime.date(2020, 4, 27))0259510000
Row(started_at_day=datetime.date(2020, 4, 30))0256710000
" ], "text/plain": [ "" ] }, "execution_count": 22, "metadata": {}, "output_type": "execute_result" } ], "source": [ "%%sparksql --limit 100\n", "SELECT * FROM lakehouse.testdata.trips.partitions;" ] }, { "cell_type": "code", "execution_count": null, "id": "ef6bc98e-a26a-42dc-b403-58ea676bcf5e", "metadata": {}, "outputs": [], "source": [] } ], "metadata": { "kernelspec": { "display_name": "Python 3 (ipykernel)", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.9.13" } }, "nbformat": 4, "nbformat_minor": 5 }