{
"cells": [
{
"cell_type": "code",
"execution_count": null,
"id": "b3335539-b1f0-4d82-ba2e-707589396180",
"metadata": {},
"outputs": [],
"source": [
"!pip install sparksql-magic"
]
},
{
"cell_type": "code",
"execution_count": 1,
"id": "2655eb9a-c43d-4dcf-ab15-14defc98f4b7",
"metadata": {},
"outputs": [],
"source": [
"%load_ext sparksql_magic\n",
"\n",
"import os\n",
"from pyspark.sql import SparkSession"
]
},
{
"cell_type": "code",
"execution_count": 2,
"id": "878dc5f1-5841-45a7-9a88-232b0fe90593",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"total 0\n",
"drwxrwsrwx. 3 root 1000740000 102 Jun 20 04:45 .\n",
"drwxr-xr-x. 3 root root 25 Jun 20 04:46 ..\n",
"drwxr-sr-x. 2 root 1000740000 48 Jun 20 04:45 ..2023_06_20_04_45_48.2374258031\n",
"lrwxrwxrwx. 1 root 1000740000 20 Jun 20 04:45 core-site.xml -> ..data/core-site.xml\n",
"lrwxrwxrwx. 1 root 1000740000 32 Jun 20 04:45 ..data -> ..2023_06_20_04_45_48.2374258031\n",
"lrwxrwxrwx. 1 root 1000740000 20 Jun 20 04:45 hdfs-site.xml -> ..data/hdfs-site.xml\n",
"\n",
"/stackable/hdfs-config\n",
"Welcome to\n",
" ____ __\n",
" / __/__ ___ _____/ /__\n",
" _\\ \\/ _ \\/ _ `/ __/ '_/\n",
" /___/ .__/\\_,_/_/ /_/\\_\\ version 3.3.0\n",
" /_/\n",
" \n",
"Using Scala version 2.12.15, OpenJDK 64-Bit Server VM, 17.0.4\n",
"Branch HEAD\n",
"Compiled by user ubuntu on 2022-06-09T19:58:58Z\n",
"Revision f74867bddfbcdd4d08076db36851e88b15e66556\n",
"Url https://github.com/apache/spark\n",
"Type --help for more information.\n"
]
}
],
"source": [
"!ls -la /stackable/hdfs-config\n",
"!echo\n",
"!echo $HADOOP_CONF_DIR\n",
"!pyspark --version"
]
},
{
"cell_type": "code",
"execution_count": 3,
"id": "560bab9a-2106-4d73-8003-d1f1b6e4112d",
"metadata": {},
"outputs": [],
"source": [
"spark = (SparkSession\n",
" .builder\n",
" .master(f'k8s://https://{os.environ[\"KUBERNETES_SERVICE_HOST\"]}:{os.environ[\"KUBERNETES_SERVICE_PORT\"]}')\n",
" .config(\"spark.executor.instances\", \"2\")\n",
" .config(\"spark.kubernetes.namespace\", open(\"/var/run/secrets/kubernetes.io/serviceaccount/namespace\", \"r\").read())\n",
" .config(\"spark.kubernetes.container.image\", \"apache/spark:v3.3.0\")\n",
" .config(\"spark.driver.port\", \"2222\")\n",
" .config(\"spark.driver.blockManager.port\", \"7777\")\n",
" .config(\"spark.kubernetes.driver.pod.name\", os.environ[\"HOSTNAME\"]) # For Pod garbage collection (owner reference)\n",
" .config(\"spark.jars.packages\", \"org.apache.iceberg:iceberg-spark-runtime-3.3_2.12:1.3.0\")\n",
" #.config(\"spark.driver.extraJavaOptions\", \"-Divy.cache.dir=/tmp -Divy.home=/tmp\")\n",
" #.config(\"spark.executor.extraJavaOptions\", \"-Divy.cache.dir=/tmp -Divy.home=/tmp\")\n",
" .config(\"spark.sql.catalog.lakehouse\", \"org.apache.iceberg.spark.SparkCatalog\")\n",
" .config(\"spark.sql.catalog.lakehouse.type\", \"hive\")\n",
" .config(\"spark.sql.catalog.lakehouse.uri\", \"thrift://hive-iceberg:9083\")\n",
" .appName(\"spark-job\")\n",
" .getOrCreate()\n",
" )"
]
},
{
"cell_type": "code",
"execution_count": 4,
"id": "bed1a4ad-4e69-4f4f-a694-bd1d94d02cad",
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"
"
],
"text/plain": [
""
]
},
"execution_count": 4,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"%%sparksql\n",
"show schemas in lakehouse"
]
},
{
"cell_type": "code",
"execution_count": 5,
"id": "c65d6c75-6fc1-4ca9-bcd0-4f6c7959f373",
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"namespace | tableName | isTemporary |
"
],
"text/plain": [
""
]
},
"execution_count": 5,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"%%sparksql\n",
"show tables in lakehouse.default"
]
},
{
"cell_type": "code",
"execution_count": 6,
"id": "aea19cdd-0060-4b69-9bb7-e2e378dba0c0",
"metadata": {},
"outputs": [],
"source": [
"df_trips = spark.read.option(\"header\", \"true\").csv(\"hdfs://hdfs/data/raw/\")\n",
"df_trips.createOrReplaceTempView(\"trips\")"
]
},
{
"cell_type": "code",
"execution_count": 7,
"id": "05ab8d0a-f603-4c7a-acd2-666d39269cd2",
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"ride_id | rideable_type | started_at | ended_at | start_station_name | start_station_id | end_station_name | end_station_id | start_lat | start_lng | end_lat | end_lng | member_casual |
A847FADBBC638E45 | docked_bike | 2020-04-26 17:45:14 | 2020-04-26 18:12:03 | Eckhart Park | 86 | Lincoln Ave & Diversey Pkwy | 152 | 41.8964 | -87.661 | 41.9322 | -87.6586 | member |
5405B80E996FF60D | docked_bike | 2020-04-17 17:08:54 | 2020-04-17 17:17:03 | Drake Ave & Fullerton Ave | 503 | Kosciuszko Park | 499 | 41.9244 | -87.7154 | 41.9306 | -87.7238 | member |
5DD24A79A4E006F4 | docked_bike | 2020-04-01 17:54:13 | 2020-04-01 18:08:36 | McClurg Ct & Erie St | 142 | Indiana Ave & Roosevelt Rd | 255 | 41.8945 | -87.6179 | 41.8679 | -87.623 | member |
"
],
"text/plain": [
""
]
},
"execution_count": 7,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"%%sparksql\n",
"select * from trips limit 3"
]
},
{
"cell_type": "code",
"execution_count": 8,
"id": "cef51094-d503-4314-80cf-32356d2f4853",
"metadata": {},
"outputs": [],
"source": [
"df_trips.write.mode(\"overwrite\").parquet(\"hdfs://hdfs/data/processed/\")"
]
},
{
"cell_type": "code",
"execution_count": 9,
"id": "d3fe2555-353e-43ea-9b96-d66c7c3b139b",
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"start_station_name | count(1) |
Clark St & Elm St | 850 |
Dearborn St & Erie St | 730 |
Desplaines St & Kinzie St | 720 |
"
],
"text/plain": [
""
]
},
"execution_count": 9,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"%%sparksql\n",
"select start_station_name, count(*) from trips group by 1 order by 2 desc limit 3"
]
},
{
"cell_type": "code",
"execution_count": 11,
"id": "e9d6eca4-e16a-4752-bfc6-63f708780f30",
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
""
],
"text/plain": [
""
]
},
"execution_count": 11,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"%%sparksql\n",
"CREATE SCHEMA IF NOT EXISTS lakehouse.testdata LOCATION 'hdfs://hdfs/lakehouse';"
]
},
{
"cell_type": "code",
"execution_count": 15,
"id": "98fca8a5-798e-43df-a5b6-8d0cb2654f52",
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
""
],
"text/plain": [
""
]
},
"execution_count": 15,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"%%sparksql\n",
"CREATE OR REPLACE TABLE lakehouse.testdata.trips\n",
"USING iceberg\n",
"PARTITIONED BY (days(started_at))\n",
"AS SELECT\n",
" ride_id,\n",
" rideable_type,\n",
" cast(started_at as timestamp) as started_at,\n",
" cast(ended_at as timestamp) as ended_at\n",
"FROM trips\n",
"ORDER BY started_at"
]
},
{
"cell_type": "code",
"execution_count": 16,
"id": "ac5ba249-1351-4b77-a374-3da99243f98a",
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"made_current_at | snapshot_id | parent_id | is_current_ancestor |
2023-06-20 07:40:36.194000 | 1803673990793562830 | null | True |
"
],
"text/plain": [
""
]
},
"execution_count": 16,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"%%sparksql\n",
"SELECT * FROM lakehouse.testdata.trips.history;"
]
},
{
"cell_type": "code",
"execution_count": 22,
"id": "a2fb3f04-b05b-47ef-92ad-88150aecfc11",
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"partition | spec_id | record_count | file_count | position_delete_record_count | position_delete_file_count | equality_delete_record_count | equality_delete_file_count |
Row(started_at_day=datetime.date(2020, 4, 1)) | 0 | 2294 | 1 | 0 | 0 | 0 | 0 |
Row(started_at_day=datetime.date(2020, 4, 4)) | 0 | 2288 | 1 | 0 | 0 | 0 | 0 |
Row(started_at_day=datetime.date(2020, 4, 5)) | 0 | 2887 | 1 | 0 | 0 | 0 | 0 |
Row(started_at_day=datetime.date(2020, 4, 2)) | 0 | 2608 | 1 | 0 | 0 | 0 | 0 |
Row(started_at_day=datetime.date(2020, 4, 3)) | 0 | 3532 | 1 | 0 | 0 | 0 | 0 |
Row(started_at_day=datetime.date(2020, 4, 8)) | 0 | 2365 | 1 | 0 | 0 | 0 | 0 |
Row(started_at_day=datetime.date(2020, 4, 9)) | 0 | 1972 | 1 | 0 | 0 | 0 | 0 |
Row(started_at_day=datetime.date(2020, 4, 6)) | 0 | 2427 | 1 | 0 | 0 | 0 | 0 |
Row(started_at_day=datetime.date(2020, 4, 7)) | 0 | 5185 | 1 | 0 | 0 | 0 | 0 |
Row(started_at_day=datetime.date(2020, 4, 12)) | 0 | 3670 | 1 | 0 | 0 | 0 | 0 |
Row(started_at_day=datetime.date(2020, 4, 13)) | 0 | 1820 | 1 | 0 | 0 | 0 | 0 |
Row(started_at_day=datetime.date(2020, 4, 10)) | 0 | 2721 | 1 | 0 | 0 | 0 | 0 |
Row(started_at_day=datetime.date(2020, 4, 11)) | 0 | 4458 | 1 | 0 | 0 | 0 | 0 |
Row(started_at_day=datetime.date(2020, 4, 16)) | 0 | 2272 | 2 | 0 | 0 | 0 | 0 |
Row(started_at_day=datetime.date(2020, 4, 17)) | 0 | 1435 | 1 | 0 | 0 | 0 | 0 |
Row(started_at_day=datetime.date(2020, 4, 14)) | 0 | 1609 | 1 | 0 | 0 | 0 | 0 |
Row(started_at_day=datetime.date(2020, 4, 15)) | 0 | 1419 | 1 | 0 | 0 | 0 | 0 |
Row(started_at_day=datetime.date(2020, 4, 20)) | 0 | 3903 | 1 | 0 | 0 | 0 | 0 |
Row(started_at_day=datetime.date(2020, 4, 21)) | 0 | 2627 | 1 | 0 | 0 | 0 | 0 |
Row(started_at_day=datetime.date(2020, 4, 18)) | 0 | 5116 | 1 | 0 | 0 | 0 | 0 |
Row(started_at_day=datetime.date(2020, 4, 19)) | 0 | 5158 | 1 | 0 | 0 | 0 | 0 |
Row(started_at_day=datetime.date(2020, 4, 24)) | 0 | 2284 | 1 | 0 | 0 | 0 | 0 |
Row(started_at_day=datetime.date(2020, 4, 25)) | 0 | 1048 | 1 | 0 | 0 | 0 | 0 |
Row(started_at_day=datetime.date(2020, 4, 22)) | 0 | 1995 | 1 | 0 | 0 | 0 | 0 |
Row(started_at_day=datetime.date(2020, 4, 23)) | 0 | 2284 | 1 | 0 | 0 | 0 | 0 |
Row(started_at_day=datetime.date(2020, 4, 28)) | 0 | 3392 | 1 | 0 | 0 | 0 | 0 |
Row(started_at_day=datetime.date(2020, 4, 29)) | 0 | 645 | 1 | 0 | 0 | 0 | 0 |
Row(started_at_day=datetime.date(2020, 4, 26)) | 0 | 6200 | 1 | 0 | 0 | 0 | 0 |
Row(started_at_day=datetime.date(2020, 4, 27)) | 0 | 2595 | 1 | 0 | 0 | 0 | 0 |
Row(started_at_day=datetime.date(2020, 4, 30)) | 0 | 2567 | 1 | 0 | 0 | 0 | 0 |
"
],
"text/plain": [
""
]
},
"execution_count": 22,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"%%sparksql --limit 100\n",
"SELECT * FROM lakehouse.testdata.trips.partitions;"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "ef6bc98e-a26a-42dc-b403-58ea676bcf5e",
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3 (ipykernel)",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.9.13"
}
},
"nbformat": 4,
"nbformat_minor": 5
}