In [1]:
import org.wololo.geojson.GeoJSONFactory
import org.wololo.jts2geojson.GeoJSONReader
import org.wololo.geojson.Feature
import org.apache.spark.sql.DataFrame

/**
 * Reads a line-delimited GeoJSON file (one Feature per line) into a DataFrame.
 *
 * Each non-blank line is parsed as a GeoJSON `Feature`; its properties become
 * the scalar columns and its geometry is converted to a JTS geometry.
 *
 * @param filePath path handed to `sc.textFile`
 * @return a DataFrame with columns TRIP_ID, CALL_TYPE, ORIGIN_STAND, TAXI_ID,
 *         TIMESTAMP, DAY_TYPE, MISSING_DATA and geometry
 * @throws ClassCastException  if a line is not a Feature or a property has an
 *                             unexpected type (surfaces when the job runs)
 * @throws ArithmeticException if TAXI_ID does not fit in a 32-bit integer
 */
def readLineOfGeojson(filePath: String): DataFrame = {
    val colnames = Seq("TRIP_ID", "CALL_TYPE", "ORIGIN_STAND", "TAXI_ID", "TIMESTAMP", "DAY_TYPE", "MISSING_DATA",  "geometry")
    sc.textFile(filePath)
        // A blank line is not a Feature and would make the parser fail; skip it.
        .filter(_.trim.nonEmpty)
        .mapPartitions { lines =>
            // One reader per partition instead of one per input line.
            val reader = new GeoJSONReader
            lines.map { line =>
                val feature = GeoJSONFactory.create(line).asInstanceOf[Feature]
                val props = feature.getProperties

                // Untyped JSON integers may arrive as Integer, Long or BigInteger
                // depending on magnitude, so go through Number rather than casting
                // to one specific boxed type. Non-numeric values (including null)
                // fall back to the original cast and keep its behaviour.
                val tripId: Long = props.get("TRIP_ID") match {
                    case n: Number => n.longValue
                    case other     => other.asInstanceOf[Long]
                }
                val taxiId: Integer = props.get("TAXI_ID") match {
                    // toIntExact fails loudly instead of silently truncating.
                    case n: Number => Integer.valueOf(java.lang.Math.toIntExact(n.longValue))
                    case other     => other.asInstanceOf[Integer]
                }

                (
                    tripId
                    ,props.get("CALL_TYPE").asInstanceOf[String]
                    ,props.get("ORIGIN_STAND").asInstanceOf[String]
                    ,taxiId
                    ,props.get("TIMESTAMP").asInstanceOf[String]
                    ,props.get("DAY_TYPE").asInstanceOf[String]
                    ,props.get("MISSING_DATA").asInstanceOf[Boolean]
                    ,reader.read(feature.getGeometry)
                )
            }
        }.toDF(colnames: _*)
}


Intitializing Scala interpreter ...

Spark Web UI available at http://5cbf7cf21bed:4040
SparkContext available as 'sc' (version = 3.3.0, master = local[*], app id = local-1674990521445)
SparkSession available as 'spark'


import org.wololo.geojson.GeoJSONFactory
import org.wololo.jts2geojson.GeoJSONReader
import org.wololo.geojson.Feature
import org.apache.spark.sql.DataFrame
readGeojson: (filePath: String)org.apache.spark.sql.DataFrame


In [2]:
// Sample input: readLineOfGeojson parses every line of this file as one GeoJSON Feature.
val testFile = "/home/iceberg/data/test_portotaxi.geojson"
val df_test = readLineOfGeojson(testFile)
// Register the DataFrame as a temp view so the following cells can query it with SQL.
df_test.createOrReplaceTempView("test_portotaxi")
// Preview the loaded rows.
df_test.show

+-------------------+---------+------------+--------+--------------------+--------+------------+--------------------+
|            TRIP_ID|CALL_TYPE|ORIGIN_STAND| TAXI_ID|           TIMESTAMP|DAY_TYPE|MISSING_DATA|            geometry|
+-------------------+---------+------------+--------+--------------------+--------+------------+--------------------+
|1373017604620000351|        C|            |20000351|java.util.Gregori...|       A|       false|MULTIPOINT ((-8.6...|
|1373277091620000446|        C|            |20000446|java.util.Gregori...|       A|       false|MULTIPOINT ((-8.6...|
|1373462211620000500|        C|            |20000500|java.util.Gregori...|       A|       false|MULTIPOINT ((-8.6...|
|1374755673620000663|        C|            |20000663|java.util.Gregori...|       A|       false|MULTIPOINT ((-8.6...|
|1375977626620000337|        C|            |20000337|java.util.Gregori...|       A|       false|MULTIPOINT ((-8.6...|
|1375984752620000337|        C|            |20000337|jav

testFile: String = /home/iceberg/data/test_portotaxi.geojson
df_test: org.apache.spark.sql.DataFrame = [TRIP_ID: bigint, CALL_TYPE: string ... 6 more fields]


In [3]:
// Print the column names, types and nullability of the loaded DataFrame.
df_test.printSchema

root
 |-- TRIP_ID: long (nullable = false)
 |-- CALL_TYPE: string (nullable = true)
 |-- ORIGIN_STAND: string (nullable = true)
 |-- TAXI_ID: integer (nullable = true)
 |-- TIMESTAMP: string (nullable = true)
 |-- DAY_TYPE: string (nullable = true)
 |-- MISSING_DATA: boolean (nullable = false)
 |-- geometry: geometry (nullable = true)



In [4]:
// Count the rows visible through the temp view registered from df_test.
spark.sql("SELECT * FROM test_portotaxi").count()

res2: Long = 50


## 1. Create an Iceberg table with a geometry column


The `write.parquet.geometry.encoding` table property has three possible values:

- `nested-list`: the most efficient (usually yields smaller files and faster reads and writes)

- `wkb-bbox`

- `wkb`

In [5]:
// Start from a clean slate so this cell can be re-run.
spark.sql("DROP TABLE IF EXISTS demo.db.test_portotaxi")


// Create the Iceberg table; the geometry column is written with the
// 'nested-list' Parquet encoding selected via TBLPROPERTIES.
spark.sql("""
CREATE TABLE IF NOT EXISTS demo.db.test_portotaxi 
(
  TRIP_ID LONG,
  CALL_TYPE STRING,
  ORIGIN_STAND STRING,
  TAXI_ID INTEGER,
  TIMESTAMP STRING,
  DAY_TYPE STRING,
  MISSING_DATA BOOLEAN,
  geometry GEOMETRY
)
USING iceberg
TBLPROPERTIES ('write.parquet.geometry.encoding' = 'nested-list')
""")
          
// Copy every row from the temp view into the table.
// NOTE(review): SELECT * is matched to the table columns by position, so the
// view's column order must stay aligned with the CREATE TABLE statement above.
spark.sql("INSERT INTO demo.db.test_portotaxi SELECT * FROM test_portotaxi")

res3: org.apache.spark.sql.DataFrame = []


## 2. Table Summary

In [6]:
// Query the table's `snapshots` metadata table and show, per snapshot, the
// record count and the total data-file size (divided by 1024 twice and
// labelled as megabytes).
spark.sql("""SELECT
    summary['total-records'] as total_records,
    summary['total-files-size'] / 1024 / 1024 as file_size_in_mb 
    FROM demo.db.test_portotaxi.snapshots
""").show()

+-------------+-------------------+
|total_records|    file_size_in_mb|
+-------------+-------------------+
|           50|0.13652706146240234|
+-------------+-------------------+



## 3. Run Spatial Query

In [7]:
// Query region as a WKT polygon (closed ring: first and last vertex are equal).
// NOTE(review): coordinates look like lon/lat order — confirm against the data.
val bbox = "POLYGON ((-8.6079 41.1489, -8.6089 41.1472, -8.6066 41.1470, -8.6061 41.1483, -8.6079 41.1489))"
// Count rows whose geometry satisfies ST_Within against the polygon.
// NOTE(review): ST_Within presumably requires the whole geometry to lie inside
// the polygon, so a multi-point trip only counts if all its points do — confirm.
spark.sql(s"""
SELECT count(*)
FROM demo.db.test_portotaxi
WHERE ST_Within(geometry, ST_GeomFromText('${bbox}'))
""").show()

+--------+
|count(1)|
+--------+
|       0|
+--------+



bbox: String = POLYGON ((-8.6079 41.1489, -8.6089 41.1472, -8.6066 41.1470, -8.6061 41.1483, -8.6079 41.1489))
