In [None]:
import org.wololo.geojson.GeoJSONFactory
import org.wololo.jts2geojson.GeoJSONReader
import org.wololo.geojson.Feature
import org.apache.spark.sql.DataFrame

/**
 * Reads a line-delimited GeoJSON file (one `Feature` per line) into a DataFrame.
 *
 * Each feature is flattened into the columns TRIP_ID, CALL_TYPE, ORIGIN_STAND,
 * TAXI_ID, TIMESTAMP, DAY_TYPE, MISSING_DATA and geometry, where geometry is
 * the JTS geometry produced by `GeoJSONReader.read`.
 *
 * Relies on the notebook-provided `sc` and on Spark's implicits being in scope
 * for `toDF`.
 *
 * @param filePath path of the text file to read; blank lines are skipped
 * @return a DataFrame with one row per GeoJSON feature
 */
def readLineOfGeojson(filePath: String): DataFrame = {
    val colnames = Seq("TRIP_ID", "CALL_TYPE", "ORIGIN_STAND", "TAXI_ID", "TIMESTAMP", "DAY_TYPE", "MISSING_DATA",  "geometry")
    sc.textFile(filePath)
        // A blank line is not a GeoJSON feature and would otherwise abort the whole job.
        .filter(_.trim.nonEmpty)
        .mapPartitions { lines =>
            // One reader per partition instead of one per input line.
            val reader = new GeoJSONReader
            lines.map { line =>
                val feature = GeoJSONFactory.create(line).asInstanceOf[Feature]
                // Look the property map up once per record rather than once per column.
                val props = feature.getProperties
                (
                    props.get("TRIP_ID").asInstanceOf[Long],
                    props.get("CALL_TYPE").asInstanceOf[String],
                    props.get("ORIGIN_STAND").asInstanceOf[String],
                    props.get("TAXI_ID").asInstanceOf[Integer],
                    props.get("TIMESTAMP").asInstanceOf[String],
                    props.get("DAY_TYPE").asInstanceOf[String],
                    props.get("MISSING_DATA").asInstanceOf[Boolean],
                    reader.read(feature.getGeometry)
                )
            }
        }
        .toDF(colnames: _*)
}


In [None]:
// Input file parsed by readLineOfGeojson.
val testFile = "/home/iceberg/data/test_portotaxi.geojson"
val df_test = readLineOfGeojson(testFile)

// Register the DataFrame as a temp view so it can be queried through spark.sql.
df_test.createOrReplaceTempView("test_portotaxi")

// Print the first rows as a quick sanity check.
df_test.show()

In [None]:
// Print the column names and types of the parsed DataFrame.
df_test.printSchema()

In [None]:
// Count the rows visible through the test_portotaxi temp view.
spark.sql("SELECT * FROM test_portotaxi").count()

## 1. Create an Iceberg table with a geometry column


The `write.parquet.geometry.encoding` table property has three possible values:

- `nested-list`: the most efficient (usually produces smaller files and faster reads and writes)

- `wkb-bbox`

- `wkb`

In [None]:
// Drop any previous copy of the table so this cell can be re-run from scratch.
spark.sql("DROP TABLE IF EXISTS demo.db.test_portotaxi")


// Create the Iceberg table; its columns are declared in the same order as the
// test_portotaxi temp view. The geometry column is written with the
// 'nested-list' Parquet encoding.
spark.sql("""
CREATE TABLE IF NOT EXISTS demo.db.test_portotaxi 
(
  TRIP_ID LONG,
  CALL_TYPE STRING,
  ORIGIN_STAND STRING,
  TAXI_ID INTEGER,
  TIMESTAMP STRING,
  DAY_TYPE STRING,
  MISSING_DATA BOOLEAN,
  geometry GEOMETRY
)
USING iceberg
TBLPROPERTIES ('write.parquet.geometry.encoding' = 'nested-list')
""")

// Copy every row of the temp view into the Iceberg table. SELECT * maps columns
// by position, so the view's column order must match the table definition.
spark.sql("INSERT INTO demo.db.test_portotaxi SELECT * FROM test_portotaxi")

## 2. Table Summary

In [None]:
// Query the table's `snapshots` metadata table and show, for each snapshot,
// the record count and the total file size. The size is divided by 1024 twice
// to express it in MB (assumes 'total-files-size' is reported in bytes).
spark.sql("""SELECT
    summary['total-records'] as total_records,
    summary['total-files-size'] / 1024 / 1024 as file_size_in_mb 
    FROM demo.db.test_portotaxi.snapshots
""").show()

## 3. Run Spatial Query

In [None]:
// Query region as a WKT polygon; the ring is closed (first and last vertex are identical).
val bbox = "POLYGON ((-8.6079 41.1489, -8.6089 41.1472, -8.6066 41.1470, -8.6061 41.1483, -8.6079 41.1489))"

// Count the rows whose geometry lies within the query polygon.
spark.sql(s"""
SELECT count(*)
FROM demo.db.test_portotaxi
WHERE ST_Within(geometry, IcebergSTGeomFromText('$bbox'))
""").show()