In [0]:
%python
# Install rdflib through a shell escape, then restart the Python process so the
# freshly installed package can be imported by the cells that follow.
# NOTE(review): on Databricks, `%pip install` is the documented notebook-scoped
# way to install libraries; confirm whether the `!pip` form is intentional here.
!pip install rdflib
dbutils.library.restartPython() 

In [0]:
%python

import rdflib
from rdflib import Namespace, Literal, RDF, XSD

# Work on a 100-row sample of the silver-layer taxi table.
df = spark.table("silver_layer_yellow_taxi_including_features").limit(100)

# Namespace under which every trip/location resource and predicate is minted.
TAXI = Namespace("http://bosch.org/nytaxi/")

# In-memory RDF graph that receives the generated triples; the "taxi" prefix
# is registered so the namespace has a short name when the graph is serialized.
g = rdflib.Graph()
g.bind("taxi", TAXI)

def make_uri(entity_type, entity_id):
    """Return the TAXI-namespace URI ``<namespace><entity_type>/<entity_id>``.

    ``entity_id`` is interpolated with its default string formatting, so any
    value usable in an f-string is accepted.
    """
    local_name = f"{entity_type}/{entity_id}"
    return TAXI[local_name]

# Transform the sampled DataFrame rows into RDF triples, one Trip resource per row.

# (predicate, source column) for properties that link a trip to a Location resource.
trip_location_columns = [
    (TAXI.pickupLocation, "PULocationID"),
    (TAXI.dropoffLocation, "DOLocationID"),
]

# (predicate, source column, XSD datatype) for literal-valued trip properties.
trip_literal_columns = [
    (TAXI.pickupTime, "tpep_pickup_datetime", XSD.dateTime),
    (TAXI.dropoffTime, "tpep_dropoff_datetime", XSD.dateTime),
    (TAXI.tripDistance, "trip_distance", XSD.float),
    (TAXI.fareAmount, "fare_amount", XSD.decimal),
    (TAXI.passengerCount, "passenger_count", XSD.int),
]

for idx, row in enumerate(df.collect()):
    # Trip identifiers are the 1-based position of the row in the collected
    # sample, not a key taken from the source table.
    trip_uri = make_uri("Trip", idx + 1)
    g.add((trip_uri, RDF.type, TAXI.Trip))

    for predicate, column in trip_location_columns:
        location_id = row[column]
        if location_id is None:
            # A null ID would otherwise mint a bogus ".../Location/None" resource.
            continue
        location_uri = make_uri("Location", location_id)
        g.add((location_uri, RDF.type, TAXI.Location))
        g.add((trip_uri, predicate, location_uri))

    for predicate, column, datatype in trip_literal_columns:
        value = row[column]
        if value is None:
            # Literal(None, datatype=...) would be stored with the lexical form
            # "None", which is not a valid value for any of these datatypes;
            # omit the triple instead of writing an ill-typed literal.
            continue
        g.add((trip_uri, predicate, Literal(value, datatype=datatype)))

from pyspark.sql.types import StringType, StructType, StructField

# Three nullable string columns: subject, predicate, object.
schema = StructType(
    [StructField(column, StringType(), True) for column in ("s", "p", "o")]
)

# Flatten the graph into plain (s, p, o) string tuples; str() keeps only the
# textual form of each term.
triples_list = [tuple(str(term) for term in triple) for triple in g]

# Materialize the triples as a Spark DataFrame and persist them, replacing any
# previous contents of the target table.
triples_df = spark.createDataFrame(triples_list, schema)
(
    triples_df.write
    .mode("overwrite")
    .saveAsTable("gold_yellow_taxi_rdf_triples")
)

# SPARQL query with a fixed time window: selects every trip whose pickup time
# lies between 11:00 and 12:00 (inclusive) on 2025-07-01, together with its
# pickup and dropoff timestamps.
query_hardcoded = """
PREFIX taxi: <http://bosch.org/nytaxi/>
PREFIX xsd: <http://www.w3.org/2001/XMLSchema#>

SELECT ?trip ?pickupTime ?dropoffTime
WHERE {
    ?trip taxi:pickupTime ?pickupTime .
    ?trip taxi:dropoffTime ?dropoffTime .
    FILTER (?pickupTime >= "2025-07-01T11:00:00"^^xsd:dateTime &&
            ?pickupTime <= "2025-07-01T12:00:00"^^xsd:dateTime)
}
"""

# Evaluate the query against the in-memory graph and print each result row.
query_results = g.query(query_hardcoded)
for row in query_results:
    print(row)
