In [0]:
%python
# Install rdflib, which the following cell imports to build the RDF graph.
# NOTE(review): Databricks documents `%pip install` as the notebook-scoped way to
# install libraries; confirm whether the shell form `!pip` is intentional here.
!pip install rdflib
# Restart the Python process via the Databricks utility so the new package is importable.
dbutils.library.restartPython() 

In [0]:
%python
# Pull a 100-row sample of Yellow Taxi trips from the silver layer.
# NOTE(review): no ordering is applied before limit(), so which rows come back is
# presumably not guaranteed to be the same on every run — confirm if that matters.
df = (
    spark.table("silver_layer_yellow_taxi_including_features")
    .limit(100)
)

# Import rdflib for creating RDF graphs
import rdflib
from rdflib import Namespace, Literal, RDF, XSD

# Namespace under which every taxi-related resource and predicate URI is minted
TAXI = Namespace("http://bosch.org/nytaxi/")

# Start from an empty graph and register the "taxi" prefix so serialized output stays readable
g = rdflib.Graph()
g.bind("taxi", TAXI)

# URI factory shared by every entity kind (Trips, Locations, etc.)
def make_uri(entity_type, entity_id):
    """Build the URI of one entity inside the TAXI namespace.

    Args:
        entity_type: Kind of entity; becomes the first path segment (e.g. "Trip").
        entity_id: Identifier of the entity; becomes the second path segment.

    Returns:
        The TAXI namespace term for "<entity_type>/<entity_id>".
    """
    local_name = "{}/{}".format(entity_type, entity_id)
    return TAXI[local_name]

# Transform each row in the DataFrame into RDF triples.
# Each row becomes one Trip resource, linked to its pickup/dropoff Location resources
# and carrying typed literal properties. Null column values are skipped so that no
# triple is ever built from None.

# (predicate, source column, XSD datatype) for every literal-valued trip property
TRIP_LITERAL_PROPERTIES = [
    (TAXI.pickupTime, "tpep_pickup_datetime", XSD.dateTime),
    (TAXI.dropoffTime, "tpep_dropoff_datetime", XSD.dateTime),
    (TAXI.tripDistance, "trip_distance", XSD.float),
    (TAXI.fareAmount, "fare_amount", XSD.decimal),
    (TAXI.passengerCount, "passenger_count", XSD.int),
]


def _add_location(graph, trip_uri, predicate, location_id):
    """Type a Location resource and link it to a trip.

    Args:
        graph: RDF graph that receives the triples.
        trip_uri: URI of the trip being described.
        predicate: Property linking the trip to the location.
        location_id: Location identifier taken from the row; may be None.

    Returns:
        The location URI, or None when location_id is None (nothing is added).
    """
    if location_id is None:
        # Without this guard the trip would be linked to a bogus ".../Location/None" resource.
        return None
    location_uri = make_uri("Location", location_id)
    graph.add((location_uri, RDF.type, TAXI.Location))
    graph.add((trip_uri, predicate, location_uri))
    return location_uri


# NOTE(review): trip URIs are derived from the row's position in collect(); they are
# only stable if the DataFrame yields rows in the same order on every run — confirm.
for idx, row in enumerate(df.collect()):
    trip_uri = make_uri("Trip", idx + 1)  # 1-based trip identifier
    g.add((trip_uri, RDF.type, TAXI.Trip))

    # Link the trip to its pickup and dropoff locations (skipped when the id is null)
    pickup_loc_uri = _add_location(g, trip_uri, TAXI.pickupLocation, row["PULocationID"])
    dropoff_loc_uri = _add_location(g, trip_uri, TAXI.dropoffLocation, row["DOLocationID"])

    # Add literal properties for the trip (times, distance, fare, passengers)
    for predicate, column, datatype in TRIP_LITERAL_PROPERTIES:
        value = row[column]
        if value is None:
            # A null column has no meaningful typed lexical form; omit the triple.
            continue
        g.add((trip_uri, predicate, Literal(value, datatype=datatype)))

from pyspark.sql.types import StringType, StructField, StructType

# Flatten the graph into plain (subject, predicate, object) string tuples.
# NOTE(review): str() keeps only each term's text, so literal datatypes are not
# carried into the table — confirm downstream consumers do not need them.
triples_list = [tuple(str(term) for term in triple) for triple in g]

# Spark schema for the triples: three nullable string columns named s, p and o
schema = StructType(
    [StructField(column_name, StringType(), True) for column_name in ("s", "p", "o")]
)

# Materialize the triples as a Spark DataFrame
triples_df = spark.createDataFrame(triples_list, schema)

# Persist for downstream use, replacing any previous contents of the table
triples_df.write.mode("overwrite").saveAsTable("gold_yellow_taxi_rdf_triples")

# Sample SPARQL query: trips whose pickup time lies inside a fixed one-hour window,
# returned together with their pickup and dropoff times
query_hardcoded = """
PREFIX taxi: <http://bosch.org/nytaxi/>
PREFIX xsd: <http://www.w3.org/2001/XMLSchema#>

SELECT ?trip ?pickupTime ?dropoffTime
WHERE {
    ?trip taxi:pickupTime ?pickupTime .
    ?trip taxi:dropoffTime ?dropoffTime .
    FILTER (?pickupTime >= "2025-07-01T11:00:00"^^xsd:dateTime &&
            ?pickupTime <= "2025-07-01T12:00:00"^^xsd:dateTime)
}
"""

# Evaluate the query against the in-memory graph and print every matching row
hardcoded_matches = g.query(query_hardcoded)
for row in hardcoded_matches:
    print(row)
