In [3]:
from azureml.opendatasets import NycTlcGreen
from datetime import datetime,timedelta
from dateutil import parser,relativedelta
import pyspark.sql.functions as f

from pyspark.sql.functions import year, month, dayofmonth, dayofweek, hour,to_date
from pyspark.sql.types import StructType, StructField, IntegerType, TimestampType, DoubleType, StringType
from delta.tables import DeltaTable


StatementMeta(, d28bbdc7-b92a-47a4-bac3-d44af145618a, 7, Finished, Available)

In [4]:
%%sql 
-- Set the session config spark.sql.parquet.vorder.enabled to TRUE before any table is written.
-- (The commit info printed at the end of this notebook carries the tag VORDER=true.)
SET spark.sql.parquet.vorder.enabled=TRUE

StatementMeta(, d28bbdc7-b92a-47a4-bac3-d44af145618a, 8, Finished, Available)

<Spark SQL result set with 1 rows and 2 fields>

In [5]:
# Ingestion window for the taxi data, parsed into datetime objects.
# The chunked load loop further down walks from start_date up to end_date.
start_date = parser.parse('2014-05-01')  # first day to load
end_date = parser.parse('2018-06-06')    # upper bound of the load


StatementMeta(, d28bbdc7-b92a-47a4-bac3-d44af145618a, 9, Finished, Available)

In [6]:
# Explicit Spark schema for the green-taxi trip records. It is passed to
# spark.createDataFrame when each pandas chunk is converted, so column types
# are fixed up front instead of being inferred per chunk.
# Every column is declared nullable (third StructField argument is True).
schema = StructType([
    StructField(column_name, column_type, True)
    for column_name, column_type in [
        # Trip identification and timing
        ('vendorID', IntegerType()),
        ('lpepPickupDatetime', TimestampType()),
        ('lpepDropoffDatetime', TimestampType()),
        ('passengerCount', IntegerType()),
        ('tripDistance', DoubleType()),
        # Pickup / drop-off location
        ('puLocationId', StringType()),
        ('doLocationId', StringType()),
        ('pickupLongitude', DoubleType()),
        ('pickupLatitude', DoubleType()),
        ('dropoffLongitude', DoubleType()),
        ('dropoffLatitude', DoubleType()),
        # Rate and payment codes
        ('rateCodeID', IntegerType()),
        ('storeAndFwdFlag', StringType()),
        ('paymentType', IntegerType()),
        # Monetary amounts
        ('fareAmount', DoubleType()),
        ('extra', DoubleType()),
        ('mtaTax', DoubleType()),
        ('improvementSurcharge', StringType()),  # declared as string, unlike the other amounts
        ('tipAmount', DoubleType()),
        ('tollsAmount', DoubleType()),
        ('ehailFee', DoubleType()),
        ('totalAmount', DoubleType()),
        ('tripType', DoubleType()),
    ]
])

StatementMeta(, d28bbdc7-b92a-47a4-bac3-d44af145618a, 10, Finished, Available)

In [7]:

# Ingest the green-taxi trips in chunks of at most one month, from start_date
# up to end_date, appending every chunk to the Delta table "NYCGreenTaxi".
# Loading month by month keeps each pandas DataFrame small enough to convert
# to Spark in one go.
#
# NOTE: this cell advances the global `start_date` as it goes; once it has
# finished, start_date == end_date, so re-running only this cell does nothing.
# Re-running it after resetting start_date appends the same rows again.
while start_date < end_date:
    # Upper bound of this chunk: one month after its start, capped at end_date.
    chunk_end_date = min(start_date + relativedelta.relativedelta(months=1), end_date)
    is_last_chunk = chunk_end_date >= end_date

    # Load the data for this chunk into pandas.
    nyc_tlc = NycTlcGreen(start_date=start_date, end_date=chunk_end_date)
    nyc_tlc_df = nyc_tlc.to_pandas_dataframe()

    # Each chunk's end is the next chunk's start. The loader appears to treat
    # end_date as inclusive (every chunk also reads the following month's
    # partition), so a trip stamped exactly on the boundary would be appended
    # twice. Keep non-final chunks half-open [start, chunk_end) by dropping
    # rows at or after the boundary; the final chunk keeps its original bounds.
    # Written as a negated >= so rows with a missing pickup time are untouched.
    # Assumes lpepPickupDatetime is timezone-naive, like the parsed dates.
    if not is_last_chunk:
        on_or_after_boundary = nyc_tlc_df['lpepPickupDatetime'] >= chunk_end_date
        nyc_tlc_df = nyc_tlc_df[~on_or_after_boundary]

    nyc_tlc_df_spark = spark.createDataFrame(nyc_tlc_df, schema)

    # Derive calendar columns from the pickup timestamp.
    nyc_tlc_df_transformed = nyc_tlc_df_spark.withColumn('year', f.year('lpepPickupDatetime')) \
        .withColumn('month', f.month('lpepPickupDatetime')) \
        .withColumn('date', f.to_date('lpepPickupDatetime')) \
        .withColumn('day_of_month', f.dayofmonth('lpepPickupDatetime')) \
        .withColumn('day_of_week', f.dayofweek('lpepPickupDatetime')) \
        .withColumn('hour', f.hour('lpepPickupDatetime'))

    # Append the chunk to the Delta table, partitioned by month number.
    # NOTE(review): the partition key is the month only, so the same month of
    # different years lands in the same partition - confirm this is intended.
    # NOTE(review): "overwriteSchema" is documented for overwrite mode and
    # probably has no effect on an append - confirm before relying on it.
    nyc_tlc_df_transformed.write.format('delta').option("overwriteSchema", "true").partitionBy("month").saveAsTable("NYCGreenTaxi", mode="append")

    # Advance to the next chunk.
    start_date = chunk_end_date



StatementMeta(, d28bbdc7-b92a-47a4-bac3-d44af145618a, 11, Finished, Available)

[Info] read from /tmp/tmpluaskkqx/https%3A/%2Fazureopendatastorage.azurefd.net/nyctlc/green/puYear=2014/puMonth=5/part-00055-tid-4753095944193949832-fee7e113-666d-4114-9fcb-bcd3046479f3-2625-2.c000.snappy.parquet
[Info] read from /tmp/tmpluaskkqx/https%3A/%2Fazureopendatastorage.azurefd.net/nyctlc/green/puYear=2014/puMonth=6/part-00122-tid-4753095944193949832-fee7e113-666d-4114-9fcb-bcd3046479f3-2692-1.c000.snappy.parquet
[Info] read from /tmp/tmp0ui56kpo/https%3A/%2Fazureopendatastorage.azurefd.net/nyctlc/green/puYear=2014/puMonth=6/part-00122-tid-4753095944193949832-fee7e113-666d-4114-9fcb-bcd3046479f3-2692-1.c000.snappy.parquet
[Info] read from /tmp/tmp0ui56kpo/https%3A/%2Fazureopendatastorage.azurefd.net/nyctlc/green/puYear=2014/puMonth=7/part-00194-tid-4753095944193949832-fee7e113-666d-4114-9fcb-bcd3046479f3-2764-1.c000.snappy.parquet
[Info] read from /tmp/tmp4rlfevhr/https%3A/%2Fazureopendatastorage.azurefd.net/nyctlc/green/puYear=2014/puMonth=7/part-00194-tid-4753095944193949832



In [8]:
# List the entries under the 'Tables' path to check that the table folder was created.
# Left as a bare last expression so the notebook displays the returned FileInfo list.
mssparkutils.fs.ls('Tables')

StatementMeta(, d28bbdc7-b92a-47a4-bac3-d44af145618a, 12, Finished, Available)

[FileInfo(path=abfss://a035ebc3-8528-4e7d-b2ba-82f859ca46be@onelake.dfs.fabric.microsoft.com/4d2adeea-ee60-4833-9d0e-e72df7bec6b9/Tables/date_managed, name=date_managed, size=0),
 FileInfo(path=abfss://a035ebc3-8528-4e7d-b2ba-82f859ca46be@onelake.dfs.fabric.microsoft.com/4d2adeea-ee60-4833-9d0e-e72df7bec6b9/Tables/holidays_managed, name=holidays_managed, size=0),
 FileInfo(path=abfss://a035ebc3-8528-4e7d-b2ba-82f859ca46be@onelake.dfs.fabric.microsoft.com/4d2adeea-ee60-4833-9d0e-e72df7bec6b9/Tables/nycgreentaxi, name=nycgreentaxi, size=0)]

In [9]:
%%sql
-- Show the table-level details of the nycgreentaxi table written above.
DESCRIBE DETAIL nycgreentaxi

StatementMeta(, d28bbdc7-b92a-47a4-bac3-d44af145618a, 13, Finished, Available)

<Spark SQL result set with 1 rows and 13 fields>

In [15]:
# Print the first commit file of the table's Delta transaction log.

# Root folder of the table; despite its name, `tablename` holds the path of
# the table's _delta_log directory, not a table name.
tablebasepath="abfss://YoutubeDemo@onelake.dfs.fabric.microsoft.com/NYCTaxiLakeHouse.Lakehouse/Tables/nycgreentaxi"
tablename=f'{tablebasepath}/_delta_log'

# Collect the JSON files in the _delta_log directory. They are sorted by path
# so that index 0 does not depend on the order in which the file system lists
# them; this assumes the standard zero-padded Delta commit file names, for
# which lexical order is version order.
log_files = sorted(file.path for file in mssparkutils.fs.ls(tablename) if file.name.endswith(".json"))

if log_files:
    # Read the beginning of the first log file.
    data = mssparkutils.fs.head(log_files[0])

    # Print the contents: one JSON action per line.
    print(data)
else:
    print("No log files found.")

StatementMeta(, d28bbdc7-b92a-47a4-bac3-d44af145618a, 19, Finished, Available)

{"commitInfo":{"timestamp":1685536214283,"operation":"CREATE TABLE AS SELECT","operationParameters":{"isManaged":"true","description":null,"partitionBy":"[\"month\"]","properties":"{}"},"isolationLevel":"Serializable","isBlindAppend":true,"operationMetrics":{"numFiles":"2","numOutputRows":"1421543","numOutputBytes":"38331246"},"tags":{"VORDER":"true"},"engineInfo":"Apache-Spark/3.3.1.5.2-92314920 Delta-Lake/2.2.0.4","txnId":"fe79cbba-d2a5-407c-8c3e-533b162ba5f3"}}
{"protocol":{"minReaderVersion":1,"minWriterVersion":2}}
{"metaData":{"id":"dd6d3a95-a931-4814-a690-91d752ef4f45","format":{"provider":"parquet","options":{}},"schemaString":"{\"type\":\"struct\",\"fields\":[{\"name\":\"vendorID\",\"type\":\"integer\",\"nullable\":true,\"metadata\":{}},{\"name\":\"lpepPickupDatetime\",\"type\":\"timestamp\",\"nullable\":true,\"metadata\":{}},{\"name\":\"lpepDropoffDatetime\",\"type\":\"timestamp\",\"nullable\":true,\"metadata\":{}},{\"name\":\"passengerCount\",\"type\":\"integer\",\"nullable\