In [27]:
from pyspark.sql import SparkSession
from pyspark.sql.types import *
from pyspark.sql.functions import *

from delta import *

spark = (
    SparkSession
    .builder.appName('Delta')
    .master('local[*]')
    .config('spark.dynamicAllocation.enabled', 'false')
    .config('spark.jars.packages', 'io.delta:delta-core_2.12:2.4.0')
    .config('spark.sql.extensions', 'io.delta.sql.DeltaSparkSessionExtension')
    .config('spark.sql.catalog.spark_catalog',
            'org.apache.spark.sql.delta.catalog.DeltaCatalog')

    .getOrCreate()            
)

sc = spark.sparkContext

In [28]:
import pyspark
pyspark.__version__

'3.4.0'

In [29]:
yello_taxi_schema = StructType([
    StructField('VendorID', IntegerType(), True),
    StructField('tpep_pickup_datetime', TimestampType(), True),
    StructField('tpep_dropoff_datetime', TimestampType(), True),
    StructField('passenger_count', DoubleType(), True),
    StructField('trip_distance', DoubleType(), True),
    StructField('RatecodeID', DoubleType(), True),
    StructField('store_and_fwd_flag', StringType(), True),
    StructField('PULocationID', IntegerType(), True),
    StructField('DOLocationID', IntegerType(), True),
    StructField('payment_type', IntegerType(), True),
    StructField('fare_amount', DoubleType(), True),
    StructField('extra', DoubleType(), True),
    StructField('mta_tax', DoubleType(), True),
    StructField('tip_amount', DoubleType(), True),
    StructField('tolls_amount', DoubleType(), True),
    StructField('improvement_surcharge', DoubleType(), True),
    StructField('total_amount', DoubleType(), True),
    StructField('congestion_surcharge', DoubleType(), True),
    StructField('airport_fee', DoubleType(), True),
])

In [30]:
yellow_taxi_df = (
    spark
        .read
        .option('header', 'true')
        .schema(yello_taxi_schema)
        .csv('../data/YellowTaxis_202210.csv')
)

In [31]:
yellow_taxi_df.show(10, truncate=False)

+--------+--------------------+---------------------+---------------+-------------+----------+------------------+------------+------------+------------+-----------+-----+-------+----------+------------+---------------------+------------+--------------------+-----------+
|VendorID|tpep_pickup_datetime|tpep_dropoff_datetime|passenger_count|trip_distance|RatecodeID|store_and_fwd_flag|PULocationID|DOLocationID|payment_type|fare_amount|extra|mta_tax|tip_amount|tolls_amount|improvement_surcharge|total_amount|congestion_surcharge|airport_fee|
+--------+--------------------+---------------------+---------------+-------------+----------+------------------+------------+------------+------------+-----------+-----+-------+----------+------------+---------------------+------------+--------------------+-----------+
|1       |2022-10-01 03:03:41 |2022-10-01 03:18:39  |1.0            |1.7          |1.0       |N                 |249         |107         |1           |9.5        |3.0  |0.5    |2.65     

In [32]:
spark.sql("""
    CREATE DATABASE IF NOT EXISTS TaxisDB
          
          """)

DataFrame[]

In [33]:
(yellow_taxi_df
    .write
    .mode('overwrite')
    .partitionBy('VendorID')
    .format('parquet')
    .option('path', '/home/raddy/projects/DataLab/spark-tutorials/data/output/YelloTaxis.parquet')
    .saveAsTable('TaxisDB.YellowTaxisParquet')
)

23/11/11 20:02:23 WARN MemoryManager: Total allocation exceeds 95.00% (1,020,054,720 bytes) of heap memory
Scaling row group sizes to 95.00% for 8 writers
23/11/11 20:02:24 WARN MemoryManager: Total allocation exceeds 95.00% (1,020,054,720 bytes) of heap memory
Scaling row group sizes to 95.00% for 8 writers
23/11/11 20:02:24 WARN MemoryManager: Total allocation exceeds 95.00% (1,020,054,720 bytes) of heap memory
Scaling row group sizes to 95.00% for 8 writers
23/11/11 20:02:24 WARN MemoryManager: Total allocation exceeds 95.00% (1,020,054,720 bytes) of heap memory
Scaling row group sizes to 95.00% for 8 writers
23/11/11 20:02:25 WARN MemoryManager: Total allocation exceeds 95.00% (1,020,054,720 bytes) of heap memory
Scaling row group sizes to 95.00% for 8 writers
23/11/11 20:02:25 WARN MemoryManager: Total allocation exceeds 95.00% (1,020,054,720 bytes) of heap memory
Scaling row group sizes to 95.00% for 8 writers
23/11/11 20:02:25 WARN MemoryManager: Total allocation exceeds 95.00% 

In [34]:


(yellow_taxi_df
    .write
    .mode('overwrite')
    .partitionBy('VendorID')
    .format('delta')
    .option('path', '/home/raddy/projects/DataLab/spark-tutorials/data/output/YelloTaxis.delta')
    .saveAsTable('TaxisDB.YellowTaxis')
)

23/11/11 20:02:44 WARN MemoryManager: Total allocation exceeds 95.00% (1,020,054,720 bytes) of heap memory
Scaling row group sizes to 95.00% for 8 writers
23/11/11 20:02:45 WARN MemoryManager: Total allocation exceeds 95.00% (1,020,054,720 bytes) of heap memory
Scaling row group sizes to 95.00% for 8 writers
23/11/11 20:02:45 WARN MemoryManager: Total allocation exceeds 95.00% (1,020,054,720 bytes) of heap memory
Scaling row group sizes to 95.00% for 8 writers
23/11/11 20:02:45 WARN MemoryManager: Total allocation exceeds 95.00% (1,020,054,720 bytes) of heap memory
Scaling row group sizes to 95.00% for 8 writers
23/11/11 20:02:46 WARN MemoryManager: Total allocation exceeds 95.00% (1,020,054,720 bytes) of heap memory
Scaling row group sizes to 95.00% for 8 writers
23/11/11 20:02:46 WARN MemoryManager: Total allocation exceeds 95.00% (1,020,054,720 bytes) of heap memory
Scaling row group sizes to 95.00% for 8 writers
23/11/11 20:02:49 WARN MemoryManager: Total allocation exceeds 95.00% 

In [35]:
spark.sql("""
SELECT * FROM TaxisDB.YellowTaxis

""").show(10, truncate=False)

+--------+--------------------+---------------------+---------------+-------------+----------+------------------+------------+------------+------------+-----------+-----+-------+----------+------------+---------------------+------------+--------------------+-----------+
|VendorID|tpep_pickup_datetime|tpep_dropoff_datetime|passenger_count|trip_distance|RatecodeID|store_and_fwd_flag|PULocationID|DOLocationID|payment_type|fare_amount|extra|mta_tax|tip_amount|tolls_amount|improvement_surcharge|total_amount|congestion_surcharge|airport_fee|
+--------+--------------------+---------------------+---------------+-------------+----------+------------------+------------+------------+------------+-----------+-----+-------+----------+------------+---------------------+------------+--------------------+-----------+
|1       |2022-10-29 06:24:16 |2022-10-29 06:47:49  |1.0            |5.3          |1.0       |N                 |113         |151         |1           |18.0       |3.0  |0.5    |1.0      

In [36]:
spark.sql("""
DESCRIBE HISTORY TaxisDB.YellowTaxis
""").show(truncate=False)

+-------+-----------------------+------+--------+---------------------------------+----------------------------------------------------------------------------------------+----+--------+---------+-----------+--------------+-------------+----------------------------------------------------------------------+------------+-----------------------------------+
|version|timestamp              |userId|userName|operation                        |operationParameters                                                                     |job |notebook|clusterId|readVersion|isolationLevel|isBlindAppend|operationMetrics                                                      |userMetadata|engineInfo                         |
+-------+-----------------------+------+--------+---------------------------------+----------------------------------------------------------------------------------------+----+--------+---------+-----------+--------------+-------------+-----------------------------------------------

In [37]:
spark.sql(
"""
CREATE TABLE TaxisDB.YellowTaxisNew
(
VendorId    INT     COMMENT 'Vendor providing ride',

PickupTime  TIMESTAMP,
DropTime    TIMESTAMP,

PickupLocationId    INT NOT NULL,
DropLocationID  INT,

PassengerCount  DOUBLE,
TripDistance    DOUBLE,

RateCodeId  DOUBLE,
StoreAndFwdFlag STRING,
PaymentType INT,

FareAmount  DOUBLE,
Extra       DOUBLE,
MtaTax      DOUBLE,
TipAmount   DOUBLE,
TollsAmount DOUBLE,
ImprovementSurcharge DOUBLE,
TotalAmount DOUBLE,
CongestionSurcharge DOUBLE,
AirportFee DOUBLE

)    

USING DELTA

LOCATION "/home/raddy/projects/DataLab/spark-tutorials/data/output/YelloTaxisNew.delta"

PARTITIONED BY (VendorId)

COMMENT 'This table stores ride information for Yellow Taxis'

"""
)

AnalysisException: [TABLE_OR_VIEW_ALREADY_EXISTS] Cannot create table or view `TaxisDB`.`YellowTaxisNew` because it already exists.
Choose a different name, drop or replace the existing object, or add the IF NOT EXISTS clause to tolerate pre-existing objects.

In [38]:
spark.sql(
"""
DESCRIBE TABLE EXTENDED TaxisDB.YellowTaxisNew
"""
).show(50, truncate=False)

+----------------------------+---------------------------------------------------------------------------------+---------------------+
|col_name                    |data_type                                                                        |comment              |
+----------------------------+---------------------------------------------------------------------------------+---------------------+
|VendorId                    |int                                                                              |Vendor providing ride|
|PickupTime                  |timestamp                                                                        |null                 |
|DropTime                    |timestamp                                                                        |null                 |
|PickupLocationId            |int                                                                              |null                 |
|DropLocationID              |int                      

In [42]:
spark.sql("""
        SELECT INPUT_FILE_NAME()
          ,VendorID
          ,PULocationID
          ,passenger_count

          FROM TaxisDB.YellowTaxis
          WHERE VendorId=3

          """).show(10, truncate=False)

+-----------------+--------+------------+---------------+
|input_file_name()|VendorID|PULocationID|passenger_count|
+-----------------+--------+------------+---------------+
+-----------------+--------+------------+---------------+

