In [6]:
from pyspark.sql import SparkSession
import pyspark.sql.functions as F
from pyspark.sql.types import IntegerType, StructType, StructField
from pyspark.sql.types import DoubleType, TimestampType, StringType


from IPython.display import *


data_path = '../data/'

spark = (
    
            SparkSession
            .builder
            .appName('SparkCourse') 
            .master('local[*]') 
            .config('spark.dynamicAllocation.enabled', 'false')
            .config('spark.sql.adaptive.enabled', 'false')
            
            .getOrCreate()
    )

sc = spark.sparkContext

spark

### PARTITIONING 

    1. In Memory -> chunks of data read in memory
    2. Disk Partitioning -> Writing output to disk by physically partitioning data based on columns 

    default:
    spark.default.parallelism = 4 
    spark.sql.files.maxPartitionBytes = 128 MB 

In [2]:
def getDataFrameStats(df, column_name):
    out_df = df.withColumn('partition_number', F.spark_partition_id()).groupBy('partition_number').agg(
        F.count('*').alias('record_count'),
        F.min(column_name).alias('min_column_value'),
        F.max(column_name).alias('max_column_value')
    ).orderBy('partition_number')

    return out_df
    

In [3]:
sc.defaultParallelism

8

In [4]:
taxi_zones_df = spark.read.option('inferSchema', 'true').csv('../data/TaxiZones.csv')

                                                                                

In [5]:
print('Partitions = ' + str(taxi_zones_df.rdd.getNumPartitions() ))
print('Record Count = ' + str(taxi_zones_df.count()))

Partitions = 1
Record Count = 265


In [6]:
yellow_taxi_df = (spark
                    .read
                    .option('header', 'true')
                    .option('inferShema', 'true')
                    .csv('../data/YellowTaxis_202210.csv')
)

In [7]:
spark.conf.set('spark.sql.files.maxPartitionBytes', '64m')

In [8]:
print('Partitions = ' + str(yellow_taxi_df.rdd.getNumPartitions()))
print('Record Count = ' + str(yellow_taxi_df.count()) )

Partitions = 8




Record Count = 3675412


                                                                                

In [9]:
yellow_taxi_df.show(10, truncate=False)

+--------+-----------------------------+-----------------------------+---------------+-------------+----------+------------------+------------+------------+------------+-----------+-----+-------+----------+------------+---------------------+------------+--------------------+-----------+
|VendorID|tpep_pickup_datetime         |tpep_dropoff_datetime        |passenger_count|trip_distance|RatecodeID|store_and_fwd_flag|PULocationID|DOLocationID|payment_type|fare_amount|extra|mta_tax|tip_amount|tolls_amount|improvement_surcharge|total_amount|congestion_surcharge|airport_fee|
+--------+-----------------------------+-----------------------------+---------------+-------------+----------+------------------+------------+------------+------------+-----------+-----+-------+----------+------------+---------------------+------------+--------------------+-----------+
|1       |2022-10-01T05:33:41.000+05:30|2022-10-01T05:48:39.000+05:30|1.0            |1.7          |1.0       |N                 |249   

In [10]:
yellow_taxi_df_rp = yellow_taxi_df.repartition(14) ## eound robin

yellow_taxi_df_rp_range = yellow_taxi_df.repartitionByRange(14, 'PULocationID') ## reduce skewness

yellow_taxi_df_coalesce = yellow_taxi_df.coalesce(1)

In [11]:
getDataFrameStats(yellow_taxi_df_coalesce, 'PULocationID').show(10)

[Stage 8:>                                                          (0 + 1) / 1]

+----------------+------------+----------------+----------------+
|partition_number|record_count|min_column_value|max_column_value|
+----------------+------------+----------------+----------------+
|               0|     3675412|               1|              98|
+----------------+------------+----------------+----------------+



                                                                                

##### Repartition Method -> increse number of partition 
    1. Wide transformation -> shuffling 
    2. Partition Options 
        1. Round robin -> creates equal sized partitions 
        2. Hash -> co-locates data based on columns 
        3. reduce skewness -> some partitions have much more data tna others reduce partiotion size 

##### Coalesce method decrease number of partitions: 
    1. decrease partitions

### MEMORY MANAGEMENT 

    1. JVM HEAP memory is used for Spark activity
    2. Non-heap memory(overhead memory) - non JVM processes like VM overheads, buffers, udfs etc.

##### Driver container 

    spark.driver.cores =2
    spark.driver.memory =8g
    spark.driver.memoryOverhead = max(10% of memory or 384 MB)

##### Executor container

    spark.executor.cores =4
    spark.executor.memory =14g
    spark.memory.fraction =(14_000 - 30) * 0.6

#### Executor JVM HEAP
    1. Reserved Memory = 300m
    2. Spark Memory = spark.memory.fraction(for task execution, caching, shufling df operations etc.)
        2.1 Storage Memory = Spark Memory/2(storing cached data)
        2.2 Execution Memory = Spark Meory/2(df ops, task execution, shuffling...)
    3. User Memory =(14_000 - 30) * 0.4(for stroring data structures created by user, metadata, RDD operations, UDFs, etc)


#### CACHING METHODS
    -> Lazy operation
    -> Uses LRU(Least-Recently-Used)
    cache() -> no argumets
    persist()
        MEMORY_ONLY -> Partitions that can fit in memory are cached; others are recomputed each time
        MEMORY_AND_DISK -> Partitions that can fit in memory are cached others are spilled to local disk 
        of Worker
        DISK_ONLY:Onlys stored on disk; pulled when required
    unpesist() -> not required

In [12]:
yellow_taxi_df = spark.read.option('header', 'true').option('inferSchema', 'true').csv('../data/YellowTaxis_202210.csv')

                                                                                

In [13]:
yellow_taxi_df.printSchema()

root
 |-- VendorID: integer (nullable = true)
 |-- tpep_pickup_datetime: timestamp (nullable = true)
 |-- tpep_dropoff_datetime: timestamp (nullable = true)
 |-- passenger_count: double (nullable = true)
 |-- trip_distance: double (nullable = true)
 |-- RatecodeID: double (nullable = true)
 |-- store_and_fwd_flag: string (nullable = true)
 |-- PULocationID: integer (nullable = true)
 |-- DOLocationID: integer (nullable = true)
 |-- payment_type: integer (nullable = true)
 |-- fare_amount: double (nullable = true)
 |-- extra: double (nullable = true)
 |-- mta_tax: double (nullable = true)
 |-- tip_amount: double (nullable = true)
 |-- tolls_amount: double (nullable = true)
 |-- improvement_surcharge: double (nullable = true)
 |-- total_amount: double (nullable = true)
 |-- congestion_surcharge: double (nullable = true)
 |-- airport_fee: double (nullable = true)



In [14]:
yellow_taxi_df_grouped = (
    yellow_taxi_df
        .dropDuplicates()
        .groupBy('PULocationID')
        .agg(F.sum('total_amount'))
)


In [15]:
yellow_taxi_df_grouped.show(10)

                                                                                

+------------+------------------+
|PULocationID| sum(total_amount)|
+------------+------------------+
|         148| 795087.6300000009|
|         243|16739.699999999993|
|          31| 935.1500000000001|
|         137| 762800.5900000014|
|          85|3172.5599999999995|
|         251|            521.63|
|          65|          80190.23|
|         255| 51031.77999999998|
|          53|1601.2899999999993|
|         133|3281.4800000000014|
+------------+------------------+
only showing top 10 rows



In [16]:
yellow_taxi_df_grouped.write.option('header', 'true').option('dateFormat', 'yyyy-MM-dd HH:mm:ss.S').mode('overwrite').csv('../data/results/cached_test')

                                                                                

In [17]:
import pyspark

In [18]:
yellow_taxi_df_grouped.persist(pyspark.StorageLevel.MEMORY_AND_DISK)

DataFrame[PULocationID: int, sum(total_amount): double]

In [19]:
yellow_taxi_df_grouped.write.option('header', 'true').option('dateFormat', 'yyyy-MM-dd HH:mm:ss.S').mode('overwrite').csv('../data/results/cached_test_1')

                                                                                

In [20]:
yellow_taxi_df_grouped.unpersist()

DataFrame[PULocationID: int, sum(total_amount): double]

#### JOIN STRATEGIES AND BROADCAST JOINS

##### 1. BroadCasting

In [3]:
spark.conf.get('spark.sql.autoBroadcastJoinThreshold')

#spark.conf.get('spark.sql.autoBroadcastJoinThreshold', '-1') disable


'10485760b'

In [12]:
yellow_taxi_schema = StructType([
        StructField('VendorId', IntegerType(), True),
        StructField('PickupTime', TimestampType(), True),
        StructField('DropTime', TimestampType(), True),
        StructField('PassengerCount', DoubleType(), True),
        StructField('TripDistance', DoubleType(), True),
        StructField('RateCodeId', DoubleType(), True),
        StructField('StoreAndFwdFlag', StringType(), True),
        StructField('PickupLocationId', IntegerType(), True),
        StructField('DropLocationId', IntegerType(), True),
        StructField('PaymentType', IntegerType(), True),
        StructField('FareAmount', DoubleType(), True),
        StructField('Extra', DoubleType(), True),
        StructField('MtaTax', DoubleType(), True),
        StructField('TipAmount', DoubleType(), True),
        StructField('TollsAmount', DoubleType(), True),
        StructField('ImprovementSurchange', DoubleType(), True),
        StructField('TotalAmount', DoubleType(), True),
        StructField('CongestionSurcharge', DoubleType(), True),
        StructField('AirportFee', DoubleType(), True),
])
yellow_taxi_df.count()

                                                                                

3675412

In [10]:
yellow_taxi_df = spark.read.option('header', 'true').schema(yellow_taxi_schema).csv('../data/YellowTaxis_202210.csv')

In [11]:
yellow_taxi_df.show(10)

+--------+-------------------+-------------------+--------------+------------+----------+---------------+----------------+--------------+-----------+----------+-----+------+---------+-----------+--------------------+-----------+-------------------+----------+
|VendorId|         PickupTime|           DropTime|PassengerCount|TripDistance|RateCodeId|StoreAndFwdFlag|PickupLocationId|DropLocationId|PaymentType|FareAmount|Extra|MtaTax|TipAmount|TollsAmount|ImprovementSurchange|TotalAmount|CongestionSurcharge|AirportFee|
+--------+-------------------+-------------------+--------------+------------+----------+---------------+----------------+--------------+-----------+----------+-----+------+---------+-----------+--------------------+-----------+-------------------+----------+
|       1|2022-10-01 03:03:41|2022-10-01 03:18:39|           1.0|         1.7|       1.0|              N|             249|           107|          1|       9.5|  3.0|   0.5|     2.65|        0.0|                 0.3|    

23/09/17 16:01:56 WARN CSVHeaderChecker: CSV header does not conform to the schema.
 Header: VendorID, tpep_pickup_datetime, tpep_dropoff_datetime, passenger_count, trip_distance, RatecodeID, store_and_fwd_flag, PULocationID, DOLocationID, payment_type, fare_amount, extra, mta_tax, tip_amount, tolls_amount, improvement_surcharge, total_amount, congestion_surcharge, airport_fee
 Schema: VendorId, PickupTime, DropTime, PassengerCount, TripDistance, RateCodeId, StoreAndFwdFlag, PickupLocationId, DropLocationId, PaymentType, FareAmount, Extra, MtaTax, TipAmount, TollsAmount, ImprovementSurchange, TotalAmount, CongestionSurcharge, AirportFee
Expected: PickupTime but found: tpep_pickup_datetime
CSV file: file:///home/raddy/projects/DataLab/data/YellowTaxis_202210.csv


In [16]:
taxi_zones_schema = "PickupLocationId INT, Borough STRING, Zone STRING, ServiceZone STRING"

taxi_zones_df = spark.read.schema(taxi_zones_schema).csv('../data/TaxiZones.csv')

taxi_zones_df.count()

265

In [15]:
taxi_zones_df.show(10)

+----------------+-------------+--------------------+-----------+
|PickupLocationId|      Borough|                Zone|ServiceZone|
+----------------+-------------+--------------------+-----------+
|               1|          EWR|      Newark Airport|        EWR|
|               2|       Queens|         Jamaica Bay|  Boro Zone|
|               3|        Bronx|Allerton/Pelham G...|  Boro Zone|
|               4|    Manhattan|       Alphabet City|Yellow Zone|
|               5|Staten Island|       Arden Heights|  Boro Zone|
|               6|Staten Island|Arrochar/Fort Wad...|  Boro Zone|
|               7|       Queens|             Astoria|  Boro Zone|
|               8|       Queens|        Astoria Park|  Boro Zone|
|               9|       Queens|          Auburndale|  Boro Zone|
|              10|       Queens|        Baisley Park|  Boro Zone|
+----------------+-------------+--------------------+-----------+
only showing top 10 rows



In [22]:
joined_df_1 = yellow_taxi_df.join( F.broadcast(taxi_zones_df), yellow_taxi_df.PickupLocationId == taxi_zones_df.PickupLocationId)

In [23]:
joined_df_1.show(10, truncate=False)

+--------+-------------------+-------------------+--------------+------------+----------+---------------+----------------+--------------+-----------+----------+-----+------+---------+-----------+--------------------+-----------+-------------------+----------+----------------+---------+------------------------------+-----------+
|VendorId|PickupTime         |DropTime           |PassengerCount|TripDistance|RateCodeId|StoreAndFwdFlag|PickupLocationId|DropLocationId|PaymentType|FareAmount|Extra|MtaTax|TipAmount|TollsAmount|ImprovementSurchange|TotalAmount|CongestionSurcharge|AirportFee|PickupLocationId|Borough  |Zone                          |ServiceZone|
+--------+-------------------+-------------------+--------------+------------+----------+---------------+----------------+--------------+-----------+----------+-----+------+---------+-----------+--------------------+-----------+-------------------+----------+----------------+---------+------------------------------+-----------+
|1       |

23/09/17 16:14:43 WARN CSVHeaderChecker: CSV header does not conform to the schema.
 Header: VendorID, tpep_pickup_datetime, tpep_dropoff_datetime, passenger_count, trip_distance, RatecodeID, store_and_fwd_flag, PULocationID, DOLocationID, payment_type, fare_amount, extra, mta_tax, tip_amount, tolls_amount, improvement_surcharge, total_amount, congestion_surcharge, airport_fee
 Schema: VendorId, PickupTime, DropTime, PassengerCount, TripDistance, RateCodeId, StoreAndFwdFlag, PickupLocationId, DropLocationId, PaymentType, FareAmount, Extra, MtaTax, TipAmount, TollsAmount, ImprovementSurchange, TotalAmount, CongestionSurcharge, AirportFee
Expected: PickupTime but found: tpep_pickup_datetime
CSV file: file:///home/raddy/projects/DataLab/data/YellowTaxis_202210.csv


##### 2. Bucketing

In [24]:
yellow_taxi_df.createOrReplaceTempView('YellowTaxis1_Unbucketed')
yellow_taxi_df.createOrReplaceTempView('YellowTaxis2_Unbucketed')

In [38]:
spark.sql("""
    SELECT * FROM 
          YellowTaxis1_Unbucketed b1
          JOIN YellowTaxis2_Unbucketed b2 ON
          b1.DropLocationId = b2.PickupLocationId
          
          """).show(10)

23/09/17 17:14:52 WARN CSVHeaderChecker: CSV header does not conform to the schema.
 Header: VendorID, tpep_pickup_datetime, tpep_dropoff_datetime, passenger_count, trip_distance, RatecodeID, store_and_fwd_flag, PULocationID, DOLocationID, payment_type, fare_amount, extra, mta_tax, tip_amount, tolls_amount, improvement_surcharge, total_amount, congestion_surcharge, airport_fee
 Schema: VendorId, PickupTime, DropTime, PassengerCount, TripDistance, RateCodeId, StoreAndFwdFlag, PickupLocationId, DropLocationId, PaymentType, FareAmount, Extra, MtaTax, TipAmount, TollsAmount, ImprovementSurchange, TotalAmount, CongestionSurcharge, AirportFee
Expected: PickupTime but found: tpep_pickup_datetime
CSV file: file:///home/raddy/projects/DataLab/data/YellowTaxis_202210.csv
23/09/17 17:15:02 WARN CSVHeaderChecker: CSV header does not conform to the schema.
 Header: VendorID, tpep_pickup_datetime, tpep_dropoff_datetime, passenger_count, trip_distance, RatecodeID, store_and_fwd_flag, PULocationID, DO

+--------+-------------------+-------------------+--------------+------------+----------+---------------+----------------+--------------+-----------+----------+-----+------+---------+-----------+--------------------+-----------+-------------------+----------+--------+-------------------+-------------------+--------------+------------+----------+---------------+----------------+--------------+-----------+----------+-----+------+---------+-----------+--------------------+-----------+-------------------+----------+
|VendorId|         PickupTime|           DropTime|PassengerCount|TripDistance|RateCodeId|StoreAndFwdFlag|PickupLocationId|DropLocationId|PaymentType|FareAmount|Extra|MtaTax|TipAmount|TollsAmount|ImprovementSurchange|TotalAmount|CongestionSurcharge|AirportFee|VendorId|         PickupTime|           DropTime|PassengerCount|TripDistance|RateCodeId|StoreAndFwdFlag|PickupLocationId|DropLocationId|PaymentType|FareAmount|Extra|MtaTax|TipAmount|TollsAmount|ImprovementSurchange|TotalAmo

                                                                                

In [31]:
(yellow_taxi_df.write
 .bucketBy(4,'DropLocationId')
 .option('header', 'true')
 .option('dateFormat', 'yyyy-MM-dd HH:mm:ss.S')
 .mode('overwrite').format('csv')
 .saveAsTable('YelloTaxis_PickupBukcet1'))

23/09/17 17:11:29 WARN CSVHeaderChecker: CSV header does not conform to the schema.
 Header: VendorID, tpep_pickup_datetime, tpep_dropoff_datetime, passenger_count, trip_distance, RatecodeID, store_and_fwd_flag, PULocationID, DOLocationID, payment_type, fare_amount, extra, mta_tax, tip_amount, tolls_amount, improvement_surcharge, total_amount, congestion_surcharge, airport_fee
 Schema: VendorId, PickupTime, DropTime, PassengerCount, TripDistance, RateCodeId, StoreAndFwdFlag, PickupLocationId, DropLocationId, PaymentType, FareAmount, Extra, MtaTax, TipAmount, TollsAmount, ImprovementSurchange, TotalAmount, CongestionSurcharge, AirportFee
Expected: PickupTime but found: tpep_pickup_datetime
CSV file: file:///home/raddy/projects/DataLab/data/YellowTaxis_202210.csv
                                                                                

In [32]:
(yellow_taxi_df.write
 .bucketBy(4,'PickupLocationId')
 .option('header', 'true')
 .option('dateFormat', 'yyyy-MM-dd HH:mm:ss.S')
 .mode('overwrite').format('csv')
 .saveAsTable('YelloTaxis_PickupBukcet2'))

23/09/17 17:11:52 WARN CSVHeaderChecker: CSV header does not conform to the schema.
 Header: VendorID, tpep_pickup_datetime, tpep_dropoff_datetime, passenger_count, trip_distance, RatecodeID, store_and_fwd_flag, PULocationID, DOLocationID, payment_type, fare_amount, extra, mta_tax, tip_amount, tolls_amount, improvement_surcharge, total_amount, congestion_surcharge, airport_fee
 Schema: VendorId, PickupTime, DropTime, PassengerCount, TripDistance, RateCodeId, StoreAndFwdFlag, PickupLocationId, DropLocationId, PaymentType, FareAmount, Extra, MtaTax, TipAmount, TollsAmount, ImprovementSurchange, TotalAmount, CongestionSurcharge, AirportFee
Expected: PickupTime but found: tpep_pickup_datetime
CSV file: file:///home/raddy/projects/DataLab/data/YellowTaxis_202210.csv
                                                                                

In [37]:
spark.sql( """
    SELECT * FROM 
    YelloTaxis_PickupBukcet1 b1 JOIN
    YelloTaxis_PickupBukcet2 b2 on b1.DropLocationId = b2.PickupLocationId
""").show(10)

[Stage 18:>                                                         (0 + 1) / 1]

+--------+-------------------+-------------------+--------------+------------+----------+---------------+----------------+--------------+-----------+----------+-----+------+---------+-----------+--------------------+-----------+-------------------+----------+--------+-------------------+-------------------+--------------+------------+----------+---------------+----------------+--------------+-----------+----------+-----+------+---------+-----------+--------------------+-----------+-------------------+----------+
|VendorId|         PickupTime|           DropTime|PassengerCount|TripDistance|RateCodeId|StoreAndFwdFlag|PickupLocationId|DropLocationId|PaymentType|FareAmount|Extra|MtaTax|TipAmount|TollsAmount|ImprovementSurchange|TotalAmount|CongestionSurcharge|AirportFee|VendorId|         PickupTime|           DropTime|PassengerCount|TripDistance|RateCodeId|StoreAndFwdFlag|PickupLocationId|DropLocationId|PaymentType|FareAmount|Extra|MtaTax|TipAmount|TollsAmount|ImprovementSurchange|TotalAmo

                                                                                

#### DYNAMIC RESOURCE ALLOCATION
    spark.dynamicAllocation:
        .enabled=true
        .shuffleTracking.enabled=true
        .minExecutors=0
        .maxExecutors=5
        .schedulerBacklogTimeout=1s
        .executorIdleTimeout=60s