In [15]:
from pyspark.sql import SparkSession

# Create (or reuse) a Hive-enabled Spark session on YARN with a fixed
# allocation of 3 executors x 3 cores x 4g; dynamic allocation is turned off.
spark = (
    SparkSession.builder
    .appName("Spark Read Cmpressed Files")
    .master("yarn")
    .config("spark.ui.port", "0")
    .config("spark.executor.instances", "3")
    .config("spark.executor.memory", "4g")
    .config("spark.executor.cores", "3")
    .config("spark.dynamicAllocation.enabled", "False")
    .enableHiveSupport()
    .getOrCreate()
)

In [14]:
# Intentionally a no-op.
# This cell used to call spark.stop(). Its execution count (In [14]) shows it
# was run *before* the session-creation cell (In [15]) to tear down an earlier
# session. Executed in file order it would stop the session that was just
# created, and the reads further down would then fail on a stopped context.
# The session is stopped by the last code cell of the notebook instead.

In [16]:
# Display the application ID of the running Spark application.
spark.sparkContext.applicationId

'application_1745651200635_11279'

In [17]:
 #Disable AQE and Broadcast join

for conf_key, conf_value in (
    ("spark.sql.adaptive.enabled", False),                     # AQE off
    ("spark.sql.adaptive.coalescePartitions.enabled", False),  # no partition coalescing
    ("spark.sql.autoBroadcastJoinThreshold", -1),              # -1 disables auto broadcast joins
):
    spark.conf.set(conf_key, conf_value)


In [18]:
# Read the employee data from one single gzip-compressed CSV file.
# The schema is given explicitly as a DDL string.
schema = (
    "first_name string, last_name string, job_title string, "
    "dob string, email string, phone string, "
    "salary double, department_id int"
)

emp_one_gzip = (
    spark.read
    .format("csv")
    .schema(schema)
    .option("header", True)
    .load("BigDataset/employee_records_10GB.csv.gz")
)

In [19]:
# Action: push the DataFrame through the "noop" sink to trigger the read.
(
    emp_one_gzip.write
    .format("noop")
    .mode("overwrite")
    .save()
)

In [None]:
#result

#Time taken for reading full data: 3+ minutes (still running)
#No of cores used: 1 core

In [4]:
# Read the employee data from multiple gzip-compressed CSV files,
# matched by a glob pattern. The schema is given explicitly as a DDL string.
schema = (
    "first_name string, last_name string, job_title string, "
    "dob string, email string, phone string, "
    "salary double, department_id int"
)

emp_multiple_gzip = (
    spark.read
    .format("csv")
    .schema(schema)
    .option("header", True)
    .load("datasetgzip/employee_records*.csv.gz")
)

In [5]:
# Action: push the DataFrame through the "noop" sink to trigger the read.
(
    emp_multiple_gzip.write
    .format("noop")
    .mode("overwrite")
    .save()
)

In [None]:
#result

#Time taken for reading full data: 4 sec
#No of cores used: 4 cores -> 1 core for one file

In [6]:
# Read the employee data from the uncompressed CSV file.
# The schema is given explicitly as a DDL string.
schema = (
    "first_name string, last_name string, job_title string, "
    "dob string, email string, phone string, "
    "salary double, department_id int"
)

emp_nozip = (
    spark.read
    .format("csv")
    .schema(schema)
    .option("header", True)
    .load("employee_records_10GB.csv")
)


In [7]:
# Action: push the DataFrame through the "noop" sink to trigger the read.
(
    emp_nozip.write
    .format("noop")
    .mode("overwrite")
    .save()
)

In [None]:
#result

#Time taken for reading full data: 25 sec
#No of cores used: 9 (88)

In [8]:
# Read the same data stored in Parquet format with Snappy compression.
emp_parquet_snappy = (
    spark.read
    .format("parquet")
    .load("employee_records_10GB_parquet")
)

In [9]:
# Action: push the DataFrame through the "noop" sink to trigger the read.
(
    emp_parquet_snappy.write
    .format("noop")
    .mode("overwrite")
    .save()
)

In [None]:
#result

#Time taken for reading full data: 13 sec
#No of cores used: 9 (44)

In [10]:
# Read the same data stored in ORC format with Snappy compression.
emp_orc_snappy = (
    spark.read
    .format("orc")
    .load("employee_records_10GB_ORC")
)

In [11]:
# Action: push the DataFrame through the "noop" sink to trigger the read.
(
    emp_orc_snappy.write
    .format("noop")
    .mode("overwrite")
    .save()
)

In [None]:
#result

#Time taken for reading full data: 11 sec
#No of cores used: 9 (44)

In [20]:
# Stop the Spark session at the end of the notebook.
spark.stop()

Conclusion:
1. Gzip is a non-splittable compression format, so Spark uses only one core to read a single gzip file.
2. Snappy-compressed Parquet is a splittable format, so Spark uses many cores to read it.

Non-Splittable compression formats:
1. Gzip (.gz)
2. Bzip
3. Snappy(Raw)- SequenceFiles
4. LZ4(Raw)

Splittable compression formats:
1. bzip2 (.bz2)
2. LZO (with indexing)
3. Parquet (with Snappy)
4. ORC (with Zlib/Snappy)
5. Avro (with deflate/Snappy)


