### The DataFrameReader is a built-in API of the DataFrame (Structured) API that allows you to read various source files such as CSV and JSON, as well as big-data file formats such as Parquet, ORC, and Avro

In [1]:
from pyspark.sql import SparkSession

In [2]:
# Build the SparkSession for this notebook (or pick up an existing one),
# registering the application under the name "FireIncident".
spark = (
    SparkSession.builder
    .appName("FireIncident")
    .getOrCreate()
)

In [5]:
# Load the fire-incidents CSV: treat the first row as the header and let
# Spark infer each column's type from the data.
data = (
    spark.read
    .option("header", True)
    .option("inferSchema", True)
    .csv("fire-incidents.csv")
)

In [6]:
# Print the schema tree: each column's name, inferred type and nullability.
data.printSchema()

root
 |-- IncidentNumber: integer (nullable = true)
 |-- ExposureNumber: integer (nullable = true)
 |-- ID: integer (nullable = true)
 |-- Address: string (nullable = true)
 |-- IncidentDate: string (nullable = true)
 |-- CallNumber: string (nullable = true)
 |-- AlarmDtTm: string (nullable = true)
 |-- ArrivalDtTm: timestamp (nullable = true)
 |-- CloseDtTm: timestamp (nullable = true)
 |-- City: string (nullable = true)
 |-- ZIPCode: string (nullable = true)
 |-- Battalion: string (nullable = true)
 |-- StationArea: string (nullable = true)
 |-- Box: string (nullable = true)
 |-- SuppressionUnits: integer (nullable = true)
 |-- SuppressionPersonnel: integer (nullable = true)
 |-- EMSUnits: integer (nullable = true)
 |-- EMSPersonnel: integer (nullable = true)
 |-- OtherUnits: integer (nullable = true)
 |-- OtherPersonnel: integer (nullable = true)
 |-- FirstUnitOnScene: string (nullable = true)
 |-- EstimatedPropertyLoss: integer (nullable = true)
 |-- EstimatedContentsLoss: double (

In [8]:
# Project three columns and display the first 10 rows of the result.
(
    data
    .select("IncidentNumber", "IncidentDate", "City")
    .show(10)
)

+--------------+--------------------+-------------+
|IncidentNumber|        IncidentDate|         City|
+--------------+--------------------+-------------+
|      20104668|2020-09-11T00:00:...|San Francisco|
|      20104708|2020-09-11T00:00:...|San Francisco|
|      20104648|2020-09-10T00:00:...|San Francisco|
|      20104598|2020-09-10T00:00:...|San Francisco|
|      20104575|2020-09-10T00:00:...|San Francisco|
|      20104477|2020-09-10T00:00:...|San Francisco|
|      20104443|2020-09-10T00:00:...|San Francisco|
|      20104605|2020-09-10T00:00:...|San Francisco|
|      20104474|2020-09-10T00:00:...|San Francisco|
|      20104652|2020-09-10T00:00:...|San Francisco|
+--------------+--------------------+-------------+
only showing top 10 rows



### The "select" statement/function is referred to as a projection, where you project (select) the columns that you require, and Spark will resolve them at the schema level soon after an action has been called

In [9]:
# List the names of all columns in the DataFrame (a Python list of strings).
data.columns

['IncidentNumber',
 'ExposureNumber',
 'ID',
 'Address',
 'IncidentDate',
 'CallNumber',
 'AlarmDtTm',
 'ArrivalDtTm',
 'CloseDtTm',
 'City',
 'ZIPCode',
 'Battalion',
 'StationArea',
 'Box',
 'SuppressionUnits',
 'SuppressionPersonnel',
 'EMSUnits',
 'EMSPersonnel',
 'OtherUnits',
 'OtherPersonnel',
 'FirstUnitOnScene',
 'EstimatedPropertyLoss',
 'EstimatedContentsLoss',
 'FireFatalities',
 'FireInjuries',
 'CivilianFatalities',
 'CivilianInjuries',
 'NumberofAlarms',
 'PrimarySituation',
 'MutualAid',
 'ActionTakenPrimary',
 'ActionTakenSecondary',
 'ActionTakenOther',
 'DetectorAlertedOccupants',
 'PropertyUse',
 'AreaofFireOrigin',
 'IgnitionCause',
 'IgnitionFactorPrimary',
 'IgnitionFactorSecondary',
 'HeatSource',
 'ItemFirstIgnited',
 'HumanFactorsAssociatedwithIgnition',
 'StructureType',
 'StructureStatus',
 'FloorofFireOrigin',
 'FireSpread',
 'NoFlameSpead',
 'Numberoffloorswithminimumdamage',
 'Numberoffloorswithsignificantdamage',
 'Numberoffloorswithheavydamage',
 'Numbe

In [11]:
# Destination directory for the Parquet export.
# Kept relative to the working directory (the same way the input CSV is
# referenced) so the notebook does not depend on one user's home directory.
# NOTE(review): assumes the notebook is run from its own folder, so this
# resolves to the previous ".../Structured_API-Spark_DataFrame/output/fireincidents"
# location — confirm before relying on existing output there.
output_path = "output/fireincidents"

In [12]:
# Save the DataFrame in Parquet format at output_path, overwriting any
# data that already exists there.
data.write.parquet(output_path, mode="overwrite")