In [0]:
### This code analyzes fire incidents in San Francisco
### Steps
### Get CSV
### Create table and rename columns
### Repartition (Qty) and byPeriod?
### Convert to Parquet
### Load Parquet
### Compare load time: CSV vs. Parquet
### Exploratory Analysis
### Use Spark SQL - Group By + Sort Operations
### Test ML Library?
### Test Graphiz?
### Test Spark Streaming?

In [0]:
#Import libs
from pyspark.sql.types import ArrayType, StructField, StructType, StringType, IntegerType, TimestampType #Used for schema
from pyspark.sql.functions import * # in order to use Date Functions - Ex.: extract Year from Date
#from pyspark.sql.functions import to_timestamp

In [0]:
# Explicit schema for the fire-calls CSV.
# Every column is declared as a nullable string; the names are listed in the
# same order as the columns of the CSV header.
schema = StructType([
    StructField(column_name, StringType(), True)
    for column_name in (
        "CallNumber",
        "UnitID",
        "IncidentNumber",
        "CallType",
        "CallDate",
        "WatchDate",
        "ReceivedDtTm",
        "EntryDtTm",
        "DispatchDtTm",
        "ResponseDtTm",
        "OnSceneDtTm",
        "TransportDtTm",
        "HospitalDtTm",
        "CallFinalDisposition",
        "AvailableDtTm",
        "Address",
        "City",
        "ZipcodeofIncident",
        "Battalion",
        "StationArea",
        "Box",
        "OriginalPriority",
        "Priority",
        "FinalPriority",
        "ALSUnit",
        "CallTypeGroup",
        "NumberofAlarms",
        "UnitType",
        "Unitsequenceincalldispatch",
        "FirePreventionDistrict",
        "SupervisorDistrict",
        "Neighborhooods-AnalysisBoundaries",
        "RowID",
        "case_location",
        "AnalysisNeighborhoods",
    )
])

In [0]:
# Source dataset (download the CSV export from this page):
# https://data.sfgov.org/Public-Safety/Fire-Department-Calls-for-Service/nuek-vuh3

# Location of the uploaded file and the reader format to use
file_location = "/FileStore/tables/Fire_Department_Calls_for_Service.csv"
file_type = "csv"

# Reader settings
infer_schema = "false"  # not passed to the reader below; an explicit schema is supplied instead
first_row_is_header = "true"
delimiter = ","

# Read the file with the explicit schema; header and separator are CSV options.
df = (
    spark.read.format(file_type)
    .options(header=first_row_is_header, sep=delimiter)
    .schema(schema)
    .load(file_location)
)

display(df)

CallNumber,UnitID,IncidentNumber,CallType,CallDate,WatchDate,ReceivedDtTm,EntryDtTm,DispatchDtTm,ResponseDtTm,OnSceneDtTm,TransportDtTm,HospitalDtTm,CallFinalDisposition,AvailableDtTm,Address,City,ZipcodeofIncident,Battalion,StationArea,Box,OriginalPriority,Priority,FinalPriority,ALSUnit,CallTypeGroup,NumberofAlarms,UnitType,Unitsequenceincalldispatch,FirePreventionDistrict,SupervisorDistrict,Neighborhooods-AnalysisBoundaries,RowID,case_location,AnalysisNeighborhoods
210690030,T03,21030278,Alarms,03/10/2021,03/09/2021,03/10/2021 12:16:03 AM,03/10/2021 12:18:36 AM,03/10/2021 12:19:01 AM,03/10/2021 12:20:15 AM,,,,Fire,03/10/2021 12:28:42 AM,1300 Block of MARKET ST,San Francisco,94102.0,B02,36,3111,3,3,3,False,Alarm,1,TRUCK,3,2.0,6.0,Tenderloin,210690030-T03,POINT (-122.41697932641094 37.7770834641944),36.0
210391607,E19,21017645,Alarms,02/08/2021,02/08/2021,02/08/2021 01:00:14 PM,02/08/2021 01:01:36 PM,02/08/2021 01:01:40 PM,02/08/2021 01:03:21 PM,02/08/2021 01:05:44 PM,,,Fire,02/08/2021 01:18:09 PM,400 Block of SERRANO DR,San Francisco,94132.0,B08,19,8581,3,3,3,True,Alarm,1,ENGINE,1,8.0,7.0,Lakeshore,210391607-E19,POINT (-122.48045074945836 37.7190118676788),16.0
210391164,T04,21017596,Alarms,02/08/2021,02/08/2021,02/08/2021 10:54:56 AM,02/08/2021 10:56:50 AM,02/08/2021 10:56:57 AM,02/08/2021 10:57:07 AM,02/08/2021 10:59:34 AM,,,Fire,02/08/2021 11:06:42 AM,600 Block of LONG BRIDGE ST,San Francisco,94158.0,B03,4,2264,3,3,3,False,Alarm,1,TRUCK,1,3.0,6.0,Mission Bay,210391164-T04,POINT (-122.39227179213904 37.77288298280324),4.0
210683008,T03,21030233,Structure Fire,03/09/2021,03/09/2021,03/09/2021 08:57:47 PM,03/09/2021 08:59:01 PM,03/09/2021 08:59:04 PM,03/09/2021 08:59:28 PM,03/09/2021 09:01:48 PM,,,Fire,03/09/2021 09:05:42 PM,600 Block of POLK ST,San Francisco,94102.0,B02,3,3114,3,3,3,False,Alarm,1,TRUCK,1,2.0,6.0,Tenderloin,210683008-T03,POINT (-122.4190191572421 37.78264403568861),36.0
210391034,E16,21017578,Citizen Assist / Service Call,02/08/2021,02/08/2021,02/08/2021 10:18:53 AM,02/08/2021 10:19:52 AM,02/08/2021 10:19:58 AM,02/08/2021 10:20:42 AM,02/08/2021 10:27:42 AM,,,Fire,02/08/2021 10:53:27 AM,FRANKLIN ST/FILBERT ST,San Francisco,94123.0,B04,16,3233,3,3,3,True,Alarm,1,ENGINE,1,4.0,2.0,Marina,210391034-E16,POINT (-122.42581353320875 37.79927566930728),13.0
210390767,T19,21017552,Other,02/08/2021,02/08/2021,02/08/2021 08:50:27 AM,02/08/2021 08:54:27 AM,02/08/2021 08:55:28 AM,02/08/2021 08:57:51 AM,,,,Fire,02/08/2021 09:02:22 AM,"CALL BOX: JOHN DALY BL/MISSION ST,DC",Daly City,,B09,33,9922,3,3,3,True,Fire,1,TRUCK,9,,,,210390767-T19,POINT (-122.46239390119047 37.7049649190675),
210681495,T03,21030060,Alarms,03/09/2021,03/09/2021,03/09/2021 12:47:08 PM,03/09/2021 12:49:37 PM,03/09/2021 12:49:48 PM,03/09/2021 12:51:10 PM,03/09/2021 12:53:01 PM,,,Fire,03/09/2021 12:58:21 PM,900 Block of SUTTER ST,San Francisco,94109.0,B04,3,1557,3,3,3,False,Alarm,1,TRUCK,1,4.0,3.0,Nob Hill,210681495-T03,POINT (-122.4160172278388 37.788337139908855),21.0
210681261,E08,21030034,Alarms,03/09/2021,03/09/2021,03/09/2021 11:37:16 AM,03/09/2021 11:39:15 AM,03/09/2021 11:39:29 AM,03/09/2021 11:40:38 AM,,,,Fire,03/09/2021 11:41:28 AM,100 Block of BRANNAN ST,San Francisco,94107.0,B03,35,2134,3,3,3,True,Alarm,1,ENGINE,4,3.0,6.0,Financial District/South Beach,210681261-E08,POINT (-122.38904057366992 37.78432123494197),8.0
210382984,B05,21017398,Alarms,02/07/2021,02/07/2021,02/07/2021 09:18:38 PM,02/07/2021 09:20:02 PM,02/07/2021 09:21:15 PM,02/07/2021 09:21:15 PM,02/07/2021 09:25:11 PM,,,Fire,02/07/2021 09:35:06 PM,2100 Block of FELL ST,San Francisco,94117.0,B05,12,4554,3,3,3,False,Alarm,1,CHIEF,2,5.0,5.0,Lone Mountain/USF,210382984-B05,POINT (-122.45328305705388 37.77213783914884),18.0
210382403,T05,21017307,Alarms,02/07/2021,02/07/2021,02/07/2021 05:24:48 PM,02/07/2021 05:26:12 PM,02/07/2021 05:26:24 PM,02/07/2021 05:27:46 PM,02/07/2021 05:30:04 PM,,,Fire,02/07/2021 05:50:46 PM,1400 Block of GEARY BLVD,San Francisco,94109.0,B04,38,3323,3,3,3,False,Alarm,1,TRUCK,2,4.0,5.0,Japantown,210382403-T05,POINT (-122.4263666364095 37.78531206735949),15.0


In [0]:
# Print the column names and types of the loaded DataFrame as a sanity check.
df.printSchema()

In [0]:
# Pattern of the CallDate strings as they appear in the CSV (e.g. "03/10/2021").
from_pattern = "MM/dd/yyyy"
# Year-first pattern; not referenced by any of the cells visible here.
to_pattern = "yyyy-MM-dd"

In [0]:
# Parse the CallDate strings into a new timestamp column, CallDateTs.
call_date_ts = to_timestamp(col("CallDate"), from_pattern)
df = df.withColumn("CallDateTs", call_date_ts)
display(df)

CallNumber,UnitID,IncidentNumber,CallType,CallDate,WatchDate,ReceivedDtTm,EntryDtTm,DispatchDtTm,ResponseDtTm,OnSceneDtTm,TransportDtTm,HospitalDtTm,CallFinalDisposition,AvailableDtTm,Address,City,ZipcodeofIncident,Battalion,StationArea,Box,OriginalPriority,Priority,FinalPriority,ALSUnit,CallTypeGroup,NumberofAlarms,UnitType,Unitsequenceincalldispatch,FirePreventionDistrict,SupervisorDistrict,Neighborhooods-AnalysisBoundaries,RowID,case_location,AnalysisNeighborhoods,CallDateTs
210690030,T03,21030278,Alarms,03/10/2021,03/09/2021,03/10/2021 12:16:03 AM,03/10/2021 12:18:36 AM,03/10/2021 12:19:01 AM,03/10/2021 12:20:15 AM,,,,Fire,03/10/2021 12:28:42 AM,1300 Block of MARKET ST,San Francisco,94102.0,B02,36,3111,3,3,3,False,Alarm,1,TRUCK,3,2.0,6.0,Tenderloin,210690030-T03,POINT (-122.41697932641094 37.7770834641944),36.0,2021-03-10T00:00:00.000+0000
210391607,E19,21017645,Alarms,02/08/2021,02/08/2021,02/08/2021 01:00:14 PM,02/08/2021 01:01:36 PM,02/08/2021 01:01:40 PM,02/08/2021 01:03:21 PM,02/08/2021 01:05:44 PM,,,Fire,02/08/2021 01:18:09 PM,400 Block of SERRANO DR,San Francisco,94132.0,B08,19,8581,3,3,3,True,Alarm,1,ENGINE,1,8.0,7.0,Lakeshore,210391607-E19,POINT (-122.48045074945836 37.7190118676788),16.0,2021-02-08T00:00:00.000+0000
210391164,T04,21017596,Alarms,02/08/2021,02/08/2021,02/08/2021 10:54:56 AM,02/08/2021 10:56:50 AM,02/08/2021 10:56:57 AM,02/08/2021 10:57:07 AM,02/08/2021 10:59:34 AM,,,Fire,02/08/2021 11:06:42 AM,600 Block of LONG BRIDGE ST,San Francisco,94158.0,B03,4,2264,3,3,3,False,Alarm,1,TRUCK,1,3.0,6.0,Mission Bay,210391164-T04,POINT (-122.39227179213904 37.77288298280324),4.0,2021-02-08T00:00:00.000+0000
210683008,T03,21030233,Structure Fire,03/09/2021,03/09/2021,03/09/2021 08:57:47 PM,03/09/2021 08:59:01 PM,03/09/2021 08:59:04 PM,03/09/2021 08:59:28 PM,03/09/2021 09:01:48 PM,,,Fire,03/09/2021 09:05:42 PM,600 Block of POLK ST,San Francisco,94102.0,B02,3,3114,3,3,3,False,Alarm,1,TRUCK,1,2.0,6.0,Tenderloin,210683008-T03,POINT (-122.4190191572421 37.78264403568861),36.0,2021-03-09T00:00:00.000+0000
210391034,E16,21017578,Citizen Assist / Service Call,02/08/2021,02/08/2021,02/08/2021 10:18:53 AM,02/08/2021 10:19:52 AM,02/08/2021 10:19:58 AM,02/08/2021 10:20:42 AM,02/08/2021 10:27:42 AM,,,Fire,02/08/2021 10:53:27 AM,FRANKLIN ST/FILBERT ST,San Francisco,94123.0,B04,16,3233,3,3,3,True,Alarm,1,ENGINE,1,4.0,2.0,Marina,210391034-E16,POINT (-122.42581353320875 37.79927566930728),13.0,2021-02-08T00:00:00.000+0000
210390767,T19,21017552,Other,02/08/2021,02/08/2021,02/08/2021 08:50:27 AM,02/08/2021 08:54:27 AM,02/08/2021 08:55:28 AM,02/08/2021 08:57:51 AM,,,,Fire,02/08/2021 09:02:22 AM,"CALL BOX: JOHN DALY BL/MISSION ST,DC",Daly City,,B09,33,9922,3,3,3,True,Fire,1,TRUCK,9,,,,210390767-T19,POINT (-122.46239390119047 37.7049649190675),,2021-02-08T00:00:00.000+0000
210681495,T03,21030060,Alarms,03/09/2021,03/09/2021,03/09/2021 12:47:08 PM,03/09/2021 12:49:37 PM,03/09/2021 12:49:48 PM,03/09/2021 12:51:10 PM,03/09/2021 12:53:01 PM,,,Fire,03/09/2021 12:58:21 PM,900 Block of SUTTER ST,San Francisco,94109.0,B04,3,1557,3,3,3,False,Alarm,1,TRUCK,1,4.0,3.0,Nob Hill,210681495-T03,POINT (-122.4160172278388 37.788337139908855),21.0,2021-03-09T00:00:00.000+0000
210681261,E08,21030034,Alarms,03/09/2021,03/09/2021,03/09/2021 11:37:16 AM,03/09/2021 11:39:15 AM,03/09/2021 11:39:29 AM,03/09/2021 11:40:38 AM,,,,Fire,03/09/2021 11:41:28 AM,100 Block of BRANNAN ST,San Francisco,94107.0,B03,35,2134,3,3,3,True,Alarm,1,ENGINE,4,3.0,6.0,Financial District/South Beach,210681261-E08,POINT (-122.38904057366992 37.78432123494197),8.0,2021-03-09T00:00:00.000+0000
210382984,B05,21017398,Alarms,02/07/2021,02/07/2021,02/07/2021 09:18:38 PM,02/07/2021 09:20:02 PM,02/07/2021 09:21:15 PM,02/07/2021 09:21:15 PM,02/07/2021 09:25:11 PM,,,Fire,02/07/2021 09:35:06 PM,2100 Block of FELL ST,San Francisco,94117.0,B05,12,4554,3,3,3,False,Alarm,1,CHIEF,2,5.0,5.0,Lone Mountain/USF,210382984-B05,POINT (-122.45328305705388 37.77213783914884),18.0,2021-02-07T00:00:00.000+0000
210382403,T05,21017307,Alarms,02/07/2021,02/07/2021,02/07/2021 05:24:48 PM,02/07/2021 05:26:12 PM,02/07/2021 05:26:24 PM,02/07/2021 05:27:46 PM,02/07/2021 05:30:04 PM,,,Fire,02/07/2021 05:50:46 PM,1400 Block of GEARY BLVD,San Francisco,94109.0,B04,38,3323,3,3,3,False,Alarm,1,TRUCK,2,4.0,5.0,Japantown,210382403-T05,POINT (-122.4263666364095 37.78531206735949),15.0,2021-02-07T00:00:00.000+0000


In [0]:
# Number of rows per calendar year of CallDateTs.
(
    df.groupBy(year(col("CallDateTs")))
    .count()
    .show()
)

In [0]:
# Add the year of CallDateTs as its own column, CallYear.
call_year = year(col("CallDateTs"))
df = df.withColumn("CallYear", call_year)
display(df)

CallNumber,UnitID,IncidentNumber,CallType,CallDate,WatchDate,ReceivedDtTm,EntryDtTm,DispatchDtTm,ResponseDtTm,OnSceneDtTm,TransportDtTm,HospitalDtTm,CallFinalDisposition,AvailableDtTm,Address,City,ZipcodeofIncident,Battalion,StationArea,Box,OriginalPriority,Priority,FinalPriority,ALSUnit,CallTypeGroup,NumberofAlarms,UnitType,Unitsequenceincalldispatch,FirePreventionDistrict,SupervisorDistrict,Neighborhooods-AnalysisBoundaries,RowID,case_location,AnalysisNeighborhoods,CallDateTs,CallYear
210690030,T03,21030278,Alarms,03/10/2021,03/09/2021,03/10/2021 12:16:03 AM,03/10/2021 12:18:36 AM,03/10/2021 12:19:01 AM,03/10/2021 12:20:15 AM,,,,Fire,03/10/2021 12:28:42 AM,1300 Block of MARKET ST,San Francisco,94102.0,B02,36,3111,3,3,3,False,Alarm,1,TRUCK,3,2.0,6.0,Tenderloin,210690030-T03,POINT (-122.41697932641094 37.7770834641944),36.0,2021-03-10T00:00:00.000+0000,2021
210391607,E19,21017645,Alarms,02/08/2021,02/08/2021,02/08/2021 01:00:14 PM,02/08/2021 01:01:36 PM,02/08/2021 01:01:40 PM,02/08/2021 01:03:21 PM,02/08/2021 01:05:44 PM,,,Fire,02/08/2021 01:18:09 PM,400 Block of SERRANO DR,San Francisco,94132.0,B08,19,8581,3,3,3,True,Alarm,1,ENGINE,1,8.0,7.0,Lakeshore,210391607-E19,POINT (-122.48045074945836 37.7190118676788),16.0,2021-02-08T00:00:00.000+0000,2021
210391164,T04,21017596,Alarms,02/08/2021,02/08/2021,02/08/2021 10:54:56 AM,02/08/2021 10:56:50 AM,02/08/2021 10:56:57 AM,02/08/2021 10:57:07 AM,02/08/2021 10:59:34 AM,,,Fire,02/08/2021 11:06:42 AM,600 Block of LONG BRIDGE ST,San Francisco,94158.0,B03,4,2264,3,3,3,False,Alarm,1,TRUCK,1,3.0,6.0,Mission Bay,210391164-T04,POINT (-122.39227179213904 37.77288298280324),4.0,2021-02-08T00:00:00.000+0000,2021
210683008,T03,21030233,Structure Fire,03/09/2021,03/09/2021,03/09/2021 08:57:47 PM,03/09/2021 08:59:01 PM,03/09/2021 08:59:04 PM,03/09/2021 08:59:28 PM,03/09/2021 09:01:48 PM,,,Fire,03/09/2021 09:05:42 PM,600 Block of POLK ST,San Francisco,94102.0,B02,3,3114,3,3,3,False,Alarm,1,TRUCK,1,2.0,6.0,Tenderloin,210683008-T03,POINT (-122.4190191572421 37.78264403568861),36.0,2021-03-09T00:00:00.000+0000,2021
210391034,E16,21017578,Citizen Assist / Service Call,02/08/2021,02/08/2021,02/08/2021 10:18:53 AM,02/08/2021 10:19:52 AM,02/08/2021 10:19:58 AM,02/08/2021 10:20:42 AM,02/08/2021 10:27:42 AM,,,Fire,02/08/2021 10:53:27 AM,FRANKLIN ST/FILBERT ST,San Francisco,94123.0,B04,16,3233,3,3,3,True,Alarm,1,ENGINE,1,4.0,2.0,Marina,210391034-E16,POINT (-122.42581353320875 37.79927566930728),13.0,2021-02-08T00:00:00.000+0000,2021
210390767,T19,21017552,Other,02/08/2021,02/08/2021,02/08/2021 08:50:27 AM,02/08/2021 08:54:27 AM,02/08/2021 08:55:28 AM,02/08/2021 08:57:51 AM,,,,Fire,02/08/2021 09:02:22 AM,"CALL BOX: JOHN DALY BL/MISSION ST,DC",Daly City,,B09,33,9922,3,3,3,True,Fire,1,TRUCK,9,,,,210390767-T19,POINT (-122.46239390119047 37.7049649190675),,2021-02-08T00:00:00.000+0000,2021
210681495,T03,21030060,Alarms,03/09/2021,03/09/2021,03/09/2021 12:47:08 PM,03/09/2021 12:49:37 PM,03/09/2021 12:49:48 PM,03/09/2021 12:51:10 PM,03/09/2021 12:53:01 PM,,,Fire,03/09/2021 12:58:21 PM,900 Block of SUTTER ST,San Francisco,94109.0,B04,3,1557,3,3,3,False,Alarm,1,TRUCK,1,4.0,3.0,Nob Hill,210681495-T03,POINT (-122.4160172278388 37.788337139908855),21.0,2021-03-09T00:00:00.000+0000,2021
210681261,E08,21030034,Alarms,03/09/2021,03/09/2021,03/09/2021 11:37:16 AM,03/09/2021 11:39:15 AM,03/09/2021 11:39:29 AM,03/09/2021 11:40:38 AM,,,,Fire,03/09/2021 11:41:28 AM,100 Block of BRANNAN ST,San Francisco,94107.0,B03,35,2134,3,3,3,True,Alarm,1,ENGINE,4,3.0,6.0,Financial District/South Beach,210681261-E08,POINT (-122.38904057366992 37.78432123494197),8.0,2021-03-09T00:00:00.000+0000,2021
210382984,B05,21017398,Alarms,02/07/2021,02/07/2021,02/07/2021 09:18:38 PM,02/07/2021 09:20:02 PM,02/07/2021 09:21:15 PM,02/07/2021 09:21:15 PM,02/07/2021 09:25:11 PM,,,Fire,02/07/2021 09:35:06 PM,2100 Block of FELL ST,San Francisco,94117.0,B05,12,4554,3,3,3,False,Alarm,1,CHIEF,2,5.0,5.0,Lone Mountain/USF,210382984-B05,POINT (-122.45328305705388 37.77213783914884),18.0,2021-02-07T00:00:00.000+0000,2021
210382403,T05,21017307,Alarms,02/07/2021,02/07/2021,02/07/2021 05:24:48 PM,02/07/2021 05:26:12 PM,02/07/2021 05:26:24 PM,02/07/2021 05:27:46 PM,02/07/2021 05:30:04 PM,,,Fire,02/07/2021 05:50:46 PM,1400 Block of GEARY BLVD,San Francisco,94109.0,B04,38,3323,3,3,3,False,Alarm,1,TRUCK,2,4.0,5.0,Japantown,210382403-T05,POINT (-122.4263666364095 37.78531206735949),15.0,2021-02-07T00:00:00.000+0000,2021


In [0]:
# Destination directory for the Parquet copy of the data.
resultPath = "/user/df_fire"

# Write as Parquet partitioned by CallYear, replacing any existing output.
df.write.parquet(resultPath, mode="overwrite", partitionBy="CallYear")