# Predicting Fire Department False Alarms

In [0]:
from pyspark.sql.types import StructType, StructField, IntegerType, StringType, BooleanType
from pyspark.sql.functions import *
from pyspark.ml.feature import *
from pyspark.ml import Pipeline

## LOAD IN DATA 
SF FIRE DEPT CALLS

In [0]:
import os
from urllib.parse import quote

# Credentials are read from the environment (e.g. cluster environment
# variables) so that no secret is stored in the notebook source or history.
# NOTE(review): the key pair that used to be hardcoded in this cell must be
# treated as leaked and rotated.
ACCESSY_KEY_ID = os.environ["AWS_ACCESS_KEY_ID"]
# The secret is embedded in an s3a:// URI below, so characters such as "/"
# have to be percent-encoded.
SECERET_ACCESS_KEY = quote(os.environ["AWS_SECRET_ACCESS_KEY"], safe="")

# Each entry maps an S3 bucket/prefix to the DBFS folder it is mounted under.
mounts_list = [
{'bucket':'databricks-corp-training/sf_open_data/', 'mount_folder':'/mnt/sf_open_data'}
]

# Look up existing mounts explicitly instead of probing with ls() inside a
# bare try/except, which silently hid genuine unmount failures.
existing_mounts = {mount.mountPoint for mount in dbutils.fs.mounts()}

for mount_point in mounts_list:
  bucket = mount_point['bucket']
  mount_folder = mount_point['mount_folder']
  if mount_folder in existing_mounts:
    # Remount from scratch so the mount always uses the current credentials.
    dbutils.fs.unmount(mount_folder)
  dbutils.fs.mount("s3a://"+ ACCESSY_KEY_ID + ":" + SECERET_ACCESS_KEY + "@" + bucket,mount_folder)

In [0]:
%fs ls /mnt/sf_open_data/fire_dept_calls_for_service/

path,name,size
dbfs:/mnt/sf_open_data/fire_dept_calls_for_service/Fire_Department_Calls_for_Service.csv,Fire_Department_Calls_for_Service.csv,1634673683


In [0]:
# Exploratory load: let Spark infer column types so the schema can be inspected
# before an explicit one is written out by hand.
calls_csv_path = "dbfs:/mnt/sf_open_data/fire_dept_calls_for_service/Fire_Department_Calls_for_Service.csv"
df = spark.read.csv(
    calls_csv_path,
    header=True,
    inferSchema=True,
)

In [0]:
# Count the rows loaded from the calls CSV.
df.count()

In [0]:
# Print the column names and types of the calls DataFrame.
df.printSchema()

## SPECIFY SCHEMA

In [0]:
# Explicit schema for the calls CSV, listed as (column name, Spark type) in
# file order. Date/time columns are read as strings and parsed later.
fire_columns = [
    ('CallNumber', IntegerType()),
    ('UnitID', StringType()),
    ('IncidentNumber', IntegerType()),
    ('CallType', StringType()),
    ('CallDate', StringType()),
    ('WatchDate', StringType()),
    ('ReceivedDtTm', StringType()),
    ('EntryDtTm', StringType()),
    ('DispatchDtTm', StringType()),
    ('ResponseDtTm', StringType()),
    ('OnSceneDtTm', StringType()),
    ('TransportDtTm', StringType()),
    ('HospitalDtTm', StringType()),
    ('CallFinalDisposition', StringType()),
    ('AvailableDtTm', StringType()),
    ('Address', StringType()),
    ('City', StringType()),
    ('ZipcodeofIncident', IntegerType()),
    ('Battalion', StringType()),
    ('StationArea', StringType()),
    ('Box', StringType()),
    ('OriginalPriority', StringType()),
    ('Priority', StringType()),
    ('FinalPriority', IntegerType()),
    ('ALSUnit', BooleanType()),
    ('CallTypeGroup', StringType()),
    ('NumberofAlarms', IntegerType()),
    ('UnitType', StringType()),
    ('Unitsequenceincalldispatch', IntegerType()),
    ('FirePreventionDistrict', StringType()),
    ('SupervisorDistrict', StringType()),
    ('NeighborhoodDistrict', StringType()),
    ('Location', StringType()),
    ('RowID', StringType()),
]

# Every field is nullable, as in a hand-written StructField(name, type, True).
fireSchema = StructType(
    [StructField(col_name, col_type, True) for col_name, col_type in fire_columns]
)

In [0]:
# Re-read the calls CSV with the explicit schema and cache the result, since
# the frame is reused by the cells that follow.
calls_csv_path = "dbfs:/mnt/sf_open_data/fire_dept_calls_for_service/Fire_Department_Calls_for_Service.csv"
df = (
    spark.read
    .csv(calls_csv_path, header=True, schema=fireSchema)
    .cache()
)

## PROCESS DATA

In [0]:
# Preview the first 20 rows of the calls DataFrame.
display(df.head(20))

CallNumber,UnitID,IncidentNumber,CallType,CallDate,WatchDate,ReceivedDtTm,EntryDtTm,DispatchDtTm,ResponseDtTm,OnSceneDtTm,TransportDtTm,HospitalDtTm,CallFinalDisposition,AvailableDtTm,Address,City,ZipcodeofIncident,Battalion,StationArea,Box,OriginalPriority,Priority,FinalPriority,ALSUnit,CallTypeGroup,NumberofAlarms,UnitType,Unitsequenceincalldispatch,FirePreventionDistrict,SupervisorDistrict,NeighborhoodDistrict,Location,RowID
142480332,B02,14086309,Alarms,09/05/2014,09/04/2014,09/05/2014 03:15:13 AM,09/05/2014 03:17:26 AM,09/05/2014 03:18:18 AM,09/05/2014 03:20:30 AM,09/05/2014 03:24:11 AM,04/25/2016 01:15:16 PM,04/25/2016 01:15:16 PM,Fire,09/05/2014 03:33:20 AM,1600 Block of HAIGHT ST,San Francisco,94117,B05,12,4525,3,3,3,True,Alarm,1,CHIEF,3,5.0,5,Haight Ashbury,"(37.7695711762103, -122.449920089485)",142480332-B02
153022542,T02,15115908,Structure Fire,10/29/2015,10/29/2015,10/29/2015 03:39:06 PM,10/29/2015 03:39:25 PM,10/29/2015 03:39:49 PM,10/29/2015 03:40:55 PM,10/29/2015 03:43:21 PM,04/25/2016 01:07:30 PM,04/25/2016 01:07:30 PM,Fire,10/29/2015 03:51:21 PM,BATTERY ST/VALLEJO ST,San Francisco,94111,B01,13,1155,3,3,3,False,Alarm,1,TRUCK,4,1.0,3,Financial District/South Beach,"(37.7995314468258, -122.401240243673)",153022542-T02
143451112,AM04,14122741,Medical Incident,12/11/2014,12/11/2014,12/11/2014 09:02:07 AM,12/11/2014 09:03:01 AM,12/11/2014 09:03:11 AM,12/11/2014 09:06:19 AM,12/11/2014 09:20:16 AM,12/11/2014 09:20:26 AM,12/11/2014 09:43:41 AM,Code 2 Transport,12/11/2014 10:06:26 AM,300 Block of BUENA VISTA AVE,San Francisco,94117,B05,21,5136,3,3,3,False,Potentially Life-Threatening,1,PRIVATE,1,5.0,8,Castro/Upper Market,"(37.7668035178194, -122.440704687809)",143451112-AM04
141660300,E01,14057129,Medical Incident,06/15/2014,06/14/2014,06/15/2014 02:04:57 AM,06/15/2014 02:06:42 AM,06/15/2014 02:10:01 AM,06/15/2014 02:12:55 AM,06/15/2014 02:24:55 AM,04/25/2016 01:16:45 PM,04/25/2016 01:16:45 PM,Code 2 Transport,06/15/2014 02:51:39 AM,0 Block of HALLAM ST,San Francisco,94103,B03,1,2313,2,2,2,True,Non Life-threatening,1,ENGINE,2,2.0,6,South of Market,"(37.7756902570435, -122.408609057895)",141660300-E01
152633454,E36,15100829,Outside Fire,09/20/2015,09/20/2015,09/20/2015 08:15:00 PM,09/20/2015 08:15:53 PM,09/20/2015 08:16:17 PM,09/20/2015 08:18:07 PM,04/25/2016 01:08:14 PM,04/25/2016 01:08:14 PM,04/25/2016 01:08:14 PM,Fire,09/20/2015 08:22:11 PM,MARKET ST/VAN NESS AV,San Francisco,94103,B02,36,3211,3,3,3,True,Fire,1,ENGINE,1,2.0,6,Mission,"(37.7751470741622, -122.419255607214)",152633454-E36
160941229,62,16037213,Medical Incident,04/03/2016,04/03/2016,04/03/2016 10:11:05 AM,04/03/2016 10:13:32 AM,04/03/2016 10:13:50 AM,04/03/2016 10:14:04 AM,04/03/2016 10:17:26 AM,04/03/2016 10:28:20 AM,04/03/2016 11:00:27 AM,Code 2 Transport,04/03/2016 11:27:46 AM,CABRILLO ST/LA PLAYA,San Francisco,94121,B07,34,7277,2,2,2,True,Non Life-threatening,1,MEDIC,1,7.0,1,Outer Richmond,"(37.7732594685752, -122.510036956026)",160941229-62
142672360,E43,14093558,Medical Incident,09/24/2014,09/24/2014,09/24/2014 03:07:36 PM,09/24/2014 03:08:31 PM,09/24/2014 03:09:33 PM,09/24/2014 03:11:28 PM,04/25/2016 01:14:55 PM,04/25/2016 01:14:55 PM,04/25/2016 01:14:55 PM,Code 2 Transport,09/24/2014 03:11:52 PM,4900 Block of MISSION ST,San Francisco,94112,B09,43,6123,2,2,2,False,Potentially Life-Threatening,1,ENGINE,2,9.0,11,Excelsior,"(37.7188461081754, -122.439092837429)",142672360-E43
152052982,E11,15078184,Medical Incident,07/24/2015,07/24/2015,07/24/2015 05:45:39 PM,07/24/2015 05:49:36 PM,07/24/2015 05:50:18 PM,07/24/2015 05:51:14 PM,07/24/2015 05:52:56 PM,04/25/2016 01:09:17 PM,04/25/2016 01:09:17 PM,Code 2 Transport,07/24/2015 06:02:50 PM,1500 Block of DOLORES ST,San Francisco,94110,B06,11,5576,3,3,3,True,Potentially Life-Threatening,1,ENGINE,1,6.0,8,Noe Valley,"(37.7450456172368, -122.424347725679)",152052982-E11
150172539,RC3,15006796,Medical Incident,01/17/2015,01/17/2015,01/17/2015 04:56:52 PM,01/17/2015 04:58:19 PM,01/17/2015 05:07:34 PM,01/17/2015 05:07:34 PM,01/17/2015 05:36:03 PM,04/25/2016 01:12:46 PM,04/25/2016 01:12:46 PM,Code 3 Transport,01/17/2015 05:36:07 PM,BERNAL HEIGHTS BL/FOLSOM ST,San Francisco,94110,B06,11,5663,2,2,2,True,Potentially Life-Threatening,1,RESCUE CAPTAIN,5,6.0,9,Bernal Heights,"(37.7435915885579, -122.412277634027)",150172539-RC3
160921973,65,16036463,Medical Incident,04/01/2016,04/01/2016,04/01/2016 02:12:55 PM,04/01/2016 02:14:42 PM,04/01/2016 02:16:11 PM,04/01/2016 02:16:19 PM,04/01/2016 02:25:50 PM,04/01/2016 02:36:59 PM,04/01/2016 02:56:19 PM,Code 2 Transport,04/01/2016 03:26:51 PM,1800 Block of CHESTNUT ST,San Francisco,94123,B04,16,3445,2,2,2,True,Non Life-threatening,1,MEDIC,2,4.0,2,Marina,"(37.8011854069176, -122.433790937219)",160921973-65


In [0]:
# Formats of the raw string columns: date-only and 12-hour date-time.
pattern1 = "M/d/yyyy"
pattern2 = "M/d/yyyy h:m:s a"

# (raw string column, parsed column) pairs, in the order the parsed columns
# are appended to the frame.
date_columns = [
    ("CallDate", "CallDateTS"),
    ("WatchDate", "WatchDateTS"),
]
timestamp_columns = [
    ("ReceivedDtTm", "ReceivedDateTS"),
    ("EntryDtTm", "EntryDateTS"),
    ("DispatchDtTm", "DispatchDateTS"),
    ("ResponseDtTm", "ResponseDateTS"),
    ("OnSceneDtTm", "OnSceneDateTS"),
    ("TransportDtTm", "TransportDateTS"),
    ("HospitalDtTm", "HospitalDateTS"),
    ("AvailableDtTm", "AvailableDateTS"),
]

# Each pass adds the parsed column and drops the raw string it came from.
for raw_name, parsed_name in date_columns:
  df = df.withColumn(parsed_name, to_date(df[raw_name], pattern1)).drop(df[raw_name])

for raw_name, parsed_name in timestamp_columns:
  parsed = unix_timestamp(df[raw_name], pattern2).cast("timestamp")
  df = df.withColumn(parsed_name, parsed).drop(df[raw_name])

In [0]:
# Preview the first 5 rows of the calls DataFrame.
display(df.head(5))

CallNumber,UnitID,IncidentNumber,CallType,CallFinalDisposition,Address,City,ZipcodeofIncident,Battalion,StationArea,Box,OriginalPriority,Priority,FinalPriority,ALSUnit,CallTypeGroup,NumberofAlarms,UnitType,Unitsequenceincalldispatch,FirePreventionDistrict,SupervisorDistrict,NeighborhoodDistrict,Location,RowID,CallDateTS,WatchDateTS,ReceivedDateTS,EntryDateTS,DispatchDateTS,ResponseDateTS,OnSceneDateTS,TransportDateTS,HospitalDateTS,AvailableDateTS
142480332,B02,14086309,Alarms,Fire,1600 Block of HAIGHT ST,San Francisco,94117,B05,12,4525,3,3,3,True,Alarm,1,CHIEF,3,5,5,Haight Ashbury,"(37.7695711762103, -122.449920089485)",142480332-B02,2014-09-05,2014-09-04,2014-09-05T03:15:13.000+0000,2014-09-05T03:17:26.000+0000,2014-09-05T03:18:18.000+0000,2014-09-05T03:20:30.000+0000,2014-09-05T03:24:11.000+0000,2016-04-25T13:15:16.000+0000,2016-04-25T13:15:16.000+0000,2014-09-05T03:33:20.000+0000
153022542,T02,15115908,Structure Fire,Fire,BATTERY ST/VALLEJO ST,San Francisco,94111,B01,13,1155,3,3,3,False,Alarm,1,TRUCK,4,1,3,Financial District/South Beach,"(37.7995314468258, -122.401240243673)",153022542-T02,2015-10-29,2015-10-29,2015-10-29T15:39:06.000+0000,2015-10-29T15:39:25.000+0000,2015-10-29T15:39:49.000+0000,2015-10-29T15:40:55.000+0000,2015-10-29T15:43:21.000+0000,2016-04-25T13:07:30.000+0000,2016-04-25T13:07:30.000+0000,2015-10-29T15:51:21.000+0000
143451112,AM04,14122741,Medical Incident,Code 2 Transport,300 Block of BUENA VISTA AVE,San Francisco,94117,B05,21,5136,3,3,3,False,Potentially Life-Threatening,1,PRIVATE,1,5,8,Castro/Upper Market,"(37.7668035178194, -122.440704687809)",143451112-AM04,2014-12-11,2014-12-11,2014-12-11T09:02:07.000+0000,2014-12-11T09:03:01.000+0000,2014-12-11T09:03:11.000+0000,2014-12-11T09:06:19.000+0000,2014-12-11T09:20:16.000+0000,2014-12-11T09:20:26.000+0000,2014-12-11T09:43:41.000+0000,2014-12-11T10:06:26.000+0000
141660300,E01,14057129,Medical Incident,Code 2 Transport,0 Block of HALLAM ST,San Francisco,94103,B03,1,2313,2,2,2,True,Non Life-threatening,1,ENGINE,2,2,6,South of Market,"(37.7756902570435, -122.408609057895)",141660300-E01,2014-06-15,2014-06-14,2014-06-15T02:04:57.000+0000,2014-06-15T02:06:42.000+0000,2014-06-15T02:10:01.000+0000,2014-06-15T02:12:55.000+0000,2014-06-15T02:24:55.000+0000,2016-04-25T13:16:45.000+0000,2016-04-25T13:16:45.000+0000,2014-06-15T02:51:39.000+0000
152633454,E36,15100829,Outside Fire,Fire,MARKET ST/VAN NESS AV,San Francisco,94103,B02,36,3211,3,3,3,True,Fire,1,ENGINE,1,2,6,Mission,"(37.7751470741622, -122.419255607214)",152633454-E36,2015-09-20,2015-09-20,2015-09-20T20:15:00.000+0000,2015-09-20T20:15:53.000+0000,2015-09-20T20:16:17.000+0000,2015-09-20T20:18:07.000+0000,2016-04-25T13:08:14.000+0000,2016-04-25T13:08:14.000+0000,2016-04-25T13:08:14.000+0000,2015-09-20T20:22:11.000+0000


## LOAD DATA
FIRE INCIDENTS

In [0]:
%fs ls /mnt/sf_open_data/fire_incidents

path,name,size
dbfs:/mnt/sf_open_data/fire_incidents/FireIncidents.json,FireIncidents.json,315648895
dbfs:/mnt/sf_open_data/fire_incidents/Fire_Incidents.csv,Fire_Incidents.csv,143518161


In [0]:
# Load the fire-incidents CSV with inferred column types and cache it for the
# processing and join cells that follow.
incidents_csv_path = 'dbfs:/mnt/sf_open_data/fire_incidents/Fire_Incidents.csv'
incidentsDF = (
    spark.read
    .csv(incidents_csv_path, header=True, inferSchema=True)
    .cache()
)

In [0]:
# Print the column names and inferred types of the incidents DataFrame.
incidentsDF.printSchema()

## PROCESS DATA

In [0]:
# List the incident column names (some contain spaces, e.g. "Incident Number").
incidentsDF.columns

In [0]:
# Rename the key column to "IncidentNumber" so it matches the name used in the calls schema.
incidentsDF= incidentsDF.withColumnRenamed("Incident Number", "IncidentNumber")

In [0]:
# Inspect the name of the first incidents column after the rename.
incidentsDF.columns[0]

## JOIN TABLES

In [0]:
# Columns kept from each side of the join.
call_columns = [df[name] for name in ("IncidentNumber", "OriginalPriority", "NumberofAlarms")]
incident_columns = [
    incidentsDF[name]
    for name in (
        "Primary Situation",
        "Exposure Number",
        "Fire Fatalities",
        "Fire Injuries",
        "Detector Failure Reason",
    )
]

# Join calls to incidents on the shared key, then drop repeated rows among the
# selected columns and cache the result for the modelling cells.
joined = df.join(incidentsDF, "IncidentNumber")
DF = joined.select(*call_columns, *incident_columns).dropDuplicates().cache()

In [0]:
# Label column FA: 1 when "Primary Situation" starts with "700 ", otherwise 0.
# NOTE(review): only the literal prefix "700 " is matched; other 7xx situation
# codes, if the data has any, are labelled 0 - confirm this is intended.
is_false_alarm = DF["Primary Situation"].like('700 %')
DF = DF.withColumn("FA", is_false_alarm.cast('integer'))

In [0]:
# Keep the label (FA) and the four columns later assembled into the feature vector.
model_data = DF.select("FA","NumberofAlarms","Exposure Number", "Fire Fatalities", "Fire Injuries")
# Preview the frame that was just built. The cell previously referenced an
# undefined `df2`, which raises NameError on a fresh kernel.
display(model_data.head(10))

FA,NumberofAlarms,Exposure Number,Fire Fatalities,Fire Injuries
0,1,0,0,0
0,1,0,0,0
0,1,0,0,0
0,1,0,0,0
0,1,0,0,0
0,1,0,0,0
0,1,0,0,0
0,1,0,0,0
0,1,0,0,0
0,1,0,0,0


In [0]:
# Number of rows in the modelling frame.
model_data.count()

## PIPELINE

In [0]:
# Input columns packed into the single "features" vector column.
feature_columns = ["NumberofAlarms", "Exposure Number", "Fire Fatalities", "Fire Injuries"]
vec_assembler = VectorAssembler(inputCols=feature_columns, outputCol="features")

# Single-stage pipeline wrapping the assembler.
pipe = Pipeline(stages=[vec_assembler])

In [0]:
# Fit the pipeline on the modelling frame, then apply it to the same frame.
fitted_pipe = pipe.fit(model_data)
piped_data = fitted_pipe.transform(model_data)

# Only the label and the assembled feature vector are needed downstream.
output = piped_data.select("FA", "features")

In [0]:
# 60/40 train/test split; the fixed seed keeps the split repeatable.
split_weights = [.6, .4]
train_data, test_data = output.randomSplit(split_weights, seed=3)

# Apply ML Models

In [0]:
# Imported in this cell because it is the first use of the evaluator; the only
# other import of it appears in a later cell, so a top-to-bottom run on a
# fresh kernel would otherwise fail here with NameError.
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

# Accuracy of the "prediction" column against the FA label.
acc_evaluator = MulticlassClassificationEvaluator(labelCol = "FA", 
                                                  predictionCol = "prediction", 
                                                  metricName="accuracy")

# F1 score of the "prediction" column against the FA label.
f1_evaluator = MulticlassClassificationEvaluator(labelCol = "FA", 
                                                  predictionCol = "prediction", 
                                                  metricName="f1")

## DECISION TREE MODEL

In [0]:
from pyspark.ml.classification import DecisionTreeClassifier
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

# Decision tree classifier with default hyperparameters, fitted on the training split.
dtc = DecisionTreeClassifier(
    featuresCol="features",
    labelCol="FA",
)
dt_model = dtc.fit(train_data)

In [0]:
# Score the test split with the fitted decision tree.
dt_predictions = dt_model.transform(test_data)

In [0]:
# Decision tree: accuracy on the test split.
acc_evaluator.evaluate(dt_predictions)

In [0]:
# Decision tree: F1 score on the test split.
f1_evaluator.evaluate(dt_predictions)

## RANDOM FOREST MODEL

In [0]:
from pyspark.ml.classification import RandomForestClassifier

In [0]:
# Random forest classifier with default hyperparameters.
rfc = RandomForestClassifier(
    featuresCol="features",
    labelCol='FA',
)
# Fit on the training split, then score the test split.
rf_model = rfc.fit(train_data)
rf_predictions = rf_model.transform(test_data)

In [0]:
# Random forest: accuracy on the test split.
acc_evaluator.evaluate(rf_predictions)

In [0]:
# Random forest: F1 score on the test split.
f1_evaluator.evaluate(rf_predictions)

## LOGISTIC REGRESSION MODEL

In [0]:
from pyspark.ml.classification import LogisticRegression

In [0]:
# Logistic regression with default hyperparameters.
logreg = LogisticRegression(
    featuresCol='features',
    labelCol="FA",
)
# Fit on the training split, then score the test split.
lr_model = logreg.fit(train_data)
lr_predictions = lr_model.transform(test_data)

In [0]:
# Logistic regression: accuracy on the test split.
acc_evaluator.evaluate(lr_predictions)

In [0]:
# Logistic regression: F1 score on the test split.
f1_evaluator.evaluate(lr_predictions)

## LINEAR REGRESSION MODEL

In [0]:
from pyspark.ml.regression import LinearRegression
# Linear regression fitted directly against the 0/1 FA label.
lr = LinearRegression(
    featuresCol="features",
    labelCol="FA",
)
lrModel = lr.fit(train_data)

In [0]:
# Fitted linear-regression coefficients, followed by the intercept.
print(lrModel.coefficients, lrModel.intercept)

In [0]:
# Evaluate the linear model on the test split and keep the resulting summary.
testResults = lrModel.evaluate(test_data)

In [0]:
# RMSE, MSE and R^2 from the test-split evaluation summary.
print(testResults.rootMeanSquaredError, testResults.meanSquaredError, testResults.r2)

# Hyperparameter Tuning

In [0]:
from pyspark.ml.evaluation import RegressionEvaluator
# RMSE between the "prediction" column and the FA label; used for the tuning below.
evaluator = RegressionEvaluator(
    metricName="rmse",
    labelCol="FA",
    predictionCol="prediction",
)

In [0]:
# RMSE of the decision tree and random forest predictions, in that order.
for baseline_predictions in (dt_predictions, rf_predictions):
  print(evaluator.evaluate(baseline_predictions))

## RANDOM FOREST

In [0]:
from pyspark.ml.regression import RandomForestRegressor
from pyspark.ml.tuning import CrossValidator, ParamGridBuilder

# `rfr` was referenced below but never defined anywhere in the notebook, so
# this cell raised NameError on a fresh kernel.
# NOTE(review): a random forest *regressor* is assumed from the name `rfr` and
# from the RMSE evaluator used for model selection - confirm this is intended.
rfr = RandomForestRegressor(labelCol="FA", featuresCol="features")

# Single-point grid: maxDepth=2, maxBins=5.
paramGrid = ParamGridBuilder().addGrid(rfr.maxDepth, [2]).addGrid(rfr.maxBins, [5]).build()

# 2-fold cross-validation scored with the RMSE `evaluator`.
crossval = CrossValidator(estimator = rfr, estimatorParamMaps=paramGrid, evaluator = evaluator, numFolds = 2)

cv_model = crossval.fit(train_data)

In [0]:
# Score the test split with the cross-validated model and report its metric from `evaluator`.
cv_predictions = cv_model.transform(test_data)
evaluator.evaluate(cv_predictions)

In [0]:
# Read the selected hyperparameters through the underlying JVM model object.
# NOTE(review): `_java_obj` is a private attribute; presumably used because the
# Python model wrapper did not expose these getters - confirm for your Spark version.
best_rf_java = cv_model.bestModel._java_obj
print(best_rf_java.getMaxDepth())
print(best_rf_java.getMaxBins())

## ELASTIC NET

In [0]:
# Single-point grid for the linear regression: regParam=0.1, elasticNetParam=0.5.
paramGrid = (
    ParamGridBuilder()
    .addGrid(lr.regParam, [0.1])
    .addGrid(lr.elasticNetParam, [0.5])
    .build()
)

# 4-fold cross-validation scored with `evaluator`.
crossval = CrossValidator(
    estimator=lr,
    estimatorParamMaps=paramGrid,
    evaluator=evaluator,
    numFolds=4,
)
ev_model = crossval.fit(train_data)

In [0]:
# Score the test split with the cross-validated linear model and report its metric from `evaluator`.
ev_predictions = ev_model.transform(test_data)
evaluator.evaluate(ev_predictions)

In [0]:
# Read the selected regularisation settings through the underlying JVM model object.
# NOTE(review): `_java_obj` is a private attribute; presumably used because the
# Python model wrapper did not expose these getters - confirm for your Spark version.
best_en_java = ev_model.bestModel._java_obj
print(best_en_java.getRegParam())
print(best_en_java.getElasticNetParam())