In [1]:
import os, sys
import pyspark
from pyspark.sql import SQLContext

from pyspark.sql import SparkSession # to create dataframes
import pyspark.sql.functions as F # to work with dataframes siimilar to rdd.map()

# Reuse the running SparkContext if there is one, otherwise start it.
sc = pyspark.SparkContext.getOrCreate()

# NOTE: despite its name, `sqlContext` is a Hive-enabled SparkSession
# (it replaced the older `SQLContext(sc)` entry point).
sqlContext = (
    SparkSession.builder
    .appName("test")
    .enableHiveSupport()
    .getOrCreate()
)

import pandas as pd
# Let pandas render wide / long previews without truncation.
pd.set_option('display.max_rows', 999)
pd.set_option('display.max_columns', 999)

from pyspark.sql.functions import lit # to fill new column with custom values
from pyspark.sql.functions import col # to select a dataframe column


In [2]:
# Load the Fire Department sample from parquet and spread it over 8 partitions.
# (A relative path, 'data/FireDepartmentSample.parquet', was used previously.)
sample_path = '/home/shashank/Documents/gitWorkspace/SFFD-Spark-Project/Data/FireDepartmentSample.parquet'
df1 = sqlContext.read.parquet(sample_path).repartition(8)

# Display the resulting partition count.
df1.rdd.getNumPartitions()

8

In [3]:
# Inspect column names and types of the raw data; the date/time columns
# (Call_Date, *_DtTm) are still plain strings at this point.
df1.printSchema()

root
 |-- Call_Number: long (nullable = true)
 |-- Unit_ID: string (nullable = true)
 |-- Incident_Number: long (nullable = true)
 |-- Call_Type: string (nullable = true)
 |-- Call_Date: string (nullable = true)
 |-- Watch_Date: string (nullable = true)
 |-- Received_DtTm: string (nullable = true)
 |-- Entry_DtTm: string (nullable = true)
 |-- Dispatch_DtTm: string (nullable = true)
 |-- Response_DtTm: string (nullable = true)
 |-- On_Scene_DtTm: string (nullable = true)
 |-- Transport_DtTm: string (nullable = true)
 |-- Hospital_DtTm: string (nullable = true)
 |-- Call_Final_Disposition: string (nullable = true)
 |-- Available_DtTm: string (nullable = true)
 |-- Address: string (nullable = true)
 |-- City: string (nullable = true)
 |-- Zipcode_of_Incident: double (nullable = true)
 |-- Battalion: string (nullable = true)
 |-- Station_Area: double (nullable = true)
 |-- Box: double (nullable = true)
 |-- Original_Priority: string (nullable = true)
 |-- Priority: string (nullable = true

In [4]:
# Total number of rows in the sample.
df1.count()

72742

In [5]:
# Summary statistics for Priority. The column is a string, so min/max are
# compared as text (hence a max of 'I').
# NOTE(review): mean/stddev presumably only reflect the values that can be
# read as numbers - confirm before relying on them.
df1.describe('Priority').show()

+-------+-------------------+
|summary|           Priority|
+-------+-------------------+
|  count|              72742|
|   mean| 2.6964165267433016|
| stddev|0.45987062035684634|
|    min|                  1|
|    max|                  I|
+-------+-------------------+



In [6]:
# List the distinct values of Unit_Type present in the data.
df1.select('Unit_Type').distinct().show()

+--------------+
|     Unit_Type|
+--------------+
|       AIRPORT|
|         MEDIC|
|         CHIEF|
|  RESCUE SQUAD|
|RESCUE CAPTAIN|
|         TRUCK|
| INVESTIGATION|
|        ENGINE|
|       SUPPORT|
|       PRIVATE|
+--------------+



In [7]:
# Number of distinct Unit_ID values.
df1.select('Unit_ID').distinct().count()

212

In [8]:
# Sanity check: RowID is a unique row key when both numbers are equal.
distinct_row_ids = df1.select('RowID').distinct().count()
total_rows = df1.count()
distinct_row_ids, total_rows

(72742, 72742)

In [9]:
# data augmentation

# Convert the string date/time columns into real date / timestamp types.
#
# The raw values are NOT zero-padded (e.g. '1/3/2019 19:10', '1/5/2019 9:14'),
# so the patterns use single-letter fields (M, d, H), which accept both
# one- and two-digit values. Two-letter fields (MM/dd/HH) only worked with the
# lenient legacy parser and are rejected by the strict Spark 3 parser.
DATE_FORMAT = 'M/d/yyyy'
TIMESTAMP_FORMAT = 'M/d/yyyy H:mm'

# Call_Date -> date
df1 = df1.withColumn('Call_Date', F.to_date('Call_Date', DATE_FORMAT))

# *_DtTm -> timestamp (withColumn on an existing name replaces it in place,
# so the column order of df1 is unchanged)
for ts_col in ('Received_DtTm', 'Dispatch_DtTm', 'Response_DtTm',
               'On_Scene_DtTm', 'Available_DtTm'):
    df1 = df1.withColumn(ts_col, F.to_timestamp(ts_col, TIMESTAMP_FORMAT))


# month of the year when call was received
df1 = df1.withColumn('Received_month', F.month('Received_DtTm'))

# Hour of the day when call was received
df1 = df1.withColumn('Received_Hour', F.hour('Received_DtTm'))

# response_time = On_Scene_DtTm - Received_DtTm, in minutes
# (unix_timestamp is in seconds; null when the unit never arrived on scene)
df1 = df1.withColumn(
    'response_time',
    (F.unix_timestamp('On_Scene_DtTm') - F.unix_timestamp('Received_DtTm')) / 60,
)

# Preview the augmented frame.
df1.limit(5).toPandas()

Unnamed: 0,Call_Number,Unit_ID,Incident_Number,Call_Type,Call_Date,Watch_Date,Received_DtTm,Entry_DtTm,Dispatch_DtTm,Response_DtTm,On_Scene_DtTm,Transport_DtTm,Hospital_DtTm,Call_Final_Disposition,Available_DtTm,Address,City,Zipcode_of_Incident,Battalion,Station_Area,Box,Original_Priority,Priority,Final_Priority,ALS_Unit,Call_Type_Group,Number_of_Alarms,Unit_Type,Unit_sequence_in_call_dispatch,Fire_Prevention_District,Supervisor_District,Neighborhooods_-_Analysis_Boundaries,Location,RowID,__index_level_0__,Received_month,Received_Hour,response_time
0,183541336,89,18148679,Medical Incident,2018-12-20,12/20/2018,2018-12-20 11:03:00,12/20/2018 11:04,2018-12-20 11:06:00,2018-12-20 11:06:00,2018-12-20 11:20:00,12/20/2018 11:58,12/20/2018 12:17,Code 2 Transport,2018-12-20 12:52:00,FULTON ST/14TH AV,San Francisco,94118.0,B07,31.0,7144.0,2,2,2,True,Potentially Life-Threatening,1,MEDIC,1,7,1,Outer Richmond,"(37.773114421905646, -122.47229734092831)",183541336-89,42823,12,11,17.0
1,183441999,KM06,18144512,Medical Incident,2018-12-10,12/10/2018,2018-12-10 13:28:00,12/10/2018 13:28,2018-12-10 13:29:00,NaT,NaT,,,SFPD,2018-12-10 13:31:00,0 Block of DAKOTA ST,San Francisco,94107.0,B10,37.0,2614.0,A,2,2,False,Non Life-threatening,1,PRIVATE,2,10,10,Potrero Hill,"(37.75362874594843, -122.39552313858634)",183441999-KM06,34612,12,13,
2,190033206,B01,19001249,Alarms,2019-01-03,1/3/2019,2019-01-03 19:07:00,1/3/2019 19:10,2019-01-03 19:10:00,2019-01-03 19:10:00,2019-01-03 19:17:00,,,Fire,2019-01-03 19:29:00,2336-A JONES ST,San Francisco,94133.0,B01,28.0,1515.0,3,3,3,False,Alarm,1,CHIEF,3,1,3,Russian Hill,"(37.80289695118548, -122.41635097322745)",190033206-B01,54899,1,19,10.0
3,190202768,E36,19008569,Medical Incident,2019-01-20,1/20/2019,2019-01-20 18:26:00,1/20/2019 18:27,2019-01-20 18:27:00,2019-01-20 18:29:00,NaT,,,Code 2 Transport,2019-01-20 18:35:00,1400 Block of MARKET ST,San Francisco,94102.0,B02,36.0,3219.0,3,3,3,True,Potentially Life-Threatening,1,ENGINE,2,2,6,Tenderloin,"(37.77591186688677, -122.41846357838125)",190202768-E36,67745,1,18,
4,183483352,E05,18146420,Medical Incident,2018-12-14,12/14/2018,2018-12-14 21:43:00,12/14/2018 21:43,2018-12-14 21:44:00,2018-12-14 21:45:00,2018-12-14 21:46:00,,,Code 2 Transport,2018-12-14 21:53:00,2200 Block of MARKET ST,San Francisco,94114.0,B05,6.0,5231.0,3,3,3,False,Potentially Life-Threatening,1,ENGINE,1,5,8,Castro/Upper Market,"(37.764951660709535, -122.431937558766)",183483352-E05,38321,12,21,3.0


In [10]:
# Rows with a strictly positive response time versus all rows; rows with a
# null response_time are not counted by the filter.
positive_response_rows = df1.where(df1.response_time > 0).count()
all_rows = df1.count()
positive_response_rows, all_rows

(58625, 72742)

In [11]:
# Expose df1 to Spark SQL under the name 'temptable'.
# createOrReplaceTempView is the supported replacement for the deprecated
# registerTempTable and is safe to call again when the cell is re-run.
df1.createOrReplaceTempView('temptable')

In [12]:
# For every dispatch row (A), average the response times recorded by the same
# unit (Unit_ID) on calls (B) received within the preceding day and strictly
# earlier than A's call.
#
# AVG ignores NULL response times, exactly like the former
# SUM(...)/COUNT(...) expression, and yields NULL when none of the earlier
# calls has a response time. Rows whose unit has no earlier call in the
# window produce no output row at all.
df2 = sqlContext.sql("""
SELECT
    A.RowID AS temp_rowid,
    AVG(B.response_time) AS avg_response_history
FROM
    temptable A JOIN temptable B ON A.Unit_ID = B.Unit_ID
WHERE
    (B.Received_DtTm >= A.Received_DtTm - INTERVAL 1 DAY)
    AND (B.Received_DtTm < A.Received_DtTm)
GROUP BY A.RowID
""")

df2.show(10)

+---------------+--------------------+
|     temp_rowid|avg_response_history|
+---------------+--------------------+
|183373368-AM108|               15.75|
|   183461142-94|   9.333333333333334|
|183210345-AM122|                24.5|
|  183241510-E36|   5.571428571428571|
|  183313762-RS2|                 7.5|
|   190080251-86|                12.9|
|  190161792-RS1|               343.0|
|  190072988-E02|   7.142857142857143|
|  183451711-RC3|                 8.0|
|  183382542-B08|                null|
+---------------+--------------------+
only showing top 10 rows



In [13]:
# Attach avg_response_history to the dispatch rows.
# df2 is derived from df1 (via 'temptable'), so this is an enrichment of df1:
# a left join keeps every df1 row and can never add rows that exist only in
# df2, which a full outer join would allow. Rows without a match get a null
# avg_response_history.
df1 = df1.join(df2, df1.RowID == df2.temp_rowid, 'left')

# The join key copy from df2 is no longer needed.
df1 = df1.drop('temp_rowid')
df1.printSchema()

root
 |-- Call_Number: long (nullable = true)
 |-- Unit_ID: string (nullable = true)
 |-- Incident_Number: long (nullable = true)
 |-- Call_Type: string (nullable = true)
 |-- Call_Date: date (nullable = true)
 |-- Watch_Date: string (nullable = true)
 |-- Received_DtTm: timestamp (nullable = true)
 |-- Entry_DtTm: string (nullable = true)
 |-- Dispatch_DtTm: timestamp (nullable = true)
 |-- Response_DtTm: timestamp (nullable = true)
 |-- On_Scene_DtTm: timestamp (nullable = true)
 |-- Transport_DtTm: string (nullable = true)
 |-- Hospital_DtTm: string (nullable = true)
 |-- Call_Final_Disposition: string (nullable = true)
 |-- Available_DtTm: timestamp (nullable = true)
 |-- Address: string (nullable = true)
 |-- City: string (nullable = true)
 |-- Zipcode_of_Incident: double (nullable = true)
 |-- Battalion: string (nullable = true)
 |-- Station_Area: double (nullable = true)
 |-- Box: double (nullable = true)
 |-- Original_Priority: string (nullable = true)
 |-- Priority: string (nu

In [14]:
# Persist the augmented frame. The default save mode raises an error when the
# target already exists, which breaks every re-run of the notebook; overwrite
# the previous (derived) output instead.
df1.write.mode('overwrite').parquet('/home/shashank/Documents/gitWorkspace/SFFD-Spark-Project/Data/FireNew.parquet')

In [15]:
# Preview five rows as a pandas frame; avg_response_history is now the last column.
df1.limit(5).toPandas()

Unnamed: 0,Call_Number,Unit_ID,Incident_Number,Call_Type,Call_Date,Watch_Date,Received_DtTm,Entry_DtTm,Dispatch_DtTm,Response_DtTm,On_Scene_DtTm,Transport_DtTm,Hospital_DtTm,Call_Final_Disposition,Available_DtTm,Address,City,Zipcode_of_Incident,Battalion,Station_Area,Box,Original_Priority,Priority,Final_Priority,ALS_Unit,Call_Type_Group,Number_of_Alarms,Unit_Type,Unit_sequence_in_call_dispatch,Fire_Prevention_District,Supervisor_District,Neighborhooods_-_Analysis_Boundaries,Location,RowID,__index_level_0__,Received_month,Received_Hour,response_time,avg_response_history
0,183050174,64,18127698,Medical Incident,2018-11-01,10/31/2018,2018-11-01 01:06:00,11/1/2018 1:07,2018-11-01 01:07:00,2018-11-01 01:08:00,NaT,,,Code 2 Transport,2018-11-01 01:10:00,1600 Block of 45TH AV,San Francisco,94122.0,B08,23.0,7661.0,3,3,3,True,Potentially Life-Threatening,1,MEDIC,3,8.0,4,Sunset/Parkside,"(37.755806416201054, -122.5045977128629)",183050174-64,52,11,1,,7.5
1,183051938,86,18127873,Medical Incident,2018-11-01,11/1/2018,2018-11-01 12:23:00,11/1/2018 12:24,2018-11-01 12:24:00,2018-11-01 12:24:00,2018-11-01 12:31:00,11/1/2018 12:53,11/1/2018 13:09,Code 2 Transport,2018-11-01 13:44:00,300 Block of ARGUELLO BLVD,San Francisco,94118.0,B07,10.0,7114.0,3,3,3,True,Potentially Life-Threatening,1,MEDIC,2,7.0,1,Inner Richmond,"(37.78443958806357, -122.45915822168865)",183051938-86,379,11,12,8.0,10.666667
2,183052328,50,18127908,Medical Incident,2018-11-01,11/1/2018,2018-11-01 13:49:00,11/1/2018 13:49,2018-11-01 13:49:00,2018-11-01 13:49:00,2018-11-01 13:53:00,11/1/2018 13:58,11/1/2018 14:06,Code 2 Transport,2018-11-01 14:30:00,1000 Block of MISSION ST,San Francisco,94103.0,B03,1.0,2251.0,A,2,2,True,Non Life-threatening,1,MEDIC,1,3.0,6,South of Market,"(37.78013037449471, -122.40987144993309)",183052328-50,437,11,13,4.0,4.0
3,183052600,RA48,18127935,Medical Incident,2018-11-01,11/1/2018,2018-11-01 14:47:00,11/1/2018 14:50,2018-11-01 14:50:00,2018-11-01 14:51:00,2018-11-01 14:54:00,,,Code 2 Transport,2018-11-01 15:36:00,1400 Block of HALIBUT CT,Treasure Isla,94130.0,B03,48.0,2931.0,2,2,2,False,Potentially Life-Threatening,1,MEDIC,1,,6,Treasure Island,"(37.82481904268439, -122.37503044221887)",183052600-RA48,478,11,14,7.0,6.0
4,183053166,RC1,18127985,Medical Incident,2018-11-01,11/1/2018,2018-11-01 17:00:00,11/1/2018 17:02,2018-11-01 17:12:00,2018-11-01 17:13:00,2018-11-01 17:15:00,,,Medical Examiner,2018-11-01 17:53:00,MINNA ST/NEW MONTGOMERY ST,San Francisco,94105.0,B03,1.0,2157.0,3,E,3,True,Potentially Life-Threatening,1,RESCUE CAPTAIN,3,3.0,6,Financial District/South Beach,"(37.78701078768754, -122.39997496189893)",183053166-RC1,584,11,17,15.0,8.5


In [16]:
# Re-check the schema: Call_Date is now a date and the parsed *_DtTm columns
# are timestamps.
df1.printSchema()

root
 |-- Call_Number: long (nullable = true)
 |-- Unit_ID: string (nullable = true)
 |-- Incident_Number: long (nullable = true)
 |-- Call_Type: string (nullable = true)
 |-- Call_Date: date (nullable = true)
 |-- Watch_Date: string (nullable = true)
 |-- Received_DtTm: timestamp (nullable = true)
 |-- Entry_DtTm: string (nullable = true)
 |-- Dispatch_DtTm: timestamp (nullable = true)
 |-- Response_DtTm: timestamp (nullable = true)
 |-- On_Scene_DtTm: timestamp (nullable = true)
 |-- Transport_DtTm: string (nullable = true)
 |-- Hospital_DtTm: string (nullable = true)
 |-- Call_Final_Disposition: string (nullable = true)
 |-- Available_DtTm: timestamp (nullable = true)
 |-- Address: string (nullable = true)
 |-- City: string (nullable = true)
 |-- Zipcode_of_Incident: double (nullable = true)
 |-- Battalion: string (nullable = true)
 |-- Station_Area: double (nullable = true)
 |-- Box: double (nullable = true)
 |-- Original_Priority: string (nullable = true)
 |-- Priority: string (nu

In [22]:
# Read the augmented data back from disk to verify the parquet round trip.
df3 = sqlContext.read.parquet('/home/shashank/Documents/gitWorkspace/SFFD-Spark-Project/Data/FireNew.parquet')

# Repartition the frame that was just read. This line used to repartition
# df1, which silently threw away the parquet read above, so the file on disk
# was never actually inspected.
df3 = df3.repartition(8)
df3.limit(5).toPandas()

Unnamed: 0,Call_Number,Unit_ID,Incident_Number,Call_Type,Call_Date,Watch_Date,Received_DtTm,Entry_DtTm,Dispatch_DtTm,Response_DtTm,On_Scene_DtTm,Transport_DtTm,Hospital_DtTm,Call_Final_Disposition,Available_DtTm,Address,City,Zipcode_of_Incident,Battalion,Station_Area,Box,Original_Priority,Priority,Final_Priority,ALS_Unit,Call_Type_Group,Number_of_Alarms,Unit_Type,Unit_sequence_in_call_dispatch,Fire_Prevention_District,Supervisor_District,Neighborhooods_-_Analysis_Boundaries,Location,RowID,__index_level_0__,Received_month,Received_Hour,response_time,avg_response_history
0,190050868,67,19001894,Medical Incident,2019-01-05,1/5/2019,2019-01-05 09:13:00,1/5/2019 9:14,2019-01-05 09:14:00,2019-01-05 09:15:00,2019-01-05 09:18:00,,,Medical Examiner,2019-01-05 10:22:00,1400 Block of 10TH AVE,San Francisco,94122.0,B08,22.0,7334.0,E,2,2,True,Potentially Life-Threatening,1,MEDIC,2,8.0,7,Inner Sunset,"(37.76119934311896, -122.46711833553167)",190050868-67,56135,1,9,5.0,10.5
1,183452877,T48,18145005,Medical Incident,2018-12-11,12/11/2018,2018-12-11 16:48:00,12/11/2018 16:50,2018-12-11 17:11:00,2018-12-11 17:13:00,NaT,,,Code 2 Transport,2018-12-11 17:15:00,1200 Block of EXPOSITION DR,Treasure Isla,94130.0,B03,48.0,2931.0,3,3,3,False,Potentially Life-Threatening,1,TRUCK,4,,6,Treasure Island,"(37.829470112515025, -122.37178247625785)",183452877-T48,35605,12,16,,
2,183462284,57,18145343,Medical Incident,2018-12-12,12/12/2018,2018-12-12 14:29:00,12/12/2018 14:29,2018-12-12 14:30:00,2018-12-12 14:30:00,2018-12-12 14:36:00,12/12/2018 14:48,12/12/2018 14:55,Code 2 Transport,2018-12-12 15:32:00,1100 Block of MASONIC AVE,San Francisco,94117.0,B05,21.0,4466.0,C,3,3,True,Potentially Life-Threatening,1,MEDIC,1,5.0,5,Haight Ashbury,"(37.770667264403556, -122.4455084353367)",183462284-57,36247,12,14,7.0,12.0
3,183183572,91,18133704,Medical Incident,2018-11-14,11/14/2018,2018-11-14 18:31:00,11/14/2018 18:32,2018-11-14 18:36:00,2018-11-14 18:36:00,NaT,,,No Merit,2018-11-14 18:43:00,100 Block of 6TH ST,San Francisco,94103.0,B03,1.0,2251.0,E,3,3,True,Potentially Life-Threatening,1,MEDIC,4,3.0,6,South of Market,"(37.78079208027532, -122.40838574549893)",183183572-91,12150,11,18,,12.0
4,183240253,62,18135988,Medical Incident,2018-11-20,11/19/2018,2018-11-20 03:21:00,11/20/2018 3:36,2018-11-20 03:37:00,2018-11-20 03:37:00,NaT,,,Code 2 Transport,2018-11-20 03:41:00,400 Block of TURK ST,San Francisco,94102.0,B02,3.0,1554.0,2,2,2,True,Non Life-threatening,1,MEDIC,2,2.0,6,Tenderloin,"(37.78244849821827, -122.41632744176792)",183240253-62,16792,11,3,,7.5


In [24]:
# Coverage check on df3: rows with a positive response_time, rows with a
# positive avg_response_history, and the total row count.
positive_response_rows = df3.where(df3.response_time > 0).count()
positive_history_rows = df3.where(df3.avg_response_history > 0).count()
positive_response_rows, positive_history_rows, df3.count()

(58625, 69928, 72742)