In [2]:
from pyspark.sql import SparkSession
from pyspark.sql.types import *
#from pyspark.sql import functions as f
from pyspark.sql.functions import *

In [3]:
if __name__ == "__main__":
    spark = (SparkSession
            .builder
            .appName("sf-fire") 
            .getOrCreate())
#spark.conf.set("spark.executor.memory", "2g")

In [4]:
fire_schema = StructType([StructField('CallNumber', IntegerType(), True),
StructField('UnitID', StringType(), True),
StructField('IncidentNumber', IntegerType(), True),
StructField('CallType', StringType(), True),
StructField('CallDate', StringType(), True),
StructField('WatchDate', StringType(), True),
StructField('CallFinalDisposition', StringType(), True),
StructField('AvailableDtTm', StringType(), True),
StructField('Address', StringType(), True),
StructField('City', StringType(), True),
StructField('Zipcode', IntegerType(), True),
StructField('Battalion', StringType(), True),
StructField('StationArea', StringType(), True),
StructField('Box', StringType(), True),
StructField('OriginalPriority', StringType(), True),
StructField('Priority', StringType(), True),
StructField('FinalPriority', IntegerType(), True),
StructField('ALSUnit', BooleanType(), True),
StructField('CallTypeGroup', StringType(), True),
StructField('NumAlarms', IntegerType(), True),
StructField('UnitType', StringType(), True),
StructField('UnitSequenceInCallDispatch', IntegerType(), True),
StructField('FirePreventionDistrict', StringType(), True),
StructField('SupervisorDistrict', StringType(), True),
StructField('Neighborhood', StringType(), True),
StructField('Location', StringType(), True),
StructField('RowID', StringType(), True),
StructField('Delay', FloatType(), True)])

In [5]:
sf_fire_file = "C:/Users/sean.cornillie/Education/LearningSparkV2/Spark_Dev/datasets/sf-fire-calls.csv"

In [6]:
fire_df = spark.read.csv(sf_fire_file, header=True, schema=fire_schema)

In [7]:
print(fire_df.printSchema())

root
 |-- CallNumber: integer (nullable = true)
 |-- UnitID: string (nullable = true)
 |-- IncidentNumber: integer (nullable = true)
 |-- CallType: string (nullable = true)
 |-- CallDate: string (nullable = true)
 |-- WatchDate: string (nullable = true)
 |-- CallFinalDisposition: string (nullable = true)
 |-- AvailableDtTm: string (nullable = true)
 |-- Address: string (nullable = true)
 |-- City: string (nullable = true)
 |-- Zipcode: integer (nullable = true)
 |-- Battalion: string (nullable = true)
 |-- StationArea: string (nullable = true)
 |-- Box: string (nullable = true)
 |-- OriginalPriority: string (nullable = true)
 |-- Priority: string (nullable = true)
 |-- FinalPriority: integer (nullable = true)
 |-- ALSUnit: boolean (nullable = true)
 |-- CallTypeGroup: string (nullable = true)
 |-- NumAlarms: integer (nullable = true)
 |-- UnitType: string (nullable = true)
 |-- UnitSequenceInCallDispatch: integer (nullable = true)
 |-- FirePreventionDistrict: string (nullable = true)
 

In [8]:
fire_df.show(n=10)

+----------+------+--------------+----------------+----------+----------+--------------------+--------------------+--------------------+----+-------+---------+-----------+----+----------------+--------+-------------+-------+-------------+---------+--------+--------------------------+----------------------+------------------+--------------------+--------------------+-------------+---------+
|CallNumber|UnitID|IncidentNumber|        CallType|  CallDate| WatchDate|CallFinalDisposition|       AvailableDtTm|             Address|City|Zipcode|Battalion|StationArea| Box|OriginalPriority|Priority|FinalPriority|ALSUnit|CallTypeGroup|NumAlarms|UnitType|UnitSequenceInCallDispatch|FirePreventionDistrict|SupervisorDistrict|        Neighborhood|            Location|        RowID|    Delay|
+----------+------+--------------+----------------+----------+----------+--------------------+--------------------+--------------------+----+-------+---------+-----------+----+----------------+--------+------------

In [9]:
parquet_path = "C:/Users/sean.cornillie/Education/LearningSparkV2/Spark_Dev/datasets/sf-fire-calls2.parquet"

In [10]:
#fire_df.write.format("parquet").save(parquet_path)

In [11]:
#spark.stop()

### Basic SELECT FROM WHERE Statement

In [12]:
few_fire_df = (fire_df
              .select("IncidentNumber", "AvailableDtTm","CallType")
              .where(col("CallType") != "Medical Incident"))
few_fire_df.show(5, truncate=False)

+--------------+----------------------+--------------+
|IncidentNumber|AvailableDtTm         |CallType      |
+--------------+----------------------+--------------+
|2003235       |01/11/2002 01:51:44 AM|Structure Fire|
|2003250       |01/11/2002 04:16:46 AM|Vehicle Fire  |
|2003259       |01/11/2002 06:01:58 AM|Alarms        |
|2003279       |01/11/2002 08:03:26 AM|Structure Fire|
|2003301       |01/11/2002 09:46:44 AM|Alarms        |
+--------------+----------------------+--------------+
only showing top 5 rows



### Count distinct call types

In [13]:
calltype_df = (fire_df
               .select("CallType")
               .where(col("CallType").isNotNull())
               .agg(countDistinct("CallType").alias("DistinctCallTypes")))
calltype_df.show()

+-----------------+
|DistinctCallTypes|
+-----------------+
|               30|
+-----------------+



### Count # of Incidents Grouped by Call Type

In [14]:
### Basic groupby ###
##callcounts_df = (fire_df
##                 .groupBy("CallType").count()
##                 .where(col("CallType").isNotNull())
##                )
##callcounts_df.show()

In [15]:
### more comprehensive way, can group and then aggregate how we want, add alias, sort, etc. ###
callcounts_df = (fire_df
                 .groupBy("CallType")
                 .agg(count("IncidentNumber").alias("CallCount"))
                 .where(col("CallType").isNotNull())
                 .sort(desc("CallCount"))
                )
callcounts_df.show()

+--------------------+---------+
|            CallType|CallCount|
+--------------------+---------+
|    Medical Incident|   113794|
|      Structure Fire|    23319|
|              Alarms|    19406|
|   Traffic Collision|     7013|
|Citizen Assist / ...|     2524|
|               Other|     2166|
|        Outside Fire|     2094|
|        Vehicle Fire|      854|
|Gas Leak (Natural...|      764|
|        Water Rescue|      755|
|Odor (Strange / U...|      490|
|   Electrical Hazard|      482|
|Elevator / Escala...|      453|
|Smoke Investigati...|      391|
|          Fuel Spill|      193|
|              HazMat|      124|
|Industrial Accidents|       94|
|           Explosion|       89|
|Train / Rail Inci...|       57|
|  Aircraft Emergency|       36|
+--------------------+---------+
only showing top 20 rows



### Renaming Columns

In [16]:
### Original df col naming
(fire_df
.select("Delay")
.where(col("Delay") > 5)
.show(5, False))

### New df col naming
new_fire_df = (fire_df
              .withColumnRenamed("Delay","ResponseDelayedinMins"))

(new_fire_df
.select("ResponseDelayedinMins")
.where(col("ResponseDelayedinMins") > 5)
.show(5, False))

+-----+
|Delay|
+-----+
|5.35 |
|6.25 |
|5.2  |
|5.6  |
|7.25 |
+-----+
only showing top 5 rows

+---------------------+
|ResponseDelayedinMins|
+---------------------+
|5.35                 |
|6.25                 |
|5.2                  |
|5.6                  |
|7.25                 |
+---------------------+
only showing top 5 rows



### Modifying Columns
CallData, WatchDate, AvailableDtTm should all be DateTime but are saved as strings. Need to convert to actual dates.
So, we create new cols with converted data, and then drop old cols

In [17]:
fire_ts_df = (new_fire_df
             .withColumn("IncidentDate", to_timestamp(col("CallDate"), "MM/dd/yyyy"))
             .drop("CallDate")
             .withColumn("OnWatchDate", to_timestamp(col("WatchDate"),"MM/dd/yyyy"))
             .withColumn("AvailableDtTS", to_timestamp(col("AvailableDtTm"),"MM/dd/yyyy hh:mm:ss a"))
             .drop("AvailableDtTm")
             )

(fire_ts_df
    .select("IncidentDate", "OnWatchDate", "AvailableDtTS")
    .show(5, False))

+-------------------+-------------------+-------------------+
|IncidentDate       |OnWatchDate        |AvailableDtTS      |
+-------------------+-------------------+-------------------+
|2002-01-11 00:00:00|2002-01-10 00:00:00|2002-01-11 01:51:44|
|2002-01-11 00:00:00|2002-01-10 00:00:00|2002-01-11 03:01:18|
|2002-01-11 00:00:00|2002-01-10 00:00:00|2002-01-11 02:39:50|
|2002-01-11 00:00:00|2002-01-10 00:00:00|2002-01-11 04:16:46|
|2002-01-11 00:00:00|2002-01-10 00:00:00|2002-01-11 06:01:58|
+-------------------+-------------------+-------------------+
only showing top 5 rows



Now we can query using date functions like month(), year(), and day().

In [18]:
(fire_ts_df
    .select(year('IncidentDate'))
    .distinct()
    .orderBy(year('IncidentDate'))
    .show())

+------------------+
|year(IncidentDate)|
+------------------+
|              2000|
|              2001|
|              2002|
|              2003|
|              2004|
|              2005|
|              2006|
|              2007|
|              2008|
|              2009|
|              2010|
|              2011|
|              2012|
|              2013|
|              2014|
|              2015|
|              2016|
|              2017|
|              2018|
+------------------+



### Aggregations
Already did this above, but looking for most common types of calls

In [19]:
(fire_ts_df
    .groupBy("CallType")
    .agg(count("IncidentNumber").alias("CallCount"))
    .where(col("CallType").isNotNull())
    .sort(desc("CallCount"))
    .show(10, truncate=False)
)

+-------------------------------+---------+
|CallType                       |CallCount|
+-------------------------------+---------+
|Medical Incident               |113794   |
|Structure Fire                 |23319    |
|Alarms                         |19406    |
|Traffic Collision              |7013     |
|Citizen Assist / Service Call  |2524     |
|Other                          |2166     |
|Outside Fire                   |2094     |
|Vehicle Fire                   |854      |
|Gas Leak (Natural and LP Gases)|764      |
|Water Rescue                   |755      |
+-------------------------------+---------+
only showing top 10 rows



### More Basic Aggregate Functions

In [23]:
(fire_ts_df
    .select(sum("NumAlarms").alias("TotalNumAlarms")
            ,avg("ResponseDelayedinMins").alias("AvgResponseTime")
            ,min("ResponseDelayedinMins").alias("MinResponseTime")
            ,max("ResponseDelayedinMins").alias("MaxResponseTime")
           )
    .show())

+--------------+-----------------+---------------+---------------+
|TotalNumAlarms|  AvgResponseTime|MinResponseTime|MaxResponseTime|
+--------------+-----------------+---------------+---------------+
|        176170|3.892364154521585|    0.016666668|        1844.55|
+--------------+-----------------+---------------+---------------+



### Dataset Practice

#### 1. What were all the different types of fire calls in 2018?

In [49]:
### Read up on execution order --> If distinct is placed ahead of where, returns zero rows ###
(fire_ts_df
    .where(year(col("IncidentDate")) == 2018)
    .select("CallType")
    .distinct()
    .orderBy("CallType")
    .show()
)

+--------------------+
|            CallType|
+--------------------+
|              Alarms|
|       Assist Police|
|Citizen Assist / ...|
|   Electrical Hazard|
|Elevator / Escala...|
|           Explosion|
|          Fuel Spill|
|Gas Leak (Natural...|
|              HazMat|
|    Medical Incident|
|Odor (Strange / U...|
|               Other|
|        Outside Fire|
|Smoke Investigati...|
|      Structure Fire|
|  Suspicious Package|
|   Traffic Collision|
|Train / Rail Inci...|
|        Vehicle Fire|
|        Water Rescue|
+--------------------+



#### 2. What months within the year 2018 saw the highest number of fire calls?

In [60]:
### Again when where is placed 'lower', does not run. Perhaps runs best when assembled with standard sql execution order (from..where..groupby..select..etc)
(fire_ts_df
    .where(col("CallType").isNotNull())
    .where(year(col("IncidentDate")) == 2018)
    .groupBy("CallType")
    .agg(count("IncidentNumber").alias("CallCount"))
    .sort(desc("CallCount"))
    .show(10, truncate=False)
)

+-------------------------------+---------+
|CallType                       |CallCount|
+-------------------------------+---------+
|Medical Incident               |7004     |
|Alarms                         |1144     |
|Structure Fire                 |906      |
|Traffic Collision              |433      |
|Outside Fire                   |153      |
|Other                          |114      |
|Citizen Assist / Service Call  |113      |
|Gas Leak (Natural and LP Gases)|69       |
|Water Rescue                   |43       |
|Elevator / Escalator Rescue    |36       |
+-------------------------------+---------+
only showing top 10 rows



#### 3. Which neighborhood in San Francisco generated the most fire calls in 2018?

In [71]:
(fire_ts_df
    .where(col("Neighborhood").isNotNull())
    .where(year(col("IncidentDate")) == 2018)
    .groupBy("Neighborhood")
    .agg(count("IncidentNumber").alias("IncidentCount"))
    .sort(desc("IncidentCount"))
    .show(10, truncate=False)
)

+------------------------------+-------------+
|Neighborhood                  |IncidentCount|
+------------------------------+-------------+
|Tenderloin                    |1393         |
|South of Market               |1053         |
|Mission                       |913          |
|Financial District/South Beach|772          |
|Bayview Hunters Point         |522          |
|Western Addition              |352          |
|Sunset/Parkside               |346          |
|Nob Hill                      |295          |
|Hayes Valley                  |291          |
|Outer Richmond                |262          |
+------------------------------+-------------+
only showing top 10 rows



#### 4. Which neighborhoods had the worst response times to fire calls in 2018?

In [70]:
(fire_ts_df
    .where(col("Neighborhood").isNotNull())
    .where(year(col("IncidentDate")) == 2018)
    .groupBy("Neighborhood")
    .agg(avg("ResponseDelayedinMins").alias("AvgResponseTime"))
    .sort(desc("AvgResponseTime"))
    .show(10, truncate=False)
)

+------------------------------+-----------------+
|Neighborhood                  |AvgResponseTime  |
+------------------------------+-----------------+
|Chinatown                     |6.190314101143033|
|Presidio                      |5.829227011272873|
|Treasure Island               |5.453703684111436|
|McLaren Park                  |4.74404764175415 |
|Bayview Hunters Point         |4.620561962212182|
|Presidio Heights              |4.594131482319093|
|Inner Sunset                  |4.438095217981896|
|Inner Richmond                |4.364728709292966|
|Financial District/South Beach|4.344084616885593|
|Haight Ashbury                |4.266428579390049|
+------------------------------+-----------------+
only showing top 10 rows



#### 5. Which week in the year in 2018 had the most fire calls?

In [73]:
(fire_ts_df
    #.where(col("Neighborhood").isNotNull())
    .where(year(col("IncidentDate")) == 2018)
    .groupBy(weekofyear("IncidentDate"))
    .agg(count("IncidentNumber").alias("IncidentCount"))
    .sort(desc("IncidentCount"))
    .show(10, truncate=False)
)

+------------------------+-------------+
|weekofyear(IncidentDate)|IncidentCount|
+------------------------+-------------+
|22                      |259          |
|40                      |255          |
|43                      |250          |
|25                      |249          |
|1                       |246          |
|44                      |244          |
|13                      |243          |
|32                      |243          |
|11                      |240          |
|5                       |236          |
+------------------------+-------------+
only showing top 10 rows



#### 6. Is there a correlation between neighborhood, zip code, and number of fire calls?

In [75]:
from pyspark.ml.stat import Correlation
from pyspark.ml.feature import VectorAssembler

In [108]:
df_corr = (fire_ts_df
        #.select("Neighborhood", "Zipcode", "IncidentCount")
        .where(col("Zipcode").isNotNull())
        #.where(year(col("IncidentDate")) == 2018)
        .groupBy("Zipcode")
        .agg(count("IncidentNumber").alias("IncidentCount"))
        .sort(desc("IncidentCount"))
        #.show(n=5, truncate=False)
        )
df_corr.show()

+-------+-------------+
|Zipcode|IncidentCount|
+-------+-------------+
|  94102|        21840|
|  94103|        20897|
|  94110|        14801|
|  94109|        14686|
|  94124|         9236|
|  94112|         8421|
|  94115|         7812|
|  94107|         6941|
|  94122|         6355|
|  94133|         6246|
|  94117|         5804|
|  94114|         5175|
|  94118|         5157|
|  94134|         5009|
|  94121|         4555|
|  94132|         4321|
|  94105|         4236|
|  94108|         4084|
|  94116|         3933|
|  94123|         3719|
+-------+-------------+
only showing top 20 rows



In [111]:
vector_col = "corr_features"
assembler = VectorAssembler(inputCols=df_corr.columns, outputCol=vector_col)
df_vector = assembler.transform(df_corr).select(vector_col)

matrix = Correlation.corr(df_vector, vector_col)

matrix.collect()[0]["pearson({})".format(vector_col)].values


array([ 1.        , -0.52625243, -0.52625243,  1.        ])

#### 8. How can we use Parquet files or SQL tables to store this data and read it back?

In [112]:
parquet_path = "C:/Users/sean.cornillie/Education/LearningSparkV2/Spark_Dev/datasets/sf-fire-calls-ts_df.parquet"
fire_df.write.format("parquet").save(parquet_path)

In [113]:
spark.stop()