In [0]:
# Load the SF fire-calls CSV. The first row is used as the header and Spark
# is asked to infer each column's type from the data.
fire_df = (
    spark.read
    .format("csv")
    .options(header="true", inferSchema="true")
    .load("/databricks-datasets/learning-spark-v2/sf-fire/sf-fire-calls.csv")
)

In [0]:
# Preview the DataFrame exactly as it was loaded from the CSV.
display(fire_df)

In [0]:
# Strip the spaces out of several column names so they can be referenced
# without back-tick quoting. Columns not listed here keep their original names.
new_fire_df = (
    fire_df
    .withColumnRenamed("Call Number", "CallNumber")
    .withColumnRenamed("Unit ID", "UnitID")
    .withColumnRenamed("Incident Number", "IncidentNumber")
    .withColumnRenamed("Call Date", "CallDate")
    .withColumnRenamed("Watch Date", "WatchDate")
    .withColumnRenamed("Call Final Disposition", "CallFinalDisposition")
    .withColumnRenamed("Available DtTm", "AvailableDtTm")
)

In [0]:
# Preview the DataFrame after the column renames.
display(new_fire_df)

In [0]:
# Print the column names and inferred types of the renamed DataFrame.
new_fire_df.printSchema()

In [0]:
# NOTE: importing `round` here shadows Python's builtin round() for the rest
# of the notebook.
from pyspark.sql.functions import to_timestamp, round

# Parse AvailableDtTm into a timestamp and round Delay to two decimals.
# The chain is wrapped in parentheses so every .withColumn() belongs to the
# same expression (a dangling, indented `.withColumn` line is a syntax error).
# Pattern letters are case-sensitive: `dd` is day-of-month and `yyyy` is the
# calendar year, whereas `DD` is day-of-year and `YYYY` is the week-based year.
temp_fire_df = (
    new_fire_df
    .withColumn("AvailableDtTm", to_timestamp("AvailableDtTm", "MM/dd/yyyy hh:mm:ss a"))
    .withColumn("Delay", round("Delay", 2))
)

In [0]:
# Mark the raw fire_df for caching so the queries below can reuse it.
# cache() is lazy: nothing is materialized until the first action runs on fire_df.
fire_df.cache()

In [0]:
# 1. How many distinct types of calls were made to the fire department?

# Approach 1 - SQL: register the DataFrame as a view, then query it.

# 1. Register a *global* temp view. The query below addresses the view through
#    the `global_temp` database, which only contains global temp views; a plain
#    createOrReplaceTempView() would not be visible under that name.
fire_df.createOrReplaceGlobalTempView("fireView")
# 2. Run the query against the view.
q1_sql_df = spark.sql("""SELECT COUNT(DISTINCT CallType) FROM global_temp.fireView WHERE CallType is not NULL""")
display(q1_sql_df)

# Approach 2 - DataFrame transformations
q1_df = (
    fire_df
    .where("CallType is not null")
    .select("CallType")
    .distinct()
)
print(q1_df.count())

In [0]:
# 2. What are distinct types of calls made to the fire department?

# Keep only rows with a call type, project that single column and de-duplicate.
q2_df = (
    fire_df
    .filter("CallType is not null")
    .select("CallType")
    .distinct()
)

display(q2_df)
# SQL equivalent: SELECT DISTINCT CallType FROM global_temp.fireview

In [0]:
# 3. Find out all responses or delayed times greater than 5 mins?

# All columns are kept; only rows whose Delay exceeds 5 survive the filter.
q3_df = fire_df.filter("Delay > 5")

display(q3_df)

In [0]:
# 4. What were the most common call types?

# Count calls per call type, most frequent first.
q4_df = (
    fire_df
    .select("CallType")
    .where("CallType is not null")
    .groupBy("CallType")
    .count()
    .orderBy("count", ascending=False)
)

# show() prints the rows and returns None, so it is called separately to keep
# q4_df bound to the DataFrame rather than to None.
q4_df.show()

In [0]:
# 5. What zip codes accounted for the most common calls?

# Count calls per incident zip code, busiest zip codes first.
q5_df = (
    fire_df
    .select("Zipcode of Incident")
    .groupBy("Zipcode of Incident")
    .count()
    .orderBy("count", ascending=False)
)

# show() returns None; call it separately so q5_df stays a DataFrame.
q5_df.show()

In [0]:
# 6. What San Francisco neighborhoods are in the zip codes 94102 and 94103

# In SQL, AND binds tighter than OR, so "a OR b AND c" means "a OR (b AND c)".
# Using IN (...) applies the City filter to both zip codes.
# distinct() lists each neighborhood once instead of once per call.
q6_df = (
    fire_df
    .where("`Zipcode of Incident` IN (94102, 94103) AND City = 'SF'")
    .select("Neighborhood")
    .distinct()
)

# show() returns None; call it separately so q6_df stays a DataFrame.
q6_df.show()

In [0]:
# 7. What was the sum of all calls, average, min, and max of the call response times?

# NOTE: these imports shadow Python's builtin sum(), min() and max() for the
# rest of the notebook.
from pyspark.sql.functions import sum, avg, min, max

# One-row summary of the Delay column.
q7_df = fire_df.select(
    sum("Delay").alias("Sum_of_Calls"),
    avg("Delay").alias("Avg_Delay"),
    min("Delay").alias("Min_Delay"),
    max("Delay").alias("Max_Delay"),
)

# show() returns None; call it separately so q7_df stays a DataFrame.
q7_df.show()

In [0]:
# 8. How many distinct years of data are in the CSV file?
from pyspark.sql.functions import to_date, year

# NOTE(review): assumes "Call Date" is read as an MM/dd/yyyy string, like the
# AvailableDtTm column of this file - confirm with fire_df.printSchema().
# year() cannot cast a string in that layout to a date, so the column is
# parsed explicitly before the year is extracted.
q8_df = (
    fire_df
    .select(year(to_date("Call Date", "MM/dd/yyyy")).alias("CallYear"))
    .where("CallYear is not null")  # a missing date is not a year of data
    .distinct()
)

print(q8_df.count())

In [0]:
# 9. What week of the year in 2018 had the most fire calls?
from pyspark.sql.functions import count, to_date, weekofyear, year

# NOTE(review): assumes "Call Date" is read as an MM/dd/yyyy string, like the
# AvailableDtTm column of this file - confirm with fire_df.printSchema().
# It is parsed explicitly so year() and weekofyear() receive a real date.
call_date = to_date("Call Date", "MM/dd/yyyy")

# Count 2018 calls per week of the year and keep only the busiest week.
q9_df = (
    fire_df
    .filter(year(call_date) == 2018)
    .groupBy(weekofyear(call_date).alias("WeekOfYear"))
    .agg(count("*").alias("CallCount"))
    .orderBy("CallCount", ascending=False)
    .limit(1)
)

# show() returns None; call it separately so q9_df stays a DataFrame.
q9_df.show()

In [0]:
# 10. What neighborhoods in San Francisco had the worst response time in 2018?

from pyspark.sql.functions import avg, col, to_date, year

# NOTE(review): assumes "Call Date" is read as an MM/dd/yyyy string, like the
# AvailableDtTm column of this file - confirm with fire_df.printSchema().
# It is parsed explicitly so year() receives a real date.
call_date = to_date("Call Date", "MM/dd/yyyy")

# Average Delay per neighborhood for 2018 calls, slowest neighborhoods first.
q10_df = (
    fire_df
    .filter((year(call_date) == 2018) & col("Neighborhood").isNotNull())
    .groupBy("Neighborhood")
    .agg(avg("Delay").alias("Avg_response_time"))
    .orderBy("Avg_response_time", ascending=False)
)

# show() returns None; call it separately so q10_df stays a DataFrame.
q10_df.show()