In [0]:
# 1. Load the given data file and create a Spark data frame.

# `spark` is the Spark session object, the entry point to the Spark programming
# APIs; its `.read` attribute gives a DataFrame reader, which the chained calls
# below configure before the file is loaded.
fire_df = (
    spark.read
    .format("csv")
    .option("header", "true")       # the data comes with a header row
    .option("inferSchema", "true")  # infer the schema from the data file itself
    .load("/databricks-datasets/learning-spark-v2/sf-fire/sf-fire-calls.csv")
)

# Another way -
# fire_df = spark.read.csv(
#     "/databricks-datasets/learning-spark-v2/sf-fire/sf-fire-calls.csv",
#     header='true', inferSchema='true')

In [0]:
# Alternative: fire_df.show() prints a plain-text preview of the rows instead.
display(fire_df) # render the DataFrame as a structured table in the notebook output

In [0]:
# Expose fire_df to the %sql cells as the global temporary view
# `global_temp.fireview`. createOrReplaceGlobalTempView is used instead of
# createGlobalTempView so this cell can be re-run: the plain create call raises
# an error when a view named "fireview" already exists from an earlier run.
fire_df.createOrReplaceGlobalTempView("fireview")

In [0]:
%sql
SELECT * FROM global_temp.fireview -- global temporary views are registered under the `global_temp` database, hence the qualified name

In [0]:
%sql
-- # 1. How many distinct types of calls were made to the fire department?
-- Counts the distinct values found in the CallType column of the view.
SELECT
  COUNT(DISTINCT CallType)
FROM
  global_temp.fireview

In [0]:
%sql
-- # 2. What are distinct types of calls made to the fire department?
-- Lists every CallType value once.
SELECT DISTINCT
  CallType
FROM
  global_temp.fireview

In [0]:
%sql
-- # 3. Find out all responses or delayed times greater than 5 mins?
-- Returns every column of the rows whose Delay value exceeds 5.
SELECT
  *
FROM
  global_temp.fireview
WHERE
  Delay > 5

In [0]:
%sql
-- # 4. What were the most common call types?
-- One row per CallType with its number of non-NULL occurrences, largest first.
SELECT
  CallType,
  COUNT(CallType) AS cnt
FROM
  global_temp.fireview
GROUP BY
  CallType
ORDER BY
  cnt DESC

In [0]:
%sql
-- # 5. What zip codes accounted for the most common calls?
-- One row per zip code with its number of call records, largest first.
SELECT
  `Zipcode of Incident`,
  COUNT(*) AS cnt
FROM
  global_temp.fireview
GROUP BY
  `Zipcode of Incident`
ORDER BY
  cnt DESC

In [0]:
%sql
-- # 6. What San Francisco neighborhoods are in the zip codes 94102 and 94103?
-- The two zip codes are tested with a single IN predicate: AND binds tighter
-- than OR, so `zip = 94102 OR zip = 94103 AND City = 'SF'` would apply the City
-- filter to the 94103 rows only.
-- DISTINCT lists each neighborhood once instead of once per call record.
SELECT DISTINCT Neighborhood
FROM global_temp.fireview
WHERE `Zipcode of Incident` IN (94102, 94103)
  AND City = 'SF'

In [0]:
%sql
-- # 7. What was the sum of all calls, average, min, and max of the call response times?
-- Sum_of_Calls is the number of call records (COUNT(*)); SUM(Delay) would be the
-- total delay, not a number of calls. The remaining columns summarise Delay.
SELECT COUNT(*) AS Sum_of_Calls, AVG(Delay) AS Avg_Delay, MIN(Delay) AS Min_Delay, MAX(Delay) AS Max_Delay
FROM global_temp.fireview

In [0]:
%sql
-- # 8. How many distinct years of data are in the CSV file?
-- `Call Date` is parsed explicitly before YEAR() is applied: nothing in this
-- notebook converts the column to a date, and YEAR() on a date string that is
-- not in ISO yyyy-MM-dd form yields NULL, which COUNT(DISTINCT ...) ignores.
-- NOTE(review): assumes `Call Date` holds MM/dd/yyyy strings — confirm with
-- DESCRIBE global_temp.fireview and a sample of the values.
SELECT COUNT(DISTINCT YEAR(to_date(`Call Date`, 'MM/dd/yyyy'))) AS distinct_years
FROM global_temp.fireview

In [0]:
%sql
-- # 9. What week of the year in 2018 had the most fire calls?
-- The call date is parsed once in a CTE and reused by the filter and grouping.
-- Nothing in this notebook converts `Call Date` to a date, and EXTRACT on a date
-- string that is not in ISO yyyy-MM-dd form yields NULL, so the 2018 filter
-- would match no rows without the explicit to_date().
-- NOTE(review): assumes `Call Date` holds MM/dd/yyyy strings — confirm with
-- DESCRIBE global_temp.fireview and a sample of the values.
WITH calls AS (
  SELECT to_date(`Call Date`, 'MM/dd/yyyy') AS call_date
  FROM global_temp.fireview
)
SELECT EXTRACT(WEEK FROM call_date) AS week_of_year, COUNT(*) AS call_count
FROM calls
WHERE EXTRACT(YEAR FROM call_date) = 2018
GROUP BY EXTRACT(WEEK FROM call_date)
ORDER BY call_count DESC
LIMIT 1;

In [0]:
%sql
-- # 10. What neighborhoods in San Francisco had the worst response time in 2018?
-- Average Delay per neighborhood for 2018 calls, slowest first.
-- `Call Date` is parsed explicitly: nothing in this notebook converts it to a
-- date, and EXTRACT on a date string that is not in ISO yyyy-MM-dd form yields
-- NULL, so the 2018 filter would match no rows without to_date().
-- NOTE(review): assumes `Call Date` holds MM/dd/yyyy strings — confirm with
-- DESCRIBE global_temp.fireview and a sample of the values.
SELECT Neighborhood, AVG(Delay) AS Avg_response_time
FROM global_temp.fireview
WHERE EXTRACT(YEAR FROM to_date(`Call Date`, 'MM/dd/yyyy')) = 2018
  AND Neighborhood IS NOT NULL
GROUP BY Neighborhood
ORDER BY Avg_response_time DESC;