In [1]:
from pyspark.sql import SparkSession
spark = (SparkSession
 .builder
 .appName("PythonChapter3")
 .getOrCreate())

In [1]:
from pyspark.sql.types import *

fire_schema = StructType([StructField('CallNumber', IntegerType(), True),
 StructField('UnitID', StringType(), True),
 StructField('IncidentNumber', IntegerType(), True),
 StructField('CallType', StringType(), True), 
 StructField('CallDate', StringType(), True), 
 StructField('WatchDate', StringType(), True),
 StructField('CallFinalDisposition', StringType(), True),
 StructField('AvailableDtTm', StringType(), True),
 StructField('Address', StringType(), True), 
 StructField('City', StringType(), True), 
 StructField('Zipcode', IntegerType(), True), 
 StructField('Battalion', StringType(), True), 
 StructField('StationArea', StringType(), True), 
 StructField('Box', StringType(), True), 
 StructField('OriginalPriority', StringType(), True), 
 StructField('Priority', StringType(), True), 
 StructField('FinalPriority', IntegerType(), True), 
 StructField('ALSUnit', BooleanType(), True), 
 StructField('CallTypeGroup', StringType(), True),
 StructField('NumAlarms', IntegerType(), True),
 StructField('UnitType', StringType(), True),
 StructField('UnitSequenceInCallDispatch', IntegerType(), True),
 StructField('FirePreventionDistrict', StringType(), True),
 StructField('SupervisorDistrict', StringType(), True),
 StructField('Neighborhood', StringType(), True),
 StructField('Location', StringType(), True),
 StructField('RowID', StringType(), True),
 StructField('Delay', FloatType(), True)])

sf_fire_file = "C:/Users/alvaro.romero/Big_Data/LearningSparkV2-master/chapter3/data/sf-fire-calls.csv"
fire_df = spark.read.csv(sf_fire_file, header=True, schema=fire_schema)

In [2]:
sampleDF = spark.read.csv("C:/Users/alvaro.romero/Big_Data/LearningSparkV2-master/chapter3/data/sf-fire-calls.csv", header=True, samplingRatio=0.001)


In [9]:
from pyspark.sql.types import *

new_fire_df =fire_df.withColumnRenamed("Delay", "ResponseDelayedMins")
new_fire_df.select("ResponseDelayedMins").where(col("ResponseDelayedMins")>5).show(5, False)

+-------------------+
|ResponseDelayedMins|
+-------------------+
|5.35               |
|6.25               |
|5.2                |
|5.6                |
|7.25               |
+-------------------+
only showing top 5 rows



In [4]:
from pyspark.sql.functions import *

fire_ts_df = (new_fire_df
              .withColumn("IncidentDate", to_timestamp(col("CallDate"), "MM/dd/yyyy"))
              .drop("CallDate")
              .withColumn("OnWatchDate", to_timestamp(col("WatchDate"), "MM/dd/yyyy"))
              .drop("WatchDate")
              .withColumn("AvailableDtTS", to_timestamp(col("AvailableDtTm"), "MM/dd/yyyy hh:mm:ss a"))
              .drop("AvailableDtTm"))

In [5]:
(fire_ts_df
.select("IncidentDate", "OnWatchDate", "AvailableDtTS")
.show(5,False))

+-------------------+-------------------+-------------------+
|IncidentDate       |OnWatchDate        |AvailableDtTS      |
+-------------------+-------------------+-------------------+
|2002-01-11 00:00:00|2002-01-10 00:00:00|2002-01-11 01:51:44|
|2002-01-11 00:00:00|2002-01-10 00:00:00|2002-01-11 03:01:18|
|2002-01-11 00:00:00|2002-01-10 00:00:00|2002-01-11 02:39:50|
|2002-01-11 00:00:00|2002-01-10 00:00:00|2002-01-11 04:16:46|
|2002-01-11 00:00:00|2002-01-10 00:00:00|2002-01-11 06:01:58|
+-------------------+-------------------+-------------------+
only showing top 5 rows



In [6]:
(fire_ts_df
.select(year("IncidentDate"))
.distinct()
.orderBy(year("IncidentDate"))
.show())

+------------------+
|year(IncidentDate)|
+------------------+
|              2000|
|              2001|
|              2002|
|              2003|
|              2004|
|              2005|
|              2006|
|              2007|
|              2008|
|              2009|
|              2010|
|              2011|
|              2012|
|              2013|
|              2014|
|              2015|
|              2016|
|              2017|
|              2018|
+------------------+



In [7]:
(fire_ts_df
.select("CallType")
.where(col("CallType").isNotNull())
.groupBy("CallType")
.count()
.orderBy(desc("count"))
.show(10,False))

+-------------------------------+------+
|CallType                       |count |
+-------------------------------+------+
|Medical Incident               |113794|
|Structure Fire                 |23319 |
|Alarms                         |19406 |
|Traffic Collision              |7013  |
|Citizen Assist / Service Call  |2524  |
|Other                          |2166  |
|Outside Fire                   |2094  |
|Vehicle Fire                   |854   |
|Gas Leak (Natural and LP Gases)|764   |
|Water Rescue                   |755   |
+-------------------------------+------+
only showing top 10 rows



In [8]:
(fire_ts_df
.select("CallType")
.where(col("CallType").isNotNull())
.distinct()
.show(30,False))

+--------------------------------------------+
|CallType                                    |
+--------------------------------------------+
|Elevator / Escalator Rescue                 |
|Marine Fire                                 |
|Aircraft Emergency                          |
|Confined Space / Structure Collapse         |
|Administrative                              |
|Alarms                                      |
|Odor (Strange / Unknown)                    |
|Citizen Assist / Service Call               |
|HazMat                                      |
|Watercraft in Distress                      |
|Explosion                                   |
|Oil Spill                                   |
|Vehicle Fire                                |
|Suspicious Package                          |
|Extrication / Entrapped (Machinery, Vehicle)|
|Other                                       |
|Outside Fire                                |
|Traffic Collision                           |
|Assist Polic

In [16]:
(fire_ts_df
.filter(year(col("IncidentDate"))==2018)
.groupBy(month(col("IncidentDate")))
.count()
.orderBy(desc("count"))
.show(12))

+-------------------+-----+
|month(IncidentDate)|count|
+-------------------+-----+
|                 10| 1068|
|                  5| 1047|
|                  3| 1029|
|                  8| 1021|
|                  1| 1007|
|                  7|  974|
|                  6|  974|
|                  9|  951|
|                  4|  947|
|                  2|  919|
|                 11|  199|
+-------------------+-----+



In [17]:
(fire_ts_df
.select("Neighborhood")
.where(year(col("IncidentDate")) == 2018)
.groupBy("Neighborhood")
.count()
.orderBy(desc("count"))
.show(50, False))

+------------------------------+-----+
|Neighborhood                  |count|
+------------------------------+-----+
|Tenderloin                    |1393 |
|South of Market               |1053 |
|Mission                       |913  |
|Financial District/South Beach|772  |
|Bayview Hunters Point         |522  |
|Western Addition              |352  |
|Sunset/Parkside               |346  |
|Nob Hill                      |295  |
|Hayes Valley                  |291  |
|Outer Richmond                |262  |
|Castro/Upper Market           |251  |
|North Beach                   |231  |
|Excelsior                     |212  |
|Potrero Hill                  |210  |
|West of Twin Peaks            |210  |
|Marina                        |191  |
|Chinatown                     |191  |
|Pacific Heights               |191  |
|Mission Bay                   |178  |
|Bernal Heights                |170  |
|Lakeshore                     |159  |
|Inner Sunset                  |154  |
|Russian Hill            

In [18]:
(fire_ts_df
.select("Neighborhood", "ResponseDelayedMins")
.filter(year(col("IncidentDate")) == 2018)
.show(10, False))

+------------------------------+-------------------+
|Neighborhood                  |ResponseDelayedMins|
+------------------------------+-------------------+
|Presidio Heights              |2.8833334          |
|Mission Bay                   |6.3333335          |
|Chinatown                     |2.65               |
|Financial District/South Beach|3.5333333          |
|Tenderloin                    |1.1                |
|Bayview Hunters Point         |4.05               |
|Inner Richmond                |2.5666666          |
|Inner Sunset                  |1.4                |
|Sunset/Parkside               |2.6666667          |
|South of Market               |1.7666667          |
+------------------------------+-------------------+
only showing top 10 rows



In [19]:
(fire_ts_df
.filter(year(col("IncidentDate"))==2018)
.groupBy(weekofyear(col("IncidentDate")))
.count()
.orderBy(desc("count"))
.show(12))

+------------------------+-----+
|weekofyear(IncidentDate)|count|
+------------------------+-----+
|                      22|  259|
|                      40|  255|
|                      43|  250|
|                      25|  249|
|                       1|  246|
|                      44|  244|
|                      13|  243|
|                      32|  243|
|                      11|  240|
|                      18|  236|
|                       5|  236|
|                      23|  235|
+------------------------+-----+
only showing top 12 rows



In [20]:
(fire_ts_df
.select("CallType", "ZipCode")
.where(col("CallType").isNotNull())
.groupBy("CallType", "ZipCode")
.count()
.orderBy(desc("count"))
.show())

+----------------+-------+-----+
|        CallType|ZipCode|count|
+----------------+-------+-----+
|Medical Incident|  94102|16130|
|Medical Incident|  94103|14775|
|Medical Incident|  94110| 9995|
|Medical Incident|  94109| 9479|
|Medical Incident|  94124| 5885|
|Medical Incident|  94112| 5630|
|Medical Incident|  94115| 4785|
|Medical Incident|  94122| 4323|
|Medical Incident|  94107| 4284|
|Medical Incident|  94133| 3977|
|Medical Incident|  94117| 3522|
|Medical Incident|  94134| 3437|
|Medical Incident|  94114| 3225|
|Medical Incident|  94118| 3104|
|Medical Incident|  94121| 2953|
|Medical Incident|  94116| 2738|
|Medical Incident|  94132| 2594|
|  Structure Fire|  94110| 2267|
|Medical Incident|  94105| 2258|
|  Structure Fire|  94102| 2229|
+----------------+-------+-----+
only showing top 20 rows



In [21]:
(fire_ts_df
.select("Neighborhood", "ZipCode")
.where((col("ZipCode")==94102) | (col("ZipCode")==94103))
.distinct()
.show(10,truncate=False))

+------------------------------+-------+
|Neighborhood                  |ZipCode|
+------------------------------+-------+
|Potrero Hill                  |94103  |
|Western Addition              |94102  |
|Tenderloin                    |94102  |
|Nob Hill                      |94102  |
|Castro/Upper Market           |94103  |
|South of Market               |94102  |
|South of Market               |94103  |
|Hayes Valley                  |94103  |
|Financial District/South Beach|94102  |
|Mission Bay                   |94103  |
+------------------------------+-------+
only showing top 10 rows



In [22]:
# El último parámetro de StructField(‘Delay’, FloatType(), True) significa que el campo “Delay” puede ser nulo.
# En caso de ser False, no podría ser nulo. 

In [23]:
# Los DataSets son colecciones de objetos fuertemente tipados y los DataFrames son DataSets sin tipado. 
# En Scala, un DataFrame es un alias para un DataSet[Row], siendo Row un objeto JVM genérico sin tipado 
# que puede contener diferentes tipos de campos. 

In [24]:
# Guardando en parquet
fire_ts_df.write.format("parquet").mode("overwrite").save("C:/Users/alvaro.romero/Big_Data/Ejercicios_Spark/Guardar_Tablas/Parquet")

In [25]:
# Guardando en json
fire_ts_df.write.format("json").mode("overwrite").save("C:/Users/alvaro.romero/Big_Data/Ejercicios_Spark/Guardar_Tablas/JSON/")

In [35]:
# Guardando en csv
fire_ts_df.write.format("csv").mode("overwrite").save("C:/Users/alvaro.romero/Big_Data/Ejercicios_Spark/Guardar_Tablas/CSV/")

In [27]:
# Guardando en avro
fire_ts_df.write.format("avro").mode("overwrite").save("C:/Users/alvaro.romero/Big_Data/Ejercicios_Spark/Guardar_Tablas/AVRO/")

In [29]:
# Hay más de un fichero debido a las particiones.
# El comando getNumPartitions en el RDD sirve para obtener el número de particiones de un DataFrame
fire_ts_df.rdd.getNumPartitions
# El comando coalesce en el RDD sirve para reducir el número de particiones de un DataFrame
# El comando partition en el RDD sirve para aumentar el número de particiones de un DataFrame
prueba_fire_ts_df = fire_ts_df.coalesce(1)
prueba_fire_ts_df.rdd.getNumPartitions

<bound method RDD.getNumPartitions of MapPartitionsRDD[126] at javaToPython at NativeMethodAccessorImpl.java:0>

In [30]:
# Guardando en parquet
prueba_fire_ts_df.write.format("parquet").mode("overwrite").save("C:/Users/alvaro.romero/Big_Data/Ejercicios_Spark/Guardar_Tablas2/Parquet")

In [32]:
# Guardando en json
prueba_fire_ts_df.write.format("json").mode("overwrite").save("C:/Users/alvaro.romero/Big_Data/Ejercicios_Spark/Guardar_Tablas2/JSON/")

In [33]:
# Guardando en csv
prueba_fire_ts_df.write.format("csv").mode("overwrite").save("C:/Users/alvaro.romero/Big_Data/Ejercicios_Spark/Guardar_Tablas2/CSV/")

In [34]:
# Guardando en avro
prueba_fire_ts_df.write.format("avro").mode("overwrite").save("C:/Users/alvaro.romero/Big_Data/Ejercicios_Spark/Guardar_Tablas2/AVRO/")