# Week 5 Examples
### Spark SQL

In [1]:
import os
# Arguments handed to the PySpark launcher; must be set before the session is created.
# NOTE(review): this requests `--master local[2]`, but the SparkSession builder later in
# the notebook calls .master("local[*]") — confirm which master setting is intended.
os.environ["PYSPARK_SUBMIT_ARGS"] = '--master local[2] pyspark-shell'

In [2]:
import findspark
# Initialise findspark before any pyspark import below
# (presumably so the local Spark installation is importable — TODO confirm).
findspark.init()

### Custom Spark config

In [3]:
from pyspark.sql import SparkSession

# Create (or reuse) the notebook's SparkSession with an explicit app name, a local
# master and a custom executor-memory setting.
# NOTE(review): PYSPARK_SUBMIT_ARGS is set earlier with `--master local[2]`, while this
# builder asks for local[*] — confirm which master is intended.
spark = (
    SparkSession.builder
    .appName("Week 5 Examples")
    .master("local[*]")
    .config("spark.executor.memory", "2g")
    .getOrCreate()
)

In [4]:
# Extract the SparkContext from the session.
sc = spark.sparkContext

In [5]:
# Check that the executor-memory value passed to the builder is visible
# in the session's runtime configuration.
spark.conf.get('spark.executor.memory')

'2g'

### DataFrame Reader

In [9]:
# Load the sales CSV, treating its first line as the column header.
file = "data/sales.csv"
df = (
    spark.read
    .format("csv")
    .option("header", "true")
    .load(file)
)

In [10]:
# Preview the first five rows of the sales data.
df.show(n=5)

+------+--------------+----------+----------+--------------+-----------+---------------+---------+-------------+---------------+----------+-----------+------+---------------+---------------+------------+--------------------+--------+--------+--------+--------+
|Row ID|      Order ID|Order Date| Ship Date|     Ship Mode|Customer ID|  Customer Name|  Segment|      Country|           City|     State|Postal Code|Region|     Product ID|       Category|Sub-Category|        Product Name|   Sales|Quantity|Discount|  Profit|
+------+--------------+----------+----------+--------------+-----------+---------------+---------+-------------+---------------+----------+-----------+------+---------------+---------------+------------+--------------------+--------+--------+--------+--------+
|     1|CA-2016-152156| 11/8/2016|11/11/2016|  Second Class|   CG-12520|    Claire Gute| Consumer|United States|      Henderson|  Kentucky|      42420| South|FUR-BO-10001798|      Furniture|   Bookcases|Bush Somerset 

### DataFrame Writer

In [13]:
# Write the sales DataFrame out as Snappy-compressed ORC, partitioned by the
# Category column, overwriting whatever is already at the target path.
(
    df.write
    .mode("overwrite")
    .format("orc")
    .option("compression", "snappy")
    .partitionBy("Category")
    .save("./data/orc/sales_data")
)

### Joins

In [26]:
from pyspark.sql.functions import desc

In [18]:
# Load the fire-incidents CSV, using its first line as the column header.
file_i = "data/sf-fire-incidents.csv"
incidents = (
    spark.read
    .format("csv")
    .option("header", "true")
    .load(file_i)
)

In [19]:
# Load the fire-calls CSV, using its first line as the column header.
file_c = "data/sf-fire-calls.csv"
calls = (
    spark.read
    .format("csv")
    .option("header", "true")
    .load(file_c)
)

In [20]:
# Peek at a single incident row to see the available columns.
incidents.show(n=1)

+---------------+---------------+--------------------+-------------+-----------+--------------------+--------------------+--------------------+-------------+-------+---------+------------+----+-----------------+---------------------+---------+-------------+-----------+---------------+-------------------+-----------------------+-----------------------+---------------+-------------+-------------------+-----------------+----------------+--------------------+----------+--------------------+----------------------+------------------+--------------------------+--------------------+-------------------+--------------+-----------------------+-------------------------+-----------+------------------+--------------------------------------+--------------+----------------+--------------------+-----------+--------------+------------------------------------+----------------------------------------+----------------------------------+------------------------------------+-----------------+-------------+--

In [30]:
# Inspect the data type recorded in the incidents schema for the property-loss column.
print(incidents.schema["Estimated Property Loss"].dataType)

StringType()


In [21]:
# Peek at a single call row to see the available columns.
calls.show(n=1)

+----------+------+--------------+--------------+----------+----------+--------------------+--------------------+--------------------+----+-------+---------+-----------+----+----------------+--------+-------------+-------+-------------+---------+--------+--------------------------+----------------------+------------------+---------------+--------------------+-------------+-----+
|CallNumber|UnitID|IncidentNumber|      CallType|  CallDate| WatchDate|CallFinalDisposition|       AvailableDtTm|             Address|City|Zipcode|Battalion|StationArea| Box|OriginalPriority|Priority|FinalPriority|ALSUnit|CallTypeGroup|NumAlarms|UnitType|UnitSequenceInCallDispatch|FirePreventionDistrict|SupervisorDistrict|   Neighborhood|            Location|        RowID|Delay|
+----------+------+--------------+--------------+----------+----------+--------------------+--------------------+--------------------+----+-------+---------+-----------+----+----------------+--------+-------------+-------+-------------+

In [34]:
# Inner-join call records to incident reports on the incident number, then list the
# call type alongside the incident's estimated property loss, largest loss first.
#
# "Estimated Property Loss" was read from CSV as a string, so it is cast to double
# to sort numerically rather than lexicographically.
#
# The loss column is referenced through the `incidents` DataFrame rather than `col(...)`:
# `col` is not imported by the visible import cell (only `desc` is), so the previous
# form raised NameError on a fresh kernel. Referencing it via `incidents[...]` also
# makes explicit which side of the join the column comes from.
#
# An incident can match several call rows, so incident numbers may repeat in the output.
(
    calls.join(incidents, calls["IncidentNumber"] == incidents["Incident Number"], "inner")
    .select(
        "IncidentNumber",
        "CallType",
        incidents["Estimated Property Loss"].cast("double").alias("Estimated Property Loss"),
    )
    .sort(desc("Estimated Property Loss"))
    .show()
)

+--------------+--------------+-----------------------+
|IncidentNumber|      CallType|Estimated Property Loss|
+--------------+--------------+-----------------------+
|      16067026|Structure Fire|              2500000.0|
|      16067026|Structure Fire|              2500000.0|
|      16067026|Structure Fire|              2500000.0|
|      16105402|Structure Fire|              1200000.0|
|      16116773|Structure Fire|              1000000.0|
|      16134711|Structure Fire|              1000000.0|
|      16017840|Structure Fire|               500000.0|
|      16044470|Structure Fire|               500000.0|
|      16069575|Structure Fire|               450000.0|
|      16069575|Structure Fire|               450000.0|
|      16069575|Structure Fire|               450000.0|
|      16104735|Structure Fire|               375000.0|
|      16104735|Structure Fire|               375000.0|
|      16104735|Structure Fire|               375000.0|
|      16044882|Structure Fire|               35