# 1. Read the case, department, and source data into their own spark dataframes.

In [1]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import *

# Obtain the notebook's SparkSession through the builder's getOrCreate().
spark = (
    SparkSession.builder
    .getOrCreate()
)

In [2]:
# All three inputs are read with the same reader options: comma-separated,
# first row used as the header, and column types inferred from the data.
read_opts = {"sep": ",", "header": True, "inferSchema": True}

source = spark.read.csv("data/source.csv", **read_opts)
case = spark.read.csv("data/case.csv", **read_opts)
dept = spark.read.csv("data/dept.csv", **read_opts)

# 2. Let's see how writing to the local disk works in spark:

### Write the code necessary to store the source data in both CSV and JSON format. Store these as sources_csv and sources_json.

In [3]:
# Persist the source data in both formats. mode="overwrite" replaces any
# existing output at the target path, so this cell can be re-run safely.
source.write.json("data/source_json", mode="overwrite")
# header=True writes the column names into the CSV output; without it the
# names are dropped and the data cannot be read back with header=True the
# way the original source.csv is read.
source.write.csv("data/source_csv", mode="overwrite", header=True)

### Inspect your folder structure. What do you notice?

# 3. Inspect the data in your dataframes. Are the data types appropriate? Write the code necessary to cast the values to the appropriate types.

In [4]:
from pyspark.sql.types import StructType, StructField, StringType

In [6]:
# Print a label followed by the schema for each dataframe, in the same
# order as before: source, dept, case.
for label, frame in (("SOURCE", source), ("DEPT", dept), ("CASE", case)):
    print(label)
    frame.printSchema()

SOURCE
root
 |-- source_id: string (nullable = true)
 |-- source_username: string (nullable = true)

DEPT
root
 |-- dept_division: string (nullable = true)
 |-- dept_name: string (nullable = true)
 |-- standardized_dept_name: string (nullable = true)
 |-- dept_subject_to_SLA: string (nullable = true)

CASE
root
 |-- case_id: integer (nullable = true)
 |-- case_opened_date: string (nullable = true)
 |-- case_closed_date: string (nullable = true)
 |-- SLA_due_date: string (nullable = true)
 |-- case_late: string (nullable = true)
 |-- num_days_late: double (nullable = true)
 |-- case_closed: string (nullable = true)
 |-- dept_division: string (nullable = true)
 |-- service_request_type: string (nullable = true)
 |-- SLA_days: double (nullable = true)
 |-- case_status: string (nullable = true)
 |-- source_id: string (nullable = true)
 |-- request_address: string (nullable = true)
 |-- council_district: integer (nullable = true)



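Things to fix:

- Date fields within the case dataframe are stored as strings; cast them to timestamps
- Convert case_late from a string to a bool
- Figure out what a "double" dtype is (Spark's DoubleType is a 64-bit floating-point number)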

In [8]:
# Preview the first five rows of the case data to see the raw values
# behind the schema printed above.
case.show(n=5)

+----------+----------------+----------------+------------+---------+-------------------+-----------+----------------+--------------------+-----------+-----------+---------+--------------------+----------------+
|   case_id|case_opened_date|case_closed_date|SLA_due_date|case_late|      num_days_late|case_closed|   dept_division|service_request_type|   SLA_days|case_status|source_id|     request_address|council_district|
+----------+----------------+----------------+------------+---------+-------------------+-----------+----------------+--------------------+-----------+-----------+---------+--------------------+----------------+
|1014127332|     1/1/18 0:42|    1/1/18 12:29|9/26/20 0:42|       NO| -998.5087616000001|        YES|Field Operations|        Stray Animal|      999.0|     Closed| svcCRMLS|2315  EL PASO ST,...|               5|
|1014127333|     1/1/18 0:46|     1/3/18 8:11| 1/5/18 8:30|       NO|-2.0126041669999997|        YES|     Storm Water|Removal Of Obstru...|4.322222222| 