In [1]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import *

spark = SparkSession.builder.getOrCreate()

Answer the questions below by using a combination of the techniques discussed in the lesson that you think is appropriate.

In [2]:
df = spark.read.csv("case.csv", header=True, inferSchema=True)
df.show(2, vertical=True)

-RECORD 0------------------------------------
 case_id              | 1014127332           
 case_opened_date     | 1/1/18 0:42          
 case_closed_date     | 1/1/18 12:29         
 SLA_due_date         | 9/26/20 0:42         
 case_late            | NO                   
 num_days_late        | -998.5087616000001   
 case_closed          | YES                  
 dept_division        | Field Operations     
 service_request_type | Stray Animal         
 SLA_days             | 999.0                
 case_status          | Closed               
 source_id            | svcCRMLS             
 request_address      | 2315  EL PASO ST,... 
 council_district     | 5                    
-RECORD 1------------------------------------
 case_id              | 1014127333           
 case_opened_date     | 1/1/18 0:46          
 case_closed_date     | 1/3/18 8:11          
 SLA_due_date         | 1/5/18 8:30          
 case_late            | NO                   
 num_days_late        | -2.0126041

In [11]:
source = spark.read.csv("source.csv", header=True, inferSchema=True)
source.show(4)

+---------+----------------+
|source_id| source_username|
+---------+----------------+
|   100137|Merlene Blodgett|
|   103582|     Carmen Cura|
|   106463| Richard Sanchez|
|   119403|  Betty De Hoyos|
+---------+----------------+
only showing top 4 rows



In [3]:
dept = spark.read.csv("dept.csv", header=True, inferSchema=True)
dept.show(2, vertical=True)

-RECORD 0--------------------------------------
 dept_division          | 311 Call Center      
 dept_name              | Customer Service     
 standardized_dept_name | Customer Service     
 dept_subject_to_SLA    | YES                  
-RECORD 1--------------------------------------
 dept_division          | Brush                
 dept_name              | Solid Waste Manag... 
 standardized_dept_name | Solid Waste          
 dept_subject_to_SLA    | YES                  
only showing top 2 rows



In [4]:
df = df.withColumnRenamed('SLA_due_date', 'case_due_date')

In [5]:
df = (
    df.withColumn('case_late', col('case_late') == 'YES').\
    withColumn('case_closed', col('case_closed') == 'YES')
)

In [6]:
df = (
    df.withColumn('case_opened_date', to_timestamp(('case_opened_date'), 'M/d/yy H:mm')).\
withColumn('case_closed_date', to_timestamp(('case_closed_date'), 'M/d/yy H:mm')).\
withColumn('case_due_date', to_timestamp(('case_due_date'), 'M/d/yy H:mm'))
)

In [7]:
(
    df.withColumn('case_age', datediff(lit('latest_date'), 'case_opened_date'))
    .withColumn('days_to_closed', datediff('case_closed_date', 'case_opened_date'))
    .withColumn('case_lifetime', when(col('case_closed'), col('days_to_closed')).otherwise(col('case_age')))
    .filter(~ col('case_closed'))
    .select('case_opened_date', 'case_closed_date', 'case_due_date', 'case_lifetime')
    .sort(col('case_age').desc()).show(2)
)

+-------------------+----------------+-------------------+-------------+
|   case_opened_date|case_closed_date|      case_due_date|case_lifetime|
+-------------------+----------------+-------------------+-------------+
|2017-08-11 09:24:00|            null|2017-12-18 09:24:00|         null|
|2017-08-11 10:01:00|            null|2017-12-18 10:01:00|         null|
+-------------------+----------------+-------------------+-------------+
only showing top 2 rows



In [9]:
df = df.join(dept, "dept_division", "left").drop(dept.dept_division)\
.drop(dept.dept_name)\
.drop(df.dept_division).withColumnRenamed("standardized_dept_name", "department")

In [12]:
df = df.join(source, "source_id", "left")

#### How many different cases are there, by department?

In [13]:
df.show(1, vertical = True)

-RECORD 0------------------------------------
 source_id            | svcCRMLS             
 case_id              | 1014127332           
 case_opened_date     | 2018-01-01 00:42:00  
 case_closed_date     | 2018-01-01 12:29:00  
 case_due_date        | 2020-09-26 00:42:00  
 case_late            | false                
 num_days_late        | -998.5087616000001   
 case_closed          | true                 
 service_request_type | Stray Animal         
 SLA_days             | 999.0                
 case_status          | Closed               
 request_address      | 2315  EL PASO ST,... 
 council_district     | 5                    
 department           | Animal Care Services 
 dept_subject_to_SLA  | YES                  
 source_username      | svcCRMLS             
only showing top 1 row



In [17]:
df.groupBy('department').agg(count('case_id')).show( vertical = True)

-RECORD 0------------------------------
 department     | Solid Waste          
 count(case_id) | 286287               
-RECORD 1------------------------------
 department     | Animal Care Services 
 count(case_id) | 119362               
-RECORD 2------------------------------
 department     | Trans & Cap Impro... 
 count(case_id) | 97841                
-RECORD 3------------------------------
 department     | Parks & Recreation   
 count(case_id) | 19964                
-RECORD 4------------------------------
 department     | Customer Service     
 count(case_id) | 2889                 
-RECORD 5------------------------------
 department     | Metro Health         
 count(case_id) | 5313                 
-RECORD 6------------------------------
 department     | City Council         
 count(case_id) | 34                   
-RECORD 7------------------------------
 department     | DSD/Code Enforcement 
 count(case_id) | 323579               



#### Does the percentage of cases that are late vary by department?

In [23]:
df.filter(df.num_days_late>1).groupBy('department').agg()

<pyspark.sql.group.GroupedData at 0x11fcdaa90>

In [33]:
(df
 .withColumn('num_days_late', (when(col('num_days_late') > 0, 1).otherwise(0)))\
 .groupBy('department').agg(mean('num_days_late')*100)).show(vertical = True)

-RECORD 0------------------------------------------
 department                 | Solid Waste          
 (avg(num_days_late) * 100) | 11.781533915266849   
-RECORD 1------------------------------------------
 department                 | Animal Care Services 
 (avg(num_days_late) * 100) | 19.898292588931152   
-RECORD 2------------------------------------------
 department                 | Trans & Cap Impro... 
 (avg(num_days_late) * 100) | 5.651005202318047    
-RECORD 3------------------------------------------
 department                 | Parks & Recreation   
 (avg(num_days_late) * 100) | 19.08435183329994    
-RECORD 4------------------------------------------
 department                 | Customer Service     
 (avg(num_days_late) * 100) | 70.43959847698166    
-RECORD 5------------------------------------------
 department                 | Metro Health         
 (avg(num_days_late) * 100) | 16.073781291172594   
-RECORD 6------------------------------------------
 department 

#### On average, how late are the late cases by department?

In [37]:
(df.filter(df.num_days_late >0)\
 .groupBy('department').agg(mean('num_days_late'))).show(vertical = True)

-RECORD 0----------------------------------
 department         | Solid Waste          
 avg(num_days_late) | 7.147172789557422    
-RECORD 1----------------------------------
 department         | Animal Care Services 
 avg(num_days_late) | 23.44672963473822    
-RECORD 2----------------------------------
 department         | Trans & Cap Impro... 
 avg(num_days_late) | 10.66295045507867    
-RECORD 3----------------------------------
 department         | Parks & Recreation   
 avg(num_days_late) | 22.427807192724128   
-RECORD 4----------------------------------
 department         | Customer Service     
 avg(num_days_late) | 88.18248182589824    
-RECORD 5----------------------------------
 department         | Metro Health         
 avg(num_days_late) | 6.494699602827868    
-RECORD 6----------------------------------
 department         | DSD/Code Enforcement 
 avg(num_days_late) | 49.50633998635033    



#### What is the service type that is the most late? Just for Parks & Rec?

In [51]:
(df.filter(df.num_days_late >0)\
 .groupBy('service_request_type').agg(mean('num_days_late'))).sort(col('avg(num_days_late)').desc()).show(2,vertical = True)

-RECORD 0------------------------------------
 service_request_type | Zoning: Recycle Yard 
 avg(num_days_late)   | 210.89201994318182   
-RECORD 1------------------------------------
 service_request_type | Zoning: Junk Yards   
 avg(num_days_late)   | 200.20517608494276   
only showing top 2 rows



In [58]:
(df.filter(df.num_days_late >0).filter(df.department == 'Parks & Recreation')\
 .groupBy('service_request_type').agg(mean('num_days_late'))).sort(col('avg(num_days_late)').desc()).show(2,vertical = True, truncate = False)

-RECORD 0----------------------------------------------
 service_request_type | Amenity Park Improvement       
 avg(num_days_late)   | 76.60603677123078              
-RECORD 1----------------------------------------------
 service_request_type | Major Park Improvement Install 
 avg(num_days_late)   | 75.79450367282352              
only showing top 2 rows



#### For the DSD/Code Enforcement department, what are the most common service request types? Look at other departments too.

In [65]:
(df.filter(df.department == 'DSD/Code Enforcement')\
.groupby('service_request_type').agg(count('service_request_type')).sort(col('count(service_request_type)').desc()).show(4,vertical = True, truncate = False))

-RECORD 0-------------------------------------------------------
 service_request_type        | Overgrown Yard/Trash             
 count(service_request_type) | 66403                            
-RECORD 1-------------------------------------------------------
 service_request_type        | Bandit Signs                     
 count(service_request_type) | 32968                            
-RECORD 2-------------------------------------------------------
 service_request_type        | Front Or Side Yard Parking       
 count(service_request_type) | 28920                            
-RECORD 3-------------------------------------------------------
 service_request_type        | Junk Vehicle On Private Property 
 count(service_request_type) | 21649                            
only showing top 4 rows



#### Does whether or not its a weekend matter for when a case is opened/closed?

In [76]:
(df.withColumn("weekday", dayofweek("case_closed_date")\
              .cast("int")).groupBy('weekday').agg(count('weekday'))).show(vertical = True)

-RECORD 0----------------
 weekday        | null   
 count(weekday) | 0      
-RECORD 1----------------
 weekday        | 1      
 count(weekday) | 27276  
-RECORD 2----------------
 weekday        | 6      
 count(weekday) | 131026 
-RECORD 3----------------
 weekday        | 3      
 count(weekday) | 157360 
-RECORD 4----------------
 weekday        | 5      
 count(weekday) | 148120 
-RECORD 5----------------
 weekday        | 4      
 count(weekday) | 173833 
-RECORD 6----------------
 weekday        | 7      
 count(weekday) | 62327  
-RECORD 7----------------
 weekday        | 2      
 count(weekday) | 136994 



In [78]:
(df.withColumn("weekday", dayofweek("case_opened_date")\
              .cast("int")).groupBy('weekday').agg(count('weekday'))).show(vertical = True)

-RECORD 0----------------
 weekday        | null   
 count(weekday) | 0      
-RECORD 1----------------
 weekday        | 1      
 count(weekday) | 29327  
-RECORD 2----------------
 weekday        | 6      
 count(weekday) | 145242 
-RECORD 3----------------
 weekday        | 3      
 count(weekday) | 164608 
-RECORD 4----------------
 weekday        | 5      
 count(weekday) | 155919 
-RECORD 5----------------
 weekday        | 4      
 count(weekday) | 149783 
-RECORD 6----------------
 weekday        | 7      
 count(weekday) | 54593  
-RECORD 7----------------
 weekday        | 2      
 count(weekday) | 155781 



#### On average, how many cases are opened a day for the Customer Service department?

In [84]:
df.select(max(df.case_closed_date)).show()

+---------------------+
|max(case_closed_date)|
+---------------------+
|  2018-08-08 10:38:00|
+---------------------+



In [91]:
days = df.select(datediff((max(df.case_closed_date)), (min(df.case_closed_date)))).first()[0]
days

584

In [94]:
df.filter(df.department == 'Customer Service').count()/days

4.946917808219178

#### Does the number of service requests for the solid waste department vary by day of the week?

In [96]:
(df.filter(df.department == 'Solid Waste').withColumn("weekday", dayofweek("case_opened_date")\
              .cast("int")).groupBy('weekday').agg(count('weekday'))).show(vertical = True)

-RECORD 0---------------
 weekday        | null  
 count(weekday) | 0     
-RECORD 1---------------
 weekday        | 1     
 count(weekday) | 7102  
-RECORD 2---------------
 weekday        | 6     
 count(weekday) | 52121 
-RECORD 3---------------
 weekday        | 3     
 count(weekday) | 59759 
-RECORD 4---------------
 weekday        | 5     
 count(weekday) | 48914 
-RECORD 5---------------
 weekday        | 4     
 count(weekday) | 43150 
-RECORD 6---------------
 weekday        | 7     
 count(weekday) | 15721 
-RECORD 7---------------
 weekday        | 2     
 count(weekday) | 59517 

