In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file found.
for root, _subdirs, files in os.walk('/kaggle/input'):
    for name in files:
        full_path = os.path.join(root, name)
        print(full_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

# NYC Parking Tickets: An Exploratory Analysis

## Objectives of the Case Study

- Primarily, this case study is meant as a deep dive into the usage of Spark. As you saw while working with Spark, its syntax behaves differently from a regular Python syntax. One of the major objectives of this case study is to gain familiarity with how analysis works in PySpark as opposed to base Python.
- Learning the basic idea behind using functions in PySpark can help in using other libraries like SparkR. If you are in a company where R is a primary language, you can easily pick up SparkR syntax and use Spark’s processing power.
- The process of running a model-building command boils down to a few lines of code. While drawing inferences from data, the most time-consuming step is preparing the data up to the point of model building. So, this case study will focus more on exploratory analysis.

## Problem Statement

Big data analytics allows you to analyse data at scale. It has applications in almost every industry in the world. Let’s consider an unconventional application that you wouldn’t ordinarily encounter.

New York City is a thriving metropolis. Just like most other metros its size, one of the biggest problems its citizens face is parking. The classic combination of a huge number of cars and cramped geography leads to a huge number of parking tickets.

In an attempt to scientifically analyse this phenomenon, the NYC Police Department has collected data for parking tickets. Of these, the data files for multiple years are publicly available on Kaggle. We will try and perform some exploratory analysis on a part of this data. Spark will allow us to analyse the full files at high speeds as opposed to taking a series of random samples that will approximate the population. For the scope of this analysis, we will analyse the parking tickets over the year 2017. 

Note: Although the broad goal of any analysis of this type is to have better parking and fewer tickets, we are not looking for recommendations on how to reduce the number of parking tickets—there are no specific points reserved for this.

The purpose of this case study is to conduct an exploratory data analysis that will help you understand the data. Since the size of the dataset is large, your queries will take some time to run, and you will need to identify the correct queries quicker. The questions given below will guide your analysis.

In [None]:
!pip install pyspark

In [None]:
from pyspark.sql import SparkSession

# Create the Spark session used by every later cell (or reuse a running one).
spark = (
    SparkSession.builder
    .appName("Exploratory Analysis")
    .getOrCreate()
)

In [None]:
# Read the fiscal-year-2017 CSV; the first row is the header and Spark infers
# each column's type from the data.
parking = (
    spark.read
    .format("csv")
    .options(header="true", inferSchema="true")
    .load('/kaggle/input/nyc-parking-tickets/Parking_Violations_Issued_-_Fiscal_Year_2017.csv')
)
parking

In [None]:
# Preview the first five rows.
parking.show(n=5)

In [None]:
# Return the first record as a Row object (all columns of one ticket).
parking.head()

In [None]:
# Summary statistics for every column of the raw data.
# This scans the whole file, so it can take a while.
parking.describe().show()

In [None]:
# Print the column names together with the data types Spark inferred for them.
parking.printSchema()

In [None]:
# Number of rows in the raw data (one row per ticket).
parking.count()

In [None]:
# Number of columns in the raw data.
len(parking.columns)

In [None]:
# Remove rows that are exact duplicates across all columns, then recount.
parking = parking.distinct()
parking.count()

In [None]:
# Drop rows that have a null in any column this analysis actually uses.
# A bare dropna() removes a row when *any* column is null, so a file that
# carries sparsely populated or entirely empty columns would lose every row.
# The column names still contain spaces here; they are renamed in a later cell.
parking = parking.dropna(
    how="any",
    subset=[
        "Summons Number",
        "Plate ID",
        "Registration State",
        "Issue Date",
        "Violation Code",
        "Vehicle Body Type",
        "Vehicle Make",
        "Violation Precinct",
        "Issuer Precinct",
        "Violation Time",
    ],
)
parking.count()

In [None]:
# Count the distinct summons numbers to see whether any ticket id repeats.
parking.select('Summons Number').dropDuplicates().count()

In [None]:
# Replace spaces in column names with underscores so the columns can be
# referenced directly in SQL.
parking = parking.toDF(*[name.replace(' ', '_') for name in parking.columns])
parking.show(n=5)

In [None]:
# Register the cleaned frame as a temporary view for the SQL queries below.
parking.createOrReplaceTempView(name="parkingtable")

In [None]:
# Sanity check that the view resolves. No action (show/count) is called on the
# returned DataFrame, so the cell only displays its representation.
spark.sql('Select * from parkingtable')

In [None]:
# Total number of tickets for each year.
# Issue_Date is parsed with the explicit 'MM/dd/yyyy' pattern, the same way the
# 2017 filter and the seasonality query do. year() on an unparsed
# 'MM/dd/yyyy' string would return NULL.
sql_ticket_year = spark.sql("""
    select year(to_date(Issue_Date, 'MM/dd/yyyy')) as year,
           count(Summons_Number) as no_of_tickets
    from parkingtable
    group by year(to_date(Issue_Date, 'MM/dd/yyyy'))
    order by year
""")
sql_ticket_year.show(100)

### Summary
    -- So we have data from 1972 to 2069
    -- The data is centered around 2016-2017
    -- For the scope of this analysis, we will analyse the parking tickets over the year 2017. 

In [None]:
# Number of distinct issue years present in the data.
sql_ticket_year.count()

**There are 55 distinct years in the data. Since this analysis is scoped to 2017, we keep only the records issued in 2017.**

In [None]:
# Keep only the tickets issued in 2017.
# The view is registered first so the filter can be written in SQL; `parking`
# is then rebound to the filtered result.
parking.createOrReplaceTempView(name="tble_view2017")
parking = spark.sql(
    "select * from tble_view2017 "
    "where year(TO_DATE(CAST(UNIX_TIMESTAMP(Issue_Date,'MM/dd/yyyy') AS TIMESTAMP))) = 2017 "
)
parking.count()

In [None]:
# Re-register the view so it points at the 2017-only frame.
parking.createOrReplaceTempView("tble_view2017")

# Ticket counts per (year, month), most frequent first.
# Issue_Date is parsed with the explicit 'MM/dd/yyyy' pattern, as the 2017
# filter and the seasonality query do; year()/month() on an unparsed
# 'MM/dd/yyyy' string would return NULL.
Distribution_on_years = spark.sql("""
    SELECT year(to_date(Issue_Date, 'MM/dd/yyyy'))  AS year,
           month(to_date(Issue_Date, 'MM/dd/yyyy')) AS month,
           count(*)                                 AS Ticket_Frequency
    FROM tble_view2017
    GROUP BY year(to_date(Issue_Date, 'MM/dd/yyyy')),
             month(to_date(Issue_Date, 'MM/dd/yyyy'))
    ORDER BY Ticket_Frequency DESC
""")
Distribution_on_years.show()

**Maximum number of violations are in the month of May. It has been observed that from July to December, there is a significant drop in number of violations.**

In [None]:
# Collect the small per-month aggregate to the driver as a pandas DataFrame for plotting.
Number_of_Violations_by_month = Distribution_on_years.toPandas()

In [None]:
import pandas as pd
import matplotlib.pyplot as plt

# Bar chart of ticket counts per month.
# An explicit Axes is created and handed to pandas; calling plt.clf() first
# only opens an empty extra figure, because DataFrame.plot draws on its own.
# Rows are sorted by month so the x-axis reads in calendar order rather than
# in descending-frequency order.
fig, ax = plt.subplots()
Number_of_Violations_by_month.sort_values('month').plot(
    x='month', y='Ticket_Frequency', kind='bar', ax=ax
)
ax.set_title("Violations on the basis of month in 2017")
ax.set_xlabel('month')
ax.set_ylabel('Ticket_Frequency')
plt.show()

In [None]:
# Count the rows where any of the ten analysis columns is NULL.
Checking_null_values = spark.sql(
    "Select count(*) as Number_of_Null_Values from tble_view2017 where "
    "Summons_Number is NULL or Plate_ID is NULL or Registration_State is NULL or "
    "Issue_Date is NULL or Violation_Code is NULL or Vehicle_Body_Type is NULL or "
    "Vehicle_Make is NULL or Violation_Precinct is NULL or Issuer_Precinct is NUll or "
    "Violation_Time is NULL "
)
Checking_null_values.show()

**There is no field with null value.**

In [None]:
# Plates that received more than one ticket, most-ticketed first.
Plate_Id_Check = spark.sql(
    "Select Plate_ID, count(*) as Ticket_Frequency from tble_view2017 "
    "group by Plate_ID having count(*)>1 order by Ticket_Frequency desc"
)
Plate_Id_Check.show()

**There is one value, 'BLANKPLATE', which we cannot trace to a vehicle. Therefore, we remove these records.**

In [None]:
# Discard the tickets whose plate is recorded as the placeholder 'BLANKPLATE'.
parking = parking.filter(parking.Plate_ID != 'BLANKPLATE')
parking.count()

In [None]:
# Re-register the view so SQL queries see the frame without 'BLANKPLATE' rows.
parking.createOrReplaceTempView(name="tble_view2017")

In [None]:
# Plates with 500 or more tickets, most-ticketed first.
Plate_Id_Above_500 = spark.sql(
    "Select Plate_ID, count(*) as Ticket_Frequency from tble_view2017 "
    "group by Plate_ID having count(*)>=500 order by Ticket_Frequency desc"
)
Plate_Id_Above_500.show()

In [None]:
# Bar chart of the plates with 500 or more violations.
# An explicit Axes is passed to pandas; plt.clf() would only open an empty
# extra figure, because DataFrame.plot draws on its own.
Number_of_Violations_By_PlateID = Plate_Id_Above_500.toPandas()
fig, ax = plt.subplots()
Number_of_Violations_By_PlateID.plot(x='Plate_ID', y='Ticket_Frequency', kind='bar', ax=ax)
ax.set_title("Number of Violations above 500 ")
ax.set_xlabel('Plate_ID')
ax.set_ylabel('Ticket_Frequency')
plt.show()

**There are 7 Plate ID with more than 500 violations.**

## Questions to Be Answered in the Analysis

The following analysis should be performed on PySpark mounted on your CoreStack cluster, using the PySpark library. Remember that you need to summarise the analysis with your insights along with the code.

### Examine the data

#### Q1. Find the total number of tickets for the year

In [None]:
# Total tickets in 2017 next to the number of distinct summons numbers.
q1 = spark.sql(
    "Select count(*),count(distinct(Summons_Number)) "
    "from tble_view2017"
)
q1.show()

**The number of distinct Summons_Number values is equal to the total number of rows in the data frame, which means there are no duplicate Summons_Number values.**

#### Q2. Find out the number of unique states from where the cars that got parking tickets came from. 

In [None]:
# Ticket count per registration state; the row count of the result is the
# number of distinct states.
q2 = spark.sql(
    "SELECT distinct(Registration_State), Count(*) as Number_of_Records from tble_view2017 "
    "group by Registration_State order by Number_of_Records desc"
)
q2.count()

In [None]:
# Print every state row (the limit of 500 is far above the number of states).
q2.show(n=500)

##### There are 65 distinct values of Registration_State.
- There is a numeric entry '99' in the column which should be corrected. We need to replace it with the state having maximum entries.
- As maximum number of tickets are issued in NY, We will replace 99 by NY.

In [None]:
from pyspark.sql.functions import when, lit

# Replace the invalid registration state '99' with 'NY' and leave every other
# value untouched.
parking = parking.withColumn(
    'Registration_State',
    when(parking.Registration_State == "99", lit('NY'))
    .otherwise(parking.Registration_State),
)

In [None]:
# Re-register the view so SQL queries see the corrected Registration_State.
parking.createOrReplaceTempView(name="tble_view2017")

In [None]:
# Recount the distinct registration states after the '99' -> 'NY' replacement.
q2 = spark.sql(
    "SELECT Registration_State, Count(*) as Ticket_Frequency from tble_view2017 "
    "group by Registration_State order by Ticket_Frequency desc"
)
q2.count()

##### There are 64 distinct values of Registration_State after replacing '99' with 'NY'

In [None]:
# Register the temporary view again so it reflects the current `parking` frame.
parking.createOrReplaceTempView(name="tble_view2017")

In [None]:
# Bar chart of the ten registration states with the most tickets.
# The figure size is set on the Axes that pandas draws on. A separate
# plt.figure(figsize=(100, 200)) call would allocate a huge figure that is
# never drawn on, because DataFrame.plot opens its own.
q2_for_plot = q2.toPandas()
fig, ax = plt.subplots(figsize=(10, 6))
q2_for_plot.head(10).plot(x='Registration_State', y='Ticket_Frequency', kind='bar', ax=ax)
ax.set_title("Violations on the basis of Registration State (top 10)")
ax.set_xlabel('Registration State')
ax.set_ylabel('Ticket Frequency')

plt.show()

## Aggregation tasks

### Q1. How often does each violation code occur? Display the frequency of the top five violation codes.

In [None]:
# Number of distinct violation codes in the 2017 data.
from pyspark.sql.functions import count, desc, countDistinct

parking.agg(countDistinct("Violation_Code")).show()

In [None]:
# Ticket count for each violation code, most frequent first (DataFrame API).
Violation_Code_count = (
    parking
    .groupBy("Violation_Code")
    .agg(count("Violation_Code").alias("no_of_tickets"))
    .orderBy(desc("no_of_tickets"))
)

Violation_Code_count.show(n=100)

In [None]:
# Five most frequent violation codes (SQL version of the ranking).
q3 = spark.sql(
    "SELECT Violation_Code, Count(*) as Ticket_Frequency from tble_view2017 "
    "group by Violation_code order by Ticket_Frequency desc"
)
q3.show(n=5)

In [None]:
# Bar chart of the five most frequent violation codes.
# An explicit Axes is passed to pandas; plt.clf() would only open an empty
# extra figure, because DataFrame.plot draws on its own.
q3_for_plot = q3.toPandas()
fig, ax = plt.subplots()
q3_for_plot.head(5).plot(x='Violation_Code', y='Ticket_Frequency', kind='bar', ax=ax)
ax.set_title("Top Violation Code")
ax.set_xlabel('Violation Code')
ax.set_ylabel('Ticket Frequency')
plt.show()

### Q2. How often does each 'vehicle body type' get a parking ticket? How about the 'vehicle make'? 

In [None]:
# Ticket count per vehicle body type, most frequent first; show the top five.
vehicleBodyType = spark.sql(
    "SELECT Vehicle_Body_Type, count(*) as Ticket_Frequency from tble_view2017 "
    "group by Vehicle_Body_Type order by Ticket_Frequency desc"
)
vehicleBodyType.show(n=5)

In [None]:
# Bar chart of the five vehicle body types with the most tickets.
# An explicit Axes is passed to pandas; plt.clf() would only open an empty
# extra figure, because DataFrame.plot draws on its own.
vehicleBodyType_for_plot = vehicleBodyType.toPandas()
fig, ax = plt.subplots()
vehicleBodyType_for_plot.head(5).plot(x='Vehicle_Body_Type', y='Ticket_Frequency', kind='bar', ax=ax)
ax.set_title("Violations on the basis of Vehicle_Body_Type")
ax.set_xlabel('Vehicle Body Type')
ax.set_ylabel('Ticket Frequency')
plt.show()

### How about the 'vehicle make'?

In [None]:
# Ticket count per vehicle make, most frequent first; show the top five.
vehicleMake = spark.sql(
    "SELECT Vehicle_Make, count(*) as Ticket_Frequency from tble_view2017 "
    "group by Vehicle_Make order by Ticket_Frequency desc"
)
vehicleMake.show(n=5)

In [None]:
# Bar chart of the five vehicle makes with the most tickets.
# An explicit Axes is passed to pandas; plt.clf() would only open an empty
# extra figure, because DataFrame.plot draws on its own.
vehicleMake_for_plot = vehicleMake.toPandas()
fig, ax = plt.subplots()
vehicleMake_for_plot.head(5).plot(x='Vehicle_Make', y='Ticket_Frequency', kind='bar', ax=ax)
ax.set_title("Violations on the basis of Vehicle_Make")
ax.set_xlabel('Vehicle Make')
ax.set_ylabel('Ticket Frequency')
plt.show()

### Q3 :  A precinct is a police station that has a certain zone of the city under its command.Find the (5 highest) frequency of tickets for each of the following:
#### 1.'Violation Precinct' (this is the precinct of the zone where the violation occurred). Using this, can you make any insights for parking violations in any specific areas of the city?

In [None]:
# Ticket count per violation precinct, most frequent first. Six rows are shown
# so that five remain after discounting the erroneous precinct 0.
Violation_Precinct = spark.sql(
    "SELECT Violation_Precinct, count(*) as Ticket_Frequency from tble_view2017 "
    "group by Violation_Precinct order by Ticket_Frequency desc"
)
Violation_Precinct.show(n=6)

##### Here, you would have noticed that the dataframe has the'Violating Precinct' as '0'. These are erroneous entries. Hence, you need to provide the records for five correct precincts. (Hint: Print the top six entries after sorting.)

In [None]:
# Bar chart of the five violation precincts with the most tickets, excluding
# the erroneous precinct 0.
# An explicit Axes is passed to pandas; plt.clf() would only open an empty
# extra figure, because DataFrame.plot draws on its own.
Violation_Precinct_for_plot = Violation_Precinct.toPandas()
valid_violation_precincts = Violation_Precinct_for_plot[
    Violation_Precinct_for_plot.Violation_Precinct != 0
]
fig, ax = plt.subplots()
valid_violation_precincts.head(5).plot(x='Violation_Precinct', y='Ticket_Frequency', kind='bar', ax=ax)
ax.set_title("Violations on the basis of Violation_Precinct")
# The x-axis shows violation precincts (the label previously read 'Vehicle Precinct').
ax.set_xlabel('Violation Precinct')
ax.set_ylabel('Ticket Frequency')
plt.show()

#### 2 'Issuer Precinct' (this is the precinct that issued the ticket)

In [None]:
# Ticket count per issuer precinct, most frequent first.
# Six rows are shown, as in the Violation Precinct cell, so that five valid
# precincts remain after discounting the erroneous precinct 0.
Issue_precinct = spark.sql("SELECT Issuer_Precinct, count(*) as Ticket_Frequency from tble_view2017 group by Issuer_Precinct order by Ticket_Frequency desc")
Issue_precinct.show(6)

##### Here, you would have noticed that the dataframe has the 'Issuing Precinct' as '0'. These are erroneous entries. Hence, you need to provide the records for five correct precincts. (Hint: Print the top six entries after sorting.)

In [None]:
# Bar chart of the five issuer precincts with the most tickets, excluding the
# erroneous precinct 0.
# An explicit Axes is passed to pandas; plt.clf() would only open an empty
# extra figure, because DataFrame.plot draws on its own.
Issue_Precinct_for_plot = Issue_precinct.toPandas()
valid_issuer_precincts = Issue_Precinct_for_plot[
    Issue_Precinct_for_plot.Issuer_Precinct != 0
]
fig, ax = plt.subplots()
valid_issuer_precincts.head(5).plot(x='Issuer_Precinct', y='Ticket_Frequency', kind='bar', ax=ax)
ax.set_title("Violations on the basis of Issuer Precinct TOP 5")
ax.set_xlabel('Issuer Precinct')
ax.set_ylabel('Ticket Frequency')
plt.show()

- So the top 5 precincts where the most violations occur are 19, 14, 1, 18 and 114.
- Similarly, the top 5 Issuer Precincts are 19, 14, 1, 18 and 114.

### Q4. Find the violation code frequency across three precincts which have issued the most number of tickets.Do these precinct zones have an exceptionally high frequency of certain violation codes? Are these codes common across precincts? Hint: In the SQL view, use the 'where' attribute to filter among three precincts

#### 4.1 Finding violation code frequency

In [None]:
# Ticket count for every (issuer precinct, violation code) pair, largest first.
violation_code_freq = spark.sql(
    "select Issuer_Precinct,Violation_Code, count(*) as Frequency from tble_view2017 "
    "group by Issuer_Precinct, Violation_Code order by Frequency desc"
)
violation_code_freq.show(n=7)

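**We are not considering 0. The query above ranks (Issuer_Precinct, Violation_Code) pairs rather than precincts, and precincts 18, 19 and 14 are picked from the top of that list. Note that this is not the same as ranking precincts by their total number of tickets: the Issuer Precinct totals computed in Q3 put 19, 14 and 1 at the top.**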

In [None]:
# Look at the selected issuer precincts one at a time, starting with precinct 18.
# NOTE(review): precinct 18 is not among the three largest issuer precincts by
# total tickets according to the Issuer Precinct ranking earlier in this
# notebook - confirm whether precinct 1 was intended here.
violation_code_freq_18 = spark.sql(
    "select Violation_Code, count(*) as Frequency from tble_view2017 "
    "where Issuer_Precinct=18 group by Violation_Code order by Frequency desc"
)
violation_code_freq_18.show(n=10)

In [None]:
# Ten most frequent violation codes issued by precinct 19.
violation_code_freq_19 = spark.sql(
    "select Violation_Code, count(*) as Frequency from tble_view2017 "
    "where Issuer_Precinct=19 group by Violation_Code order by Frequency desc"
)
violation_code_freq_19.show(n=10)

In [None]:
# Ten most frequent violation codes issued by precinct 14.
violation_code_freq_14 = spark.sql(
    "select Violation_Code, count(*) as Frequency from tble_view2017 "
    "where Issuer_Precinct=14 group by Violation_Code order by Frequency desc"
)
violation_code_freq_14.show(n=10)

#### 4.2 Common codes across precincts

In [None]:
# Violation code frequencies pooled over the three selected issuer precincts.
common_codes = spark.sql(
    "select Violation_Code, count(*) as Frequency from tble_view2017 "
    "where Issuer_Precinct in (18,19,14) group by Violation_Code order by Frequency desc"
)
common_codes.show(n=5)

**Summary:**
    
- Precinct 18 and Precinct 14 have more or less similar top violation codes.
    
- But Precinct 19 has very different top violation codes.

### Q5.Find out the properties of parking violations across different times of the day:

- Find a way to deal with missing values, if any.
(Hint: Check for the null values using 'isNull' under the SQL. Also, to remove the null values, check the 'dropna' command in the API documentation.)

- The Violation Time field is specified in a strange format. Find a way to make this a time attribute that you can use to divide into groups.

- Divide 24 hours into six equal discrete bins of time. Choose the intervals as you see fit. For each of these groups, find the three most commonly occurring violations.
(Hint: Use the CASE-WHEN in SQL view to segregate into bins. To find the most commonly occurring violations, you can use an approach similar to the one mentioned in the hint for question 4.)

- Now, try another direction. For the three most commonly occurring violation codes, find the most common time of the day (in terms of the bins from the previous part).

In [None]:
# Count the tickets whose Violation_Time is missing.
null_vltime_2017 = spark.sql(
    "SELECT count(*) as No_of_Count_Values from tble_view2017 "
    "WHERE Violation_Time is NULL"
)
null_vltime_2017.show()

In [None]:
# Same check through the DataFrame API: list any rows with a null Violation_Time.
from pyspark.sql.functions import col

parking.filter(col("Violation_Time").isNull()).show()

In [None]:
# Preview the raw Violation_Time values to see the format they are stored in.
parking.select(parking.Violation_Time).show()

In [None]:
# Divide the 24 hours into six equal four-hour bins.
# The query reads the first two characters of Violation_Time as the hour and
# the last character as the AM/PM marker ('A' or 'P').
# Bin 1: 12 AM-4 AM, 2: 4 AM-8 AM, 3: 8 AM-12 PM,
# bin 4: 12 PM-4 PM, 5: 4 PM-8 PM, 6: 8 PM-12 AM.
# The format checks in the WHERE clause are combined with AND. With OR, every
# non-null value passed the filter and malformed times ended up in a NULL bin.
bins = spark.sql("""
    SELECT Summons_Number,
           Violation_Code,
           Violation_Time,
           Issuer_Precinct,
           CASE
               WHEN substring(Violation_Time, 1, 2) IN ('00', '01', '02', '03', '12')
                    AND upper(substring(Violation_Time, -1)) = 'A' THEN 1
               WHEN substring(Violation_Time, 1, 2) IN ('04', '05', '06', '07')
                    AND upper(substring(Violation_Time, -1)) = 'A' THEN 2
               WHEN substring(Violation_Time, 1, 2) IN ('08', '09', '10', '11')
                    AND upper(substring(Violation_Time, -1)) = 'A' THEN 3
               WHEN substring(Violation_Time, 1, 2) IN ('12', '00', '01', '02', '03')
                    AND upper(substring(Violation_Time, -1)) = 'P' THEN 4
               WHEN substring(Violation_Time, 1, 2) IN ('04', '05', '06', '07')
                    AND upper(substring(Violation_Time, -1)) = 'P' THEN 5
               WHEN substring(Violation_Time, 1, 2) IN ('08', '09', '10', '11')
                    AND upper(substring(Violation_Time, -1)) = 'P' THEN 6
               ELSE NULL
           END AS Violation_Time_bin
    FROM tble_view2017
    WHERE Violation_Time IS NOT NULL
      AND length(Violation_Time) = 5
      AND upper(substring(Violation_Time, -1)) IN ('A', 'P')
      AND substring(Violation_Time, 1, 2) IN
          ('00', '01', '02', '03', '04', '05', '06', '07', '08', '09', '10', '11', '12')
""")
bins.show()

#### Bins Details

    Bin       Time Interval
    1         12:00 AM to 4:00 AM
    2         4:00 AM to 8:00 AM
    3         8:00 AM to 12:00 PM
    4         12:00 PM to 4:00 PM
    5         4:00 PM to 8:00 PM
    6         8:00 PM to 12:00 AM

In [None]:
# Expose the binned tickets to SQL as `bins_tbl`.
bins.createOrReplaceTempView(name="bins_tbl")

In [None]:
# Ticket count for every (violation code, time bin) combination.
violation_code_time_count = spark.sql(
    "SELECT Violation_Code,Violation_Time_bin, count(*) count from bins_tbl "
    "group by Violation_Code,Violation_Time_bin"
)
violation_code_time_count.show()

In [None]:
# Three most common violation codes in time bin 1.
bin1 = spark.sql(
    "select Violation_Code,count(*) Vio_cnt from bins_tbl "
    "where Violation_Time_bin == 1 group by Violation_Code order by Vio_cnt desc"
)
bin1.show(n=3)

In [None]:
# Three most common violation codes in time bin 2.
bin2 = spark.sql(
    "select Violation_Code,count(*) Vio_cnt from bins_tbl "
    "where Violation_Time_bin == 2 group by Violation_Code order by Vio_cnt desc"
)
bin2.show(n=3)

In [None]:
# Three most common violation codes in time bin 3.
bin3 = spark.sql(
    "select Violation_Code,count(*) Vio_cnt from bins_tbl "
    "where Violation_Time_bin == 3 group by Violation_Code order by Vio_cnt desc"
)
bin3.show(n=3)

In [None]:
# Three most common violation codes in time bin 4.
bin4 = spark.sql(
    "select Violation_Code,count(*) Vio_cnt from bins_tbl "
    "where Violation_Time_bin == 4 group by Violation_Code order by Vio_cnt desc"
)
bin4.show(n=3)

In [None]:
# Three most common violation codes in time bin 5.
bin5 = spark.sql(
    "select Violation_Code,count(*) Vio_cnt from bins_tbl "
    "where Violation_Time_bin == 5 group by Violation_Code order by Vio_cnt desc"
)
bin5.show(n=3)

In [None]:
# Three most common violation codes in time bin 6.
bin6 = spark.sql(
    "select Violation_Code,count(*) Vio_cnt from bins_tbl "
    "where Violation_Time_bin == 6 group by Violation_Code order by Vio_cnt desc"
)
bin6.show(n=3)

**Now, try another direction. For the three most commonly occurring violation codes, find the most common time of the day (in terms of the bins from the previous part).**

In [None]:
# Ticket count per time bin, restricted to violation codes 21, 36 and 38 and
# pooled across those three codes; show the three busiest bins.
time_bin = spark.sql(
    "select Violation_Time_bin, count(*) Vio_count from bins_tbl "
    "where Violation_Code in (21, 36, 38) group by Violation_Time_bin order by Vio_count desc"
)
time_bin.show(n=3)

**Bins 3, 4, 5 are having most violations**

The obvious reason could be that significantly more vehicles are on the road during the daytime, and hence there are more violations.

### Q6.Let’s try and find some seasonality in this data:

#### a)First, divide the year into some number of seasons,and find frequencies of tickets for each season.

In [None]:
# Label every ticket with a season derived from the month of Issue_Date:
# Mar-May spring, Jun-Aug summer, Sep-Nov autumn, Dec-Feb winter.
tickets_seasonality = spark.sql(
    "select Violation_Code , Issuer_Precinct, case "
    "when MONTH(TO_DATE(Issue_Date, 'MM/dd/yyyy')) between 03 and 05 then 'spring' "
    "when MONTH(TO_DATE(Issue_Date, 'MM/dd/yyyy')) between 06 and 08 then 'summer' "
    "when MONTH(TO_DATE(Issue_Date, 'MM/dd/yyyy')) between 09 and 11 then 'autumn' "
    "when MONTH(TO_DATE(Issue_Date, 'MM/dd/yyyy')) in (1,2,12) then 'winter' "
    "else 'unknown' end  as season from tble_view2017"
)
tickets_seasonality.show()

#### Season Binning Details

    Season    Month interval
    
    spring    March, April, May
    summer    June, July, August
    autumn    September, October, November
    winter    December, January, February

In [None]:
# Expose the season-labelled tickets to SQL as `tickets_seasonality_tbl`.
tickets_seasonality.createOrReplaceTempView(name="tickets_seasonality_tbl")

In [None]:
# Number of tickets per season, busiest season first
# (group by / order by use column positions 1 and 2).
tickets_seasonality_freq = spark.sql(
    "select season, count(*) as no_of_tickets from tickets_seasonality_tbl "
    "group by 1 order by 2 desc"
)
tickets_seasonality_freq.show()

In [None]:
# Spring: three most common violation codes among tickets issued by
# precincts 19, 14 and 1.
violation_spring = spark.sql(
    "select Violation_Code, count(*) as Frequency from tickets_seasonality_tbl "
    "where Issuer_Precinct in (19, 14, 1) and season = 'spring' "
    "group by Violation_Code order by Frequency desc"
)
violation_spring.show(n=3)

In [None]:
# Winter: three most common violation codes among tickets issued by
# precincts 19, 14 and 1.
violation_winter = spark.sql(
    "select Violation_Code, count(*) as Frequency from tickets_seasonality_tbl "
    "where Issuer_Precinct in (19, 14, 1) and season = 'winter' "
    "group by Violation_Code order by Frequency desc"
)
violation_winter.show(n=3)

In [None]:
# Summer: three most common violation codes among tickets issued by
# precincts 19, 14 and 1.
violation_summer = spark.sql(
    "select Violation_Code, count(*) as Frequency from tickets_seasonality_tbl "
    "where Issuer_Precinct in (19, 14, 1) and season = 'summer' "
    "group by Violation_Code order by Frequency desc"
)
violation_summer.show(n=3)

In [None]:
# Autumn: three most common violation codes among tickets issued by
# precincts 19, 14 and 1.
violation_autumn = spark.sql(
    "select Violation_Code, count(*) as Frequency from tickets_seasonality_tbl "
    "where Issuer_Precinct in (19, 14, 1) and season = 'autumn' "
    "group by Violation_Code order by Frequency desc"
)
violation_autumn.show(n=3)

### Q7. The fines collected from all the instances of parking violation constitute a source of revenue for the NYC Police Department. Let’s take an example of estimating this for the three most commonly occurring codes:

#### a). Find total occurrences of the three most common violation codes

In [None]:
# Ticket count per violation code, most frequent first; show the top three.
common_Violation = spark.sql(
    "select Violation_Code, count(*) as Frequency from tble_view2017 "
    "group by Violation_Code order by Frequency desc"
)
common_Violation.show(n=3)

#### b). Using this information, find the total amount collected for the three violation codes with maximum tickets. State the code which has the highest total collection.

In [None]:
from pyspark.sql.functions import when, sum as spark_sum

# Estimated collection for the three most frequent violation codes.
# Fine per ticket: 55 for code 21, 50 for every other code (the rates this
# notebook already used; presumably average fines - TODO confirm the source).
# The frame is limited to the top three codes first, so the fine column and the
# total cover exactly the codes the question asks about.
top3_violations = common_Violation.limit(3)
common_Violation_fine = top3_violations.withColumn(
    "fine",
    when(top3_violations.Violation_Code == 21, top3_violations.Frequency * 55)
    .otherwise(top3_violations.Frequency * 50),
)
common_Violation_fine.show(3)

# Sum the computed fines rather than hard-coding ticket counts copied from an
# earlier run, so the total stays correct when the upstream cleaning changes.
total_collection = common_Violation_fine.agg(spark_sum("fine")).first()[0]
print('Total collection = ', total_collection)

**Code 21 had the highest collection.**

#### c).What can you intuitively infer from these findings?
- January to June had the majority of violations, while July to December shows a drastic drop.
- The highest number of violations and the highest collection were for Code 21 (No parking where parking is not allowed by sign, street marking or traffic control device).

In [None]:
# Shut down the Spark session now that the analysis is complete.
# Re-running any earlier cell after this requires creating the session again.
spark.stop()