In [2]:
import configparser
from datetime import datetime
import os
import pandas as pd
from pyspark.sql import SparkSession
from pyspark.sql.functions import udf, col, lit, concat
from pyspark.sql.functions import year, month, dayofmonth, hour, weekofyear, date_format

In [3]:
spark = SparkSession \
        .builder \
        .appName("Capstone Cluster") \
        .getOrCreate()

In [4]:
df_ticket = spark.read.format("csv").option("header", "true").load("parking-violations-issued-fiscal-year-2018.csv")

In [5]:
pd.set_option('display.max_columns', 999)

In [6]:
df_ticket.printSchema()

root
 |-- Summons Number: string (nullable = true)
 |-- Plate ID: string (nullable = true)
 |-- Registration State: string (nullable = true)
 |-- Plate Type: string (nullable = true)
 |-- Issue Date: string (nullable = true)
 |-- Violation Code: string (nullable = true)
 |-- Vehicle Body Type: string (nullable = true)
 |-- Vehicle Make: string (nullable = true)
 |-- Issuing Agency: string (nullable = true)
 |-- Street Code1: string (nullable = true)
 |-- Street Code2: string (nullable = true)
 |-- Street Code3: string (nullable = true)
 |-- Vehicle Expiration Date: string (nullable = true)
 |-- Violation Location: string (nullable = true)
 |-- Violation Precinct: string (nullable = true)
 |-- Issuer Precinct: string (nullable = true)
 |-- Issuer Code: string (nullable = true)
 |-- Issuer Command: string (nullable = true)
 |-- Issuer Squad: string (nullable = true)
 |-- Violation Time: string (nullable = true)
 |-- Time First Observed: string (nullable = true)
 |-- Violation County: str

In [7]:
df_ticket.limit(10).toPandas()

Unnamed: 0,Summons Number,Plate ID,Registration State,Plate Type,Issue Date,Violation Code,Vehicle Body Type,Vehicle Make,Issuing Agency,Street Code1,Street Code2,Street Code3,Vehicle Expiration Date,Violation Location,Violation Precinct,Issuer Precinct,Issuer Code,Issuer Command,Issuer Squad,Violation Time,Time First Observed,Violation County,Violation In Front Of Or Opposite,House Number,Street Name,Intersecting Street,Date First Observed,Law Section,Sub Division,Violation Legal Code,Days Parking In Effect,From Hours In Effect,To Hours In Effect,Vehicle Color,Unregistered Vehicle?,Vehicle Year,Meter Number,Feet From Curb,Violation Post Code,Violation Description,No Standing or Stopping Violation,Hydrant Violation,Double Parking Violation
0,1105232165,GLS6001,NY,PAS,2018-07-03T00:00:00.000,14,SDN,HONDA,X,47130,13230,80030,20180702.0,78,78,968,86684,0968,0,0811P,,K,F,2,HANSON PLACE,,0,408,D1,,BBYBBBB,ALL,ALL,BLUE,0,2006,-,0,,,,,
1,1121274900,HXM7361,NY,PAS,2018-06-28T00:00:00.000,46,SDN,NISSA,X,28990,14890,15040,20200203.0,112,112,968,103419,0968,0,1145A,,Q,F,71-30,AUSTIN ST,,0,408,C,,BBBBBBB,ALL,ALL,GRY,0,2017,-,0,,,,,
2,1130964875,GTR7949,NY,PAS,2018-06-08T00:00:00.000,24,SUBN,JEEP,X,64,18510,99,20180930.0,122,122,835,0,0835,0,0355P,,R,,,GREAT KILLS BOAT LAU,,0,408,D5,,BBBBBBB,ALL,ALL,GREEN,0,0,-,0,,,,,
3,1130964887,HH1842,NC,PAS,2018-06-07T00:00:00.000,24,P-U,FORD,X,11310,39800,39735,0.0,122,122,835,0,0835,0,0123P,,R,,,GREAT KILLS PARK BOA,,0,408,D5,,BBBBBBB,ALL,ALL,WHITE,0,0,-,0,,,,,
4,1131599342,HDG7076,NY,PAS,2018-06-29T00:00:00.000,17,SUBN,HYUND,X,47130,13230,80030,20190124.0,78,78,868,2354,0868,0,0514P,,K,F,2,HANSON PLACE,,0,408,C4,,BBBBBBB,ALL,ALL,GREEN,0,2007,-,0,,,,,
5,1131610520,GER9006,NY,PAS,2018-07-02T00:00:00.000,17,SUBN,NISSA,X,64790,18640,18790,20190128.0,103,103,968,86652,0968,0,0827A,0827A,Q,O,94-14,SUTPHIN BLVD,,0,408,C,,BBBBBBB,ALL,ALL,BLK,0,2013,-,0,,,,,
6,1133401569,HLC3177,NY,PAS,2018-07-02T00:00:00.000,21,SDN,HONDA,S,31830,67030,64830,20181219.0,77,77,0,559290,KN08,0,0843A,,K,F,1235,DEAN STREET,,0,408,D1,,YBBYBBB,ALL,ALL,BLACK,0,1999,-,0,,,,,
7,1133401570,HZN6473,NY,PAS,2018-07-02T00:00:00.000,21,SDN,TOYOT,S,68930,23330,54070,20200429.0,77,77,0,559290,KN08,0,0910A,,K,F,1445,PACIFIC STREET,,0,408,D1,,YBBYBBB,ALL,ALL,BLACK,0,2006,-,0,,,,,
8,1133401594,HPC9135,NY,PAS,2019-07-02T00:00:00.000,21,SUBN,HONDA,S,68930,64830,23330,20190710.0,77,77,0,559290,KN08,0,0919A,,K,F,1369,PACIFIC STREET,,0,408,D1,,YBBYBBB,ALL,ALL,GRAY,0,2005,-,0,,,,,
9,1133401636,HZJ8359,NY,PAS,2018-07-02T00:00:00.000,21,SDN,NISSA,S,67030,19230,80430,20180803.0,77,77,0,559290,KN08,0,0818A,,K,F,666,NOSTRAND AVE,,0,408,D1,,YYYYYYB,ALL,ALL,GRAY,0,2018,-,0,,,,,


### Imporing json parking ticket code  data into spark

In [8]:
df_ticket_code = spark.read.json("parking_violation codes.json", multiLine=True)

### printing the Schema

In [9]:
df_ticket_code.printSchema()

root
 |-- data: array (nullable = true)
 |    |-- element: array (containsNull = true)
 |    |    |-- element: string (containsNull = true)
 |-- meta: struct (nullable = true)
 |    |-- view: struct (nullable = true)
 |    |    |-- attribution: string (nullable = true)
 |    |    |-- attributionLink: string (nullable = true)
 |    |    |-- averageRating: long (nullable = true)
 |    |    |-- category: string (nullable = true)
 |    |    |-- columns: array (nullable = true)
 |    |    |    |-- element: struct (containsNull = true)
 |    |    |    |    |-- cachedContents: struct (nullable = true)
 |    |    |    |    |    |-- largest: string (nullable = true)
 |    |    |    |    |    |-- non_null: long (nullable = true)
 |    |    |    |    |    |-- null: long (nullable = true)
 |    |    |    |    |    |-- smallest: string (nullable = true)
 |    |    |    |    |    |-- top: array (nullable = true)
 |    |    |    |    |    |    |-- element: struct (containsNull = true)
 |    |    |   

In [10]:
df_ticket_code.show()

+--------------------+--------------------+
|                data|                meta|
+--------------------+--------------------+
|[[row-vc2y~qug8_q...|[[Department of F...|
+--------------------+--------------------+



In [11]:
code_list = []
definition_list = []
for data in df_ticket_code.toPandas().data[0]:
    code_list.append(data[8])
    definition_list.append(data[9])

In [12]:
df_codes = pd.DataFrame(columns=['Code','Definition'])

In [13]:
df_codes['Code'] = code_list
df_codes['Definition'] = definition_list

In [14]:
df_codes_spark  = spark.createDataFrame(df_codes)

In [15]:
df_codes_spark.select('Code').where(df_codes_spark.Code == '46').toPandas().head()

Unnamed: 0,Code
0,46


### Creating Vehicle Table

In [16]:
df_vehicle_table = df_ticket.select(col('Plate ID').alias('plate_id'), col('Vehicle Make').alias('vehicle_make')\
                                    ,col('Vehicle Body Type').alias('vehicle_body_type'), col('Vehicle Color').alias('vehicle_color')\
                                    ,col('Vehicle Year').alias('vehicle_year'))

In [17]:
df_vehicle_table.printSchema()

root
 |-- plate_id: string (nullable = true)
 |-- vehicle_make: string (nullable = true)
 |-- vehicle_body_type: string (nullable = true)
 |-- vehicle_color: string (nullable = true)
 |-- vehicle_year: string (nullable = true)



### Create Registration table

In [18]:
df_registration_table = df_ticket.select(col('Plate ID').alias('plate_id'), col('Plate Type').alias('plate_type')\
                                         ,col('Registration State').alias('registration_state'), col('Vehicle Expiration Date').alias('registration_expired_date')\
                                        ,col('Unregistered Vehicle?').alias('unregistered_vehicle'))

In [19]:
df_registration_table.count()

4001111

In [20]:
df_registration_table.printSchema()

root
 |-- plate_id: string (nullable = true)
 |-- plate_type: string (nullable = true)
 |-- registration_state: string (nullable = true)
 |-- registration_expired_date: string (nullable = true)
 |-- unregistered_vehicle: string (nullable = true)



### Create Violation Location Table

In [21]:
df_violation_location_table = df_ticket.select(col('Street Code1').alias('street_code1'), col('Street Code2').alias('street_code2')\
                                         ,col('Street Code3').alias('street_code3'), col('Violation Precinct').alias('violation_precinct')\
                                        ,col('Violation County').alias('violation_county'),col('House Number').alias('house_number')
                                        ,col('Street Name').alias('street_name'),col('Days Parking In Effect    ').alias('parking_enforced_days')
                                        ,col('From Hours In Effect').alias('from_enforced_hours'),col('To Hours In Effect').alias('to_enforced_hours')).dropDuplicates()

In [22]:
df_violation_location_table = df_violation_location_table.withColumn("street_code_key", \
                                    concat(col("street_code1"), lit('-'),col("street_code2"), lit('-'),col("street_code3"))) 

In [23]:
df_violation_location_table.limit(5).toPandas()

Unnamed: 0,street_code1,street_code2,street_code3,violation_precinct,violation_county,house_number,street_name,parking_enforced_days,from_enforced_hours,to_enforced_hours,street_code_key
0,31830,20530,64330,84,K,235,DEAN ST,BBBBBBB,ALL,ALL,31830-20530-64330
1,67830,18030,49730,60,K,5134,OCEANVIEW AVE,BBBBBBB,0100A,0600A,67830-18030-49730
2,18890,10890,39010,110,Q,50-53,96TH STREET,BBBBBBB,ALL,ALL,18890-10890-39010
3,72520,26130,26160,40,BX,551,WALES AVE,BBBBBBB,ALL,ALL,72520-26130-26160
4,55960,12550,12550,45,BX,1,ORCHARD BEACH RD,BBBBBBB,ALL,ALL,55960-12550-12550


### Join Codes table with main tables code details columns

In [24]:
df_codes_joined_spark = df_codes_spark.join(df_ticket.select(col('Law Section').alias('law_section'), col('Sub Division').alias('sub_division'), col('Violation Code').alias('violation_code'))).where(df_ticket['Violation Code'] == df_codes_spark['Code']).dropDuplicates()

AnalysisException: 'Resolved attribute(s) Violation Code#15 missing from Definition#155,violation_code#233,Code#154,sub_division#232,law_section#231 in operator !Filter (Violation Code#15 = Code#154).;;\n!Filter (Violation Code#15 = Code#154)\n+- Join Inner\n   :- LogicalRDD [Code#154, Definition#155], false\n   +- Project [Law Section#37 AS law_section#231, Sub Division#38 AS sub_division#232, Violation Code#15 AS violation_code#233]\n      +- Relation[Summons Number#10,Plate ID#11,Registration State#12,Plate Type#13,Issue Date#14,Violation Code#15,Vehicle Body Type#16,Vehicle Make#17,Issuing Agency#18,Street Code1#19,Street Code2#20,Street Code3#21,Vehicle Expiration Date#22,Violation Location#23,Violation Precinct#24,Issuer Precinct#25,Issuer Code#26,Issuer Command#27,Issuer Squad#28,Violation Time#29,Time First Observed#30,Violation County#31,Violation In Front Of Or Opposite#32,House Number#33,... 19 more fields] csv\n'

In [None]:
df_codes_joined_spark.limit(5).toPandas()

### Create the main ticket violation table

In [25]:
df_ticket.limit(10).toPandas()

Unnamed: 0,Summons Number,Plate ID,Registration State,Plate Type,Issue Date,Violation Code,Vehicle Body Type,Vehicle Make,Issuing Agency,Street Code1,Street Code2,Street Code3,Vehicle Expiration Date,Violation Location,Violation Precinct,Issuer Precinct,Issuer Code,Issuer Command,Issuer Squad,Violation Time,Time First Observed,Violation County,Violation In Front Of Or Opposite,House Number,Street Name,Intersecting Street,Date First Observed,Law Section,Sub Division,Violation Legal Code,Days Parking In Effect,From Hours In Effect,To Hours In Effect,Vehicle Color,Unregistered Vehicle?,Vehicle Year,Meter Number,Feet From Curb,Violation Post Code,Violation Description,No Standing or Stopping Violation,Hydrant Violation,Double Parking Violation
0,1105232165,GLS6001,NY,PAS,2018-07-03T00:00:00.000,14,SDN,HONDA,X,47130,13230,80030,20180702.0,78,78,968,86684,0968,0,0811P,,K,F,2,HANSON PLACE,,0,408,D1,,BBYBBBB,ALL,ALL,BLUE,0,2006,-,0,,,,,
1,1121274900,HXM7361,NY,PAS,2018-06-28T00:00:00.000,46,SDN,NISSA,X,28990,14890,15040,20200203.0,112,112,968,103419,0968,0,1145A,,Q,F,71-30,AUSTIN ST,,0,408,C,,BBBBBBB,ALL,ALL,GRY,0,2017,-,0,,,,,
2,1130964875,GTR7949,NY,PAS,2018-06-08T00:00:00.000,24,SUBN,JEEP,X,64,18510,99,20180930.0,122,122,835,0,0835,0,0355P,,R,,,GREAT KILLS BOAT LAU,,0,408,D5,,BBBBBBB,ALL,ALL,GREEN,0,0,-,0,,,,,
3,1130964887,HH1842,NC,PAS,2018-06-07T00:00:00.000,24,P-U,FORD,X,11310,39800,39735,0.0,122,122,835,0,0835,0,0123P,,R,,,GREAT KILLS PARK BOA,,0,408,D5,,BBBBBBB,ALL,ALL,WHITE,0,0,-,0,,,,,
4,1131599342,HDG7076,NY,PAS,2018-06-29T00:00:00.000,17,SUBN,HYUND,X,47130,13230,80030,20190124.0,78,78,868,2354,0868,0,0514P,,K,F,2,HANSON PLACE,,0,408,C4,,BBBBBBB,ALL,ALL,GREEN,0,2007,-,0,,,,,
5,1131610520,GER9006,NY,PAS,2018-07-02T00:00:00.000,17,SUBN,NISSA,X,64790,18640,18790,20190128.0,103,103,968,86652,0968,0,0827A,0827A,Q,O,94-14,SUTPHIN BLVD,,0,408,C,,BBBBBBB,ALL,ALL,BLK,0,2013,-,0,,,,,
6,1133401569,HLC3177,NY,PAS,2018-07-02T00:00:00.000,21,SDN,HONDA,S,31830,67030,64830,20181219.0,77,77,0,559290,KN08,0,0843A,,K,F,1235,DEAN STREET,,0,408,D1,,YBBYBBB,ALL,ALL,BLACK,0,1999,-,0,,,,,
7,1133401570,HZN6473,NY,PAS,2018-07-02T00:00:00.000,21,SDN,TOYOT,S,68930,23330,54070,20200429.0,77,77,0,559290,KN08,0,0910A,,K,F,1445,PACIFIC STREET,,0,408,D1,,YBBYBBB,ALL,ALL,BLACK,0,2006,-,0,,,,,
8,1133401594,HPC9135,NY,PAS,2019-07-02T00:00:00.000,21,SUBN,HONDA,S,68930,64830,23330,20190710.0,77,77,0,559290,KN08,0,0919A,,K,F,1369,PACIFIC STREET,,0,408,D1,,YBBYBBB,ALL,ALL,GRAY,0,2005,-,0,,,,,
9,1133401636,HZJ8359,NY,PAS,2018-07-02T00:00:00.000,21,SDN,NISSA,S,67030,19230,80430,20180803.0,77,77,0,559290,KN08,0,0818A,,K,F,666,NOSTRAND AVE,,0,408,D1,,YYYYYYB,ALL,ALL,GRAY,0,2018,-,0,,,,,


In [26]:
ticket_fact_df = df_ticket.join(df_violation_location_table).where((df_ticket['Street Code1'] == df_violation_location_table['street_code1']) & (df_ticket['Street Code2'] == df_violation_location_table['street_code2']) & (df_ticket['Street Code3'] == df_violation_location_table['street_code3'])).dropDuplicates()

In [27]:
ticket_fact_df = ticket_fact_df.select(col('Summons Number').alias('summons_number'), col('Plate ID').alias('plate_id'), col('Issue Date').alias('issue_date'), col('Violation Code').alias('violation_code'), col('street_code_key')).limit(100).toPandas()

Unnamed: 0,Summons Number,Plate ID,Issue Date,Violation Code,street_code_key
0,1450865859,JCG3480,2018-11-13T00:00:00.000,21,74230-12130-86030
1,1442593994,VAM9239,2018-08-07T00:00:00.000,21,74230-12130-86030
2,1440991340,HAN8407,2018-07-10T00:00:00.000,21,74230-12130-86030
3,1442635289,HWA8179,2018-08-24T00:00:00.000,21,74230-12130-86030
4,1442635290,XHM1416,2018-08-24T00:00:00.000,21,74230-12130-86030
5,1440991698,HZY7791,2018-07-24T00:00:00.000,21,74230-12130-86030
6,1442689626,HAN8407,2018-08-31T00:00:00.000,21,74230-12130-86030
7,1442597513,T686300C,2018-08-13T00:00:00.000,21,74230-12130-86030
8,1442540345,HAN8407,2018-07-23T00:00:00.000,21,74230-12130-86030
9,1440970853,HAN8407,2018-06-28T00:00:00.000,21,74230-12130-86030
