# Cleaning the 2021 NYC Parking Violation Dataset

Disclaimer: I am currently working on optimizing this code and clarifying the steps taken to arrive at a cleaned dataset. 
<br>The section converting Manhattan ticket fine values in particular should be automated.

This is the beginning of my very first independent project after completing IBM's Professional Data Analyst Specialization at the end of March, 2022.

<p>Roughly 15 million violations were issued by NYC in fiscal year 2021 (July 2020 - June 2021). Our final cleaned dataset is composed of about 6.6 million rows.
<br>Unfortunately, many data entries containing null or untranslatable values (e.g. a violation time of '45:21 PM') were removed, primarily due to the limitations of my PC.
<br>After the initial NaN drop, efforts were made to preserve as much data as possible by replacing invalid values with the median or mode, or with manually written spell-check key:value pairs.</p>

The main dataset was obtained from <a href="https://data.cityofnewyork.us/City-Government/Parking-Violations-Issued-Fiscal-Year-2021/kvfd-bves">NYC OpenData</a> using the Socrata OpenData API.

Violation and vehicle codes were converted to full-text descriptions using information 'scraped' from official nyc.gov documentation. Identical values written in numerous different ways were standardized where possible.

Below you may find my cleaning process:

In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
from scipy import stats
from datetime import datetime as dt
from sodapy import Socrata

#### Importing dataset using Socrata API

In [None]:
# Download the fiscal-year-2021 slice of the parking-violation dataset from
# NYC OpenData through the Socrata API, one page of `chunk_size` rows at a time.
#
# Ordered by date ascending, fiscal year 2021 begins on row 254353 and ends on
# row 14955034.
data_url = 'data.cityofnewyork.us'
data_set = 'kvfd-bves'
app_token = 'XXXXXXXXXXXXXXXXXXXXXXXXXXXXX'
client = Socrata(data_url, app_token)
client.timeout = 90
start = 254353
chunk_size = 50000
end = 14955034
results = []

# Columns requested from the API.
selected_columns = ', '.join([
    'summons_number', 'registration_state', 'plate_type', 'issue_date',
    'violation_code', 'vehicle_body_type', 'vehicle_make',
    'violation_precinct', 'issuer_code', 'violation_time',
    'violation_county', 'house_number', 'street_name', 'vehicle_color',
    'vehicle_year',
])

while start <= end:
    # issue_date alone is not unique, so summons_number is added as a
    # tie-breaker: offset paging is only repeatable when the sort order is
    # fully determined, otherwise pages can repeat or skip rows.
    page = client.get(data_set,
                      select=selected_columns,
                      order='issue_date, summons_number',
                      offset=start,
                      limit=chunk_size)
    if not page:
        # The dataset ran out before `end`; further requests would be empty.
        break
    results.extend(page)
    start += chunk_size

# Release the HTTP session once every page has been fetched.
client.close()
df = pd.DataFrame.from_records(results)

#### Cleaning/Pruning

In [None]:
# Parse issue_date so that rows can be filtered chronologically.
df['issue_date'] = pd.to_datetime(df['issue_date'])

# Keep only rows dated on or before 2021-12-31.
df = df[df['issue_date'] <= '2021-12-31']

# Drop every row that contains a NaN to keep the data small enough to process.
# The explicit copy makes df2 an independent frame, so the assignments below
# are guaranteed to modify df2 itself.
df2 = df.dropna().copy()

# Replace invalid model years (0, or 2023 and later) with the median, 2015.
# A single .loc assignment is used instead of chained indexing
# (df2.vehicle_year.loc[...] = ...), which may write to a temporary object.
df2['vehicle_year'] = df2['vehicle_year'].astype('int64')
invalid_year = (df2['vehicle_year'] == 0) | (df2['vehicle_year'] >= 2023)
df2.loc[invalid_year, 'vehicle_year'] = 2015

In [None]:
# Display summary statistics for df2, leaving out datetime64[ns] columns.
df2.describe(exclude=['datetime64[ns]'])

In [None]:
# Parse violation_precinct as a number.
df2['violation_precinct'] = pd.to_numeric(df2['violation_precinct'])

# Keep only precincts in the range 1-123 (bounds included); no other
# precinct numbers exist in NYC.
valid_precinct = df2['violation_precinct'].between(1, 123, inclusive='both')
df2 = df2.loc[valid_precinct]

In [None]:
# Plate-type and state/territory code tables, transcribed from the PDF at
# http://www.nyc.gov/html/dof/html/pdf/faq/stars_codes.pdf
plates = 'C:\\Users\\sarzy\\Documents\\Jupyter-lab\\2022 Parking Violations\\platetypes.xlsx'
states = 'C:\\Users\\sarzy\\Documents\\Jupyter-lab\\2022 Parking Violations\\territorycodes.xlsx'

# Both workbooks use their first column as the index (the code).
plate_df, state_df = (
    pd.read_excel(workbook, index_col=0) for workbook in (plates, states)
)

In [None]:
# Turn each lookup table into a {code: description} mapping.
dict1, dict2 = plate_df.to_dict(), state_df.to_dict()
plate_dict, state_dict = dict1['plate_types'], dict2['territory']

In [None]:
# Translate state and plate-type codes into their descriptions in one pass;
# values without a dictionary entry are left unchanged.
df3 = df2.replace({
    'registration_state': state_dict,
    'plate_type': plate_dict,
})

In [None]:
# Violation codes, descriptions and fines from NYC Open Data.
vios = 'C:\\Users\\sarzy\\Documents\\Jupyter-lab\\2022 Parking Violations\\ParkingViolationCodes_January2020.xlsx'
vio_df = pd.read_excel(io=vios)

In [None]:
# Build two lookups: violation code -> description, and description -> fine.
viocodes = vio_df.set_index('violation_code')[['violation_description']].to_dict()
viofine = vio_df.set_index('violation_description')[['fine_amount']].to_dict()
viocodes_dict, viofine_dict = viocodes['violation_description'], viofine['fine_amount']

In [None]:
# Parse the violation code as a number, then swap it for its description.
df3['violation_code'] = pd.to_numeric(df3['violation_code']).replace(viocodes_dict)

In [None]:
# Keep the description in a new 'violation' column, then turn the original
# column into the matching fine amount and rename it accordingly.
df3['violation'] = df3['violation_code']
df3['violation_code'] = df3['violation_code'].replace(viofine_dict)
df3 = df3.rename(columns={'violation_code': 'violation_fine'})

### Cleaning the Violation Time

In [None]:
import re


def _expand_meridiem(raw_time):
    """Turn a trailing-letter time such as '0000P' into '0000 PM'.

    Values with neither an 'A' nor a 'P' after the first character are
    returned unchanged; they are dealt with in a later step.
    """
    if re.search('.+P', raw_time):
        return raw_time.replace('P', ' PM')
    if re.search('.+A', raw_time):
        return raw_time.replace('A', ' AM')
    return raw_time


times = df3.violation_time.tolist()
time_formatted = [_expand_meridiem(raw_time) for raw_time in times]

In [None]:
# Collect the times that carry neither an AM nor a PM marker.
culprits = [
    suspect
    for suspect in time_formatted
    if not (re.search('.+A', suspect) or re.search('.+P', suspect))
]

In [None]:
# Give every time without a meridiem one, based on its hour: hours up to
# '11' become AM, everything else PM.  One pass with a set lookup replaces
# the repeated list.index() scans, which were quadratic in the list length.
culprit_values = set(culprits)
for position, value in enumerate(time_formatted):
    if value in culprit_values:
        meridiem = 'AM' if value[0:2] <= '11' else 'PM'
        time_formatted[position] = value + ' ' + meridiem

# Hand-corrected malformed entries.  Every occurrence is fixed (list.index
# only reached the first one), and an entry that does not appear in the data
# is skipped instead of raising ValueError.
manual_fixes = {'110  AM': '1100 AM', '040/ PM': '0400 PM'}
time_formatted[:] = [manual_fixes.get(value, value) for value in time_formatted]

In [None]:
# Sanity checks: every formatted time should be exactly 7 characters long,
# and there should be one formatted time per DataFrame row.
wrong_length = [entry for entry in time_formatted if len(entry) != 7]
print(wrong_length)
rows_match = len(df3.violation_time) == len(time_formatted)
print(rows_match)

In [None]:
# List the times whose minute field (characters 2-3) is '60' or greater.
wrong_minutes = [entry for entry in time_formatted if entry[2:4] >= '60']
print(wrong_minutes)

##### Standardizing times to hours in 12H format, isolating values with H > 24, adding colon for style

In [None]:
# Convert every time to a 12-hour 'HH:00 AM/PM' string (minutes are dropped).
# time_list receives exactly one entry per input time so that it stays aligned
# with the DataFrame rows; times_over_24 collects the unusable ones.
time_dict = {'13':'01:', '14':'02:', '15':'03:', '16':'04:', '17':'05:', '18':'06:','19':'07:','20':'08:','21':'09:','22':'10:','23':'11:'}
time_list = []
times_over_24 = []

for time in time_formatted:
    hour = time[0:2]
    suffix = time[4:]
    if '01' <= hour <= '12':
        # Hour is already in 12H format (01 - 12).
        converted = hour + ':00' + suffix
    elif hour == '00' or hour == '24':
        # Midnight hours (00/24) become 12 AM, whether tagged AM or PM.
        if re.search('.+AM', time):
            converted = '12:00' + suffix
        else:
            converted = '12:00 ' + 'AM'
    elif hour in time_dict:
        # 13 - 23: direct lookup of the 12H hour; the colon is part of the
        # dictionary value.  Our data does not contain AM times with hour > 12.
        converted = time_dict[hour] + '00' + suffix
    else:
        # Hour outside 00 - 24, or a malformed hour with no conversion.
        # Previously a malformed hour between '13' and '23' (e.g. '1A')
        # appended nothing, which left time_list shorter than the data.
        converted = hour + ':00' + suffix
        times_over_24.append(converted)
    time_list.append(converted)

In [None]:
# Save the positions of every unusable time so the rows can be dropped.
# list.index() only ever returns the first match, so a bad value that occurs
# several times produced the same position repeatedly and left the other
# rows in place; scanning once with a set finds every occurrence.
bad_times = set(times_over_24)
bad_time_index = [
    position for position, value in enumerate(time_list) if value in bad_times
]

In [None]:
# Store the standardized times, then drop the rows at the flagged positions.
df3['violation_time'] = time_list
rows_to_drop = df3.index[bad_time_index]
df4 = df3.drop(rows_to_drop)

In [None]:
# Display summary statistics for df4, leaving out datetime64[ns], int64 and
# float64 columns.
df4.describe(exclude=['datetime64[ns]', 'int64', 'float64'])

### Replace body type NaN with mode per vehicle make

In [None]:
# Upper-case the vehicle make and body type so that equal values compare equal.
for column in ('vehicle_make', 'vehicle_body_type'):
    df4[column] = df4[column].str.upper()

In [None]:
# All rows with a missing body type have the 'Passenger' plate type (the mode
# value), so only passenger-plate rows are used to learn the typical body type.
is_passenger = df4['plate_type'] == 'Passenger'
df_cars = df4.loc[is_passenger, ['vehicle_make', 'vehicle_body_type']]

# Count each (make, body type) pair, most frequent pairs first.
df_car_grp = (
    df_cars
    .groupby(['vehicle_make', 'vehicle_body_type'], as_index=False)
    .size()
    .sort_values(by=['size'], ascending=False)
)

In [None]:
# Keep the first (most frequent) body type for each make, then map the first
# 20 makes to that body type.
df_car_grp = df_car_grp.drop_duplicates(subset='vehicle_make')
top_makes = df_car_grp.head(20)
car_dict = top_makes.set_index('vehicle_make')['vehicle_body_type'].to_dict()

In [None]:
# Work on an independent copy of the rows that still contain a NaN, so the
# assignments below do not act on a slice of df4.
narows = df4[df4.isna().any(axis=1)].copy()

# Fill in only the body types that are actually missing, using the mode body
# type of the row's vehicle make.  Rows whose NaN sits in another column keep
# their existing body type, and makes without an entry in car_dict stay NaN
# instead of receiving the make name as their body type.
missing_body = narows['vehicle_body_type'].isna()
narows.loc[missing_body, 'vehicle_body_type'] = (
    narows.loc[missing_body, 'vehicle_make'].map(car_dict)
)

In [None]:
# Write the repaired rows back into the main frame: the rows of df4 whose
# index labels appear in narows are overwritten with the narows values.
df4.loc[narows.index, :] = narows[:]

### Misc Data Standardization

In [None]:
# Remove rows whose issuer_code is zero.  The column is never converted to a
# number earlier in the notebook, so comparing it with the integer 0 would
# match nothing if the codes are strings; parsing it first handles both '0'
# and 0.  Codes that cannot be parsed become NaN and are kept.
issuer_numeric = pd.to_numeric(df4['issuer_code'], errors='coerce')
df4 = df4[issuer_numeric != 0]

In [None]:
def _upper_if_str(value):
    """Return the upper-cased value for plain strings, anything else as-is."""
    if type(value) is str:
        return value.upper()
    return value


# Upper-case every string cell in the frame.
df4 = df4.applymap(_upper_if_str)

In [None]:
# Store the precinct as a 64-bit integer.
df4 = df4.astype({'violation_precinct': 'int64'})

In [None]:
# Add a 'MON YYYY' label (upper-case, e.g. 'JUL 2020') for each issue date.
import datetime
issue_dates = pd.DatetimeIndex(df4['issue_date'])
month_labels = issue_dates.strftime('%b %Y')
df4['month_year'] = month_labels.str.upper()

### Standardizing Location Info

In [None]:
# Map each county code or abbreviation to its borough name.
# Based on the addresses, 'F' stands for Flushing, Queens.
county_dict = {
    'NY': 'MANHATTAN',
    'K': 'BROOKLYN',
    'Q': 'QUEENS',
    'BX': 'BRONX',
    'R': 'STATEN ISLAND',
    'KINGS': 'BROOKLYN',
    'QNS': 'QUEENS',
    'F': 'QUEENS',
    'RICH': 'STATEN ISLAND',
}
df4['violation_county'] = df4['violation_county'].replace(county_dict)

In [None]:
# Join house number, street name and borough into one space-separated address.
# Consider using splchk_addr to spellcheck addresses. https://rdrr.io/github/gmculp/rNYCclean/man/splchk_addr.html
address_parts = df4[['house_number', 'street_name', 'violation_county']]
df4['address'] = address_parts.apply(lambda parts: ' '.join(parts), axis=1)

### Standardizing Vehicle Color

In [None]:
# Frequency of each raw color value, most common first, to see which
# spellings need standardizing.
# https://data.ny.gov/api/assets/83055271-29A6-4ED4-9374-E159F30DB5AE contains NYC color codes
color_counts = (
    df4
    .groupby('vehicle_color', as_index=False)
    .size()
    .sort_values(by=['size'], ascending=False)
)

In [None]:
# Color code dict derived from https://data.ny.gov/api/assets/83055271-29A6-4ED4-9374-E159F30DB5AE and manual interpretation of frequent misspellings 
# Maps each raw color code or misspelling to one standardized color name;
# values without an entry are left unchanged by the replace() call below.
# NOTE: several keys are written more than once in this literal ("GRY", "GY",
# "DKGY", "GRG"). Python keeps the last value given for a repeated key, so
# "DKGY" and "GRG" resolve to "GRAY"; their earlier "GREEN" entries have no effect.
# NOTE(review): "GR" maps to "GRAY" here while "GN"/"GRN" map to "GREEN" -
# confirm against the referenced color-code list.
color_dict = {"BK":"BLACK","BL":"BLUE", "BR":"BROWN","GL":"YELLOW","GY":"GRAY","MR":"RED","OR":"ORANGE","PK":"PINK","PR":"PURPLE","RD":"RED","TN":"BROWN","WH":"WHITE","YW":"YELLOW","NOCL":"NO COLOR",
             "GRY":"GRAY","GRN":"GREEN","BURG":"RED","SIL":"GRAY","WHT":"WHITE","BLK":"BLACK", "GOLD":"YELLOW","GLD":"YELLOW","SILVER":"GRAY","GREY":"GRAY","BLU":"BLUE","GRY":"GRAY", "GR":"GRAY",
             "SILVE":"GRAY","MAROON":"RED","PURPL":"PURPLE","YELLO":"YELLOW","WHI":"WHITE","WT":"WHITE","ORANG":"ORANGE","DKG":"GRAY","GY":"GRAY","GY/":"GRAY","TAN":"BROWN","W":"WHITE","BRN":"BROWN",
              "BRO":"BROWN","SILV":"GRAY","SLV":"GRAY","WH/":"WHITE","SL":"GRAY","NO":"NO COLOR","OTHER":"NO COLOR","DKB":"BLUE","LTB":"BLUE","NOC":"NO COLOR","WHB":"WHITE","B":"BLACK","LTG":"GRAY",
              "LT/":"GRAY","DK/":"BLACK","GYB":"GRAY","RD/":"RED","BK/":"BLACK","RDW":"RED","WHG":"WHITE","GN":"GREEN","BLG":"BLACK","GRW":"GRAY","BL/":"BLUE","G":"GRAY","BLW":"BLACK","LTT":"BROWN",
              "BN":"BROWN","BKG":"BLACK","LAVEN":"PURPLE","RDG":"RED","MAROO":"RED","WHTE":"WHITE","LTGY":"GRAY","OTH":"NO COLOR","YELL":"YELLOW","LT/GRY":"GRAY","YEL":"YELLOW","BW":"BROWN","BRW":"BROWN",
             "BEIGE":"BROWN","BKW":"BLACK","GD":"YELLOW","DKR":"RED","LT/GY":"GRAY",'GR/':"GRAY","BLCK":"BLACK","BLB":"BLUE","SLVR":"GRAY","MAR":"RED","SV":"GRAY","BRWN":"BROWN","DK/GY":"GRAY","R":"RED",
             "BLA":"BLACK","BLAK":"BLACK","WHIT":"WHITE","SILVR":"GRAY","DKP":"PURPLE","GRB":"GRAY","DKGY":"GREEN","GRG":"GREEN","DKGY":"GRAY","GRG":"GRAY","BUR":"RED","GYG":"GRAY","DKM":"RED","BURGU":"RED",
             "BROW":"BROWN","RDT":"RED","WHO":"WHITE","WTE":"WHITE","SIV":"GRAY","BLE":"BLUE","BLAC":"BLACK","PURP":"PURPLE","BG":"BROWN","ORG":"ORANGE","GRE":"GREEN","TNG":"BROWN","DKBL":"BLUE","PUR":"PURPLE",
             "S":"GRAY","BLN":"BLUE","GRA":"GRAY","Y":"YELLOW","GRT":"GRAY","BIEGE":"BROWN","WHE":"WHITE","BLT":"BLUE","BKT":"BLACK","GREN":"GREEN","DK/RD":"RED","ORA":"ORANGE","TN/":"BROWN","ORAN":"ORANGE",
             "YE":"YELLOW","LTBL":"BLUE","BGE":"BROWN","BRON":"BROWN","YLW":"YELLOW","TNR":"BROWN","LTGR":"GRAY","LTP":"PURPLE","BWN":"BROWN","DKRD":"RED","DK/BL":"BLUE","BL/GY":"BLUE","LT/BL":"BLUE",
              "YL":"YELLOW", "MULTI":"NO COLOR","RUST":"BROWN","PRB":"PURPLE","BRT":"BROWN","B L":"BLACK","GYT":"GRAY","LT/GR":"GRAY","MARON":"RED","GLB":"YELLOW","DK/GR":"GRAY","LT/TN":"BROWN","BU":"BLUE",
             "BURGA":"RED","BKL":"BLACK","SLVE":"GRAY","NO CL":"NO COLOR","BLWH":"BLACK","MRG":"RED","DKGR":"GRAY","TEAL":"GREEN","BY":"BLACK","GEY":"GRAY","BLRD":"BLUE","SLR":"GRAY","ORN":"ORANGE","GT":"GRAY",
             "O":"ORANGE","GAY":"GRAY","NLK":"BLACK","GOL":"YELLOW","GARY":"GRAY","NAVY":"BLUE","WJ":"WHITE","RE":"RED","BKJ":"BLACK","BEI":"BROWN","MA":"RED","WHITW":"WHITE","BLC":"BLACK","SLVER":"GRAY",
             "SLIVE":"GRAY","BURGE":"RED"}
# Apply the mapping to the vehicle_color column only.
df4.replace({'vehicle_color':color_dict}, inplace = True) 

### Standardizing Vehicle Body Type

In [None]:
# Frequency of each raw body-type value, most common first.
# https://data.ny.gov/api/assets/83055271-29A6-4ED4-9374-E159F30DB5AE
body_counts = (
    df4
    .groupby('vehicle_body_type', as_index=False)
    .size()
    .sort_values(by=['size'], ascending=False)
)

In [None]:
# Maps each raw body-type code or abbreviation to a full-text description;
# values without an entry are left unchanged by the replace() call below.
# NOTE: "TR" and "TRC" are each written twice in this literal. Python keeps
# the last value given for a repeated key, so "TR" resolves to "SEMI-TRAILER"
# (not "TRUCK") and "TRC" resolves to "TRUCK CRANE" (not "SEMI-TRAILER").
# NOTE(review): the "DCOM" description is spelled "COMMERICAL" - confirm
# whether downstream consumers rely on that spelling before correcting it.
body_dict = {"FIRE":"FIRE TRUCK","CONV":"CONVERTIBLE","SEDN":"SEDAN","SUBN":"SUV","4DSD":"FOUR-DOOR SEDAN","2DSD":"TWO-DOOR SEDAN","H/WH":"HOUSE ON WHEELS","ATV":"ALL TERRAIN VEHICLE",
             "MCY":"MOTORCYCLE","H/IN":"HEARSE","LOCO":"LOCOMOTIVE","CUST":"CUSTOM","RPLC":"REPLICA","AMBU":"AMBULANCE","P/SH":"POWER SHOVEL","RBM":"ROAD BUILDING MACHINE","R/RD":"ROAD ROLLER",
             "RD/S":"ROAD SWEEPER","S/SP":"SAND SPRAYER","SN/P":"SNOW PLOW","TRAV":"SNOW TRAVELER","MOBL":"SNOWMOBILE","TR/E":"TRACTION ENGINE","T/CR":"TRACTOR CRANE","TR/C":"TRUCK CRANE",
             "SWT":"TRUCK W/SMALL WHEELS","W/DR":"WELL DRILLER","W/SR":"WELL SERVICING RIG","FPM":"FEED PROCESSING MACHINE","MCC":"MOBILE CAR CRUSHER","EMVR":"EARTH MOVER","TRAC":"TRACTOR",
             "N/A":"NOT APPLICABLE","DELV":"DELIVERY TRUCK","DUMP":"DUMP TRUCK","FLAT":"FLAT BED TRUCK","PICK":"PICK-UP TRUCK","STAK":"STAKE TRUCK","TANK":"TANK TRUCK","REFG":"SEMI-TRAILER",
             "TOW":"TOW TRUCK","UTIL":"UTILITY","POLE":"POLE TRAILER","BOAT":"BOAT","H/TR":"HOUSE TRAILER","SEMI":"SEMI-TRAILER","TRLR":"SEMI-TRAILER","LTRL":"LIGHT TRAILER",
             "LSVT":"LOW SPEED VEHICLE - TRUCK","BUS":"BUS(OMNIBUS)","LIM":"LIMOUSINE(OMNIBUS)","HRSE":"HEARSE(AMBULANCE)","TAXI":"TAXI","DCOM":"DISABLED COMMERICAL","CMIX":"CEMENT MIXER",
             "MOPD":"MOPED","MFH":"MANUFACTURED HOME","SNOW":"SNOWMOBILE","LSV":"LOW SPEED VEHICLE","SDN":"SEDAN","P-U":"PICK-UP TRUCK","FOUR":"FOUR-DOOR SEDAN","TRUC":"TRUCK", "TR":"TRUCK",
            "TT":"SEMI-TRAILER","TRAI":"SEMI-TRAILER","MOT":"MOTORCYCLE","MC":"MOTORCYCLE","TRC":"SEMI-TRAILER","TR":"SEMI-TRAILER","REFR":"SEMI-TRAILER","MOPE":"MOPED","TRC":"TRUCK CRANE","RF":"SEMI-TRAILER",
            "TLR":"SEMI-TRAILER","TRL":"SEMI-TRAILER","TK":"TRUCK","TRK":"TRUCK","REFRIGERATOR TRAILER":"SEMI-TRAILER", "TL":"SEMI-TRAILER","MOTO":"MOTORCYCLE","HWH":"HOUSE ON WHEELS",
            "CM":"CEMENT MIXER","VAN TRUCK":"VAN","SUBURBAN":"SUV","RV":"HOUSE ON WHEELS", "HOUSE TRAILER":"HOUSE ON WHEELS","JEEP":"SUV","FREI":"SEMI-TRAILER"}
# Apply the mapping to the vehicle_body_type column only.
df4.replace({'vehicle_body_type':body_dict}, inplace = True) 

### Converting Manhattan violation fines to appropriate values

In [None]:
# Work on a copy and hold the fines as strings, because the Manhattan
# adjustments compare against string values such as '60.0'.
df5 = df4.copy()
df5['violation_fine'] = df5['violation_fine'].astype(str)

In [None]:
# Adjust the fines for violations issued in Manhattan.
# Each entry maps a violation description to (fine as loaded, Manhattan fine).
# A row is updated only when it is in Manhattan, has that violation, and still
# carries the loaded fine.  The new value is assigned as a plain scalar through
# one boolean mask; the previous version assigned a whole replaced DataFrame
# to a single-column selection on each of 27 lines.
manhattan_fines = {
    'NO PARKING-DAY/TIME LIMITS': ('60.0', '65'),
    'NO PARKING-STREET CLEANING': ('45.0', '65'),
    'NO PARKING-TAXI STAND': ('60.0', '65'),
    'NO PARKING-EXC. AUTH. VEHICLE': ('60.0', '65'),
    'OT PARKING-MISSING/BROKEN METR': ('35.0', '65'),
    'EXPIRED METER': ('35.0', '65'),
    'SELLING/OFFERING MCHNDSE-METER': ('35.0', '65'),
    'EXPIRED MUNI METER': ('35.0', '65'),
    'FAIL TO DSPLY MUNI METER RECPT': ('35.0', '65'),
    'OVERTIME PKG-TIME LIMIT POSTED': ('60.0', '65'),
    'EXPIRED MUNI MTR-COMM MTR ZN': ('35.0', '65'),
    'EXPIRED METER-COMM METER ZONE': ('35.0', '65'),
    'PKG IN EXC. OF LIM-COMM MTR ZN': ('35.0', '65'),
    'MARGINAL STREET/WATER FRONT': ('45.0', '65'),
    'ANGLE PARKING': ('45.0', '65'),
    'WRONG WAY': ('45.0', '65'),
    'BEYOND MARKED SPACE': ('45.0', '65'),
    'DETACHED TRAILER': ('45.0', '65'),
    'NON-COMPLIANCE W/ POSTED SIGN': ('60.0', '65'),
    'FAIL TO DISP. MUNI METER RECPT': ('35.0', '65'),
    'PARKED BUS-EXC. DESIG. AREA': ('45.0', '65'),
    'MISSING EQUIPMENT': ('45.0', '60'),
    'PLTFRM LFTS LWRD POS COMM VEH': ('45.0', '65'),
    'VEH-SALE/WSHNG/RPRNG/DRIVEWAY': ('25.0', '40'),
    'VEHICLE FOR SALE(DEALERS ONLY)': ('45.0', '65'),
    'WASH/REPAIR VEHCL-REPAIR ONLY': ('45.0', '65'),
    'VACANT LOT': ('60.0', '65'),
}

in_manhattan = df5['violation_county'] == 'MANHATTAN'
for violation_name, (loaded_fine, manhattan_fine) in manhattan_fines.items():
    rows_to_update = (
        in_manhattan
        & (df5['violation'] == violation_name)
        & (df5['violation_fine'] == loaded_fine)
    )
    df5.loc[rows_to_update, 'violation_fine'] = manhattan_fine

In [None]:
# Convert the fines from strings back to floats.
df5 = df5.astype({'violation_fine': 'float64'})

In [None]:
# Display the first rows of df5 as a visual check of the cleaned columns.
df5.head()

In [None]:
# Remove the rows issued after June 2021 (July through December 2021).
months_after_june_2021 = ['JUL 2021', 'AUG 2021', 'SEP 2021',
                          'OCT 2021', 'NOV 2021', 'DEC 2021']
df5 = df5.loc[~df5['month_year'].isin(months_after_june_2021)]

In [None]:
# Write the cleaned dataset to disk without the index column.
output_csv = 'C:\\Users\\sarzy\\Documents\\Jupyter-lab\\2022 Parking Violations\\2021_parking_violations_cleaned_all_final.csv'
df5.to_csv(output_csv, index=False)

In [None]:
# Print the column names, dtypes and non-null counts of the final frame.
df5.info()