**You can find the descriptive blog post regarding this kernel [here](https://www.linkedin.com/pulse/la-parking-roulette-data-science-approach-solving-sajak-upadhyaya/)**

In [None]:
import numpy as np
import pandas as pd
import requests
import datetime as dt
import matplotlib.pyplot as plt
import seaborn as sns
import folium
from pyproj import Proj,transform
from folium.plugins import FastMarkerCluster
from folium.plugins import MarkerCluster
from sklearn.cluster import DBSCAN

In [None]:
# Load only the first 50,000 rows of the citations CSV (nrows=50000).
dataframe = pd.read_csv('../input/parking-citations.csv',nrows=50000)

In [None]:
# Preview the first ten rows of the loaded data.
dataframe.head(10)

Since our primary focus is the location and type of violation along with the fine amount, we can disregard information such as the marked time, plate expiry date, VIN, make, body style, etc. We keep the fine amount to get a sense of how much money is being collected in fines, which gives a better idea of the financial scale.

In [None]:
# Columns that play no part in the location / violation / fine analysis.
unused_columns = [
    'Ticket number', 'Meter Id', 'Marked Time', 'RP State Plate',
    'Plate Expiry Date', 'Make', 'VIN', 'Body Style', 'Color', 'Route',
    'Agency',
]
dataframe = dataframe.drop(columns=unused_columns)

As we can see, the dates are stored as timestamps and the times in a 24-hour format. So the next thing we do is look at the data type of each column and convert it into a form suitable for our analysis.

In [None]:
# Show the dtype of each column.
dataframe.dtypes

Since the dates in this column follow the ISO 8601 standard and we only need the year, month and day, we parse the column into datetime values so that the time part can be ignored.

In [None]:
# Parse the 'Issue Date' column into pandas datetime values.
# NOTE(review): `dayfirst=True` only matters for ambiguous day/month strings;
# if the column really is ISO 8601 (year first) it has no obvious purpose and
# newer pandas versions may warn about it — confirm and consider dropping it.
dataframe['Issue Date'] = pd.to_datetime(dataframe['Issue Date'], dayfirst=True)

In [None]:
# Order the citations chronologically. sort_values keeps each row's original
# index label, so afterwards the labels generally no longer match the row
# positions.
dataframe = dataframe.sort_values(by='Issue Date')
dataframe.head()

The next step is to convert the issue time, which is stored as a float in 24-hour (HHMM) format, into a proper time value.

In [None]:
# Count the rows whose 'Issue time' is missing.
dataframe['Issue time'].isna().sum()

In [None]:
# Discard citations that have no issue time, then store the remaining
# times as integers instead of floats.
has_issue_time = dataframe['Issue time'].notna()
dataframe = dataframe[has_issue_time]
dataframe['Issue time'] = dataframe['Issue time'].astype(int)

In [None]:
# Turn the integer issue times into four-character "HHMM" strings by
# left-padding with zeros (e.g. 5 -> "0005", 930 -> "0930").
# This is done column-wise on purpose: writing back row by row with a
# positional counter and `.at[i, ...]` would address index *labels*, which no
# longer match row positions once the frame has been sorted and rows have been
# dropped, so values would land on the wrong rows or create new ones.
dataframe['Issue time'] = dataframe['Issue time'].astype(str).str.zfill(4)


In [None]:
# dataframe = dataframe[dataframe['Issue time'].str.contains(':')]

In [None]:
# Interpret the "HHMM" strings as times of day and keep only the
# datetime.time component.
parsed_issue_times = pd.to_datetime(dataframe['Issue time'], format='%H%M')
dataframe['Issue time'] = parsed_issue_times.dt.time

In [None]:
# Preview the first five rows.
dataframe.head()

We now bring a hypothesis into our analysis. We pick specific time windows that are considered to be rush hours and check whether the number of parking citations is in any way related to whether they were issued during rush hour.

In [None]:
from datetime import time

# Boundaries of the two rush-hour windows:
# morning 05:00-10:00 and evening 16:00-21:00.
msrush = time(hour=5)
merush = time(hour=10)
esrush = time(hour=16)
eerush = time(hour=21)
for boundary in (msrush, merush, esrush, eerush):
    print(boundary)

In [None]:
# Flag each citation with "1" when its issue time falls strictly inside the
# morning or the evening rush window, and "0" otherwise.
issue_time = dataframe['Issue time']
in_morning_rush = (issue_time > msrush) & (issue_time < merush)
in_evening_rush = (issue_time > esrush) & (issue_time < eerush)
dataframe['Rush hour'] = np.where(in_morning_rush | in_evening_rush, "1", "0")

In [None]:
# Preview the first five rows.
dataframe.head()

Exploring the data further, we now look at the various violation codes and the number of offences committed under each.

In [None]:
# Tally the citations per violation code and chart every code as a bar.
counts = dataframe['Violation code'].value_counts().to_frame()
counts.plot.bar(figsize=(20, 15), fontsize=12)

In [None]:
print("The top 10 Violations are:")
# `counts` is ordered by frequency, so its first ten rows are the top ten.
top_ten = counts.iloc[:10]
top_ten.plot.bar(figsize=(20, 15))

We now compare the number of rush hour violations to the number of non rush hour violations.

In [None]:
# Count citations per rush-hour flag. value_counts() orders its result by
# frequency, not by flag value, so the counts are put into a fixed
# "0", "1" order (with 0 for a flag that never occurs) before the readable
# labels are attached; otherwise the labels could end up swapped.
rush_counts = dataframe['Rush hour'].value_counts().reindex(['0', '1'], fill_value=0)
RushDF = pd.DataFrame(rush_counts)
RushDF.index = ['No Rush', 'Rush']

In [None]:
# Bar chart of the rush vs. non-rush citation counts.
RushDF.plot.bar(title="Rush Hour Vs No Rush", figsize=(15, 8))

As we can see, some of the fine amounts are NaN, so our next step is to deal with those values. There are multiple ways to do this. One option would be to remove the rows with NaN from our analysis, but that might cause us to miss out on other important information. Another way to handle the data, which we are going to use, is to fill the NaN values with the average fine across the parking citations.

In [None]:
n_rows = dataframe.shape[0]
# Average of the known fines (mean() skips NaN).
meanFine = dataframe['Fine amount'].mean()
print(meanFine)
# Replace every missing fine with that average in one column-wise step.
# Looping over range(n_rows) and assigning through `df[col][i]` would look
# rows up by index label, which does not match positions after sorting and
# dropping rows, and chained assignment may write to a temporary copy.
dataframe['Fine amount'] = dataframe['Fine amount'].fillna(meanFine)

In [None]:
# Drop every row that still has a missing value in any column, then preview
# the result.
dataframe = dataframe.dropna()
dataframe.head()

In [None]:
# Work with whole-number fines.
dataframe['Fine amount'] = dataframe['Fine amount'].round(0).astype(int)
# 'Rush hour' holds "1" for citations issued inside a rush window and "0"
# otherwise, so the rush total must select "1" and the non-rush total "0".
rushfine = dataframe[dataframe['Rush hour'] == "1"]['Fine amount'].values.sum()
nrushfine = dataframe[dataframe['Rush hour'] == "0"]['Fine amount'].values.sum()
print("Total Amount spent in Fines:", rushfine + nrushfine )
# One row per group, in the same order as the labels assigned below.
fineDF = pd.DataFrame([rushfine,nrushfine])
fineDF.index=['Rush Hour','Non Rush Hour']
fineDF.columns=['Amoount']
fineDF.plot(kind = 'barh', figsize=(15,10), title="Rush Hour V/S Non Rush Hour Fine", fontsize=14)

In [None]:
# First and last calendar day (at midnight) of each year from 2013 to 2017.
start2013, end2013 = dt.datetime(2013, 1, 1), dt.datetime(2013, 12, 31)
start2014, end2014 = dt.datetime(2014, 1, 1), dt.datetime(2014, 12, 31)
start2015, end2015 = dt.datetime(2015, 1, 1), dt.datetime(2015, 12, 31)
start2016, end2016 = dt.datetime(2016, 1, 1), dt.datetime(2016, 12, 31)
start2017, end2017 = dt.datetime(2017, 1, 1), dt.datetime(2017, 12, 31)

In [None]:
# Show the dtype of each column.
dataframe.dtypes

In [None]:
# Count the citations issued in each calendar year from 2013 to 2017.
# The year is read directly from the parsed 'Issue Date' so that citations
# stamped at midnight on 1 January are counted too; a strict
# `> datetime(year, 1, 1)` comparison would silently drop them.
issue_year = dataframe['Issue Date'].dt.year
data2013 = int((issue_year == 2013).sum())
data2014 = int((issue_year == 2014).sum())
data2015 = int((issue_year == 2015).sum())
data2016 = int((issue_year == 2016).sum())
data2017 = int((issue_year == 2017).sum())

# One row per year, labelled with the year as a string.
yearDF = pd.DataFrame([data2013,data2014,data2015,data2016,data2017])
yearDF.index =(['2013','2014','2015','2016','2017'])
yearDF.columns =(['Violation Counts'])

yearDF.plot(kind= 'bar', figsize=(15,8))

If you look at the dataframe, the Latitude and Longitude are given in US feet coordinates according to the NAD_1983_StatePlane_California_V_FIPS_0405_Feet projection. If we try to convert them, every entry with the value 99999.0 ends up somewhere in the Pacific Ocean. So, to ease the job at hand, we decided to proceed with only those rows whose coordinates are not 99999.

In [None]:
# Keep only the citations whose 'Latitude' is not the 99999.0 placeholder.
has_real_latitude = dataframe['Latitude'].ne(99999.0)
dataframe = dataframe[has_real_latitude]

In [None]:
# Preview the first five rows.
dataframe.head()

In [None]:
# Preview the first five rows (same check as the previous cell).
dataframe.head()


In [None]:
# Base map for the citation markers, centred on [34.0522, -118.2437].
la_center = [34.0522, -118.2437]
LAmap = folium.Map(location=la_center, zoom_start=10)

In [None]:
# PROJ definition of the source coordinate system: a Lambert conformal conic
# projection (`+proj=lcc`) on the NAD83 datum whose unit is scaled by
# `+to_meter=0.3048006096012192`, i.e. coordinates are in feet.
pm = '+proj=lcc +lat_1=34.03333333333333 +lat_2=35.46666666666667 +lat_0=33.5 +lon_0=-118 +x_0=2000000 ' \
     '+y_0=500000.0000000002 +ellps=GRS80 +datum=NAD83 +to_meter=0.3048006096012192 +no_defs'

# Convert the projected coordinates to EPSG:4326 longitude/latitude.
# The raw 'Latitude' column is passed as the x input and the raw 'Longitude'
# column as the y input; the two arrays returned by `transform` are written
# back as ('Longitude', 'Latitude'), overwriting the projected values.
# NOTE(review): `pyproj.transform` and the "+init=epsg:..." syntax are flagged
# as deprecated by newer pyproj releases — confirm against the installed
# version before upgrading.
x_in,y_in = dataframe['Latitude'].values, dataframe['Longitude'].values
dataframe['Longitude'],dataframe['Latitude'] = transform(Proj(pm, preserve_units = True), Proj("+init=epsg:4326"), x_in,y_in)

In [None]:
# Preview the first five rows.
dataframe.head()

In [None]:
# Collect one marker per citation in a cluster layer; each marker's popup
# shows the citation's 'Location' text.
mc = MarkerCluster()
for citation in dataframe.itertuples():
    marker = folium.Marker(
        location=[citation.Latitude, citation.Longitude],
        popup=citation.Location,
    )
    mc.add_child(marker)

In [None]:
# Attach the marker cluster to the map; the bare `LAmap` expression on the
# last line makes the map the cell's displayed output.
LAmap.add_child(mc)
LAmap

In [None]:
# Ten (Location, Latitude, Longitude) combinations with the most citations.
# size() counts the rows of each group; the result has the columns
# 'Location', 'Latitude', 'Longitude' and 'counts'.
# (The dict form `.agg({"counts": len})` on a single column is rejected by
# pandas >= 1.0 with a SpecificationError.)
df_top_frequency = (
    dataframe.groupby(['Location', 'Latitude', 'Longitude'])
    .size()
    .sort_values(ascending=False)
    .head(10)
    .reset_index(name='counts')
)

In [None]:
# Show the (at most ten) most-cited locations.
df_top_frequency.head(10)

The function below leverages the Places API from Foursquare. Unfortunately, I have not been able to make requests through Kaggle, but it works if you download the kernel and run it on your local machine. Feel free to try.

In [None]:
import os

# Foursquare API credentials. Values from the FOURSQUARE_CLIENT_ID /
# FOURSQUARE_CLIENT_SECRET environment variables take precedence; the
# literals are kept only as a fallback so the notebook behaves as before.
# NOTE(review): credentials committed to a public notebook should be treated
# as compromised — rotate them and rely on the environment variables.
CLIENT_ID = os.environ.get('FOURSQUARE_CLIENT_ID', 'CNUE4BESOB1KV2MHPXRIE10RLRKXQCFOHYE2MCTS3MJSDUVI') # your Foursquare ID
CLIENT_SECRET = os.environ.get('FOURSQUARE_CLIENT_SECRET', 'UP3N3DIWT25YMXJ4NLEPMKOZYY4VPRBSLWRCFDAKNSOZVTUT') # your Foursquare Secret
VERSION = '20190131' # Foursquare API version

print('Your credentails:')
print('CLIENT_ID: ' + CLIENT_ID)
# Do not echo the secret into the notebook output; show a mask of equal length.
print('CLIENT_SECRET:' + '*' * len(CLIENT_SECRET))

In [None]:
# Value sent as the `limit` parameter of each Foursquare request.
LIMIT=20
# NOTE(review): getNearbyVenues is called without a `radius` argument, so its
# own default of 500 is what gets used, not this variable.
radius = 500
# NOTE(review): not referenced by any request in the visible notebook —
# presumably a Foursquare category ID intended as a filter; confirm.
categoryId = '4c38df4de52ce0d596b336e1'

In [None]:
def getNearbyVenues(names, latitudes, longitudes, radius=500):
    """Look up venues near each location with the Foursquare explore endpoint.

    One request is sent per (name, latitude, longitude) triple. The client
    credentials, API version and result limit are read from the module-level
    ``CLIENT_ID``, ``CLIENT_SECRET``, ``VERSION`` and ``LIMIT``.

    NOTE(review): no category filter is sent with the request, so venues of
    every category are returned — confirm this is intended.

    Args:
        names: Iterable of labels, one per location; each label is copied
            into the ``Neighborhood`` column of the rows it produced.
        latitudes: Iterable of latitudes, parallel to ``names``.
        longitudes: Iterable of longitudes, parallel to ``names``.
        radius: Value sent as the ``radius`` request parameter (default 500).

    Returns:
        A DataFrame with one row per returned venue and the columns
        ``Neighborhood``, ``Venue``, ``Venue Latitude``, ``Venue Longitude``,
        ``Distance`` and ``Venue Category``. When no venue is returned the
        frame is empty but still has these columns.

    Raises:
        requests.HTTPError: If the API answers with an error status code.
    """
    columns = [
        'Neighborhood',
        'Venue',
        'Venue Latitude',
        'Venue Longitude',
        'Distance',
        'Venue Category',
    ]

    rows = []
    for name, lat, lng in zip(names, latitudes, longitudes):
        # Let requests build and escape the query string instead of
        # formatting the URL by hand.
        response = requests.get(
            'https://api.foursquare.com/v2/venues/explore',
            params={
                'client_id': CLIENT_ID,
                'client_secret': CLIENT_SECRET,
                'v': VERSION,
                'll': '{},{}'.format(lat, lng),
                'radius': radius,
                'limit': LIMIT,
            },
            timeout=30,  # never hang forever on an unresponsive server
        )
        # Fail with a clear HTTP error rather than a KeyError further down.
        response.raise_for_status()

        groups = response.json().get('response', {}).get('groups') or []
        items = groups[0].get('items', []) if groups else []

        for item in items:
            venue = item['venue']
            # A venue may come back without any category.
            categories = venue.get('categories') or []
            rows.append((
                name,
                venue['name'],
                venue['location']['lat'],
                venue['location']['lng'],
                venue['location']['distance'],
                categories[0]['name'] if categories else None,
            ))

    # Passing the column names to the constructor keeps the schema intact
    # even when `rows` is empty.
    nearby_venues = pd.DataFrame(rows, columns=columns)

    return(nearby_venues)

In [None]:
# Query the venues around each of the most-cited locations.
countparkings = getNearbyVenues(
    df_top_frequency['Location'],
    df_top_frequency['Latitude'],
    df_top_frequency['Longitude'],
)

Here we calculate the number of parking lots in each of the neighborhoods with the highest number of violations.

In [None]:
# Number of venue rows returned for each 'Neighborhood', largest first,
# limited to ten neighborhoods. The result has the columns 'Neighborhood'
# and 'counts_parking'.
# (The dict form `.agg({"counts_parking": len})` on a single column is
# rejected by pandas >= 1.0 with a SpecificationError.)
df_top_freq = (
    countparkings.groupby(['Neighborhood'])
    .size()
    .sort_values(ascending=False)
    .head(10)
    .reset_index(name='counts_parking')
)

In [None]:
# Rename 'Neighborhood' to 'Location' so the column name matches
# df_top_frequency; index=str also converts the index labels to strings.
column_renames = {'Neighborhood': 'Location'}
df_top_freq = df_top_freq.rename(index=str, columns=column_renames)

In [None]:
# Join the venue counts with the citation counts. No key is given, so pandas
# performs its default inner join on the columns the two frames share.
finalDF = pd.merge(df_top_freq, df_top_frequency)


In [None]:
# Show the first ten rows of the merged table.
finalDF.head(10)

In [None]:
# Scatter plot of citation counts against the number of nearby venues.
plt.scatter(finalDF['counts_parking'],finalDF['counts'])
plt.xlabel("Number of Parking")
plt.ylabel("Number of offenses")
# plt.plot() with no arguments draws nothing; show() is what renders the figure.
plt.show()

In [None]:
# Citations per counted venue for each location.
finalDF['ratio'] = finalDF['counts'].div(finalDF['counts_parking'])

In [None]:
# Show up to the first 100 rows of the table, now including 'ratio'.
finalDF.head(100)

Now that we have our final table with the ratio of parking tickets to the number of parking spaces, we can pick the worst location and conclude that a new parking lot is needed in that neighborhood.

In [None]:
# Print the row with the highest citations-per-venue ratio.
worst_row_label = finalDF['ratio'].idxmax()
print(finalDF.loc[worst_row_label])