In [1]:
import numpy as np
import pandas as pd

import pydotplus
from sklearn import tree
import collections

from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split # typically done at the start
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import MinMaxScaler
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import GridSearchCV

from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import LabelEncoder


In [2]:
small_df = pd.read_csv('./Parking_Violations_Issued_-_Fiscal_Year_2018.csv', usecols=  ['Summons Number', 'Registration State', 'Plate Type',
       'Issue Date', 'Violation Code', 'Vehicle Body Type', 'Vehicle Make',
       'Issuing Agency', 'Violation Time', 'Violation County', 'Street Name','Law Section',
       'Sub Division', 'Vehicle Year', 'Feet From Curb'], index_col='Summons Number', nrows = 100000)

In [3]:
# violations_df.columns
# violations_df = violations_df.drop(columns=['Unregistered Vehicle?','Meter Number', 'Unregistered Vehicle?', 'Time First Observed','Intersecting Street','Double Parking Violation','No Standing or Stopping Violation', 'Hydrant Violation','Violation Post Code','Violation Legal Code', 'Intersecting Street','To Hours In Effect','From Hours In Effect','Violation Description','House Number', 'Violation In Front Of Or Opposite', 'Violation Location', 'Days Parking In Effect    '])


In [4]:
columns = ['Summons Number', 'Registration State', 'Plate Type',
       'Issue Date', 'Violation Code', 'Vehicle Body Type', 'Vehicle Make',
       'Issuing Agency', 'Violation Time', 'Violation County', 'Street Name','Law Section',
       'Sub Division', 'Vehicle Year', 'Feet From Curb']
small_df.columns

Index(['Registration State', 'Plate Type', 'Issue Date', 'Violation Code',
       'Vehicle Body Type', 'Vehicle Make', 'Issuing Agency', 'Violation Time',
       'Violation County', 'Street Name', 'Law Section', 'Sub Division',
       'Vehicle Year', 'Feet From Curb'],
      dtype='object')

In [5]:
# small_df = violations_df.sample(500000)
small_df = small_df.dropna()                      

In [6]:
def convert_to_hours(x):
   mid = int(len(x) / 2)
   hrs = int(x[0:mid])
   if (x[len(x) - 1] == 'P' or x[len(x) - 1] == 'A'):
       mins = int(x[mid:len(x) - 1])
       is_pm = x[len(x) - 1] == 'P'
   else:
       mins = int(x[mid:len(x)])
       is_pm = False
   if (is_pm):
       hrs = hrs + 12
   return hrs

small_df['Violation Time'] = small_df['Violation Time'].dropna().apply(lambda x: convert_to_hours(x))

In [7]:
print(small_df.iloc[0])
# small_df.columns

Registration State            NY
Plate Type                   PAS
Issue Date            01/01/2018
Violation Code                71
Vehicle Body Type            SDN
Vehicle Make               HYUND
Issuing Agency                 P
Violation Time                 2
Violation County              NY
Street Name           E 112TH ST
Law Section                  408
Sub Division                  D4
Vehicle Year                   0
Feet From Curb                 0
Name: 1434219940, dtype: object


In [8]:
# small_df['Issue Date']= pd.to_datetime(small_df['Issue Date'])
small_df.drop('Issue Date', axis=1, inplace=True)

### Predicting type of parking violation
Since we only have data about vehicles that have violated parking laws in NYC, the question we are now looking to answer is:  
Can we predict the type of violation committed by a vehicle based on it's Registration State, Plate Type, Vehicle Body Type, Vehicle Make,Issuing Agency, Violation Time, Violation County, Street Name, Law Section, Sub Division, Vehicle Year and Feet From Curb. 

### Creating Dummy variables for categorical data
The dataset we are using consists mostly of categorical data and conducting statistical analysis on categorical data is difficult and inefficent becasue it proposes various challenges and limitations. Text is hard to compare since there is no inherent structure or numeric attributes that define and rank some of the categories. Therefore we created dummy variables for each unique value in each category so it contains binary representation (either it belongs in that subcategory or not). The following kernels of code perform Hot Encoding of the categorical variables and joins them to the dataset. 

In [9]:
one_hot = pd.get_dummies(small_df['Registration State'])
one_hot.columns = ['Reg_State_' + str(col) for col in one_hot.columns]

small_df.drop(columns=['Registration State'],inplace=True)

small_df = small_df.join(one_hot)


In [10]:
one_hot_pl_type = pd.get_dummies(small_df['Plate Type'])
one_hot_pl_type.columns = ['plate_type_' + str(col) for col in one_hot_pl_type.columns]

small_df.drop(columns=['Plate Type'],inplace=True)

small_df = small_df.join(one_hot_pl_type)


In [11]:
one_hot_v_body_type = pd.get_dummies(small_df['Vehicle Body Type'])
one_hot_v_body_type.columns = ['v_body_type_' + str(col) for col in one_hot_v_body_type.columns]

small_df.drop(columns=['Vehicle Body Type'],inplace=True)

small_df = small_df.join(one_hot_v_body_type)



In [12]:
one_hot_v_make = pd.get_dummies(small_df['Vehicle Make'])
one_hot_v_make.columns = ['v_make_' + str(col) for col in one_hot_v_make.columns]

small_df.drop(columns=['Vehicle Make'],inplace=True)

small_df = small_df.join(one_hot_v_make)


In [13]:
one_hot_issuing_agency = pd.get_dummies(small_df['Issuing Agency'])
one_hot_issuing_agency.columns = ['issuing_agency_' + str(col) for col in one_hot_issuing_agency.columns]

small_df.drop(columns=['Issuing Agency'],inplace=True)

small_df = small_df.join(one_hot_issuing_agency)


In [14]:
one_hot_sub_div = pd.get_dummies(small_df['Sub Division'])
one_hot_sub_div.columns = ['sub_div_' + str(col) for col in one_hot_sub_div.columns]

small_df.drop(columns=['Sub Division'],inplace=True)

small_df = small_df.join(one_hot_sub_div)


In [15]:
one_hot_vio_cty = pd.get_dummies(small_df['Violation County'])
one_hot_vio_cty.columns = ['vio_cty_' + str(col) for col in one_hot_vio_cty.columns]

small_df.drop(columns=['Violation County'],inplace=True)

small_df = small_df.join(one_hot_vio_cty)


In [16]:
one_hot_street_name = pd.get_dummies(small_df['Street Name'])
one_hot_street_name.columns = ['strt_name_' + str(col) for col in one_hot_street_name.columns]

small_df.drop(columns=['Street Name'],inplace=True)

small_df = small_df.join(one_hot_street_name)



### Decision Trees
After encoding the data, we create decision trees based on all of our independent variables mentioned in the question above. 
We are using decision trees here because it is fast. We have a lot of data and a lost 5000 columns becasue of hot encoding the data. Most ML methods take an exponentially large time becasue of the size of the data set. Additionally, Decision trees are easy to interpret and visualize.It can easily capture Non-linear patterns. The decision tree has no assumptions about distribution because of the non-parametric nature of the algorithm. [Source](https://scikit-learn.org/stable/modules/tree.html)
However, DTs also tend to overfit the data. Since we have so many columns and the categories are so varied, there is a lot of noise created, detracting us from accurately predicting type of parking violation.   
  
Although we only created the decision trees for a sample of the data, we noticed that as the data got bigger, the accuraccy score also got bigger. This may be becasue we are introducing additional columns becasue there might be new values of categories, adding to additional columns. However the updward trend suggests that a higher number of cases studied improves accuracy.  

In [17]:
train_features, test_features, train_outcome, test_outcome = train_test_split(
   small_df.drop(columns=['Violation Code']),      # features
   small_df['Violation Code'],    # outcome
   test_size=0.30 # percentage of data to use as the test set
)

In [18]:
tree_clf = DecisionTreeClassifier()
tree_clf.fit(train_features, train_outcome)

DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,
            max_features=None, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, presort=False, random_state=None,
            splitter='best')

In [19]:
accuracy = accuracy_score(tree_clf.predict(test_features), test_outcome)
accuracy

0.9187875514750556

### KNN 
K Nearest neighbors takes an exceptionally long time to run since it has to cross validate the data and perform a grid search to find the best value of K. We were unable to compile the code becasue of memory errors and time it took to run the method. 

In [None]:
pipe = make_pipeline(MinMaxScaler(), KNeighborsClassifier())

param_grid = {'kneighborsclassifier__n_neighbors': range(1,20), 'kneighborsclassifier__weights':["uniform", "distance"]}
grid = GridSearchCV(pipe)
grid.fit(train_features, train_outcome)
grid.score(test_features, test_outcome)