In [1]:
import numpy as np
import pandas as pd

from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split # typically done at the start


In [2]:
small_df = pd.read_csv('./Parking_Violations_Issued_-_Fiscal_Year_2018.csv', usecols=  ['Summons Number', 'Registration State', 'Plate Type',
       'Issue Date', 'Violation Code', 'Vehicle Body Type', 'Vehicle Make', 'Vehicle Color',
       'Issuing Agency', 'Violation Time', 'Violation County','Law Section',
       'Sub Division', 'Vehicle Year'], index_col='Summons Number', nrows = 100000)

def initialClean(df):
    df = df.dropna(how = 'any')
    df = df.drop_duplicates()
    
#     Correct plate type and registration site for vehicle
    plate_types = 'AGR MCD AMB MCL ARG MED ATD MOT ATV NLM AYG NYA BOB NYC BOT NYS CBS OMF CCK OML CHC OMO CLG OMR CMB OMS CME OMT CMH OMV COM ORC CSP ORG DLR PAS EDU PHS FAR PPH FPW PSD GAC RGC GFC RGL GSC SCL GSM SNO HAC Ham SOS HAM HIF SPO HIR SRF HIS SRN HOU STA HSM STG IRP SUP ITP TOW JCA TRA JCL THC JSC TRC JWV TRL LMA USC LMB USS LMC VAS LOC VPL LTR WUG LUA'.split(' ')
    reg_state = 'AL MT AK NE AZ NV AR NH CA NJ CO NM CT NY DE NC FL ND GA OH HI OK ID OR IL PA IN RI IA SC KS SD KY TN LA TX ME UT MD VT MA VA MI WA MN WV MS WI MO WY AB ON BC PE FO QB MB SK MX GV NB DP NF DC NT YT NS'.split(' ')
    df = df[df['Registration State'].isin(reg_state)]
    df = df[df['Plate Type'].isin(plate_types)]

#     Correct make and year of vehicle
    df = clean_make_year(df)
    
#     Correct color
    col_rem = ['PRW','MH','TPE','FT','TAUPE','GRU','XXX','WHBR','AZ','PL','GM','LGITH','BLPR','BK SL','RDBR','LEXUS','RDPR','BRK','MAXIM','BRT','GRBK','GU','BT','BL/SI','ZIWIY','WT BK','BN.','LTBK', 'PRBL','BLTN','VAN','BLT','BRG','GY TN','BLYW','BLBR','GY/BL','DODGE','MAZ','BWTH','IW','NMI','NOCLR','OTH','WYBLU','BKPR','GKN','WAJP','GYG','BRY','KX','VN  R','DKTN','RDTN','TQ','PLE','ORBK','GLGR','YAN','QGY','MITI','BL/WH','QUEEN','GRT','BLK/S','ORWH','BLE','MULT','ZJHT','BWR','BBRW','GRG','TN.','DARL','GYMR','GRGL','CNH', 'SC','GLBK', 'VN  S', 'COM''WT BL','LTPR','BLGL','GRAT','WORKH','GRBL','CRM','MUILT','K.','VLK','GA','RD BK','GYRD','ZKCXN','YE/OR','RDWH','GLK','GT','MC','BLR','KD','GRTN''WHOR','BM','WJT','BK/GR','DI', 'BKT', 'SIM','RDGR', 'RDT', 'WH BL','BE','RUST','WHBK','RY','UJ','GYGL','NH','WH/BL','RDBK','TL', 'BKRD','SR','BRU','BUS','GYQ','MUL','DECAL','BLBK','RDG','BON','LTBR','LTTN','TNGY','TNBL','GRGY','B L','BLWH','BK BL','TNG','RDGY','HRAU','CHAMP','BLKQ','BJ','WB','GYTN','BKBL','BLKL','WHRD','BRGR','BKG','BGE','-','TNRD','GRW','NOCL','W/B','GLBL','BRTOW','NO','RDBL',', ,U','BRZ','UNKNO','BLW','BLGR','GYBL','BLRD','WN','WHGR','GYGR','RDMR','BLB','WHG','GYBK','WHO','BKGR','TNR','OR BK','BLG','NOC','BKTN','MULTI','GYGY','RDW','WT/BL','BLGY','GYB','GYBR','UNK','BKGY', 'DK/', 'TN', 'PR', 'ALUMI', 'OTHER', 'LT/', 'WHB', 'WHBL', 'WHGY', 'TAN', 'LTT']
    df = df[~df['Vehicle Color'].isin(col_rem)]

    black = ['BKBK','BLBL','BLLK','B K','BLACJ','BLKI','BL AC','BLC','CHAR','BLACL','BKL','BLIC','B LK','BLK.','VLACK','B LAC''BIACK','BLCK','BLACC','Black','BLAVK','BIK','BK,','BL/','BLA','BLAC','BK/','BKACK','BLAKC','BK', 'BLACK', 'BLK']
    white = ['WITE','WHOTE','WHICH','WIT','CREAM','WHLE','WHWH','WHIYE','ITE',"WH'",'WHITW','WJOTE','WHTN','WHITR','WHT/S','WHTE','WT.','PEARL','WTE','W','WH.','WHIT', 'WHITE', 'WH', 'WT', 'WH/', 'WHT', 'WHI']
    silver = ['SIV','Silve','SIL.','SVR','SI','SLIVE','SIVL','SILO','SLVER','SILGV','SILVQ','SV','SIVLE','SILER','SLR','SL.','SILVE', 'SIL', 'SL', 'SILV', 'SILVR']
    green = ['GRGR','Green','DK GN','GR.','LGR','LT/GR','DK GR','GN.','DKGR','G','GRE','GREEB','LTGR','GN','GREEN', 'GR', 'GRN', 'GREN', 'GREE']
    grey = ['GEAY','GEY','GARY','DGRAY','GRRAY','GREY.','GHRAY','GRY.','GR/','GRAY.','GRA','HREY','GAEY','GY/','Gray','GY', 'GREY', 'GRAY', 'GRY', 'LTGY', 'LTG', 'DKGY', 'DKG', 'GY.']
    red = ['RR','RED.','RD.','RD/','R','DKRD','RD', 'RED', 'DKR']
    maroon = ['MROO','MRN','MN','DKMR','DKM','MAROO','MAR', 'MR', 'MARO']
    gold = ['GDL','GNY','GLOD','GD.','LTGL','GLTN','GOLD.','GL.','GD','GOLD', 'GL', 'GLD']
    blue = ['TEAL','BLUEW','BLIE','NAVY','B LUE','BL UE','BLUE.','BLUED','LBL','BI','NAVY.','DBL','LBUE','LB','LTBL','DKB','B','LTB', 'BL.', 'DKBL', 'BL', 'BLUE', 'BLU']
    brown = ['BRBR','BROWY','BROK','BRONK','BRZ', 'BRONZ','COPPE','DKBR','BWN','BRW','BROW','BRO','BROWN', 'BR', 'BRN', 'BRWN', 'BW']
    orange = ['O','ORAGE','OG','ORANE','DKOR','ORNGE','ORAG','ORA','OR.','0','ORN','ORAN','ONG','ORANG', 'OR', 'ORG']
    yellow = ['YELO','YLL','YEDLL','LTYW','YELW','YLW','YELLW','YL','YEL','YELL','Y','YELLO', 'YW']
    purple = ['LAVEN','DKPR','PUPLE','TNGL','TURPL','PURP','PURPL', 'PUR']
    pink = ['LTPK','DKPK','LTP','PINK']
    burgundy = ['BY','BUR','BURGY','BARG','BURGE','BERG','BU','BGDY','BURGA','BRGY','BN','BURGU', 'BURG']
    beige = ['BEGE','BGE.','BIGE','BAGE','BEIG','BEIGE', 'BEIEG', 'BIEGE', 'BG']
    
    color = ''
    
    for index, row in df.iterrows():
        if row['Vehicle Color'] in black:
            color = 'black'
        elif row['Vehicle Color'] in white:
            color = 'white'
        elif row['Vehicle Color'] in silver:
            color = 'silver'
        elif row['Vehicle Color'] in green:
            color = 'green'
        elif row['Vehicle Color'] in grey:
            color = 'grey'
        elif row['Vehicle Color'] in red:
            color = 'red'
        elif row['Vehicle Color'] in maroon:
            color = 'maroon'
        elif row['Vehicle Color'] in gold:
            color = 'gold'
        elif row['Vehicle Color'] in blue:
            color = 'blue'
        elif row['Vehicle Color'] in brown:
            color = 'brown'
        elif row['Vehicle Color'] in orange:
            color = 'orange'
        elif row['Vehicle Color'] in yellow:
            color = 'yellow'
        elif row['Vehicle Color'] in purple:
            color = 'purple'
        elif row['Vehicle Color'] in pink:
            color = 'pink'
        elif row['Vehicle Color'] in burgundy:
            color = 'burgundy'
        elif row['Vehicle Color'] in beige:
            color = 'beige'
        df.at[index,'Vehicle Color'] = color

#     Adding Month and Year of the violation
    months = []
    years = []
    for k in df["Issue Date"]:
        months.append(int(k[3:5]))
        years.append(int(k[6:]))
    df["Month"] = months
    df['Year'] = years

    df.drop(columns=['Issue Date', 'Vehicle Make', 'Year'],inplace=True)
    return df

def clean_make_year(clean):
    # remove white space
    clean['Vehicle_Make'] = clean['Vehicle Make'].str.strip()
    
    # remove any car with less than 10 recordings
    clean = clean[clean.groupby('Vehicle_Make').Vehicle_Make.transform(len) > 10]
    
    # fix R&R
    clean['Vehicle_Make'] = clean['Vehicle_Make'].replace('R/R', 'ROLLS')
    
    # fix L/R
    clean['Vehicle_Make'] = clean['Vehicle_Make'].replace('L/R', 'ROVER')
    
    # remove unrelated
    removable = ['FRHT', 'FEDEX', 'WHITE', 'UTILI', 'BL/BI', 'NEW', 'HIGHW', 'TRUCK', 'UPS']
    clean = clean[~clean['Vehicle_Make'].isin(removable)]
    
    clean = clean[(clean['Vehicle Year'] > 1910) & (clean['Vehicle Year'] < 2020)]

    return clean

def convert_to_datetime(x):
    mid = int(len(x) / 2)
    hrs = int(x[0:mid])
    if (x[len(x) - 1] == 'P' or x[len(x) - 1] == 'A'):
        is_pm = x[len(x) - 1] == 'P'
    else:
        mins = int(x[mid:len(x)])
        is_pm = False
    if (is_pm):
        hrs = hrs + 12
    return hrs

small_df = initialClean(small_df)
small_df['Violation Time'] = small_df['Violation Time'].dropna().apply(lambda x: convert_to_datetime(x))

def get_col_dummies(df):
    for column in list(df):
        if column != 'Violation Code':
            one_hot = pd.get_dummies(df[column])
            one_hot.columns = [column+'_' + str(c) for c in one_hot.columns]
            df.drop(columns=[column],inplace=True)
            df = df.join(one_hot)
    return df
small_df = get_col_dummies(small_df)
small_df.head()

# small_df.head(20)
# small_df.columns.contains('Violation Code')

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


In [9]:
tree_clf = DecisionTreeClassifier()
tree_clf.fit(train_features, train_outcome)
accuracy = accuracy_score(tree_clf.predict(test_features), test_outcome)
accuracy

0.9111833550065019

In [11]:
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import MinMaxScaler
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import GridSearchCV

In [None]:
pipe = make_pipeline(MinMaxScaler(), KNeighborsClassifier())

param_grid = {'kneighborsclassifier__n_neighbors': range(1,20), 'kneighborsclassifier__weights':["uniform", "distance"]}
grid = GridSearchCV(pipe, param_grid)
grid.fit(train_features, train_outcome)
grid.score(test_features, test_outcome)

