In [51]:
import pandas as pd
import numpy as np
import s3fs
# import sklearn
from sklearn.svm import LinearSVC
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.neighbors import KNeighborsClassifier
from sklearn import preprocessing
from sklearn.metrics import classification_report, roc_auc_score

In [2]:
# Read just the first `nrows` records of the remote CSV so that the rest of
# the notebook stays quick to iterate on.
nrows = 5000
citations_url = (
    'https://s3-us-west-2.amazonaws.com/pcadsassessment/'
    'parking_citations.corrupted.csv'
)
df = pd.read_csv(citations_url, nrows=nrows)
df.head()

Unnamed: 0,Ticket number,Issue Date,Issue time,Meter Id,Marked Time,RP State Plate,Plate Expiry Date,VIN,Make,Body Style,Color,Location,Route,Agency,Violation code,Violation Description,Fine amount,Latitude,Longitude
0,1103341116,2015-12-21T00:00:00,1251.0,,,CA,200304.0,,,PA,GY,13147 WELBY WAY,01521,1.0,4000A1,NO EVIDENCE OF REG,50.0,99999.0,99999.0
1,1103700150,2015-12-21T00:00:00,1435.0,,,CA,201512.0,,,VN,WH,525 S MAIN ST,1C51,1.0,4000A1,NO EVIDENCE OF REG,50.0,99999.0,99999.0
2,1104803000,2015-12-21T00:00:00,2055.0,,,CA,201503.0,,,PA,BK,200 WORLD WAY,2R2,2.0,8939,WHITE CURB,58.0,6439997.9,1802686.4
3,1104820732,2015-12-26T00:00:00,1515.0,,,CA,,,,PA,WH,100 WORLD WAY,2F11,2.0,000,17104h,,6440041.1,1802686.2
4,1105461453,2015-09-15T00:00:00,115.0,,,CA,200316.0,,CHEV,PA,BK,GEORGIA ST/OLYMPIC,1FB70,1.0,8069A,NO STOPPING/STANDING,93.0,99999.0,99999.0


In [48]:
# value_counts() orders makes from most to least frequent (missing values are
# not counted), so the first 25 labels are the 25 most common makes.
make_frequencies = df['Make'].value_counts()
top_25 = make_frequencies.head(25).index
top_25

Index(['TOYT', 'HOND', 'FORD', 'NISS', 'CHEV', 'BMW', 'VOLK', 'MERZ', 'HYUN',
       'DODG', 'LEXS', 'TOYO', 'KIA', 'GMC', 'MAZD', 'JEEP', 'AUDI', 'CHRY',
       'INFI', 'OTHR', 'ACUR', 'SUBA', 'VOLV', 'MITS', 'CADI'],
      dtype='object')

In [41]:
# Build the model's feature matrix from every column except the target:
#   1. one-hot encode the non-numeric columns,
#   2. discard anything still typed as object and replace missing values with 0,
#   3. attach the untouched 'Make' label back on (aligned by row index).
predictors = df.drop(columns='Make')
predictors_onehot = pd.get_dummies(predictors)
predictors_filled = predictors_onehot.select_dtypes(exclude=['object']).fillna(0)
df_encoded = predictors_filled.join(df['Make'])

In [42]:
# Rows whose Make is one of the 25 most frequent values form the labelled
# ("uncorrupted") set; it is also written to disk as CSV.
has_top_make = df_encoded['Make'].isin(top_25)
df_uncorrupt = df_encoded.loc[has_top_make]
df_uncorrupt.to_csv('bari_assessment/parking_citations_unc.csv')
df_uncorrupt.head()

Unnamed: 0,Ticket number,Issue time,Marked Time,Plate Expiry Date,VIN,Agency,Fine amount,Latitude,Longitude,Issue Date_2015-09-14T00:00:00,...,Violation Description_SAFETY ZONE/CURB,Violation Description_STANDNG IN ALLEY,Violation Description_STNDNG IN ALLEY,Violation Description_STOP/STAND PROHIBIT,Violation Description_TIME LIMIT/CITY LOT,Violation Description_WHITE CURB,Violation Description_WHITE ZONE,Violation Description_WITHIN INTERSECTION,Violation Description_YELLOW ZONE,Make
4,1105461453,115.0,0.0,200316.0,0.0,1.0,93.0,99999.0,99999.0,0,...,0,0,0,0,0,0,0,0,0,CHEV
6,1106500452,1710.0,0.0,201605.0,0.0,1.0,163.0,99999.0,99999.0,0,...,0,0,0,0,0,0,0,0,0,MAZD
10,1106506424,1100.0,0.0,201511.0,0.0,1.0,93.0,99999.0,99999.0,0,...,0,0,0,0,0,0,0,0,0,FORD
11,1106506435,1105.0,0.0,201701.0,0.0,1.0,93.0,99999.0,99999.0,0,...,0,0,0,0,0,0,0,0,0,CHRY
14,1107179581,1055.0,0.0,201605.0,0.0,54.0,68.0,99999.0,99999.0,0,...,0,0,0,0,0,0,0,0,0,TOYO


In [43]:
# Rows with a missing Make form the unlabelled ("corrupted") set; it is also
# written to disk as CSV.
make_is_missing = df_encoded['Make'].isna()
df_corrupt = df_encoded.loc[make_is_missing]
df_corrupt.to_csv('bari_assessment/parking_citations_cor.csv')
df_corrupt.head()

Unnamed: 0,Ticket number,Issue time,Marked Time,Plate Expiry Date,VIN,Agency,Fine amount,Latitude,Longitude,Issue Date_2015-09-14T00:00:00,...,Violation Description_SAFETY ZONE/CURB,Violation Description_STANDNG IN ALLEY,Violation Description_STNDNG IN ALLEY,Violation Description_STOP/STAND PROHIBIT,Violation Description_TIME LIMIT/CITY LOT,Violation Description_WHITE CURB,Violation Description_WHITE ZONE,Violation Description_WITHIN INTERSECTION,Violation Description_YELLOW ZONE,Make
0,1103341116,1251.0,0.0,200304.0,0.0,1.0,50.0,99999.0,99999.0,0,...,0,0,0,0,0,0,0,0,0,
1,1103700150,1435.0,0.0,201512.0,0.0,1.0,50.0,99999.0,99999.0,0,...,0,0,0,0,0,0,0,0,0,
2,1104803000,2055.0,0.0,201503.0,0.0,2.0,58.0,6439997.9,1802686.4,0,...,0,0,0,0,0,1,0,0,0,
3,1104820732,1515.0,0.0,0.0,0.0,2.0,0.0,6440041.1,1802686.2,0,...,0,0,0,0,0,0,0,0,0,
5,1106226590,19.0,0.0,201507.0,0.0,1.0,50.0,99999.0,99999.0,0,...,0,0,0,0,0,0,0,0,0,


In [44]:
# Hold out 30% of the labelled rows for evaluation and train on the rest.
# A fixed random_state makes the partition -- and therefore every score derived
# from it -- reproducible across kernel restarts (Restart & Run All).
X_train, X_test, y_train, y_test = train_test_split(
    df_uncorrupt.drop(columns='Make'),  # features: every column except the label
    df_uncorrupt['Make'],               # target: vehicle make
    test_size=0.3,
    random_state=42,
)

In [None]:
# Standardise the features, then fit a linear support-vector classifier.
# random_state pins LinearSVC's internal randomness so reruns give the same model.
SVCpipe = Pipeline([
    ('scale', preprocessing.StandardScaler()),
    ('SVC', LinearSVC(random_state=42)),
])

# Grid search to determine the value of C.
# The previous grid, np.arange(0.01, 1, 0.01), held 99 candidates, i.e. 495
# serial fits with cv=5. Nine log-spaced candidates over roughly the same range
# (0.01 .. 1.0) cover the same orders of magnitude at a fraction of the cost,
# and n_jobs=-1 spreads the fits across all available cores.
param_grid = {'SVC__C': np.logspace(-2, 0, 9)}
linearSVC = GridSearchCV(
    SVCpipe,
    param_grid,
    cv=5,
    n_jobs=-1,
    return_train_score=True,
)
linearSVC.fit(X_train, y_train)
print(linearSVC.best_params_)

# Score the refitted best estimator on the held-out rows.
# classification_report expects (y_true, y_pred) in that order.
pred = linearSVC.predict(X_test)
print(classification_report(y_test, pred))