### Predict the probability that the corresponding blight ticket will be paid on time

In [2]:
# import some necessary libraries
import numpy as np
import pandas as pd

from sklearn import tree
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import classification_report

# Suppress all warnings
import warnings
warnings.filterwarnings('ignore')

In [4]:
pd.set_option('display.max_columns', 40)  
train = pd.read_csv('assets/train.csv')
test = pd.read_csv('assets/test.csv')

# Filter out NaN from target
train = train[(train['compliance'] == 1) | (train['compliance'] == 0)]

In [5]:
## Drop columns not present in training and testing
remaining_columns = ['payment_amount','payment_status', 'balance_due','collection_status', 'compliance_detail','payment_date']
train.drop(remaining_columns, axis=1, inplace=True)

# Drop First column
train.drop(['Unnamed: 0'], axis=1, inplace=True)
train
## Drop Columns with too many NAs
print(train.isna().sum())
na_columns = ['violation_zip_code', 'non_us_str_code',  'grafitti_status']
train.drop(na_columns, axis=1, inplace=True)

ticket_id                          0
agency_name                        0
inspector_name                     0
violator_name                     26
violation_street_number            0
violation_street_name              0
violation_zip_code            159880
mailing_address_str_number      2558
mailing_address_str_name           3
city                               0
state                             84
zip_code                           1
non_us_str_code               159877
country                            0
ticket_issued_date                 0
hearing_date                     227
violation_code                     0
violation_description              0
disposition                        0
fine_amount                        0
admin_fee                          0
state_fee                          0
late_fee                           0
discount_amount                    0
clean_up_cost                      0
judgment_amount                    0
grafitti_status               159880
c

In [6]:
#Drop clean_up_cost, admin_fee, state_fee, as there is only one unique value
train['clean_up_cost'].nunique()
train.drop(['clean_up_cost', 'admin_fee', 'state_fee'], axis=1, inplace=True)

#Drop violation_description, as it correlates to violation_code
train.drop(['violation_description'], axis=1, inplace=True)
#Drop violator_name, as it will create too many unique categories
train.drop(['violator_name'], axis=1, inplace=True)
#Drop state, zip_code, county as it will correlate to city
train.drop(['state', 'zip_code', 'country'], axis=1, inplace=True)

#Drop mailing_address_str_number, mailing_address_str_name, county as it will create too many unique categories
train.drop(['mailing_address_str_number', 'mailing_address_str_name'], axis=1, inplace=True)
#Drop dates
train.drop(['hearing_date', 'ticket_issued_date'], axis=1, inplace=True)


In [7]:
# Drop same columns in testing
test.drop(['clean_up_cost', 'admin_fee', 'state_fee'], axis=1, inplace=True)

test.drop(['violation_description'], axis=1, inplace=True)
test.drop(['violator_name'], axis=1, inplace=True)
test.drop(['state', 'zip_code', 'country'], axis=1, inplace=True)
test.drop(['mailing_address_str_number', 'mailing_address_str_name'], axis=1, inplace=True)
test.drop(['hearing_date', 'ticket_issued_date'], axis=1, inplace=True)
test.drop(['violation_zip_code', 'non_us_str_code',  'grafitti_status'], axis=1, inplace=True)

In [8]:
train = train.set_index('ticket_id')
test = test.set_index('ticket_id')

In [9]:
#Check to make sure the lengths are the same (excluding compliance)
len(train.columns) == (len(test.columns) +1)

True

In [10]:
X_train = train.iloc[:,0:11]
y_train = train['compliance']

X_test = test.iloc[:,0:11]

In [13]:
# Create categorical variables
cols = ['agency_name', 'inspector_name', 'violation_street_number', 'violation_street_name', 'city', 'violation_code', 'disposition']
X_train[cols] = X_train[cols].astype('category')
X_train[cols] = X_train[cols].apply(lambda x: x.cat.codes)
X_test[cols] = X_test[cols].astype('category')
X_test[cols] = X_test[cols].apply(lambda x: x.cat.codes)


In [15]:
from sklearn.ensemble import GradientBoostingClassifier

clf = GradientBoostingClassifier(random_state = 42)
learning_rate = [0.001, 0.01,0.1]
max_depth = [2,3,4]
grid_values = {'learning_rate' : learning_rate, 'max_depth' : max_depth}


grid_clf_acc = GridSearchCV(clf, param_grid = grid_values, scoring = 'roc_auc')
grid_clf_acc.fit(X_train, y_train)

GridSearchCV(estimator=GradientBoostingClassifier(random_state=42),
             param_grid={'learning_rate': [0.001, 0.01, 0.1],
                         'max_depth': [2, 3, 4]},
             scoring='roc_auc')

In [19]:
print(grid_clf_acc.best_score_)
print(grid_clf_acc.best_params_)

values = grid_clf_acc.predict_proba(X_test) 
result = pd.Series(values[:,1], index=X_test.index)
result

0.7982512431057947
{'learning_rate': 0.1, 'max_depth': 4}


ticket_id
284932    0.288440
285362    0.118877
285361    0.114852
285338    0.306785
285346    0.330352
            ...   
376496    0.094476
376497    0.094476
376499    0.159447
376500    0.159447
369851    0.921474
Length: 61001, dtype: float64