In [3]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
import random as rdm
import re
from difflib import SequenceMatcher
from sklearn import preprocessing
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import roc_auc_score
le = preprocessing.LabelEncoder()

# load latitude/longitude/address data
latlons = pd.read_csv('latlons.csv', encoding = 'ISO-8859-1', skiprows=None, low_memory=False)
addresses = pd.read_csv('addresses.csv', encoding = 'ISO-8859-1', skiprows=None, low_memory=False)
dfmap = addresses.merge(latlons, left_on='address', right_on='address', how='inner')

# load data
df = pd.read_csv('train.csv', encoding = 'ISO-8859-1', skiprows=None, low_memory=False)
df = df[~np.isnan(df['compliance'])] # removing empty output values
df = df.merge(dfmap,left_on='ticket_id', right_on='ticket_id', how='inner')
df.reset_index(inplace=True)
df = df.drop(['index'],1)

# load testing data
dftest = pd.read_csv('test.csv', encoding = 'ISO-8859-1', skiprows=None, low_memory=False)
dftest = dftest.merge(dfmap,left_on='ticket_id', right_on='ticket_id', how='inner')
dftest.reset_index(inplace=True)
dftest = dftest.drop(['index'],1)

# extract outputs (can be removed for assignment test)
y = df['compliance']
df = df.drop('compliance',1);

# clean data

le_d = le.fit(dftest['disposition'].unique())
df['disposition'] = le_d.transform(df['disposition'])
df['violation_code'] = df['violation_code'].apply(lambda x: re.sub('[^0-9]', '', x))
df['zip_code'] = df['zip_code'].str[:5]
df['zip_code'] = pd.to_numeric(df['zip_code'], errors='coerce')

# determine whether violation and mailing addresses match
df['violation_street_name'] = df['violation_street_name'].astype(str)
df['mailing_address_str_name'] = df['mailing_address_str_name'].astype(str)
df['violation_street_name'] = df['violation_street_name'].str.replace('[^\w\s]','').str.replace(' st', '').str.replace(' ave', '').str.replace(' blvd', '')
df['mailing_address_str_name'] = df['mailing_address_str_name'].str.replace('[^\w\s]','').str.replace(' st', '').str.replace(' ave', '').str.replace(' blvd', '')
df['violation_street_name'] = df['violation_street_name'].apply(lambda x: x.lower())
df['mailing_address_str_name'] = df['mailing_address_str_name'].apply(lambda x: x.lower())
df['same_address'] = df.apply(lambda row: SequenceMatcher( None, str(row['violation_street_name']), str(row['mailing_address_str_name'])).ratio(), axis=1) >= 0.75
df['same_address'] = df['same_address'].astype(int)

# date of issue, hearing date 
df['ticket_issued_date'] = pd.to_datetime(df['ticket_issued_date'])
df['hearing_date'] = pd.to_datetime(df['hearing_date'])
df['time_delta'] = (pd.to_datetime(df['hearing_date']).dt.date - pd.to_datetime(df['ticket_issued_date']).dt.date).astype('timedelta64[D]').astype(float)
df['time_delta'][df['time_delta']<0] += 365
df['day_issued'] = df['ticket_issued_date'].dt.day.astype(float)
df['month_issued'] = df['ticket_issued_date'].dt.month.astype(float)
df['hearing_day'] = df['hearing_date'].dt.weekday.astype(float)

# is time delta null?
df['dtnull'] = df['time_delta'].isnull().astype(int)

# drop features that are not available in the test data
dropcolumnstrain = ['payment_amount','payment_date','payment_status','payment_status','balance_due','collection_status','compliance_detail']
df = df.drop(dropcolumnstrain, 1)

# drop irrelevant features (common for both training and test)
dropcolumns = ['country','state','ticket_issued_date','hearing_date','city','violation_street_number','violation_street_name','mailing_address_str_name','mailing_address_str_number','mailing_address_str_name','address','agency_name','inspector_name','ticket_id','violator_name','violation_zip_code','admin_fee','state_fee','violation_description','clean_up_cost','grafitti_status','judgment_amount','non_us_str_code']
df = df.drop(dropcolumns, 1)

df = df.astype(float)

# create train and test sets
df, dftest, y_train, y_test = train_test_split(df, y, random_state=0)

# now do cleaning that references other data (must split train/test first!)
df['zip_code'] = df['zip_code'].fillna(df['zip_code'].value_counts().idxmax())
df['lat'] = df['lat'].fillna(df['lat'].value_counts().mean())
df['lon'] = df['lon'].fillna(df['lon'].value_counts().mean())
df['time_delta'] = df['time_delta'].fillna(df['time_delta'].value_counts().mean())
df['hearing_day'] = df['hearing_day'].fillna(df['hearing_day'].value_counts().idxmax())

X_train = df;

dftest['zip_code'] = dftest['zip_code'].fillna(dftest['zip_code'].value_counts().idxmax())
dftest['lat'] = dftest['lat'].fillna(dftest['lat'].value_counts().mean())
dftest['lon'] = dftest['lon'].fillna(dftest['lon'].value_counts().mean())
dftest['time_delta'] = dftest['time_delta'].fillna(dftest['time_delta'].value_counts().mean())
dftest['hearing_day'] = dftest['hearing_day'].fillna(dftest['hearing_day'].value_counts().idxmax())

X_test = dftest

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


In [25]:
# random forest grid search
# run to search for best random forest parameters

from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import cross_val_score, cross_val_predict, GridSearchCV

# clf = RandomForestClassifier(max_features = 3, n_estimators = 5, random_state = 0)
clf = RandomForestClassifier(random_state = 0).fit(X_train, y_train)

grid_values = {'n_estimators': [100,200], 'max_depth': [10,20,30], 'max_features': [2,5,10]}

grid_clf_auc = GridSearchCV(clf, param_grid = grid_values, scoring = 'roc_auc')
grid_clf_auc.fit(X_train, y_train)

print(grid_clf_auc.best_score_)
print(grid_clf_auc.best_params_)

0.835327687729
{'n_estimators': 200, 'max_features': 2, 'max_depth': 20}


In [52]:
# Run random forest classifier with optimized parameters, outputs ROC AUC score

from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import roc_auc_score, roc_curve, auc
from sklearn.model_selection import cross_val_score, cross_val_predict, GridSearchCV

clf = RandomForestClassifier(random_state = 0, max_depth = 20, n_estimators = 100, max_features = 2, criterion='entropy', n_jobs = -1).fit(X_train, y_train)
roc_auc_score(y_test,clf.predict_proba(X_test)[:,1])

0.84375871261488522