In [12]:
import pandas as pd
import numpy as np

from sklearn.metrics import roc_auc_score
from sklearn.ensemble import RandomForestRegressor

    #Insert lat and lon in address embbeded in adresses csv 
add = pd.read_csv('addresses.csv') 
lat = pd.read_csv('latlons.csv') 
geoRef = pd.merge(add,lat, how='left', on='address')
geoRef.set_index(['ticket_id'], inplace=True)
geoRef=geoRef[['lat','lon']]

train = pd.read_csv('train.csv', sep=',', encoding='cp1252', low_memory=False,
                    parse_dates = ['ticket_issued_date', 'hearing_date'], index_col='ticket_id')
train = pd.merge(train, geoRef, how='left', left_index=True, right_index=True)

test = pd.read_csv('test.csv', sep=',', encoding='cp1252', low_memory=False,
                    parse_dates = ['ticket_issued_date', 'hearing_date'], index_col='ticket_id')
test = pd.merge(test, geoRef, how='left', left_index=True, right_index=True)

train = train.dropna(subset=['compliance']) #drop compliance nan
train = train[train['agency_name'] != 'Neighborhood City Halls'] #delete option
#disposition feature (replace)
disposition_replace = {'Responsible by Default': 'By default',
                       'Responsible by Determination': 'By determination',
                       'Responsible (Fine Waived) by Admis': 'Fine Waived',
                       'Responsible (Fine Waived) by Deter': 'Fine Waived',
                       'Responsible - Compl/Adj by Default': 'By default',
                       'Responsible - Compl/Adj by Determi': 'By determination',
                       'Responsible by Admission': 'By admission',
                       'Responsible by Dismissal': 'By default'}
train.disposition.replace(disposition_replace, inplace=True)
test.disposition.replace(disposition_replace, inplace=True)
#discount feature (1 has discount, 0 without discount)
train['discount'] = train.discount_amount.apply(lambda x:1 if x > 0 else 0)
test['discount'] = test.discount_amount.apply(lambda x:1 if x > 0 else 0)
#bins judgement amount
cut = [-1, 100, 150, 300, float("inf")]
train['judgment_level'] = pd.cut(train.judgment_amount, bins=cut)
test['judgment_level'] = pd.cut(test.judgment_amount, bins=cut)
#State replacement and missing values treatment
states = ['AL','AK','AZ','AR','CA','CO','CT','DC','DE','FL','GA','HI','ID','IL','IN','IA','KS','KY','LA','ME',
          'MD','MA','MI','MN','MS','MO','MT','NE','NV','NH','NJ','NM','NY','NC','ND','OH','OK','OR','PA','RI',
          'SC','SD','TN','TX','UT','VT','VA','WA','WV','WI','WY']

train['state_cat'] = train.state.apply(lambda x: x if x in states else 'NOT US')
train.loc[(train.state_cat != 'NOT US') & (train.state_cat != 'MI'),'state_cat'] = 'NOT MI'
train.loc[train.state.isna(),'state_cat'] = 'MI' #Missing values most Frequent

test['state_cat'] = test.state.apply(lambda x: x if x in states else 'NOT US')
test.loc[(test.state_cat != 'NOT US') & (test.state_cat != 'MI'),'state_cat'] = 'NOT MI'
test.loc[test.state.isna(),'state_cat'] = 'MI' #Missing values most Frequent
#Fill in loc and lat using the average in the most frequent places
#lat and lon is filled with rhe average of 50 most frequencies locations
mtrain = train.groupby(['lon','lat']).compliance.agg(['count', 'sum', 'mean', 'std']).sort_values('count',ascending=False)
lon = mtrain[mtrain['count']>50].index.get_level_values(0).tolist()
lat = mtrain[mtrain['count']>50].index.get_level_values(1).tolist()
lon = round(sum(lon) / float(len(lon)),6)
lat = round(sum(lat) / float(len(lat)),6)
train.loc[train.lon.isna(),'lon'] = lon #fill in missing values
train.loc[train.lat.isna(),'lat'] = lat #fill in missing values
mtest = test.groupby(['lon','lat']).agency_name.agg(['count']).sort_values('count',ascending=False)
lon = mtest[mtest['count']>50].index.get_level_values(0).tolist()
lat = mtest[mtest['count']>50].index.get_level_values(1).tolist()
lon = round(sum(lon) / float(len(lon)),6)
lat = round(sum(lat) / float(len(lat)),6)
test.loc[test.lon.isna(),'lon'] = lon #fill in missing values
test.loc[test.lat.isna(),'lat'] = lat #fill in missing values
#X and y values
ycol = ['compliance']
xcol = ['state_cat','judgment_level','discount','disposition','agency_name','lat','lon']
X = train[xcol]
Xval = test[xcol]
y = np.ravel(train[ycol].astype('int'))
#dummy variables
X2 = X.copy()
X2['train'] = [1] * X.shape[0]
#print(X.head())
X2val = Xval.copy()
X2val['train'] = [0] * Xval.shape[0]
X_merged = pd.concat([X2, X2val], axis=0)
X_merged = pd.get_dummies(X_merged)#.reset_index(drop=True)
X = X_merged[X_merged.train == 1]
Xval = X_merged[X_merged.train == 0]
#model random forest
model = RandomForestRegressor(random_state=8453, n_estimators=25, max_depth=15, max_features=10)
model.fit(X, y)
pred = model.predict(X)

print('AUCROC for random forests')
print(roc_auc_score(y,pred))

m = pd.Series(model.predict(Xval), index=Xval.index, name='compliance', dtype='float32')
print(m.shape)
m

AUCROC for random forests
0.8943301874402421
(61001,)


ticket_id
284932    0.034946
285362    0.009513
285361    0.043883
285338    0.045139
285346    0.045055
            ...   
376496    0.026914
376497    0.026914
376499    0.041623
376500    0.041623
369851    0.041815
Name: compliance, Length: 61001, dtype: float32