In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib
import matplotlib.pyplot as plt
%matplotlib inline
# Notebook-wide plot defaults: figure size and title font size.
matplotlib.rcParams.update({
    'figure.figsize': (10.0, 12.0),
    'axes.titlesize': 18,
})

In [2]:
# Read the cleaned training and test sets from the input directory.
trainClean, testClean = (
    pd.read_csv(csvPath)
    for csvPath in ('input/trainClean.csv', 'input/testClean.csv')
)

In [12]:
# Preview the first rows of the cleaned training set.
trainClean.head()

Unnamed: 0,Category,PdDistrict,X,Y,dayOfMonth,dayOfWeek,month,year,hour
0,37,4,-122.425892,37.774599,13,2,5,2015,23
1,21,4,-122.425892,37.774599,13,2,5,2015,23
2,21,4,-122.424363,37.800414,13,2,5,2015,23
3,16,4,-122.426995,37.800873,13,2,5,2015,23
4,16,5,-122.438738,37.771541,13,2,5,2015,23


## Trying different algorithms

In [3]:
from sklearn.cross_validation import train_test_split
# Target is the Category column; the features are every remaining column.
y = trainClean['Category']
x = trainClean.drop('Category', axis=1)

A helper function to test the performance of different algorithms and tune their hyperparameters.

In [16]:
from sklearn.grid_search import GridSearchCV
from sklearn.metrics import log_loss
from sklearn.externals import joblib

def runGridSearch(est, params, saveModelPath=None):
    """Run a grid search over `params` for `est` on the notebook-level data `x`, `y`.

    Parameters
    ----------
    est : estimator
        The estimator whose hyperparameters are tuned.
    params : dict
        Passed to GridSearchCV as `param_grid`.
    saveModelPath : str, optional
        When given, the best estimator found by the search is pickled to this
        path with joblib.

    Returns
    -------
    The fitted GridSearchCV object.
    """
    #the competition judges our performance via log loss, so we use that as our scoring function.
    gs = GridSearchCV(estimator=est, param_grid=params, verbose=10, scoring='log_loss')
    gs.fit(x, y)
    if saveModelPath:
        # Save the winner of *this* search rather than an unrelated notebook global.
        joblib.dump(gs.best_estimator_, saveModelPath)

    return gs

Let's try a few different algorithms and tune a couple hyperparameters for each algorithm.

In [None]:
from sklearn.linear_model import LogisticRegression
# C (inverse regularization strength) must be strictly positive, so the grid is
# log-spaced: 10**-2, 10**-1, 10**0, 10**1.
lrParams = {'C': [1e-2, 1e-1, 1e0, 1e1]}

softMax = LogisticRegression(multi_class='multinomial', solver='lbfgs', tol=1e-2, max_iter=100)
softMaxParams = {'C': [1e-2, 1e-1, 1e0, 1e1]}

from sklearn.ensemble import RandomForestClassifier
rf = RandomForestClassifier()
rfParams = {'max_depth': [1, 5, 10], 'n_estimators': [5, 10, 20]}

# One grid search per algorithm; each call also pickles its result to the given path.
gsLr = runGridSearch(LogisticRegression(), lrParams, saveModelPath='lr.pkl')
gsSf = runGridSearch(softMax, softMaxParams, saveModelPath='sf.pkl')
gsRf = runGridSearch(rf, rfParams, saveModelPath='rf.pkl')

print('Best Logistic Regression Score: {}'.format(gsLr.best_score_))
print('Best Softmax Score: {}'.format(gsSf.best_score_))
print('Best Random Forest Score: {}'.format(gsRf.best_score_))


In [None]:
# Keep the best logistic-regression estimator found by the grid search and pickle it to disk.
simpleLR = gsLr.best_estimator_
from sklearn.externals import joblib
joblib.dump(simpleLR, 'simpleLR.pkl') 

In [None]:
from sklearn.linear_model import LogisticRegression

# Multinomial (softmax) logistic regression, searched over a single C value.
softMax = LogisticRegression(
    multi_class='multinomial',
    solver='lbfgs',
    tol=1e-2,
    max_iter=100,
)
gsSoftMax = runGridSearch(est=softMax, params={'C': [0.5]})

Fitting 3 folds for each of 1 candidates, totalling 3 fits
[CV] C=0.5 ...........................................................


# Persist the tuned softmax model under the file name that the prediction
# section loads ('simpleSoftMax.pkl').
simpleSoftMax = gsSoftMax.best_estimator_
joblib.dump(simpleSoftMax, 'simpleSoftMax.pkl')

# Second run with a tighter convergence tolerance (1e-3 instead of 1e-2).
softMax2 = LogisticRegression(multi_class='multinomial', solver='lbfgs', tol=1e-3, max_iter=100)
gsSoftMax2 = runGridSearch(softMax2, params={'C':[0.5]})
# Dump only after the search has run, so gsSoftMax2 exists on a fresh kernel.
joblib.dump(gsSoftMax2, 'softMax2.pkl')

## Making predictions and creating the submission file

In [7]:
from sklearn.externals import joblib
# Load a previously saved estimator from disk instead of retraining it.
# NOTE(review): joblib.load unpickles the file, so only load files you created yourself.
softMax = joblib.load('simpleSoftMax.pkl') 

In [5]:
# Read the sample submission to see the expected layout: an Id column followed by
# one column per crime category.
samp = pd.read_csv('input/sampleSubmission.csv')
samp.head()

Unnamed: 0,Id,ARSON,ASSAULT,BAD CHECKS,BRIBERY,BURGLARY,DISORDERLY CONDUCT,DRIVING UNDER THE INFLUENCE,DRUG/NARCOTIC,DRUNKENNESS,...,SEX OFFENSES NON FORCIBLE,STOLEN PROPERTY,SUICIDE,SUSPICIOUS OCC,TREA,TRESPASS,VANDALISM,VEHICLE THEFT,WARRANTS,WEAPON LAWS
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0
1,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0
2,2,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0
3,3,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0
4,4,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0


Below is a helper function to create a submission for any estimator that has a predict_proba method.

The trickiest part is formatting the submission. We need to use the category names as the column names, but the categories of our clean training set have already been converted to numbers. To get the names back, we read in the original names from the raw train.csv file, fit a label encoder to those names, then use that encoder to transform the numeric representations in our clean training set back into their corresponding names. Essentially, we're just undoing what we did when we cleaned the data.

In [6]:
from sklearn.preprocessing import LabelEncoder
# Only the raw Category column is read; it supplies the original category names.
trainCats = pd.read_csv('input/train.csv', usecols=['Category'])

def createSubmission(est, filePath):
    """Fit `est` on the full training set and write its test-set probabilities to a CSV.

    Relies on the notebook-level names `x`, `y`, `testClean` and `trainCats`.

    Parameters
    ----------
    est : estimator
        Must provide `fit`, `predict_proba` and, once fitted, `classes_`.
    filePath : str
        Destination path of the submission CSV.

    Returns
    -------
    None. The submission is written to `filePath` and a confirmation is printed.
    """
    #because we've already selected our hyperparameters, we can now fit our estimator on the entire training set
    est.fit(x, y)
    testProbs = est.predict_proba(testClean.drop('Id', axis=1))

    #get the category names
    categoryEncoder = LabelEncoder()
    categoryEncoder.fit(trainCats.Category)
    # predict_proba returns its columns in the order of est.classes_, so the names
    # must be derived from that same order. Category.unique() is in order of first
    # appearance and would attach the wrong name to each probability column.
    catLabels = categoryEncoder.inverse_transform(est.classes_)

    subDf = pd.DataFrame(testProbs, columns=catLabels)
    subDf = pd.concat([testClean.Id, subDf], axis=1)
    subDf.to_csv(filePath, index=False)
    print('created {}'.format(filePath))


In [11]:
#createSubmission(softMax, 'softMaxSub.csv')

In [10]:
# Build the submission from the already-fitted softmax model (no refit).
testProbs = softMax.predict_proba(testClean.drop('Id', axis=1))

#get the category names
categoryEncoder = LabelEncoder()
categoryEncoder.fit(trainCats.Category)
# predict_proba columns follow softMax.classes_, so name them from that order.
# Category.unique() is in order of first appearance and would mislabel the columns.
catLabels = categoryEncoder.inverse_transform(softMax.classes_)

subDf = pd.DataFrame(testProbs, columns=catLabels)
subDf = pd.concat([testClean.Id, subDf], axis=1)
subDf.to_csv('softMaxSub.csv', index=False)
