In [8]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib
import matplotlib.pyplot as plt
%matplotlib inline
matplotlib.rcParams['figure.figsize'] = (10.0, 12.0)
matplotlib.rcParams['axes.titlesize'] = 18

In [9]:
# Load the raw competition files; the 'Dates' column is parsed into datetimes while reading
# so the .dt accessors can be used on it later.
dateColumns = ['Dates']
train = pd.read_csv('input/train.csv', parse_dates=dateColumns)
test = pd.read_csv('input/test.csv', parse_dates=dateColumns)

Let's first see if there are missing values in our data.

In [10]:
train.isnull().sum()

Dates         0
Category      0
Descript      0
DayOfWeek     0
PdDistrict    0
Resolution    0
Address       0
X             0
Y             0
dtype: int64

In [11]:
test.isnull().sum()

Id            0
Dates         0
DayOfWeek     0
PdDistrict    0
Address       0
X             0
Y             0
dtype: int64

Nice. So we have a completely full data set.

Below is a function we'll use to prepare the data for training. I've commented the code as needed.

In [12]:
from sklearn.preprocessing import LabelEncoder
categoryEncoder = LabelEncoder()

def clean(df, isTrain=True):
    """Build a model-ready feature frame from a raw train or test DataFrame.

    The input frame is not modified; all work happens on a copy, so this
    function can be called repeatedly on the same raw frame.

    Parameters
    ----------
    df : pandas.DataFrame
        Raw frame containing at least 'Dates' (datetime dtype), 'DayOfWeek',
        'PdDistrict' and 'Address'. When ``isTrain`` is True it must also
        contain 'Category', 'Descript' and 'Resolution'.
    isTrain : bool, default True
        True for the training set: the 'Category' target is label-encoded with
        the module-level ``categoryEncoder`` (which is fitted here) and the
        train-only columns 'Descript' and 'Resolution' are dropped.
        False for the test set: no target handling, only the shared columns
        are dropped.

    Returns
    -------
    pandas.DataFrame
        One row per input row, with a fresh 0..n-1 index. Columns are the
        remaining original columns followed by indicator columns for
        DayOfWeek, year, month, PdDistrict, dayOfMonth and hour.
    """
    # Work on a copy so the caller's frame is left untouched.
    df = df.copy()

    ## transform the variable we will predict into a numeric category so our model can handle it.
    if isTrain:
        df['Category'] = categoryEncoder.fit_transform(df['Category'])

    ## parse the Date variable into component parts so that our model can take advantage of that information
    dates = df['Dates'].dt
    df['month'] = dates.month
    df['dayOfMonth'] = dates.day
    df['year'] = dates.year
    df['hour'] = dates.hour

    ## create dummy variables for our categorical variables.
    ## get_dummies returns the untouched columns alongside the indicators and removes the
    ## encoded source columns, so the result is used directly. Joining it back onto df would
    ## multiply any rows that share identical values in the common columns.
    dummyColumns = ['DayOfWeek', 'year', 'month', 'PdDistrict', 'dayOfMonth', 'hour']
    df = pd.get_dummies(df, columns=dummyColumns)

    dropColumns = [
        'Dates',    # We've parsed it into components
        'Address',  # Too many unique values to dummy encode
    ]
    if isTrain:
        # Not in test, so not helpful for modeling
        dropColumns += ['Descript', 'Resolution']

    # Reset the index so the result always has a plain 0..n-1 index.
    return df.drop(dropColumns, axis=1).reset_index(drop=True)
    

In [13]:
trainClean = clean(train)

In [14]:
trainClean.columns

Index(['Category', 'X', 'Y', 'DayOfWeek_Friday', 'DayOfWeek_Monday',
       'DayOfWeek_Saturday', 'DayOfWeek_Sunday', 'DayOfWeek_Thursday',
       'DayOfWeek_Tuesday', 'DayOfWeek_Wednesday', 'year_2003', 'year_2004',
       'year_2005', 'year_2006', 'year_2007', 'year_2008', 'year_2009',
       'year_2010', 'year_2011', 'year_2012', 'year_2013', 'year_2014',
       'year_2015', 'month_1', 'month_2', 'month_3', 'month_4', 'month_5',
       'month_6', 'month_7', 'month_8', 'month_9', 'month_10', 'month_11',
       'month_12', 'PdDistrict_BAYVIEW', 'PdDistrict_CENTRAL',
       'PdDistrict_INGLESIDE', 'PdDistrict_MISSION', 'PdDistrict_NORTHERN',
       'PdDistrict_PARK', 'PdDistrict_RICHMOND', 'PdDistrict_SOUTHERN',
       'PdDistrict_TARAVAL', 'PdDistrict_TENDERLOIN', 'dayOfMonth_1',
       'dayOfMonth_2', 'dayOfMonth_3', 'dayOfMonth_4', 'dayOfMonth_5',
       'dayOfMonth_6', 'dayOfMonth_7', 'dayOfMonth_8', 'dayOfMonth_9',
       'dayOfMonth_10', 'dayOfMonth_11', 'dayOfMonth_12', 'dayOfMo

In [15]:
testClean = clean(test, isTrain=False)

In [16]:
testClean.head()

Unnamed: 0,Id,X,Y,DayOfWeek_Friday,DayOfWeek_Monday,DayOfWeek_Saturday,DayOfWeek_Sunday,DayOfWeek_Thursday,DayOfWeek_Tuesday,DayOfWeek_Wednesday,...,hour_14,hour_15,hour_16,hour_17,hour_18,hour_19,hour_20,hour_21,hour_22,hour_23
0,0,-122.399588,37.735051,0.0,0.0,0.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
1,1,-122.391523,37.732432,0.0,0.0,0.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
2,2,-122.426002,37.792212,0.0,0.0,0.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
3,3,-122.437394,37.721412,0.0,0.0,0.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
4,4,-122.437394,37.721412,0.0,0.0,0.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0


In [17]:
testClean.columns

Index(['Id', 'X', 'Y', 'DayOfWeek_Friday', 'DayOfWeek_Monday',
       'DayOfWeek_Saturday', 'DayOfWeek_Sunday', 'DayOfWeek_Thursday',
       'DayOfWeek_Tuesday', 'DayOfWeek_Wednesday', 'year_2003', 'year_2004',
       'year_2005', 'year_2006', 'year_2007', 'year_2008', 'year_2009',
       'year_2010', 'year_2011', 'year_2012', 'year_2013', 'year_2014',
       'year_2015', 'month_1', 'month_2', 'month_3', 'month_4', 'month_5',
       'month_6', 'month_7', 'month_8', 'month_9', 'month_10', 'month_11',
       'month_12', 'PdDistrict_BAYVIEW', 'PdDistrict_CENTRAL',
       'PdDistrict_INGLESIDE', 'PdDistrict_MISSION', 'PdDistrict_NORTHERN',
       'PdDistrict_PARK', 'PdDistrict_RICHMOND', 'PdDistrict_SOUTHERN',
       'PdDistrict_TARAVAL', 'PdDistrict_TENDERLOIN', 'dayOfMonth_1',
       'dayOfMonth_2', 'dayOfMonth_3', 'dayOfMonth_4', 'dayOfMonth_5',
       'dayOfMonth_6', 'dayOfMonth_7', 'dayOfMonth_8', 'dayOfMonth_9',
       'dayOfMonth_10', 'dayOfMonth_11', 'dayOfMonth_12', 'dayOfMonth_13

## Training a logistic regression classifier

First, let's split the data into our features and our target.

In [18]:
# StratifiedKFold lives in sklearn.model_selection from scikit-learn 0.18 on; the old
# sklearn.cross_validation module was later removed, so try the new location first and
# fall back for older installs.
try:
    from sklearn.model_selection import StratifiedKFold
except ImportError:
    from sklearn.cross_validation import StratifiedKFold

# Features are every cleaned column except the encoded target.
X = trainClean.drop('Category', axis=1)
y = trainClean.Category

We'll train a logistic regression classifier and tune 'C', its regularization parameter. We use multinomial logistic regression instead of one-vs-all because the Kaggle competition is judged on log loss. Multinomial produces better calibrated probabilities than one-vs-all, and so should perform better with respect to log loss.

In [24]:
from sklearn.linear_model import SGDClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import log_loss

# GridSearchCV moved to sklearn.model_selection in scikit-learn 0.18, and the log-loss scorer
# was renamed to 'neg_log_loss' in the same release. Try the current names first and fall
# back to the old ones so the cell runs on either side of that change.
try:
    from sklearn.model_selection import GridSearchCV
    logLossScoring = 'neg_log_loss'
except ImportError:
    from sklearn.grid_search import GridSearchCV
    logLossScoring = 'log_loss'

softMax = LogisticRegression(multi_class='multinomial', solver='lbfgs', tol=1e-2, max_iter=50)
softMaxParams = {'C': [1e-1, 1, 10]}

# cv is pinned to 3 folds to match the recorded run below; the library default has changed
# between releases.
gs = GridSearchCV(estimator=softMax, param_grid=softMaxParams, cv=3, verbose=5, scoring=logLossScoring)
gs.fit(X, y)

print(gs.best_params_)


Fitting 3 folds for each of 3 candidates, totalling 9 fits
[CV] C=0.1 ...........................................................
[CV] ................................. C=0.1, score=-2.681987 - 5.4min
[CV] C=0.1 ...........................................................
[CV] ................................. C=0.1, score=-2.682590 - 4.9min
[CV] C=0.1 ...........................................................
[CV] ................................. C=0.1, score=-2.679856 - 5.1min
[CV] C=1 .............................................................
[CV] ................................... C=1, score=-2.681988 - 5.0min
[CV] C=1 .............................................................
[CV] ................................... C=1, score=-2.682591 - 4.9min
[CV] C=1 .............................................................
[CV] ................................... C=1, score=-2.679856 - 4.9min
[CV] C=10 ............................................................
[CV] .............

[Parallel(n_jobs=1)]: Done   9 out of   9 | elapsed: 46.5min finished


{'C': 0.1}


## Making predictions and creating the submission file

Below is a helper function to create a submission for any estimator that has a predict_proba method.

The trickiest part is formatting the submission. We need to use the category names as the column names, but the categories of our clean training set have already been converted to numbers. To get the names back, we read in the original names from the raw train.csv file, fit a label encoder to those names, then use that encoder to transform the numeric representations in our clean training set back into their corresponding names. Essentially, we're just undoing what we did when we cleaned the data.

In [25]:
from sklearn.preprocessing import LabelEncoder
trainCats = pd.read_csv('input/train.csv', usecols=['Category'])

def createSubmission(est, filePath):
    """Fit ``est`` on the full training set and write a submission CSV.

    Reads the module-level ``X``, ``y``, ``testClean`` and ``trainCats``.

    Parameters
    ----------
    est : estimator
        A classifier exposing ``fit``, ``predict_proba`` and ``classes_``.
    filePath : str
        Destination of the CSV file. The file has an 'Id' column followed by
        one probability column per category, named with the original
        category strings.
    """
    #because we've already selected our hyperparameters, we can now fit our estimator on the entire training set
    est.fit(X, y)

    # get the probabilities that a crime belongs to each category.
    testProbs = est.predict_proba(testClean.drop('Id', axis=1))

    #get the category names for the header of the submission file.
    # predict_proba columns follow est.classes_, so the names must be derived from that
    # same sequence; the order in which categories first appear in the training data is
    # generally different and would mislabel the columns.
    labelEncoder = LabelEncoder().fit(trainCats.Category)
    catLabels = labelEncoder.inverse_transform(est.classes_)

    subDf = pd.DataFrame(testProbs, columns=catLabels)
    # Attach the ids by position: row i of testProbs belongs to row i of testClean,
    # whatever index testClean happens to carry.
    subDf.insert(0, 'Id', testClean['Id'].values)
    subDf.to_csv(filePath, index=False)
    print('created {}'.format(filePath))


Now let's use our function to create our submission!

In [26]:
createSubmission(gs.best_estimator_, 'my_submission.csv')

created my_submission.csv
