In [3]:
%matplotlib inline

# General Libraries
import re
import numpy as np
import matplotlib.pyplot as plt
import csv
import pandas as pd

# SK-learn libraries for learning.
from sklearn.pipeline import Pipeline
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import BernoulliNB
from sklearn.naive_bayes import MultinomialNB
from sklearn.grid_search import GridSearchCV

# SK-learn libraries for evaluation.
from sklearn.metrics import confusion_matrix
from sklearn import metrics
from sklearn.metrics import classification_report

# SK-learn libraries for feature extraction from text.
from sklearn.feature_extraction.text import *

from sklearn.ensemble import RandomForestClassifier
from sklearn.calibration import CalibratedClassifierCV
from sklearn.metrics import log_loss



In [4]:
data_orig = pd.read_csv("../train.csv")
test_orig = pd.read_csv("../test.csv")

def build_features(data):
    data['DateTime'] = pd.to_datetime(data['Dates'])
    date_vector = data['DateTime'].dt.date
    data['DateDiff'] = (date_vector - date_vector.min()) / np.timedelta64(1, 'D')
    data['Year'] = pd.DatetimeIndex(data['DateTime']).year
    data['Month'] = pd.DatetimeIndex(data['DateTime']).month
    data['Day'] = pd.DatetimeIndex(data['DateTime']).day
    data['Hour'] = pd.DatetimeIndex(data['DateTime']).hour
    data['SecondsDelta'] = (data.DateTime - pd.Timestamp('2013-01-01')) / np.timedelta64(1,'s')
    data['Weekend'] = (data.DayOfWeek == "Saturday") | (data.DayOfWeek == "Sunday")
    years = pd.get_dummies(data.Year)
    years.columns = ['2003', '2004', '2005', '2006', '2007', '2008', '2009', '2010', '2011', '2012', '2013', '2014', '2015']
    months = pd.get_dummies(data.Month)
    months.columns = ['Jan', 'Feb', 'Mar', 'Apr', 'May', 'Jun', 'Jul', 'Aug', 'Sep', 'Oct', 'Nov', 'Dec']
    days = pd.get_dummies(data.Day)
    days.columns = ['1', '2', '3', '4', '5', '6', '7', '8', '9', '10', '11', '12', '13', '14', '15', '16', '17', '18', '19', '20', '21', '22', '23', '24', '25', '26', '27', '28', '29', '30', '31']
    daysofweek = pd.get_dummies(data.DayOfWeek)
    hours = pd.get_dummies(data.Hour)
    hours.columns = ['12AM', '1AM', '2AM', '3AM', '4AM', '5AM',
                     '6AM', '7AM', '8AM', '9AM', '10AM', '11AM',
                     '12PM', '1PM', '2PM', '3PM', '4PM', '5PM',
                     '6PM', '7PM', '8PM', '9PM', '10PM', '11PM']
    districts = pd.get_dummies(data.PdDistrict)
    new_data = pd.concat([data, years, months, days, daysofweek, hours, districts], axis=1)
    return new_data


data = build_features(data_orig)
test = build_features(test_orig)

In [5]:
# Generate location-based dummies
XR3 = data['X'].round(decimals=3).apply(str)
YR3 = data['Y'].round(decimals=3).apply(str)
data_XR3s = pd.get_dummies(XR3)
data_YR3s = pd.get_dummies(YR3)    
XR3 = test['X'].round(decimals=3).apply(str)
YR3 = test['Y'].round(decimals=3).apply(str)
test_XR3s = pd.get_dummies(XR3)
test_YR3s = pd.get_dummies(YR3)    

#Subset the test to only include features that exist in training set
test_XR3s = test_XR3s[list(data_XR3s)]
test_YR3s = test_YR3s[list(data_YR3s)]

print data_XR3s.shape
print data_YR3s.shape
print
print test_XR3s.shape
print test_YR3s.shape

(878049, 147)
(878049, 107)

(884262, 147)
(884262, 107)


In [6]:
# Separate labels
train_labels = data.Category

# Create integer labels
panda_labels = pd.Categorical(data.Category).codes

train_labels_int = np.array(panda_labels).astype(np.int32)

# Drop Category, Descript and Resolution columns since they are not in the test set.
# Drop non-numerics too - they are accounted for as dummy variables.
train_data = data.drop(['Category', 'Descript', 'Resolution', 'DateTime', 'Dates', 'PdDistrict', 'Address', 'DayOfWeek'], axis=1)
train_data.Weekend = train_data.Weekend * 1
train_names = train_data.columns.values.tolist()

test_data = test.drop(['DateTime', 'Dates', 'PdDistrict', 'Address', 'DayOfWeek'], axis=1)
test_data.Weekend = test_data.Weekend * 1
test_names = test_data.columns.values.tolist()

In [7]:
features = ['Jan','Friday', 'Monday', 'Saturday', 'Sunday', 'Thursday', 'Tuesday', 'Wednesday', '12AM', '1AM', '2AM', '3AM', '4AM', '5AM', '6AM', '7AM', '8AM', '9AM', '10AM', '11AM', '12PM', '1PM', '2PM', '3PM', '4PM', '5PM', '6PM', '7PM', '8PM', '9PM', '10PM', '11PM', 'BAYVIEW', 'CENTRAL', 'INGLESIDE', 'MISSION', 'NORTHERN', 'PARK', 'RICHMOND', 'SOUTHERN', 'TARAVAL', 'TENDERLOIN']
np_train_data = np.array(pd.concat([train_data[features], (data.Year < 2006) * 1, (data.Year < 2008) * 1, (data.Year < 2010) * 1, data_XR3s, data_YR3s], axis=1))
np_test_data = np.array(pd.concat([test_data[features], (test.Year < 2006) * 1, (test.Year < 2008) * 1, (test.Year < 2010) * 1, test_XR3s[list(data_XR3s)], test_YR3s[list(data_YR3s)]], axis=1))

In [8]:
np_train_labels = np.array(panda_labels)
print np_train_data.shape
print np_train_labels.shape

print np_train_labels[:3]

print np.unique(train_labels)


(878049L, 299L)
(878049L,)
[37 21 21]
['ARSON' 'ASSAULT' 'BAD CHECKS' 'BRIBERY' 'BURGLARY' 'DISORDERLY CONDUCT'
 'DRIVING UNDER THE INFLUENCE' 'DRUG/NARCOTIC' 'DRUNKENNESS' 'EMBEZZLEMENT'
 'EXTORTION' 'FAMILY OFFENSES' 'FORGERY/COUNTERFEITING' 'FRAUD' 'GAMBLING'
 'KIDNAPPING' 'LARCENY/THEFT' 'LIQUOR LAWS' 'LOITERING' 'MISSING PERSON'
 'NON-CRIMINAL' 'OTHER OFFENSES' 'PORNOGRAPHY/OBSCENE MAT' 'PROSTITUTION'
 'RECOVERED VEHICLE' 'ROBBERY' 'RUNAWAY' 'SECONDARY CODES'
 'SEX OFFENSES FORCIBLE' 'SEX OFFENSES NON FORCIBLE' 'STOLEN PROPERTY'
 'SUICIDE' 'SUSPICIOUS OCC' 'TREA' 'TRESPASS' 'VANDALISM' 'VEHICLE THEFT'
 'WARRANTS' 'WEAPON LAWS']


In [9]:
mini_trainX, mini_train_labels = np_train_data[:200000], np_train_labels[:200000]
mini_testX, mini_test_labels = np_train_data[200000:400000], np_train_labels[200000:400000]

In [8]:
clf = LogisticRegression(penalty='l2', C=0.01, tol=0.01)
clf.fit(mini_trainX, mini_train_labels)
print "Completed training with best C"
clf_accuracy = clf.score(mini_testX, mini_test_labels)
clf_probs = clf.predict_proba(mini_testX)
clf_log_loss = log_loss(mini_test_labels, clf_probs)
f1_score = metrics.f1_score(clf.predict(mini_testX), mini_test_labels)

print "Logistic Regression Results"
print "--------------------"
print "Accuracy: ", clf_accuracy
print "Log Loss: ", clf_log_loss
print "F1 Score: ", f1_score

Completed training with best C
Logistic Regression Results
--------------------
Accuracy:  0.22536
Log Loss:  2.59505159313
F1 Score:  0.313056360511


  sample_weight=sample_weight)
  'recall', 'true', average, warn_for)


In [32]:
lr = LogisticRegression(penalty='l2', C=0.01, tol=0.01)
lr.fit(np_train_data, np_train_labels)
print "Completed the training"

Completed the training


In [34]:
probs = lr.predict_proba(np_test_data)
print probs.shape
print probs[:1]

(884262L, 39L)
[[ 0.00564554  0.1141673   0.00107871  0.00193759  0.02753063  0.00284736
   0.00764611  0.02101522  0.00604462  0.00146333  0.00135948  0.00163274
   0.00132     0.00630307  0.00141343  0.00570524  0.17905658  0.00186494
   0.001429    0.0263378   0.07059757  0.15749684  0.00119061  0.00152417
   0.00557516  0.04296505  0.00248459  0.02178049  0.00605755  0.00134107
   0.00585947  0.00143808  0.03777207  0.00120339  0.00537556  0.09322493
   0.07210074  0.03494107  0.02127292]]


In [38]:
import gzip
with gzip.open('submission-matrix-6.csv.gz', 'wb') as f:
    out = csv.writer(f, lineterminator='\n')
    out.writerow(['Id'] + list(np.unique(train_labels)))
    
    for i, prob in enumerate(probs):
        out.writerow([i] + list(prob))
print "Job completed"    

Job completed


In [2]:
# Keras
from keras.layers.advanced_activations import PReLU
from keras.layers.core import Dense, Dropout, Activation
from keras.layers.normalization import BatchNormalization
from keras.models import Sequential
from keras.utils import np_utils
from sklearn.cross_validation import KFold
from sklearn.preprocessing import StandardScaler

In [10]:
print np_train_data.shape[1]
print len(train_labels_int)
Y = np_utils.to_categorical(np_train_labels)

299
878049


In [11]:
def get_model(input_dim, output_dim, wbit, dp, layers):
    keras = Sequential()
    keras.add(Dense(input_dim, wbit, init='glorot_uniform'))
    keras.add(PReLU((wbit,)))
    keras.add(Dropout(dp))

    for i in range(layers):
        keras.add(Dense(wbit, wbit, init='glorot_uniform'))
        keras.add(PReLU((wbit,)))
        keras.add(BatchNormalization((wbit,)))
        keras.add(Dropout(dp))

    keras.add(Dense(wbit, output_dim, init='glorot_uniform'))
    keras.add(Activation('softmax'))
    keras.compile(loss='categorical_crossentropy', optimizer='adam')
    return keras

input_dim = np_train_data.shape[1]
output_dim = 39
wbit = 64
EPOCHS = 50
RUN_FOLDS = False
BATCHES = 128
nb_folds = 4
dp = 0.5
layers = 2
kfolds = KFold(len(train_labels_int), nb_folds)
keras = get_model(input_dim, output_dim, wbit, dp, layers)

In [12]:
keras.fit(np_train_data, Y, nb_epoch=EPOCHS)

Epoch 0
Epoch 1
Epoch 2
Epoch 3
Epoch 4
Epoch 5
Epoch 6
Epoch 7
Epoch 8
Epoch 9
Epoch 10
Epoch 11
Epoch 12
Epoch 13
Epoch 14
Epoch 15
Epoch 16
Epoch 17
Epoch 18
Epoch 19
Epoch 20
Epoch 21
Epoch 22
Epoch 23
Epoch 24
Epoch 25
Epoch 26
Epoch 27
Epoch 28
Epoch 29
Epoch 30
Epoch 31
Epoch 32
Epoch 33
Epoch 34
Epoch 35
Epoch 36
Epoch 37
Epoch 38
Epoch 39
Epoch 40
Epoch 41
Epoch 42
Epoch 43
Epoch 44
Epoch 45
Epoch 46
Epoch 47
Epoch 48
Epoch 49


<keras.callbacks.History at 0x52245978>

In [13]:
keras_probs = keras.predict_proba(np_test_data, verbose=0)
print keras_probs[:1]

[[  3.25674705e-03   1.22304907e-01   1.81807564e-04   6.43533533e-04
    4.31008785e-02   2.34723054e-03   4.83023084e-03   4.01066927e-02
    4.83486479e-03   5.48736742e-04   2.89118463e-04   5.38090773e-04
    6.27333516e-03   1.11132646e-02   1.78778847e-04   4.56025970e-03
    1.27314537e-01   2.22818542e-03   3.69942682e-04   2.88985227e-02
    8.52939424e-02   1.77825962e-01   1.07319319e-05   1.52738432e-03
    6.89246199e-03   3.73732930e-02   1.47059583e-03   2.10422065e-02
    6.66893637e-03   2.07756043e-04   4.91733695e-03   6.89931348e-04
    4.14000600e-02   1.31568562e-07   5.23058896e-03   7.80024616e-02
    7.30121426e-02   3.79294526e-02   1.65849594e-02]]


In [64]:
print keras_probs[:1]
print keras_probs.shape

[[  2.46946225e-03   1.01283441e-01   6.10345719e-04   4.44019889e-04
    4.79291865e-02   4.98406825e-03   3.38991233e-03   3.68961106e-02
    5.42482574e-03   1.37701374e-03   3.39711722e-04   8.02718677e-04
    1.31758390e-02   1.74560687e-02   2.44102512e-04   3.38563075e-03
    1.34035570e-01   2.39675808e-03   1.39450970e-03   4.47952345e-02
    9.33920933e-02   1.57301968e-01   8.11747503e-05   7.30870354e-03
    4.91641267e-03   3.07563686e-02   3.56377344e-03   1.50235923e-02
    6.23720459e-03   2.68768026e-04   5.49464713e-03   7.84952416e-04
    4.02869381e-02   3.78244538e-05   8.49103600e-03   6.38030441e-02
    8.61019080e-02   4.05860824e-02   1.27289782e-02]]
(884262L, 39L)


In [14]:
import gzip
with gzip.open('submission-matrix-keras-10.csv.gz', 'wb') as f:
    out = csv.writer(f, lineterminator='\n')
    out.writerow(['Id'] + list(np.unique(train_labels)))
    
    for i, prob in enumerate(keras_probs):
        out.writerow([i] + list(prob))
print "Job completed"    

Job completed


In [69]:
keras1 = get_model(input_dim, output_dim, wbit, dp, layers)

In [70]:
mini_kerasX, mini_keras_labels = np_train_data[:200000], Y[:200000]
mini_testX, mini_test_labels = np_train_data[200000:400000], Y[200000:400000]

In [None]:

keras1.fit(mini_kerasX, mini_keras_labels, np_epochs=64)
print "Completed training with best C"
clf_accuracy = keras1.score(mini_testX, mini_test_labels)
clf_probs = keras1.predict_proba(mini_testX)
clf_log_loss = log_loss(mini_test_labels, clf_probs)
f1_score = metrics.f1_score(keras1.predict(mini_testX), mini_test_labels)

print "Keras Results"
print "--------------------"
print "Accuracy: ", clf_accuracy
print "Log Loss: ", clf_log_loss
print "F1 Score: ", f1_score

Epoch 0
Epoch 1
Epoch 2
Epoch 3
Epoch 4
Epoch 5
Epoch 6
Epoch 7
Epoch 8
Epoch 9
Epoch 10
Epoch 11
Epoch 12
Epoch 13
Epoch 14
Epoch 15
Epoch 16
Epoch 17
Epoch 18
Epoch 19
Epoch 20
Epoch 21
Epoch 22
Epoch 23
Epoch 24
Epoch 25
Epoch 26
Epoch 27
Epoch 28
Epoch 29
Epoch 30
Epoch 31
Epoch 32
Epoch 33
Epoch 34
Epoch 35
Epoch 36
Epoch 37
Epoch 38
Epoch 39
Epoch 40
Epoch 41
Epoch 42
Epoch 43
Epoch 44
Epoch 45
Epoch 46
Epoch 47
Epoch 48
Epoch 49
Epoch 50
Epoch 51
Epoch 52
Epoch 53
Epoch 54
Epoch 55
Epoch 56
Epoch 57
Epoch 58
Epoch 59
Epoch 60
Epoch 61
Epoch 62
Epoch 63
Epoch 64
Epoch 65
Epoch 66
Epoch 67
Epoch 68
Epoch 69
Epoch 70
Epoch 71
Epoch 72
Epoch 73
Epoch 74
Epoch 75