In [1]:
import math
import zipfile

import numpy as np
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import log_loss
try:
    # scikit-learn >= 0.18: the splitters live in model_selection
    # (the cross_validation module was removed in 0.20).
    from sklearn.model_selection import train_test_split, KFold, StratifiedKFold
except ImportError:
    # Legacy fallback for very old scikit-learn installs.
    from sklearn.cross_validation import train_test_split, KFold, StratifiedKFold
from sklearn.naive_bayes import BernoulliNB
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import LabelEncoder, StandardScaler

In [2]:
# Read the training and test CSVs, parsing the 'Dates' column as datetimes.
train, test = (
    pd.read_csv(csv_name, parse_dates=['Dates'])
    for csv_name in ('train.csv', 'test.csv')
)

## Checking for missing values

In [3]:
# Number of missing values in each column of the test set.
pd.isnull(test).sum()

Id            0
Dates         0
DayOfWeek     0
PdDistrict    0
Address       0
X             0
Y             0
dtype: int64

In [4]:
# Preview the first five training rows.
train.head(n=5)

Unnamed: 0,Dates,Category,Descript,DayOfWeek,PdDistrict,Resolution,Address,X,Y
0,2015-05-13 23:53:00,WARRANTS,WARRANT ARREST,Wednesday,NORTHERN,"ARREST, BOOKED",OAK ST / LAGUNA ST,-122.425892,37.774599
1,2015-05-13 23:53:00,OTHER OFFENSES,TRAFFIC VIOLATION ARREST,Wednesday,NORTHERN,"ARREST, BOOKED",OAK ST / LAGUNA ST,-122.425892,37.774599
2,2015-05-13 23:33:00,OTHER OFFENSES,TRAFFIC VIOLATION ARREST,Wednesday,NORTHERN,"ARREST, BOOKED",VANNESS AV / GREENWICH ST,-122.424363,37.800414
3,2015-05-13 23:30:00,LARCENY/THEFT,GRAND THEFT FROM LOCKED AUTO,Wednesday,NORTHERN,NONE,1500 Block of LOMBARD ST,-122.426995,37.800873
4,2015-05-13 23:30:00,LARCENY/THEFT,GRAND THEFT FROM LOCKED AUTO,Wednesday,PARK,NONE,100 Block of BRODERICK ST,-122.438738,37.771541


In [5]:
# Distinct crime categories, in order of first appearance.
train.Category.unique()

array(['WARRANTS', 'OTHER OFFENSES', 'LARCENY/THEFT', 'VEHICLE THEFT',
       'VANDALISM', 'NON-CRIMINAL', 'ROBBERY', 'ASSAULT', 'WEAPON LAWS',
       'BURGLARY', 'SUSPICIOUS OCC', 'DRUNKENNESS',
       'FORGERY/COUNTERFEITING', 'DRUG/NARCOTIC', 'STOLEN PROPERTY',
       'SECONDARY CODES', 'TRESPASS', 'MISSING PERSON', 'FRAUD',
       'KIDNAPPING', 'RUNAWAY', 'DRIVING UNDER THE INFLUENCE',
       'SEX OFFENSES FORCIBLE', 'PROSTITUTION', 'DISORDERLY CONDUCT',
       'ARSON', 'FAMILY OFFENSES', 'LIQUOR LAWS', 'BRIBERY',
       'EMBEZZLEMENT', 'SUICIDE', 'LOITERING', 'SEX OFFENSES NON FORCIBLE',
       'EXTORTION', 'GAMBLING', 'BAD CHECKS', 'TREA', 'RECOVERED VEHICLE',
       'PORNOGRAPHY/OBSCENE MAT'], dtype=object)

## Encoding Class Labels and Features

In [6]:
# Map each crime category string to an integer class id.
le_crime = LabelEncoder()
crime = le_crime.fit_transform(train['Category'])

# One-hot encode the weekday, the police district and the hour of day.
days = pd.get_dummies(train['DayOfWeek'])
district = pd.get_dummies(train['PdDistrict'])
hour = pd.get_dummies(train['Dates'].dt.hour)

In [10]:
# Inspect the integer-encoded labels produced by le_crime.
crime

array([37, 21, 21, ..., 16, 35, 12], dtype=int64)

In [11]:
# Feature matrix: hour, weekday and district indicator columns side by side.
X = pd.concat((hour, days, district), axis='columns')
# Target vector: the integer-encoded crime category.
y = crime

In [12]:
# Build the same one-hot features for the test set.
# Separate `test_*` names are used so the training intermediates
# (`days`, `district`, `hour`) are not overwritten; re-running the cell
# that builds X therefore still produces the training matrix.
test_days = pd.get_dummies(test['DayOfWeek'])
test_district = pd.get_dummies(test['PdDistrict'])
test_hour = pd.get_dummies(test['Dates'].dt.hour)
test_data = pd.concat([test_hour, test_days, test_district], axis=1)

# get_dummies only creates columns for levels present in the frame it is
# given, so align to the training columns: any level missing from the test
# set becomes an all-zero column, any level unseen in training is dropped,
# and the column order matches what the model was fitted on.
X_test = test_data.reindex(columns=X.columns, fill_value=0)

## Partitioning the dataset into training and validation sets

In [13]:
# 70% of the rows go to training, the remainder to validation;
# the fixed random_state makes the split reproducible.
X_train, X_val, y_train, y_val = train_test_split(
    X, y, train_size=0.70, random_state=1
)

## Training

In [14]:
# Bernoulli naive Bayes on the one-hot features.
model = BernoulliNB()
model.fit(X_train, y_train)
# predict_proba already returns an ndarray; its columns follow model.classes_.
predicted = model.predict_proba(X_val)
# Pass the label set explicitly: without it log_loss infers the classes from
# y_val and raises if a rare category is absent from the validation split.
log_loss(y_val, predicted, labels=model.classes_)

2.581910011002698

In [15]:
# Logistic regression (C=0.1) on the same split, for comparison.
model = LogisticRegression(C=.1)
model.fit(X_train, y_train)
# predict_proba already returns an ndarray; its columns follow model.classes_.
predicted = model.predict_proba(X_val)
# Pass the label set explicitly: without it log_loss infers the classes from
# y_val and raises if a rare category is absent from the validation split.
log_loss(y_val, predicted, labels=model.classes_)

2.5803280468629857

## Final prediction on unseen test data

In [61]:
# Refit the naive Bayes model on the full training data and score the test set.
model = BernoulliNB()
model.fit(X, y)

predicted = model.predict_proba(X_test)

# Write results: one probability column per crime category, keyed by the
# real test-set Id values rather than the positional row number, so the
# file stays correct even if the Ids are not exactly 0..n-1.
result = pd.DataFrame(predicted, columns=le_crime.classes_, index=test['Id'])
result.to_csv('testResult.csv', index=True, index_label='Id')

In [62]:
# Read the written file back in to check its contents.
# Note: this rebinds `result` to the frame parsed from disk.
result = pd.read_csv('testResult.csv')

In [63]:
# Preview the first five rows of the reloaded results.
result.head(n=5)

Unnamed: 0,Id,ARSON,ASSAULT,BAD CHECKS,BRIBERY,BURGLARY,DISORDERLY CONDUCT,DRIVING UNDER THE INFLUENCE,DRUG/NARCOTIC,DRUNKENNESS,...,SEX OFFENSES NON FORCIBLE,STOLEN PROPERTY,SUICIDE,SUSPICIOUS OCC,TREA,TRESPASS,VANDALISM,VEHICLE THEFT,WARRANTS,WEAPON LAWS
0,0,0.006921,0.130651,2.2e-05,0.000933,0.02905,0.002047,0.006871,0.03281,0.005735,...,0.000134,0.003675,0.000175,0.029836,8.335391e-07,0.003378,0.08841,0.119041,0.039156,0.022786
1,1,0.006921,0.130651,2.2e-05,0.000933,0.02905,0.002047,0.006871,0.03281,0.005735,...,0.000134,0.003675,0.000175,0.029836,8.335391e-07,0.003378,0.08841,0.119041,0.039156,0.022786
2,2,0.001811,0.0902,3.1e-05,0.000224,0.038466,0.00374,0.008674,0.027546,0.00749,...,4.4e-05,0.005844,0.00028,0.020433,2.325302e-07,0.003543,0.074745,0.085124,0.035126,0.008132
3,3,0.003003,0.123099,2.3e-05,0.000968,0.026839,0.001756,0.008884,0.018424,0.004918,...,0.000149,0.003523,0.000359,0.026149,1.321607e-07,0.002377,0.098499,0.170349,0.024274,0.016148
4,4,0.003003,0.123099,2.3e-05,0.000968,0.026839,0.001756,0.008884,0.018424,0.004918,...,0.000149,0.003523,0.000359,0.026149,1.321607e-07,0.002377,0.098499,0.170349,0.024274,0.016148
