- Dates - timestamp of the crime incident
- Category - category of the crime incident (only in train.csv). This is the target variable you are going to predict.
- Descript - detailed description of the crime incident (only in train.csv)
- DayOfWeek - the day of the week
- PdDistrict - name of the Police Department District
- Resolution - how the crime incident was resolved (only in train.csv)
- Address - the approximate street address of the crime incident 
- X - Longitude
- Y - Latitude

In [1]:
import os
os.environ["CUDA_VISIBLE_DEVICES"] = "0"
import numpy as np
import pandas as pd
pd.set_option("display.max_columns", None)
import matplotlib as plt
%matplotlib inline
from sklearn.preprocessing import LabelEncoder, LabelBinarizer, OrdinalEncoder, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.base import clone
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import log_loss
import lightgbm as lgb
import catboost

In [2]:
X_train = pd.read_csv("data/train.csv")
X_test = pd.read_csv("data/test.csv")

In [3]:
X_train[:5]

Unnamed: 0,Dates,Category,Descript,DayOfWeek,PdDistrict,Resolution,Address,X,Y
0,2015-05-13 23:53:00,WARRANTS,WARRANT ARREST,Wednesday,NORTHERN,"ARREST, BOOKED",OAK ST / LAGUNA ST,-122.425892,37.774599
1,2015-05-13 23:53:00,OTHER OFFENSES,TRAFFIC VIOLATION ARREST,Wednesday,NORTHERN,"ARREST, BOOKED",OAK ST / LAGUNA ST,-122.425892,37.774599
2,2015-05-13 23:33:00,OTHER OFFENSES,TRAFFIC VIOLATION ARREST,Wednesday,NORTHERN,"ARREST, BOOKED",VANNESS AV / GREENWICH ST,-122.424363,37.800414
3,2015-05-13 23:30:00,LARCENY/THEFT,GRAND THEFT FROM LOCKED AUTO,Wednesday,NORTHERN,NONE,1500 Block of LOMBARD ST,-122.426995,37.800873
4,2015-05-13 23:30:00,LARCENY/THEFT,GRAND THEFT FROM LOCKED AUTO,Wednesday,PARK,NONE,100 Block of BRODERICK ST,-122.438738,37.771541


In [4]:
X_test[:5]

Unnamed: 0,Id,Dates,DayOfWeek,PdDistrict,Address,X,Y
0,0,2015-05-10 23:59:00,Sunday,BAYVIEW,2000 Block of THOMAS AV,-122.399588,37.735051
1,1,2015-05-10 23:51:00,Sunday,BAYVIEW,3RD ST / REVERE AV,-122.391523,37.732432
2,2,2015-05-10 23:50:00,Sunday,NORTHERN,2000 Block of GOUGH ST,-122.426002,37.792212
3,3,2015-05-10 23:45:00,Sunday,INGLESIDE,4700 Block of MISSION ST,-122.437394,37.721412
4,4,2015-05-10 23:45:00,Sunday,INGLESIDE,4700 Block of MISSION ST,-122.437394,37.721412


In [5]:
y_train = X_train['Category']
X_train_description = X_train['Descript']
X_train_resolution = X_train['Resolution']
X_train.drop(["Category", "Descript", "Resolution"], axis=1, inplace=True)

In [6]:
test_ID = X_test["Id"]
X_test.drop("Id", axis=1, inplace=True)

In [7]:
X_train[:5]

Unnamed: 0,Dates,DayOfWeek,PdDistrict,Address,X,Y
0,2015-05-13 23:53:00,Wednesday,NORTHERN,OAK ST / LAGUNA ST,-122.425892,37.774599
1,2015-05-13 23:53:00,Wednesday,NORTHERN,OAK ST / LAGUNA ST,-122.425892,37.774599
2,2015-05-13 23:33:00,Wednesday,NORTHERN,VANNESS AV / GREENWICH ST,-122.424363,37.800414
3,2015-05-13 23:30:00,Wednesday,NORTHERN,1500 Block of LOMBARD ST,-122.426995,37.800873
4,2015-05-13 23:30:00,Wednesday,PARK,100 Block of BRODERICK ST,-122.438738,37.771541


In [8]:
X_test[:5]

Unnamed: 0,Dates,DayOfWeek,PdDistrict,Address,X,Y
0,2015-05-10 23:59:00,Sunday,BAYVIEW,2000 Block of THOMAS AV,-122.399588,37.735051
1,2015-05-10 23:51:00,Sunday,BAYVIEW,3RD ST / REVERE AV,-122.391523,37.732432
2,2015-05-10 23:50:00,Sunday,NORTHERN,2000 Block of GOUGH ST,-122.426002,37.792212
3,2015-05-10 23:45:00,Sunday,INGLESIDE,4700 Block of MISSION ST,-122.437394,37.721412
4,2015-05-10 23:45:00,Sunday,INGLESIDE,4700 Block of MISSION ST,-122.437394,37.721412


In [9]:
X_train.shape

(878049, 6)

In [10]:
X_test.shape

(884262, 6)

In [11]:
y_train.value_counts()

LARCENY/THEFT                  174900
OTHER OFFENSES                 126182
NON-CRIMINAL                    92304
ASSAULT                         76876
DRUG/NARCOTIC                   53971
VEHICLE THEFT                   53781
VANDALISM                       44725
WARRANTS                        42214
BURGLARY                        36755
SUSPICIOUS OCC                  31414
MISSING PERSON                  25989
ROBBERY                         23000
FRAUD                           16679
FORGERY/COUNTERFEITING          10609
SECONDARY CODES                  9985
WEAPON LAWS                      8555
PROSTITUTION                     7484
TRESPASS                         7326
STOLEN PROPERTY                  4540
SEX OFFENSES FORCIBLE            4388
DISORDERLY CONDUCT               4320
DRUNKENNESS                      4280
RECOVERED VEHICLE                3138
KIDNAPPING                       2341
DRIVING UNDER THE INFLUENCE      2268
RUNAWAY                          1946
LIQUOR LAWS 

In [12]:
le = LabelEncoder()
y_train = le.fit_transform(y_train)
print(le.classes_)

['ARSON' 'ASSAULT' 'BAD CHECKS' 'BRIBERY' 'BURGLARY' 'DISORDERLY CONDUCT'
 'DRIVING UNDER THE INFLUENCE' 'DRUG/NARCOTIC' 'DRUNKENNESS'
 'EMBEZZLEMENT' 'EXTORTION' 'FAMILY OFFENSES' 'FORGERY/COUNTERFEITING'
 'FRAUD' 'GAMBLING' 'KIDNAPPING' 'LARCENY/THEFT' 'LIQUOR LAWS' 'LOITERING'
 'MISSING PERSON' 'NON-CRIMINAL' 'OTHER OFFENSES'
 'PORNOGRAPHY/OBSCENE MAT' 'PROSTITUTION' 'RECOVERED VEHICLE' 'ROBBERY'
 'RUNAWAY' 'SECONDARY CODES' 'SEX OFFENSES FORCIBLE'
 'SEX OFFENSES NON FORCIBLE' 'STOLEN PROPERTY' 'SUICIDE' 'SUSPICIOUS OCC'
 'TREA' 'TRESPASS' 'VANDALISM' 'VEHICLE THEFT' 'WARRANTS' 'WEAPON LAWS']


In [13]:
num_train = X_train.shape[0]
all_data = pd.concat((X_train, X_test), ignore_index=True)

In [14]:
date = pd.DatetimeIndex(all_data['Dates'])
all_data['year'] = date.year
all_data['month'] = date.month
all_data['day'] = date.day
all_data['hour'] = date.hour
all_data['minute'] = date.minute
# all_data['second'] = date.second  # all zero
all_data.drop("Dates", axis=1, inplace=True)

In [15]:
all_data["DayOfWeek"].value_counts()

Friday       268437
Wednesday    259610
Saturday     253848
Tuesday      251905
Thursday     251579
Monday       243810
Sunday       233122
Name: DayOfWeek, dtype: int64

In [16]:
all_data["PdDistrict"].value_counts()

SOUTHERN      314638
MISSION       240357
NORTHERN      212313
BAYVIEW       179022
CENTRAL       171590
TENDERLOIN    163556
INGLESIDE     158929
TARAVAL       132213
PARK           99512
RICHMOND       90181
Name: PdDistrict, dtype: int64

In [17]:
all_data['block'] = all_data["Address"].str.contains("block", case=False)
all_data.drop("Address", axis=1, inplace=True)

In [18]:
# extra features, provide another way to scan through the map
all_data["X+Y"] = all_data["X"] + all_data["Y"]
all_data["X-Y"] = all_data["X"] - all_data["Y"]

In [21]:
categorical_features = ["DayOfWeek", "PdDistrict", "block"]
ct = ColumnTransformer(transformers=[("categorical_features", OrdinalEncoder(), categorical_features)],
                       remainder="passthrough")
all_data = ct.fit_transform(all_data)

In [22]:
X_train = all_data[:num_train]
X_test = all_data[num_train:]

In [23]:
def cross_val_score_prod(clf, X, y):
    scores = []
    cv = StratifiedKFold(n_splits=3, shuffle=True, random_state=0)
    for train_index, test_index in cv.split(X, y):
        est = clone(clf)
        est.fit(X[train_index], y[train_index],
                cat_features=np.arange(len(categorical_features)))
        prob = est.predict_proba(X[test_index])
        scores.append(log_loss(y[test_index], prob))
    return scores

In [24]:
clf = catboost.CatBoostClassifier(n_estimators=10, random_seed=0, task_type="GPU")

In [25]:
scores = cross_val_score_prod(clf, X_train, y_train)
print(np.mean(scores), np.std(scores))



0:	learn: 3.5469196	total: 374ms	remaining: 3.37s
1:	learn: 3.4564224	total: 785ms	remaining: 3.14s
2:	learn: 3.3824846	total: 1.02s	remaining: 2.38s
3:	learn: 3.3187132	total: 1.42s	remaining: 2.13s
4:	learn: 3.2627582	total: 1.67s	remaining: 1.67s
5:	learn: 3.2128743	total: 2.07s	remaining: 1.38s
6:	learn: 3.1685349	total: 2.31s	remaining: 992ms
7:	learn: 3.1283301	total: 2.73s	remaining: 683ms
8:	learn: 3.0912584	total: 2.98s	remaining: 331ms
9:	learn: 3.0574729	total: 3.38s	remaining: 0us




0:	learn: 3.5472231	total: 278ms	remaining: 2.5s
1:	learn: 3.4576149	total: 663ms	remaining: 2.65s
2:	learn: 3.3836994	total: 985ms	remaining: 2.3s
3:	learn: 3.3198007	total: 1.32s	remaining: 1.98s
4:	learn: 3.2636399	total: 1.61s	remaining: 1.61s
5:	learn: 3.2137461	total: 1.97s	remaining: 1.31s
6:	learn: 3.1693159	total: 2.28s	remaining: 978ms
7:	learn: 3.1287643	total: 2.62s	remaining: 655ms
8:	learn: 3.0917071	total: 2.97s	remaining: 330ms
9:	learn: 3.0574393	total: 3.27s	remaining: 0us




0:	learn: 3.5460966	total: 334ms	remaining: 3s
1:	learn: 3.4565618	total: 623ms	remaining: 2.49s
2:	learn: 3.3827121	total: 976ms	remaining: 2.28s
3:	learn: 3.3190880	total: 1.32s	remaining: 1.97s
4:	learn: 3.2630475	total: 1.63s	remaining: 1.63s
5:	learn: 3.2132390	total: 1.92s	remaining: 1.28s
6:	learn: 3.1688151	total: 2.27s	remaining: 974ms
7:	learn: 3.1282253	total: 2.59s	remaining: 648ms
8:	learn: 3.0911504	total: 2.92s	remaining: 325ms
9:	learn: 3.0572534	total: 3.25s	remaining: 0us
3.0578852209954897 0.0006017723480445758


In [26]:
clf = catboost.CatBoostClassifier(random_seed=0, task_type="GPU", verbose=50)

In [27]:
scores = cross_val_score_prod(clf, X_train, y_train)
print(np.mean(scores), np.std(scores))



0:	learn: 3.5469200	total: 304ms	remaining: 5m 4s
50:	learn: 2.5269918	total: 16.6s	remaining: 5m 9s
100:	learn: 2.4142244	total: 32.5s	remaining: 4m 49s
150:	learn: 2.3804444	total: 48.5s	remaining: 4m 32s
200:	learn: 2.3614340	total: 1m 4s	remaining: 4m 16s
250:	learn: 2.3476762	total: 1m 20s	remaining: 3m 59s
300:	learn: 2.3370306	total: 1m 36s	remaining: 3m 43s
350:	learn: 2.3279499	total: 1m 52s	remaining: 3m 27s
400:	learn: 2.3199761	total: 2m 8s	remaining: 3m 11s
450:	learn: 2.3128167	total: 2m 24s	remaining: 2m 55s
500:	learn: 2.3064476	total: 2m 40s	remaining: 2m 39s
550:	learn: 2.3006904	total: 2m 56s	remaining: 2m 23s
600:	learn: 2.2953235	total: 3m 11s	remaining: 2m 7s
650:	learn: 2.2900916	total: 3m 27s	remaining: 1m 51s
700:	learn: 2.2855155	total: 3m 43s	remaining: 1m 35s
750:	learn: 2.2811066	total: 3m 59s	remaining: 1m 19s
800:	learn: 2.2769834	total: 4m 15s	remaining: 1m 3s
850:	learn: 2.2728039	total: 4m 31s	remaining: 47.5s
900:	learn: 2.2691680	total: 4m 47s	remain



0:	learn: 3.5472224	total: 365ms	remaining: 6m 4s
50:	learn: 2.5274248	total: 16.7s	remaining: 5m 11s
100:	learn: 2.4139847	total: 32.7s	remaining: 4m 51s
150:	learn: 2.3799262	total: 48.8s	remaining: 4m 34s
200:	learn: 2.3618390	total: 1m 4s	remaining: 4m 18s
250:	learn: 2.3481816	total: 1m 20s	remaining: 4m 1s
300:	learn: 2.3374956	total: 1m 36s	remaining: 3m 44s
350:	learn: 2.3281474	total: 1m 52s	remaining: 3m 28s
400:	learn: 2.3199321	total: 2m 9s	remaining: 3m 12s
450:	learn: 2.3126136	total: 2m 25s	remaining: 2m 56s
500:	learn: 2.3061203	total: 2m 41s	remaining: 2m 40s
550:	learn: 2.3002391	total: 2m 57s	remaining: 2m 24s
600:	learn: 2.2949899	total: 3m 13s	remaining: 2m 8s
650:	learn: 2.2901831	total: 3m 29s	remaining: 1m 52s
700:	learn: 2.2853045	total: 3m 45s	remaining: 1m 36s
750:	learn: 2.2812037	total: 4m 1s	remaining: 1m 20s
800:	learn: 2.2772243	total: 4m 18s	remaining: 1m 4s
850:	learn: 2.2731013	total: 4m 34s	remaining: 48s
900:	learn: 2.2694837	total: 4m 50s	remaining



0:	learn: 3.5460968	total: 269ms	remaining: 4m 28s
50:	learn: 2.5274804	total: 16.9s	remaining: 5m 14s
100:	learn: 2.4141118	total: 33.2s	remaining: 4m 55s
150:	learn: 2.3811484	total: 49.4s	remaining: 4m 37s
200:	learn: 2.3622273	total: 1m 5s	remaining: 4m 20s
250:	learn: 2.3477803	total: 1m 22s	remaining: 4m 4s
300:	learn: 2.3374665	total: 1m 38s	remaining: 3m 48s
350:	learn: 2.3275925	total: 1m 55s	remaining: 3m 32s
400:	learn: 2.3201781	total: 2m 11s	remaining: 3m 15s
450:	learn: 2.3126493	total: 2m 27s	remaining: 2m 59s
500:	learn: 2.3060333	total: 2m 43s	remaining: 2m 43s
550:	learn: 2.3001153	total: 2m 59s	remaining: 2m 26s
600:	learn: 2.2948130	total: 3m 16s	remaining: 2m 10s
650:	learn: 2.2899824	total: 3m 32s	remaining: 1m 53s
700:	learn: 2.2852320	total: 3m 48s	remaining: 1m 37s
750:	learn: 2.2810375	total: 4m 4s	remaining: 1m 21s
800:	learn: 2.2767373	total: 4m 21s	remaining: 1m 4s
850:	learn: 2.2727963	total: 4m 37s	remaining: 48.6s
900:	learn: 2.2690991	total: 4m 53s	rema

In [28]:
clf.fit(X_train, y_train)
prob = clf.predict_proba(X_test)



0:	learn: 3.5465062	total: 211ms	remaining: 3m 30s
50:	learn: 2.5303235	total: 8.66s	remaining: 2m 41s
100:	learn: 2.4157556	total: 16.8s	remaining: 2m 29s
150:	learn: 2.3824566	total: 24.9s	remaining: 2m 20s
200:	learn: 2.3641908	total: 33.2s	remaining: 2m 12s
250:	learn: 2.3506671	total: 41.3s	remaining: 2m 3s
300:	learn: 2.3396218	total: 49.6s	remaining: 1m 55s
350:	learn: 2.3302088	total: 58s	remaining: 1m 47s
400:	learn: 2.3227237	total: 1m 6s	remaining: 1m 38s
450:	learn: 2.3158979	total: 1m 14s	remaining: 1m 30s
500:	learn: 2.3094420	total: 1m 22s	remaining: 1m 22s
550:	learn: 2.3038824	total: 1m 30s	remaining: 1m 14s
600:	learn: 2.2986250	total: 1m 39s	remaining: 1m 5s
650:	learn: 2.2940739	total: 1m 47s	remaining: 57.4s
700:	learn: 2.2897657	total: 1m 55s	remaining: 49.2s
750:	learn: 2.2857990	total: 2m 3s	remaining: 41s
800:	learn: 2.2816833	total: 2m 11s	remaining: 32.8s
850:	learn: 2.2781491	total: 2m 20s	remaining: 24.6s
900:	learn: 2.2745018	total: 2m 28s	remaining: 16.3s

In [29]:
%%time
submission = pd.DataFrame(np.c_[test_ID, prob], columns=["Id"] + list(le.classes_))
submission["Id"] = submission["Id"].astype(int)
submission.to_csv("submission/v1.gz", compression="gzip", index=False)

CPU times: user 2min 40s, sys: 0 ns, total: 2min 40s
Wall time: 2min 40s
