- Dates - timestamp of the crime incident
- Category - category of the crime incident (only in train.csv). This is the target variable you are going to predict.
- Descript - detailed description of the crime incident (only in train.csv)
- DayOfWeek - the day of the week
- PdDistrict - name of the Police Department District
- Resolution - how the crime incident was resolved (only in train.csv)
- Address - the approximate street address of the crime incident 
- X - Longitude
- Y - Latitude

In [1]:
import os
os.environ["CUDA_VISIBLE_DEVICES"] = "0"
import numpy as np
import pandas as pd
pd.set_option("display.max_columns", None)
import matplotlib as plt
%matplotlib inline
from sklearn.preprocessing import LabelEncoder, LabelBinarizer, OrdinalEncoder, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.base import clone
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import log_loss
import lightgbm as lgb
import catboost

In [2]:
X_train = pd.read_csv("data/train.csv")
X_test = pd.read_csv("data/test.csv")

In [3]:
X_train[:5]

Unnamed: 0,Dates,Category,Descript,DayOfWeek,PdDistrict,Resolution,Address,X,Y
0,2015-05-13 23:53:00,WARRANTS,WARRANT ARREST,Wednesday,NORTHERN,"ARREST, BOOKED",OAK ST / LAGUNA ST,-122.425892,37.774599
1,2015-05-13 23:53:00,OTHER OFFENSES,TRAFFIC VIOLATION ARREST,Wednesday,NORTHERN,"ARREST, BOOKED",OAK ST / LAGUNA ST,-122.425892,37.774599
2,2015-05-13 23:33:00,OTHER OFFENSES,TRAFFIC VIOLATION ARREST,Wednesday,NORTHERN,"ARREST, BOOKED",VANNESS AV / GREENWICH ST,-122.424363,37.800414
3,2015-05-13 23:30:00,LARCENY/THEFT,GRAND THEFT FROM LOCKED AUTO,Wednesday,NORTHERN,NONE,1500 Block of LOMBARD ST,-122.426995,37.800873
4,2015-05-13 23:30:00,LARCENY/THEFT,GRAND THEFT FROM LOCKED AUTO,Wednesday,PARK,NONE,100 Block of BRODERICK ST,-122.438738,37.771541


In [4]:
X_test[:5]

Unnamed: 0,Id,Dates,DayOfWeek,PdDistrict,Address,X,Y
0,0,2015-05-10 23:59:00,Sunday,BAYVIEW,2000 Block of THOMAS AV,-122.399588,37.735051
1,1,2015-05-10 23:51:00,Sunday,BAYVIEW,3RD ST / REVERE AV,-122.391523,37.732432
2,2,2015-05-10 23:50:00,Sunday,NORTHERN,2000 Block of GOUGH ST,-122.426002,37.792212
3,3,2015-05-10 23:45:00,Sunday,INGLESIDE,4700 Block of MISSION ST,-122.437394,37.721412
4,4,2015-05-10 23:45:00,Sunday,INGLESIDE,4700 Block of MISSION ST,-122.437394,37.721412


In [5]:
y_train = X_train['Category']
X_train_description = X_train['Descript']
X_train_resolution = X_train['Resolution']
X_train.drop(["Category", "Descript", "Resolution"], axis=1, inplace=True)

In [6]:
test_ID = X_test["Id"]
X_test.drop("Id", axis=1, inplace=True)

In [7]:
X_train[:5]

Unnamed: 0,Dates,DayOfWeek,PdDistrict,Address,X,Y
0,2015-05-13 23:53:00,Wednesday,NORTHERN,OAK ST / LAGUNA ST,-122.425892,37.774599
1,2015-05-13 23:53:00,Wednesday,NORTHERN,OAK ST / LAGUNA ST,-122.425892,37.774599
2,2015-05-13 23:33:00,Wednesday,NORTHERN,VANNESS AV / GREENWICH ST,-122.424363,37.800414
3,2015-05-13 23:30:00,Wednesday,NORTHERN,1500 Block of LOMBARD ST,-122.426995,37.800873
4,2015-05-13 23:30:00,Wednesday,PARK,100 Block of BRODERICK ST,-122.438738,37.771541


In [8]:
X_test[:5]

Unnamed: 0,Dates,DayOfWeek,PdDistrict,Address,X,Y
0,2015-05-10 23:59:00,Sunday,BAYVIEW,2000 Block of THOMAS AV,-122.399588,37.735051
1,2015-05-10 23:51:00,Sunday,BAYVIEW,3RD ST / REVERE AV,-122.391523,37.732432
2,2015-05-10 23:50:00,Sunday,NORTHERN,2000 Block of GOUGH ST,-122.426002,37.792212
3,2015-05-10 23:45:00,Sunday,INGLESIDE,4700 Block of MISSION ST,-122.437394,37.721412
4,2015-05-10 23:45:00,Sunday,INGLESIDE,4700 Block of MISSION ST,-122.437394,37.721412


In [9]:
X_train.shape

(878049, 6)

In [10]:
X_test.shape

(884262, 6)

In [11]:
y_train.value_counts()

LARCENY/THEFT                  174900
OTHER OFFENSES                 126182
NON-CRIMINAL                    92304
ASSAULT                         76876
DRUG/NARCOTIC                   53971
VEHICLE THEFT                   53781
VANDALISM                       44725
WARRANTS                        42214
BURGLARY                        36755
SUSPICIOUS OCC                  31414
MISSING PERSON                  25989
ROBBERY                         23000
FRAUD                           16679
FORGERY/COUNTERFEITING          10609
SECONDARY CODES                  9985
WEAPON LAWS                      8555
PROSTITUTION                     7484
TRESPASS                         7326
STOLEN PROPERTY                  4540
SEX OFFENSES FORCIBLE            4388
DISORDERLY CONDUCT               4320
DRUNKENNESS                      4280
RECOVERED VEHICLE                3138
KIDNAPPING                       2341
DRIVING UNDER THE INFLUENCE      2268
RUNAWAY                          1946
LIQUOR LAWS 

In [12]:
le = LabelEncoder()
y_train = le.fit_transform(y_train)
print(le.classes_)

['ARSON' 'ASSAULT' 'BAD CHECKS' 'BRIBERY' 'BURGLARY' 'DISORDERLY CONDUCT'
 'DRIVING UNDER THE INFLUENCE' 'DRUG/NARCOTIC' 'DRUNKENNESS'
 'EMBEZZLEMENT' 'EXTORTION' 'FAMILY OFFENSES' 'FORGERY/COUNTERFEITING'
 'FRAUD' 'GAMBLING' 'KIDNAPPING' 'LARCENY/THEFT' 'LIQUOR LAWS' 'LOITERING'
 'MISSING PERSON' 'NON-CRIMINAL' 'OTHER OFFENSES'
 'PORNOGRAPHY/OBSCENE MAT' 'PROSTITUTION' 'RECOVERED VEHICLE' 'ROBBERY'
 'RUNAWAY' 'SECONDARY CODES' 'SEX OFFENSES FORCIBLE'
 'SEX OFFENSES NON FORCIBLE' 'STOLEN PROPERTY' 'SUICIDE' 'SUSPICIOUS OCC'
 'TREA' 'TRESPASS' 'VANDALISM' 'VEHICLE THEFT' 'WARRANTS' 'WEAPON LAWS']


In [13]:
num_train = X_train.shape[0]
all_data = pd.concat((X_train, X_test), ignore_index=True)

In [14]:
date = pd.DatetimeIndex(all_data['Dates'])
all_data['year'] = date.year
all_data['month'] = date.month
all_data['day'] = date.day
all_data['hour'] = date.hour
all_data['minute'] = date.minute
# all_data['second'] = date.second  # all zero
all_data.drop("Dates", axis=1, inplace=True)

In [15]:
all_data["DayOfWeek"].value_counts()

Friday       268437
Wednesday    259610
Saturday     253848
Tuesday      251905
Thursday     251579
Monday       243810
Sunday       233122
Name: DayOfWeek, dtype: int64

In [16]:
all_data["PdDistrict"].value_counts()

SOUTHERN      314638
MISSION       240357
NORTHERN      212313
BAYVIEW       179022
CENTRAL       171590
TENDERLOIN    163556
INGLESIDE     158929
TARAVAL       132213
PARK           99512
RICHMOND       90181
Name: PdDistrict, dtype: int64

In [17]:
all_data['block'] = all_data["Address"].str.contains("block", case=False)
all_data.drop("Address", axis=1, inplace=True)

In [18]:
# categorical_features = ["DayOfWeek", "PdDistrict"]
# all_data = pd.get_dummies(all_data, columns=categorical_features)

In [19]:
# X_train = all_data[:num_train].values
# X_test = all_data[num_train:].values

In [20]:
categorical_features = ["DayOfWeek", "PdDistrict", "block"]
ct = ColumnTransformer(transformers=[("categorical_features", OrdinalEncoder(), categorical_features)],
                       remainder="passthrough")
all_data = ct.fit_transform(all_data)

In [21]:
X_train = all_data[:num_train]
X_test = all_data[num_train:]

In [22]:
def cross_val_score_prod(clf, X, y):
    scores = []
    cv = StratifiedKFold(n_splits=3, shuffle=True, random_state=0)
    for train_index, test_index in cv.split(X, y):
        est = clone(clf)
        est.fit(X[train_index], y[train_index],
                cat_features=np.arange(len(categorical_features)))
        prob = est.predict_proba(X[test_index])
        scores.append(log_loss(y[test_index], prob))
    return scores

In [23]:
clf = catboost.CatBoostClassifier(n_estimators=10, random_seed=0, task_type="GPU")

In [24]:
scores = cross_val_score_prod(clf, X_train, y_train)
print(np.mean(scores), np.std(scores))



0:	learn: 3.5472508	total: 315ms	remaining: 2.83s
1:	learn: 3.4577693	total: 747ms	remaining: 2.99s
2:	learn: 3.3840382	total: 1.02s	remaining: 2.38s
3:	learn: 3.3208290	total: 1.47s	remaining: 2.2s
4:	learn: 3.2656582	total: 1.75s	remaining: 1.75s
5:	learn: 3.2166879	total: 2.2s	remaining: 1.47s
6:	learn: 3.1725448	total: 2.47s	remaining: 1.06s
7:	learn: 3.1322111	total: 2.84s	remaining: 709ms
8:	learn: 3.0955264	total: 3.2s	remaining: 355ms
9:	learn: 3.0607752	total: 3.6s	remaining: 0us




0:	learn: 3.5473783	total: 348ms	remaining: 3.13s
1:	learn: 3.4582504	total: 714ms	remaining: 2.85s
2:	learn: 3.3844607	total: 1.07s	remaining: 2.49s
3:	learn: 3.3211862	total: 1.43s	remaining: 2.14s
4:	learn: 3.2657183	total: 1.8s	remaining: 1.8s
5:	learn: 3.2166462	total: 2.15s	remaining: 1.43s
6:	learn: 3.1724061	total: 2.54s	remaining: 1.09s
7:	learn: 3.1321073	total: 3s	remaining: 751ms
8:	learn: 3.0953511	total: 3.37s	remaining: 375ms
9:	learn: 3.0608045	total: 3.77s	remaining: 0us




0:	learn: 3.5472817	total: 348ms	remaining: 3.13s
1:	learn: 3.4586954	total: 688ms	remaining: 2.75s
2:	learn: 3.3850097	total: 1.06s	remaining: 2.47s
3:	learn: 3.3216266	total: 1.4s	remaining: 2.1s
4:	learn: 3.2657381	total: 1.76s	remaining: 1.76s
5:	learn: 3.2166917	total: 2.11s	remaining: 1.4s
6:	learn: 3.1722469	total: 2.5s	remaining: 1.07s
7:	learn: 3.1317460	total: 2.84s	remaining: 710ms
8:	learn: 3.0947664	total: 3.21s	remaining: 357ms
9:	learn: 3.0602856	total: 3.56s	remaining: 0us
3.0609510885266817 0.0006858602100685234


In [25]:
clf = catboost.CatBoostClassifier(random_seed=0, task_type="GPU", verbose=50)

In [26]:
scores = cross_val_score_prod(clf, X_train, y_train)
print(np.mean(scores), np.std(scores))



0:	learn: 3.5472512	total: 348ms	remaining: 5m 47s
50:	learn: 2.5326386	total: 18.4s	remaining: 5m 42s
100:	learn: 2.4225850	total: 36.1s	remaining: 5m 21s
150:	learn: 2.3880597	total: 53.9s	remaining: 5m 3s
200:	learn: 2.3693517	total: 1m 11s	remaining: 4m 45s
250:	learn: 2.3553229	total: 1m 29s	remaining: 4m 26s
300:	learn: 2.3447621	total: 1m 47s	remaining: 4m 8s
350:	learn: 2.3358546	total: 2m 5s	remaining: 3m 51s
400:	learn: 2.3278483	total: 2m 22s	remaining: 3m 33s
450:	learn: 2.3205262	total: 2m 40s	remaining: 3m 15s
500:	learn: 2.3142586	total: 2m 58s	remaining: 2m 57s
550:	learn: 2.3084099	total: 3m 15s	remaining: 2m 39s
600:	learn: 2.3029638	total: 3m 33s	remaining: 2m 21s
650:	learn: 2.2976602	total: 3m 51s	remaining: 2m 3s
700:	learn: 2.2927163	total: 4m 9s	remaining: 1m 46s
750:	learn: 2.2884808	total: 4m 27s	remaining: 1m 28s
800:	learn: 2.2842703	total: 4m 44s	remaining: 1m 10s
850:	learn: 2.2805768	total: 5m 2s	remaining: 52.9s
900:	learn: 2.2766368	total: 5m 20s	remain



0:	learn: 3.5473781	total: 348ms	remaining: 5m 47s
50:	learn: 2.5330918	total: 18.3s	remaining: 5m 41s
100:	learn: 2.4234620	total: 36.3s	remaining: 5m 23s
150:	learn: 2.3884070	total: 54.1s	remaining: 5m 4s
200:	learn: 2.3693751	total: 1m 11s	remaining: 4m 44s
250:	learn: 2.3559739	total: 1m 29s	remaining: 4m 27s
300:	learn: 2.3448807	total: 1m 47s	remaining: 4m 9s
350:	learn: 2.3359338	total: 2m 5s	remaining: 3m 51s
400:	learn: 2.3277099	total: 2m 22s	remaining: 3m 32s
450:	learn: 2.3205328	total: 2m 40s	remaining: 3m 15s
500:	learn: 2.3139775	total: 2m 58s	remaining: 2m 57s
550:	learn: 2.3081196	total: 3m 16s	remaining: 2m 40s
600:	learn: 2.3027764	total: 3m 34s	remaining: 2m 22s
650:	learn: 2.2977990	total: 3m 51s	remaining: 2m 4s
700:	learn: 2.2931763	total: 4m 9s	remaining: 1m 46s
750:	learn: 2.2886202	total: 4m 27s	remaining: 1m 28s
800:	learn: 2.2842191	total: 4m 45s	remaining: 1m 10s
850:	learn: 2.2802395	total: 5m 2s	remaining: 53s
900:	learn: 2.2765224	total: 5m 20s	remainin



0:	learn: 3.5472817	total: 346ms	remaining: 5m 45s
50:	learn: 2.5336245	total: 18.2s	remaining: 5m 38s
100:	learn: 2.4231068	total: 36.2s	remaining: 5m 22s
150:	learn: 2.3880812	total: 54.3s	remaining: 5m 5s
200:	learn: 2.3705025	total: 1m 11s	remaining: 4m 46s
250:	learn: 2.3554121	total: 1m 29s	remaining: 4m 27s
300:	learn: 2.3451068	total: 1m 47s	remaining: 4m 10s
350:	learn: 2.3356224	total: 2m 5s	remaining: 3m 52s
400:	learn: 2.3273621	total: 2m 23s	remaining: 3m 34s
450:	learn: 2.3195365	total: 2m 41s	remaining: 3m 16s
500:	learn: 2.3131487	total: 2m 58s	remaining: 2m 58s
550:	learn: 2.3072822	total: 3m 16s	remaining: 2m 40s
600:	learn: 2.3021501	total: 3m 34s	remaining: 2m 22s
650:	learn: 2.2976191	total: 3m 52s	remaining: 2m 4s
700:	learn: 2.2929119	total: 4m 9s	remaining: 1m 46s
750:	learn: 2.2886115	total: 4m 27s	remaining: 1m 28s
800:	learn: 2.2843954	total: 4m 45s	remaining: 1m 10s
850:	learn: 2.2805133	total: 5m 3s	remaining: 53.1s
900:	learn: 2.2769479	total: 5m 20s	remai

In [27]:
clf.fit(X_train, y_train)
prob = clf.predict_proba(X_test)



0:	learn: 3.5477371	total: 180ms	remaining: 3m
50:	learn: 2.5349226	total: 9.4s	remaining: 2m 54s
100:	learn: 2.4226979	total: 18.3s	remaining: 2m 43s
150:	learn: 2.3885899	total: 27.2s	remaining: 2m 32s
200:	learn: 2.3710492	total: 36.1s	remaining: 2m 23s
250:	learn: 2.3569136	total: 45.1s	remaining: 2m 14s
300:	learn: 2.3462448	total: 53.8s	remaining: 2m 4s
350:	learn: 2.3365924	total: 1m 3s	remaining: 1m 56s
400:	learn: 2.3285360	total: 1m 12s	remaining: 1m 47s
450:	learn: 2.3217701	total: 1m 21s	remaining: 1m 39s
500:	learn: 2.3154838	total: 1m 30s	remaining: 1m 30s
550:	learn: 2.3100650	total: 1m 39s	remaining: 1m 21s
600:	learn: 2.3048074	total: 1m 48s	remaining: 1m 11s
650:	learn: 2.3000547	total: 1m 57s	remaining: 1m 2s
700:	learn: 2.2958509	total: 2m 6s	remaining: 53.9s
750:	learn: 2.2918009	total: 2m 15s	remaining: 44.9s
800:	learn: 2.2881807	total: 2m 24s	remaining: 35.9s
850:	learn: 2.2849393	total: 2m 33s	remaining: 26.9s
900:	learn: 2.2816040	total: 2m 42s	remaining: 17.9

In [28]:
%%time
submission = pd.DataFrame(np.c_[test_ID, prob], columns=["Id"] + list(le.classes_))
submission["Id"] = submission["Id"].astype(int)
submission.to_csv("submission/v4.gz", compression="gzip", index=False)

CPU times: user 2min 41s, sys: 53.7 ms, total: 2min 41s
Wall time: 2min 41s
