- Dates - timestamp of the crime incident
- Category - category of the crime incident (only in train.csv). This is the target variable you are going to predict.
- Descript - detailed description of the crime incident (only in train.csv)
- DayOfWeek - the day of the week
- PdDistrict - name of the Police Department District
- Resolution - how the crime incident was resolved (only in train.csv)
- Address - the approximate street address of the crime incident 
- X - Longitude
- Y - Latitude

In [1]:
import os
os.environ["CUDA_VISIBLE_DEVICES"] = "0"
import numpy as np
import pandas as pd
pd.set_option("display.max_columns", None)
import matplotlib.pyplot as plt
%matplotlib inline
from sklearn.preprocessing import LabelEncoder, OrdinalEncoder
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import StratifiedKFold
from sklearn.decomposition import PCA
from sklearn.mixture import GaussianMixture
import catboost
import gensim

In [2]:
# Load the raw training and test tables from the local data directory.
X_train, X_test = (pd.read_csv(path) for path in ("data/train.csv", "data/test.csv"))

In [3]:
# Peek at the first rows of the raw training table.
X_train.head(5)

Unnamed: 0,Dates,Category,Descript,DayOfWeek,PdDistrict,Resolution,Address,X,Y
0,2015-05-13 23:53:00,WARRANTS,WARRANT ARREST,Wednesday,NORTHERN,"ARREST, BOOKED",OAK ST / LAGUNA ST,-122.425892,37.774599
1,2015-05-13 23:53:00,OTHER OFFENSES,TRAFFIC VIOLATION ARREST,Wednesday,NORTHERN,"ARREST, BOOKED",OAK ST / LAGUNA ST,-122.425892,37.774599
2,2015-05-13 23:33:00,OTHER OFFENSES,TRAFFIC VIOLATION ARREST,Wednesday,NORTHERN,"ARREST, BOOKED",VANNESS AV / GREENWICH ST,-122.424363,37.800414
3,2015-05-13 23:30:00,LARCENY/THEFT,GRAND THEFT FROM LOCKED AUTO,Wednesday,NORTHERN,NONE,1500 Block of LOMBARD ST,-122.426995,37.800873
4,2015-05-13 23:30:00,LARCENY/THEFT,GRAND THEFT FROM LOCKED AUTO,Wednesday,PARK,NONE,100 Block of BRODERICK ST,-122.438738,37.771541


In [4]:
# Report how many fully identical rows exist, then keep only the first
# occurrence of each and verify that none remain.
n_duplicates = X_train.duplicated().sum()
print(n_duplicates)
X_train = X_train.drop_duplicates()
assert not X_train.duplicated().any()

2323


In [5]:
# Peek at the first rows of the raw test table.
X_test.head(5)

Unnamed: 0,Id,Dates,DayOfWeek,PdDistrict,Address,X,Y
0,0,2015-05-10 23:59:00,Sunday,BAYVIEW,2000 Block of THOMAS AV,-122.399588,37.735051
1,1,2015-05-10 23:51:00,Sunday,BAYVIEW,3RD ST / REVERE AV,-122.391523,37.732432
2,2,2015-05-10 23:50:00,Sunday,NORTHERN,2000 Block of GOUGH ST,-122.426002,37.792212
3,3,2015-05-10 23:45:00,Sunday,INGLESIDE,4700 Block of MISSION ST,-122.437394,37.721412
4,4,2015-05-10 23:45:00,Sunday,INGLESIDE,4700 Block of MISSION ST,-122.437394,37.721412


In [6]:
# Separate the target and the two train-only text columns from the features.
# `pop` returns the column and removes it from X_train in one step.
y_train = X_train.pop("Category")
X_train_description = X_train.pop("Descript")
X_train_resolution = X_train.pop("Resolution")

In [7]:
# Keep the test identifiers for the submission file and remove them from the
# feature table so train and test share the same columns.
test_ID = X_test.pop("Id")

In [8]:
# Training features after removing the target and train-only columns.
X_train.head(5)

Unnamed: 0,Dates,DayOfWeek,PdDistrict,Address,X,Y
0,2015-05-13 23:53:00,Wednesday,NORTHERN,OAK ST / LAGUNA ST,-122.425892,37.774599
1,2015-05-13 23:53:00,Wednesday,NORTHERN,OAK ST / LAGUNA ST,-122.425892,37.774599
2,2015-05-13 23:33:00,Wednesday,NORTHERN,VANNESS AV / GREENWICH ST,-122.424363,37.800414
3,2015-05-13 23:30:00,Wednesday,NORTHERN,1500 Block of LOMBARD ST,-122.426995,37.800873
4,2015-05-13 23:30:00,Wednesday,PARK,100 Block of BRODERICK ST,-122.438738,37.771541


In [9]:
# Test features after removing the Id column.
X_test.head(5)

Unnamed: 0,Dates,DayOfWeek,PdDistrict,Address,X,Y
0,2015-05-10 23:59:00,Sunday,BAYVIEW,2000 Block of THOMAS AV,-122.399588,37.735051
1,2015-05-10 23:51:00,Sunday,BAYVIEW,3RD ST / REVERE AV,-122.391523,37.732432
2,2015-05-10 23:50:00,Sunday,NORTHERN,2000 Block of GOUGH ST,-122.426002,37.792212
3,2015-05-10 23:45:00,Sunday,INGLESIDE,4700 Block of MISSION ST,-122.437394,37.721412
4,2015-05-10 23:45:00,Sunday,INGLESIDE,4700 Block of MISSION ST,-122.437394,37.721412


In [10]:
# (rows, columns) of the de-duplicated training features.
X_train.shape

(875726, 6)

In [11]:
# (rows, columns) of the test features; the column count should match X_train.
X_test.shape

(884262, 6)

In [12]:
# Class frequencies of the target, most common first.
y_train.value_counts()

LARCENY/THEFT                  174320
OTHER OFFENSES                 125960
NON-CRIMINAL                    91915
ASSAULT                         76815
DRUG/NARCOTIC                   53919
VEHICLE THEFT                   53706
VANDALISM                       44581
WARRANTS                        42145
BURGLARY                        36600
SUSPICIOUS OCC                  31394
MISSING PERSON                  25669
ROBBERY                         22988
FRAUD                           16637
FORGERY/COUNTERFEITING          10592
SECONDARY CODES                  9979
WEAPON LAWS                      8550
PROSTITUTION                     7446
TRESPASS                         7318
STOLEN PROPERTY                  4537
SEX OFFENSES FORCIBLE            4380
DISORDERLY CONDUCT               4313
DRUNKENNESS                      4277
RECOVERED VEHICLE                3132
KIDNAPPING                       2340
DRIVING UNDER THE INFLUENCE      2268
LIQUOR LAWS                      1899
RUNAWAY     

In [13]:
# Map category names to integer class ids. `le.classes_` keeps the id -> name
# mapping, which is reused later as the submission column order.
le = LabelEncoder().fit(y_train)
y_train = le.transform(y_train)
print(le.classes_)

['ARSON' 'ASSAULT' 'BAD CHECKS' 'BRIBERY' 'BURGLARY' 'DISORDERLY CONDUCT'
 'DRIVING UNDER THE INFLUENCE' 'DRUG/NARCOTIC' 'DRUNKENNESS'
 'EMBEZZLEMENT' 'EXTORTION' 'FAMILY OFFENSES' 'FORGERY/COUNTERFEITING'
 'FRAUD' 'GAMBLING' 'KIDNAPPING' 'LARCENY/THEFT' 'LIQUOR LAWS' 'LOITERING'
 'MISSING PERSON' 'NON-CRIMINAL' 'OTHER OFFENSES'
 'PORNOGRAPHY/OBSCENE MAT' 'PROSTITUTION' 'RECOVERED VEHICLE' 'ROBBERY'
 'RUNAWAY' 'SECONDARY CODES' 'SEX OFFENSES FORCIBLE'
 'SEX OFFENSES NON FORCIBLE' 'STOLEN PROPERTY' 'SUICIDE' 'SUSPICIOUS OCC'
 'TREA' 'TRESPASS' 'VANDALISM' 'VEHICLE THEFT' 'WARRANTS' 'WEAPON LAWS']


In [14]:
# Stack train on top of test so feature engineering is applied to both at once;
# num_train marks the row where the test part begins.
num_train = len(X_train)
all_data = pd.concat([X_train, X_test], ignore_index=True)

In [15]:
# Derive calendar and clock features from the raw timestamp column.
date = pd.to_datetime(all_data['Dates'])
all_data['year'] = date.dt.year
all_data['month'] = date.dt.month
all_data['day'] = date.dt.day
all_data['hour'] = date.dt.hour
all_data['minute'] = date.dt.minute
# 1 when the incident is logged exactly on the hour or half hour, else 0.
all_data['special_time'] = all_data['minute'].isin([0, 30]).astype(int)
# Seconds are not extracted: the original author noted they are all zero.
# Whole days elapsed since the earliest timestamp in the combined data. The
# vectorised `.dt.days` accessor replaces a per-row Python lambda.
all_data["n_days"] = (date - date.min()).dt.days
all_data.drop("Dates", axis=1, inplace=True)

In [16]:
# Incident counts per weekday over train and test combined.
all_data["DayOfWeek"].value_counts()

Friday       268074
Wednesday    259228
Saturday     253507
Tuesday      251543
Thursday     251298
Monday       243529
Sunday       232809
Name: DayOfWeek, dtype: int64

In [17]:
# Incident counts per police district over train and test combined.
all_data["PdDistrict"].value_counts()

SOUTHERN      313984
MISSION       240172
NORTHERN      212122
BAYVIEW       178689
CENTRAL       171397
TENDERLOIN    163389
INGLESIDE     158806
TARAVAL       132017
PARK           99360
RICHMOND       90052
Name: PdDistrict, dtype: int64

In [18]:
# Embed each address as the mean of the Word2Vec vectors of its
# space-separated tokens.
sentences = [address.split(" ") for address in all_data["Address"]]
# min_count=1 keeps every token in the vocabulary, so the lookups below never miss.
# NOTE(review): Word2Vec training with several worker threads is not
# bit-for-bit reproducible between runs; confirm whether that matters here.
address_model = gensim.models.Word2Vec(sentences, min_count=1)
# Read the embedding width from the trained model instead of hard-coding
# gensim's default of 100, so the array stays correct if the size is changed.
encoded_address = np.zeros((len(sentences), address_model.wv.vector_size))
for i, tokens in enumerate(sentences):
    # One lookup per address; sum in float64 to match encoded_address' dtype.
    token_vectors = address_model.wv[tokens]
    encoded_address[i] = token_vectors.sum(axis=0, dtype=np.float64) / len(tokens)

In [19]:
# Flag addresses that mention "block" (case-insensitive) and discard the raw
# address text, which is now represented by this flag and by encoded_address.
all_data["block"] = all_data.pop("Address").str.contains("block", case=False)

In [20]:
# Print the (min, max) range of each coordinate to spot out-of-area values.
for axis in ("X", "Y"):
    print(all_data[axis].min(), all_data[axis].max())

-122.51364206429 -120.5
37.7078790224135 90.0


In [21]:
# X >= -120.5 and Y >= 90 are the out-of-range maxima seen in the range check;
# treat them as missing and replace them with the median of the valid values.
x_valid = all_data["X"] < -120.5
y_valid = all_data["Y"] < 90
X_median = all_data.loc[x_valid, "X"].median()
Y_median = all_data.loc[y_valid, "Y"].median()
all_data.loc[all_data["X"] >= -120.5, "X"] = X_median
all_data.loc[all_data["Y"] >= 90, "Y"] = Y_median

In [22]:
# Re-check the coordinate ranges after replacing the out-of-range values.
for axis in ("X", "Y"):
    print(all_data[axis].min(), all_data[axis].max())

-122.51364206429 -122.364750704393
37.7078790224135 37.82062083807021


In [23]:
# Spatial feature engineering on longitude (X) and latitude (Y).
# Columns are created in a fixed order because later cells address the
# feature matrix by position.
lon, lat = all_data["X"], all_data["Y"]

# Diagonal projections.
all_data["X+Y"] = lon + lat
all_data["X-Y"] = lon - lat

# Coordinates expressed in axes rotated by 30 and 60 degrees.
for first_name, second_name, angle in (("XY30_1", "XY30_2", np.pi / 6),
                                       ("XY60_1", "XY60_2", np.pi / 3)):
    all_data[first_name] = lon * np.cos(angle) + lat * np.sin(angle)
    all_data[second_name] = lat * np.cos(angle) - lon * np.sin(angle)

# Squared distances to the four corners of the bounding box and to the median point.
x_lo, x_hi = lon.min(), lon.max()
y_lo, y_hi = lat.min(), lat.max()
all_data["XY1"] = (lon - x_lo) ** 2 + (lat - y_lo) ** 2
all_data["XY2"] = (x_hi - lon) ** 2 + (lat - y_lo) ** 2
all_data["XY3"] = (lon - x_lo) ** 2 + (y_hi - lat) ** 2
all_data["XY4"] = (x_hi - lon) ** 2 + (y_hi - lat) ** 2
all_data["XY5"] = (lon - X_median) ** 2 + (lat - Y_median) ** 2

# Principal-component coordinates of the (X, Y) cloud.
coords = all_data[["X", "Y"]]
pca = PCA(n_components=2).fit(coords)
XYt = pca.transform(coords)
all_data["XYpca1"], all_data["XYpca2"] = XYt[:, 0], XYt[:, 1]

# Cluster id from a diagonal-covariance Gaussian mixture over (X, Y).
# n_components selected by aic/bic
clf = GaussianMixture(n_components=150, covariance_type="diag", random_state=0).fit(coords)
all_data["XYcluster"] = clf.predict(coords)

In [24]:
# Ordinal-encode the categorical columns and pass every other column through.
# ColumnTransformer places the transformed columns first, so the categorical
# features occupy the leading len(categorical_features) positions of the output.
categorical_features = ["DayOfWeek", "PdDistrict", "block", "special_time", "XYcluster"]
encoder_step = ("categorical_features", OrdinalEncoder(), categorical_features)
ct = ColumnTransformer(transformers=[encoder_step], remainder="passthrough")
# From here on all_data is the transformer's array output, not a DataFrame.
all_data = ct.fit_transform(all_data)

In [25]:
# Append the address embedding columns to the right of the encoded features.
all_data = np.concatenate([all_data, encoded_address], axis=1)

In [26]:
# Undo the earlier stacking: the first num_train rows are train, the rest test.
X_train, X_test = all_data[:num_train], all_data[num_train:]

In [27]:
# Stratified K-fold training (seed 0): fit one CatBoost model per fold, use the
# held-out fold for early stopping, and average the test-set class
# probabilities over all folds.
prob = np.zeros((X_test.shape[0], len(le.classes_)))
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=0)
# The held-out indices are a validation fold of X_train, not rows of X_test.
for train_index, valid_index in skf.split(X_train, y_train):
    # NOTE: `clf` rebinds the name that previously held the GaussianMixture.
    # The categorical features are the leading columns of the feature matrix.
    clf = catboost.CatBoostClassifier(n_estimators=10000, learning_rate=0.05,
                                      cat_features=np.arange(len(categorical_features)),
                                      early_stopping_rounds=100, random_seed=0, task_type="GPU",
                                      devices="0", verbose=100)
    clf.fit(X_train[train_index], y_train[train_index],
            eval_set=(X_train[valid_index], y_train[valid_index]))
    prob += clf.predict_proba(X_test)
# Divide by the splitter's fold count so the average stays correct if
# n_splits is changed.
prob /= skf.get_n_splits()

0:	learn: 3.4570954	test: 3.4562086	best: 3.4562086 (0)	total: 315ms	remaining: 52m 25s
100:	learn: 2.3449752	test: 2.3517079	best: 2.3517079 (100)	total: 28.5s	remaining: 46m 36s
200:	learn: 2.3000190	test: 2.3138448	best: 2.3138448 (200)	total: 56.5s	remaining: 45m 52s
300:	learn: 2.2740477	test: 2.2941371	best: 2.2941371 (300)	total: 1m 24s	remaining: 45m 31s
400:	learn: 2.2537477	test: 2.2797339	best: 2.2797339 (400)	total: 1m 53s	remaining: 45m 9s
500:	learn: 2.2365541	test: 2.2686504	best: 2.2686504 (500)	total: 2m 21s	remaining: 44m 47s
600:	learn: 2.2227056	test: 2.2605330	best: 2.2605330 (600)	total: 2m 50s	remaining: 44m 25s
700:	learn: 2.2111141	test: 2.2545062	best: 2.2545062 (700)	total: 3m 18s	remaining: 43m 58s
800:	learn: 2.2005298	test: 2.2494739	best: 2.2494739 (800)	total: 3m 47s	remaining: 43m 31s
900:	learn: 2.1905256	test: 2.2451575	best: 2.2451575 (900)	total: 4m 16s	remaining: 43m 8s
1000:	learn: 2.1813676	test: 2.2415418	best: 2.2415418 (1000)	total: 4m 45s	rem

2700:	learn: 2.0738735	test: 2.2191923	best: 2.2191923 (2700)	total: 12m 56s	remaining: 34m 58s
2800:	learn: 2.0689857	test: 2.2185511	best: 2.2185511 (2800)	total: 13m 25s	remaining: 34m 29s
2900:	learn: 2.0637435	test: 2.2178999	best: 2.2178999 (2900)	total: 13m 53s	remaining: 34m
3000:	learn: 2.0589442	test: 2.2173175	best: 2.2173175 (3000)	total: 14m 22s	remaining: 33m 31s
3100:	learn: 2.0541285	test: 2.2168600	best: 2.2168600 (3100)	total: 14m 51s	remaining: 33m 2s
3200:	learn: 2.0491303	test: 2.2163762	best: 2.2163731 (3199)	total: 15m 19s	remaining: 32m 33s
3300:	learn: 2.0442943	test: 2.2158813	best: 2.2158813 (3300)	total: 15m 48s	remaining: 32m 5s
3400:	learn: 2.0394122	test: 2.2154308	best: 2.2154308 (3400)	total: 16m 17s	remaining: 31m 36s
3500:	learn: 2.0349669	test: 2.2151275	best: 2.2151217 (3497)	total: 16m 46s	remaining: 31m 7s
3600:	learn: 2.0303150	test: 2.2146907	best: 2.2146907 (3600)	total: 17m 14s	remaining: 30m 39s
3700:	learn: 2.0257423	test: 2.2143485	best: 2.

5600:	learn: 1.9475211	test: 2.2075186	best: 2.2074713 (5526)	total: 27m 6s	remaining: 21m 17s
bestTest = 2.207471309
bestIteration = 5526
Shrink model to first 5527 iterations.
0:	learn: 3.4597135	test: 3.4594616	best: 3.4594616 (0)	total: 321ms	remaining: 53m 27s
100:	learn: 2.3458575	test: 2.3492931	best: 2.3492931 (100)	total: 28.8s	remaining: 47m 5s
200:	learn: 2.3004160	test: 2.3104413	best: 2.3104413 (200)	total: 58.1s	remaining: 47m 14s
300:	learn: 2.2738883	test: 2.2900679	best: 2.2900679 (300)	total: 1m 27s	remaining: 46m 45s
400:	learn: 2.2541984	test: 2.2759718	best: 2.2759718 (400)	total: 1m 55s	remaining: 46m 11s
500:	learn: 2.2381268	test: 2.2655997	best: 2.2655997 (500)	total: 2m 24s	remaining: 45m 45s
600:	learn: 2.2242463	test: 2.2574293	best: 2.2574293 (600)	total: 2m 53s	remaining: 45m 16s
700:	learn: 2.2120513	test: 2.2507685	best: 2.2507685 (700)	total: 3m 22s	remaining: 44m 50s
800:	learn: 2.2012729	test: 2.2453510	best: 2.2453510 (800)	total: 3m 51s	remaining: 4

2800:	learn: 2.0703671	test: 2.2109449	best: 2.2109412 (2799)	total: 13m 31s	remaining: 34m 46s
2900:	learn: 2.0653956	test: 2.2104205	best: 2.2104205 (2900)	total: 14m	remaining: 34m 17s
3000:	learn: 2.0605675	test: 2.2098652	best: 2.2098652 (3000)	total: 14m 29s	remaining: 33m 47s
3100:	learn: 2.0557356	test: 2.2093641	best: 2.2093641 (3100)	total: 14m 58s	remaining: 33m 18s
3200:	learn: 2.0511318	test: 2.2089732	best: 2.2089732 (3200)	total: 15m 27s	remaining: 32m 49s
3300:	learn: 2.0465347	test: 2.2085103	best: 2.2085103 (3300)	total: 15m 56s	remaining: 32m 20s
3400:	learn: 2.0417577	test: 2.2080595	best: 2.2080595 (3400)	total: 16m 25s	remaining: 31m 51s
3500:	learn: 2.0371648	test: 2.2076759	best: 2.2076759 (3500)	total: 16m 53s	remaining: 31m 22s
3600:	learn: 2.0323607	test: 2.2072760	best: 2.2072760 (3600)	total: 17m 22s	remaining: 30m 53s
3700:	learn: 2.0278013	test: 2.2069257	best: 2.2069222 (3697)	total: 17m 51s	remaining: 30m 24s
3800:	learn: 2.0234095	test: 2.2066328	best:

In [28]:
%%time
# Assemble the submission: the test Id followed by one probability column per
# class, in le.classes_ order.
submission = pd.DataFrame(np.c_[test_ID, prob], columns=["Id"] + list(le.classes_))
# np.c_ promoted the ids to float alongside the probabilities; restore integers.
submission["Id"] = submission["Id"].astype(int)
# Create the output folder if needed so the write cannot fail after the long
# training run.
os.makedirs("submission", exist_ok=True)
# The trailing number was left by the author, presumably the score of this file.
submission.to_csv("submission/v1-1.csv.gz", compression="gzip", index=False)  # 0.21805

CPU times: user 2min 28s, sys: 0 ns, total: 2min 28s
Wall time: 2min 28s


In [29]:
# Second stratified K-fold run (seed 1 for both the splitter and the model):
# fit one CatBoost model per fold, use the held-out fold for early stopping,
# and average the test-set class probabilities over all folds.
prob = np.zeros((X_test.shape[0], len(le.classes_)))
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=1)
# The held-out indices are a validation fold of X_train, not rows of X_test.
for train_index, valid_index in skf.split(X_train, y_train):
    # The categorical features are the leading columns of the feature matrix.
    clf = catboost.CatBoostClassifier(n_estimators=10000, learning_rate=0.05,
                                      cat_features=np.arange(len(categorical_features)),
                                      early_stopping_rounds=100, random_seed=1, task_type="GPU",
                                      devices="0", verbose=100)
    clf.fit(X_train[train_index], y_train[train_index],
            eval_set=(X_train[valid_index], y_train[valid_index]))
    prob += clf.predict_proba(X_test)
# Divide by the splitter's fold count so the average stays correct if
# n_splits is changed.
prob /= skf.get_n_splits()

0:	learn: 3.4550132	test: 3.4544962	best: 3.4544962 (0)	total: 317ms	remaining: 52m 46s
100:	learn: 2.3444611	test: 2.3501782	best: 2.3501782 (100)	total: 28.5s	remaining: 46m 34s
200:	learn: 2.2994780	test: 2.3126920	best: 2.3126920 (200)	total: 57.2s	remaining: 46m 30s
300:	learn: 2.2728012	test: 2.2927436	best: 2.2927436 (300)	total: 1m 25s	remaining: 46m 11s
400:	learn: 2.2530490	test: 2.2791951	best: 2.2791951 (400)	total: 1m 55s	remaining: 45m 53s
500:	learn: 2.2369486	test: 2.2689118	best: 2.2689118 (500)	total: 2m 23s	remaining: 45m 20s
600:	learn: 2.2228104	test: 2.2606142	best: 2.2606142 (600)	total: 2m 52s	remaining: 44m 56s
700:	learn: 2.2109038	test: 2.2545061	best: 2.2545061 (700)	total: 3m 21s	remaining: 44m 28s
800:	learn: 2.2001144	test: 2.2494287	best: 2.2494287 (800)	total: 3m 49s	remaining: 43m 59s
900:	learn: 2.1901582	test: 2.2450613	best: 2.2450613 (900)	total: 4m 19s	remaining: 43m 37s
1000:	learn: 2.1813762	test: 2.2415545	best: 2.2415545 (1000)	total: 4m 48s	r

2900:	learn: 2.0659253	test: 2.2173000	best: 2.2173000 (2900)	total: 14m 1s	remaining: 34m 19s
3000:	learn: 2.0605356	test: 2.2167377	best: 2.2167377 (3000)	total: 14m 30s	remaining: 33m 50s
3100:	learn: 2.0559028	test: 2.2163419	best: 2.2163407 (3099)	total: 14m 59s	remaining: 33m 21s
3200:	learn: 2.0508787	test: 2.2158063	best: 2.2158054 (3198)	total: 15m 28s	remaining: 32m 52s
3300:	learn: 2.0460336	test: 2.2153446	best: 2.2153446 (3300)	total: 15m 57s	remaining: 32m 23s
3400:	learn: 2.0413171	test: 2.2149549	best: 2.2149549 (3400)	total: 16m 26s	remaining: 31m 54s
3500:	learn: 2.0365636	test: 2.2145728	best: 2.2145728 (3500)	total: 16m 55s	remaining: 31m 25s
3600:	learn: 2.0320495	test: 2.2141858	best: 2.2141858 (3600)	total: 17m 24s	remaining: 30m 56s
3700:	learn: 2.0272891	test: 2.2139283	best: 2.2139264 (3696)	total: 17m 53s	remaining: 30m 27s
3800:	learn: 2.0228163	test: 2.2137069	best: 2.2137069 (3800)	total: 18m 22s	remaining: 29m 57s
3900:	learn: 2.0182931	test: 2.2134625	be

100:	learn: 2.3443361	test: 2.3505966	best: 2.3505966 (100)	total: 29s	remaining: 47m 18s
200:	learn: 2.2997939	test: 2.3120161	best: 2.3120161 (200)	total: 58s	remaining: 47m 8s
300:	learn: 2.2735299	test: 2.2917455	best: 2.2917455 (300)	total: 1m 26s	remaining: 46m 43s
400:	learn: 2.2536357	test: 2.2776220	best: 2.2776220 (400)	total: 1m 55s	remaining: 46m 9s
500:	learn: 2.2370816	test: 2.2671158	best: 2.2671158 (500)	total: 2m 24s	remaining: 45m 43s
600:	learn: 2.2232364	test: 2.2587479	best: 2.2587479 (600)	total: 2m 53s	remaining: 45m 14s
700:	learn: 2.2110210	test: 2.2524604	best: 2.2524604 (700)	total: 3m 22s	remaining: 44m 47s
800:	learn: 2.2003433	test: 2.2473701	best: 2.2473701 (800)	total: 3m 51s	remaining: 44m 19s
900:	learn: 2.1906058	test: 2.2430508	best: 2.2430508 (900)	total: 4m 20s	remaining: 43m 51s
1000:	learn: 2.1813373	test: 2.2394018	best: 2.2394018 (1000)	total: 4m 49s	remaining: 43m 24s
1100:	learn: 2.1730354	test: 2.2362953	best: 2.2362953 (1100)	total: 5m 18s	

2800:	learn: 2.0705192	test: 2.2145326	best: 2.2145326 (2800)	total: 13m 33s	remaining: 34m 50s
2900:	learn: 2.0655615	test: 2.2139989	best: 2.2139989 (2900)	total: 14m 2s	remaining: 34m 22s
3000:	learn: 2.0604255	test: 2.2133531	best: 2.2133531 (3000)	total: 14m 32s	remaining: 33m 53s
3100:	learn: 2.0556151	test: 2.2127850	best: 2.2127850 (3100)	total: 15m 1s	remaining: 33m 25s
3200:	learn: 2.0509307	test: 2.2123574	best: 2.2123548 (3199)	total: 15m 30s	remaining: 32m 55s
3300:	learn: 2.0461903	test: 2.2119315	best: 2.2119265 (3299)	total: 15m 59s	remaining: 32m 26s
3400:	learn: 2.0416122	test: 2.2114925	best: 2.2114863 (3398)	total: 16m 28s	remaining: 31m 57s
3500:	learn: 2.0369685	test: 2.2110709	best: 2.2110709 (3500)	total: 16m 57s	remaining: 31m 28s
3600:	learn: 2.0324682	test: 2.2106999	best: 2.2106999 (3600)	total: 17m 26s	remaining: 30m 59s
3700:	learn: 2.0279382	test: 2.2103646	best: 2.2103575 (3691)	total: 17m 55s	remaining: 30m 30s
3800:	learn: 2.0234040	test: 2.2100606	bes

In [32]:
%%time
# Assemble the submission for the second run: the test Id followed by one
# probability column per class, in le.classes_ order.
submission = pd.DataFrame(np.c_[test_ID, prob], columns=["Id"] + list(le.classes_))
# np.c_ promoted the ids to float alongside the probabilities; restore integers.
submission["Id"] = submission["Id"].astype(int)
# Create the output folder if needed so the write cannot fail after the long
# training run.
os.makedirs("submission", exist_ok=True)
# The trailing number was left by the author, presumably the score of this file.
submission.to_csv("submission/v1-2.csv.gz", compression="gzip", index=False)  # 0.21798

CPU times: user 2min 29s, sys: 26.6 ms, total: 2min 29s
Wall time: 2min 29s


In [33]:
# Blend the two runs by averaging their predicted class probabilities.
df1 = pd.read_csv("submission/v1-1.csv.gz", compression="gzip")
df2 = pd.read_csv("submission/v1-2.csv.gz", compression="gzip")
# Row-wise averaging is only meaningful when both files list the same classes
# and the same Ids in the same order, so fail loudly instead of silently
# blending misaligned rows.
assert list(df1.columns) == list(df2.columns), "submission columns differ"
assert np.array_equal(df1["Id"].values, df2["Id"].values), "submission Ids differ"
# Every column after "Id" is a class probability; average them in one step.
prob_columns = df1.columns[1:]
df1[prob_columns] = (df1[prob_columns] + df2[prob_columns]) / 2
# The trailing number was left by the author, presumably the score of the blend.
df1.to_csv("submission/v1.csv.gz", compression="gzip", index=False)  # 0.21749