In [None]:
#!unzip classification.zip

In [None]:
!pip install catboost



In [None]:
from catboost import Pool, CatBoostClassifier
from sklearn.model_selection import train_test_split
import pandas as pd

In [None]:
# Read the labelled training set and display its first rows.
df = pd.read_csv(filepath_or_buffer='train.csv')
df.head(n=5)

Unnamed: 0,Report time,target,Descript,police depatment,resolved option,Address,Longitude,Latitude,number of units,register time
0,"2019-07-16,23:53",WARRANTS,WARRANT ARREST,NORTHERN,"ARREST, BOOKED",OAK ST / LAGUNA ST,51.574108,35.574599,3,"2019-07-17,00:11"
1,"2019-07-16,23:53",OTHER OFFENSES,TRAFFIC VIOLATION ARREST,NORTHERN,"ARREST, BOOKED",OAK ST / LAGUNA ST,51.574108,35.574599,3,"2019-07-17,00:09"
2,"2019-07-16,23:33",OTHER OFFENSES,TRAFFIC VIOLATION ARREST,NORTHERN,"ARREST, BOOKED",VANNESS AV / GREENWICH ST,51.575637,35.600414,1,"2019-07-16,23:44"
3,"2019-07-16,23:30",LARCENY/THEFT,GRAND THEFT FROM LOCKED AUTO,NORTHERN,NONE,1500 Block of LOMBARD ST,51.573005,35.600873,1,"2019-07-16,23:41"
4,"2019-07-16,23:30",LARCENY/THEFT,GRAND THEFT FROM LOCKED AUTO,PARK,NONE,100 Block of BRODERICK ST,51.561262,35.571541,3,"2019-07-16,23:46"


In [None]:
# Both timestamp columns are stored as "YYYY-MM-DD,HH:MM" strings;
# parse each of them into proper datetime values.
for time_col in ('Report time', 'register time'):
    df[time_col] = pd.to_datetime(df[time_col], format="%Y-%m-%d,%H:%M")

In [None]:
# Replace the string label column with integer codes. sort=True assigns the codes
# in the order of the sorted unique label values. pop() removes the raw 'target'
# column from df while handing it to factorize.
df['target_cat'] = pd.factorize(df.pop('target'), sort=True)[0]

In [None]:
# Preview the frame after timestamp parsing and label encoding.
df.head()

Unnamed: 0,Report time,Descript,police depatment,resolved option,Address,Longitude,Latitude,number of units,register time,target_cat
0,2019-07-16 23:53:00,WARRANT ARREST,NORTHERN,"ARREST, BOOKED",OAK ST / LAGUNA ST,51.574108,35.574599,3,2019-07-17 00:11:00,35
1,2019-07-16 23:53:00,TRAFFIC VIOLATION ARREST,NORTHERN,"ARREST, BOOKED",OAK ST / LAGUNA ST,51.574108,35.574599,3,2019-07-17 00:09:00,21
2,2019-07-16 23:33:00,TRAFFIC VIOLATION ARREST,NORTHERN,"ARREST, BOOKED",VANNESS AV / GREENWICH ST,51.575637,35.600414,1,2019-07-16 23:44:00,21
3,2019-07-16 23:30:00,GRAND THEFT FROM LOCKED AUTO,NORTHERN,NONE,1500 Block of LOMBARD ST,51.573005,35.600873,1,2019-07-16 23:41:00,16
4,2019-07-16 23:30:00,GRAND THEFT FROM LOCKED AUTO,PARK,NONE,100 Block of BRODERICK ST,51.561262,35.571541,3,2019-07-16 23:46:00,16


***
Validation

In [None]:
# Separate the integer-encoded label from the remaining columns, which are used as features.
y = df['target_cat']
X = df.drop(columns='target_cat')

In [None]:
# Stratified 70/30 hold-out split. A fixed random_state makes the split (and every
# number derived from it) reproducible across kernel restarts.
X_train, X_test, y_train, y_test = train_test_split(X,
                                                    y,
                                                    stratify=y,
                                                    test_size=0.3,
                                                    random_state=42)

***
Modeling

In [None]:
# Names of the categorical feature columns handed to every Pool below.
# This is a plain list of column names rather than a DataFrame slice: the slice
# only worked because iterating a DataFrame yields its column labels, and it kept
# a copy of the training data alive for no reason.
cat_features = ['Descript', 'police depatment', 'resolved option', 'Address']

In [None]:
# Training pool: features, labels and the categorical column specification.
train_dataset = Pool(X_train, y_train, cat_features=cat_features)

In [None]:
# Hold-out pool built from the test part of the split, with the same categorical columns.
eval_dataset = Pool(X_test, y_test, cat_features=cat_features)

In [None]:
# Collect the hyper-parameters first, then build the classifier from them.
model_params = {
    'loss_function': 'MultiClass',
    'iterations': 1000,
    'early_stopping_rounds': 5,
    'task_type': "GPU",
    # NOTE(review): presumably selects GPU devices 0 and 1 -- confirm against the
    # CatBoost `devices` documentation and the hardware actually available.
    'devices': '0:1',
}
model = CatBoostClassifier(**model_params)

In [None]:
# Train on the training pool. The hold-out pool is passed as eval_set, so the
# training log reports a `test` value next to `learn` for every iteration.
model.fit(train_dataset,
          eval_set=eval_dataset)

Learning rate set to 0.20103
0:	learn: 2.3304852	test: 2.3200077	best: 2.3200077 (0)	total: 332ms	remaining: 5m 31s
1:	learn: 1.9549407	test: 1.9452062	best: 1.9452062 (1)	total: 598ms	remaining: 4m 58s
2:	learn: 1.8247961	test: 1.8154615	best: 1.8154615 (2)	total: 874ms	remaining: 4m 50s
3:	learn: 1.5985581	test: 1.5898099	best: 1.5898099 (3)	total: 1.14s	remaining: 4m 44s
4:	learn: 1.4420221	test: 1.4330284	best: 1.4330284 (4)	total: 1.42s	remaining: 4m 42s
5:	learn: 1.3476682	test: 1.3386834	best: 1.3386834 (5)	total: 1.7s	remaining: 4m 41s
6:	learn: 1.2768563	test: 1.2680704	best: 1.2680704 (6)	total: 1.97s	remaining: 4m 39s
7:	learn: 1.2084441	test: 1.1998883	best: 1.1998883 (7)	total: 2.24s	remaining: 4m 37s
8:	learn: 1.1663793	test: 1.1582058	best: 1.1582058 (8)	total: 2.51s	remaining: 4m 36s
9:	learn: 1.1348711	test: 1.1267313	best: 1.1267313 (9)	total: 2.77s	remaining: 4m 34s
10:	learn: 1.0832625	test: 1.0750470	best: 1.0750470 (10)	total: 3.05s	remaining: 4m 34s
11:	learn: 1.

<catboost.core.CatBoostClassifier at 0x7fe9f472abe0>

***
Test


In [None]:
# Read the unlabelled test set and display its first rows.
df_test = pd.read_csv(filepath_or_buffer='test_notarget.csv')
df_test.head(n=5)

Unnamed: 0,Report time,Descript,police depatment,resolved option,Address,Longitude,Latitude,number of units,register time
0,"2019-07-16,18:15",PETTY THEFT FROM LOCKED AUTO,CENTRAL,NONE,GREENWICH ST / LEAVENWORTH ST,51.582199,35.601268,3,"2019-07-16,18:33"
1,"2019-07-16,17:47",POSSESSION OF NARCOTICS PARAPHERNALIA,BAYVIEW,NONE,0 Block of WHITFIELD CT,51.618162,35.531104,4,"2019-07-16,18:06"
2,"2019-07-16,17:45",DOMESTIC VIOLENCE,CENTRAL,"ARREST, BOOKED",700 Block of GEARY ST,51.584367,35.586359,1,"2019-07-16,17:56"
3,"2019-07-16,02:22","STOLEN PROPERTY, POSSESSION WITH KNOWLEDGE, RE...",TARAVAL,"ARREST, BOOKED",34TH AV / VICENTE ST,51.508476,35.538613,2,"2019-07-16,02:36"
4,"2019-07-15,19:00",GRAND THEFT FROM UNLOCKED AUTO,SOUTHERN,NONE,800 Block of BRYANT ST,51.596595,35.575421,1,"2019-07-15,19:10"


In [None]:
# Parse the two "YYYY-MM-DD,HH:MM" timestamp columns of the test set,
# exactly as was done for the training set.
for time_col in ('Report time', 'register time'):
    df_test[time_col] = pd.to_datetime(df_test[time_col], format="%Y-%m-%d,%H:%M")

In [None]:
# Unlabelled pool for inference; same categorical column specification as in training.
test_dataset = Pool(df_test, cat_features=cat_features)

In [None]:
# Predicted class probabilities for the test pool: a 2-D array with one row per
# test sample and one column per class.
preds_class = model.predict_proba(test_dataset)
preds_class

array([[1.62847281e-11, 8.37929614e-04, 4.83673287e-11, ...,
        1.33142949e-09, 1.02034165e-09, 1.02631307e-09],
       [4.58795778e-08, 5.40691530e-03, 6.48213747e-09, ...,
        1.46536951e-07, 1.91639611e-08, 1.74620027e-07],
       [1.91408044e-08, 8.37209084e-08, 1.99209868e-08, ...,
        2.79537409e-02, 1.26166630e-03, 4.19328223e-08],
       ...,
       [2.51983685e-08, 4.73782325e-08, 8.68271878e-09, ...,
        6.18176069e-03, 4.86285768e-02, 5.66362997e-09],
       [1.28734054e-07, 1.59466647e-02, 2.45560511e-07, ...,
        1.04583008e-07, 9.81467542e-08, 4.26335054e-09],
       [2.29841838e-08, 1.21984172e-07, 1.26183860e-08, ...,
        9.36700728e-03, 1.71706692e-03, 1.42672131e-01]])

In [None]:
# Wrap the probability array in a DataFrame so the columns can be labelled and exported.
out = pd.DataFrame(data=preds_class)

In [None]:
# Label the probability columns with the original class names.
# Only the 'target' column is read, and it goes into its own variable, so the
# preprocessed training frame `df` is not overwritten by the raw CSV.
train_targets = pd.read_csv('train.csv', usecols=['target'])['target']
# Sorted unique labels follow the same order as the integer codes produced by
# pd.factorize(..., sort=True) when the target was encoded.
class_names = sorted(train_targets.dropna().unique())
assert len(class_names) == out.shape[1], "number of class names must match the probability columns"
out.columns = class_names

In [None]:
# Prepend a sequential row id (0 .. n-1) as the first column, then preview the result.
row_ids = list(range(len(out)))
out.insert(loc=0, column='id', value=row_ids)
out.head(n=5)

Unnamed: 0,id,ARSON,ASSAULT,BAD CHECKS,BRIBERY,BURGLARY,DISORDERLY CONDUCT,DRIVING UNDER THE INFLUENCE,DRUG/NARCOTIC,DRUNKENNESS,EMBEZZLEMENT,EXTORTION,FAMILY OFFENSES,FORGERY/COUNTERFEITING,FRAUD,GAMBLING,KIDNAPPING,LARCENY/THEFT,LIQUOR LAWS,LOITERING,MISSING PERSON,NON-CRIMINAL,OTHER OFFENSES,PROSTITUTION,RECOVERED VEHICLE,ROBBERY,RUNAWAY,SECONDARY CODES,SEX OFFENSES FORCIBLE,SEX OFFENSES NON FORCIBLE,STOLEN PROPERTY,SUICIDE,SUSPICIOUS OCC,TRESPASS,VANDALISM,VEHICLE THEFT,WARRANTS,WEAPON LAWS
0,0,1.628473e-11,0.0008379296,4.836733e-11,4.957239e-11,1.805622e-08,7.903393e-10,6.931685e-10,1.626788e-06,3.896517e-09,1.628113e-11,5.003393e-11,5.090695e-11,3.610713e-10,5.52121e-06,8.360624e-11,6.147285e-10,0.9991379,4.864786e-11,4.559902e-11,1.696384e-05,8.319295e-09,2.729032e-10,3.430456e-10,4.373481e-10,8.452443e-11,2.790164e-10,1.569257e-09,2.87188e-10,3.201317e-10,8.474279e-10,2.733357e-10,2.301025e-09,2.568516e-10,3.580402e-09,1.331429e-09,1.020342e-09,1.026313e-09
1,1,4.587958e-08,0.005406915,6.482137e-09,7.035099e-09,2.126358e-05,6.149331e-08,6.334019e-08,0.02003728,5.012891e-07,2.45463e-09,3.650804e-09,3.037314e-09,2.734117e-07,0.07865625,1.035091e-08,5.274865e-08,0.0803775,1.549515e-08,4.256261e-09,0.8154988,2.758451e-08,8.742995e-08,2.707368e-08,2.896544e-08,2.620664e-08,2.451904e-08,4.921087e-08,1.680352e-08,1.381262e-08,9.199543e-08,2.056097e-08,6.619239e-08,1.037569e-08,1.417796e-07,1.46537e-07,1.916396e-08,1.7462e-07
2,2,1.91408e-08,8.372091e-08,1.992099e-08,2.412392e-08,3.313568e-08,3.452305e-08,9.914877e-09,3.25416e-08,2.062616e-08,1.825744e-08,1.715991e-08,2.13682e-08,8.085021e-08,1.075471e-07,2.083143e-08,2.379697e-08,9.06318e-08,1.193098e-08,1.86632e-08,5.914205e-08,0.0002539935,0.6501359,1.0418e-05,0.0001413813,0.0001085735,9.061701e-09,0.3181106,5.724298e-08,3.805847e-09,2.36275e-08,5.732241e-09,0.0006866759,0.0001848484,0.001151371,0.02795374,0.001261666,4.193282e-08
3,3,1.234401e-07,6.624687e-07,1.322e-07,7.023748e-08,1.361345e-07,3.369563e-08,1.263144e-07,9.120045e-08,1.152133e-07,3.627216e-08,4.874424e-08,1.101348e-07,1.544399e-07,2.049875e-07,1.116117e-07,8.134675e-08,5.060715e-07,1.44688e-07,2.211205e-07,1.439364e-07,0.1915273,0.3668153,0.0006982329,4.81314e-07,0.0006128538,0.0009564508,0.001432108,0.005040574,1.683899e-06,0.4131746,1.292971e-06,0.001496818,0.0001371177,7.46053e-05,0.0006311617,0.001819346,0.01557678
4,4,6.402597e-07,0.006610085,5.440189e-08,6.390487e-08,0.1589192,3.894928e-06,5.084414e-06,0.04565973,0.03267886,2.047373e-08,3.069669e-08,1.822023e-08,2.582865e-05,0.001676781,1.140612e-07,2.485813e-06,0.7543936,1.137698e-07,2.780422e-08,2.208433e-05,1.246019e-07,7.934606e-08,1.733641e-08,2.507428e-08,3.430069e-08,1.54545e-08,1.414122e-07,5.137588e-08,3.240525e-08,1.055772e-07,2.313331e-08,3.805682e-08,2.422961e-08,3.601063e-08,3.925692e-08,2.972789e-07,2.930582e-07


In [None]:
# Write the labelled probabilities to disk without the DataFrame index.
out.to_csv(path_or_buf='output.csv', index=False)