# Train on accident and person data


In [1]:
%reset 
%matplotlib inline
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import sqlite3
import xgboost as xgb
from numpy import nan as NA
from sklearn.cross_validation import train_test_split
from sklearn.metrics import roc_auc_score as auc

Once deleted, variables cannot be recovered. Proceed (y/[n])? y




# Load data and make labels consistent

In [2]:
print("Load labels")
# Read the label table; the first CSV column is used as the frame's index.
label_file = pd.read_csv(filepath_or_buffer="./train/labels_ext.csv", index_col=0)

Load labels


In [3]:
# Read the training table; the first CSV column is used as the frame's index.
Xtrain = pd.read_csv(
    filepath_or_buffer="./train/joint_accident_person_train_ext.csv",
    index_col=0,
)
# Remember the row count and the column count (the latter still includes `ID`).
n_train = Xtrain.shape[0]
n_dim = Xtrain.shape[1]
print([n_train, n_dim])

[708569, 260]


In [4]:
# Attach the label columns to every training row through the shared `ID` column.
temp = pd.merge(Xtrain, label_file, on=['ID'], how='inner')

# The labels are later read from `temp` while the features are read from
# `Xtrain`, so both frames must stay row-aligned. An inner join can silently
# drop rows (an ID without a label) or duplicate rows (an ID with several
# labels); fail loudly here instead of training on shifted labels.
assert len(temp) == len(Xtrain), (
    "merge changed the row count: %d rows in Xtrain, %d after the join"
    % (len(Xtrain), len(temp))
)
assert (temp['ID'].values == Xtrain['ID'].values).all(), (
    "merge reordered rows; labels would no longer line up with Xtrain"
)

In [5]:
# Multiply each label by 1 (turns booleans into integers) and keep the raw array.
y = temp['DRUNK_DR'].map(lambda flag: flag * 1).values

In [6]:
# Release the merged frame; only `y` was needed from it.
%xdel temp

In [7]:
# Remove the join key in place so it is not fed to the model as a feature.
del Xtrain['ID']
print("Splitting data")

Splitting data


In [8]:
# Hold out 30% of the rows for evaluation. A fixed seed makes the split, and
# therefore every reported metric and the saved model, reproducible.
# NOTE(review): the test set shows several rows per ID; if the training data is
# the same, this row-wise split can put rows of one ID on both sides and
# inflate the eval AUC. Consider a group-wise split on ID; confirm with the data.
SPLIT_SEED = 42
Xtrain_c, Xeval, Ytrain_c, Yeval = train_test_split(
    Xtrain.values, y, test_size=0.3, random_state=SPLIT_SEED
)

In [9]:
# The split arrays now hold the data; release the source frames.
%xdel Xtrain
%xdel label_file

In [10]:
# List the remaining variables and their sizes.
%whos

Variable           Type        Data/Info
----------------------------------------
NA                 float       nan
Xeval              ndarray     212571x259: 55055889 elems, type `float64`, 440447112 bytes (420.04309844970703 Mb)
Xtrain_c           ndarray     495998x259: 128463482 elems, type `float64`, 1027707856 bytes (980.0985870361328 Mb)
Yeval              ndarray     212571: 212571 elems, type `int64`, 1700568 bytes (1.6217880249023438 Mb)
Ytrain_c           ndarray     495998: 495998 elems, type `int64`, 3967984 bytes (3.7841644287109375 Mb)
auc                function    <function roc_auc_score at 0x2b8af0383048>
n_dim              int         260
n_train            int         708569
np                 module      <module 'numpy' from '/us<...>kages/numpy/__init__.py'>
os                 module      <module 'os' from '/usr/c<...>est/lib/python3.5/os.py'>
pd                 module      <module 'pandas' from '/u<...>ages/pandas/__init__.py'>
plt                module      <mo

In [11]:
print("Load data into sparse matrix")
# Wrap both splits for xgboost; NaN entries are declared as the missing value.
dtrain = xgb.DMatrix(Xtrain_c, label=Ytrain_c, missing=NA)
deval = xgb.DMatrix(Xeval, label=Yeval, missing=NA)
print("Specifying the parameters ... ")

Load data into sparse matrix
Specifying the parameters ... 


In [12]:
# The DMatrix objects were built from these arrays; release the raw feature
# arrays and list what is left in the namespace.
%xdel Xtrain_c
%xdel Xeval
%whos

Variable           Type        Data/Info
----------------------------------------
NA                 float       nan
Yeval              ndarray     212571: 212571 elems, type `int64`, 1700568 bytes (1.6217880249023438 Mb)
Ytrain_c           ndarray     495998: 495998 elems, type `int64`, 3967984 bytes (3.7841644287109375 Mb)
auc                function    <function roc_auc_score at 0x2b8af0383048>
deval              DMatrix     <xgboost.core.DMatrix object at 0x2b8af0a333c8>
dtrain             DMatrix     <xgboost.core.DMatrix object at 0x2b8af0a33390>
n_dim              int         260
n_train            int         708569
np                 module      <module 'numpy' from '/us<...>kages/numpy/__init__.py'>
os                 module      <module 'os' from '/usr/c<...>est/lib/python3.5/os.py'>
pd                 module      <module 'pandas' from '/u<...>ages/pandas/__init__.py'>
plt                module      <module 'matplotlib.pyplo<...>es/matplotlib/pyplot.py'>
sns                m

In [13]:
# Booster settings handed to xgb.train.
param = dict([
    ('max_depth', 12),                  # maximum depth of each tree
    ('eta', 0.02),                      # step-size shrinkage per boosting round
    ('subsample', 0.7),                 # fraction of rows sampled per tree
    ('colsample_bytree', 0.8),          # fraction of columns sampled per tree
    ('silent', 0),                      # 0 keeps xgboost's running messages on
    ('eval_metric', 'auc'),             # metric reported for the watchlist
    ('alpha', 0),                       # L1 regularisation weight
    ('lambda', 1),                      # L2 regularisation weight
    ('nthread', 8),                     # number of threads
    ('objective', 'binary:logistic'),   # binary classification, probability output
])

In [14]:
# Number of boosting rounds to run.
num_round = 520
# Datasets whose metric is reported during training, with their display names.
watchlist = [
    (deval, 'eval'),
    (dtrain, 'train'),
]
print("Training ... ")

Training ... 


In [15]:
# Fit the booster, reporting the metric on every watchlist entry.
bst = xgb.train(
    params=param,
    dtrain=dtrain,
    num_boost_round=num_round,
    evals=watchlist,
)

In [16]:
print("Saving the model")
bst.save_model('./models/xgb_acc_per.model')
bst.dump_model('./models/xgb_raw_acc_per.txt')

dtrain.save_binary('./models/binary/dtrain.buffer')
deval.save_binary('./models/binary/deval.buffer')

%xdel dtrain
%xdel deval

Saving the model


In [17]:
# List the remaining variables after the DMatrix objects were released.
%whos

Variable           Type        Data/Info
----------------------------------------
NA                 float       nan
Yeval              ndarray     212571: 212571 elems, type `int64`, 1700568 bytes (1.6217880249023438 Mb)
Ytrain_c           ndarray     495998: 495998 elems, type `int64`, 3967984 bytes (3.7841644287109375 Mb)
auc                function    <function roc_auc_score at 0x2b8af0383048>
bst                Booster     <xgboost.core.Booster object at 0x2b8af0a33080>
n_dim              int         260
n_train            int         708569
np                 module      <module 'numpy' from '/us<...>kages/numpy/__init__.py'>
num_round          int         520
os                 module      <module 'os' from '/usr/c<...>est/lib/python3.5/os.py'>
param              dict        n=10
pd                 module      <module 'pandas' from '/u<...>ages/pandas/__init__.py'>
plt                module      <module 'matplotlib.pyplo<...>es/matplotlib/pyplot.py'>
sns                module   

In [18]:
# Read the test table; the first CSV column is used as the frame's index.
Xtest = pd.read_csv(
    filepath_or_buffer="./test/joint_accident_person_test_ext.csv",
    index_col=0,
)
# Pull the ID column out of the feature frame (removing it in place) and keep
# it as 64-bit integers for grouping the predictions later.
ID = Xtest.pop('ID').astype(np.int64)
print("Load test samples")

Load test samples


In [19]:
# Sorted array of the distinct IDs present in the test rows.
uniqueID = np.unique(ID.values)

In [20]:
# Display the distinct test IDs.
uniqueID

array([     0,      1,      2, ..., 121062, 121063, 121064])

In [21]:
# Wrap the test features for xgboost; NaN entries are declared as the missing value.
dtest = xgb.DMatrix(Xtest.values, missing=NA)

In [22]:
print("Prediction")
# Score every test row with the trained booster.
preds = bst.predict(data=dtest)

Prediction


In [23]:
# Persist the test matrix in xgboost's binary format.
dtest.save_binary(fname='./models/binary/dtest.buffer')


In [24]:
print("submit")
# One row per test sample: its ID next to the predicted probability.
predict_df = pd.DataFrame({'ID': ID.values, 'prob': preds})
predict_df

submit


Unnamed: 0,ID,prob
0,0,0.166868
1,1,0.365786
2,1,0.559330
3,1,0.403655
4,2,0.245116
5,2,0.159998
6,2,0.165970
7,3,0.038040
8,3,0.023363
9,4,0.041091


In [25]:
# Preview: average the row-level probabilities within each ID.
predict_df.groupby(by='ID').mean()

Unnamed: 0_level_0,prob
ID,Unnamed: 1_level_1
0,0.166868
1,0.442924
2,0.190361
3,0.030701
4,0.041091
5,0.687770
6,0.068713
7,0.067682
8,0.263840
9,0.153521


In [26]:
# Group the row-level predictions by ID ...
grouped_predict = predict_df.groupby(by='ID')
# ... and average them, giving one probability per ID (indexed by ID).
prediction = grouped_predict.agg('mean')

In [27]:
# Display the index of the aggregated predictions (the grouped ID values).
prediction.index

Int64Index([     0,      1,      2,      3,      4,      5,      6,      7,
                 8,      9,
            ...
            121055, 121056, 121057, 121058, 121059, 121060, 121061, 121062,
            121063, 121064],
           dtype='int64', name='ID', length=121065)

In [28]:
# `prediction` is a one-column frame ('prob') indexed by ID. Passing a whole
# DataFrame as a dict value to the DataFrame constructor is not valid column
# input (newer pandas rejects 2-D column values), and pairing it with a freshly
# numbered index only lines up when the IDs happen to be exactly 0..n-1.
# Turning the index into a column keeps each probability attached to its own ID.
submit = prediction.reset_index().rename(columns={'prob': 'DRUNK_DR'})
# Fix the column order explicitly so the file layout does not depend on
# dict/key ordering of the pandas or Python version in use.
submit = submit[['ID', 'DRUNK_DR']]
submit.to_csv('fars_submit.csv', index=False)