# Train on accident and person data


In [None]:
%reset 
%matplotlib inline
import os
import sqlite3

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
import xgboost as xgb
from numpy import nan as NA
from sklearn.metrics import roc_auc_score as auc

# train_test_split moved to sklearn.model_selection in scikit-learn 0.18 and
# the legacy sklearn.cross_validation module was removed in 0.20.
try:
    from sklearn.model_selection import train_test_split
except ImportError:
    from sklearn.cross_validation import train_test_split

# Load data and make labels consistent

In [None]:
# Read the label table; the first CSV column is used as the row index.
# NOTE(review): a later cell merges on an 'ID' column, so 'ID' is presumably
# a regular column of this file rather than the index -- confirm.
print("Load labels")
label_file = pd.read_csv("./train/labels_ext.csv", index_col=0)

In [None]:
# Read the joined accident/person training table (first CSV column becomes
# the row index) and report its size as [rows, columns].
Xtrain = pd.read_csv("./train/joint_accident_person_train_ext.csv", index_col=0)
n_train = Xtrain.shape[0]
n_dim = Xtrain.shape[1]
print([n_train, n_dim])

In [None]:
# Attach the labels to the training rows by matching on the 'ID' column.
temp = pd.merge(Xtrain, label_file, on=['ID'], how='inner')
# The labels are read from `temp` while the features are taken from `Xtrain`,
# so both frames must list the same rows in the same order. An inner join can
# drop rows whose ID has no label, duplicate rows when an ID has several label
# rows, or return rows in a different order -- fail loudly instead of training
# on misaligned labels.
if len(temp) != len(Xtrain) or not np.array_equal(temp['ID'].values,
                                                   Xtrain['ID'].values):
    raise ValueError(
        "Merged labels are not row-aligned with Xtrain "
        "(%d merged rows vs %d feature rows)" % (len(temp), len(Xtrain))
    )

In [None]:
# Label vector taken from the merged frame. Multiplying by 1 turns boolean
# flags into integers and leaves numeric values unchanged; doing it on the
# whole column avoids calling a Python lambda once per row.
y = (temp['DRUNK_DR'] * 1).values

In [None]:
# The merged frame is only needed to extract the labels; delete it.
%xdel temp

In [None]:
# Remove the 'ID' column in place so it is not part of the feature matrix
# handed to the split below.
del Xtrain['ID']
print("Splitting data")

In [None]:
# Hold out 30% of the rows for evaluation. The seed is fixed so the split, and
# therefore the evaluation AUC reported during training, is reproducible.
# NOTE(review): test predictions are later aggregated per ID, which suggests
# several rows can share an ID; if that also holds for the training data, a
# row-level split can place rows of one ID on both sides -- consider a
# group-wise split; confirm.
Xtrain_c, Xeval, Ytrain_c, Yeval = train_test_split(
    Xtrain.values, y, test_size=0.3, random_state=42)

In [None]:
# The split arrays now hold the data; delete the source frames.
%xdel Xtrain
%xdel label_file

In [None]:
print("Load data into sparse matrix")
# Wrap both splits as DMatrix objects; NaN entries are treated as missing.
dtrain = xgb.DMatrix(Xtrain_c, label=Ytrain_c, missing=np.nan)
deval = xgb.DMatrix(Xeval, label=Yeval, missing=np.nan)
print("Specifying the parameters ... ")

In [None]:
# The feature arrays have been wrapped in DMatrix objects; delete them and
# list the variables that remain in the namespace.
%xdel Xtrain_c
%xdel Xeval
%whos

In [None]:
# Booster configuration handed to xgb.train.
param = {}

# Tree size and learning rate.
param['max_depth'] = 12
param['eta'] = 0.02

# Row / column sampling ratios.
param['subsample'] = 0.7
param['colsample_bytree'] = 0.8

# Logging and the metric reported for the watchlist.
param['silent'] = 0
param['eval_metric'] = 'auc'

# L1 ('alpha') and L2 ('lambda') regularisation weights.
param['alpha'] = 0
param['lambda'] = 1

# Thread count and learning task.
param['nthread'] = 8
param['objective'] = 'binary:logistic'

In [None]:
# Number of boosting rounds, and the data sets evaluated during training
# (held-out split first, then the training split).
num_round = 520
watchlist = [(deval, 'eval'), (dtrain, 'train')]
print("Training ... ")

In [None]:
# Fit the booster on the training matrix for `num_round` rounds, evaluating
# on every entry of the watchlist.
bst = xgb.train(
    params=param,
    dtrain=dtrain,
    num_boost_round=num_round,
    evals=watchlist
)

In [None]:
print("Saving the model")
bst.save_model('./models/xgb_acc_per.model')
bst.dump_model('./models/xgb_raw_acc_per.txt')

dtrain.save_binary('./models/binary/dtrain.buffer')
deval.save_binary('./models/binary/deval.buffer')

%xdel dtrain
%xdel deval

In [None]:
# List the variables still held in the interactive namespace.
%whos

In [None]:
# Read the joined accident/person test table (first CSV column becomes the
# row index).
Xtest = pd.read_csv("./test/joint_accident_person_test_ext.csv", index_col=0)
# pop() removes the 'ID' column from the feature frame and returns it; keep
# it as 64-bit integers for grouping the predictions later.
ID = Xtest.pop('ID').astype(np.int64)
print("Load test samples")

In [None]:
# Sorted array of the distinct IDs present in the test rows.
uniqueID = np.unique(ID.values)

In [None]:
# Display the distinct test IDs as the cell output.
uniqueID

In [None]:
# Wrap the test features as a DMatrix without labels; NaN entries are treated
# as missing.
dtest = xgb.DMatrix(Xtest.values, missing=np.nan)

In [None]:
print("Prediction")
# One prediction per test row ('binary:logistic' yields probabilities).
preds = bst.predict(data=dtest)

In [None]:
# Make sure the target directory exists before writing the test matrix in
# xgboost's binary buffer format (makedirs also creates ./models).
if not os.path.isdir('./models/binary'):
    os.makedirs('./models/binary')
dtest.save_binary('./models/binary/dtest.buffer')


In [None]:
print("submit")
# Pair every test row's ID with its predicted probability.
predict_df = pd.DataFrame({'ID': ID.values})
predict_df['prob'] = preds
predict_df

In [None]:
#predict_df.groupby('ID').mean()

In [None]:
# Collapse the row-level predictions to one value per ID by taking the
# largest probability among the rows that share that ID.
grouped_predict = predict_df.groupby(by='ID')
prediction = grouped_predict.max()

In [None]:
# Display the index of the aggregated frame (the grouped 'ID' values).
prediction.index

In [None]:
# Build the submission table: one row per ID with its aggregated probability.
# The column order is pinned explicitly; without it the order follows the
# dict, which older pandas versions sort alphabetically, so the CSV layout
# would depend on the installed pandas version.
submit = pd.DataFrame(
    data={'ID': prediction.index, 'DRUNK_DR': prediction['prob'].values},
    columns=['ID', 'DRUNK_DR']
)
submit.to_csv('fars_submit.csv', index=False)