In [1]:
import pandas as pd
import numpy as np

### Read the data

In [65]:
# Load the training and test sets from the local data/ directory.
train, test = (pd.read_csv(csv_path) for csv_path in ('data/train.csv', 'data/test.csv'))

In [66]:
# Show (rows, columns) for each frame. The parenthesised form of print is
# valid on both Python 2 and Python 3 and produces the same text.
print(train.shape)
print(test.shape)

(260753, 299)
(173836, 298)


In [67]:
# Peek at the first three rows of the raw training frame.
train.head(3)

Unnamed: 0,QuoteNumber,Original_Quote_Date,QuoteConversion_Flag,Field6,Field7,Field8,Field9,Field10,Field11,Field12,...,GeographicField59A,GeographicField59B,GeographicField60A,GeographicField60B,GeographicField61A,GeographicField61B,GeographicField62A,GeographicField62B,GeographicField63,GeographicField64
0,1,2013-08-16,0,B,23,0.9403,0.0006,965,1.02,N,...,9,9,-1,8,-1,18,-1,10,N,CA
1,2,2014-04-22,0,F,7,1.0006,0.004,548,1.2433,N,...,10,10,-1,11,-1,17,-1,20,N,NJ
2,4,2014-08-25,0,F,7,1.0006,0.004,548,1.2433,N,...,15,18,-1,21,-1,11,-1,8,N,NJ


### Prepare the data

In [68]:
def prepare_features(data):
    """Fill missing values and expand the quote date into calendar features.

    The frame is modified in place:

    * every missing value is replaced with -1;
    * ``Original_Quote_Date`` is parsed to datetimes and the integer columns
      ``Day``, ``Month``, ``Year``, ``DayOfYear`` and ``WeekOfYear`` are
      appended;
    * ``Original_Quote_Date``, ``QuoteNumber`` and the temporary ``Date``
      column are dropped.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame containing ``Original_Quote_Date`` and ``QuoteNumber`` columns.

    Returns
    -------
    None
        The result is written back into ``data``.

    Notes
    -----
    Not idempotent: calling it a second time on the same frame raises
    ``KeyError`` because ``Original_Quote_Date`` has already been dropped.
    """
    data.fillna(-1, inplace=True)

    data['Date'] = pd.to_datetime(data['Original_Quote_Date'])
    # Vectorised datetime accessors instead of a Python-level lambda per row.
    quote_date = data['Date'].dt
    data['Day'] = quote_date.day
    data['Month'] = quote_date.month
    data['Year'] = quote_date.year
    data['DayOfYear'] = quote_date.dayofyear
    if hasattr(quote_date, 'isocalendar'):
        # Newer pandas: isocalendar() yields a nullable UInt32 column; cast to
        # a plain integer so a later ``.values`` stays a numeric array.
        data['WeekOfYear'] = quote_date.isocalendar().week.astype('int64')
    else:
        # Older pandas without Series.dt.isocalendar().
        data['WeekOfYear'] = quote_date.weekofyear

    data.drop(['Original_Quote_Date', 'QuoteNumber', 'Date'], axis=1, inplace=True)

In [69]:
# prepare_features rewrites its argument in place and returns nothing, so it
# is simply invoked once per frame.
for frame in (train, test):
    prepare_features(frame)

In [63]:
# Peek at the first three rows of the training frame after feature preparation.
train.head(3)

Unnamed: 0,QuoteNumber,Original_Quote_Date,QuoteConversion_Flag,Field6,Field7,Field8,Field9,Field10,Field11,Field12,...,GeographicField62A,GeographicField62B,GeographicField63,GeographicField64,Date,Day,Month,Year,DayOfYear,WeekOfYear
0,1,2013-08-16,0,B,23,0.9403,0.0006,965,1.02,N,...,-1,10,N,CA,2013-08-16,16,8,2013,228,33
1,2,2014-04-22,0,F,7,1.0006,0.004,548,1.2433,N,...,-1,20,N,NJ,2014-04-22,22,4,2014,112,17
2,4,2014-08-25,0,F,7,1.0006,0.004,548,1.2433,N,...,-1,8,N,NJ,2014-08-25,25,8,2014,237,35


In [75]:
# Replace string labels with integer codes
from sklearn.preprocessing import LabelEncoder
lbl_enc = LabelEncoder()

for f in test.columns:
    if train[f].dtype == 'object':
        # prepare_features fills missing values with the integer -1, so an
        # object column may mix strings with ints. Cast everything to str so
        # the encoder sees one sortable type (mixed str/int cannot be ordered
        # on Python 3).
        train_labels = list(train[f].astype(str))
        test_labels = list(test[f].astype(str))
        # Fit on the union so labels that occur in only one frame still get a code.
        lbl_enc.fit(train_labels + test_labels)
        train[f] = lbl_enc.transform(train_labels)
        test[f] = lbl_enc.transform(test_labels)

### Try Random Forest 

In [76]:
# sklearn.cross_validation was removed in scikit-learn 0.20; prefer its
# replacement and fall back only on installations that predate it.
try:
    from sklearn.model_selection import train_test_split
except ImportError:
    from sklearn.cross_validation import train_test_split

# Hold out 5% of the rows for validation. Selecting test's columns from train
# keeps the feature set (and its order) identical between the two frames.
x_tr, x_cv, y_tr, y_cv = train_test_split(
    train[test.columns],
    train.QuoteConversion_Flag,
    test_size=0.05,
    random_state=42,
)

In [85]:
%%time
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import roc_auc_score  # used by the scoring cell further down

# Forest configuration: 100 trees, 3 parallel jobs, fixed seed, and
# out-of-bag scoring switched on.
rf_options = dict(n_estimators=100, n_jobs=3, random_state=42, oob_score=True)
rf = RandomForestRegressor(**rf_options)
rf.fit(x_tr, y_tr)

Wall time: 15min 33s


In [87]:
# Compute the hold-out predictions here so this preview does not depend on a
# later cell having been executed first (it would raise NameError on a fresh
# top-to-bottom run otherwise).
pred_RF = rf.predict(x_cv)
pred_RF[:10]

array([ 0.34,  1.  ,  0.  ,  1.  ,  0.05,  0.51,  0.14,  0.03,  0.  ,  0.  ])

In [86]:
# Predict on the held-out split and report the ROC AUC of those scores.
pred_RF = rf.predict(x_cv)
roc_auc_score(y_true=y_cv, y_score=pred_RF)

0.95950107101433779

### Try XGBoost

In [89]:
# Booster configuration handed to xgb.train in the training cell.
params = dict(
    objective="binary:logistic",
    booster="gbtree",
    eval_metric="auc",
    eta=0.02,
    max_depth=10,
    subsample=0.85,
    colsample_bytree=0.66,
    silent=1,
)
# num_trees is passed to xgb.train as the round count and stop as its
# early_stopping_rounds.
num_trees, stop = 1500, 30

In [91]:
import xgboost as xgb

# Wrap the training and hold-out splits in xgboost's matrix container.
dtrain = xgb.DMatrix(x_tr.values, y_tr)
dvalid = xgb.DMatrix(x_cv.values, y_cv)
# xgb.train applies early stopping to the LAST entry of `evals`, so the
# hold-out set must come last; with the training set last, stopping was being
# driven by the training AUC instead of the validation AUC.
watchlist = [(dtrain, 'train'), (dvalid, 'eval')]
gbm = xgb.train(params, dtrain, num_trees, evals=watchlist, early_stopping_rounds=stop, verbose_eval=True)

Will train until train error hasn't decreased in 30 rounds.
[0]	eval-auc:0.933753	train-auc:0.934189
[1]	eval-auc:0.951426	train-auc:0.951894
[2]	eval-auc:0.955413	train-auc:0.956654
[3]	eval-auc:0.956164	train-auc:0.957158
[4]	eval-auc:0.957388	train-auc:0.958301
[5]	eval-auc:0.957718	train-auc:0.958613
[6]	eval-auc:0.958094	train-auc:0.959025
[7]	eval-auc:0.958301	train-auc:0.959075
[8]	eval-auc:0.958303	train-auc:0.959092
[9]	eval-auc:0.958143	train-auc:0.958910
[10]	eval-auc:0.957711	train-auc:0.958597
[11]	eval-auc:0.957949	train-auc:0.958937
[12]	eval-auc:0.957978	train-auc:0.959192
[13]	eval-auc:0.958215	train-auc:0.959480
[14]	eval-auc:0.958463	train-auc:0.959579
[15]	eval-auc:0.958573	train-auc:0.959686
[16]	eval-auc:0.958782	train-auc:0.960100
[17]	eval-auc:0.958688	train-auc:0.959964
[18]	eval-auc:0.958851	train-auc:0.960004
[19]	eval-auc:0.958989	train-auc:0.960125
[20]	eval-auc:0.959001	train-auc:0.960237
[21]	eval-auc:0.959086	train-auc:0.960298
[22]	eval-auc:0.959186	tra

In [92]:
# DataFrame.as_matrix() was removed in pandas 1.0; .values is the equivalent
# and matches how the training matrices are built from x_tr / x_cv.
dtest = xgb.DMatrix(test.values)
XGB = gbm.predict(dtest)
# Clamp any negative score to zero before writing the submission.
indices = XGB < 0
XGB[indices] = 0

In [93]:
# Only the QuoteNumber column is needed, so avoid parsing the other columns
# of the test file. NOTE(review): `ids` is not used by any later visible cell.
ids = pd.read_csv('data/test.csv', usecols=['QuoteNumber']).QuoteNumber

In [99]:
# Start from the sample submission and overwrite its prediction column.
# NOTE(review): assumes the sample submission rows are in the same order as
# data/test.csv - confirm before submitting.
submit = pd.read_csv('data/sample_submission.csv')
# Item assignment always writes a column; attribute assignment would silently
# set a plain Python attribute if the column name were ever missing.
submit['QuoteConversion_Flag'] = XGB
submit.to_csv('submit.csv', index=False)