In [1]:
import pandas as pd
import numpy as np

### Read the data

In [3]:
train = pd.read_csv('data/train.csv')
test = pd.read_csv('data/test.csv')

In [4]:
print train.shape
print test.shape

(260753, 299)
(173836, 298)


In [5]:
train[:3]

Unnamed: 0,QuoteNumber,Original_Quote_Date,QuoteConversion_Flag,Field6,Field7,Field8,Field9,Field10,Field11,Field12,...,GeographicField59A,GeographicField59B,GeographicField60A,GeographicField60B,GeographicField61A,GeographicField61B,GeographicField62A,GeographicField62B,GeographicField63,GeographicField64
0,1,2013-08-16,0,B,23,0.9403,0.0006,965,1.02,N,...,9,9,-1,8,-1,18,-1,10,N,CA
1,2,2014-04-22,0,F,7,1.0006,0.004,548,1.2433,N,...,10,10,-1,11,-1,17,-1,20,N,NJ
2,4,2014-08-25,0,F,7,1.0006,0.004,548,1.2433,N,...,15,18,-1,21,-1,11,-1,8,N,NJ


### Prepare the data

In [6]:
def prepare_features(data):
    data.fillna(-1, inplace=True)
    
    data['Date'] = pd.to_datetime(pd.Series(data['Original_Quote_Date']))
    data['Day'] = data.Date.apply(lambda x: x.day)
    data['Month'] = data.Date.apply(lambda x: x.month)
    data['Year'] = data.Date.apply(lambda x: x.year)
    data['DayOfYear'] = data.Date.map(lambda x: x.dayofyear)
    data['WeekOfYear'] = data.Date.apply(lambda x: x.weekofyear)
    
    data.drop(['Original_Quote_Date', 'QuoteNumber', 'Date'], axis=1, inplace=True)

In [7]:
prepare_features(train)
prepare_features(test)

In [8]:
train[:3]

Unnamed: 0,QuoteConversion_Flag,Field6,Field7,Field8,Field9,Field10,Field11,Field12,CoverageField1A,CoverageField1B,...,GeographicField61B,GeographicField62A,GeographicField62B,GeographicField63,GeographicField64,Day,Month,Year,DayOfYear,WeekOfYear
0,0,B,23,0.9403,0.0006,965,1.02,N,17,23,...,18,-1,10,N,CA,16,8,2013,228,33
1,0,F,7,1.0006,0.004,548,1.2433,N,6,8,...,17,-1,20,N,NJ,22,4,2014,112,17
2,0,F,7,1.0006,0.004,548,1.2433,N,7,12,...,11,-1,8,N,NJ,25,8,2014,237,35


In [9]:
# Replace labels with floats
from sklearn.preprocessing import LabelEncoder
lbl_enc = LabelEncoder()

for f in test.columns:
    if train[f].dtype=='object':
        lbl_enc.fit(list(train[f])+list(test[f]))
        train[f] = lbl_enc.transform(list(train[f].values))
        test[f] = lbl_enc.transform(list(test[f].values))

### Try Random Forest 

In [10]:
from sklearn.cross_validation import train_test_split
x_tr, x_cv, y_tr, y_cv = train_test_split(train[test.columns], train.QuoteConversion_Flag, test_size = 0.05,  random_state = 42)

In [85]:
%%time
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import roc_auc_score

rf = RandomForestRegressor(
    n_estimators=100,
    n_jobs=3,
    random_state=42,
    oob_score=True
)
rf.fit(x_tr, y_tr)

Wall time: 15min 33s


In [87]:
pred_RF[:10]

array([ 0.34,  1.  ,  0.  ,  1.  ,  0.05,  0.51,  0.14,  0.03,  0.  ,  0.  ])

In [86]:
pred_RF = rf.predict(x_cv)
roc_auc_score(y_cv, pred_RF)

0.95950107101433779

### Try XGB 

In [12]:
# Set params

params = {"objective": "binary:logistic",
          "booster": "gbtree",
          "eval_metric": "auc",
          "eta": 0.023,
          "max_depth": 6,
          "subsample": 0.83,
          "colsample_bytree": 0.77,
          "silent": 1
          }
num_trees = 1800
stop = 30

In [13]:
import xgboost as xgb

dtrain = xgb.DMatrix(x_tr.values, y_tr)
dvalid = xgb.DMatrix(x_cv.values, y_cv)
watchlist = [(dvalid, 'eval'), (dtrain, 'train')]
gbm = xgb.train(params, dtrain, num_trees, evals=watchlist, early_stopping_rounds=stop, verbose_eval=True)

Will train until train error hasn't decreased in 30 rounds.
[0]	eval-auc:0.935847	train-auc:0.934701
[1]	eval-auc:0.941370	train-auc:0.940498
[2]	eval-auc:0.944255	train-auc:0.943538
[3]	eval-auc:0.945179	train-auc:0.944454
[4]	eval-auc:0.946622	train-auc:0.946485
[5]	eval-auc:0.948013	train-auc:0.947743
[6]	eval-auc:0.948580	train-auc:0.948169
[7]	eval-auc:0.949423	train-auc:0.949089
[8]	eval-auc:0.949142	train-auc:0.948848
[9]	eval-auc:0.949210	train-auc:0.948940
[10]	eval-auc:0.949609	train-auc:0.949128
[11]	eval-auc:0.949472	train-auc:0.948956
[12]	eval-auc:0.949169	train-auc:0.948674
[13]	eval-auc:0.949522	train-auc:0.948916
[14]	eval-auc:0.950109	train-auc:0.949460
[15]	eval-auc:0.950123	train-auc:0.949464
[16]	eval-auc:0.950641	train-auc:0.950048
[17]	eval-auc:0.950714	train-auc:0.950171
[18]	eval-auc:0.950618	train-auc:0.950067
[19]	eval-auc:0.950343	train-auc:0.949756
[20]	eval-auc:0.950206	train-auc:0.949618
[21]	eval-auc:0.950529	train-auc:0.950054
[22]	eval-auc:0.950686	tra

In [21]:
import gc
gc.enable()

In [28]:
dtest = xgb.DMatrix(test.values)
XGB = gbm.predict(dtest)
indices = XGB < 0
XGB[indices] = 0

In [93]:
ids = pd.read_csv('data/test.csv').QuoteNumber

In [29]:
submit = pd.read_csv('data/sample_submission.csv')
submit.QuoteConversion_Flag = XGB
submit.to_csv('submit.csv', index=False)