In [1]:
%matplotlib inline
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib as mpl
import pylab as plt
import scipy

### Read the data

In [68]:
train = pd.read_csv('data/train.csv')
test = pd.read_csv('data/test.csv')

In [69]:
print train.shape
print test.shape

(114321, 133)
(114393, 132)


In [70]:
train[:3]

Unnamed: 0,ID,target,v1,v2,v3,v4,v5,v6,v7,v8,...,v122,v123,v124,v125,v126,v127,v128,v129,v130,v131
0,3,1,1.335739,8.727474,C,3.921026,7.915266,2.599278,3.176895,0.012941,...,8.0,1.98978,0.035754,AU,1.804126,3.113719,2.024285,0,0.636365,2.857144
1,4,1,,,C,,9.191265,,,2.30163,...,,,0.598896,AF,,,1.957825,0,,
2,5,1,0.943877,5.310079,C,4.410969,5.326159,3.979592,3.928571,0.019645,...,9.333333,2.477596,0.013452,AE,1.773709,3.922193,1.120468,2,0.883118,1.176472


### Prepare the data

In [71]:
def prepare_features(data):
    data['NaNCount'] = data.isnull().sum(axis=1)
    data['Mean'] = data.mean(axis=1)
    data['Max'] = data.max(numeric_only=True, axis=1)
    data['Min'] = data.min(numeric_only=True, axis=1)
    data.fillna(-1, inplace=True)

In [72]:
prepare_features(train)
prepare_features(test)

In [73]:
# Replace labels with floats
from sklearn.preprocessing import LabelEncoder
lbl_enc = LabelEncoder()

for f in test.columns:
    if train[f].dtype=='object':
        lbl_enc.fit(list(train[f])+list(test[f]))
        train[f] = lbl_enc.transform(list(train[f].values))
        test[f] = lbl_enc.transform(list(test[f].values))

In [74]:
train[:3]

Unnamed: 0,ID,target,v1,v2,v3,v4,v5,v6,v7,v8,...,v126,v127,v128,v129,v130,v131,NaNCount,Mean,Max,Min
0,3,1,1.335739,8.727474,3,3.921026,7.915266,2.599278,3.176895,0.012941,...,1.804126,3.113719,2.024285,0,0.636365,2.857144,1,5.052057,19.470199,-6.297423e-07
1,4,1,-1.0,-1.0,3,-1.0,9.191265,-1.0,-1.0,2.30163,...,-1.0,-1.0,1.957825,0,-1.0,-1.0,81,7.217756,81.0,0.0
2,5,1,0.943877,5.310079,3,4.410969,5.326159,3.979592,3.928571,0.019645,...,1.773709,3.922193,1.120468,2,0.883118,1.176472,2,4.921187,17.952332,-2.792745e-07


### Try RF Regression with K-Best

In [75]:
from sklearn.cross_validation import train_test_split
x_tr, x_cv, y_tr, y_cv = train_test_split(train[test.columns], train.target, test_size = 0.05,  random_state = 42)

In [76]:
%%time
from sklearn.svm import LinearSVC
from sklearn.ensemble import RandomForestRegressor
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import f_regression
from sklearn.pipeline import Pipeline

rf = RandomForestRegressor(
    n_estimators=25,
    max_depth=10,
    n_jobs=3,
    random_state=42,
    oob_score=True
)

clf = Pipeline([
  ('feature_selection', SelectKBest(f_regression, k=110)),
  ('classification', rf)
])
clf.fit(x_tr, y_tr)

Wall time: 40.5 s


In [46]:
%%time
from sklearn.ensemble import RandomForestRegressor

clf = RandomForestRegressor(
    n_estimators=25,
    max_depth=10,
    n_jobs=3,
    random_state=42,
    oob_score=True
)
clf.fit(x_tr, y_tr)

RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=10,
           max_features='auto', max_leaf_nodes=None, min_samples_leaf=1,
           min_samples_split=2, min_weight_fraction_leaf=0.0,
           n_estimators=25, n_jobs=3, oob_score=True, random_state=42,
           verbose=0, warm_start=False)

In [77]:
from sklearn.metrics import log_loss
pred_RF = clf.predict(x_cv)
log_loss(y_cv, pred_RF)

0.466827384423457

In [None]:
pred_RF[:10]

The best: 0.46652821064674577

In [None]:
# Show feature importances
importances = rf.feature_importances_
low_value = 0.010
# std = np.std([tree.feature_importances_ for tree in clf.estimators_], axis=0)
indices = np.argsort(importances)[::-1]
inds = [i for i in indices if importances[i] > low_value]

print("Feature ranking:")

labels = [list(test.columns)[i] for i in indices.tolist() if importances[i] > low_value]
x = scipy.array(xrange(len(inds)))
y = scipy.array(importances[inds])
f = plt.figure(figsize=(12,5))
ax = f.add_axes([1, 1, 1, 1])
ax.bar(x, y, align='center')
ax.set_xticks(x)
ax.set_xticklabels(labels, rotation=90)
f.show()

### Submit RF 

In [None]:
RF = rf.predict(test)

In [None]:
submit = pd.read_csv('data/sample_submission.csv')
submit.PredictedProb = RF
submit.to_csv('submit.csv', index=False)

### Try XGB 

In [None]:
from sklearn.cross_validation import train_test_split
x_tr, x_cv, y_tr, y_cv = train_test_split(train[test.columns], train.target, test_size = 0.05,  random_state = 42)

In [92]:
# Set params

params = {"objective": "binary:logistic",
          "booster": "gbtree",
          "eval_metric": "logloss",
          "eta": 0.01,
          "max_depth": 10,
          "subsample": 0.75,
          "colsample_bytree": 0.68,
          "silent": 1
          }
num_trees = 1800
stop = 30
boost_round = 1200

In [89]:
import xgboost as xgb

dtrain = xgb.DMatrix(x_tr.values, y_tr)
dvalid = xgb.DMatrix(x_cv.values, y_cv)
watchlist = [(dvalid, 'eval'), (dtrain, 'train')]
gbm = xgb.train(params, dtrain, num_boost_round=boost_round, evals=watchlist, early_stopping_rounds=stop, verbose_eval=True)

Will train until train error hasn't decreased in 30 rounds.
[0]	eval-logloss:0.689321	train-logloss:0.688769
[1]	eval-logloss:0.685628	train-logloss:0.684654
[2]	eval-logloss:0.682086	train-logloss:0.680604
[3]	eval-logloss:0.678795	train-logloss:0.676910
[4]	eval-logloss:0.675756	train-logloss:0.673409
[5]	eval-logloss:0.672287	train-logloss:0.669442
[6]	eval-logloss:0.668872	train-logloss:0.665546
[7]	eval-logloss:0.665821	train-logloss:0.662084
[8]	eval-logloss:0.662527	train-logloss:0.658283
[9]	eval-logloss:0.659652	train-logloss:0.655000
[10]	eval-logloss:0.656636	train-logloss:0.651434
[11]	eval-logloss:0.653750	train-logloss:0.648137
[12]	eval-logloss:0.650856	train-logloss:0.644691
[13]	eval-logloss:0.647747	train-logloss:0.641132
[14]	eval-logloss:0.644939	train-logloss:0.637884
[15]	eval-logloss:0.642288	train-logloss:0.634877
[16]	eval-logloss:0.639400	train-logloss:0.631516
[17]	eval-logloss:0.636614	train-logloss:0.628276
[18]	eval-logloss:0.634005	train-logloss:0.625192


The best score: 0.454461 (train), 0.45910 (test)

In [90]:
dtest = xgb.DMatrix(test.as_matrix())
XGB = gbm.predict(dtest)
indices = XGB < 0
XGB[indices] = 0

In [91]:
submit = pd.read_csv('data/sample_submission.csv')
submit.PredictedProb = XGB
submit.to_csv('submit.csv', index=False)