In [2]:
%matplotlib inline
import pandas as pd
import numpy as np

## 1. Get the data 

In [3]:
train_x = pd.read_csv('data/Xtrain.csv', low_memory=False)
test = pd.read_csv('data/Xtest.csv', low_memory=False)
train_y = pd.read_csv('data/ytrain.csv', low_memory=False)

In [4]:
full = pd.concat([train_x, test])

## 2. Prepare the data

In [5]:
full[:3]

Unnamed: 0,id,X0,X1,X2,X3,X4,X5,X6,X7,X8,...,X1368,X1369,X1370,X1371,X1372,X1373,X1374,X1375,X1376,X1377
0,7865,,,0.025449,,,0.368421,,,,...,,,,a,,q,,,,a
1,10105,,,0.024475,,,0.342105,,,,...,,a,,a,,b,,,,a
2,8721,,,0.041694,,,0.447368,,,,...,,,,a,,,,,,a


In [6]:
for col in full.columns:
    if (full[col].isnull().sum(axis=0) > 7000):
        del full[col]

In [7]:
full.fillna(0, inplace=True)

In [8]:
# Replace labels with floats
from sklearn.preprocessing import LabelEncoder
lbl_enc = LabelEncoder()

for c in list(full.columns):
    if(full[c].dtypes == np.object):
        full[c] = lbl_enc.fit_transform(full[c])

## 3. Cross Validation 

In [9]:
train_x_new = full[:10000]

In [10]:
cols = list(train_x_new.columns)
cols.remove('id')

In [21]:
from sklearn.cross_validation import train_test_split

xtrain, xcv, ytrain, ycv = train_test_split(train_x_new[cols], train_y[train_y.columns[1:]], test_size = 0.1,  random_state = 10)

In [27]:
from sklearn.metrics import mean_squared_error

def rmse(y_true, pred):
    return mean_squared_error(y_true, pred) ** 0.5

#### 1. Try Random Forest 

In [82]:
from sklearn.feature_selection import SelectKBest
from sklearn.ensemble import RandomForestClassifier
from sklearn.pipeline import Pipeline

ypred = np.zeros((len(xcv), 14))
errors = []
for i in range(14):
    kbest = SelectKBest(k=90)
    rf = RandomForestClassifier(
        n_estimators=100,
        max_depth=8,
        oob_score=True, 
        random_state=0, 
        n_jobs=3)
    pipeline = Pipeline([('kbest', kbest), ('clf', rf)])
    pipeline.fit(xtrain.values, ytrain.values[:,i])
    ypred[:, i] = pipeline.predict_proba(xcv)[:, 1]

print "mean error is: ", rmse(ycv, ypred)

mean error is:  0.305625652119


#### 2. Try XGBoost

In [162]:
params = {"objective": "binary:logistic",
          "booster": "gbtree",
          "eta": 0.05,
          "max_depth": 5,
          "subsample": 1,
          "colsample_bytree": 0.9,
          "silent": 1,
          "seed": 4,
          "eval_metric": 'rmse'
          }
num_trees = 200
stop = 20

In [163]:
import xgboost as xgb

ypred_xgb = np.zeros((len(xcv), 14))
for i in range(14):
    dtrain = xgb.DMatrix(xtrain.values, label = ytrain.values[:, i])
    dvalid = xgb.DMatrix(xcv.values, label=ycv.values[:, i])
    watchlist = [(dvalid, 'eval'), (dtrain, 'train')]
    gbm = xgb.train(params, dtrain, num_trees, evals=watchlist, early_stopping_rounds=stop, verbose_eval=False)
    
    dcv = xgb.DMatrix(xcv.values)
    ypred_xgb[:, i] = gbm.predict(dcv)
    indices = ypred_xgb[:, i] < 0
    ypred_xgb[:, i][indices] = 0

Will train until train error hasn't decreased in 20 rounds.
Will train until train error hasn't decreased in 20 rounds.
Will train until train error hasn't decreased in 20 rounds.
Will train until train error hasn't decreased in 20 rounds.
Will train until train error hasn't decreased in 20 rounds.
Will train until train error hasn't decreased in 20 rounds.
Will train until train error hasn't decreased in 20 rounds.
Will train until train error hasn't decreased in 20 rounds.
Will train until train error hasn't decreased in 20 rounds.
Will train until train error hasn't decreased in 20 rounds.
Will train until train error hasn't decreased in 20 rounds.
Will train until train error hasn't decreased in 20 rounds.
Will train until train error hasn't decreased in 20 rounds.
Will train until train error hasn't decreased in 20 rounds.


In [164]:
print "mean error is: ", rmse(ycv, ypred_xgb)

mean error is:  0.295937774878


In [352]:
ensemble = (0.47 * ypred ** 0.68  + \
        0.5 * ypred_xgb ** 11.0 + \
        0.34 * ypred ** 0.01 * ypred_xgb ** 1.08) ** 1.01

print "mean error is: ", rmse(ycv, ensemble)

mean error is:  0.302254525191


## 4. Predict probas with ensemble 

In [None]:
import xgboost as xgb

ypred_xgb = np.zeros((len(xcv), 14))
for i in range(14):
    dtrain = xgb.DMatrix(xtrain.values, label = ytrain.values[:, i])
    dvalid = xgb.DMatrix(xcv.values, label=ycv.values[:, i])
    dtest = xgb.DMatrix(full[10000:].values)
    watchlist = [(dvalid, 'eval'), (dtrain, 'train')]
    gbm = xgb.train(params, dtrain, num_trees, evals=watchlist, early_stopping_rounds=stop, verbose_eval=False)

    ypred_xgb[:, i] = gbm.predict(dtest)
    indices = ypred_xgb[:, i] < 0
    ypred_xgb[:, i][indices] = 0