In [2]:
import xgboost as xgb
import lightgbm as lgb
from sklearn import *
import pandas as pd
import numpy as np
import warnings
warnings.filterwarnings('ignore')

# Read the data

In [14]:
# Load both data sets; the CSV files are expected in the working directory.
train = pd.read_csv('train.csv')
test = pd.read_csv('test.csv')

# Candidate features: every training column except 'ID' and 'target',
# kept in the original column order.
non_feature_cols = ['ID', 'target']
col = [name for name in train.columns if name not in non_feature_cols]
print(train.shape, test.shape)

(4459, 4993) (49342, 4992)


# Random Forest to get feature importance

In [15]:
# Shared scaler instance: the random-forest cell fits it on the training split
# (fit_transform) and reuses the fitted statistics on the hold-out split (transform).
scl = preprocessing.StandardScaler()
def rmsle(y, pred):
    """Return the root mean squared logarithmic error between ``y`` and ``pred``.

    Both inputs are passed through ``log1p`` before the element-wise
    difference is taken, so they must be array-likes of matching shape
    holding values on the original (non-log) scale.
    """
    log_gap = np.log1p(y) - np.log1p(pred)
    mean_squared_gap = np.mean(np.power(log_gap, 2))
    return np.sqrt(mean_squared_gap)

In [16]:
# Hold out 20% of the training rows to score the baseline forest.
x1, x2, y1, y2 = model_selection.train_test_split(
    train[col], train.target.values, test_size=0.20, random_state=5
)

# The forest is fitted on the raw target; the scaler is fitted on the
# training part only and its statistics are reused for the hold-out part.
model = ensemble.RandomForestRegressor(n_jobs=-1, random_state=7)
x1_scaled = scl.fit_transform(x1)
model.fit(x1_scaled, y1)

x2_scaled = scl.transform(x2)
holdout_pred = model.predict(x2_scaled)
print(rmsle(y2, holdout_pred))

1.7700695707637661


In [17]:
# Rank the features by the fitted forest's importances and keep the 480 best.
# NOTE: this rebinds `col`, so the cell is meant to run exactly once after the
# random-forest cell; `model` must still be that fitted forest.
importance_table = pd.DataFrame({'importance': model.feature_importances_, 'feature': col})
ranked_features = importance_table.sort_values(by=['importance'], ascending=[False])
col = ranked_features[:480]['feature'].values

# LightGBM and XGBoost

In [20]:
# Accumulators for the test predictions of each model. The loop adds one set
# of predictions per fold; resetting them here keeps this cell re-runnable.
test['target_lgb'] = 0.0
test['target_xgb'] = 0.0
folds = 5

# Loop-invariant inputs, built once instead of on every iteration.
train_features = train[col]
log_target = np.log1p(train.target.values)   # both models are fitted on log1p(target)
test_features = test[col]
test_dmatrix = xgb.DMatrix(test_features)

# Each "fold" is an independent random 80/20 hold-out split seeded with the
# fold index (repeated hold-out, not a K-fold partition of the data).
for fold in range(folds):
    x1, x2, y1, y2 = model_selection.train_test_split(train_features, log_target,
                                                      test_size=0.20, random_state=fold)
    #LightGBM
    params = {'learning_rate': 0.02, 'max_depth': 13, 'boosting': 'gbdt', 'objective': 'regression',
              'metric': 'rmse', 'is_training_metric': True, 'num_leaves': 12**2,
              'feature_fraction': 0.9, 'bagging_fraction': 0.8, 'bagging_freq': 5, 'seed':fold}
    model = lgb.train(params, lgb.Dataset(x1, label=y1), 3000, lgb.Dataset(x2, label=y2),
                      verbose_eval=200, early_stopping_rounds=100)
    # Predictions come back in log space; expm1 maps them to the target scale
    # before they are added to the running sum.
    test['target_lgb'] += np.expm1(model.predict(test_features, num_iteration=model.best_iteration))
    #XGB
    # Build each DMatrix once and share the training matrix between the
    # watchlist and the trainer.
    dtrain = xgb.DMatrix(x1, y1)
    dvalid = xgb.DMatrix(x2, y2)
    # 'valid' is listed last, so it is the entry used for early stopping.
    watchlist = [(dtrain, 'train'), (dvalid, 'valid')]
    #https://www.kaggle.com/samratp/santander-value-prediction-xgb-and-lightgbm
    params = {'objective': 'reg:linear', 'eval_metric': 'rmse', 'eta': 0.005, 'max_depth': 10,
              'subsample': 0.7, 'colsample_bytree': 0.5, 'alpha':0, 'silent': True, 'random_state':fold}
    model = xgb.train(params, dtrain, 5000,  watchlist, maximize=False, verbose_eval=200, early_stopping_rounds=100)
    test['target_xgb'] += np.expm1(model.predict(test_dmatrix, ntree_limit=model.best_ntree_limit))

Training until validation scores don't improve for 100 rounds.
[200]	valid_0's rmse: 1.41191
[400]	valid_0's rmse: 1.40675
Early stopping, best iteration is:
[410]	valid_0's rmse: 1.40468
[0]	train-rmse:14.0159	valid-rmse:14.0864
Multiple eval metrics have been passed: 'valid-rmse' will be used for early stopping.

Will train until valid-rmse hasn't improved in 100 rounds.
[200]	train-rmse:5.35766	valid-rmse:5.43593
[400]	train-rmse:2.31589	valid-rmse:2.48894
[600]	train-rmse:1.36245	valid-rmse:1.66786
[800]	train-rmse:1.09548	valid-rmse:1.48707
[1000]	train-rmse:1.00215	valid-rmse:1.44685
[1200]	train-rmse:0.948219	valid-rmse:1.43443
[1400]	train-rmse:0.906295	valid-rmse:1.42867
[1600]	train-rmse:0.87119	valid-rmse:1.42518
[1800]	train-rmse:0.835045	valid-rmse:1.42179
[2000]	train-rmse:0.806133	valid-rmse:1.42077
[2200]	train-rmse:0.779525	valid-rmse:1.42049
Stopping. Best iteration:
[2142]	train-rmse:0.786891	valid-rmse:1.42035

Training until validation scores don't improve for 100 

In [21]:
# Turn the per-fold sums into per-model averages.
# NOTE: the stored columns are divided in place, so running this cell a second
# time without re-running the training cell divides the predictions again.
for pred_col in ('target_lgb', 'target_xgb'):
    test[pred_col] = test[pred_col] / folds

# The submitted value is the plain mean of the two model averages.
test['target'] = (test['target_lgb'] + test['target_xgb']) / 2
submission = test[['ID', 'target']]
submission.to_csv('submission1.csv', index=False)