### Imports

In [1]:
import pandas as pd
import numpy as np
import time

import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
sns.set()

### Load data

In [2]:
DATA_PATH = "../../data/home_default/"

train = pd.read_csv(DATA_PATH + "train.csv")
test  = pd.read_csv(DATA_PATH + "test.csv")

test['is_train'] = 0
train['is_train'] = 1

print("Shape of train:", train.shape)
print("Shape of test:",  test.shape)

Shape of train: (307511, 123)
Shape of test: (48744, 122)


### Load other data sources

In [3]:
bureau = pd.read_csv(DATA_PATH + "bureau.csv")

previous_application = pd.read_csv(DATA_PATH + "previous_application.csv")

### Split again into predictors, target, and id

In [4]:
train_y = train.TARGET
train_x = train.drop(["TARGET"], axis=1)

test_id = test.SK_ID_CURR
test_x  = test

### Merge train and test data

In [6]:
full = pd.concat([train_x, test_x])
train_N = len(train_x)

### [Olivier](https://www.kaggle.com/ogrellier) approach 

([his work](https://www.kaggle.com/ogrellier/good-fun-with-ligthgbm))

In [7]:
def cat_feats(df):
    return [col for col in df.columns if df[col].dtype == 'object']

# get categorical features
data_cats = cat_feats(full)
prev_app_cats = cat_feats(previous_application)
bureau_cats = cat_feats(bureau)

# Turn categorical features to dummy columns

In [8]:
previous_application = pd.get_dummies(previous_application)
bureau = pd.get_dummies(bureau)

In [10]:
# function to factorize categorical features
def factorize_df(df, cats):
    for col in cats:
        df[col], _ = pd.factorize(df[col])
    return df 

# factorize the categorical features from train and test data
full = factorize_df(full, data_cats)

### Aggregate Previous Applications Data and Merge with Original Data

[sban](https://www.kaggle.com/shivamb) provided the code ([link](https://www.kaggle.com/shivamb/homecreditrisk-extensive-eda-baseline-model))

In [9]:
# count the number of previous applications for a given ID
prev_apps_count = previous_application[['SK_ID_CURR', 'SK_ID_PREV']].groupby('SK_ID_CURR').count()
previous_application['SK_ID_PREV'] = previous_application['SK_ID_CURR'].map(prev_apps_count['SK_ID_PREV'])

# Average values for all other features in previous applications
prev_apps_avg = previous_application.groupby('SK_ID_CURR').mean()
prev_apps_avg.columns = ['p_' + col for col in prev_apps_avg.columns]
full = full.merge(right=prev_apps_avg.reset_index(), how='left', on='SK_ID_CURR')

### Aggregate Bureau Data and Merge with Original Data

In [10]:
# Average Values for all bureau features 
bureau_avg = bureau.groupby('SK_ID_CURR').mean()
bureau_avg['buro_count'] = bureau[['SK_ID_BUREAU','SK_ID_CURR']].groupby('SK_ID_CURR').count()['SK_ID_BUREAU']
bureau_avg.columns = ['b_' + f_ for f_ in bureau_avg.columns]
full = full.merge(right=bureau_avg.reset_index(), how='left', on='SK_ID_CURR')

### Split full back into train and test

In [11]:
full = full.drop(["SK_ID_CURR"])

train_x = full[:train_N]
test_x = full[train_N:]
del full, train_N

ValueError: labels ['SK_ID_CURR'] not contained in axis

### Split data into train and validation data

In [12]:
from sklearn.model_selection import train_test_split 
import lightgbm as lgb

train_x, val_x, train_y, val_y = train_test_split(train_x, train_y, test_size=0.2, random_state=17)
lgb_train = lgb.Dataset(data=train_x, label=train_y)
lgb_eval  = lgb.Dataset(data=val_x, label=val_y)

### Train the model

In [13]:
params = {'task': 'train', 'boosting_type': 'gbdt', 'objective': 'binary', 'metric': 'auc', 
          'learning_rate': 0.01, 'num_leaves': 48, 'num_iteration': 5000, 'verbose': 0 ,
          'colsample_bytree':.8, 'subsample':.9, 'max_depth':7, 'reg_alpha':.1, 'reg_lambda':.1, 
          'min_split_gain':.01, 'min_child_weight':1}

start = time.time()
model = lgb.train(params, lgb_train, valid_sets=lgb_eval, early_stopping_rounds=150, verbose_eval=200)
print("Training took {} seconds".format(round(time.time() - start)))



Training until validation scores don't improve for 150 rounds.
[200]	valid_0's auc: 0.737383
[400]	valid_0's auc: 0.751357
[600]	valid_0's auc: 0.762349
[800]	valid_0's auc: 0.767113
[1000]	valid_0's auc: 0.769436
[1200]	valid_0's auc: 0.77078
[1400]	valid_0's auc: 0.771461
[1600]	valid_0's auc: 0.771772
[1800]	valid_0's auc: 0.772031
[2000]	valid_0's auc: 0.772075
Early stopping, best iteration is:
[1901]	valid_0's auc: 0.772147


### Predict

In [14]:
predictions = model.predict(test_x)

### Save to CSV

In [15]:
pd.DataFrame({
    "SK_ID_CURR": test_id,
    "TARGET": predictions
}).to_csv("../../submissions/sban_help", index=False)

# Modeling

### Imports and scoring helper function

In [10]:
from lightgbm import LGBMRegressor
from xgboost import XGBRegressor
from sklearn.linear_model import LinearRegression, LogisticRegression, Lasso, Ridge
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import cross_val_score, KFold

kfold = KFold(n_splits=2)
    
def rmsle_cv(model):
    rmse= np.sqrt(-cross_val_score(model, train_x, train_y, cv=kfold, scoring="neg_mean_squared_error"))
    print(rmse)
    print()
    print(sum(rmse) / len(rmse))

## LGBM Regressor

(open Markdown for notes on [LGBM](http://lightgbm.readthedocs.io/en/latest/Python-API.html#lightgbm.LGBMRegressor) and parameter search attempts)

<div hidden>

# LGBM Parameters

boosting_type='gbdt', num_leaves=31, max_depth=-1, learning_rate=0.1, n_estimators=100, subsample_for_bin=200000, objective=None, class_weight=None, min_split_gain=0.0, min_child_weight=0.001, min_child_samples=20, subsample=1.0, subsample_freq=0, colsample_bytree=1.0, reg_alpha=0.0, reg_lambda=0.0, random_state=None, n_jobs=-1, silent=True, **kwargs

<\div>

In [11]:
lgbm_model = LGBMRegressor()

start = time.time()
lgbm_model.fit(train_x, train_y)
print("Training took {} seconds".format(round(time.time() - start)))

Training took 15 seconds


### Scoring

In [12]:
start = time.time()
rmsle_cv(lgbm_model)
print("\nScoring took {} seconds".format(round(time.time() - start)))

[ 0.26223286  0.26024019]

0.261236525624

Scoring took 19 seconds


## Linear Regressor

(open Markdown for notes on [Linear Regression](http://scikit-learn.org/stable/modules/generated/sklearn.linear_model.LinearRegression.html) and parameter search attempts)

<div hidden>

# Linear Parameters

fit_intercept=True,
normalize=False,
copy_X=True,
n_jobs=1

<\div>

In [13]:
lin_model = LinearRegression()

start = time.time()
lin_model.fit(train_x, train_y)
print("Training took {} seconds".format(round(time.time() - start)))

Training took 7 seconds


### Scoring

In [14]:
start = time.time()
rmsle_cv(lin_model)
print("\nScoring took {} seconds".format(round(time.time() - start)))

[ 0.26508292  0.26264828]

0.263865596742

Scoring took 7 seconds


## Random Forest Regressor

WARNING: Takes a lot of time

In [24]:
rf_model = RandomForestRegressor()

start = time.time()
rf_model.fit(train_x, train_y)
print("Training took {} seconds".format(round(time.time() - start)))

[ 0.27978732  0.28013387  0.28058117]

0.280167454734


### Scoring

In [None]:
start = time.time()
rmsle_cv(rf_model)
print("\nScoring took {} seconds".format(round(time.time() - start)))

## Logistic Regression

(open Markdown for notes on [Logistic Regression](http://scikit-learn.org/stable/modules/generated/sklearn.linear_model.LogisticRegression.html) and parameter search attempts)

<div hidden>

# Logistic Parameters

penalty=’l2’,
dual=False,
tol=0.0001,
C=1.0,
fit_intercept=True,
intercept_scaling=1,
class_weight=None,
random_state=None,
solver=’liblinear’,
max_iter=100,
multi_class=’ovr’,
verbose=0,
warm_start=False,
n_jobs=1

<\div>

In [15]:
log_model = LogisticRegression()

start = time.time()
log_model.fit(train_x, train_y)
print("Training took {} seconds".format(round(time.time() - start)))

Training took 15 seconds


### Scoring

In [16]:
start = time.time()
rmsle_cv(log_model)
print("\nScoring took {} seconds".format(round(time.time() - start)))

[ 0.28561741  0.2826656 ]

0.2841415038

Scoring took 18 seconds


## Lasso Regression

(open Markdown for notes on [Lasso](http://scikit-learn.org/stable/modules/generated/sklearn.linear_model.Lasso.html) and parameter search attempts)

<div hidden>

# Lasso Parameters

alpha=1.0,
fit_intercept=True,
normalize=False,
precompute=False,
copy_X=True,
max_iter=1000,
tol=0.0001,
warm_start=False,
positive=False,
random_state=None,
selection=’cyclic’

<\div>

In [17]:
las_model = Lasso(random_state=17)

start = time.time()
las_model.fit(train_x, train_y)
print("Training took {} seconds".format(round(time.time() - start)))

Training took 68 seconds


### Scoring

In [18]:
start = time.time()
rmsle_cv(las_model)
print("\nScoring took {} seconds".format(round(time.time() - start)))

[ 0.27209767  0.26929769]

0.270697679661

Scoring took 53 seconds


## Ridge Regression


(open Markdown for notes on [Ridge](http://scikit-learn.org/stable/modules/generated/sklearn.linear_model.Ridge.html) and parameter search attempts)

<div hidden>

# Ridge Parameters

alpha=1.0,
fit_intercept=True,
normalize=False,
copy_X=True,
max_iter=None,
tol=0.001,
solver=’auto’,
random_state=None

<\div>

In [20]:
rid_model = Ridge(random_state=17)

start = time.time()
rid_model.fit(train_x, train_y)
print("Training took {} seconds".format(round(time.time() - start)))

Training took 3 seconds


### Scoring

In [21]:
start = time.time()
rmsle_cv(rid_model)
print("\nScoring took {} seconds".format(round(time.time() - start)))

[ 0.26507705  0.26264958]

0.263863313366

Scoring took 3 seconds


## XGB Regressor

In [25]:
xgb_model = XGBRegressor()

start = time.time()
xgb_model.fit(train_x, train_y)
print("Training took {} seconds".format(round(time.time() - start)))

NameError: name 'score_model' is not defined

### Scoring

Scoring takes 349 seconds for me

In [26]:
start = time.time()
rmsle_cv(xgb_model)
print("\nScoring took {} seconds".format(round(time.time() - start)))

[ 0.26315737  0.26094917]

0.262053265711

Scoring took 349 seconds


### Get predictions

In [27]:
predictions = xgb_model.predict(test_x)

### Restrict predictions to appropriate range

In [28]:
predictions = np.clip(predictions, 0, 1)

# Sanity check
any(predictions < 0) or any(predictions > 1)

False

### Save file to CSV

In [29]:
pd.DataFrame({
    "SK_ID_CURR": test_id,
    "TARGET": predictions
}).to_csv("../../submissions/xgb.csv", index=False)