In [2]:
import os
import datetime as dt

import pandas as pd
import numpy as np

from sklearn.preprocessing import StandardScaler, MinMaxScaler, RobustScaler
from sklearn.cross_validation import cross_val_score
from sklearn import metrics


from sklearn.naive_bayes import BernoulliNB, GaussianNB, MultinomialNB
from sklearn.ensemble import ExtraTreesClassifier, RandomForestClassifier, GradientBoostingClassifier, AdaBoostClassifier

import xgboost as xgb
import lightgbm as lgb



In [25]:
# Capture the run date once; the submission-writing cells use it to date-stamp file names.
today = dt.date.today()

In [3]:
# Column names: PK is the row-identifier column, TARGET is the label column.
PK = 'sk_id_curr'
TARGET = 'target'
N_CV = 5       # passed as the `cv` argument when cross-validating
SEED = 1111    # NOTE(review): defined here but not referenced by the visible cells
# Directory holding the merged CSVs. An earlier `DATA_DIR = 'clean_data/'`
# assignment was immediately overwritten by this one (dead store), so only
# the effective value is kept.
DATA_DIR = '.'

TRAIN_FILE = os.path.join(DATA_DIR, 'mrgd_train.csv')
TEST_FILE = os.path.join(DATA_DIR, 'mrgd_test.csv')
SUBMISSION_OUTPUT_FILE = os.path.join(DATA_DIR, 'submission_out.csv')

# Columns that must be parsed as strings rather than numbers when reading the CSVs.
DTYPES = {PK: str, 'sk_id_bureau': str, 'sk_id_prev': str, 'num_instalment_version': str}



In [4]:
# Read the merged train/test tables, parsing the columns listed in DTYPES as str,
# then show both shapes as the cell output.
train, test = (
    pd.read_csv(csv_path, dtype=DTYPES)
    for csv_path in (TRAIN_FILE, TEST_FILE)
)
train.shape, test.shape

((307511, 527), (48744, 526))

In [5]:
# Preview the first rows of the training frame.
train.head()

Unnamed: 0,sk_id_curr,flag_own_car,flag_own_realty,name_contract_type,flag_cont_mobile,flag_document_10,flag_document_11,flag_document_12,flag_document_13,flag_document_14,...,sk_dpd_pos,name_contract_status_Active,name_contract_status_Amortized debt,name_contract_status_Approved_pos,name_contract_status_Canceled_pos,name_contract_status_Completed_pos,name_contract_status_Demand_pos,name_contract_status_Returned to the store,name_contract_status_Signed_pos,target
0,100002,0,1,0,1,0,0,0,0,0,...,0.0,19.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1
1,100003,0,0,0,1,0,0,0,0,0,...,0.0,26.0,0.0,0.0,0.0,2.0,0.0,0.0,0.0,0
2,100004,1,1,1,1,0,0,0,0,0,...,0.0,3.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0
3,100006,0,1,0,1,0,0,0,0,0,...,0.0,18.0,0.0,0.0,0.0,2.0,0.0,1.0,0.0,0
4,100007,0,1,0,1,0,0,0,0,0,...,0.0,62.0,0.0,0.0,0.0,3.0,0.0,0.0,1.0,0


In [6]:
# Pull the labels out as a NumPy array while the label column is still part of `train`.
y_train = train[TARGET].values
# Class balance. Series.value_counts() is used because the top-level
# pd.value_counts() helper is deprecated in recent pandas releases.
pd.Series(y_train).value_counts()

0    282686
1     24825
dtype: int64

In [7]:
# Keep the identifier columns aside before they are removed from the feature frames.
train_id, test_id = train[PK], test[PK]

In [8]:
# Remove the identifier (and, for train, the label) so only feature columns remain.
# Rebinding with errors='ignore' instead of inplace=True keeps this cell safe to
# re-run: the in-place version raises KeyError on a second execution because the
# columns are already gone.
train = train.drop([PK, TARGET], axis=1, errors='ignore')
test = test.drop(PK, axis=1, errors='ignore')

In [9]:
# Sanity check: both frames should now expose the same number of columns.
print(",".join(str(frame.shape) for frame in (train, test)))

(307511, 525),(48744, 525)


In [10]:
# Remember the column labels of the training feature frame.
features = train.columns

In [11]:
# Convert both frames to plain NumPy arrays for the estimators.
# NOTE(review): this assumes `test` has the same columns in the same order as
# `train`; nothing here enforces it — TODO confirm.
x_train = train.values
x_test = test.values

In [12]:
def eval_skl_model(est, n_cv, seed=0, params=None):
    """Cross-validate a scikit-learn style classifier on the notebook's training data.

    Uses the module-level ``x_train`` / ``y_train`` arrays and scores with ROC AUC.

    Parameters
    ----------
    est : callable
        Estimator class (or factory); instantiated as ``est(**params)``.
    n_cv : int or cross-validation splitter
        Forwarded to ``cross_val_score`` as ``cv``.
    seed : int, default 0
        Applied as ``random_state`` when the estimator exposes that parameter
        and ``params`` does not already set it. Estimators without a
        ``random_state`` parameter are left untouched.
    params : dict or None, default None
        Keyword arguments for the estimator. ``None`` means "use the
        estimator's defaults". The dict is copied, never mutated.

    Returns
    -------
    tuple
        ``(aucs, clf)``: the per-fold scores returned by ``cross_val_score``
        and the estimator instance that was handed to it. This function never
        calls ``clf.fit`` itself.
    """
    # Copy so the caller's dict is not shared, and so params=None works
    # (previously this crashed on ``est(**None)``).
    params = dict(params or {})
    clf = est(**params)

    # Honour `seed`, which used to be accepted but silently ignored.
    # An explicit random_state in `params` always wins.
    supports_seed = hasattr(clf, 'get_params') and 'random_state' in clf.get_params()
    if supports_seed and 'random_state' not in params:
        clf.set_params(random_state=seed)

    aucs = cross_val_score(clf, x_train, y_train, scoring='roc_auc', cv=n_cv, n_jobs=-1, verbose=1)
    return aucs, clf
    

In [40]:
# Hyper-parameters for the Extra Trees classifier.
et_params = dict(
    n_jobs=-1,
    n_estimators=200,
    max_features=0.25,
    max_depth=12,
    min_samples_leaf=2,
)

# Hyper-parameters for the Random Forest classifier (shallower trees than Extra Trees).
rf_params = dict(
    n_jobs=-1,
    n_estimators=200,
    max_features=0.25,
    max_depth=8,
    min_samples_leaf=2,
)

# Hyper-parameters for the gradient boosting classifier.
gb_params = dict(
    n_estimators=50,
    learning_rate=0.1,
    max_features=0.25,
    max_depth=8,
    min_samples_leaf=2,
)

# Settings for XGBoost. 'nrounds' is consumed by the XGBoost training cell,
# which hands it to xgb.train as a separate argument.
xgb_params = dict(
    seed=0,
    colsample_bytree=0.7,
    silent=0,
    subsample=0.7,
    learning_rate=0.075,
    objective='binary:logistic',
    max_depth=7,
    num_parallel_tree=16,
    min_child_weight=1,
    eval_metric='auc',
    nrounds=128,
)


In [15]:
# Cross-validate each tree ensemble and store its per-fold AUCs under the model name.
# (n, c, p) = (display name, estimator class, constructor kwargs).
candidates = (
    ('extra_trees', ExtraTreesClassifier, et_params),
    ('random_forest', RandomForestClassifier, rf_params),
)

results = {}
for n, c, p in candidates:
    print(n)
    aucs, clf = eval_skl_model(c, N_CV, params=p)
    results[n] = aucs


extra_trees


[Parallel(n_jobs=-1)]: Done   2 out of   5 | elapsed:  9.0min remaining: 13.5min
[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed:  9.1min finished


random_forest


[Parallel(n_jobs=-1)]: Done   2 out of   5 | elapsed:  6.1min remaining:  9.2min
[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed:  6.2min finished


In [18]:
# Mean cross-validated AUC per model, in the iteration order of `results`.
[np.mean(fold_aucs) for fold_aucs in results.values()]

[0.74213404800838867, 0.73847363646301123]

Random exploration: fit individual models on the full training set and write submission files

In [20]:
# Fit Extra Trees on the full training set (fit returns the estimator itself),
# then get class probabilities for the test rows.
et_clf = ExtraTreesClassifier(**et_params).fit(x_train, y_train)
probs = et_clf.predict_proba(x_test)

In [21]:
# Build the submission column-wise so TARGET keeps its float dtype. Stacking the
# ids and scores as two rows and transposing (the previous approach) mixes
# strings and floats in every column and yields object-dtype columns.
# probs[:, 1] is the second predict_proba column — presumably the positive class.
submission = pd.DataFrame(
    {'SK_ID_CURR': test_id, 'TARGET': probs[:, 1]},
    columns=['SK_ID_CURR', 'TARGET'],
)

In [27]:
algo = 'extra_trees'
# File name is <algo>_<YYYYMMDD>.csv inside submissions/.
out_path = 'submissions/{}_{}.csv'.format(algo, today.strftime('%Y%m%d'))
# to_csv does not create missing directories, so make sure the folder exists first.
out_dir = os.path.dirname(out_path)
if not os.path.isdir(out_dir):
    os.makedirs(out_dir)
submission.to_csv(out_path, index=False)

In [42]:

dtrain = xgb.DMatrix(x_train, label=y_train)
# Read the round count WITHOUT mutating xgb_params. The previous
# xgb_params.pop('nrounds', 75) removed the key on first execution, so re-running
# this cell silently trained with the 75-round fallback instead of the configured value.
nrounds = xgb_params.get('nrounds', 75)
# 'nrounds' is this notebook's own key; keep it out of the dict given to xgb.train.
booster_params = {key: value for key, value in xgb_params.items() if key != 'nrounds'}
clf = xgb.train(booster_params, dtrain, nrounds)


In [43]:
# Inspect the trained booster's best_iteration attribute.
clf.best_iteration

74

In [46]:
# Wrap the test features in a DMatrix and score them with the trained booster.
dtest = xgb.DMatrix(x_test)
preds = clf.predict(dtest)

In [51]:
# Reuse the existing submission frame, overwriting its TARGET column with the XGBoost predictions.
submission['TARGET'] = preds

In [53]:
algo = 'xgboost'
# File name is <algo>_<YYYYMMDD>.csv inside submissions/.
out_path = 'submissions/{}_{}.csv'.format(algo, today.strftime('%Y%m%d'))
# to_csv does not create missing directories, so make sure the folder exists first.
out_dir = os.path.dirname(out_path)
if not os.path.isdir(out_dir):
    os.makedirs(out_dir)
submission.to_csv(out_path, index=False)

In [None]:
def grid_search_params(est, param_dic)