In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import VotingClassifier
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import StratifiedKFold
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB
from sklearn.decomposition import PCA
from sklearn.neural_network import MLPClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestClassifier
import xgboost as xgb

In [2]:
train = pd.read_csv('data/credit_train_final_optimized.csv.gz', index_col='client_id')
test = pd.read_csv('data/credit_test_final_optimized.csv.gz', index_col='client_id')
train.columns

Index(['age', 'credit_month', 'credit_sum', 'education', 'gender',
       'job_position', 'marital_status', 'monthly_income', 'open_account_flg',
       'score_shk', 'tariff_id', 'lat', 'lon', 'overdue_credit_count',
       'credit_count', 'living_region', 'monthly_credit', 'credit_count_ratio',
       'credit_income_ratio'],
      dtype='object')

In [3]:
def prepare_labels(train, test):
    # нужно категориальные признаки пометить числами, а в тестовой выборке есть новые данные к сожалению
    all_data = pd.concat([train, test])
    for cat in ['education', 'gender', 'job_position', 'marital_status', 'education', 'tariff_id', 'living_region']:
        all_data[cat] = LabelEncoder().fit_transform(all_data[cat])
    return all_data.ix[train.index], all_data.ix[test.index]

In [4]:
def prepare_dummies(train, test):
    all_data = pd.concat([train, test]).drop(['living_region'], axis=1)
    all_data = pd.get_dummies(all_data, columns=['education', 'gender', 'job_position', 'marital_status', 'education', 'tariff_id'])
    return all_data.ix[train.index], all_data.ix[test.index]

In [5]:
tr_d, te_d = prepare_dummies(train, test)
te_d.fillna(0, inplace=True)
te_d.drop(['living_region', 'open_account_flg'], axis=1, inplace=True)

In [6]:
tr, te = prepare_labels(train, test)
te.fillna(0, inplace=True)
te.drop(['living_region', 'open_account_flg'], axis=1, inplace=True)

In [7]:
X = tr.drop(['open_account_flg', 'living_region'], axis = 1)
y = tr['open_account_flg']

In [8]:
X_d = tr_d.drop(['open_account_flg'], axis = 1)
y_d = tr_d['open_account_flg']

In [9]:
def testClf(clf, X, y):
    scores = cross_val_score(clf, X, y, cv=StratifiedKFold(n_splits=5, shuffle=True, random_state=42), scoring='roc_auc')
    print("AUC: {} (+/- {})".format(scores.mean(), scores.std() * 2))

In [10]:
X.describe()

Unnamed: 0,age,credit_count,credit_count_ratio,credit_income_ratio,credit_month,credit_sum,education,gender,job_position,lat,lon,marital_status,monthly_credit,monthly_income,overdue_credit_count,score_shk,tariff_id
count,170746.0,170746.0,170746.0,170746.0,170746.0,170746.0,170746.0,170746.0,170746.0,170746.0,170746.0,170746.0,170746.0,170746.0,170746.0,170746.0,170746.0
mean,5.003352,2.071674,0.016833,20.534932,10.980749,9.992663,2.202716,0.480532,12.160015,55.362687,54.856784,2.197586,2463.30622,13.797413,0.043579,0.469473,12.57698
std,0.638798,1.740467,0.094114,16.156323,3.536698,0.600467,1.064024,0.499622,3.132421,5.307008,25.442463,0.738562,1564.051379,1.774436,0.206014,0.124268,11.073903
min,3.7,0.0,0.0,0.5,3.0,7.9143,0.0,0.0,0.0,42.87,21.23,0.0,104.7,8.41,0.0,0.0,0.0
25%,4.5,1.0,0.0,11.1,10.0,9.6097,1.0,0.0,13.0,53.28,37.62,2.0,1410.8,12.57,0.0,0.379495,1.0
50%,4.9,2.0,0.0,16.6,10.0,9.9631,3.0,0.0,13.0,55.76,44.63,2.0,1990.75,13.68,0.0,0.461599,17.0
75%,5.4,3.0,0.0,24.9,12.0,10.3756,3.0,1.0,13.0,58.59,61.53,3.0,2972.8,14.95,0.0,0.552419,26.0
max,6.8,21.0,1.0,1022.8,36.0,12.2061,4.0,1.0,17.0,67.71,174.43,4.0,39966.7,31.22,3.0,1.128291,31.0


In [21]:
%time testClf(RandomForestClassifier(n_estimators=100, min_samples_split=5, max_leaf_nodes=60, max_depth=20), X, y)

AUC: 0.7417376453664434 (+/- 0.003921889022184429)
CPU times: user 1min 33s, sys: 1.64 s, total: 1min 34s
Wall time: 1min 39s


In [22]:
rfc = RandomForestClassifier(n_estimators=100, min_samples_split=5, max_leaf_nodes=60, max_depth=20)
rfc.fit(X, y)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=20, max_features='auto', max_leaf_nodes=60,
            min_impurity_split=1e-07, min_samples_leaf=1,
            min_samples_split=5, min_weight_fraction_leaf=0.0,
            n_estimators=100, n_jobs=1, oob_score=False, random_state=None,
            verbose=0, warm_start=False)

In [11]:
def describeImportance(clf, X):
    indices = np.argsort(clf.feature_importances_)[::-1]
    for f in range(X.shape[1]):
        print('%d. feature %s (%f)' % (f + 1, X.columns[indices[f]],
                                          clf.feature_importances_[indices[f]]))

In [59]:
describeImportance(rfc, X.drop(['living_region'], axis=1))

1. feature tariff_id (0.363890)
2. feature age (0.131619)
3. feature credit_month (0.124891)
4. feature score_shk (0.088131)
5. feature credit_sum (0.078717)
6. feature monthly_credit (0.063247)
7. feature education (0.049782)
8. feature job_position (0.030525)
9. feature marital_status (0.021316)
10. feature gender (0.012706)
11. feature credit_income_ratio (0.012057)
12. feature lon (0.008580)
13. feature credit_count (0.006755)
14. feature lat (0.003484)
15. feature monthly_income (0.003476)
16. feature credit_count_ratio (0.000484)
17. feature overdue_credit_count (0.000340)


In [85]:
XX = StandardScaler().fit_transform(X_d)
%time testClf(LogisticRegression(C=0.001), XX, y_d)

AUC: 0.7151334916137858 (+/- 0.002493764669875866)
CPU times: user 10.2 s, sys: 1.02 s, total: 11.2 s
Wall time: 11.5 s


In [86]:
lrc = LogisticRegression(C=0.001)
lrc.fit(XX, y_d)

LogisticRegression(C=0.001, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

In [73]:
sScaler = StandardScaler(copy=True, with_mean=True, with_std=True)
xScaled = sScaler.fit_transform(X_d)
forPca = pd.DataFrame(xScaled)

pcaMod = PCA(n_components=10)
xPca = pcaMod.fit_transform(X_d)

%time testClf(LogisticRegression(C=0.001), xPca, y_d)

AUC: 0.599958401216734 (+/- 0.007839782691710186)
CPU times: user 4.45 s, sys: 181 ms, total: 4.63 s
Wall time: 4.68 s


In [105]:
%time testClf(GaussianNB(), X_d, y_d)

AUC: 0.6107287671860239 (+/- 0.007682263961415929)
CPU times: user 1.21 s, sys: 272 ms, total: 1.48 s
Wall time: 1.51 s


In [81]:
%time testClf(MLPClassifier(),X_d, y_d)

AUC: 0.6441133831112698 (+/- 0.06955015099955547)
CPU times: user 53.9 s, sys: 5.91 s, total: 59.8 s
Wall time: 52.1 s


In [18]:
mlpParams = {'solver': ['lbfgs'], 'max_iter': [1500], 'alpha': 10.0 ** -np.arange(1, 7), 'hidden_layer_sizes':np.arange(5, 12)}

gridMLP = GridSearchCV(MLPClassifier(), mlpParams)
%time gridMLP.fit(X, y)
print('Best score: {}'.format(gridMLP.best_score_))
print('Best parameters: {}'.format(gridMLP.best_params_))
# оверфит!

CPU times: user 11min 27s, sys: 4min 3s, total: 15min 30s
Wall time: 10min 6s
Best score: 0.8239783069588746
Best parameters: {'hidden_layer_sizes': 9, 'alpha': 1.0000000000000001e-05, 'max_iter': 1500, 'solver': 'lbfgs'}


In [82]:
rfc.predict_proba(te)[:, 0]

array([ 0.88803218,  0.83596163,  0.70590088, ...,  0.92437903,
        0.83619413,  0.87375662])

In [88]:
pred_lrc=lrc.predict_proba(te_d)[:, 0]
# !!!!!!!!!!!!!!! фуфло

In [94]:
set(pred_lrc)

{0.0,
 2.2204460492503131e-16,
 8.8817841970012523e-16,
 5.1070259132757201e-15,
 6.4392935428259079e-15,
 8.4376949871511897e-15,
 1.5099033134902129e-14,
 2.3225865675158275e-13,
 2.6645352591003757e-13,
 1.6550893988664939e-10,
 6.8036198896948008e-10,
 7.1252472810812151e-09,
 1.1946775069215931e-07}

In [58]:
nncl = MLPClassifier(hidden_layer_sizes=9, alpha=1.0000000000000001e-05, max_iter=1500, solver='lbfgs')
nncl.fit(X, y)

MLPClassifier(activation='relu', alpha=1e-05, batch_size='auto', beta_1=0.9,
       beta_2=0.999, early_stopping=False, epsilon=1e-08,
       hidden_layer_sizes=9, learning_rate='constant',
       learning_rate_init=0.001, max_iter=1500, momentum=0.9,
       nesterovs_momentum=True, power_t=0.5, random_state=None,
       shuffle=True, solver='lbfgs', tol=0.0001, validation_fraction=0.1,
       verbose=False, warm_start=False)

In [95]:
set(nncl.predict_proba(te)[:, 0]) #эээээм 1??

{1.0}

In [96]:
%time testClf(GradientBoostingClassifier(), X, y)

AUC: 0.7553057119982064 (+/- 0.0030721528631138546)
CPU times: user 2min 21s, sys: 2.99 s, total: 2min 24s
Wall time: 2min 35s


In [97]:
clf = GradientBoostingClassifier()
clf.fit(X, y)
clf.predict_proba(te)

array([[ 0.9084779 ,  0.0915221 ],
       [ 0.83439962,  0.16560038],
       [ 0.72530596,  0.27469404],
       ..., 
       [ 0.94495476,  0.05504524],
       [ 0.81656896,  0.18343104],
       [ 0.88373348,  0.11626652]])

In [98]:
%time testClf(AdaBoostClassifier(), X, y)

AUC: 0.7436541213131352 (+/- 0.0033777978859896645)
CPU times: user 40 s, sys: 617 ms, total: 40.6 s
Wall time: 41.2 s


In [16]:
clf1 = GradientBoostingClassifier(n_estimators=300)
clf2 = AdaBoostClassifier(n_estimators=300)
clf3 = RandomForestClassifier(n_estimators=300)
eclf = VotingClassifier(estimators=[('gb', clf1), ('ada', clf2), ('rfc', clf3)], voting='soft')
%time testClf(eclf, X, y) # короче надо подбирать гиперпараметры и xgboost

AUC: 0.757765626665458 (+/- 0.0027627909176038074)
CPU times: user 20min 53s, sys: 21.7 s, total: 21min 15s
Wall time: 21min 46s


In [17]:
%time eclf.fit(X, y)

CPU times: user 5min 59s, sys: 5.69 s, total: 6min 5s
Wall time: 6min 19s


VotingClassifier(estimators=[('gb', GradientBoostingClassifier(criterion='friedman_mse', init=None,
              learning_rate=0.1, loss='deviance', max_depth=3,
              max_features=None, max_leaf_nodes=None,
              min_impurity_split=1e-07, min_samples_leaf=1,
              min_samples_split=2, min_w...mators=300, n_jobs=1, oob_score=False, random_state=None,
            verbose=0, warm_start=False))],
         n_jobs=1, voting='soft', weights=None)

In [18]:
pred = eclf.predict_proba(te)
# 0.7549 in public

In [14]:
def save(fname, pred, test):
    res1 = pd.DataFrame()
    res1['_ID_'] = test.index
    res1['_VAL_'] = pred
    res1.to_csv("submissions/{0}.csv".format(fname), index=False)
    return res1

In [27]:
res = save("ensemble_gbc_ada_rfc", pred[:,1], te)

In [13]:
gbm = xgb.XGBClassifier(max_depth=10, n_estimators=300, learning_rate=0.05, nthread=4)
%time testClf(gbm, X, y)

AUC: 0.7570767305789323 (+/- 0.005244599130536529)
CPU times: user 12min 18s, sys: 15.1 s, total: 12min 33s
Wall time: 5min 56s


In [16]:
clf1 = GradientBoostingClassifier(n_estimators=300)
clf2 = AdaBoostClassifier(n_estimators=300)
clf3 = RandomForestClassifier(n_estimators=300, max_depth=20)
clf4 = xgb.XGBClassifier(max_depth=10, n_estimators=300, learning_rate=0.05, nthread=4)
eclf = VotingClassifier(estimators=[('gb', clf1), ('ada', clf2), ('rfc', clf3), ('xgb', clf4)], voting='soft')

In [19]:
%time testClf(eclf, X, y)

AUC: 0.7625974115839058 (+/- 0.0037793467900848815)
CPU times: user 30min 1s, sys: 28.4 s, total: 30min 29s
Wall time: 23min 1s


In [17]:
%time eclf.fit(X, y)

CPU times: user 8min 15s, sys: 7.52 s, total: 8min 22s
Wall time: 6min 33s


VotingClassifier(estimators=[('gb', GradientBoostingClassifier(criterion='friedman_mse', init=None,
              learning_rate=0.1, loss='deviance', max_depth=3,
              max_features=None, max_leaf_nodes=None,
              min_impurity_split=1e-07, min_samples_leaf=1,
              min_samples_split=2, min_w...logistic', reg_alpha=0, reg_lambda=1,
       scale_pos_weight=1, seed=0, silent=True, subsample=1))],
         n_jobs=1, voting='soft', weights=None)

In [18]:
pred = eclf.predict_proba(te) # 0.7603
res = save("ensemble_gbc_ada_rfc_XGB", pred[:,1], te)