In [1]:
%matplotlib inline
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
# Set the default figure size used by every matplotlib plot in this notebook.
plt.rcParams['figure.figsize'] = (15, 10)

In [2]:
# Load the cleaned train and test tables, both indexed by `client_id`.
# NOTE(review): the files are named *.zip but are read with compression='gzip';
# confirm the real archive format before changing either.
train = pd.read_csv(
    'data/credit_train_clean_loc.csv.zip',
    index_col='client_id',
    compression='gzip',
)
test = pd.read_csv(
    'data/credit_test_clean_loc.csv.zip',
    index_col='client_id',
    compression='gzip',
)
# The target column is removed from the test table right after loading.
test = test.drop(['open_account_flg'], axis=1)

In [7]:
train.columns

Index(['age', 'credit_month', 'credit_sum', 'education', 'gender',
       'job_position', 'marital_status', 'monthly_income', 'open_account_flg',
       'score_shk', 'tariff_id', 'lat', 'lon', 'overdue_credit_count',
       'credit_count'],
      dtype='object')

In [5]:
def prepareDummies(train, test):
    """One-hot encode the categorical columns of train and test together.

    Both frames are concatenated before encoding so that they end up with the
    same set of dummy columns, even when a category occurs in only one of them.

    Parameters
    ----------
    train : pandas.DataFrame
        Training rows, including the ``open_account_flg`` target column.
    test : pandas.DataFrame
        Test rows. Its index labels must not collide with ``train``'s, because
        the rows are split back out by index label.

    Returns
    -------
    tuple of pandas.DataFrame
        ``(train_encoded, test_encoded)``. The train part keeps
        ``open_account_flg``; the test part has it dropped.
    """
    categorical = ['gender', 'marital_status', 'job_position', 'tariff_id', 'education']
    all_data = pd.get_dummies(pd.concat([test, train]), columns=categorical)
    # `.ix` was deprecated in pandas 0.20 and removed in 1.0; `.loc` is the
    # label-based equivalent for this lookup.
    train_part = all_data.loc[train.index]
    test_part = all_data.loc[test.index].drop(['open_account_flg'], axis=1)
    return train_part, test_part

In [8]:
from sklearn.preprocessing import LabelEncoder

def prepareLabels(train, test):
    """Label-encode the categorical columns of train and test together.

    Each categorical column is encoded with a fresh ``LabelEncoder`` fitted on
    the concatenation of both frames, so train and test share one code mapping.
    The remaining listed columns are copied over unchanged.

    Parameters
    ----------
    train : pandas.DataFrame
        Training rows, including the ``open_account_flg`` target column.
    test : pandas.DataFrame
        Test rows. Its index labels must not collide with ``train``'s, because
        the rows are split back out by index label.

    Returns
    -------
    tuple of pandas.DataFrame
        ``(train_encoded, test_encoded)``. The train part keeps
        ``open_account_flg``; the test part has it dropped.
    """
    # 'education' appeared twice in the original list; encoding it once gives
    # the same column values and the same column order.
    categorical = ['education', 'gender', 'job_position', 'marital_status', 'tariff_id']
    passthrough = ['age', 'credit_month', 'credit_sum', 'monthly_income', 'open_account_flg',
                   'score_shk', 'lat', 'lon', 'overdue_credit_count', 'credit_count']

    d = pd.concat([test, train])
    all_data = pd.DataFrame(index=d.index)
    for cat in categorical:
        all_data[cat] = LabelEncoder().fit_transform(d[cat])
    for col in passthrough:
        all_data[col] = d[col]

    # `.ix` was deprecated in pandas 0.20 and removed in 1.0; `.loc` is the
    # label-based equivalent for this lookup.
    train_part = all_data.loc[train.index]
    test_part = all_data.loc[test.index].drop(['open_account_flg'], axis=1)
    return train_part, test_part

In [21]:
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import StratifiedKFold

def testClf(clf, X, y):
    scores = cross_val_score(clf, X, y, cv=StratifiedKFold(n_splits=5, shuffle=True, random_state=42), scoring='roc_auc')
    print("Accuracy: {} (+/- {})".format(scores.mean(), scores.std() * 2))

# Label-encoded data

In [33]:
# Label-encoded train/test frames used by the experiments in this section.
tr, te = prepareLabels(train=train, test=test)

In [22]:
from sklearn.ensemble import RandomForestClassifier

%%time
testClf(RandomForestClassifier(), tr.drop(['open_account_flg'], axis = 1), tr['open_account_flg'])

Accuracy: 0.6921941008609472 (+/- 0.004719509198061229)


In [30]:
from sklearn.linear_model import LogisticRegression

# Baseline logistic regression on the label-encoded features.
testClf(
    clf=LogisticRegression(),
    X=tr.drop(['open_account_flg'], axis=1),
    y=tr['open_account_flg'],
)

Accuracy: 0.5751160526245724 (+/- 0.013154121053305932)


In [70]:
# Tariff ids and job positions that are treated as noise categories.
tariff_empty = ['1,29', '1,0', '1,2', '1,21', '1,22', '1,23', '1,26', '1,27', '1,28', '1,48', '1,52', '1,56', '1,96']
job_empty = ['ONB', 'HSK', 'INV', 'PNS', 'PNI', 'PNV']

# Monthly income in buckets of 5000. This is computed from `train` up front:
# the original bucketed `train_without_noise` before that name was assigned,
# which fails on a fresh kernel, and the bucketed values were then discarded.
# The 61.0 cut-off is assumed to be expressed in these bucket units - TODO confirm.
income_bucket = train['monthly_income'] // 5000

# Each comparison is parenthesised because `|` binds tighter than `<`; the
# original `a | b | income < 61.0` was evaluated as `(a | b | income) < 61.0`.
# `job_empty` holds job codes, so it is matched against `job_position`
# (the original matched it against `tariff_id`).
# NOTE(review): the conditions are OR-ed as in the original, so a row is removed
# only when all three fail; `&` may be what was intended - confirm.
keep = (
    ~train['tariff_id'].isin(tariff_empty)
    | ~train['job_position'].isin(job_empty)
    | (income_bucket < 61.0)
)
train_without_noise = train[keep].assign(monthly_income=income_bucket[keep])

# NOTE(review): `test` keeps the raw income scale, so `te` is not on the same
# scale as `tr` for `monthly_income`.
tr, te = prepareLabels(train_without_noise, test)

In [118]:
%%time
testClf(RandomForestClassifier(n_estimators=100, min_samples_split=5, max_leaf_nodes=60, max_depth=20), tr.drop(['open_account_flg'], axis = 1), tr['open_account_flg'])

KeyboardInterrupt: 

In [104]:
# Preview the first five rows of the training table.
train.head(5)

Unnamed: 0_level_0,age,credit_month,credit_sum,education,gender,job_position,marital_status,monthly_income,open_account_flg,score_shk,tariff_id,lat,lon,overdue_credit_count,credit_count,monthly_income_log,dist
client_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1
1,48,10,59998.0,GRD,M,UMN,MAR,30000.0,0.0,0.770249,16,45.272365,38.951409,1.0,1.0,10.3,11.0
2,28,6,10889.0,GRD,F,UMN,MAR,43000.0,0.0,0.248514,11,55.755814,37.617635,0.0,2.0,10.7,0.0
3,32,12,10728.0,SCH,M,SPC,MAR,23000.0,0.0,0.459589,11,51.578529,46.797223,0.0,5.0,10.0,10.0
4,27,12,12009.09,GRD,F,SPC,DIV,17000.0,0.0,0.362536,11,49.615821,44.151406,0.0,2.0,9.7,9.0
5,45,10,16908.89,SCH,M,SPC,MAR,25000.0,0.0,0.421385,11,54.446199,60.395641,0.0,1.0,10.1,23.0


In [122]:
# Tariff ids and job positions that are treated as noise categories.
tariff_empty = ['1,29', '1,0', '1,2', '1,21', '1,22', '1,23', '1,26', '1,27', '1,28', '1,48', '1,52', '1,56', '1,96']
job_empty = ['ONB', 'HSK', 'INV', 'PNS', 'PNI', 'PNV']
# train_without_noise['age'] = np.round(np.log(train_without_noise['age'] * 2), decimals=2)

# The 8.8 / 12.7 bounds match the log-income scale produced below (values such
# as 10.3), so the range check is applied to log(income). On raw income the
# original condition could never be true.
income_log = np.log(train['monthly_income'])

# `job_empty` holds job codes, so it is matched against `job_position`
# (the original matched it against `tariff_id`).
# NOTE(review): the conditions are OR-ed as in the original, so a row is removed
# only when all three fail; `&` may be what was intended - confirm.
keep = (
    ~train['tariff_id'].isin(tariff_empty)
    | ~train['job_position'].isin(job_empty)
    | ((income_log > 8.8) & (income_log < 12.7))
)
train_without_noise = train[keep]
tr, te = prepareLabels(train_without_noise, test)

# Coarsen the continuous features: log scale rounded to one decimal for the
# money columns, two decimals for the coordinates.
# NOTE(review): `te` does not receive these transforms, so it is not on the
# same scale as `tr`.
tr['monthly_income'] = np.round(np.log(tr['monthly_income']), decimals=1)
tr['credit_sum'] = np.round(np.log(tr['credit_sum']), decimals=1)
tr['lat'] = np.round(tr['lat'], decimals=2)
tr['lon'] = np.round(tr['lon'], decimals=2)
# tr['dist'] = np.round(np.sqrt(np.power(tr['lon'] - 37.617635, 2) + np.power(tr['lat'] - 55.755814, 2)))
tr.head()

Unnamed: 0_level_0,education,gender,job_position,marital_status,tariff_id,age,credit_month,credit_sum,monthly_income,open_account_flg,score_shk,lat,lon,overdue_credit_count,credit_count
client_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1
1,1,1,14,2,26,48,10,11.0,10.3,0.0,0.770249,45.27,38.95,1.0,1.0
2,1,0,14,2,1,28,6,9.3,10.7,0.0,0.248514,55.76,37.62,0.0,2.0
3,3,1,13,2,1,32,12,9.3,10.0,0.0,0.459589,51.58,46.8,0.0,5.0
4,1,0,13,1,1,27,12,9.4,9.7,0.0,0.362536,49.62,44.15,0.0,2.0
5,3,1,13,2,1,45,10,9.7,10.1,0.0,0.421385,54.45,60.4,0.0,1.0


In [120]:
%%time
testClf(RandomForestClassifier(n_estimators=100, min_samples_split=5, max_leaf_nodes=60, max_depth=20), tr.drop(['open_account_flg'], axis = 1), tr['open_account_flg'])

Accuracy: 0.7425261997901911 (+/- 0.0028270161254154735)
CPU times: user 1min 13s, sys: 1.77 s, total: 1min 15s
Wall time: 1min 27s


In [125]:
from sklearn.model_selection import GridSearchCV

def findParamsRFC(X, Y):
    """Grid-search random-forest hyper-parameters by cross-validated ROC AUC.

    A seeded ``RandomForestClassifier`` is tuned over ``n_estimators``,
    ``max_depth``, ``min_samples_split`` and ``max_leaf_nodes`` using a
    shuffled, seeded 5-fold stratified split. The best score and the best
    parameter set are printed.

    Parameters
    ----------
    X : feature table
        Feature matrix passed to ``GridSearchCV.fit``.
    Y : target vector
        Target labels passed to ``GridSearchCV.fit``.

    Returns
    -------
    GridSearchCV
        The fitted search object.
    """
    search_space = {
        'n_estimators': [20, 50, 100, 300],
        'max_depth': [5, 20, 50, 100],
        'min_samples_split': [2, 5, 10],
        'max_leaf_nodes': [40, 60, 100],
    }
    forest = RandomForestClassifier(random_state=42)
    folds = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

    searcher = GridSearchCV(forest, param_grid=search_space, scoring='roc_auc', cv=folds)
    searcher.fit(X, Y)

    print('Best score: {}'.format(searcher.best_score_))
    print('Best parameters: {}'.format(searcher.best_params_))
    return searcher

In [126]:
# Run the random-forest grid search on the current feature table.
findParamsRFC(
    X=tr.drop(['open_account_flg'], axis=1),
    Y=tr['open_account_flg'],
)

Best score: 0.7471296806141059
Best parameters: {'n_estimators': 300, 'min_samples_split': 10, 'max_depth': 20, 'max_leaf_nodes': 100}


GridSearchCV(cv=StratifiedKFold(n_splits=5, random_state=42, shuffle=True),
       error_score='raise',
       estimator=RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_split=1e-07, min_samples_leaf=1,
            min_samples_split=2, min_weight_fraction_leaf=0.0,
            n_estimators=10, n_jobs=1, oob_score=False, random_state=42,
            verbose=0, warm_start=False),
       fit_params={}, iid=True, n_jobs=1,
       param_grid={'min_samples_split': [2, 5, 10], 'n_estimators': [20, 50, 100, 300], 'max_depth': [5, 20, 50, 100], 'max_leaf_nodes': [40, 60, 100]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score=True,
       scoring='roc_auc', verbose=0)

# Dummy-encoded (one-hot) data

In [31]:
# One-hot encoded train/test frames used by the experiments in this section.
tr, te = prepareDummies(train=train, test=test)

In [27]:
%%time
testClf(RandomForestClassifier(), tr.drop(['open_account_flg'], axis = 1), tr['open_account_flg'])

Accuracy: 0.6917458584557096 (+/- 0.0011888204976569846)
CPU times: user 21.3 s, sys: 1.11 s, total: 22.4 s
Wall time: 24 s


In [32]:
# Baseline logistic regression on the one-hot encoded features.
testClf(
    clf=LogisticRegression(),
    X=tr.drop(['open_account_flg'], axis=1),
    y=tr['open_account_flg'],
)

Accuracy: 0.5738600202713968 (+/- 0.01978490604986587)
