In [1]:
import os
import datetime as dt
import time

import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline


from sklearn.preprocessing import StandardScaler, MinMaxScaler, RobustScaler
from sklearn.model_selection import cross_val_score
from sklearn.feature_selection import SelectKBest, chi2, RFE

from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier

from sklearn import metrics

In [2]:
# Calendar date (local time) on which the notebook is run
today = dt.datetime.now().date()

### Config Vars

In [24]:
# --- column names ---
PK = 'sk_id_curr'        # row identifier column
TARGET = 'target'        # label column

# --- feature selection ---
N_FEATURES = 250         # how many features each selection method keeps

# --- file locations ---
DATA_DIR = 'clean_data/'
TRAIN_FILE, TEST_FILE, SUBMISSION_OUTPUT_FILE = (
    os.path.join(DATA_DIR, file_name)
    for file_name in ('mrgd_train.csv', 'mrgd_test.csv', 'submission_out.csv')
)

# Identifier-like columns are parsed as strings rather than numbers
DTYPES = dict.fromkeys(
    ('sk_id_curr', 'sk_id_bureau', 'sk_id_prev', 'num_instalment_version'),
    str,
)



### Read Data

In [4]:
# Load the merged train/test tables, parsing the columns listed in DTYPES as strings
train, test = (
    pd.read_csv(csv_path, dtype=DTYPES)
    for csv_path in (TRAIN_FILE, TEST_FILE)
)
train.shape, test.shape

((307511, 527), (48744, 526))

In [5]:
# Peek at the first five rows of the training table
train.head(n=5)

Unnamed: 0,sk_id_curr,flag_own_car,flag_own_realty,name_contract_type,flag_cont_mobile,flag_document_10,flag_document_11,flag_document_12,flag_document_13,flag_document_14,...,sk_dpd_pos,name_contract_status_Active,name_contract_status_Amortized debt,name_contract_status_Approved_pos,name_contract_status_Canceled_pos,name_contract_status_Completed_pos,name_contract_status_Demand_pos,name_contract_status_Returned to the store,name_contract_status_Signed_pos,target
0,100002,0,1,0,1,0,0,0,0,0,...,0.0,19.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1
1,100003,0,0,0,1,0,0,0,0,0,...,0.0,26.0,0.0,0.0,0.0,2.0,0.0,0.0,0.0,0
2,100004,1,1,1,1,0,0,0,0,0,...,0.0,3.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0
3,100006,0,1,0,1,0,0,0,0,0,...,0.0,18.0,0.0,0.0,0.0,2.0,0.0,1.0,0.0,0
4,100007,0,1,0,1,0,0,0,0,0,...,0.0,62.0,0.0,0.0,0.0,3.0,0.0,0.0,1.0,0


In [6]:
# Set the labels and the row identifiers aside before they are dropped from the feature frame
y_train = train[TARGET]
x_train_id = train[PK]

In [11]:
# Pull the identifier column out of the test frame (pop returns it and removes it in place)
x_test_id = test.pop(PK)

In [7]:
# Leave only feature columns in train: the identifier and the label are removed in place
train.drop(columns=[PK, TARGET], inplace=True)

Also load engineered features

In [9]:
# Engineered features are read from the same folder as the merged tables, so build
# their paths from DATA_DIR (like TRAIN_FILE / TEST_FILE) instead of hard-coding the folder.
eng_ftr_train = pd.read_csv(os.path.join(DATA_DIR, 'eng_ftrs_train.csv'))
eng_ftr_test = pd.read_csv(os.path.join(DATA_DIR, 'eng_ftrs_test.csv'))

In [12]:
# Put the engineered features side by side with the base features (rows are matched on index)
xtrain_all = pd.concat([train, eng_ftr_train], axis='columns')
xtest_all = pd.concat([test, eng_ftr_test], axis='columns')

xtrain_all.shape, xtest_all.shape

((307511, 886), (48744, 886))

In [13]:
# Names of every candidate feature, in column order
features = list(xtrain_all.columns)

## Feature Selection

    - Choose the top N_FEATURES most correlated features to target (UNIVARIATE)
    - Chi2 test (UNIVARIATE)
    - Recursive Feature Elimination (RFE)
    - Lasso Regularization

### Correlations

In [15]:
# Correlation of each candidate feature with the target, stored as (feature name, correlation) pairs
corrs = [(ftr, xtrain_all[ftr].corr(y_train)) for ftr in features]

# how many pairs there are, plus the most negative and the most positive correlation
(
    len(corrs),
    min(corrs, key=lambda pair: pair[1]),
    max(corrs, key=lambda pair: pair[1]),
)

(886,
 ('ext_source_2', -0.16029475320996664),
 ('nb_probs', 0.10506295395184502))

In [25]:
# Rank by correlation magnitude (sign ignored) and keep the N_FEATURES strongest
top_n = sorted(
    corrs,
    key=lambda pair: abs(pair[1]),
    reverse=True,
)[:N_FEATURES]
corr_ftrs = pd.DataFrame(data=top_n, columns=['ftr', 'correlation'])

In [26]:
# The five features with the largest absolute correlation
corr_ftrs.head(n=5)

Unnamed: 0,ftr,correlation
0,ext_source_2,-0.160295
1,ext_source_3,-0.155892
2,nb_probs,0.105063
3,ext_source_1,-0.098887
4,days_credit,-0.082079


### Chi2

Not entirely sure of the details here.

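Chi2 is a measure of dependence between counts of categorical variables. Not sure how we are using this on numerical variables: scikit-learn's `chi2` simply treats each non-negative feature value as if it were a count/frequency, so on scaled continuous features the score is best read as a heuristic ranking rather than a formal test.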

SelectKBest calculates chi2 statistics for each feature against the target and returns the N_Features most dependent.

Data must be non-negative for chi2, which is why the features are min-max scaled to [0, 1] below.

In [19]:
# Min-max scale every feature so the matrix handed to the selectors below has no negative values
feature_scaler = MinMaxScaler()
x_norm = feature_scaler.fit_transform(xtrain_all)

In [27]:
# Score each feature against the target with chi2 and keep the N_FEATURES best
selector = SelectKBest(score_func=chi2, k=N_FEATURES).fit(x_norm, y_train)
# boolean mask over the columns: True where the feature was kept
selected = selector.get_support()

In [28]:
# One row per feature with its chi2 score, then only the rows the selector kept
chi2_scores = pd.DataFrame({'ftr': features, 'chi2_score': selector.scores_})
chi2_ftrs = chi2_scores.loc[selected]

In [29]:
# Highest chi2 scores first
chi2_ftrs.sort_values(by='chi2_score', ascending=False).head(n=5)

Unnamed: 0,ftr,chi2_score
73,ext_source_2,654.36317
32,reg_city_not_work_city,615.377434
31,reg_city_not_live_city,558.708941
121,name_income_type_Pensioner,538.416471
213,organization_type_XNA,533.222197


### Recursive Feature Elimination (RFE)

Using the min-max-normalized x so the coefficients are on a comparable scale across features.

In [30]:
# Recursive feature elimination with a logistic regression as the ranking model:
# each round refits the model and discards the weakest features until N_FEATURES remain.
# The options are passed by keyword because newer scikit-learn releases no longer
# accept them positionally.
#   step=0.05 -> a fraction of the features is eliminated per round (see the fit log)
#   verbose=1 -> print the "Fitting estimator with ... features." progress lines
selector = RFE(
    LogisticRegression(),
    n_features_to_select=N_FEATURES,
    step=0.05,
    verbose=1,
)
selector.fit(x_norm, y_train)
# boolean mask over the columns: True where RFE kept the feature
selected = selector.get_support()

Fitting estimator with 886 features.
Fitting estimator with 842 features.
Fitting estimator with 798 features.
Fitting estimator with 754 features.
Fitting estimator with 710 features.
Fitting estimator with 666 features.
Fitting estimator with 622 features.
Fitting estimator with 578 features.
Fitting estimator with 534 features.
Fitting estimator with 490 features.
Fitting estimator with 446 features.
Fitting estimator with 402 features.
Fitting estimator with 358 features.
Fitting estimator with 314 features.
Fitting estimator with 270 features.


In [31]:
# Names of the features RFE kept, each tagged with a constant score of 1.
# The placeholder is built as an integer array on purpose: np.ones_like on the array of
# feature names would inherit its string dtype and produce the text '1', which would
# then be mixed with numeric 0s when the frames are merged, filled and scaled later on.
rfe_ftr_names = np.asarray(features)[selected]
rfe_logreg = pd.DataFrame({
    'ftr': rfe_ftr_names,
    'rfe_placeholder': np.ones(len(rfe_ftr_names), dtype=int),
})
rfe_logreg.head()

Unnamed: 0,ftr,rfe_placeholder
0,flag_document_10,1
1,flag_document_13,1
2,flag_document_14,1
3,flag_document_15,1
4,flag_document_16,1


### Lasso (L1)

In [41]:

# L1-penalised logistic regression: features whose coefficient ends up at exactly 0 are dropped later.
# The solver is pinned to liblinear (the one shown in the fitted model's repr) because
# newer scikit-learn versions default to lbfgs, which does not support penalty='l1'.
l1 = LogisticRegression(penalty='l1', C=0.2, solver='liblinear')
l1.fit(x_norm, y_train)


LogisticRegression(C=0.2, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l1', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

In [42]:
# How many coefficients are non-zero under the L1 penalty
l1.coef_[np.absolute(l1.coef_) > 0].shape

(244,)

In [51]:
# Pair every feature with its L1 coefficient, keep the non-zero ones, largest coefficient first
l1_ftrs = pd.DataFrame({'ftr': features, 'l1_coef': l1.coef_[0]})
l1_ftrs = (
    l1_ftrs.loc[l1_ftrs['l1_coef'].abs() > 0]
    .sort_values(by='l1_coef', ascending=False)
)

In [52]:
# The five largest (most positive) L1 coefficients
l1_ftrs.head(n=5)

Unnamed: 0,ftr,l1_coef
43,amt_credit_application,6.398791
352,name_contract_status_Refused_application,4.267294
42,amt_annuity_application,2.968831
451,num_instalment_version_1.0,1.994686
303,cnt_payment,1.833105


## Combine All DFs Together

In [53]:
# Outer-join the four per-method tables on the feature name, so a feature picked by
# any one method gets a row (methods that skipped it show up as NaN)
mrgd = (
    l1_ftrs
    .merge(rfe_logreg, how='outer', on='ftr')
    .merge(chi2_ftrs, how='outer', on='ftr')
    .merge(corr_ftrs, how='outer', on='ftr')
)

In [55]:
# The per-method scores/coefficients get summed, so only their magnitude should count:
# replace the two signed columns with their absolute values
mrgd['l1_coef'] = mrgd['l1_coef'].abs()
mrgd['correlation'] = mrgd['correlation'].abs()

In [56]:
# A feature that a method did not select contributes a score of 0 for that method
mrgd = mrgd.fillna(0)

In [57]:
# Bring the four score columns onto a common [0, 1] range before they are added up
score_cols = ['l1_coef', 'rfe_placeholder', 'chi2_score', 'correlation']
mrgd.loc[:, score_cols] = MinMaxScaler().fit_transform(mrgd.loc[:, score_cols])


In [58]:
# Combined score per feature = row-wise sum of the scaled per-method scores, best first
all_selected_ftrs = (
    mrgd
    .set_index('ftr')
    .sum(axis=1)
    .sort_values(ascending=False)
)
# the 100 features with the highest combined score
overall_topN = all_selected_ftrs.head(100)

In [59]:
# Display the top slice of the combined feature scores (highest first)
overall_topN

ftr
ext_source_2                                    3.219939
ext_source_3                                    2.989627
amt_goods_price_application                     2.297600
nb_probs                                        2.285854
name_income_type_Pensioner                      2.122629
name_contract_status_Refused_application        2.068912
amt_credit_application                          2.046631
occupation_type_Laborers                        2.009241
ext_source_1                                    1.956521
days_birth                                      1.874627
days_employed                                   1.737794
region_rating_client_w_city                     1.628708
days_last_phone_change                          1.603677
occupation_type_Low-skill Laborers              1.581340
code_reject_reason_SCOFR                        1.479183
flag_emp_phone                                  1.465822
amt_annuity_application                         1.462963
num_instalment_version_1.0 

Save Output

In [60]:
# Persist the combined score of every selected feature.
# to_csv does not create missing folders, so make sure the output directory exists first.
os.makedirs('extra', exist_ok=True)
all_selected_ftrs.to_csv('extra/all_selected_ftrs.csv')