## Santander

In [3]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt
from sklearn import cross_validation

from sklearn.metrics import roc_auc_score
from collections import defaultdict

training = pd.read_csv("../data/train.csv", index_col=0)
test = pd.read_csv("../data/test.csv", index_col=0)

print(training.shape)
print(test.shape)

X = training.iloc[:,:-1]
y = training.TARGET

from sklearn.feature_selection import SelectPercentile
from sklearn.feature_selection import f_classif,chi2
from sklearn.preprocessing import Binarizer, scale

# First select features based on chi2 and f_classif
p = 50

X_bin = Binarizer().fit_transform(scale(X))
selectChi2 = SelectPercentile(chi2, percentile=p).fit(X_bin, y)
selectF_classif = SelectPercentile(f_classif, percentile=p).fit(X, y)

chi2_selected = selectChi2.get_support()
chi2_selected_features = [ f for i,f in enumerate(X.columns) if chi2_selected[i]]
print('Chi2 selected {} features {}.'.format(chi2_selected.sum(),
   chi2_selected_features))
f_classif_selected = selectF_classif.get_support()
f_classif_selected_features = [ f for i,f in enumerate(X.columns) if f_classif_selected[i]]
print('F_classif selected {} features {}.'.format(f_classif_selected.sum(),
   f_classif_selected_features))
selected = chi2_selected & f_classif_selected
print('Chi2 & F_classif selected {} features'.format(selected.sum()))
features = [ f for f,s in zip(X.columns, selected) if s]
print (features)

X_sel = X[features]

# Check what features help the random forest classifier using a model RFC with 100 trees
from sklearn.ensemble import RandomForestClassifier

rfc = RandomForestClassifier(n_estimators=100, n_jobs=-1,
   criterion='gini', class_weight='balanced')

scores = defaultdict(list)

y = np.array(y.astype(int)).ravel()
#crossvalidate the scores on a number of different random splits of the data
for train_idx, test_idx in cross_validation.StratifiedShuffleSplit(y, n_iter=3, test_size=0.2, random_state=1301):
    print ('Folding')
    X_train, X_test = X_sel[train_idx], X_sel[test_idx]
    Y_train, Y_test = y[train_idx], y[test_idx]
    r = rfc.fit(X_train, Y_train)
    auc = roc_auc_score(Y_test, rfc.predict(X_test))
    for i in range(X_sel.shape[1]):
        X_t = X_test.copy()
        np.random.shuffle(X_t[:, i])
        shuff_auc = roc_auc_score(Y_test, rfc.predict(X_t))
        scores[features[i]].append((auc-shuff_auc)/auc)
print ("Features sorted by their score:")
print (sorted([(round(np.mean(score), 4), feat) for
              feat, score in scores.items()], reverse=True))
              
features = [feat for feat, score in scores.items() if np.mean(score) > 0.0]

print('Selected features by crossvalidation with RFC with 100 trees:')
print(features)
X_sel_s = X_sel[features]

rfc = RandomForestClassifier(n_estimators=1000, random_state=1301, n_jobs=-1,
   criterion='gini', class_weight='balanced')
rfc.fit(X_sel_s, y)

print('RFC with 1000 trees fitted, now predict on test values')

sel_test = test[features]    
y_pred = rfc.predict_proba(sel_test)

submission = pd.DataFrame({"ID":test.index, "TARGET":y_pred[:,1]})
submission.to_csv("submission_rfc.csv", index=False)

(76020, 370)
(75818, 369)


 189 192 220 222 234 238 244 248 261 262 303 307 315 319 327 349] are constant.


Chi2 selected 184 features ['var15', 'imp_op_var39_comer_ult1', 'imp_op_var40_efect_ult1', 'imp_op_var40_efect_ult3', 'imp_op_var40_ult1', 'imp_op_var41_comer_ult1', 'imp_op_var41_efect_ult1', 'imp_op_var41_efect_ult3', 'imp_op_var39_efect_ult1', 'imp_op_var39_efect_ult3', 'imp_sal_var16_ult1', 'ind_var1', 'ind_var5_0', 'ind_var5', 'ind_var8_0', 'ind_var8', 'ind_var12_0', 'ind_var12', 'ind_var13_0', 'ind_var13_corto_0', 'ind_var13_corto', 'ind_var13_largo_0', 'ind_var13_largo', 'ind_var13', 'ind_var14_0', 'ind_var14', 'ind_var17_0', 'ind_var17', 'ind_var19', 'ind_var20_0', 'ind_var20', 'ind_var24_0', 'ind_var24', 'ind_var25_cte', 'ind_var26_0', 'ind_var26_cte', 'ind_var26', 'ind_var25_0', 'ind_var25', 'ind_var30', 'ind_var31_0', 'ind_var31', 'ind_var33_0', 'ind_var33', 'ind_var39_0', 'ind_var40', 'ind_var41_0', 'ind_var39', 'ind_var44_0', 'ind_var44', 'num_var1', 'num_var4', 'num_var5_0', 'num_var5', 'num_var8_0', 'num_var8', 'num_var12_0', 'num_var12', 'num_var13_0', 'num_var13_corto_

IndexError: indices are out-of-bounds