In [1]:
from sklearn.cross_validation import StratifiedKFold
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier, GradientBoostingClassifier,ExtraTreesRegressor
from sklearn.linear_model import LogisticRegression
import pandas as pd
import numpy as np

In [3]:
# Reading input train and test data
train = pd.read_csv("train.csv")
test=pd.read_csv("test.csv")


# removing the constant columns
remove = []
for col in train.columns:
    if train[col].std() == 0:
        remove.append(col)
        
train.drop(remove, axis=1, inplace=True)
test.drop(remove, axis=1, inplace=True)

# remove duplicated columns
remove = []
c = train.columns
for i in range(len(c)-1):
    v = train[c[i]].values
    for j in range(i+1,len(c)):
        if np.array_equal(v,train[c[j]].values):
            remove.append(c[j])

train.drop(remove, axis=1, inplace=True)
test.drop(remove, axis=1, inplace=True)


target = train['TARGET'].values
train = train.drop(['ID','TARGET'], axis=1)

id_test = test['ID'].values
test = test.drop(['ID'], axis=1)

# length of dataset
len_train = len(train)
len_test  = len(test)



In [11]:
# feature engineering
len(train.columns)

# Functions,code for feature engineering

def getCounts(x):
    count=0
    for i in x:
        if i==0:
            count=count+1
    return count
#sum of zeros across the instances
train['zeroCounts']=train.apply(getCounts,axis=1)
test['zeroCounts']=test.apply(getCounts,axis=1)


In [14]:
# Training a GBT classifier
from sklearn import metrics
from sklearn import cross_validation
gbt= GradientBoostingClassifier(learning_rate=0.01, subsample=0.8, max_depth=5,max_features=250,n_estimators=160,
                                random_state=1,verbose=1)
scores=cross_validation.cross_val_score(gbt,train,target,cv=5,scoring='roc_auc')
print scores
print("Accuracy: %f (+/- %f)" % (scores.mean(), scores.std() * 2))

      Iter       Train Loss      OOB Improve   Remaining Time 
         1           0.3317           0.0016            2.93m
         2           0.3290           0.0015            2.96m
         3           0.3256           0.0015            2.95m
         4           0.3263           0.0014            2.99m
         5           0.3255           0.0013            3.01m
         6           0.3265           0.0012            2.94m
         7           0.3262           0.0012            2.92m
         8           0.3238           0.0011            2.92m
         9           0.3224           0.0010            2.89m
        10           0.3199           0.0010            2.87m
        20           0.3068           0.0008            2.64m
        30           0.3026           0.0006            2.45m
        40           0.2953           0.0004            2.26m
        50           0.2908           0.0004            2.06m
        60           0.2790           0.0003            1.87m
       

In [4]:
# Selecting startified sampling
nfolds=5
skf = list(StratifiedKFold(target,nfolds))

In [6]:
# Creating the base classifiers for blending
clfs = [RandomForestClassifier(n_estimators=10, n_jobs=-1, criterion='entropy',verbose=1),
        ExtraTreesClassifier(n_estimators=10, n_jobs=-1, criterion='entropy',verbose=1),
        GradientBoostingClassifier(learning_rate=0.05, subsample=0.5, max_depth=6, n_estimators=10,verbose=1)]



In [7]:
print "Creating train and test sets for blending."

dataset_blend_train = np.zeros((train.shape[0], len(clfs)))
dataset_blend_test = np.zeros((test.shape[0], len(clfs)))

Creating train and test sets for blending.


In [19]:
# Check for indexing
#train.iloc[[1,2],]
#target[[1,2]]

Unnamed: 0,var3,var15,imp_ent_var16_ult1,imp_op_var39_comer_ult1,imp_op_var39_comer_ult3,imp_op_var40_comer_ult1,imp_op_var40_comer_ult3,imp_op_var40_efect_ult1,imp_op_var40_efect_ult3,imp_op_var40_ult1,...,saldo_medio_var44_hace2,saldo_medio_var44_hace3,saldo_medio_var44_ult1,saldo_medio_var44_ult3,var38,zeroCounts,binaryCounts_ind,binaryCounts_numvar,binaryCounts_deltaimp,binaryCounts_indothers
1,2,34,0,0,0,0,0,0,0,0,...,0,0,0,0,49278.03,190,5,0,0,0
2,2,23,0,0,0,0,0,0,0,0,...,0,0,0,0,67333.77,194,4,0,0,0


In [20]:
for j, clf in enumerate(clfs):
        print j, clf
        dataset_blend_test_j = np.zeros((test.shape[0], len(skf)))
        for i, (trainIndex, testIndex) in enumerate(skf):
            print "Fold", i
            X_train = train.iloc[trainIndex,]
            y_train = target[trainIndex]
            X_test = train.iloc[testIndex,]
            y_test = target[testIndex]
            clf.fit(X_train, y_train)
            y_submission = clf.predict_proba(X_test)[:,1]
            dataset_blend_train[testIndex, j] = y_submission
            dataset_blend_test_j[:, i] = clf.predict_proba(test)[:,1]
        dataset_blend_test[:,j] = dataset_blend_test_j.mean(1)

[Parallel(n_jobs=-1)]: Done  10 out of  10 | elapsed:    0.8s finished
[Parallel(n_jobs=4)]: Done  10 out of  10 | elapsed:    0.0s finished


0 RandomForestClassifier(bootstrap=True, class_weight=None, criterion='entropy',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=-1,
            oob_score=False, random_state=None, verbose=1,
            warm_start=False)
Fold 0
Fold

[Parallel(n_jobs=4)]: Done  10 out of  10 | elapsed:    0.0s finished
[Parallel(n_jobs=-1)]: Done  10 out of  10 | elapsed:    0.8s finished
[Parallel(n_jobs=4)]: Done  10 out of  10 | elapsed:    0.0s finished
[Parallel(n_jobs=4)]: Done  10 out of  10 | elapsed:    0.0s finished


 1
Fold

[Parallel(n_jobs=-1)]: Done  10 out of  10 | elapsed:    0.8s finished
[Parallel(n_jobs=4)]: Done  10 out of  10 | elapsed:    0.0s finished


 2
Fold

[Parallel(n_jobs=4)]: Done  10 out of  10 | elapsed:    0.0s finished
[Parallel(n_jobs=-1)]: Done  10 out of  10 | elapsed:    1.1s finished
[Parallel(n_jobs=4)]: Done  10 out of  10 | elapsed:    0.0s finished
[Parallel(n_jobs=4)]: Done  10 out of  10 | elapsed:    0.0s finished


 3
Fold

[Parallel(n_jobs=-1)]: Done  10 out of  10 | elapsed:    0.9s finished
[Parallel(n_jobs=4)]: Done  10 out of  10 | elapsed:    0.0s finished


 4
1

[Parallel(n_jobs=4)]: Done  10 out of  10 | elapsed:    0.0s finished
[Parallel(n_jobs=-1)]: Done  10 out of  10 | elapsed:    1.0s finished
[Parallel(n_jobs=4)]: Done  10 out of  10 | elapsed:    0.0s finished
[Parallel(n_jobs=4)]: Done  10 out of  10 | elapsed:    0.0s finished


 ExtraTreesClassifier(bootstrap=False, class_weight=None, criterion='entropy',
           max_depth=None, max_features='auto', max_leaf_nodes=None,
           min_samples_leaf=1, min_samples_split=2,
           min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=-1,
           oob_score=False, random_state=None, verbose=1, warm_start=False)
Fold 0
Fold

[Parallel(n_jobs=-1)]: Done  10 out of  10 | elapsed:    1.1s finished
[Parallel(n_jobs=4)]: Done  10 out of  10 | elapsed:    0.0s finished


 1
Fold

[Parallel(n_jobs=4)]: Done  10 out of  10 | elapsed:    0.1s finished
[Parallel(n_jobs=-1)]: Done  10 out of  10 | elapsed:    1.0s finished
[Parallel(n_jobs=4)]: Done  10 out of  10 | elapsed:    0.0s finished
[Parallel(n_jobs=4)]: Done  10 out of  10 | elapsed:    0.0s finished


 2
Fold

[Parallel(n_jobs=-1)]: Done  10 out of  10 | elapsed:    1.4s finished
[Parallel(n_jobs=4)]: Done  10 out of  10 | elapsed:    0.0s finished


 3
Fold

[Parallel(n_jobs=4)]: Done  10 out of  10 | elapsed:    0.0s finished
[Parallel(n_jobs=-1)]: Done  10 out of  10 | elapsed:    1.0s finished
[Parallel(n_jobs=4)]: Done  10 out of  10 | elapsed:    0.0s finished
[Parallel(n_jobs=4)]: Done  10 out of  10 | elapsed:    0.0s finished


 4
2 GradientBoostingClassifier(init=None, learning_rate=0.05, loss='deviance',
              max_depth=6, max_features=None, max_leaf_nodes=None,
              min_samples_leaf=1, min_samples_split=2,
              min_weight_fraction_leaf=0.0, n_estimators=10,
              presort='auto', random_state=None, subsample=0.5, verbose=1,
              warm_start=False)
Fold 0
      Iter       Train Loss      OOB Improve   Remaining Time 
         1           0.3241           0.0081           15.45s
         2           0.3049           0.0064           13.64s
         3           0.3150           0.0049           12.02s
         4           0.3140           0.0041           10.27s
         5           0.2979           0.0035            8.50s
         6           0.2916           0.0030            6.79s
         7           0.2939           0.0027            5.08s
         8           0.2973           0.0022            3.40s
         9           0.2844           0.0021            1.70s
  

In [22]:
print "Blending with Logistic Regression"
clf = LogisticRegression()
clf.fit(dataset_blend_train, target)
y_submission = clf.predict_proba(dataset_blend_test)[:,1]

Blending with Logistic Regression


In [24]:
pd.Series(y_submission).describe()

count    75818.000000
mean         0.038952
std          0.044357
min          0.022629
25%          0.023377
50%          0.025729
75%          0.031212
max          0.999788
dtype: float64

In [25]:
# cross validating the blending logic
from sklearn import metrics
from sklearn import cross_validation
clf = LogisticRegression()
scores=cross_validation.cross_val_score(clf,dataset_blend_train,target,cv=5,scoring='roc_auc')
print scores
print("Accuracy: %f (+/- %f)" % (scores.mean(), scores.std() * 2))

[ 0.8223573   0.81069738  0.82676894  0.84509262  0.82419362]
Accuracy: 0.825822 (+/- 0.022193)


In [1]:
l=[1,2,3,4]
l[2:]

[3, 4]

In [2]:
k=[0,1]
l[k]

TypeError: list indices must be integers, not list