In [28]:
import pandas as pd

import numpy as np
import csv
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import ExtraTreesClassifier
from sklearn import ensemble

import matplotlib.pyplot as plt

from sklearn.cross_validation import cross_val_score
from sklearn import metrics


In [2]:
# Columns removed from both the training and the test frame: the row
# identifier plus a fixed set of feature columns, given by feature number.
_dropped_feature_numbers = (8, 23, 25, 31, 36, 37, 46,
                            51, 53, 54, 63, 73, 75, 79, 81,
                            82, 89, 92, 95, 105, 107, 108, 109,
                            110, 116, 117, 118, 119, 123, 124, 128)
common_drop_columns = ['ID'] + ['v%d' % number for number in _dropped_feature_numbers]

Load the training data - store the target separately

In [3]:
# Read the training set and keep the labels in their own array before the
# label column is removed from the feature frame.
train = pd.read_csv("train.csv")
target = train.loc[:, 'target'].values

# Drop the shared exclusion list plus the label column itself.
train_drop_columns = list(common_drop_columns)
train_drop_columns.append('target')
train = train.drop(train_drop_columns, axis=1)
print('Loaded TRAIN data...')

Loaded TRAIN data...


Load the test data 

In [4]:
# Read the test set; the IDs are set aside before the shared exclusion
# list (which contains 'ID') is dropped from the frame.
test = pd.read_csv("test.csv")
id_test = test.loc[:, 'ID'].values

test = test.drop(labels=common_drop_columns, axis=1)
print('Loaded TEST data...')

Loaded TEST data...


In [5]:
def find_delimiter(df, col):
    """Estimate the step size ("delimiter") used to scale a numeric column.

    The non-missing values of the column are sorted and rounded to 8
    decimals, the differences between consecutive values are taken,
    differences of at most 1e-6 are discarded, and the most frequent
    remaining difference is returned. The caller divides the column by this
    value to undo the feature scaling.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame that contains the column.
    col : str
        Label of the column to inspect.

    Returns
    -------
    float
        The most common gap between consecutive sorted values.

    Raises
    ------
    ValueError
        If no gap larger than 1e-6 exists (for example an empty, all-NaN or
        constant column).
    """
    vals = df[col].dropna().sort_values().round(8)

    # Series.diff() yields x[i] - x[i-1], which is exactly what the former
    # pd.rolling_apply(vals, 2, lambda x: x[1] - x[0]) computed; that helper
    # is deprecated and no longer exists in current pandas.
    gaps = vals.diff()
    gaps = gaps[gaps > 0.000001]
    if gaps.empty:
        raise ValueError("no gap larger than 1e-06 found in column %r" % (col,))
    return gaps.value_counts().idxmax()

# Candidate columns for the de-scaling pass. The loop that consumes this
# list skips any name that is not a column of `train`.
num_vars = ['v1', 'v2', 'v4', 'v5', 'v6', 'v7', 'v9', 'v10', 'v11',
            'v12', 'v13', 'v14', 'v15', 'v16', 'v17', 'v18', 'v19', 'v20',
            'v21', 'v26', 'v27', 'v28', 'v29', 'v32', 'v33', 'v34', 'v35', 'v38',
            'v39', 'v40', 'v41', 'v42', 'v43', 'v44', 'v45', 'v48', 'v49', 'v50',
            'v55', 'v57', 'v58', 'v59', 'v60', 'v61', 'v62', 'v64', 'v65', 'v67',
            'v68', 'v69', 'v70', 'v72', 'v76', 'v77', 'v78', 'v80', 'v83', 'v84', 
            'v85', 'v86', 'v87', 'v88', 'v90', 'v93', 'v94', 'v96', 'v97', 'v98', 
            'v99', 'v100', 'v101', 'v102', 'v103', 'v104', 'v106', 'v111', 'v114',
            'v115', 'v120', 'v121', 'v122', 'v126', 'v127', 'v129', 'v130', 'v131']

# Undo the feature scaling column by column. The step estimate is taken from
# train and test stacked together; the stacked frame is built once, before
# either frame is modified.
vs = pd.concat([train, test])
present_columns = [name for name in num_vars if name in train.columns]
for c in present_columns:
    # Snap values that round to zero at 5 decimals to exactly 0.
    for frame in (train, test):
        frame.loc[frame[c].round(5) == 0, c] = 0

    # Multiply by the reciprocal of the estimated step size.
    delimiter = find_delimiter(vs, c)
    scale = 1/delimiter
    train[c] *= scale
    test[c] *= scale


	Series.rolling(center=False,window=2).apply(kwargs=<dict>,args=<tuple>,func=<function>)


In [40]:
def plot_feature_importance(et_model, feature_count):
    """Bar-plot the most important features of a fitted tree ensemble.

    Parameters
    ----------
    et_model : fitted ensemble
        Must expose ``feature_importances_`` and ``estimators_`` (each
        estimator exposing its own ``feature_importances_``).
    feature_count : int
        Number of top-ranked features to draw. Clamped to the number of
        features the model reports.
    """
    importances = et_model.feature_importances_
    # Spread of every feature's importance across the individual trees; it is
    # drawn as the error bar. It has to be computed here: the function
    # previously read an undefined name `std`.
    std = np.std([tree.feature_importances_ for tree in et_model.estimators_],
                 axis=0)

    # Keep only the top `shown` features so that bar positions, heights and
    # error bars all have the same length.
    shown = min(feature_count, len(importances))
    indices = np.argsort(importances)[::-1][:shown]

    plt.figure()
    plt.title("Feature importances")
    plt.bar(range(shown),
            importances[indices],
            color="r",
            yerr=std[indices],
            align="center")

    # Tick labels are the positional feature indices, most important first.
    plt.xticks(range(shown), indices)
    plt.xlim([-1, shown])
    plt.show()

In [51]:
def print_feature_ranking(et_model,df_train):
    """Print the features of a fitted model from most to least important.

    One line is printed per feature: rank, positional feature index, column
    name and importance value.

    Parameters
    ----------
    et_model : fitted model
        Only ``feature_importances_`` is read, so a single tree works as
        well as an ensemble.
    df_train : pandas.DataFrame
        Frame the model was trained on; its column labels are used as the
        feature names and its column count bounds the number of lines.
    """
    importances = et_model.feature_importances_
    ranking = np.argsort(importances)[::-1]
    column_names = df_train.columns.values

    # Print the feature ranking
    print("Feature ranking:")

    for rank, feature_index in enumerate(ranking[:df_train.shape[1]], start=1):
        print("%d. feature %d:%s (%f)" % (rank, feature_index, column_names[feature_index], importances[feature_index]))
        

In [None]:
def print_model_stats(df_model,df_train, y_target):
    """Print cross-validation and fit diagnostics for a classifier.

    Printed, in order: the mean cross-validation score, the model itself,
    the log-loss of the model's probabilities on `df_train`, and a
    true-vs-predicted cross-tabulation (with margins) on `df_train`.

    Parameters
    ----------
    df_model : classifier
        Must already be fitted, because `predict_proba` and `predict` are
        called on it directly.
    df_train : feature matrix passed to the scorer and to the model.
    y_target : labels matching the rows of `df_train`.
    """
    cv_scores = cross_val_score(df_model, df_train, y_target, n_jobs=3)
    print ("Cross Val Score %f" % (cv_scores.mean()))

    train_probabilities = df_model.predict_proba(df_train)
    print(df_model)
    train_log_loss = metrics.log_loss(y_target, train_probabilities)
    print( "\nlog-loss train %f" %(train_log_loss))

    train_labels = df_model.predict(df_train)
    confusion = pd.crosstab(y_target, train_labels,
                            rownames=['True'], colnames=['Predicted'],
                            margins=True)
    print (confusion)

In [6]:
# Make both frames purely numeric and NaN-free. Columns are paired by
# position, so train and test must share the same column order.
# Iterating over the column labels (instead of DataFrame.iteritems(), which
# was removed in pandas 2.0) works on every pandas version.
for train_name, test_name in zip(train.columns, test.columns):
    train_series = train[train_name]
    test_series = test[test_name]
    if train_series.dtype == 'O':
        # Object columns: integer-code the training values, then map the test
        # values through the same categories.
        train[train_name], tmp_indexer = pd.factorize(train_series)
        test[test_name] = tmp_indexer.get_indexer(test_series)
        # Missing values (and test values not present in train) are now -1.
    else:
        # Int or float columns: replace NaN with the -999 sentinel.
        train_missing = train_series.isnull()
        if train_missing.any():
            train.loc[train_missing, train_name] = -999
        # Same for the test frame.
        test_missing = test_series.isnull()
        if test_missing.any():
            test.loc[test_missing, test_name] = -999

In [7]:
# Plain aliases, not copies: X_train / X_test refer to the same DataFrame
# objects as train / test.
X_train = train
X_test = test

In [8]:

print('Training...')

# Small forest (10 trees). A larger variant of this exact configuration
# uses n_estimators=750 and is otherwise identical.
extc = ExtraTreesClassifier(
    n_estimators=10,
    max_features=70,
    criterion='entropy',
    min_samples_split=4,
    max_depth=45,
    min_samples_leaf=1,
    n_jobs=2,
)


Training...


In [10]:
# Fit the classifier on the training features and their labels.
extc.fit(X_train,target) 

ExtraTreesClassifier(bootstrap=False, class_weight=None, criterion='entropy',
           max_depth=45, max_features=70, max_leaf_nodes=None,
           min_samples_leaf=1, min_samples_split=4,
           min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=2,
           oob_score=False, random_state=None, verbose=0, warm_start=False)

In [20]:
# List every training column with its importance, most important first.
print_feature_ranking(extc, X_train)

Feature ranking:
1. feature 42:v50 (0.094886)
2. feature 43:v52 (0.028987)
3. feature 12:v14 (0.028704)
4. feature 54:v66 (0.027600)
5. feature 20:v22 (0.027172)
6. feature 87:v112 (0.027154)
7. feature 94:v125 (0.027039)
8. feature 73:v91 (0.026909)
9. feature 39:v47 (0.026547)
10. feature 89:v114 (0.026178)
11. feature 8:v10 (0.025884)
12. feature 10:v12 (0.025615)
13. feature 45:v56 (0.025522)
14. feature 19:v21 (0.025445)
15. feature 33:v40 (0.025202)
16. feature 29:v34 (0.023708)
17. feature 88:v113 (0.021476)
18. feature 21:v24 (0.020885)
19. feature 26:v30 (0.019314)
20. feature 59:v71 (0.018944)
21. feature 51:v62 (0.017454)
22. feature 60:v72 (0.012019)
23. feature 97:v129 (0.010439)
24. feature 70:v87 (0.007485)
25. feature 78:v98 (0.007289)
26. feature 91:v120 (0.007097)
27. feature 4:v5 (0.006991)
28. feature 0:v1 (0.006919)
29. feature 58:v70 (0.006835)
30. feature 24:v28 (0.006742)
31. feature 99:v131 (0.006690)
32. feature 32:v39 (0.006440)
33. feature 79:v99 (0.006318)


In [17]:
# Plot the feature importances of the forest, requesting one bar for every
# column of X_train.

plot_feature_importance(extc, X_train.shape[1])                          

#Calculate the Cross Validation Score

In [50]:
# Report the cross-validation score, training log-loss and confusion table
# for the fitted forest.
print_model_stats(extc, X_train,target)

Cross Val Score 0.763071
ExtraTreesClassifier(bootstrap=False, class_weight=None, criterion='entropy',
           max_depth=45, max_features=70, max_leaf_nodes=None,
           min_samples_leaf=1, min_samples_split=4,
           min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=2,
           oob_score=False, random_state=None, verbose=0, warm_start=False)

log-loss train 0.026915
Predicted      0      1     All
True                           
0          27294      6   27300
1              2  87019   87021
All        27296  87025  114321


In [53]:
# Loop through a couple of training options: for every
# (n_estimators, criterion) pair, fit one forest per max_features value and
# print its statistics. Rows are processed in the order listed.
sweep = [
    (100, 'entropy', [80, 90]),
    (100, 'gini',    [40, 50, 60, 70, 80, 90]),
    (500, 'entropy', [40, 50, 60, 70, 80, 90]),
    (500, 'gini',    [40, 50, 60, 70, 80, 90]),
]

for tree_count, split_criterion, feature_options in sweep:
    for i in feature_options:
        extc = ExtraTreesClassifier(n_estimators=tree_count,
                                    max_features=i,
                                    criterion=split_criterion,
                                    min_samples_split=4,
                                    max_depth=45,
                                    min_samples_leaf=1,
                                    n_jobs=2)
        extc.fit(X_train, target)
        print_model_stats(extc, X_train, target)

Cross Val Score 0.786111
ExtraTreesClassifier(bootstrap=False, class_weight=None, criterion='entropy',
           max_depth=45, max_features=40, max_leaf_nodes=None,
           min_samples_leaf=1, min_samples_split=4,
           min_weight_fraction_leaf=0.0, n_estimators=100, n_jobs=2,
           oob_score=False, random_state=None, verbose=0, warm_start=False)

log-loss train 0.036256
Predicted      0      1     All
True                           
0          27299      1   27300
1              6  87015   87021
All        27305  87016  114321
Cross Val Score 0.785333
ExtraTreesClassifier(bootstrap=False, class_weight=None, criterion='entropy',
           max_depth=45, max_features=50, max_leaf_nodes=None,
           min_samples_leaf=1, min_samples_split=4,
           min_weight_fraction_leaf=0.0, n_estimators=100, n_jobs=2,
           oob_score=False, random_state=None, verbose=0, warm_start=False)

log-loss train 0.032211
Predicted      0      1     All
True                           


In [60]:
# Try out the submission file-name pattern: <depth>_<features>_<estimators>.
depth, features, estimators = 10, 10, 10

my_file = "extra_trees_entropy_{}_{}_{}.csv".format(depth, features, estimators)
my_file

'extra_trees_entropy_10_10_10.csv'

In [62]:
# For each (max_depth, max_features, n_estimators) setting: fit a forest,
# print its statistics, and write the second predict_proba column for the
# test set to a CSV named after the setting.
submission_settings = [(45, 90, 750), (45, 95, 1000)]

for depth, features, estimators in submission_settings:
    extc = ExtraTreesClassifier(n_estimators=estimators,
                                max_features=features,
                                criterion='entropy',
                                min_samples_split=4,
                                max_depth=depth,
                                min_samples_leaf=1,
                                n_jobs=2)
    extc.fit(X_train, target)
    print_model_stats(extc, X_train, target)

    y_pred = extc.predict_proba(X_test)
    my_file = "extra_trees_entropy_{}_{}_{}.csv".format(depth, features, estimators)
    submission = pd.DataFrame({"ID": id_test, "PredictedProb": y_pred[:,1]})
    submission.to_csv(my_file, index=False)

Cross Val Score 0.785210
ExtraTreesClassifier(bootstrap=False, class_weight=None, criterion='entropy',
           max_depth=45, max_features=90, max_leaf_nodes=None,
           min_samples_leaf=1, min_samples_split=4,
           min_weight_fraction_leaf=0.0, n_estimators=750, n_jobs=2,
           oob_score=False, random_state=None, verbose=0, warm_start=False)

log-loss train 0.024302
Predicted      0      1     All
True                           
0          27299      1   27300
1              5  87016   87021
All        27304  87017  114321
Cross Val Score 0.785298
ExtraTreesClassifier(bootstrap=False, class_weight=None, criterion='entropy',
           max_depth=45, max_features=95, max_leaf_nodes=None,
           min_samples_leaf=1, min_samples_split=4,
           min_weight_fraction_leaf=0.0, n_estimators=1000, n_jobs=2,
           oob_score=False, random_state=None, verbose=0, warm_start=False)

log-loss train 0.023851
Predicted      0      1     All
True                           

In [149]:
# Class probabilities for the test set from whichever model `extc` is
# currently bound to.
y_pred = extc.predict_proba(X_test)


In [103]:
# Write the test IDs with the second probability column as the submission.
# NOTE(review): the file name says 750, but `y_pred` comes from whatever
# `extc` is when the notebook runs; on a top-to-bottom run the preceding
# loop leaves the (45, 95, 1000) model in `extc` - confirm which model this
# file is meant to hold.
pd.DataFrame({"ID": id_test, "PredictedProb": y_pred[:,1]}).to_csv('extra_trees_entropy_750.csv',index=False)



In [102]:
# Summary statistics of the processed test features (sanity check).
X_test.describe()


Unnamed: 0,v1,v2,v3,v4,v5,v6,v7,v9,v10,v11,...,v115,v120,v121,v122,v125,v126,v127,v129,v130,v131
count,114393.0,114393.0,114393.0,114393.0,114393.0,114393.0,114393.0,114393.0,114393.0,114393.0,...,114393.0,114393.0,114393.0,114393.0,114393.0,114393.0,114393.0,114393.0,114393.0,114393.0
mean,900046.4,4154333.0,-0.027764,2290731.0,322.415338,1333313.0,1358637.0,5042254.0,85.256738,8457066.0,...,5768635.0,709223.8,1516701.0,3809128.0,33.223598,857054.0,1789827.0,0.307589,1067879.0,958589.7
std,1118774.0,4256947.0,0.182975,2184290.0,1159.870074,1248859.0,1271173.0,4659935.0,70.154956,7459422.0,...,5263614.0,824863.4,1669347.0,3605492.0,21.666883,829115.3,1975980.0,0.686654,1326276.0,1188139.0
min,-999.0,-999.0,-1.0,-999.0,-999.0,-999.0,-999.0,-999.0,-999.0,-999.0,...,-999.0,-999.0,-999.0,-999.0,-1.0,-999.0,-999.0,0.0,-999.0,-999.0
25%,-999.0,-999.0,0.0,-999.0,-999.0,-999.0,-999.0,-999.0,48.004382,-999.0,...,-999.0,-999.0,-999.0,-999.0,16.0,-999.0,-999.0,0.0,-999.0,-999.0
50%,522436.7,4009687.0,0.0,2692821.0,992.289114,1723894.0,1762169.0,6734066.0,60.00547,14131610.0,...,8299406.0,542547.8,1357467.0,4727809.0,29.0,1146174.0,1493776.0,0.0,831060.3,575400.2
75%,1558519.0,7446121.0,0.0,4251876.0,1327.174212,2409528.0,2452100.0,9233807.0,96.008792,15139560.0,...,10415500.0,1205467.0,2565764.0,6927194.0,47.0,1508591.0,3108213.0,0.0,1654767.0,1680673.0
max,14509800.0,19464050.0,2.0,19607840.0,2973.866491,7080406.0,19417480.0,19801980.0,914.083472,19417480.0,...,19417480.0,19417480.0,13030530.0,19801980.0,89.0,18181820.0,14448150.0,11.0,19536540.0,19607840.0
