In [14]:
import pandas as pd
import numpy as np
import csv
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import ExtraTreesClassifier
from sklearn import ensemble
from sklearn import preprocessing
from sklearn import cross_validation

def spreadshimaltodecimal(n):
    """Interpret a string "A", "B",.., "Z", "AA", as semi hexavigesimal (base-26) notation and return the value.
    This is how a typical spreadsheet counts the rows. e.g. A=1, B=2, ..., Z=26, AA=27, ..., AZ=52, BA=53, ..., ZZ=702, AAA=703, etc.
    Observe that this is not a proper base system, since that would require 26 to be written with two digits (e.g. as A0).
    https://github.com/macfreek/puzzle-code/blob/master/base26.py
    """
    try:
        alphabet = "ABCDEFGHIJKLMNOPQRSTUVWXYZ"
        s = 1
        if n[0] == '-':
            n = n[1:]
            s = -1
        d = 0
        while n != "":
            d = 26*d + alphabet.index(n[0]) + 1
            n = n[1:]
        return s*d
    except TypeError:
        raise ValueError("invalid literal for spreadshimaltodecimal() with (semi) base 26: %r" % n)

print('Load data...')
train = pd.read_csv("C://users//Robert//Downloads//train.csv")
target = train['target'].values
train = train.drop(['ID','target','v8','v23','v25','v31','v36','v37','v46','v51','v53','v54','v63','v73','v75','v81',
                    'v82','v89','v92','v95','v105','v107','v108','v109','v110','v116','v117','v118','v119','v123','v124','v128'],axis=1)
test = pd.read_csv("C://users//Robert//Downloads//test.csv")
id_test = test['ID'].values
test = test.drop(['ID','v8','v23','v25','v31','v36','v37','v46','v51','v53','v54','v63','v73','v75','v81','v82','v89','v92','v95','v105','v107','v108','v109','v110','v116','v117','v118','v119','v123','v124','v128'],axis=1)
#make a big happy dataset to preprocess
train_rows=train.shape[0]
test_rows=test.shape[0]
columns=train.columns
all_data = pd.concat([train, test], ignore_index=True)
all_data['NA_Count']=all_data.count(axis=1)

for col in list(all_data.columns.values):
    label=col+'pop'
    all_data[label]=all_data[col].notnull().astype(int)

#deal with v22
all_data['v22']=all_data['v22'].fillna('ZZZZ')
all_data['v22']=all_data['v22'].apply(spreadshimaltodecimal)
all_data.loc[all_data['v22']==475254,'v22']=-999*1171 #for feature expansion
all_data['v22']=all_data['v22']/1171
#count NAs


#start some one-hot encoding!
#first turn categories into factors  naturally one of these factors will be the NANs
colnames=[]
for f in all_data.columns:
    if all_data[f].dtype == 'object':
        colnames.append(all_data[f].name) #grab a list of cols on the way through
        lbl = preprocessing.LabelEncoder()
        lbl.fit(np.unique(list(all_data[f].values)))
        all_data[f]= lbl.transform(list(all_data[f].values))
#now one-hot encode
"""enc = preprocessing.OneHotEncoder()
enc.fit(all_data[colnames])
a=enc.transform(all_data[colnames]).toarray()
#make a dataframe and merge it to the original
new=pd.DataFrame(a)
newcolnames=list(new.columns.values) 
renamedcols=[]
for i in newcolnames:
    i="enc"+i.astype(str)
    renamedcols.append(i)
new.columns=renamedcols
all_data=pd.merge(all_data, new, how='inner', on=None, left_on=None, right_on=None,
      left_index=True, right_index=True, sort=True)
#drop the categorical columns name as they are now in the 'enc' columns
all_data=all_data.drop(colnames,axis=1)
"""
# custom ohe for just a few variables
c_var=['v3','v24','v74','v30','v66','v30','v91','v47','v71','v52','v112']
enc = preprocessing.OneHotEncoder()
enc.fit(all_data[c_var])
a=enc.transform(all_data[c_var]).toarray()
#make a dataframe and merge it to the original
new=pd.DataFrame(a)
newcolnames=list(new.columns.values) 
renamedcols=[]
for i in newcolnames:
    i="enc"+i.astype(str)
    renamedcols.append(i)
new.columns=renamedcols
all_data=pd.merge(all_data, new, how='inner', on=None, left_on=None, right_on=None,
      left_index=True, right_index=True, sort=True)
for item in columns:
    labelsd='sd'+item
    labelmd='meandiff^'+item
    labelmoddif='modediff^'+item
    all_data[labelsd]=(all_data[item]-all_data[item].mean())/all_data[item].std()
    all_data[labelmd]=(all_data[item]-all_data[item].mean())
    all_data[labelmd]=(all_data[item]-all_data[item].mode().iloc[0])
#all_data=all_data.round(decimals=4, out=None)
all_data['v66*v66']=all_data.v66*all_data.v66
all_data['v18*v38']=all_data.v18*all_data.v38
all_data['v50*v62']=all_data.v50*all_data.v62  
all_data['v38*v99']=all_data.v38*all_data.v38  
all_data['v50*v72']=all_data.v50*all_data.v72  
all_data['v40*v66']=all_data.v40*all_data.v66  
all_data['v19*v39']=all_data.v19*all_data.v39  
all_data['v19*v38']=all_data.v19*all_data.v38  
all_data['v1*v19']=all_data.v1*all_data.v19  
all_data['v1*v38']=all_data.v1*all_data.v38  
all_data['v47*v66']=all_data.v47*all_data.v66  
all_data['v24*v50']=all_data.v24*all_data.v50  
all_data['v50*v66']=all_data.v50*all_data.v66  
all_data['v12*v50']=all_data.v12*all_data.v50  
all_data['v28*v38']=all_data.v28*all_data.v38  
all_data['v38*v84']=all_data.v38*all_data.v84  
all_data['v10*v66']=all_data.v10*all_data.v66  
all_data['v38*v80']=all_data.v38*all_data.v80  
all_data['v38*v47']=all_data.v38*all_data.v47  
all_data['v34*v66']=all_data.v34*all_data.v66  
all_data['v24*v24']=all_data.v24*all_data.v24  
all_data['v10*v38']=all_data.v10*all_data.v38  
all_data['v62*v66']=all_data.v62*all_data.v66  
all_data['v103*v57']=all_data.v103*all_data.v57 
all_data['v24*v66']=all_data.v24*all_data.v66  
all_data['v66*v72']=all_data.v66*all_data.v72  
all_data['v38*v90']=all_data.v38*all_data.v90 
all_data['v38*v50']=all_data.v38*all_data.v50  
all_data['v21*v50']=all_data.v21*all_data.v50  
all_data['v38*v86']=all_data.v38*all_data.v86  
all_data['v19*v99']=all_data.v19*all_data.v99  
all_data['v38*v62']=all_data.v38*all_data.v62  
all_data['v19*v86']=all_data.v19*all_data.v86  
all_data['v62*v72']=all_data.v62*all_data.v72 
all_data['v62*v62']=all_data.v62*all_data.v62  
all_data['v24*v47']=all_data.v24*all_data.v47
all_data['v19*v60']=all_data.v19*all_data.v60  
all_data['v50*v71']=all_data.v50*all_data.v71  
all_data['v33*v38']=all_data.v33*all_data.v38  
all_data['enc5*v2']=all_data.enc5*all_data.v2  
all_data['v30*v50']=all_data.v30*all_data.v50  
all_data['v34*v50']=all_data.v34*all_data.v50  
all_data['v102*v129']=all_data.v102*all_data.v129
all_data['v19*v80']=all_data.v19*all_data.v80  
all_data['v43*v66']=all_data.v43*all_data.v66  
all_data['v10*v50']=all_data.v10*all_data.v50  
all_data['v10*v90']=all_data.v10*all_data.v90  
all_data['v19*v27']=all_data.v19*all_data.v27  
all_data['enc4*v94']=all_data.enc4*all_data.v94 
all_data['v6*v66']=all_data.v6*all_data.v66   
all_data['v38*v55']=all_data.v38*all_data.v55  
all_data['v38*v58']=all_data.v38*all_data.v58  
all_data['v19*v57']=all_data.v19*all_data.v57  

all_data=all_data.fillna(-999)
train=all_data.iloc[0:train_rows,:]
test=all_data.iloc[train_rows:,:]
test.reset_index(drop=True)

X_test = test
print('Training...')
extc = ExtraTreesClassifier(n_estimators=1000,max_features= 50,criterion= 'entropy',min_samples_split= 4,
                            max_depth= 30, min_samples_leaf= 2, n_jobs = 4, verbose=1)      


#1000,100,4,70,2 ~ 0.0012161270558714169 -0.46096224737368985
#800,100,4,70,2  ~0.00130136064571 -0.461044343811
#1500, 100,4,70,2 ~0.000997310588987 -0.460980594955
#X_train, X_test, y_train, y_test = cross_validation.train_test_split(train, target, test_size=0.4, random_state=0)
extc.fit(train,target)
print("CV")
scores0 = cross_validation.cross_val_score(extc, train, target, cv=3,scoring='log_loss')
print(scores0)


#print('Predict...')
#y_pred = extc.predict_proba(X_test)
#print y_pred

pd.DataFrame({"ID": id_test, "PredictedProb": y_pred[:,1]}).to_csv('extra_trees9.csv',index=False)



Load data...
Training...

[Parallel(n_jobs=4)]: Done  42 tasks      | elapsed:   28.5s
[Parallel(n_jobs=4)]: Done 192 tasks      | elapsed:  2.4min
[Parallel(n_jobs=4)]: Done 442 tasks      | elapsed:  5.3min
[Parallel(n_jobs=4)]: Done 792 tasks      | elapsed:  9.6min
[Parallel(n_jobs=4)]: Done 1000 out of 1000 | elapsed: 12.0min finished



CV

[Parallel(n_jobs=4)]: Done  42 tasks      | elapsed:   15.9s
[Parallel(n_jobs=4)]: Done 192 tasks      | elapsed:  1.2min
[Parallel(n_jobs=4)]: Done 442 tasks      | elapsed:  2.8min
[Parallel(n_jobs=4)]: Done 792 tasks      | elapsed:  4.9min
[Parallel(n_jobs=4)]: Done 1000 out of 1000 | elapsed:  6.1min finished
[Parallel(n_jobs=4)]: Done  42 tasks      | elapsed:    0.2s
[Parallel(n_jobs=4)]: Done 192 tasks      | elapsed:    1.0s
[Parallel(n_jobs=4)]: Done 442 tasks      | elapsed:    2.2s
[Parallel(n_jobs=4)]: Done 792 tasks      | elapsed:    4.1s
[Parallel(n_jobs=4)]: Done 1000 out of 1000 | elapsed:    5.3s finished
[Parallel(n_jobs=4)]: Done  42 tasks      | elapsed:   15.3s
[Parallel(n_jobs=4)]: Done 192 tasks      | elapsed:  1.1min
[Parallel(n_jobs=4)]: Done 442 tasks      | elapsed:  2.6min
[Parallel(n_jobs=4)]: Done 792 tasks      | elapsed:  4.6min
[Parallel(n_jobs=4)]: Done 1000 out of 1000 | elapsed:  5.8min finished
[Parallel(n_jobs=4)]: Done  42 tasks      | elapsed:


[-0.45987714 -0.4597389  -0.45782838]


In [13]:
import matplotlib.pyplot as plt

#extc.fit(train,target)

print(scores0)
importances = extc.feature_importances_
std = np.std([tree.feature_importances_ for tree in extc.estimators_],
             axis=0)
indices = np.argsort(importances)[::-1]

# Print the feature ranking
cols=list(train.columns.values)

print("Feature ranking:")
feature_names=[]
for f in range(train.shape[1]):
    print("%d. %s (%f) (%f) (%s)" % (f + 1, cols[indices[f]], importances[indices[f]], std[indices[f]], train.iloc[:,indices[f]].dtype))
    feature_names.append([cols[indices[f]], std[indices[f]]-importances[indices[f]]])
# Plot the feature importances of the forest
plt.figure()
plt.title("Feature importances")
plt.bar(range(train.shape[1]), importances[indices],
       color="r", yerr=std[indices], align="center")
plt.xticks(range(train.shape[1]), indices)
plt.xlim([-1, train.shape[1]])
plt.show()
#52 keep as int
#56 encode as hexasig
#125 encode as hexa
#91 encode
#112 encode
#ohe v3, 74

[-0.46226562 -0.46134567 -0.46026707]
Feature ranking:
1. meandiff^v50 (0.025008) (0.005244) (float64)
2. v50 (0.024803) (0.004859) (float64)
3. sdv50 (0.024748) (0.005242) (float64)
4. meandiff^v10 (0.008679) (0.001412) (float64)
5. v10 (0.008649) (0.001467) (float64)
6. sdv10 (0.008613) (0.001311) (float64)
7. meandiff^v56 (0.008509) (0.002739) (int64)
8. sdv12 (0.008469) (0.001247) (float64)
9. sdv56 (0.008461) (0.002689) (float64)
10. meandiff^v12 (0.008457) (0.001182) (float64)
11. v12 (0.008447) (0.001155) (float64)
12. v56 (0.008210) (0.002254) (int64)
13. v125 (0.007893) (0.000680) (int64)
14. meandiff^v125 (0.007876) (0.000701) (int64)
15. sdv125 (0.007852) (0.000682) (float64)
16. enc41 (0.007834) (0.010661) (float64)
17. sdv114 (0.007814) (0.001288) (float64)
18. v114 (0.007785) (0.001096) (float64)
19. v22 (0.007778) (0.000706) (float64)
20. meandiff^v22 (0.007771) (0.000712) (float64)
21. meandiff^v114 (0.007742) (0.001023) (float64)
22. v40 (0.007742) (0.000837) (float64)

In [15]:
y_pred = extc.predict_proba(X_test)
pd.DataFrame({"ID": id_test, "PredictedProb": y_pred[:,1]}).to_csv('extra_trees0.csv',index=False)

[Parallel(n_jobs=4)]: Done  42 tasks      | elapsed:    0.8s
[Parallel(n_jobs=4)]: Done 192 tasks      | elapsed:    3.7s
[Parallel(n_jobs=4)]: Done 442 tasks      | elapsed:    8.9s
[Parallel(n_jobs=4)]: Done 792 tasks      | elapsed:   16.1s
[Parallel(n_jobs=4)]: Done 1000 out of 1000 | elapsed:   20.8s finished


In [10]:
all_data.head()

Unnamed: 0,v1,v2,v3,v4,v5,v6,v7,v9,v10,v11,...,sdv126,meandiff^v126,sdv127,meandiff^v127,sdv129,meandiff^v129,sdv130,meandiff^v130,sdv131,meandiff^v131
0,1.335739,8.727474,2,3.921026,7.915266,2.599278,3.176895,9.999999,0.503281,16.434108,...,0.26133,0.697304,-0.076985,1.479135,-0.447655,0,-1.019737,-0.363636,0.986142,1.948052
1,,,2,,9.191265,,,,1.31291,,...,,,,,-0.447655,0,,,,
2,0.943877,5.310079,2,4.410969,5.326159,3.979592,3.928571,12.666667,0.765864,14.756098,...,0.200904,0.666888,0.420949,2.287609,2.451045,2,-0.825091,-0.116883,-0.49456,0.26738
3,0.797415,8.304757,2,4.22593,11.627438,2.0977,1.987549,8.965516,6.542669,16.347483,...,-0.511257,0.308409,-0.175121,1.319797,1.001695,1,-0.198768,0.677107,-0.619655,0.125391
4,,,2,,,,,,1.050328,,...,,,,,-0.447655,0,,,,


-999.0

In [1]:
import pandas as pd
import numpy as np
import csv
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import ExtraTreesClassifier
from sklearn import ensemble
from sklearn import preprocessing
from sklearn import cross_validation

def spreadshimaltodecimal(n):
    """Interpret a string "A", "B",.., "Z", "AA", as semi hexavigesimal (base-26) notation and return the value.
    This is how a typical spreadsheet counts the rows. e.g. A=1, B=2, ..., Z=26, AA=27, ..., AZ=52, BA=53, ..., ZZ=702, AAA=703, etc.
    Observe that this is not a proper base system, since that would require 26 to be written with two digits (e.g. as A0).
    https://github.com/macfreek/puzzle-code/blob/master/base26.py
    """
    try:
        alphabet = "ABCDEFGHIJKLMNOPQRSTUVWXYZ"
        s = 1
        if n[0] == '-':
            n = n[1:]
            s = -1
        d = 0
        while n != "":
            d = 26*d + alphabet.index(n[0]) + 1
            n = n[1:]
        return s*d
    except TypeError:
        raise ValueError("invalid literal for spreadshimaltodecimal() with (semi) base 26: %r" % n)

print('Load data...')
train = pd.read_csv("C://users//Robert//Downloads//train.csv")
target = train['target'].values
train = train.drop(['v8','v23','v25','v31','v36','v37','v46','v51','v53','v54','v63','v73','v75','v79','v81',
                    'v82','v89','v92','v95','v105','v107','v108','v109','v110','v116','v117','v118','v119','v123','v124','v128'],axis=1)
test = pd.read_csv("C://users//Robert//Downloads//test.csv")
id_test = test['ID'].values
test = test.drop(['v8','v23','v25','v31','v36','v37','v46','v51','v53','v54','v63','v73','v75','v79','v81','v82','v89','v92','v95','v105','v107','v108','v109','v110','v116','v117','v118','v119','v123','v124','v128'],axis=1)
#make a big happy dataset to preprocess
train_rows=train.shape[0]
test_rows=test.shape[0]
all_data = pd.concat([train, test], ignore_index=True)

#deal with v22
all_data['v22']=all_data['v22'].fillna('ZZZZ')
all_data['v22']=all_data['v22'].apply(spreadshimaltodecimal)
all_data.loc[all_data['v22']==475254,'v22']=-999 #for feature expansion
#count NAs
all_data['NA_Count']=all_data.count(axis=1)
#encode each column for NA
#for col in list(all_data.columns.values):
#label=col+'pop'
#    all_data[label]=all_data[col].notnull().astype(int)
#start some one-hot encoding!
#first turn categories into factors  naturally one of these factors will be the NANs
colnames=[]
for f in all_data.columns:
    if all_data[f].dtype == 'object':
        colnames.append(all_data[f].name) #grab a list of cols on the way through
        lbl = preprocessing.LabelEncoder()
        lbl.fit(np.unique(list(all_data[f].values)))
        all_data[f]= lbl.transform(list(all_data[f].values))
#now one-hot encode
"""enc = preprocessing.OneHotEncoder()
enc.fit(all_data[colnames])
a=enc.transform(all_data[colnames]).toarray()
#make a dataframe and merge it to the original
new=pd.DataFrame(a)
newcolnames=list(new.columns.values) 
renamedcols=[]
for i in newcolnames:
    i="enc"+i.astype(str)
    renamedcols.append(i)
new.columns=renamedcols
all_data=pd.merge(all_data, new, how='inner', on=None, left_on=None, right_on=None,
      left_index=True, right_index=True, sort=True)
#drop the categorical columns name as they are now in the 'enc' columns
all_data=all_data.drop(colnames,axis=1)
"""
# custom ohe for just a few variables
c_var=['v3','v24','v74','v30','v66','v30','v91','v47','v71','v52','v112']
enc = preprocessing.OneHotEncoder()
enc.fit(all_data[c_var])
a=enc.transform(all_data[c_var]).toarray()
#make a dataframe and merge it to the original
new=pd.DataFrame(a)
newcolnames=list(new.columns.values) 
renamedcols=[]
for i in newcolnames:
    i="enc"+i.astype(str)
    renamedcols.append(i)
new.columns=renamedcols
all_data=pd.merge(all_data, new, how='inner', on=None, left_on=None, right_on=None,
      left_index=True, right_index=True, sort=True)
colnames=['v10','v12','v56','v40','v14','v22','v114','v125','v34','v62','v113','v21','v52','v112','v91','v72','v129','v47']
index=0
for item in colnames:
    for item2 in colnames[index:]:
        label=item+'*'+item2
        label2=item+'/'+item2
        label3=item+'+'+item2
        label4=item+'-'+item2
        if (item!=item2) and (item!='v50') and (item2!='v50'):
            all_data[label]=all_data[item]*all_data[item2]
            all_data[label2]=all_data[item]/(all_data[item2]+1e-7)
            all_data[label3]=all_data[item]+all_data[item2]
            all_data[label4]=all_data[item]-all_data[item2]
            print(label)
    index=index+1
#drop the categorical columns name as they are now in the 'enc' columns
#all_data=all_data.drop(c_var,axis=1)

"""#feature engineer
v50thing=list(all_data.columns.values)
for i in v50thing:
    label='v50+'+i
    all_data[label]=all_data.v50+10*all_data[i]

all_data['v66*v66']=all_data.v66*all_data.v66
all_data['v18*v38']=all_data.v18*all_data.v38
all_data['v50*v62']=all_data.v50*all_data.v62  
all_data['v38*v99']=all_data.v38*all_data.v38  
all_data['v50*v72']=all_data.v50*all_data.v72  
all_data['v40*v66']=all_data.v40*all_data.v66  
all_data['v19*v39']=all_data.v19*all_data.v39  
all_data['v19*v38']=all_data.v19*all_data.v38  
all_data['v1*v19']=all_data.v1*all_data.v19  
all_data['v1*v38']=all_data.v1*all_data.v38  
all_data['v47*v66']=all_data.v47*all_data.v66  
all_data['v24*v50']=all_data.v24*all_data.v50  
all_data['v50*v66']=all_data.v50*all_data.v66  
all_data['v12*v50']=all_data.v12*all_data.v50  
all_data['v28*v38']=all_data.v28*all_data.v38  
all_data['v38*v84']=all_data.v38*all_data.v84  
all_data['v10*v66']=all_data.v10*all_data.v66  
all_data['v38*v80']=all_data.v38*all_data.v80  
all_data['v38*v47']=all_data.v38*all_data.v47  
all_data['v34*v66']=all_data.v34*all_data.v66  
all_data['v24*v24']=all_data.v24*all_data.v24  
all_data['v10*v38']=all_data.v10*all_data.v38  
all_data['v62*v66']=all_data.v62*all_data.v66  
all_data['v103*v57']=all_data.v103*all_data.v57 
all_data['v24*v66']=all_data.v24*all_data.v66  
all_data['v66*v72']=all_data.v66*all_data.v72  
all_data['v38*v90']=all_data.v38*all_data.v90 
all_data['v38*v50']=all_data.v38*all_data.v50  
all_data['v21*v50']=all_data.v21*all_data.v50  
all_data['v38*v86']=all_data.v38*all_data.v86  
all_data['v19*v99']=all_data.v19*all_data.v99  
all_data['v38*v62']=all_data.v38*all_data.v62  
all_data['v19*v86']=all_data.v19*all_data.v86  
all_data['v62*v72']=all_data.v62*all_data.v72 
all_data['v62*v62']=all_data.v62*all_data.v62  
all_data['v24*v47']=all_data.v24*all_data.v47
all_data['v19*v60']=all_data.v19*all_data.v60  
all_data['v50*v71']=all_data.v50*all_data.v71  
all_data['v33*v38']=all_data.v33*all_data.v38  
all_data['enc5*v2']=all_data.enc5*all_data.v2  
all_data['v30*v50']=all_data.v30*all_data.v50  
all_data['v34*v50']=all_data.v34*all_data.v50  
all_data['v102*v129']=all_data.v102*all_data.v129
all_data['v19*v80']=all_data.v19*all_data.v80  
all_data['v43*v66']=all_data.v43*all_data.v66  
all_data['v10*v50']=all_data.v10*all_data.v50  
all_data['v10*v90']=all_data.v10*all_data.v90  
all_data['v19*v27']=all_data.v19*all_data.v27  
all_data['enc4*v94']=all_data.enc4*all_data.v94 
all_data['v6*v66']=all_data.v6*all_data.v66   
all_data['v38*v55']=all_data.v38*all_data.v55  
all_data['v38*v58']=all_data.v38*all_data.v58  
all_data['v19*v57']=all_data.v19*all_data.v57  
all_data['v56pop']=all_data['v56'].notnull().astype(int)
all_data['v30pop']=all_data['v30'].notnull().astype(int)
all_data['v115pop']=all_data['v115'].notnull().astype(int)
#v56,30 115
#fill up remaining NANs
all_data=all_data.fillna(-999)

#add a random variable for feature selection
#all_data['rnd']=20*np.random.random(228714)
#split back into train and test sets.
"""
#all_data=all_data.round(decimals=4, out=None)
#all_data=all_data.fillna(-999)
train=all_data.iloc[0:train_rows,:]
test=all_data.iloc[train_rows:,:]
test.reset_index(drop=True)


Load data...
v10*v12
v10*v56
v10*v40
v10*v14
v10*v22
v10*v114
v10*v125
v10*v34
v10*v62
v10*v113
v10*v21
v10*v52
v10*v112
v10*v91
v10*v72
v10*v129
v10*v47
v12*v56
v12*v40
v12*v14
v12*v22
v12*v114
v12*v125
v12*v34
v12*v62
v12*v113
v12*v21
v12*v52
v12*v112
v12*v91
v12*v72
v12*v129
v12*v47
v56*v40
v56*v14
v56*v22
v56*v114
v56*v125
v56*v34
v56*v62
v56*v113
v56*v21
v56*v52
v56*v112
v56*v91
v56*v72
v56*v129
v56*v47
v40*v14
v40*v22
v40*v114
v40*v125
v40*v34
v40*v62
v40*v113
v40*v21
v40*v52
v40*v112
v40*v91
v40*v72
v40*v129
v40*v47
v14*v22
v14*v114
v14*v125
v14*v34
v14*v62
v14*v113
v14*v21
v14*v52
v14*v112
v14*v91
v14*v72
v14*v129
v14*v47
v22*v114
v22*v125
v22*v34
v22*v62
v22*v113
v22*v21
v22*v52
v22*v112
v22*v91
v22*v72
v22*v129
v22*v47
v114*v125
v114*v34
v114*v62
v114*v113
v114*v21
v114*v52
v114*v112
v114*v91
v114*v72
v114*v129
v114*v47
v125*v34
v125*v62
v125*v113
v125*v21
v125*v52
v125*v112
v125*v91
v125*v72
v125*v129
v125*v47
v34*v62
v34*v113
v34*v21
v34*v52
v34*v112
v34*v91
v34*v72
v34*v12

Unnamed: 0,ID,target,v1,v10,v100,v101,v102,v103,v104,v106,...,v72+v129,v72-v129,v72*v47,v72/v47,v72+v47,v72-v47,v129*v47,v129/v47,v129+v47,v129-v47
0,0,,1.375465e+00,1.312911,1.970803e+01,4.186787,1.873945,4.129022,1.701894,11.688117,...,1,1,9,0.111111,10,-8,0,0.0,9,-9
1,1,,,1.291029,,,,,,,...,5,5,10,2.500000,7,3,0,0.0,2,-2
2,2,,-4.903407e-07,1.575492,1.980000e+01,6.535555,0.762963,4.917534,1.981859,11.747048,...,1,1,8,0.125000,9,-7,0,0.0,8,-8
3,7,,2.661870e+00,1.575493,1.161434e-01,3.627655,,7.486234,4.313037,5.957084,...,1,1,8,0.125000,9,-7,0,0.0,8,-8
4,10,,1.252822e+00,1.050328,1.922770e+01,6.606787,1.493882,4.929004,1.906923,14.666965,...,2,2,16,0.250000,10,-6,0,0.0,8,-8
5,11,,1.733601e+00,0.656456,1.895062e+01,5.467954,1.820259,5.440665,2.261760,9.968654,...,1,1,6,0.166667,7,-5,0,0.0,6,-6
6,13,,,2.078774,,,,,,,...,3,3,27,0.333333,12,-6,0,0.0,9,-9
7,14,,2.027902e+00,0.262583,7.950963e+00,10.022851,2.668494,5.106525,2.987106,13.349134,...,1,1,8,0.125000,9,-7,0,0.0,8,-8
8,15,,1.296225e+00,5.229759,9.214744e+00,8.412875,2.208578,4.126977,2.690052,11.206439,...,3,1,4,1.000000,4,0,2,0.5,3,-1
9,16,,,1.050329,,,,,,,...,1,1,5,0.200000,6,-4,0,0.0,5,-5


In [7]:
for item in columns:
    item2='v66'
    label=item+'*'+item2
    label2=item+'/'+item2
    label3=item+'+'+item2
    label4=item+'-'+item2
    all_data[label]=all_data[item]*all_data[item2]
    if (item!=item2) and (item!='v50'):
        all_data[label2]=all_data[item]/(all_data[item2])
        all_data[label3]=all_data[item]+all_data[item2]
        all_data[label4]=all_data[item]-all_data[item2]

all_data=all_data.replace(np.inf, np.nan)

#all_data=all_data.round(decimals=4, out=None)
all_data=all_data.fillna(-999)
train=all_data.iloc[0:train_rows,:]
test=all_data.iloc[train_rows:,:]
test.reset_index(drop=True)

X_test = test
print('Training...')
extc = ExtraTreesClassifier(n_estimators=1000,max_features= 50,criterion= 'entropy',min_samples_split= 4,
                            max_depth= 30, min_samples_leaf= 2, n_jobs = 4, verbose=1)      


#1000,100,4,70,2 ~ 0.0012161270558714169 -0.46096224737368985
#800,100,4,70,2  ~0.00130136064571 -0.461044343811
#1500, 100,4,70,2 ~0.000997310588987 -0.460980594955
#X_train, X_test, y_train, y_test = cross_validation.train_test_split(train, target, test_size=0.4, random_state=0)
extc.fit(train,target)
print("CV")
scores0 = cross_validation.cross_val_score(extc, train, target, cv=3,scoring='log_loss')
print(scores0)


#print('Predict...')
#y_pred = extc.predict_proba(X_test)
#print y_pred

#pd.DataFrame({"ID": id_test, "PredictedProb": y_pred[:,1]}).to_csv('extra_trees4.csv',index=False)



Training...


ValueError: Input contains NaN, infinity or a value too large for dtype('float32').

In [11]:
all_data=all_data.replace([np.inf, -np.inf], np.nan)

#all_data=all_data.round(decimals=4, out=None)
all_data=all_data.fillna(-999)
train=all_data.iloc[0:train_rows,:]
test=all_data.iloc[train_rows:,:]
test.reset_index(drop=True)

X_test = test
print('Training...')
extc = ExtraTreesClassifier(n_estimators=1000,max_features= 50,criterion= 'entropy',min_samples_split= 4,
                            max_depth= 30, min_samples_leaf= 2, n_jobs = 4, verbose=1)      


#1000,100,4,70,2 ~ 0.0012161270558714169 -0.46096224737368985
#800,100,4,70,2  ~0.00130136064571 -0.461044343811
#1500, 100,4,70,2 ~0.000997310588987 -0.460980594955
#X_train, X_test, y_train, y_test = cross_validation.train_test_split(train, target, test_size=0.4, random_state=0)
extc.fit(train,target)
print("CV")
scores0 = cross_validation.cross_val_score(extc, train, target, cv=3,scoring='log_loss')
print(scores0)


[Parallel(n_jobs=4)]: Done  42 tasks      | elapsed:   29.3s
[Parallel(n_jobs=4)]: Done 192 tasks      | elapsed:  2.0min
[Parallel(n_jobs=4)]: Done 442 tasks      | elapsed:  4.6min
[Parallel(n_jobs=4)]: Done 792 tasks      | elapsed:  8.2min
[Parallel(n_jobs=4)]: Done 1000 out of 1000 | elapsed: 10.3min finished


Training...
CV

[Parallel(n_jobs=4)]: Done  42 tasks      | elapsed:   16.7s
[Parallel(n_jobs=4)]: Done 192 tasks      | elapsed:  1.2min
[Parallel(n_jobs=4)]: Done 442 tasks      | elapsed:  2.8min
[Parallel(n_jobs=4)]: Done 792 tasks      | elapsed:  5.0min
[Parallel(n_jobs=4)]: Done 1000 out of 1000 | elapsed:  6.3min finished
[Parallel(n_jobs=4)]: Done  42 tasks      | elapsed:    0.3s
[Parallel(n_jobs=4)]: Done 192 tasks      | elapsed:    1.2s
[Parallel(n_jobs=4)]: Done 442 tasks      | elapsed:    2.9s
[Parallel(n_jobs=4)]: Done 792 tasks      | elapsed:    5.8s
[Parallel(n_jobs=4)]: Done 1000 out of 1000 | elapsed:    7.3s finished
[Parallel(n_jobs=4)]: Done  42 tasks      | elapsed:   16.3s
[Parallel(n_jobs=4)]: Done 192 tasks      | elapsed:  1.2min
[Parallel(n_jobs=4)]: Done 442 tasks      | elapsed:  2.8min
[Parallel(n_jobs=4)]: Done 792 tasks      | elapsed:  4.9min
[Parallel(n_jobs=4)]: Done 1000 out of 1000 | elapsed:  6.2min finished
[Parallel(n_jobs=4)]: Done  42 tasks      | elapsed:


[-0.46241561 -0.46317693 -0.45985261]


In [10]:
all_data

Unnamed: 0,v1,v2,v3,v4,v5,v6,v7,v9,v10,v11,...,v129+v66,v129-v66,v130*v66,v130/v66,v130+v66,v130-v66,v131*v66,v131/v66,v131+v66,v131-v66
0,1.335739,8.727474,2,3.921026,7.915266,2.599278,3.176895,9.999999,0.503281,16.434108,...,2,-2,1.272729,0.318182,2.636365,-1.363635,5.714287,1.428572,4.857144,0.857144
1,-999.000000,-999.000000,2,-999.000000,9.191265,-999.000000,-999.000000,-999.000000,1.312910,-999.000000,...,0,0,-0.000000,-inf,-999.000000,-999.000000,-0.000000,-inf,-999.000000,-999.000000
2,0.943877,5.310079,2,4.410969,5.326159,3.979592,3.928571,12.666667,0.765864,14.756098,...,2,2,0.000000,-999.000000,0.883118,0.883118,0.000000,-999.000000,1.176472,1.176472
3,0.797415,8.304757,2,4.225930,11.627438,2.097700,1.987549,8.965516,6.542669,16.347483,...,1,1,0.000000,-999.000000,1.677108,1.677108,0.000000,-999.000000,1.034483,1.034483
4,-999.000000,-999.000000,2,-999.000000,-999.000000,-999.000000,-999.000000,-999.000000,1.050328,-999.000000,...,2,-2,-1998.000000,-499.500000,-997.000000,-1001.000000,-1998.000000,-499.500000,-997.000000,-1001.000000
5,-999.000000,-999.000000,2,-999.000000,8.856791,-999.000000,-999.000000,-999.000000,1.050328,-999.000000,...,0,0,-0.000000,-inf,-999.000000,-999.000000,-0.000000,-inf,-999.000000,-999.000000
6,0.899806,7.312995,2,3.494148,9.946200,1.926070,1.770427,5.011287,2.341356,16.274510,...,0,0,0.000000,-999.000000,2.263736,2.263736,0.000000,-999.000000,0.970873,0.970873
7,-999.000000,-999.000000,2,-999.000000,-999.000000,-999.000000,-999.000000,-999.000000,1.838074,-999.000000,...,2,-2,-1998.000000,-499.500000,-997.000000,-1001.000000,-1998.000000,-499.500000,-997.000000,-1001.000000
8,2.078651,8.462619,3,3.739030,5.265636,1.573033,2.303371,11.111111,4.463894,16.050955,...,1,-1,1.170731,1.170731,2.170731,0.170731,3.333334,3.333334,4.333334,2.333334
9,1.144802,5.880606,2,3.244469,9.538384,2.500001,1.559405,9.977529,2.363238,16.091401,...,0,0,0.000000,-999.000000,3.174603,3.174603,0.000000,-999.000000,1.000000,1.000000


In [59]:
a=a.sort(1,ascending=0)
to_drop=a.loc[a[1]>0,0]

  if __name__ == '__main__':


In [66]:

all_data=all_data.drop(to_drop,axis=1)


0          16352
1           5300
2            173
3          20155
4           5662
5           1350
6           9624
7          23059
8           5035
9          11472
10          4870
11         19389
12         11794
13          9364
14         11794
15          9755
16          9980
17        475254
18         19192
19         22418
20          6219
21         23184
22         16025
23         19937
24         15554
25         19352
26          2099
27          1960
28          9235
29          4386
           ...  
114291      3552
114292     21719
114293       677
114294     17452
114295       878
114296      3074
114297      8937
114298     15225
114299     10871
114300     11266
114301     11887
114302     22926
114303      2876
114304     19624
114305     21943
114306       367
114307     11893
114308     15278
114309     16249
114310      5898
114311     22443
114312     19324
114313      9147
114314     11893
114315       249
114316     20725
114317     16712
114318     228

Unnamed: 0,v3,0,1,2,3
0,2,0,0,1,0
1,2,0,0,1,0
2,2,0,0,1,0
3,2,0,0,1,0
4,2,0,0,1,0
5,2,0,0,1,0
6,2,0,0,1,0
7,2,0,0,1,0
8,3,0,0,0,1
9,2,0,0,1,0


In [1]:
import pandas as pd
import numpy as np
import csv
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import ExtraTreesClassifier
from sklearn import ensemble
from sklearn import preprocessing
from sklearn import cross_validation

def spreadshimaltodecimal(n):
    """Interpret a string "A", "B",.., "Z", "AA", as semi hexavigesimal (base-26) notation and return the value.
    This is how a typical spreadsheet counts the rows. e.g. A=1, B=2, ..., Z=26, AA=27, ..., AZ=52, BA=53, ..., ZZ=702, AAA=703, etc.
    Observe that this is not a proper base system, since that would require 26 to be written with two digits (e.g. as A0).
    https://github.com/macfreek/puzzle-code/blob/master/base26.py
    """
    try:
        alphabet = "ABCDEFGHIJKLMNOPQRSTUVWXYZ"
        s = 1
        if n[0] == '-':
            n = n[1:]
            s = -1
        d = 0
        while n != "":
            d = 26*d + alphabet.index(n[0]) + 1
            n = n[1:]
        return s*d
    except TypeError:
        raise ValueError("invalid literal for spreadshimaltodecimal() with (semi) base 26: %r" % n)

print('Load data...')
train = pd.read_csv("C://users//Robert//Downloads//train.csv")
target = train['target'].values
train = train.drop(['ID','target','v8','v23','v25','v31','v36','v37','v46','v51','v53','v54','v63','v73','v75','v79','v81',
                    'v82','v89','v92','v95','v105','v107','v108','v109','v110','v116','v117','v118','v119','v123','v124','v128'],axis=1)
test = pd.read_csv("C://users//Robert//Downloads//test.csv")
id_test = test['ID'].values
test = test.drop(['ID','v8','v23','v25','v31','v36','v37','v46','v51','v53','v54','v63','v73','v75','v79','v81','v82','v89','v92','v95','v105','v107','v108','v109','v110','v116','v117','v118','v119','v123','v124','v128'],axis=1)
#make a big happy dataset to preprocess
train_rows=train.shape[0]
test_rows=test.shape[0]
all_data = pd.concat([train, test], ignore_index=True)

#deal with v22
all_data['v22']=all_data['v22'].fillna('ZZZZ')
all_data['v22']=all_data['v22'].apply(spreadshimaltodecimal)
all_data.loc[all_data['v22']==475254,'v22']=-999
#start some one-hot encoding!
#first turn categories into factors  naturally one of these factors will be the NANs
colnames=[]
for f in all_data.columns:
    if all_data[f].dtype == 'object':
        colnames.append(all_data[f].name) #grab a list of cols on the way through
        lbl = preprocessing.LabelEncoder()
        lbl.fit(np.unique(list(all_data[f].values)))
        all_data[f]= lbl.transform(list(all_data[f].values))
#now one-hot encode
enc = preprocessing.OneHotEncoder()
enc.fit(all_data[colnames])
a=enc.transform(all_data[colnames]).toarray()
#make a dataframe and merge it to the original
new=pd.DataFrame(a)
newcolnames=list(new.columns.values) 
renamedcols=[]
for i in newcolnames:
    i="enc"+i.astype(str)
    renamedcols.append(i)
new.columns=renamedcols
all_data=pd.merge(all_data, new, how='inner', on=None, left_on=None, right_on=None,
      left_index=True, right_index=True, sort=True)
#drop the categorical columns name
all_data=all_data.drop(colnames,axis=1)
#fill up remaining NANs

Load data...


In [19]:
importances[indices[:]]


array([ 0.09231929,  0.03497101,  0.03483037,  0.03380233,  0.033713  ,
        0.03262868,  0.03230669,  0.0320481 ,  0.03169715,  0.02989147,
        0.02947344,  0.02944443,  0.02904131,  0.02869756,  0.02781091,
        0.0266641 ,  0.02638529,  0.02372874,  0.0213982 ,  0.02059511,
        0.01974061,  0.0141696 ,  0.01415061,  0.00827391,  0.00727171,
        0.00713061,  0.00710731,  0.00685376,  0.00509576,  0.00440759,
        0.00438527,  0.00436995,  0.00435459,  0.00428179,  0.00427061,
        0.00426978,  0.00426793,  0.00426325,  0.0042592 ,  0.00424943,
        0.00420654,  0.0041099 ,  0.00410701,  0.00408226,  0.00407612,
        0.00406927,  0.0040635 ,  0.00404686,  0.00399523,  0.00397036,
        0.00394578,  0.00391073,  0.0038958 ,  0.00387599,  0.00385963,
        0.00383174,  0.00382909,  0.00377608,  0.00377239,  0.00376247,
        0.00375534,  0.00374381,  0.00371651,  0.00371383,  0.00370465,
        0.0037031 ,  0.0036705 ,  0.00366924,  0.00366594,  0.00

In [41]:
for item in newcolnames:
    if 'v' in item:
        print(item,all_data[item].max(),all_data[item].min())


v1 20.0000006294 -9.99649701427e-07
v2 19.9999999087 -9.81761449207e-07
v4 20.0000002886 -6.47592938974e-07
v5 20.0000003539 -5.28706773973e-07
v6 20.0000005964 -9.87330042687e-07
v7 19.9999999798 -9.46876549674e-07
v9 20.0000008878 -9.82875684434e-07
v10 20.0000001103 -9.87531659989e-07
v11 20.0000009233 -1.45906182755e-07
v12 20.0000001138 5.14322389107e-07
v13 20.0000009059 -8.46488878755e-07
v14 20.0000007841 -9.73883146957e-07
v15 19.9999990205 -9.97459053099e-07
v16 20.0000009395 -9.9946912175e-07
v17 19.9999999653 -9.06645539828e-07
v18 20.0000009944 -3.20761048295e-07
v19 20.0000007723 -5.17898659064e-07
v20 20.0000009936 4.50790264304e-07
v21 20.0000006896 6.04888654096e-08
v22 23420 -999
v26 20.0000000395 -9.34669608422e-07
v27 20.00000029 -9.915985826e-07
v28 20.0000009118 -9.63784810427e-07
v29 20.0000006093 -3.04075268346e-07
v32 20.0000008952 -9.77059916785e-07
v33 20.0000002023 -9.95200355134e-07
v34 20.000000815 -6.70766979802e-07
v35 20.0000000877 -9.95832650387e-07
v3

In [10]:
all_data.shape

(228714, 436)

In [8]:
colnames

['v3',
 'v24',
 'v30',
 'v47',
 'v52',
 'v56',
 'v66',
 'v71',
 'v74',
 'v91',
 'v112',
 'v113',
 'v125']

In [23]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load in 

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the "../input/" directory.
# For example, running this (by clicking run or pressing Shift+Enter) will list the files in the input directory

from subprocess import check_output
#print(check_output(["ls", "../input"]).decode("utf8"))

# Any results you write to the current directory are saved as output.

from sklearn import ensemble, metrics, linear_model
import random

#Some parameters to play with
rnd=12
random.seed(rnd)
n_ft=20 #Number of features to add
max_elts=3 #Maximum size of a group of linear features

class addNearestNeighbourLinearFeatures:
    
    def __init__(self, n_neighbours=1, max_elts=None, verbose=True, random_state=None):
        self.rnd=random_state
        self.n=n_neighbours
        self.max_elts=max_elts
        self.verbose=verbose
        self.neighbours=[]
        self.clfs=[]
        
    def fit(self,train,y):
        if self.rnd!=None:
            random.seed(rnd)
        if self.max_elts==None:
            self.max_elts=len(train.columns)
        list_vars=list(train.columns)
        random.shuffle(list_vars)
        
        lastscores=np.zeros(self.n)+1e15

        for elt in list_vars[:self.n]:
            self.neighbours.append([elt])
        list_vars=list_vars[self.n:]
        
        for elt in list_vars:
            indice=0
            scores=[]
            for elt2 in self.neighbours:
                if len(elt2)<self.max_elts:
                    clf=linear_model.LinearRegression(fit_intercept=False, normalize=True, copy_X=True, n_jobs=-1) 
                    clf.fit(train[elt2+[elt]], y)
                    scores.append(metrics.log_loss(y,clf.predict(train[elt2 + [elt]])))
                    indice=indice+1
                else:
                    scores.append(lastscores[indice])
                    indice=indice+1
            gains=lastscores-scores
            if gains.max()>0:
                temp=gains.argmax()
                lastscores[temp]=scores[temp]
                self.neighbours[temp].append(elt)

        indice=0
        for elt in self.neighbours:
            clf=linear_model.LinearRegression(fit_intercept=False, normalize=True, copy_X=True, n_jobs=-1) 
            clf.fit(train[elt], y)
            self.clfs.append(clf)
            if self.verbose:
                print(indice, lastscores[indice], elt)
            indice=indice+1
                    
    def transform(self, train):
        indice=0
        for elt in self.neighbours:
            train['_'.join(pd.Series(elt).sort_values().values)]=self.clfs[indice].predict(train[elt])
            indice=indice+1
        return train
    
    def fit_transform(self, train, y):
        self.fit(train, y)
        return self.transform(train)
    
    
train = pd.read_csv("C://users//Robert//Downloads//train.csv")
target = train['target'].values
test = pd.read_csv("C://users//Robert//Downloads//test.csv")
id_test = test['ID'].values

train['v22-1']=train['v22'].fillna('@@@@').apply(lambda x:'@'*(4-len(str(x)))+str(x)).apply(lambda x:ord(x[0]))
test['v22-1']=test['v22'].fillna('@@@@').apply(lambda x:'@'*(4-len(str(x)))+str(x)).apply(lambda x:ord(x[0]))
train['v22-2']=train['v22'].fillna('@@@@').apply(lambda x:'@'*(4-len(str(x)))+str(x)).apply(lambda x:ord(x[1]))
test['v22-2']=test['v22'].fillna('@@@@').apply(lambda x:'@'*(4-len(str(x)))+str(x)).apply(lambda x:ord(x[1]))
train['v22-3']=train['v22'].fillna('@@@@').apply(lambda x:'@'*(4-len(str(x)))+str(x)).apply(lambda x:ord(x[2]))
test['v22-3']=test['v22'].fillna('@@@@').apply(lambda x:'@'*(4-len(str(x)))+str(x)).apply(lambda x:ord(x[2]))
train['v22-4']=train['v22'].fillna('@@@@').apply(lambda x:'@'*(4-len(str(x)))+str(x)).apply(lambda x:ord(x[3]))
test['v22-4']=test['v22'].fillna('@@@@').apply(lambda x:'@'*(4-len(str(x)))+str(x)).apply(lambda x:ord(x[3]))

drop_list=['v91','v1', 'v8', 'v10', 'v15', 'v17', 'v25', 'v29', 'v34', 'v41', 'v46', 'v54', 'v64', 'v67', 'v97', 'v105', 'v111', 'v122']
train = train.drop(['ID','target'] + drop_list,axis=1).fillna(-999)
test = test.drop(['ID'] + drop_list,axis=1).fillna(-999)

refcols=list(train.columns)

for elt in refcols:
    if train[elt].dtype=='O':
        train[elt], temp = pd.factorize(train[elt])
        test[elt]=temp.get_indexer(test[elt])
    else:
        train[elt]=train[elt].round(5)
        test[elt]=test[elt].round(5)
        
a=addNearestNeighbourLinearFeatures(n_neighbours=n_ft, max_elts=max_elts, verbose=True, random_state=rnd)
a.fit(train, target)

train = a.transform(train)
test = a.transform(test)

clf = ensemble.ExtraTreesClassifier(n_estimators=750,max_features=50,criterion= 'entropy',min_samples_split= 4,
                        max_depth= 35, min_samples_leaf= 2, n_jobs = -1, random_state=rnd)

clf.fit(train,target)
#pred_et=clf.predict_proba(test)

#submission=pd.read_csv('../input/sample_submission.csv')
#submission.index=submission.ID
#submission.PredictedProb=pred_et[:,1]
#submission.to_csv('./addNNLinearFt.csv', index=False)
#submission.PredictedProb.hist(bins=30)

0 0.646715216318 ['v95', 'v65', 'v127']
1 0.562964004471 ['v93', 'v88', 'v22-3']
2 0.901054072954 ['v103', 'v84', 'v58']
3 1.61308392004 ['v112', 'v99', 'v22']
4 0.934943168077 ['v77', 'v92', 'v7']
5 1.70636167141 ['v12', 'v36', 'v109']
6 0.838443122677 ['v131', 'v115', 'v120']
7 3.96006779382 ['v50', 'v21', 'v31']
8 6.67960281866 ['v71', 'v80', 'v113']
9 0.564053131023 ['v82', 'v125', 'v22-4']
10 1.13009199555 ['v128', 'v117', 'v52']
11 4.51288261543 ['v102', 'v48', 'v69']
12 1.12644593609 ['v6', 'v86', 'v18']
13 1.00393374893 ['v43', 'v118', 'v76']
14 0.861120433524 ['v47', 'v57', 'v26']
15 1.73707065994 ['v62', 'v66', 'v30']
16 1.22333726898 ['v40', 'v19', 'v130']
17 0.555421380233 ['v116', 'v42', 'v32']
18 1.00645103824 ['v5', 'v124', 'v73']
19 0.549561619655 ['v22-1', 'v61', 'v39']


ExtraTreesClassifier(bootstrap=False, class_weight=None, criterion='entropy',
           max_depth=35, max_features=50, max_leaf_nodes=None,
           min_samples_leaf=2, min_samples_split=4,
           min_weight_fraction_leaf=0.0, n_estimators=750, n_jobs=-1,
           oob_score=False, random_state=12, verbose=0, warm_start=False)

In [34]:
import matplotlib.pyplot as plt

#extc.fit(train,target)
scores0 = cross_validation.cross_val_score(clf, train, target, cv=3,scoring='log_loss')
print(scores0)
importances = clf.feature_importances_
std = np.std([tree.feature_importances_ for tree in extc.estimators_],
             axis=0)
indices = np.argsort(importances)[::-1]

# Print the feature ranking
cols=list(train.columns.values)

print("Feature ranking:")
feature_names=[]
for f in range(train.shape[1]):
    print("%d. %s (%f) (%f) (%s)" % (f + 1, cols[indices[f]], importances[indices[f]], std[indices[f]], train.iloc[:,indices[f]].dtype))
    feature_names.append([cols[indices[f]], std[indices[f]]-importances[indices[f]]])
# Plot the feature importances of the forest
plt.figure()
plt.title("Feature importances")
plt.bar(range(train.shape[1]), importances[indices],
       color="r", yerr=std[indices], align="center")
plt.xticks(range(train.shape[1]), indices)
plt.xlim([-1, train.shape[1]])
plt.show()

[-0.46016055 -0.45887096 -0.45772426]
Feature ranking:
1. v50 (0.077713) (0.006641) (float64)
2. v21_v31_v50 (0.025222) (0.000034) (float64)
3. v66 (0.024344) (0.000170) (int32)
4. v114 (0.021634) (0.000190) (float64)
5. v107 (0.021539) (0.000575) (int32)
6. v12 (0.021406) (0.000914) (float64)
7. v22-2 (0.021396) (0.000001) (int64)
8. v40 (0.021380) (0.000831) (float64)
9. v52 (0.020787) (0.000182) (int32)
10. v56 (0.020699) (0.000179) (int32)
11. v125 (0.019596) (0.000119) (int32)
12. v22-4 (0.019494) (0.000087) (int64)
13. v22-3 (0.019473) (0.000057) (int64)
14. v22 (0.018948) (0.000192) (int32)
15. v125_v22-4_v82 (0.018930) (0.000050) (float64)
16. v22-3_v88_v93 (0.018513) (0.000031) (float64)
17. v14 (0.018402) (0.000516) (float64)
18. v112 (0.018326) (0.000184) (int32)
19. v24 (0.017852) (0.000188) (int32)
20. v117_v128_v52 (0.017609) (0.000028) (float64)
21. v30_v62_v66 (0.017474) (0.000031) (float64)
22. v112_v22_v99 (0.017215) (0.000029) (float64)
23. v21 (0.015949) (0.000182) 

In [39]:
train['v22-1']

0         64
1         64
2         64
3         65
4         64
5         64
6         64
7         65
8         64
9         64
10        64
11        65
12        64
13        64
14        64
15        64
16        64
17        64
18        65
19        65
20        64
21        65
22        64
23        65
24        64
25        65
26        64
27        64
28        64
29        64
          ..
114291    64
114292    65
114293    64
114294    64
114295    64
114296    64
114297    64
114298    64
114299    64
114300    64
114301    64
114302    65
114303    64
114304    65
114305    65
114306    64
114307    64
114308    64
114309    64
114310    64
114311    65
114312    65
114313    64
114314    64
114315    64
114316    65
114317    64
114318    65
114319    64
114320    64
Name: v22-1, dtype: int64

In [33]:
cols[1]

'v3'

In [140]:
a=[]
for item in list(sample_test.columns.values):
    a.append([item,sample_test['loss'].corr(sample_test[item])])

In [152]:

a.sort_values('import',ascending=0)
1. feature v50 (0.091195) (float64)
2. feature v52 (0.034621) (int64)
3. feature v22 (0.034425) (int64)
4. feature v125 (0.033434) (int64)
5. feature v91 (0.033427) (int64)
6. feature v56 (0.032889) (int64)
7. feature v112 (0.032042) (int64)
8. feature v10 (0.031870) (float64)
9. feature v12 (0.031628) (float64)
10. feature v114 (0.029703) (float64)
11. feature v40 (0.029410) (float64)
12. feature v66 (0.029251) (int64)
13. feature v14 (0.029154) (float64)
14. feature v34 (0.027714) (float64)
15. feature v47 (0.027350) (int64)

Unnamed: 0,0,1,import
108,loss,1.000000,1.000000
106,target,-0.754644,0.754644
107,predict,-0.300439,0.300439
41,v50,-0.268822,0.268822
7,v10,-0.196946,0.196946
38,v47,0.189566,0.189566
95,v129,-0.187302,0.187302
11,v14,-0.176269,0.176269
86,v113,-0.175468,0.175468
44,v56,0.139108,0.139108


In [8]:
train['target']=

Unnamed: 0,v1,v2,v3,v4,v5,v6,v7,v9,v10,v11,...,v72+v129,v72-v129,v72*v47,v72/v47,v72+v47,v72-v47,v129*v47,v129/v47,v129+v47,v129-v47
0,1.3357,8.7275,2,3.9210,7.9153,2.5993,3.1769,10.0000,0.5033,16.4341,...,1,1,2,0.5000,3,-1,0,0.0,2,-2
1,-999.0000,-999.0000,2,-999.0000,9.1913,-999.0000,-999.0000,-999.0000,1.3129,-999.0000,...,2,2,8,0.5000,6,-2,0,0.0,4,-4
2,0.9439,5.3101,2,4.4110,5.3262,3.9796,3.9286,12.6667,0.7659,14.7561,...,5,1,6,1.5000,5,1,4,1.0,4,0
3,0.7974,8.3048,2,4.2259,11.6274,2.0977,1.9875,8.9655,6.5427,16.3475,...,3,1,4,1.0000,4,0,2,0.5,3,-1
4,-999.0000,-999.0000,2,-999.0000,-999.0000,-999.0000,-999.0000,-999.0000,1.0503,-999.0000,...,1,1,8,0.1250,9,-7,0,0.0,8,-8
5,-999.0000,-999.0000,2,-999.0000,8.8568,-999.0000,-999.0000,-999.0000,1.0503,-999.0000,...,1,1,8,0.1250,9,-7,0,0.0,8,-8
6,0.8998,7.3130,2,3.4941,9.9462,1.9261,1.7704,5.0113,2.3414,16.2745,...,2,2,4,1.0000,4,0,0,0.0,2,-2
7,-999.0000,-999.0000,2,-999.0000,-999.0000,-999.0000,-999.0000,-999.0000,1.8381,-999.0000,...,2,2,16,0.2500,10,-6,0,0.0,8,-8
8,2.0787,8.4626,3,3.7390,5.2656,1.5730,2.3034,11.1111,4.4639,16.0510,...,0,0,0,0.0000,3,-3,0,0.0,3,-3
9,1.1448,5.8806,2,3.2445,9.5384,2.5000,1.5594,9.9775,2.3632,16.0914,...,3,3,24,0.3750,11,-5,0,0.0,8,-8


In [46]:
test.loc[test['enc4']==1,('enc4','enc5','enc6','enc0')]

Unnamed: 0,enc4,enc5,enc6,enc0
117221,1,0,0,1
118363,1,0,0,0
118818,1,0,0,0
124892,1,0,0,0
129652,1,0,0,0
131950,1,0,0,1
136504,1,0,0,0
138752,1,0,0,0
139833,1,0,0,0
141241,1,0,0,0


In [8]:
all_data['v22'].max()

23420

In [4]:
import pandas as pd
import numpy as np
import csv
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import ExtraTreesClassifier
from sklearn import ensemble
from sklearn import preprocessing
from sklearn import cross_validation

def spreadshimaltodecimal(n):
    """Interpret a string "A", "B",.., "Z", "AA", as semi hexavigesimal (base-26) notation and return the value.
    This is how a typical spreadsheet counts the rows. e.g. A=1, B=2, ..., Z=26, AA=27, ..., AZ=52, BA=53, ..., ZZ=702, AAA=703, etc.
    Observe that this is not a proper base system, since that would require 26 to be written with two digits (e.g. as A0).
    https://github.com/macfreek/puzzle-code/blob/master/base26.py
    """
    try:
        alphabet = "ABCDEFGHIJKLMNOPQRSTUVWXYZ"
        s = 1
        if n[0] == '-':
            n = n[1:]
            s = -1
        d = 0
        while n != "":
            d = 26*d + alphabet.index(n[0]) + 1
            n = n[1:]
        return s*d
    except TypeError:
        raise ValueError("invalid literal for spreadshimaltodecimal() with (semi) base 26: %r" % n)

print('Load data...')
train = pd.read_csv("C://users//Robert//Downloads//train.csv")
target = train['target'].values
train = train.drop(['ID','target','v8','v23','v25','v31','v36','v37','v46','v51','v53','v54','v63','v73','v75','v79','v81',
                    'v82','v89','v92','v95','v105','v107','v108','v109','v110','v116','v117','v118','v119','v123','v124','v128'],axis=1)
test = pd.read_csv("C://users//Robert//Downloads//test.csv")
id_test = test['ID'].values
test = test.drop(['ID','v8','v23','v25','v31','v36','v37','v46','v51','v53','v54','v63','v73','v75','v79','v81','v82','v89','v92','v95','v105','v107','v108','v109','v110','v116','v117','v118','v119','v123','v124','v128'],axis=1)
#make a big happy dataset to preprocess
train_rows=train.shape[0]
test_rows=test.shape[0]
all_data = pd.concat([train, test], ignore_index=True)

#deal with v22
all_data['v22']=all_data['v22'].fillna('ZZZZ')
all_data['v22']=all_data['v22'].apply(spreadshimaltodecimal)
all_data.loc[all_data['v22']==475254,'v22']=-999 #for feature expansion
#count NAs
all_data['NA_Count']=all_data.count(axis=1)
#encode each column for NA
#for col in list(all_data.columns.values):
#label=col+'pop'
#    all_data[label]=all_data[col].notnull().astype(int)
#start some one-hot encoding!
#first turn categories into factors  naturally one of these factors will be the NANs
colnames=[]
for f in all_data.columns:
    if all_data[f].dtype == 'object':
        colnames.append(all_data[f].name) #grab a list of cols on the way through
        lbl = preprocessing.LabelEncoder()
        lbl.fit(np.unique(list(all_data[f].values)))
        all_data[f]= lbl.transform(list(all_data[f].values))
#now one-hot encode
"""enc = preprocessing.OneHotEncoder()
enc.fit(all_data[colnames])
a=enc.transform(all_data[colnames]).toarray()
#make a dataframe and merge it to the original
new=pd.DataFrame(a)
newcolnames=list(new.columns.values) 
renamedcols=[]
for i in newcolnames:
    i="enc"+i.astype(str)
    renamedcols.append(i)
new.columns=renamedcols
all_data=pd.merge(all_data, new, how='inner', on=None, left_on=None, right_on=None,
      left_index=True, right_index=True, sort=True)
#drop the categorical columns name as they are now in the 'enc' columns
all_data=all_data.drop(colnames,axis=1)
"""
# custom ohe for just a few variables
c_var=['v3','v24','v74','v30','v66','v30','v91','v47','v71','v52','v112']
enc = preprocessing.OneHotEncoder()
enc.fit(all_data[c_var])
a=enc.transform(all_data[c_var]).toarray()
#make a dataframe and merge it to the original
new=pd.DataFrame(a)
newcolnames=list(new.columns.values) 
renamedcols=[]
for i in newcolnames:
    i="enc"+i.astype(str)
    renamedcols.append(i)
new.columns=renamedcols
all_data=pd.merge(all_data, new, how='inner', on=None, left_on=None, right_on=None,
      left_index=True, right_index=True, sort=True)

#drop the categorical columns name as they are now in the 'enc' columns
#all_data=all_data.drop(c_var,axis=1)

"""#feature engineer
v50thing=list(all_data.columns.values)
for i in v50thing:
    label='v50+'+i
    all_data[label]=all_data.v50+10*all_data[i]

all_data['v66*v66']=all_data.v66*all_data.v66
all_data['v18*v38']=all_data.v18*all_data.v38
all_data['v50*v62']=all_data.v50*all_data.v62  
all_data['v38*v99']=all_data.v38*all_data.v38  
all_data['v50*v72']=all_data.v50*all_data.v72  
all_data['v40*v66']=all_data.v40*all_data.v66  
all_data['v19*v39']=all_data.v19*all_data.v39  
all_data['v19*v38']=all_data.v19*all_data.v38  
all_data['v1*v19']=all_data.v1*all_data.v19  
all_data['v1*v38']=all_data.v1*all_data.v38  
all_data['v47*v66']=all_data.v47*all_data.v66  
all_data['v24*v50']=all_data.v24*all_data.v50  
all_data['v50*v66']=all_data.v50*all_data.v66  
all_data['v12*v50']=all_data.v12*all_data.v50  
all_data['v28*v38']=all_data.v28*all_data.v38  
all_data['v38*v84']=all_data.v38*all_data.v84  
all_data['v10*v66']=all_data.v10*all_data.v66  
all_data['v38*v80']=all_data.v38*all_data.v80  
all_data['v38*v47']=all_data.v38*all_data.v47  
all_data['v34*v66']=all_data.v34*all_data.v66  
all_data['v24*v24']=all_data.v24*all_data.v24  
all_data['v10*v38']=all_data.v10*all_data.v38  
all_data['v62*v66']=all_data.v62*all_data.v66  
all_data['v103*v57']=all_data.v103*all_data.v57 
all_data['v24*v66']=all_data.v24*all_data.v66  
all_data['v66*v72']=all_data.v66*all_data.v72  
all_data['v38*v90']=all_data.v38*all_data.v90 
all_data['v38*v50']=all_data.v38*all_data.v50  
all_data['v21*v50']=all_data.v21*all_data.v50  
all_data['v38*v86']=all_data.v38*all_data.v86  
all_data['v19*v99']=all_data.v19*all_data.v99  
all_data['v38*v62']=all_data.v38*all_data.v62  
all_data['v19*v86']=all_data.v19*all_data.v86  
all_data['v62*v72']=all_data.v62*all_data.v72 
all_data['v62*v62']=all_data.v62*all_data.v62  
all_data['v24*v47']=all_data.v24*all_data.v47
all_data['v19*v60']=all_data.v19*all_data.v60  
all_data['v50*v71']=all_data.v50*all_data.v71  
all_data['v33*v38']=all_data.v33*all_data.v38  
all_data['enc5*v2']=all_data.enc5*all_data.v2  
all_data['v30*v50']=all_data.v30*all_data.v50  
all_data['v34*v50']=all_data.v34*all_data.v50  
all_data['v102*v129']=all_data.v102*all_data.v129
all_data['v19*v80']=all_data.v19*all_data.v80  
all_data['v43*v66']=all_data.v43*all_data.v66  
all_data['v10*v50']=all_data.v10*all_data.v50  
all_data['v10*v90']=all_data.v10*all_data.v90  
all_data['v19*v27']=all_data.v19*all_data.v27  
all_data['enc4*v94']=all_data.enc4*all_data.v94 
all_data['v6*v66']=all_data.v6*all_data.v66   
all_data['v38*v55']=all_data.v38*all_data.v55  
all_data['v38*v58']=all_data.v38*all_data.v58  
all_data['v19*v57']=all_data.v19*all_data.v57  
all_data['v56pop']=all_data['v56'].notnull().astype(int)
all_data['v30pop']=all_data['v30'].notnull().astype(int)
all_data['v115pop']=all_data['v115'].notnull().astype(int)
#v56,30 115
#fill up remaining NANs
all_data=all_data.fillna(-999)

#add a random variable for feature selection
#all_data['rnd']=20*np.random.random(228714)
#split back into train and test sets.
"""
all_data=all_data.round(decimals=4, out=None)
all_data=all_data.fillna(-999)
train=all_data.iloc[0:train_rows,:]
test=all_data.iloc[train_rows:,:]
test.reset_index(drop=True)
sample_test=train.loc[0:1000,:]
sample_train=train.loc[1000:,:]

Load data...


In [37]:
train

Unnamed: 0,v2,v3,v4,v5,v6,v7,v9,v11,v12,v13,...,v117_v128_v52,v102_v48_v69,v18_v6_v86,v118_v43_v76,v26_v47_v57,v30_v62_v66,v130_v19_v40,v116_v32_v42,v124_v5_v73,v22-1_v39_v61
0,8.72747,0,3.92103,7.91527,2.59928,3.17689,10.00000,16.43411,6.08571,2.86683,...,0.348519,-0.009269,0.951575,0.117257,0.737750,0.269608,0.148505,0.771853,0.313389,0.751845
1,-999.00000,0,-999.00000,9.19127,-999.00000,-999.00000,-999.00000,-999.00000,6.50765,-999.00000,...,0.660182,0.758537,0.769846,0.768351,0.778143,0.798058,0.777951,0.770103,0.626679,0.767665
2,5.31008,0,4.41097,5.32616,3.97959,3.92857,12.66667,14.75610,6.38467,2.50559,...,0.209620,0.012471,1.462054,0.098494,0.584733,0.598759,0.226936,0.722432,0.210928,0.751825
3,8.30476,0,4.22593,11.62744,2.09770,1.98755,8.96552,16.34748,9.64665,3.90330,...,0.284774,0.005515,0.361363,0.173153,0.871746,0.528450,0.425903,0.753629,0.462419,0.763615
4,-999.00000,0,-999.00000,-999.00000,-999.00000,-999.00000,-999.00000,-999.00000,6.32009,-999.00000,...,0.666283,0.758537,0.769846,0.768351,0.756951,0.339918,0.772586,0.770103,0.775923,0.767665
5,-999.00000,0,-999.00000,8.85679,-999.00000,-999.00000,-999.00000,-999.00000,6.21608,-999.00000,...,0.648357,0.758537,0.769846,0.768351,0.756951,0.598759,0.769708,0.770103,0.635496,0.767665
6,7.31299,0,3.49415,9.94620,1.92607,1.77043,5.01129,16.27451,7.71117,5.91559,...,0.875691,0.003931,0.526924,0.160312,0.855432,0.938677,0.536520,0.852004,0.392093,0.751875
7,-999.00000,0,-999.00000,-999.00000,-999.00000,-999.00000,-999.00000,-999.00000,6.42448,-999.00000,...,0.755432,0.758537,0.769846,0.768351,0.756951,0.539217,0.763438,0.770103,0.775923,0.779416
8,8.46262,1,3.73903,5.26564,1.57303,2.30337,11.11111,16.05096,8.71505,2.34805,...,0.235936,0.003593,0.506340,0.174321,0.833135,0.587992,0.297259,0.695538,0.208428,0.751853
9,5.88061,0,3.24447,9.53838,2.50000,1.55941,9.97753,16.09140,7.41785,4.17695,...,0.790854,0.008525,0.621939,0.210692,0.824913,1.067666,0.839550,0.744267,0.378791,0.751890


In [36]:
y_pred = extc.predict_proba(X_test)


pd.DataFrame({"ID": id_test, "PredictedProb": y_pred[:,1]}).to_csv('extra_trees9.csv',index=False)

[Parallel(n_jobs=4)]: Done  42 tasks      | elapsed:    1.4s
[Parallel(n_jobs=4)]: Done 192 tasks      | elapsed:    7.4s
[Parallel(n_jobs=4)]: Done 442 tasks      | elapsed:   16.8s
[Parallel(n_jobs=4)]: Done 792 tasks      | elapsed:   46.5s
[Parallel(n_jobs=4)]: Done 1000 out of 1000 | elapsed:  1.1min finished


In [4]:
import pandas as pd
import numpy as np
import csv
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import ExtraTreesClassifier
from sklearn import ensemble
from sklearn import preprocessing
from sklearn import cross_validation

def spreadshimaltodecimal(n):
    """Interpret a string "A", "B",.., "Z", "AA", as semi hexavigesimal (base-26) notation and return the value.
    This is how a typical spreadsheet counts the rows. e.g. A=1, B=2, ..., Z=26, AA=27, ..., AZ=52, BA=53, ..., ZZ=702, AAA=703, etc.
    Observe that this is not a proper base system, since that would require 26 to be written with two digits (e.g. as A0).
    https://github.com/macfreek/puzzle-code/blob/master/base26.py
    """
    try:
        alphabet = "ABCDEFGHIJKLMNOPQRSTUVWXYZ"
        s = 1
        if n[0] == '-':
            n = n[1:]
            s = -1
        d = 0
        while n != "":
            d = 26*d + alphabet.index(n[0]) + 1
            n = n[1:]
        return s*d
    except TypeError:
        raise ValueError("invalid literal for spreadshimaltodecimal() with (semi) base 26: %r" % n)

print('Load data...')
train = pd.read_csv("C://users//Robert//Downloads//train.csv")
target = train['target'].values
IDtar=train['ID'].values
train = train.drop(['ID','target','v8','v23','v25','v31','v36','v37','v46','v51','v53','v54','v63','v73','v75','v81',
                    'v82','v89','v92','v95','v105','v107','v108','v109','v110','v116','v117','v118','v119','v123','v124','v128'],axis=1)
test = pd.read_csv("C://users//Robert//Downloads//test.csv")
id_test = test['ID'].values
test = test.drop(['ID','v8','v23','v25','v31','v36','v37','v46','v51','v53','v54','v63','v73','v75','v81','v82','v89','v92','v95','v105','v107','v108','v109','v110','v116','v117','v118','v119','v123','v124','v128'],axis=1)
#make a big happy dataset to preprocess
train_rows=train.shape[0]
test_rows=test.shape[0]
columns=train.columns
all_data = pd.concat([train, test], ignore_index=True)
all_data['NA_Count']=all_data.count(axis=1)

for col in list(all_data.columns.values):
    label=col+'pop'
    all_data[label]=all_data[col].notnull().astype(int)

#deal with v22
all_data['v22']=all_data['v22'].fillna('ZZZZ')
all_data['v22']=all_data['v22'].apply(spreadshimaltodecimal)
all_data.loc[all_data['v22']==475254,'v22']=-999*1171 #for feature expansion
all_data['v22']=all_data['v22']/1171
#count NAs


#start some one-hot encoding!
#first turn categories into factors  naturally one of these factors will be the NANs
colnames=[]
for f in all_data.columns:
    if all_data[f].dtype == 'object':
        colnames.append(all_data[f].name) #grab a list of cols on the way through
        lbl = preprocessing.LabelEncoder()
        lbl.fit(np.unique(list(all_data[f].values)))
        all_data[f]= lbl.transform(list(all_data[f].values))
#now one-hot encode
"""enc = preprocessing.OneHotEncoder()
enc.fit(all_data[colnames])
a=enc.transform(all_data[colnames]).toarray()
#make a dataframe and merge it to the original
new=pd.DataFrame(a)
newcolnames=list(new.columns.values) 
renamedcols=[]
for i in newcolnames:
    i="enc"+i.astype(str)
    renamedcols.append(i)
new.columns=renamedcols
all_data=pd.merge(all_data, new, how='inner', on=None, left_on=None, right_on=None,
      left_index=True, right_index=True, sort=True)
#drop the categorical columns name as they are now in the 'enc' columns
all_data=all_data.drop(colnames,axis=1)
"""
# custom ohe for just a few variables
c_var=['v3','v24','v74','v30','v66','v30','v91','v47','v71','v52','v112']
enc = preprocessing.OneHotEncoder()
enc.fit(all_data[c_var])
a=enc.transform(all_data[c_var]).toarray()
#make a dataframe and merge it to the original
new=pd.DataFrame(a)
newcolnames=list(new.columns.values) 
renamedcols=[]
for i in newcolnames:
    i="enc"+i.astype(str)
    renamedcols.append(i)
new.columns=renamedcols
all_data=pd.merge(all_data, new, how='inner', on=None, left_on=None, right_on=None,
      left_index=True, right_index=True, sort=True)
for item in columns:
    labelsd='sd'+item
    labelmd='meandiff^'+item
    labelmoddif='modediff^'+item
    all_data[labelsd]=(all_data[item]-all_data[item].mean())/all_data[item].std()
    all_data[labelmd]=(all_data[item]-all_data[item].mean())
    all_data[labelmd]=(all_data[item]-all_data[item].mode().iloc[0])
#all_data=all_data.round(decimals=4, out=None)
all_data['v66*v66']=all_data.v66*all_data.v66
all_data['v18*v38']=all_data.v18*all_data.v38
all_data['v50*v62']=all_data.v50*all_data.v62  
all_data['v38*v99']=all_data.v38*all_data.v38  
all_data['v50*v72']=all_data.v50*all_data.v72  
all_data['v40*v66']=all_data.v40*all_data.v66  
all_data['v19*v39']=all_data.v19*all_data.v39  
all_data['v19*v38']=all_data.v19*all_data.v38  
all_data['v1*v19']=all_data.v1*all_data.v19  
all_data['v1*v38']=all_data.v1*all_data.v38  
all_data['v47*v66']=all_data.v47*all_data.v66  
all_data['v24*v50']=all_data.v24*all_data.v50  
all_data['v50*v66']=all_data.v50*all_data.v66  
all_data['v12*v50']=all_data.v12*all_data.v50  
all_data['v28*v38']=all_data.v28*all_data.v38  
all_data['v38*v84']=all_data.v38*all_data.v84  
all_data['v10*v66']=all_data.v10*all_data.v66  
all_data['v38*v80']=all_data.v38*all_data.v80  
all_data['v38*v47']=all_data.v38*all_data.v47  
all_data['v34*v66']=all_data.v34*all_data.v66  
all_data['v24*v24']=all_data.v24*all_data.v24  
all_data['v10*v38']=all_data.v10*all_data.v38  
all_data['v62*v66']=all_data.v62*all_data.v66  
all_data['v103*v57']=all_data.v103*all_data.v57 
all_data['v24*v66']=all_data.v24*all_data.v66  
all_data['v66*v72']=all_data.v66*all_data.v72  
all_data['v38*v90']=all_data.v38*all_data.v90 
all_data['v38*v50']=all_data.v38*all_data.v50  
all_data['v21*v50']=all_data.v21*all_data.v50  
all_data['v38*v86']=all_data.v38*all_data.v86  
all_data['v19*v99']=all_data.v19*all_data.v99  
all_data['v38*v62']=all_data.v38*all_data.v62  
all_data['v19*v86']=all_data.v19*all_data.v86  
all_data['v62*v72']=all_data.v62*all_data.v72 
all_data['v62*v62']=all_data.v62*all_data.v62  
all_data['v24*v47']=all_data.v24*all_data.v47
all_data['v19*v60']=all_data.v19*all_data.v60  
all_data['v50*v71']=all_data.v50*all_data.v71  
all_data['v33*v38']=all_data.v33*all_data.v38  
all_data['enc5*v2']=all_data.enc5*all_data.v2  
all_data['v30*v50']=all_data.v30*all_data.v50  
all_data['v34*v50']=all_data.v34*all_data.v50  
all_data['v102*v129']=all_data.v102*all_data.v129
all_data['v19*v80']=all_data.v19*all_data.v80  
all_data['v43*v66']=all_data.v43*all_data.v66  
all_data['v10*v50']=all_data.v10*all_data.v50  
all_data['v10*v90']=all_data.v10*all_data.v90  
all_data['v19*v27']=all_data.v19*all_data.v27  
all_data['enc4*v94']=all_data.enc4*all_data.v94 
all_data['v6*v66']=all_data.v6*all_data.v66   
all_data['v38*v55']=all_data.v38*all_data.v55  
all_data['v38*v58']=all_data.v38*all_data.v58  
all_data['v19*v57']=all_data.v19*all_data.v57  

all_data=all_data.fillna(-999)
train=all_data.iloc[0:train_rows,:]
test=all_data.iloc[train_rows:,:]
test.reset_index(drop=True)
train['target']=target
train['ID']=IDtar
test['ID']=id_test
cols=train.columns


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


Load data...


TypeError: unhashable type: 'Index'

In [28]:
train.to_csv("Rtrain.csv")
test.to_csv("Rtest.csv")

In [14]:
ca=cols[-2:]+cols[0:-2]

  if __name__ == '__main__':


In [23]:
cols=test.columns.tolist()

In [22]:

train=train[a]

In [24]:
a=cols[-1:]+cols[:-1]

In [26]:
test=test[a]

In [27]:
test

Unnamed: 0,ID,v1,v2,v3,v4,v5,v6,v7,v9,v10,...,v19*v80,v43*v66,v10*v50,v10*v90,v19*v27,enc4*v94,v6*v66,v38*v55,v38*v58,v19*v57
114321,0,1.375465e+00,11.361141,2,4.200778,6.577000,2.081784,1.784386,9.523810,1.312911,...,4.294881e-01,0.000000,0.156036,1.298443,0.989100,0,0.000000,0,0,0.586754
114322,1,-9.990000e+02,-999.000000,2,-999.000000,-999.000000,-999.000000,-999.000000,-999.000000,1.291029,...,-9.990000e+02,-999.000000,1.325977,-999.000000,-999.000000,-999,-999.000000,-999,-999,-999.000000
114323,2,-4.903407e-07,8.201529,2,4.544371,6.550100,1.558442,2.467532,7.142858,1.575492,...,2.462731e-01,6.857141,1.324565,1.447458,0.380604,0,3.116883,0,-0,0.940315
114324,7,2.661870e+00,3.041241,2,1.657216,9.773080,2.078337,1.430855,7.959596,1.575493,...,1.694415e+00,1.826832,1.491004,2.355522,0.643786,0,2.078337,0,0,2.076115
114325,10,1.252822e+00,11.283352,2,4.638388,8.520510,2.302484,3.510159,7.612904,1.050328,...,1.536215e-01,2.979737,1.568826,1.043192,0.485431,0,2.302484,0,0,0.782737
114326,11,1.733601e+00,7.525109,2,3.263905,5.608608,2.356090,2.369477,9.935483,0.656456,...,1.025603e+00,0.000000,0.322666,0.660605,0.543526,0,0.000000,0,0,0.871367
114327,13,-9.990000e+02,-999.000000,2,-999.000000,-999.000000,-999.000000,-999.000000,-999.000000,2.078774,...,-9.990000e+02,-999.000000,6.484442,-999.000000,-999.000000,-999,-999.000000,-999,-999,-999.000000
114328,14,2.027902e+00,6.947583,2,5.840414,7.944518,3.607374,2.541107,9.818181,0.262583,...,3.127414e-01,0.000000,0.131378,0.200441,0.496693,0,0.000000,0,0,0.802501
114329,15,1.296225e+00,9.573390,2,3.387392,9.316076,2.133125,1.650447,12.050474,5.229759,...,2.636969e-01,0.000000,4.941624,4.934516,0.627806,0,0.000000,0,0,0.886201
114330,16,-9.990000e+02,-999.000000,2,-999.000000,-999.000000,-999.000000,-999.000000,-999.000000,1.050329,...,-9.990000e+02,-999.000000,1.753759,-999.000000,-999.000000,-999,-999.000000,-999,-999,-999.000000
