In [182]:
from math import log
from datetime import datetime
from sklearn.cross_validation import train_test_split, cross_val_score
from sklearn.grid_search import GridSearchCV, RandomizedSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import log_loss
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import Imputer

import numpy as np
import pandas as pd
import os

# Load

In [13]:
kaggleTrain = pd.read_csv("kaggle/train.csv", header = 0)
kaggleTest = pd.read_csv("kaggle/test.csv", header = 0)

In [14]:
print "Kaggle Train count: " + str(len(kaggleTrain))
print "Kaggle Test count:  " + str(len(kaggleTest))

Kaggle Train count: 114321
Kaggle Test count:  114393


## Columns listing

In [70]:
#print train.schema.fields
columnsDict = {}
for col in kaggleTrain.columns:
    typeKey = str(kaggleTrain[col].dtype)
    colName = col
    
    if colName == 'ID':
        print "We have the ID columns, type: " + typeKey
        continue
    if colName == 'target':
        print "We have the target columns, type: " + typeKey
        continue
    
    if typeKey not in columnsDict:
        columnsDict[typeKey] = [colName]
    else:
        columnsDict[typeKey].append(colName)

print ""
for ct, cl in columnsDict.iteritems():
    print ct + " " + str(len(cl))

We have the ID columns, type: int64
We have the target columns, type: int64

object 19
int64 4
float64 108


# Split

In [20]:
train, test = train_test_split(kaggleTrain, test_size = 0.2)

# Prepare X and y

In [81]:
def prepareX(df):
    dfX = df[columnsDict['float64'] + columnsDict['int64']]
    dfX = dfX.fillna(0)
    
    return dfX

In [82]:
def prepareY(df):   
    return df[["target"]].values.ravel()

In [145]:
X_train = prepareX(train)
y_train = prepareY(train)

In [146]:
X_test = prepareX(test)
y_test = prepareY(test)

# Model

In [144]:
#.setNumTrees(150).setMaxDepth(6)
clf = RandomForestClassifier(n_estimators=150, n_jobs=4, max_depth=7)

# Pipeline

In [190]:
imputer = Imputer(missing_values=0,
                  strategy="mean",
                  axis=0)

In [196]:
estimator = Pipeline([("imputer", imputer),
                      ("forest", clf)
                     ])

Score after imputation of the missing values = 0.76


In [197]:
estimator.fit(X_train, y_train)

Pipeline(steps=[('imputer', Imputer(axis=0, copy=True, missing_values=0, strategy='mean', verbose=0)), ('forest', RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=7, max_features='auto', max_leaf_nodes=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=150, n_jobs=4,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False))])

In [198]:
evaluate(estimator, X_test, y_test, "Testing", "withImputer")

Testing score: withImputer
0.762344194183
0.509219017912


# Train

In [147]:
clf.fit(X_train, y_train)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=7, max_features='auto', max_leaf_nodes=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=150, n_jobs=4,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False)

# Score

In [186]:
def evaluate(model, X, y, string, more = ""):
    title = string + " score: " + more
    score = model.score(X, y)
    scoreLogloss = log_loss(y, model.predict_proba(X))
    
    currentDate = str(datetime.now())
    modelExplaination = clf.get_params()
    
    logline = str(score) + ";" + str(scoreLogloss) + ";" + currentDate + ";" + str(modelExplaination) + ";" + more
    
    with open("evaluate"+string+".log", "a") as myfile:
        myfile.write(logline+"\n")
    
    print title
    print score
    print scoreLogloss

In [177]:
evaluate(clf, X_train, y_train, "Training")

Training score:
0.760945153954
0.500384255454


In [178]:
evaluate(clf, X_test, y_test, "Testing")

Testing score:
0.762344194183
0.505796392234


# Save

In [229]:
X_kaggleTest = prepareX(kaggleTest)

In [230]:
predic = clf.predict_proba(X_kaggleTest)[:,1] # extract 2nd column for X=1

In [235]:
predicPd = pd.DataFrame(predic)
predicPd = predicPd.rename(columns={0: 'PredictedProb'})
predicPd['ID'] = kaggleTest[['ID']]
predicPd = predicPd[['ID', 'PredictedProb']]

In [236]:
predicPd.to_csv("results/pythonpure/results.csv", index = False)