In [2]:
import numpy as np
import pandas as pd
from IPython.display import display
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import confusion_matrix, accuracy_score, precision_score, recall_score, f1_score
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
%matplotlib inline

 
def preprocess(data, label_encoders=None):
    """Turn a raw passenger table into an all-numeric feature frame.

    Drops the ``PassengerId``, ``Name``, ``Ticket`` and ``Cabin`` columns,
    adds ``Cabin_Initial`` (first character of ``Cabin``, or ``""`` when the
    cabin is missing), fills missing values and label-encodes the
    categorical columns. Any other column of ``data`` is passed through.

    Args:
        data: DataFrame holding at least ``PassengerId``, ``Name``,
            ``Ticket``, ``Cabin``, ``Pclass``, ``Age``, ``SibSp``, ``Parch``,
            ``Fare``, ``Sex`` and ``Embarked``. It is not modified.
        label_encoders: Optional dict mapping a categorical column name to an
            already fitted encoder. A column without an entry gets a new
            ``LabelEncoder`` fitted on ``data``; that encoder is stored back
            into the dict so a later call can reuse the same mapping.
            When omitted, a fresh dict is used for this call only.

    Returns:
        A new DataFrame with the encoded features.

    Raises:
        ValueError: propagated from a reused encoder's ``transform`` when a
            column holds a value the encoder was not fitted on (behaviour of
            scikit-learn's ``LabelEncoder``).
    """
    # A shared ``{}`` default would silently carry fitted encoders from one
    # call into the next. Compare against None rather than truthiness:
    # callers hand in an empty dict precisely so that it gets filled here.
    if label_encoders is None:
        label_encoders = {}

    numeric_cols = ["Pclass", "Age", "SibSp", "Parch", "Fare"]
    categorical_cols = ["Sex", "Embarked", "Cabin_Initial"]

    # drop() returns a copy, so the caller's frame stays untouched.
    X = data.drop(["PassengerId", "Name", "Ticket", "Cabin"], axis=1)
    X["Cabin_Initial"] = data["Cabin"].apply(
        lambda cabin: "" if pd.isnull(cabin) else cabin[0])

    # Missing numbers become -1, missing categories the empty string.
    for col in numeric_cols:
        X[col] = X[col].fillna(-1)

    for col in categorical_cols:
        X[col] = X[col].fillna("")
        if col not in label_encoders:
            label_encoders[col] = LabelEncoder().fit(X[col])
        X[col] = label_encoders[col].transform(X[col])
    return X

# Read the labelled training set. "Survived" is the target, so it is taken
# out of the frame before the feature columns are built.
data = pd.read_csv("../input/train.csv")
y = data.pop("Survived")

# Start from an empty encoder registry; preprocess() fills it, and the same
# dict is handed to preprocess() again for the test set further down.
label_encoders = dict()
X = preprocess(data, label_encoders)

# Hold back 20% of the labelled rows as an internal test set.
# NOTE(review): no random_state is passed, so the split presumably differs
# from run to run - confirm whether reproducibility matters here.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.20,
)

def run_grid_search(clf, param_grid=None, cv=10):
    """Grid-search hyper-parameters for ``clf`` on the internal training split.

    Fits a ``GridSearchCV`` on the module-level ``X_train`` / ``y_train``,
    prints the winning parameter combination and returns it.

    Args:
        clf: Estimator to tune; passed straight to ``GridSearchCV``.
        param_grid: Optional dict mapping parameter names to the list of
            values to try. Defaults to the ``n_estimators`` / ``max_depth``
            grid this notebook has always used.
        cv: Cross-validation setting forwarded to ``GridSearchCV``
            (10 by default).

    Returns:
        dict mapping each searched parameter name to its best value, ready
        for ``clf.set_params(**params)``.
    """
    if param_grid is None:
        param_grid = {"n_estimators": [5, 10, 15, 20], "max_depth": [2, 5, 7, 9]}

    grid_clf = GridSearchCV(clf, param_grid, cv=cv)
    grid_clf.fit(X_train, y_train)

    # best_params_ holds exactly the searched keys, so the result stays
    # correct for any param_grid instead of two hard-coded attribute names.
    params = dict(grid_clf.best_params_)
    # A single parenthesised argument prints the same on Python 2 and 3.
    print(params)
    return params

# Configure the classifier. The hyper-parameters below are hard-coded; to
# re-derive them with a cross-validated grid search, use this instead:
#     params = run_grid_search(clf)
clf = RandomForestClassifier()
params = {"n_estimators": 15, "max_depth": 9}
clf.set_params(**params)

# Mean accuracy over 10-fold cross validation on the internal training split.
# The message is pre-formatted into one argument so the line is valid on both
# Python 2 and Python 3 and prints the same text as the old print statement.
print("internal training cv accuracy %s"
      % np.mean(cross_val_score(clf, X_train, y_train, cv=10)))

# Fit on the internal training split only, so the held-out rows stay unseen.
clf.fit(X_train, y_train)

# One row per feature column, most important first.
feature_rank = pd.DataFrame(
    clf.feature_importances_,
    index=X.columns,
    columns=["importance"],
)
feature_rank = feature_rank.sort_values("importance", ascending=False)

# Accuracy on the held-out 20%.
pred_test = clf.predict(X_test)
# One pre-formatted argument keeps this line valid on Python 2 and 3 alike.
print("internal testing accuracy %s" % accuracy_score(y_test, pred_test))
display(feature_rank)

# Refit on every labelled row before predicting the competition test set.
clf.fit(X, y)

# NOTE: from here on `data` and `X` are rebound to the unlabelled test set.
# Reusing label_encoders applies the category -> integer mapping that was
# fitted on the training data.
data = pd.read_csv("../input/test.csv")
X = preprocess(data, label_encoders)
pred = clf.predict(X)

# Build the submission from named columns instead of zip(): zip() is a
# one-shot iterator on Python 3, which not every pandas version accepts as
# DataFrame data. `columns` pins the column order in the written file.
df = pd.DataFrame(
    {"PassengerId": data["PassengerId"], "Survived": pred},
    columns=["PassengerId", "Survived"],
)
df.to_csv('results.csv', index=False)

internal training cv accuracy 0.835800357702
internal testing accuracy 0.759776536313


Unnamed: 0,importance
Sex,0.299483
Fare,0.247143
Age,0.165101
Cabin_Initial,0.093223
Pclass,0.061931
SibSp,0.047311
Parch,0.045343
Embarked,0.040466
