In [1]:
import numpy as np
import pandas as pd

# Path to the raw Titanic training CSV, relative to the notebook's working directory.
titanic = '../data/titanic/raw/train.csv'

# Read the file as-is; the cleaning cells below derive new frames from this one
# instead of modifying it.
titanic_data_raw = pd.read_csv(filepath_or_buffer=titanic)

In [2]:
# Peek at the first five raw rows to check the columns and value formats.
titanic_data_raw.head(n=5)

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [3]:
# Columns removed before modelling; everything that remains is kept as a
# feature or as the label.
drop_cols = [
    'PassengerId',
    'Name',
    'Ticket',
    'Cabin',
    'Embarked',
]

# Build a new frame so the raw data stays available for inspection.
titanic_data = titanic_data_raw.drop(columns=drop_cols)

In [4]:
# Integer encoding for the 'Sex' column.
sex_dict = {
    'male': 0,
    'female': 1,
}

# The nested mapping restricts the replacement to the 'Sex' column only.
titanic_data = titanic_data.replace(to_replace={'Sex': sex_dict})

In [5]:
# Fill every missing value with the median of its column.
# `numeric_only=True` keeps this cell working on pandas >= 2.0, where
# DataFrame.median() raises TypeError if any non-numeric column is present;
# on an all-numeric frame the result is unchanged.
# NOTE(review): the medians are computed on the full dataset, before the
# train/test split further down, so held-out rows influence the imputed
# values. Consider computing the medians on the training split only.
titanic_data = titanic_data.fillna(titanic_data.median(numeric_only=True))

In [6]:
# Check the cleaned frame: the listed columns are dropped, 'Sex' is encoded as
# integers, and missing values are filled.
titanic_data.head(n=5)

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Fare
0,0,3,0,22.0,1,0,7.25
1,1,1,1,38.0,1,0,71.2833
2,1,3,1,26.0,0,0,7.925
3,1,1,1,35.0,1,0,53.1
4,0,3,0,35.0,0,0,8.05


In [7]:
# Separate the target column from the predictors.
features = titanic_data.drop(columns=['Survived'])
label = titanic_data['Survived']

In [8]:
from sklearn.model_selection import train_test_split

# Hold out 30% of the rows for evaluation; the fixed seed makes the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(features, label, test_size=0.3, random_state=1113)

# The split keeps the shuffled original row labels. Reset the index on the
# features AND the labels so that every split is labelled 0..n-1 and the two
# stay aligned for index-based operations such as column-wise pd.concat.
X_train, X_test = X_train.reset_index(drop=True), X_test.reset_index(drop=True)
y_train, y_test = y_train.reset_index(drop=True), y_test.reset_index(drop=True)

In [9]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import roc_auc_score

# Baseline: a shallow random forest trained on the un-binned features.
clf = RandomForestClassifier(
    n_estimators=512,
    max_depth=3,
    random_state=1113,
)
clf.fit(X_train, y_train)

# predict_proba returns one column per class; column 1 is the score passed to ROC AUC.
y_pred = clf.predict_proba(X_test)
baseline_auc = roc_auc_score(y_test, y_pred[:, 1])
print(baseline_auc)

0.832899305556


In [10]:
# Inspect the first five training rows after the index reset.
X_train.head(n=5)

Unnamed: 0,Pclass,Sex,Age,SibSp,Parch,Fare
0,3,0,28.0,0,0,9.5
1,3,1,29.0,1,1,10.4625
2,3,0,28.0,0,0,7.8958
3,2,1,28.0,0,0,13.0
4,1,1,16.0,0,1,57.9792


In [11]:
# Count how many training rows carry each encoded 'Sex' value.
X_train['Sex'].value_counts()

0    404
1    219
Name: Sex, dtype: int64

In [12]:
from binning import Binner

# Fit the binner on the training split only, then apply the same fitted
# transform to both splits.
binner = Binner()
binner.fit(X_train)

X_train_binned, X_test_binned = (
    binner.transform(split) for split in (X_train, X_test)
)

In [13]:
# Append the binned columns next to the original features. The column-wise
# concat aligns rows by index label.
X_train = pd.concat(objs=[X_train, X_train_binned], axis='columns')
X_test = pd.concat(objs=[X_test, X_test_binned], axis='columns')

In [14]:
# Remove the raw 'Age' column and the 'Fare_binned' column from both splits
# before refitting the model.
# NOTE: this cell overwrites X_train / X_test, so running it a second time
# raises KeyError because the columns are already gone.
redundant_cols = ['Age', 'Fare_binned']
X_train = X_train.drop(columns=redundant_cols)
X_test = X_test.drop(columns=redundant_cols)

In [15]:
# Refit with the same hyperparameters and seed as the baseline model above, so
# the two AUC scores differ only because of the changed feature set.
clf = RandomForestClassifier(
    n_estimators=512,
    max_depth=3,
    random_state=1113,
)
clf.fit(X_train, y_train)

# Score the held-out rows using the probability in column 1.
y_pred = clf.predict_proba(X_test)
binned_auc = roc_auc_score(y_test, y_pred[:, 1])
print(binned_auc)

0.839293981481


In [16]:
# Show which columns the fitted binner reports as binned.
binner.columns_binned

dict_keys(['Age', 'Fare'])