In [30]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline

data_train = pd.read_csv('data/xtrain.csv')
data_test = pd.read_csv('data/xtest.csv')
print(data_train.shape)
print(data_train.info())
data_train.head()

(1021, 14)
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1021 entries, 0 to 1020
Data columns (total 14 columns):
pclass       1021 non-null int64
survived     1021 non-null int64
name         1021 non-null object
sex          1021 non-null bool
age          820 non-null float64
sibsp        1021 non-null int64
parch        1021 non-null int64
ticket       1021 non-null object
fare         1020 non-null float64
cabin        227 non-null object
embarked     1020 non-null object
boat         367 non-null object
body         97 non-null float64
home.dest    579 non-null object
dtypes: bool(1), float64(3), int64(4), object(6)
memory usage: 104.8+ KB
None


Unnamed: 0,pclass,survived,name,sex,age,sibsp,parch,ticket,fare,cabin,embarked,boat,body,home.dest
0,3,1,"Goldsmith, Mrs. Frank John (Emily Alice Brown)",False,31.0,1,1,363291,20.525,,S,C D,,"Strood, Kent, England Detroit, MI"
1,3,0,"Karlsson, Mr. Nils August",True,22.0,0,0,350060,7.5208,,S,,,
2,3,1,"O'Leary, Miss. Hanora ""Norah""",False,,0,0,330919,7.8292,,Q,13,,
3,1,0,"Douglas, Mr. Walter Donald",True,50.0,1,0,PC 17761,106.425,C86,C,,62.0,"Deephaven, MN / Cedar Rapids, IA"
4,2,1,"Brown, Mrs. Thomas William Solomon (Elizabeth ...",False,40.0,1,1,29750,39.0,,S,14,,"Cape Town, South Africa / Seattle, WA"


In [31]:
def simplify_ages(df):
    df.age = df.age.fillna(-0.5)
    bins = (-1, 0, 5, 12, 18, 25, 35, 60, 120)
    group_names = ['Unknown', 'Baby', 'Child', 'Teenager', 'Student', 'Young Adult', 'Adult', 'Senior']
    categories = pd.cut(df.age, bins, labels=group_names)
    df.age = categories
    return df

def simplify_cabins(df):
    df.cabin = df.cabin.fillna('N')
    df.cabin = df.cabin.apply(lambda x: x[0])
    return df

def simplify_fares(df):
    df.fare = df.fare.fillna(-0.5)
    bins = (-1, 0, 8, 15, 31, 1000)
    group_names = ['Unknown', '1_quartile', '2_quartile', '3_quartile', '4_quartile']
    categories = pd.cut(df.fare, bins, labels=group_names)
    df.fare = categories
    return df

def drop_features(df):
    return df.drop(['body','name','ticket','boat','body','home.dest'], axis=1)

def transform_features(df):
    df = simplify_ages(df)
    df = simplify_cabins(df)
    df = simplify_fares(df)
    df = drop_features(df)
    return df

data_train = transform_features(data_train)
data_test = transform_features(data_test)
data_train.head()

Unnamed: 0,pclass,survived,sex,age,sibsp,parch,fare,cabin,embarked
0,3,1,False,Young Adult,1,1,3_quartile,N,S
1,3,0,True,Student,0,0,1_quartile,N,S
2,3,1,False,Unknown,0,0,1_quartile,N,Q
3,1,0,True,Adult,1,0,4_quartile,C,C
4,2,1,False,Adult,1,1,4_quartile,N,S


In [33]:
from sklearn import preprocessing
def encode_features(df_train, df_test):
    features = ['fare', 'cabin', 'age', 'sex','embarked']
    df_combined = pd.concat([df_train[features], df_test[features]])
    
    for feature in features:
        le = preprocessing.LabelEncoder()
        le = le.fit(df_combined[feature])
        df_train[feature] = le.transform(df_train[feature])
        df_test[feature] = le.transform(df_test[feature])
    return df_train, df_test
    
data_train, data_test = encode_features(data_train, data_test)
data_train.head()

Unnamed: 0,pclass,survived,sex,age,sibsp,parch,fare,cabin,embarked
0,3,1,0,7,1,1,2,7,3
1,3,0,1,4,0,0,0,7,3
2,3,1,0,6,0,0,0,7,2
3,1,0,1,0,1,0,3,2,1
4,2,1,0,0,1,1,3,7,3


In [34]:
from sklearn.model_selection import train_test_split

X_all = data_train.drop(['survived'], axis=1)
y_all = data_train['survived']

num_test = 0.20
X_train, X_test, y_train, y_test = train_test_split(X_all, y_all, test_size=num_test, random_state=23)

In [35]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import make_scorer, accuracy_score
from sklearn.model_selection import GridSearchCV

# Choose the type of classifier. 
clf = RandomForestClassifier()

# Choose some parameter combinations to try
parameters = {'n_estimators': [4, 6, 9], 
              'max_features': ['log2', 'sqrt','auto'], 
              'criterion': ['entropy', 'gini'],
              'max_depth': [2, 3, 5, 10], 
              'min_samples_split': [2, 3, 5],
              'min_samples_leaf': [1,5,8]
             }

# Type of scoring used to compare parameter combinations
acc_scorer = make_scorer(accuracy_score)

# Run the grid search
grid_obj = GridSearchCV(clf, parameters, scoring=acc_scorer)
grid_obj = grid_obj.fit(X_train, y_train)

# Set the clf to the best combination of parameters
clf = grid_obj.best_estimator_

# Fit the best algorithm to the data. 
clf.fit(X_train, y_train)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=10, max_features='log2', max_leaf_nodes=None,
            min_impurity_split=1e-07, min_samples_leaf=8,
            min_samples_split=5, min_weight_fraction_leaf=0.0,
            n_estimators=9, n_jobs=1, oob_score=False, random_state=None,
            verbose=0, warm_start=False)

In [36]:
predictions = clf.predict(X_test)
print(accuracy_score(y_test, predictions))

0.785365853659


In [37]:
from sklearn.cross_validation import KFold

def run_kfold(clf):
    kf = KFold(891, n_folds=10)
    outcomes = []
    fold = 0
    for train_index, test_index in kf:
        fold += 1
        X_train, X_test = X_all.values[train_index], X_all.values[test_index]
        y_train, y_test = y_all.values[train_index], y_all.values[test_index]
        clf.fit(X_train, y_train)
        predictions = clf.predict(X_test)
        accuracy = accuracy_score(y_test, predictions)
        outcomes.append(accuracy)
        print("Fold {0} accuracy: {1}".format(fold, accuracy))     
    mean_outcome = np.mean(outcomes)
    print("Mean Accuracy: {0}".format(mean_outcome)) 

run_kfold(clf)




Fold 1 accuracy: 0.844444444444
Fold 2 accuracy: 0.797752808989
Fold 3 accuracy: 0.775280898876
Fold 4 accuracy: 0.831460674157
Fold 5 accuracy: 0.85393258427
Fold 6 accuracy: 0.820224719101
Fold 7 accuracy: 0.820224719101
Fold 8 accuracy: 0.842696629213
Fold 9 accuracy: 0.786516853933
Fold 10 accuracy: 0.76404494382
Mean Accuracy: 0.813657927591


In [47]:
predictions = clf.predict(data_test)


output = pd.DataFrame({ 'survived': predictions })
output.to_csv('gdg-titanic-predictions.csv', index = False)
output

Unnamed: 0,survived
0,0
1,0
2,0
3,0
4,0
5,0
6,0
7,0
8,0
9,1


In [43]:
data_train.head()

Unnamed: 0,pclass,survived,sex,age,sibsp,parch,fare,cabin,embarked
0,3,1,0,7,1,1,2,7,3
1,3,0,1,4,0,0,0,7,3
2,3,1,0,6,0,0,0,7,2
3,1,0,1,0,1,0,3,2,1
4,2,1,0,0,1,1,3,7,3
