# Introduction #

Let's first load the train and test data into a data frame using pandas and check some information about it.

In [2]:
import numpy as np
import pandas as pd

train_df = pd.read_csv('../input/train.csv', header=0)
test_df = pd.read_csv('../input/test.csv', header=0)

train_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
PassengerId    891 non-null int64
Survived       891 non-null int64
Pclass         891 non-null int64
Name           891 non-null object
Sex            891 non-null object
Age            714 non-null float64
SibSp          891 non-null int64
Parch          891 non-null int64
Ticket         891 non-null object
Fare           891 non-null float64
Cabin          204 non-null object
Embarked       889 non-null object
dtypes: float64(2), int64(5), object(5)
memory usage: 83.6+ KB


We can check that Age, Cabin and Embarked has missing informations. So we have to start checking the importance of some features and start creating new ones.

# Feature Engineering #

## Sex and new feature Gender ##

Convert string value to integer creating a new feature

In [3]:
train_df[["Sex", "Survived"]].groupby(['Sex'], as_index=False).mean()

Unnamed: 0,Sex,Survived
0,female,0.742038
1,male,0.188908


In [4]:
def map_gender(dataset):
    return dataset['Sex'].map( {'female': 0, 'male': 1} ).astype(int)

# female = 0, Male = 1
train_df['Gender'] = map_gender(train_df)
test_df['Gender'] = map_gender(test_df)

train_df[["Gender", "Survived"]].groupby(['Gender'], as_index=False).mean()

Unnamed: 0,Gender,Survived
0,0,0.742038
1,1,0.188908


## Embarked ##

Let's fill the missing embarked information using the most common.

In [5]:
train_df.Embarked.dropna().mode()

0    S
dtype: object

In [6]:
def fill_missing_embarked(dataset):
    if len(dataset.Embarked[ dataset.Embarked.isnull() ]) > 0:
        dataset.Embarked[ dataset.Embarked.isnull() ] = dataset.Embarked.dropna().mode().values
        
fill_missing_embarked(train_df)
fill_missing_embarked(test_df)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  app.launch_new_instance()


In [7]:
survived_by_embarked = train_df[['Embarked', 'Survived']].groupby(['Embarked'], as_index=False).mean()

survived_by_embarked 

Unnamed: 0,Embarked,Survived
0,C,0.553571
1,Q,0.38961
2,S,0.339009


In [8]:

Ports = list(enumerate(np.unique(survived_by_embarked['Embarked'])))    # determine all values of Embarked,
Ports_dict = { name : i for i, name in Ports }              # set up a dictionary in the form  Ports : index

def convert_embarked_to_int(dataset):
    dataset.Embarked = dataset.Embarked.map( lambda x: Ports_dict[x]).astype(int)

convert_embarked_to_int(train_df)
convert_embarked_to_int(test_df)

train_df[['Embarked', 'Survived']].groupby(['Embarked'], as_index=False).mean()

Unnamed: 0,Embarked,Survived
0,0,0.553571
1,1,0.38961
2,2,0.339009


## Age ##

This one has missing values, we use the median of all ages

In [9]:
def fill_missing_age(dataset):
    median_age = dataset['Age'].dropna().median()
    if len(dataset.Age[ dataset.Age.isnull() ]) > 0:
        dataset.loc[ (dataset.Age.isnull()), 'Age'] = median_age
    
fill_missing_age(train_df)
fill_missing_age(test_df)

def normalize_age(dataset):
    dataset['AgeNorm'] = dataset['Age'].map(lambda x: x/max(dataset['Age']))
    
normalize_age(train_df)
normalize_age(test_df)

train_df['AgeNorm'].head()

0    0.2750
1    0.4750
2    0.3250
3    0.4375
4    0.4375
Name: AgeNorm, dtype: float64

In [10]:
import re

def get_title(name):
	title_search = re.search(' ([A-Za-z]+)\.', name)
	# If the title exists, extract and return it.
	if title_search:
		return title_search.group(1)
	return ""

train_df['Title'] = train_df['Name'].apply(get_title)
test_df['Title'] = test_df['Name'].apply(get_title)

def normalize_title(dataset):
    dataset['Title'] = dataset['Title'].replace(['Lady', 'Countess','Capt', 'Col',\
    'Don', 'Dr', 'Major', 'Rev', 'Sir', 'Jonkheer', 'Dona'], 'Rare')

    dataset['Title'] = dataset['Title'].replace('Mlle', 'Miss')
    dataset['Title'] = dataset['Title'].replace('Ms', 'Miss')
    dataset['Title'] = dataset['Title'].replace('Mme', 'Mrs')

normalize_title(train_df)
normalize_title(test_df)

pd.crosstab(train_df['Title'], train_df['Sex'])

Sex,female,male
Title,Unnamed: 1_level_1,Unnamed: 2_level_1
Master,0,40
Miss,185,0
Mr,0,517
Mrs,126,0
Rare,3,20


In [11]:
title_by_survived = train_df[['Title', 'Survived']].groupby(['Title'], as_index=False).mean().sort('Survived', ascending=0)

title_by_survived

  if __name__ == '__main__':


Unnamed: 0,Title,Survived
3,Mrs,0.793651
1,Miss,0.702703
0,Master,0.575
4,Rare,0.347826
2,Mr,0.156673


In [12]:
Titles = list(enumerate(title_by_survived['Title']))
Titles_dict = { name : i for i, name in Titles }  

def map_titles(dataset):
    dataset['Title'] = dataset.Title.map( lambda x: Titles_dict[x]).astype(int)

map_titles(train_df)
map_titles(test_df)

train_df[['Title', 'Survived']].groupby(['Title'], as_index=False).mean()

Unnamed: 0,Title,Survived
0,0,0.793651
1,1,0.702703
2,2,0.575
3,3,0.347826
4,4,0.156673


In [13]:
def family_size(dataset):
    dataset['FamilySize'] = dataset['SibSp'] + dataset['Parch']
    
family_size(train_df)
family_size(test_df)

train_df[['FamilySize', 'Survived']].groupby(['FamilySize'], as_index=False).mean()

Unnamed: 0,FamilySize,Survived
0,0,0.303538
1,1,0.552795
2,2,0.578431
3,3,0.724138
4,4,0.2
5,5,0.136364
6,6,0.333333
7,7,0.0
8,10,0.0


In [14]:
def fill_missing_fare(dataset):
    median_fare = dataset['Fare'].dropna().median()
    if len(dataset.Fare[ dataset.Fare.isnull() ]) > 0:
        dataset.loc[ (dataset.Fare.isnull()), 'Fare'] = median_fare
        
fill_missing_fare(train_df)
fill_missing_fare(test_df)

def fare_norm(dataset):
    dataset['FareNorm'] = dataset['Fare'].map(lambda x: x / max(dataset['Fare'])).astype('float64')
    
fare_norm(train_df)
fare_norm(test_df)

train_df[['FareNorm', 'Survived']].groupby(['FareNorm'], as_index=False).mean()

Unnamed: 0,FareNorm,Survived
0,0.000000,0.066667
1,0.007832,0.000000
2,0.009759,0.000000
3,0.012175,0.000000
4,0.012565,0.000000
5,0.012590,0.000000
6,0.012679,0.000000
7,0.013175,0.000000
8,0.013387,0.000000
9,0.013565,0.000000


## Data cleaning ##

In [19]:
def drop_useless(dataset):
    return dataset.drop(['Name', 'Sex', 'Fare', 'Age', 'Ticket', 'Cabin', 'PassengerId'], axis=1) 

train_df_dropped = drop_useless(train_df)
# Collect the test data's PassengerIds before dropping it
ids = test_df['PassengerId'].values
test_df_dropped = drop_useless(test_df)

def norm(dataset):
    dataset['Pclass'] = dataset['Pclass'].map(lambda x: x / max(dataset['Pclass'])).astype('float64')
    dataset['SibSp'] = dataset['SibSp'].map(lambda x: x / max(dataset['SibSp'])).astype('float64')
    dataset['Parch'] = dataset['Parch'].map(lambda x: x / max(dataset['Parch'])).astype('float64')
    dataset['Embarked'] = dataset['Embarked'].map(lambda x: x / max(dataset['Embarked'])).astype('float64')
    dataset['Title'] = dataset['Title'].map(lambda x: x / max(dataset['Title'])).astype('float64')
    dataset['FamilySize'] = dataset['FamilySize'].map(lambda x: x / max(dataset['FamilySize'])).astype('float64')
    
norm(train_df_dropped)
norm(test_df_dropped)

train_df_dropped.head()

Unnamed: 0,Survived,Pclass,SibSp,Parch,Embarked,Gender,AgeNorm,Title,FamilySize,FareNorm
0,0,1.0,0.125,0.0,1.0,1,0.275,1.0,0.1,0.014151
1,1,0.333333,0.125,0.0,0.0,0,0.475,0.0,0.1,0.139136
2,1,1.0,0.0,0.0,1.0,0,0.325,0.25,0.0,0.015469
3,1,0.333333,0.125,0.0,1.0,0,0.4375,0.0,0.1,0.103644
4,0,1.0,0.0,0.0,1.0,1,0.4375,1.0,0.0,0.015713


## Predicting ##

In [25]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV, StratifiedKFold, train_test_split
import csv as csv

# The data is now ready to go. So lets fit to the train, then predict to the test!
# Convert back to a numpy array
train_data = train_df_dropped.values
test_data = test_df_dropped.values

X = train_data[0::,1::]
y = train_data[0::,0]

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.01, random_state=42)

print('Training...')
np.random.seed(5)

n_folds = 10
cv = StratifiedKFold(n_folds)
N_es = [50, 100, 200]
N_md = [1,2,3,4]
criteria = ['gini', 'entropy']

random_forest = RandomForestClassifier()
gscv = GridSearchCV(estimator=random_forest, param_grid=dict(n_estimators=N_es, max_depth=N_md, criterion=criteria), 
                    n_jobs=1, cv=list(cv.split(X_train, y_train)), verbose=0)
gscv.fit(X_train, y_train)

print('Best CV accuracy: %g\nBest n_estimators: %g\nBest max_depth: %g\nBest criterion: %s' % (
        gscv.best_score_, gscv.best_estimator_.n_estimators, gscv.best_estimator_.max_depth,gscv.best_estimator_.criterion))

acc = (y_test == gscv.predict(X_test).astype(int)).mean()

print(acc)

print('Predicting...')
output = gscv.predict(test_data).astype(int)

predictions_file = open("myfirstforest.csv", "w")
open_file_object = csv.writer(predictions_file)
open_file_object.writerow(["PassengerId","Survived"])
open_file_object.writerows(zip(ids, output))
predictions_file.close()
print('Done.')

Training...
Best CV accuracy: 0.833333
Best n_estimators: 100
Best max_depth: 4
Best criterion: entropy
1.0
Predicting...
Done.
