# Random Forest on Titanic Dataset
Here I replicate the work done in the datacamp kaggle tutorial.

## Preamble

In [113]:
import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestClassifier

In [114]:
pd.options.mode.chained_assignment = None 

## Load data

In [115]:
train = pd.read_csv('train.csv')
test = pd.read_csv('test.csv')

## Inspect data

In [137]:
train.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",1,22.0,1,0,A/5 21171,7.25,,0
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",0,38.0,1,0,PC 17599,71.2833,C85,1
2,3,1,3,"Heikkinen, Miss. Laina",0,26.0,0,0,STON/O2. 3101282,7.925,,0
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",0,35.0,1,0,113803,53.1,C123,0
4,5,0,3,"Allen, Mr. William Henry",1,35.0,0,0,373450,8.05,,0


In [139]:
train.describe()

Unnamed: 0,PassengerId,Survived,Pclass,Age,SibSp,Parch,Fare
count,891.0,891.0,891.0,891.0,891.0,891.0,891.0
mean,446.0,0.383838,2.308642,29.361582,0.523008,0.381594,32.204208
std,257.353842,0.486592,0.836071,13.019697,1.102743,0.806057,49.693429
min,1.0,0.0,1.0,0.42,0.0,0.0,0.0
25%,223.5,0.0,2.0,22.0,0.0,0.0,7.9104
50%,446.0,0.0,3.0,28.0,0.0,0.0,14.4542
75%,668.5,1.0,3.0,35.0,1.0,0.0,31.0
max,891.0,1.0,3.0,80.0,8.0,6.0,512.3292


## Impute values, and transform non-numeric categories
These steps required the chained_assignment option to be changed, because there is something complex to be understood when it comes to changing values of a dataframe based on a copy of a slice. I don't understand this yet.
https://stackoverflow.com/questions/21463589/pandas-chained-assignments

In [117]:
train.Age = train.Age.fillna(train.Age.median())
train.Sex[train.Sex == 'male'] = 1
train.Sex[train.Sex == 'female'] = 0
train.Embarked = train.Embarked.fillna('S')
train.Embarked[train.Embarked == 'S'] = 0
train.Embarked[train.Embarked == 'C'] = 1
train.Embarked[train.Embarked == 'Q'] = 2

test.Fare[152] = test.Fare.median()
test.Embarked = test.Embarked.fillna('S')
test.Embarked[test.Embarked == 'S'] = 0
test.Embarked[test.Embarked == 'C'] = 1
test.Embarked[test.Embarked == 'Q'] = 2
test.Sex[test.Sex == 'male'] = 1
test.Sex[test.Sex == 'female'] = 0
test.Age = test.Age.fillna(test.Age.median())

## Build the model
Initialize the random forest, and the feature and target vectors

In [120]:
features = train[['Pclass', 'Sex', 'Age', 'SibSp', 'Parch', 'Fare', 'Embarked']]
target = train.Survived

In [121]:
forest = RandomForestClassifier(max_depth=10, min_samples_split=2, n_estimators=100)
forest = forest.fit(features, target)

## Inspect the model

In [132]:
zip(['Pclass', 'Sex', 'Age', 'SibSp', 'Parch', 'Fare', 'Embarked'], forest.feature_importances_)

[('Pclass', 0.11043185965544458),
 ('Sex', 0.31909303896000429),
 ('Age', 0.20752964764060447),
 ('SibSp', 0.056909642866011668),
 ('Parch', 0.041899666398717283),
 ('Fare', 0.22935927748309104),
 ('Embarked', 0.034776866996126345)]

## Predict
Use the model to predict the feature of the test sample

In [127]:
test_features = test[['Pclass', 'Sex', 'Age', 'SibSp', 'Parch', 'Fare', 'Embarked']]
prediction = forest.predict(test_features)

## Generate output
Use a dataframe to output a csv file to upload to Kaggle

In [128]:
solution = pd.DataFrame(prediction, test.PassengerId, columns=['Survived'])
solution

Unnamed: 0_level_0,Survived
PassengerId,Unnamed: 1_level_1
892,0
893,0
894,0
895,0
896,0
897,0
898,0
899,0
900,1
901,0


In [129]:
solution.to_csv('forest_sol.csv', index_label=['PassengerId'])