### Initial data exploration

Load the train and test datasets to create two DataFrames

In [45]:
import pandas as pd
import numpy as np
from pprint import pprint as pp
# Paths of the two CSV files, relative to the notebook's working directory.
train_csv, test_csv = 'train.csv', 'test.csv'

# Read each file into its own DataFrame, training data first.
train_df, test_df = (pd.read_csv(csv_path) for csv_path in (train_csv, test_csv))

Preview the contents of the files

In [13]:
# Display the first rows of the training data to see which columns are present and what their values look like.
train_df.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [10]:
# Same preview for the test data, for comparison with the training columns.
test_df.head()

Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,892,3,"Kelly, Mr. James",male,34.5,0,0,330911,7.8292,,Q
1,893,3,"Wilkes, Mrs. James (Ellen Needs)",female,47.0,1,0,363272,7.0,,S
2,894,2,"Myles, Mr. Thomas Francis",male,62.0,0,0,240276,9.6875,,Q
3,895,3,"Wirz, Mr. Albert",male,27.0,0,0,315154,8.6625,,S
4,896,3,"Hirvonen, Mrs. Alexander (Helga E Lindqvist)",female,22.0,1,1,3101298,12.2875,,S


In [17]:
# Print the (rows, columns) shape of the training data, then show summary
# statistics; the last expression is rendered as the cell's output.
pp(train_df.shape)
train_df.describe()

(891, 12)


Unnamed: 0,PassengerId,Survived,Pclass,Age,SibSp,Parch,Fare
count,891.0,891.0,891.0,714.0,891.0,891.0,891.0
mean,446.0,0.383838,2.308642,29.699118,0.523008,0.381594,32.204208
std,257.353842,0.486592,0.836071,14.526497,1.102743,0.806057,49.693429
min,1.0,0.0,1.0,0.42,0.0,0.0,0.0
25%,223.5,0.0,2.0,20.125,0.0,0.0,7.9104
50%,446.0,0.0,3.0,28.0,0.0,0.0,14.4542
75%,668.5,1.0,3.0,38.0,1.0,0.0,31.0
max,891.0,1.0,3.0,80.0,8.0,6.0,512.3292


In [18]:
# Print the (rows, columns) shape of the test data, then show summary
# statistics; the last expression is rendered as the cell's output.
pp(test_df.shape)
test_df.describe()

(418, 11)


Unnamed: 0,PassengerId,Pclass,Age,SibSp,Parch,Fare
count,418.0,418.0,332.0,418.0,418.0,417.0
mean,1100.5,2.26555,30.27259,0.447368,0.392344,35.627188
std,120.810458,0.841838,14.181209,0.89676,0.981429,55.907576
min,892.0,1.0,0.17,0.0,0.0,0.0
25%,996.25,1.0,21.0,0.0,0.0,7.8958
50%,1100.5,3.0,27.0,0.0,0.0,14.4542
75%,1204.75,3.0,39.0,1.0,0.0,31.5
max,1309.0,3.0,76.0,8.0,9.0,512.3292


How many people survived?

In [31]:
# Pull the outcome column once and slice it by sex, so every report below
# is built from the same three Series.
survived = train_df['Survived']
survived_male = survived[train_df["Sex"] == "male"]
survived_female = survived[train_df["Sex"] == "female"]

# Whole training set: absolute counts, then shares.
pp("Passengers that survived vs passengers that passed away")
pp(survived.value_counts())
pp("As proportions")
pp(survived.value_counts(normalize=True))

# Absolute counts per sex.
pp("Males that survived vs males that passed away")
pp(survived_male.value_counts())
pp("Females that survived vs Females that passed away")
pp(survived_female.value_counts())

# Shares per sex.
pp("Normalized male survival")
pp(survived_male.value_counts(normalize=True))
pp("Normalized female survival")
pp(survived_female.value_counts(normalize=True))

'Passengers that survived vs passengers that passed away'
0    549
1    342
Name: Survived, dtype: int64
'As proportions'
0    0.616162
1    0.383838
Name: Survived, dtype: float64
'Males that survived vs males that passed away'
0    468
1    109
Name: Survived, dtype: int64
'Females that survived vs Females that passed away'
1    233
0     81
Name: Survived, dtype: int64
'Normalized male survival'
0    0.811092
1    0.188908
Name: Survived, dtype: float64
'Normalized female survival'
1    0.742038
0    0.257962
Name: Survived, dtype: float64


Does age play a role?

In [71]:
# Flag minors: Child is 1 for Age < 18, 0 for Age >= 18, and NaN when no age is recorded.
# The comparison yields False for missing ages, so those rows are blanked out again with where().
age = train_df["Age"]
train_df["Child"] = (age < 18).astype(float).where(age.notna())

# Compare the share of survivors among minors with the share among adults.
is_child = train_df["Child"] == 1
is_adult = train_df["Child"] == 0
print("normalized Survival Rates for passengers under 18 --")
pp(train_df["Survived"][is_child].value_counts(normalize = True))
print("normalized Survival Rates for passengers 18 or older --")
pp(train_df["Survived"][is_adult].value_counts(normalize = True))

normalized Survival Rates for passengers under 18 --
1    0.539823
0    0.460177
Name: Survived, dtype: float64
normalized Survival Rates for passengers 18 or older --
0    0.618968
1    0.381032
Name: Survived, dtype: float64


It looks like age plays a slight role in survival, but the training set shows much more clearly that sex matters: about 74% of females survived, versus about 19% of males. Below I will predict that a passenger survives if she is female, and does not survive otherwise.

In [75]:
# Baseline "females survive" prediction, built on a copy so test_df itself is left untouched.
test_one = test_df.copy()

# Survived is 1 for every female passenger and 0 for everyone else.
test_one["Survived"] = (test_one["Sex"] == "female").astype("int64")

### Using decision trees

In [76]:
from sklearn import tree

Let's reload to refresh the DataFrame and clean up the data first.

In [86]:
# Start again from the raw CSV so columns added during exploration (e.g. "Child") are dropped.
train_df = pd.read_csv(train_csv)

# Impute missing ages with the median age of the training set.
train_df["Age"] = train_df["Age"].fillna(train_df["Age"].median())

# Encode Sex as integers (male -> 0, female -> 1).
# Series.map builds a fresh integer column; writing ints into the existing
# string column with .loc leaves a mixed object-dtype column and is rejected
# by the dedicated string dtype that pandas 3 uses by default.
# Any value outside the mapping becomes NaN instead of staying a string.
train_df["Sex"] = train_df["Sex"].map({"male": 0, "female": 1})

# Fill missing ports of embarkation with "S", then encode (S -> 0, C -> 1, Q -> 2).
# NOTE(review): "S" is presumably the most frequent port in the training data - confirm.
train_df["Embarked"] = train_df["Embarked"].fillna("S").map({"S": 0, "C": 1, "Q": 2})

And create my first decision tree

In [91]:
# Target vector and feature matrix as NumPy arrays: target, features_one
target = train_df["Survived"].values
features_one = train_df[["Pclass", "Sex", "Age", "Fare"]].values

# Fit my first decision tree: my_tree_one
# random_state is fixed so that re-running the notebook rebuilds the same tree;
# without it the importances printed below can differ from run to run.
my_tree_one = tree.DecisionTreeClassifier(random_state=1)
my_tree_one = my_tree_one.fit(features_one, target)

# Feature importances, in the column order Pclass, Sex, Age, Fare.
# (The attribute is `feature_importances_`; the previous spelling raised AttributeError.)
print(my_tree_one.feature_importances_)
# Mean accuracy on the very rows the tree was fitted on, so this is an
# optimistic figure rather than an estimate of performance on unseen data.
print(my_tree_one.score(features_one, target))

[ 0.12378179  0.31274009  0.22875336  0.33472476]
0.977553310887


Looks like Fare is the feature with the greatest importance. Let's proceed to use this model to predict with our test dataset.

In [98]:
# Reload the raw test set so this cell always starts from untouched data.
test_df = pd.read_csv(test_csv)

# Impute missing numerical values with the median.
# NOTE(review): these medians come from the test set itself, whereas the tree was
# trained on ages imputed with the training median - confirm this is intended.
test_df["Fare"] = test_df["Fare"].fillna(test_df["Fare"].median())
test_df["Age"] = test_df["Age"].fillna(test_df["Age"].median())

# Encode Sex as integers (male -> 0, female -> 1), matching the training encoding.
# Series.map builds a fresh integer column; writing ints into the existing
# string column with .loc is rejected by the string dtype pandas 3 uses by default.
test_df["Sex"] = test_df["Sex"].map({"male": 0, "female": 1})

# Extract the features from the test set, in the same column order used for training.
test_features = test_df[["Pclass", "Sex", "Age", "Fare"]].values

# Predict survival for every test passenger with the fitted tree.
my_prediction = my_tree_one.predict(test_features)
pp(my_prediction)

# Build a one-column frame of predictions, indexed by PassengerId.
PassengerId = np.array(test_df["PassengerId"]).astype(int)
my_solution = pd.DataFrame(my_prediction, index=PassengerId, columns=["Survived"])
pp(my_solution)

# There must be exactly one prediction per test passenger (418 for this test set).
assert len(my_solution) == len(test_df), "expected one prediction per test passenger"
pp(my_solution.shape)

# Write the solution to my_solution_one.csv; the index is saved as the PassengerId column.
my_solution.to_csv("my_solution_one.csv", index_label = ["PassengerId"])

array([0, 0, 1, 1, 1, 0, 0, 0, 1, 0, 0, 0, 1, 1, 1, 1, 0, 1, 1, 0, 0, 1, 1,
       0, 1, 0, 1, 1, 1, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 1, 0, 1, 0, 1, 1, 0,
       0, 0, 1, 1, 1, 0, 1, 1, 1, 0, 0, 0, 1, 1, 0, 0, 0, 1, 0, 0, 1, 0, 0,
       1, 1, 0, 1, 0, 1, 0, 0, 1, 0, 1, 1, 0, 0, 0, 1, 0, 1, 1, 1, 1, 1, 1,
       1, 0, 0, 0, 1, 1, 1, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 1, 1,
       0, 1, 1, 0, 1, 1, 0, 1, 0, 0, 1, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0,
       0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 1, 0, 1, 0, 0, 1, 0, 0, 1, 1, 0, 1, 1,
       1, 1, 1, 0, 1, 1, 0, 0, 0, 0, 1, 0, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0,
       1, 0, 1, 0, 0, 0, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0,
       0, 1, 0, 1, 1, 0, 0, 0, 0, 1, 0, 1, 0, 1, 0, 1, 1, 1, 0, 0, 1, 0, 0,
       0, 1, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 0, 0, 0, 1, 0, 1, 0, 1, 0, 0,
       0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 1,
       0, 0, 0, 0, 0, 1, 1, 0, 1, 0, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 1, 0, 0,
       0, 0,