In [32]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_score
from sklearn import preprocessing

%matplotlib inline

# Import Data

In [33]:
# import train data and save it as data
data = pd.read_csv('train.csv')

In [34]:
# load data into a pandas dataframe and save as df
df = pd.DataFrame(data)

# Feature Engineering

#### Replacing Null Values

In [35]:
#Fill null values with median age
df["Age_Fill_Med"] = df["Age"].fillna(df["Age"].median())

In [45]:
train_median = df['Age'].median()
print(train_median)

28.0


#### Creating Dummy Variables

In [37]:
# dummy variables for Embarked and Sex features
df = pd.get_dummies(df,columns=['Embarked','Sex'],drop_first=True)

# Random Forest


In [38]:
y = df['Survived']

X = df[['Age_Fill_Med','Pclass','Sex_male','Fare', 'Embarked_Q', 'Embarked_S']]

In [39]:
# Original Random Forest with 500 trees and minimum sample split of 5
rf = RandomForestClassifier(n_estimators=250,random_state=0, min_samples_split= 5)

# cross validation using 10 folds
s = cross_val_score(rf,X,y,cv=10)

print('Nulls to Median + Dummies: '+'{:.2f}% +- {:.2f}%'.format(100*np.mean(s),100*np.std(s)))

Nulls to Median + Dummies: 83.62% +- 3.58%


In [40]:
rf.fit(X,y)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=5,
            min_weight_fraction_leaf=0.0, n_estimators=250, n_jobs=1,
            oob_score=False, random_state=0, verbose=0, warm_start=False)

In [41]:
rf.predict(X)

array([0, 1, 1, 1, 0, 0, 0, 0, 1, 1, 1, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0,
       1, 1, 0, 0, 0, 0, 1, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 1,
       1, 0, 0, 1, 0, 0, 0, 0, 1, 1, 0, 1, 1, 0, 1, 0, 0, 1, 0, 0, 0, 1,
       1, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1, 0, 0, 1, 0, 1, 1, 0, 0,
       1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 1, 0, 0, 0,
       0, 1, 0, 0, 1, 0, 0, 0, 0, 1, 1, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0,
       0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1, 0, 0, 1, 0, 0, 1, 0, 0, 0,
       0, 1, 0, 0, 0, 0, 0, 1, 1, 0, 1, 1, 0, 0, 1, 0, 1, 1, 1, 1, 0, 0,
       1, 1, 0, 0, 0, 0, 1, 0, 0, 1, 1, 1, 0, 1, 0, 0, 0, 1, 1, 0, 1, 0,
       1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1,
       0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 0, 0, 0, 0,
       1, 0, 0, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0,
       1, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 1, 1, 1,

In [42]:
pd.DataFrame(rf.feature_importances_, index = X.columns, columns = ['Feature Importance']).sort_values(by = 'Feature Importance', ascending = False)

Unnamed: 0,Feature Importance
Sex_male,0.315202
Fare,0.287375
Age_Fill_Med,0.253013
Pclass,0.112645
Embarked_S,0.022248
Embarked_Q,0.009517


# Import Test Data

In [53]:
data = pd.read_csv('test.csv')

In [54]:
test_df = pd.DataFrame(data)

In [55]:
test_df.head()

Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,892,3,"Kelly, Mr. James",male,34.5,0,0,330911,7.8292,,Q
1,893,3,"Wilkes, Mrs. James (Ellen Needs)",female,47.0,1,0,363272,7.0,,S
2,894,2,"Myles, Mr. Thomas Francis",male,62.0,0,0,240276,9.6875,,Q
3,895,3,"Wirz, Mr. Albert",male,27.0,0,0,315154,8.6625,,S
4,896,3,"Hirvonen, Mrs. Alexander (Helga E Lindqvist)",female,22.0,1,1,3101298,12.2875,,S


In [50]:
# Fill NAs with median of training set
test_df["Age_Fill_Med"] = test_df["Age"].fillna(train_median)

In [51]:
test_df = pd.get_dummies(test_df,columns=['Embarked','Sex'])

In [None]:
test_df.drop('Sex_female', axis = 1)

In [52]:
test_df.head()

Unnamed: 0,PassengerId,Pclass,Name,Age,SibSp,Parch,Ticket,Fare,Cabin,Age_Fill_Med,Embarked_C,Embarked_Q,Embarked_S,Sex_female,Sex_male
0,892,3,"Kelly, Mr. James",34.5,0,0,330911,7.8292,,34.5,0,1,0,0,1
1,893,3,"Wilkes, Mrs. James (Ellen Needs)",47.0,1,0,363272,7.0,,47.0,0,0,1,1,0
2,894,2,"Myles, Mr. Thomas Francis",62.0,0,0,240276,9.6875,,62.0,0,1,0,0,1
3,895,3,"Wirz, Mr. Albert",27.0,0,0,315154,8.6625,,27.0,0,0,1,0,1
4,896,3,"Hirvonen, Mrs. Alexander (Helga E Lindqvist)",22.0,1,1,3101298,12.2875,,22.0,0,0,1,1,0
