In [9]:

# import analysis packages
import numpy as np
import pandas as pd
from sklearn.impute import SimpleImputer
from sklearn.ensemble import RandomForestClassifier

In [10]:
# Read the train and test CSV files into two separate dataframes.
train_df, test_df = map(pd.read_csv, ("./data/train.csv", "./data/test.csv"))

In [11]:
# Keep only the first occurrence of each fully identical training row;
# repeated rows add little value.
train_df = train_df[~train_df.duplicated()]

In [12]:
# Cabin, Name and Ticket are removed from both the train and test frames:
# they were judged not to affect the survival rate (determined with RapidMiner).
train_df = train_df.drop(columns=['Cabin', 'Name', 'Ticket'])

test_df = test_df.drop(columns=['Cabin', 'Name', 'Ticket'])

In [13]:
# Fill blank 'Age' rows with the median age (strategy='median', not the mean).
# The imputer is fitted on the training frame only and then reused on the test
# frame, so both frames receive the same fill value and no statistic is
# learned from the test data.
imp = SimpleImputer(missing_values=np.nan, strategy='median')
train_df['Age'] = imp.fit_transform(train_df[['Age']])

# transform (not fit_transform): apply the median learned from the training set
test_df['Age'] = imp.transform(test_df[['Age']])

In [14]:
# Rows with no 'Embarked' value are assigned 'S' in both frames.
# The dict form restricts the fill to the 'Embarked' column.
train_df = train_df.fillna({'Embarked': 'S'})
test_df = test_df.fillna({'Embarked': 'S'})

In [15]:
# Fill blank 'Fare' values with the mean fare of the training set.
# Only the 'Fare' column is targeted: calling fillna on the whole frame would
# write the mean fare into every other column that still contains NaN.
# The same training-set mean is used for the test frame so both frames are
# filled with one consistent value.
fare_mean = train_df['Fare'].mean()
train_df['Fare'] = train_df['Fare'].fillna(fare_mean)
test_df['Fare'] = test_df['Fare'].fillna(fare_mean)

test_df.head()

Unnamed: 0,PassengerId,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked
0,892,3,male,34.5,0,0,7.8292,Q
1,893,3,female,47.0,1,0,7.0,S
2,894,2,male,62.0,0,0,9.6875,Q
3,895,3,male,27.0,0,0,8.6625,S
4,896,3,female,22.0,1,1,12.2875,S


In [16]:
# Target column for the classifier.
y = train_df["Survived"]

# define prediction features for train and test
features = ["Pclass", "Sex", "Age", "SibSp", "Parch", "Embarked"]
X = pd.get_dummies(train_df[features])
# get_dummies only creates columns for categories present in the frame it is
# given, so encoding train and test separately can produce different columns.
# Reindexing to the training columns guarantees the same columns in the same
# order: categories absent from the test frame become all-zero columns and
# categories never seen in training are dropped.
X_test = pd.get_dummies(test_df[features]).reindex(columns=X.columns, fill_value=0)

# generate predictions (random_state fixed so repeated runs give the same model)
model = RandomForestClassifier(n_estimators=150, max_depth=7, random_state=1)
model.fit(X, y)
predictions = model.predict(X_test)

# convert result to .csv: one row per test passenger, without the index column
output = pd.DataFrame({'PassengerId': test_df.PassengerId, 'Survived': predictions})
output.to_csv('submission.csv', index=False)