In [13]:
import matplotlib.pyplot as plt
%matplotlib inline
import numpy as np
import pandas as pd
import csv as csv

# import decision tree from scikit-learn
import sklearn.tree as tree
import sklearn.ensemble as ske

In [20]:
# Load the training data and make every model feature numeric and complete.
train_df = pd.read_csv('data/train.csv')

# Encode sex as an integer: female = 0, male = 1
train_df['Sex'] = train_df['Sex'].map({'female': 0, 'male': 1}).astype(int)

# missing ports -> most common port
# mode() returns a Series (several entries when there is a tie); take the
# first entry so a single scalar is used for every missing row instead of
# assigning an array whose length may not match the number of missing rows.
mode_port = train_df['Embarked'].dropna().mode().iloc[0]
train_df['Embarked'] = train_df['Embarked'].fillna(mode_port)

# missing ages -> median age (median() skips missing values)
median_age = train_df['Age'].median()
train_df['Age'] = train_df['Age'].fillna(median_age)

# missing fare -> median fare of the passenger's class
# transform('median') yields, for each row, the median of its Pclass group,
# so this works for whatever class labels are present in the data.
class_median_fare = train_df.groupby('Pclass')['Fare'].transform('median')
train_df['Fare'] = train_df['Fare'].fillna(class_median_fare)

# Embarked: encode each distinct port label as an integer index.
# This is not ideal: in translating categories to numbers,
# Port "2" is not 2 times greater than Port "1", etc.
# determine all values of Embarked (np.unique returns them sorted)
ports = list(enumerate(np.unique(train_df['Embarked'])))
# set up a dict of the form port -> index; the test-data cell reuses it
ports_map = {port: i for i, port in ports}
# convert all embarked values from string to int
train_df['Embarked'] = train_df['Embarked'].map(ports_map).astype(int)

# This should get us very clean data
# train_df # uncomment this line to see dataframe

In [21]:
# Load the test data and apply the same cleaning steps as the training data.
test_df = pd.read_csv('data/test.csv', header=0)

# ids required for final reporting
test_ids = test_df['PassengerId'].values

# Encode sex as an integer: female = 0, male = 1
test_df['Sex'] = test_df['Sex'].map({'female': 0, 'male': 1}).astype(int)

# NOTE(review): the imputation statistics below (mode, medians) are computed
# from the test set itself rather than from the training set -- confirm this
# is intended.

# missing ports -> most common port
# mode() returns a Series (several entries when there is a tie); take the
# first entry so a single scalar is used for every missing row.
mode_port = test_df['Embarked'].dropna().mode().iloc[0]
test_df['Embarked'] = test_df['Embarked'].fillna(mode_port)

# convert all embarked values from string to int using the mapping built
# from the training data; fail with a clear message on an unseen port
unknown_ports = sorted(set(test_df['Embarked']) - set(ports_map))
if unknown_ports:
    raise KeyError('Embarked values not present in ports_map: {}'
                   .format(unknown_ports))
test_df['Embarked'] = test_df['Embarked'].map(ports_map).astype(int)

# missing ages -> median age (median() skips missing values)
median_age = test_df['Age'].median()
test_df['Age'] = test_df['Age'].fillna(median_age)

# missing fare -> median fare of the passenger's class
# transform('median') yields, for each row, the median of its Pclass group.
class_median_fare = test_df.groupby('Pclass')['Fare'].transform('median')
test_df['Fare'] = test_df['Fare'].fillna(class_median_fare)

# that should clean up test data
# test_df # uncomment this line to see the dataframe

In [22]:
# Build the feature matrices and label vector used by the classifiers.

# features to drop
feat_drop = ['Name', 'Ticket', 'Cabin', 'PassengerId']
# errors='ignore' keeps this cell re-runnable: on a second run the columns
# are already gone and a plain drop would raise KeyError.
train_df = train_df.drop(feat_drop, axis=1, errors='ignore')
# Now drop the rows that have missing data.
train_df = train_df.dropna()

# We are not using the following features so drop them
test_df = test_df.drop(feat_drop, axis=1, errors='ignore')

# Select the label and the features by column name instead of by position,
# so the result does not depend on 'Survived' being the first column and the
# test matrix is guaranteed to use the same columns in the same order.
target_col = 'Survived'
feature_cols = [col for col in train_df.columns if col != target_col]

# training dataset
train_data = train_df.values
X_train = train_df[feature_cols].values
# labels keep the dtype of the 'Survived' column rather than being upcast
# together with the feature columns
Y_train = train_df[target_col].values

# test data (raises KeyError if a training feature is missing from test_df)
X_test = test_df[feature_cols].values

891 training examples


In [24]:
# Report how many rows (examples) and columns (features) X_train has
n_examples, n_features = X_train.shape[:2]
for template, count in (('{} training examples', n_examples),
                        ('{} features', n_features)):
    print(template.format(count))

891 training examples
7 features


In [18]:
# Decision Tree Classifier
# Training
# random_state is fixed so that re-running the notebook reproduces the same
# tree and therefore the same submission file.
dt_clf = tree.DecisionTreeClassifier(random_state=0)
dt_clf = dt_clf.fit(X_train, Y_train)

# getting predictions on test data
output = dt_clf.predict(X_test).astype(int)

# store the output in a file
# newline='' lets the csv module control line endings itself, as its
# documentation requires; without it extra blank rows can appear on
# platforms that translate '\n' on write.
with open('results/sub_decisiontree.csv', 'w', newline='') as f:
    cw = csv.writer(f)
    cw.writerow(["PassengerId", "Survived"])
    cw.writerows(zip(test_ids, output))

In [25]:
# Random forest
# Training
# random_state is fixed so that re-running the notebook reproduces the same
# forest and therefore the same submission file.
rnd_frst_clf = ske.RandomForestClassifier(n_estimators=100, random_state=0)
rnd_frst_clf = rnd_frst_clf.fit(X_train, Y_train)

# getting predictions on test data
output = rnd_frst_clf.predict(X_test).astype(int)

# store the output in a file
# newline='' lets the csv module control line endings itself, as its
# documentation requires; without it extra blank rows can appear on
# platforms that translate '\n' on write.
with open('results/sub_randomforest.csv', 'w', newline='') as f:
    cw = csv.writer(f)
    cw.writerow(["PassengerId", "Survived"])
    cw.writerows(zip(test_ids, output))