In [147]:
import pandas as pd
import numpy
import random
import seaborn
import matplotlib.pyplot as plot
get_ipython().magic('matplotlib tk')
from sklearn.model_selection import ShuffleSplit, cross_val_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier

train_dataSet = pd.read_csv("data/train.csv")
test_dataSet = pd.read_csv("data/test.csv")
merged_dataSet = [train_dataSet, test_dataSet]

In [125]:
# Describe the train data set for base information like mean, std, min
train_dataSet.describe()

Unnamed: 0,PassengerId,Survived,Pclass,Age,SibSp,Parch,Fare
count,891.0,891.0,891.0,714.0,891.0,891.0,891.0
mean,446.0,0.383838,2.308642,29.699118,0.523008,0.381594,32.204208
std,257.353842,0.486592,0.836071,14.526497,1.102743,0.806057,49.693429
min,1.0,0.0,1.0,0.42,0.0,0.0,0.0
25%,223.5,0.0,2.0,20.125,0.0,0.0,7.9104
50%,446.0,0.0,3.0,28.0,0.0,0.0,14.4542
75%,668.5,1.0,3.0,38.0,1.0,0.0,31.0
max,891.0,1.0,3.0,80.0,8.0,6.0,512.3292


In [126]:
train_dataSet.describe(include=['O'])

Unnamed: 0,Name,Sex,Ticket,Cabin,Embarked
count,891,891,891,204,889
unique,891,2,681,147,3
top,"Graham, Mr. George Edward",male,CA. 2343,C23 C25 C27,S
freq,1,577,7,4,644


In [127]:
# Describe the train data set for base information like mean, std, min
test_dataSet.describe()

Unnamed: 0,PassengerId,Pclass,Age,SibSp,Parch,Fare
count,418.0,418.0,332.0,418.0,418.0,417.0
mean,1100.5,2.26555,30.27259,0.447368,0.392344,35.627188
std,120.810458,0.841838,14.181209,0.89676,0.981429,55.907576
min,892.0,1.0,0.17,0.0,0.0,0.0
25%,996.25,1.0,21.0,0.0,0.0,7.8958
50%,1100.5,3.0,27.0,0.0,0.0,14.4542
75%,1204.75,3.0,39.0,1.0,0.0,31.5
max,1309.0,3.0,76.0,8.0,9.0,512.3292


In [128]:
test_dataSet.describe(include=['O'])

Unnamed: 0,Name,Sex,Ticket,Cabin,Embarked
count,418,418,418,91,418
unique,418,2,363,76,3
top,"Rosenbaum, Miss. Edith Louise",male,PC 17608,B57 B59 B63 B66,S
freq,1,266,5,3,270


In [134]:
grouped_survived_class = train_dataSet[['Pclass', 'Survived']].groupby(['Pclass'], as_index=False).mean().sort_values(by='Survived', ascending=False)


In [135]:
grouped_survived_sex = train_dataSet[['Sex', 'Survived']].groupby(['Sex'], as_index=False).mean().sort_values(by='Survived', ascending=False)

In [136]:
grouped_survived_sibspo = train_dataSet[['SibSp', 'Survived']].groupby(['SibSp'], as_index=False).mean().sort_values(by='Survived', ascending=False)

In [137]:
grouped_survived_parch = train_dataSet[['Parch', 'Survived']].groupby(['Parch'], as_index=False).mean().sort_values(by='Survived', ascending=False)

In [138]:
grouped_survived_embark = train_dataSet[['Embarked', 'Survived']].groupby(['Embarked'], as_index=False).mean().sort_values(by='Survived', ascending=False)

In [154]:
grid = sns.FacetGrid(train_dataSet, col='Survived', row='Sex', size=2.2, aspect=1.6)
grid.map(plot.hist, 'Age', alpha=.5, bins=20)
grid.add_legend();

In [155]:
grid = sns.FacetGrid(train_dataSet, col='Survived', row='Pclass', size=2.2, aspect=1.6)
grid.map(plot.hist, 'Age', alpha=.5, bins=20)
grid.add_legend();

In [156]:
for dataset in merged_dataSet:
    dataset['Accompanied'] = 0
    dataset['FamilySize'] = dataset['SibSp'] + dataset['Parch'] + 1

    dataset.loc[dataset['FamilySize'] == 1, 'Accompanied'] = 1

In [157]:
grouped_survived_accompanied = train_dataSet[['Accompanied', 'Survived']] \
    .groupby(['Accompanied'], as_index=False) \
    .mean() \
    .sort_values(by='Survived', ascending=False)

In [158]:
for dataset in merged_dataSet:
    dataset['Sex'] = dataset['Sex'].map({'female': 1, 'male': 0}).astype(int)

In [159]:
guess_ages = numpy.zeros((2, 3))

for dataset in merged_dataSet:
    for i in range(0, 2):
        for j in range(0, 3):
            guess_data_frame = dataset[(dataset['Sex'] == i) & \
                                        (dataset['Pclass'] == j + 1)]['Age'].dropna()

            age_guess = guess_data_frame.median()

            # Convert random age float to nearest .5 age
            guess_ages[i, j] = int( age_guess / 0.5 + 0.5 ) * 0.5

    for i in range(0, 2):
        for j in range(0, 3):
            dataset.loc[ (dataset.Age.isnull()) & (dataset.Sex == i) & (dataset.Pclass == j+1), \
                    'Age' ] = guess_ages[i, j]

dataset['Age'] = dataset['Age'].astype(int)

In [160]:
train_dataSet.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,Accompanied,FamilySize
0,1,0,3,"Braund, Mr. Owen Harris",0,22.0,1,0,A/5 21171,7.25,,S,0,2
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",1,38.0,1,0,PC 17599,71.2833,C85,C,0,2
2,3,1,3,"Heikkinen, Miss. Laina",1,26.0,0,0,STON/O2. 3101282,7.925,,S,1,1
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",1,35.0,1,0,113803,53.1,C123,S,0,2
4,5,0,3,"Allen, Mr. William Henry",0,35.0,0,0,373450,8.05,,S,1,1


In [106]:
for dataset in merged_dataSet:
        dataset.loc[ dataset['Age'] <= 16, 'Age'] = 0
        dataset.loc[(dataset['Age'] > 16) & (dataset['Age'] <= 32), 'Age'] = 1
        dataset.loc[(dataset['Age'] > 32) & (dataset['Age'] <= 48), 'Age'] = 2
        dataset.loc[(dataset['Age'] > 48) & (dataset['Age'] <= 64), 'Age'] = 3
        dataset.loc[ dataset['Age'] > 64, 'Age']

In [107]:
train_dataSet.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,Accompanied,FamilySize
0,1,0,3,"Braund, Mr. Owen Harris",0,1.0,1,0,A/5 21171,7.25,,S,0,2
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",1,2.0,1,0,PC 17599,71.2833,C85,C,0,2
2,3,1,3,"Heikkinen, Miss. Laina",1,1.0,0,0,STON/O2. 3101282,7.925,,S,1,1
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",1,2.0,1,0,113803,53.1,C123,S,0,2
4,5,0,3,"Allen, Mr. William Henry",0,2.0,0,0,373450,8.05,,S,1,1


In [108]:
# Determine what is the most frequent port of embarkment
freq_port = train_dataSet.Embarked.dropna().mode()[0]
freq_port

'S'

In [109]:
for dataset in merged_dataSet:
    dataset['Embarked'] = dataset['Embarked'].fillna(freq_port)
    dataset['Embarked'] = dataset['Embarked'].map( {'S': 0, 'C': 1, 'Q': 2} ).astype(int)
    
train_dataSet[['Embarked', 'Survived']].groupby(['Embarked'], as_index=False).mean().sort_values(by='Survived', ascending=False)

Unnamed: 0,Embarked,Survived
1,1,0.553571
2,2,0.38961
0,0,0.339009


In [110]:
avg_fare = test_dataSet['Fare'].dropna().median()
test_dataSet['Fare'].fillna(avg_fare, inplace=True)

In [111]:
for dataset in merged_dataSet:
        dataset.loc[ dataset['Fare'] <= 7.91, 'Fare'] = 0
        dataset.loc[(dataset['Fare'] > 7.91) & (dataset['Fare'] <= 14.454), 'Fare'] = 1
        dataset.loc[(dataset['Fare'] > 14.454) & (dataset['Fare'] <= 31), 'Fare']   = 2
        dataset.loc[ dataset['Fare'] > 31, 'Fare'] = 3
        dataset['Fare'] = dataset['Fare'].astype(int)

In [112]:
train_dataSet.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,Accompanied,FamilySize
0,1,0,3,"Braund, Mr. Owen Harris",0,1.0,1,0,A/5 21171,0,,0,0,2
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",1,2.0,1,0,PC 17599,3,C85,1,0,2
2,3,1,3,"Heikkinen, Miss. Laina",1,1.0,0,0,STON/O2. 3101282,1,,0,1,1
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",1,2.0,1,0,113803,3,C123,0,0,2
4,5,0,3,"Allen, Mr. William Henry",0,2.0,0,0,373450,1,,0,1,1


In [115]:
#
train_dataSet.drop(['PassengerId', 'Name', 'SibSp', 'Parch', 'Ticket', 'Cabin', 'FamilySize'], axis=1, inplace=True)
test_dataSet.drop(['Name', 'SibSp', 'Parch', 'Ticket', 'Cabin', 'FamilySize'], axis=1, inplace=True)
merged_dataSet = [train_dataSet, test_dataSet]


In [117]:
x_train = train_dataSet.drop('Survived', axis=1)
y_train = train_dataSet['Survived']
x_test = test_dataSet.drop('PassengerId', axis=1).copy()

In [118]:
shuffle_split = ShuffleSplit(n_splits=20, test_size=.20, random_state=0)

In [120]:
def test_classifier(classifier):
    classifier.fit(x_train, y_train)
    y_pred = classifier.predict(x_test)
    score = cross_val_score(classifier, x_train, y_train, cv=shuffle_split)

    print("Accuracy: %.2f%% (+/- %.2f%%)" % (score.mean() * 100, score.std() * 100))

    return y_pred

logistic_regression = test_classifier(LogisticRegression())
decision_tree = test_classifier(DecisionTreeClassifier(max_depth=10))
random_forest = test_classifier(RandomForestClassifier(n_estimators=100))

Accuracy: 79.55% (+/- 2.49%)
Accuracy: 81.51% (+/- 3.49%)
Accuracy: 81.51% (+/- 3.70%)


In [123]:
def save_results(prediction, filename):
    results = pd.DataFrame({
        'PassengerId': test_dataSet['PassengerId'],
        'Survived': prediction
    })
    results.to_csv(filename, index=False)

# Save the predictions to a file
save_results(logistic_regression, 'results-logisticregression.csv')
save_results(decision_tree, 'results-decisiontree.csv')
save_results(random_forest, 'results-randomforest.csv')