# Titanic: to survive or not to survive

In [1]:
%matplotlib inline

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import re
from sklearn import cross_validation as cva
from sklearn.ensemble import RandomForestClassifier as rfc
from sklearn.ensemble import GradientBoostingClassifier as gbc
from sklearn.linear_model import LogisticRegression as lr
#from sklearn.cross_validation import KFold

train = pd.read_csv('train.csv')
test = pd.read_csv('test.csv')
full_data = [train, test]
all_data = train.append(test)

test.info()



<class 'pandas.core.frame.DataFrame'>
RangeIndex: 418 entries, 0 to 417
Data columns (total 11 columns):
PassengerId    418 non-null int64
Pclass         418 non-null int64
Name           418 non-null object
Sex            418 non-null object
Age            332 non-null float64
SibSp          418 non-null int64
Parch          418 non-null int64
Ticket         418 non-null object
Fare           417 non-null float64
Cabin          91 non-null object
Embarked       418 non-null object
dtypes: float64(2), int64(4), object(5)
memory usage: 36.0+ KB


## Feature Engineering ##

### Feature: Age ###

In [2]:
#Deal with NaN values for train and test data. Current strategy is to replace NaN values with a random set of ages within
#a designated distance from the mean

#Get some statistics
age_mean = train['Age'].mean()
age_std = train['Age'].std()

#Loop through each dataset
for dataset in full_data:
    age_nan_count = dataset['Age'].isnull().sum()

    #Generate random numbers between +-1 std and mean
    age_rand = np.random.randint(age_mean-age_std/2, age_mean+age_std/2, size=age_nan_count)
    
    #Fill in NaN values with mean of entrire set (train + test)
    dataset['Age'][np.isnan(dataset["Age"])] = age_rand



A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


Let's map a categorical age to test in our classifiers

In [3]:
#Divide the data into bins. Decided to use pd.cut instead of pd.qcut because survival rates were more diverse
train['CategoricalAge'] = pd.cut(train['Age'], 5)
print (train[['CategoricalAge', 'Survived']].groupby(['CategoricalAge'], as_index=False).mean())

for dataset in full_data:
    dataset['CategoricalAge'] = 0
    dataset.loc[dataset['Age']<=16, 'CategoricalAge'] = 0
    dataset.loc[(dataset['Age']>16) & (dataset['Age']<=32), 'CategoricalAge'] = 1
    dataset.loc[(dataset['Age']>32) & (dataset['Age']<=48), 'CategoricalAge'] = 2
    dataset.loc[(dataset['Age']>48) & (dataset['Age']<=64), 'CategoricalAge'] = 3
    dataset.loc[dataset['Age']>64, 'CategoricalAge'] = 4


     CategoricalAge  Survived
0    (0.34, 16.336]  0.550000
1  (16.336, 32.252]  0.352941
2  (32.252, 48.168]  0.374468
3  (48.168, 64.084]  0.434783
4      (64.084, 80]  0.090909


### Feature: Sex ###

In [4]:
#Map the Sex column to 0:Male, 1:Female, 2:Child
for dataset in full_data:
        dataset.loc[dataset['Sex']=='male','Sex'] = 0
        dataset.loc[dataset['Sex']=='female','Sex'] = 1
        dataset.loc[dataset['Age']<=6,'Sex'] = 2        #labeling the young ones as children
        
print (train[['Sex', 'Survived']].groupby(['Sex'], as_index=False).mean())


   Sex  Survived
0    0  0.168174
1    1  0.742268
2    2  0.702128


### Feature: Pclass ###

In [5]:
#We have all the Pclass values in numerical form, so no need to fill or map anything
print(train[['Pclass','Survived']].groupby('Pclass', as_index=False).mean())


   Pclass  Survived
0       1  0.629630
1       2  0.472826
2       3  0.242363


### Feature: Family ###

In [6]:
#My thought is that we create two new family features:
    #FamilySize: Parch + SibSp
    #IsAlone: when family size = 0

for dataset in full_data:
    dataset['FamilySize'] = dataset['SibSp'] + dataset['Parch']
    
    dataset['IsAlone'] = 0
    dataset.loc[dataset['FamilySize'] == 0, 'IsAlone'] = 1

print(train[['FamilySize','Survived']].groupby('FamilySize', as_index=False).mean())
#print(train[['IsAlone','Survived']].groupby('IsAlone', as_index=False).mean())


   FamilySize  Survived
0           0  0.303538
1           1  0.552795
2           2  0.578431
3           3  0.724138
4           4  0.200000
5           5  0.136364
6           6  0.333333
7           7  0.000000
8          10  0.000000


### Feature: Name Title ###

In [7]:
#Let's grab the titles of people (Mr, Mrs, etc) and map them to numerical values

def get_title(name):
    title = re.search('[A-Z][a-z]+\.', name)
    if title:
        return(title.group())
    return("")

title_map = {
    'Mr.':0,
    'Mrs.':1,
    'Miss.':2,
    'Master.':3,
    'Dr.':4,
    'Rev.':5,
    'Major.':6,
    'Col.':6,
    'Mlle.':2,
    'Jonkheer.':7,
    'Lady.':7,
    'Don.':7,
    'Dona.':7,
    'Countess.':7,
    'Capt.':6,
    'Sir.':7,
    'Mme.':1,
    'Ms.':2
}

for dataset in full_data:
    dataset['Title'] = dataset['Name'].apply(get_title)
    dataset['Title'] = dataset['Title'].map(title_map)
    
print(train[['Title','Survived']].groupby('Title', as_index=False).mean())

   Title  Survived
0      0  0.156673
1      1  0.793651
2      2  0.702703
3      3  0.575000
4      4  0.428571
5      5  0.000000
6      6  0.400000
7      7  0.600000


### Feature: Fare ###

In [8]:
#Let's take a look at fare. It looks like there are 15 values of $0, which I'm going to assume are missing values.
#print(train['Fare'].value_counts().sort_index(ascending=False))
#train[train['Fare']==0]

#For now, let's fill in any NaN or missing fares with the median of the pclass fare
for x in range(1,4):
    median = train[train['Pclass']==x]['Fare'].median()
    std = train[train['Pclass']==x]['Fare'].std()
    #print(median, " ",std)
    
    for dataset in full_data:
        dataset.loc[(dataset['Pclass']==x) & (dataset['Fare']==0), 'Fare'] = median
        dataset.loc[(dataset['Pclass']==x) & (np.isnan(dataset["Fare"])==True), 'Fare'] = median


#Creating a CategoricalFare feature
pd.qcut(train['Fare'],6).value_counts()

# Mapping Fare
for dataset in full_data:
    dataset['CategoricalFare'] = 0
    dataset.loc[dataset['Fare'] <= 8.76, 'CategoricalFare']= 0
    dataset.loc[(dataset['Fare'] > 8.76) & (dataset['Fare'] <= 14.5), 'CategoricalFare'] = 1
    dataset.loc[(dataset['Fare'] > 14.5) & (dataset['Fare'] <= 26.25), 'CategoricalFare'] = 2
    dataset.loc[(dataset['Fare'] > 26.25) & (dataset['Fare'] <= 53.1), 'CategoricalFare'] = 3
    dataset.loc[dataset['Fare'] > 53.1, 'CategoricalFare']= 4
    dataset['CategoricalFare'] = dataset['CategoricalFare'].astype(int)

print(train[['CategoricalFare','Survived']].groupby('CategoricalFare', as_index=False).mean())

   CategoricalFare  Survived
0                0  0.207358
1                1  0.339869
2                2  0.462069
3                3  0.425676
4                4  0.671233


## Machine Learning Section:

In [9]:
#This did NOT result in any better scores....

rand = np.random.rand(len(train)) < 0.5
train1 = train[rand]
train2 = train[~rand]

#The features for the model
predictors = ['Pclass', 'Sex', 'CategoricalAge', 'FamilySize', 'Title','CategoricalFare', 'IsAlone']

#SOLO random forest classifier 
alg = rfc(random_state=1, n_estimators=50, min_samples_split=5, min_samples_leaf=5)
kf = cva.KFold(train1.shape[0], n_folds=3, random_state=1)
scores = cva.cross_val_score(alg, train1[predictors], train1['Survived'], cv=kf)
print(scores.mean())

alg.fit(train1[predictors], train1['Survived'])
train2_preds = alg.predict(train2[predictors])
test_preds = alg.predict(test[predictors])

train2['Train_Preds'] = train2_preds
test['Train_Preds'] = test_preds

predictors2 = ['Train_Preds']

kf = cva.KFold(train2.shape[0], n_folds=3, random_state=1)
scores = cva.cross_val_score(alg, train2[predictors2], train2['Survived'], cv=kf)
print(scores.mean())

#Let's predict on titanicTest!
alg.fit(train2[predictors2], train2['Survived'])
predictions = alg.predict(test[predictors2])


#Create a submission file
submission = pd.DataFrame({
       'PassengerId':test['PassengerId'],
       'Survived':predictions
   })

submission.to_csv('titanic_submission.csv',index=False)

submission.head()


0.814997209363


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


0.808225108225


Unnamed: 0,PassengerId,Survived
0,892,0
1,893,1
2,894,0
3,895,0
4,896,1


In [10]:
# #The features for the model
# predictors = ['Pclass', 'Sex', 'CategoricalAge', 'FamilySize', 'Title','CategoricalFare', 'IsAlone']

# #SOLO random forest classifier (this resulted in a best submission score of 75%)
# alg = rfc(random_state=1, n_estimators=50, min_samples_split=5, min_samples_leaf=5)

# #Setting up the cross validation folds
# kf = cva.KFold(train.shape[0], n_folds=3, random_state=1)

# #Running the algorithm with the kfolds
# scores = cva.cross_val_score(alg, train[predictors], train['Survived'], cv=kf)
# print(scores.mean())

# #Let's predict on titanicTest!
# alg.fit(train[predictors], train['Survived'])
# predictions = alg.predict(test[predictors])

# #Create a submission file
# submission = pd.DataFrame({
#        'PassengerId':test['PassengerId'],
#        'Survived':predictions
#    })

# submission.to_csv('titanic_submission.csv',index=False)

# submission.head()
