In [1]:
#Importing libraries
import pandas as pd
import numpy as np
%matplotlib inline

#Load training and testing dataset
titanic_train = pd.read_csv('data/train.csv')
titanic_test = pd.read_csv('data/test.csv')

titanic_train.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [2]:
titanic_test.head()

Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,892,3,"Kelly, Mr. James",male,34.5,0,0,330911,7.8292,,Q
1,893,3,"Wilkes, Mrs. James (Ellen Needs)",female,47.0,1,0,363272,7.0,,S
2,894,2,"Myles, Mr. Thomas Francis",male,62.0,0,0,240276,9.6875,,Q
3,895,3,"Wirz, Mr. Albert",male,27.0,0,0,315154,8.6625,,S
4,896,3,"Hirvonen, Mrs. Alexander (Helga E Lindqvist)",female,22.0,1,1,3101298,12.2875,,S


Let us drop the irrelevant features. 'PassengerId', 'Name' 'Ticket' wont influence survival of a person

In [3]:
titanic_train= titanic_train.drop(['PassengerId', 'Name', 'Ticket'], axis=1)
titanic_test = titanic_test.drop(['Name', 'Ticket'], axis=1)

In [4]:
titanic_train['Cabin'].isnull().sum()

687

Since majority of Cabin entries (687 out of 891) are NaN, we can drop it from our learning algorithm.

In [5]:
titanic_train.drop('Cabin', axis = 1, inplace = True)
titanic_test.drop('Cabin', axis = 1, inplace = True)

In [6]:
titanic_train['Embarked'].isnull().sum()

2

In [7]:
titanic_test['Embarked'].isnull().sum()

0

In [8]:
titanic_train['Embarked'].value_counts()

S    644
C    168
Q     77
Name: Embarked, dtype: int64

In [9]:
#Filling N/A values with S as it has maximum occurences
titanic_train['Embarked']= titanic_train['Embarked'].fillna('S')

In [10]:
titanic_train.describe()

Unnamed: 0,Survived,Pclass,Age,SibSp,Parch,Fare
count,891.0,891.0,714.0,891.0,891.0,891.0
mean,0.383838,2.308642,29.699118,0.523008,0.381594,32.204208
std,0.486592,0.836071,14.526497,1.102743,0.806057,49.693429
min,0.0,1.0,0.42,0.0,0.0,0.0
25%,0.0,2.0,20.125,0.0,0.0,7.9104
50%,0.0,3.0,28.0,0.0,0.0,14.4542
75%,1.0,3.0,38.0,1.0,0.0,31.0
max,1.0,3.0,80.0,8.0,6.0,512.3292


In [11]:
average_age_train   = titanic_train["Age"].mean()
std_age_train       = titanic_train["Age"].std()
count_nan_age_train = titanic_train["Age"].isnull().sum()

average_age_test   = titanic_test["Age"].mean()
std_age_test       = titanic_test["Age"].std()
count_nan_age_test = titanic_test["Age"].isnull().sum()

rand_1 = np.random.randint(average_age_train - std_age_train, average_age_train + std_age_train, size = count_nan_age_train)
rand_2 = np.random.randint(average_age_test - std_age_test, average_age_test + std_age_test, size = count_nan_age_test)

titanic_train["Age"][np.isnan(titanic_train["Age"])] = rand_1
titanic_test["Age"][np.isnan(titanic_test["Age"])] = rand_2

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


In [12]:
titanic_train.describe()

Unnamed: 0,Survived,Pclass,Age,SibSp,Parch,Fare
count,891.0,891.0,891.0,891.0,891.0,891.0
mean,0.383838,2.308642,29.477183,0.523008,0.381594,32.204208
std,0.486592,0.836071,13.608438,1.102743,0.806057,49.693429
min,0.0,1.0,0.42,0.0,0.0,0.0
25%,0.0,2.0,20.75,0.0,0.0,7.9104
50%,0.0,3.0,28.0,0.0,0.0,14.4542
75%,1.0,3.0,38.0,1.0,0.0,31.0
max,1.0,3.0,80.0,8.0,6.0,512.3292


In [13]:
titanic_test.describe()

Unnamed: 0,PassengerId,Pclass,Age,SibSp,Parch,Fare
count,418.0,418.0,418.0,418.0,418.0,417.0
mean,1100.5,2.26555,30.058612,0.447368,0.392344,35.627188
std,120.810458,0.841838,13.230438,0.89676,0.981429,55.907576
min,892.0,1.0,0.17,0.0,0.0,0.0
25%,996.25,1.0,21.0,0.0,0.0,7.8958
50%,1100.5,3.0,28.0,0.0,0.0,14.4542
75%,1204.75,3.0,38.0,1.0,0.0,31.5
max,1309.0,3.0,76.0,8.0,9.0,512.3292


In [14]:
titanic_train['Fare'] = titanic_train['Fare'].astype(int)

titanic_test['Fare'].fillna(titanic_test['Fare'].median(), inplace= True)
titanic_test['Fare'] = titanic_test['Fare'].astype(int)

In [15]:
titanic_train.head()

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked
0,0,3,male,22.0,1,0,7,S
1,1,1,female,38.0,1,0,71,C
2,1,3,female,26.0,0,0,7,S
3,1,1,female,35.0,1,0,53,S
4,0,3,male,35.0,0,0,8,S


In [16]:
titanic_train['Family'] = titanic_train['SibSp'] + titanic_train['Parch']
titanic_train['Family'].loc[titanic_train['Family'] >0] =1
titanic_train['Family'].loc[titanic_train['Family'] ==0] =0

titanic_test['Family'] = titanic_test['SibSp'] + titanic_test['Parch']
titanic_test['Family'].loc[titanic_test['Family'] >0] =1
titanic_test['Family'].loc[titanic_test['Family'] ==0] =0

titanic_train.drop(['SibSp','Parch'], axis=1, inplace = True)
titanic_test.drop(['SibSp','Parch'], axis=1, inplace = True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self._setitem_with_indexer(indexer, value)


In [17]:
titanic_train.head()

Unnamed: 0,Survived,Pclass,Sex,Age,Fare,Embarked,Family
0,0,3,male,22.0,7,S,1
1,1,1,female,38.0,71,C,1
2,1,3,female,26.0,7,S,0
3,1,1,female,35.0,53,S,1
4,0,3,male,35.0,8,S,0


In [18]:
less_than_16 = titanic_train[titanic_train['Age']<16]
print len(less_than_16)

less_than_16_and_survived = titanic_train[titanic_train['Age']<16][titanic_train['Survived']==1]
print len(less_than_16_and_survived)

96
53




In [19]:
def get_person(passenger):
    age,sex = passenger
    return 'child' if age < 16 else sex
    
titanic_train['Person'] = titanic_train[['Age','Sex']].apply(get_person,axis=1)
titanic_test['Person']  = titanic_test[['Age','Sex']].apply(get_person,axis=1)

titanic_train.drop(['Sex'],axis=1,inplace=True)
titanic_test.drop(['Sex'],axis=1,inplace=True)

In [20]:
titanic_train.head()

Unnamed: 0,Survived,Pclass,Age,Fare,Embarked,Family,Person
0,0,3,22.0,7,S,1,male
1,1,1,38.0,71,C,1,female
2,1,3,26.0,7,S,0,female
3,1,1,35.0,53,S,1,female
4,0,3,35.0,8,S,0,male


In [21]:
train_encoded = pd.get_dummies(titanic_train)
test_encoded = pd.get_dummies(titanic_test)

In [22]:
train_encoded.head()

Unnamed: 0,Survived,Pclass,Age,Fare,Family,Embarked_C,Embarked_Q,Embarked_S,Person_child,Person_female,Person_male
0,0,3,22.0,7,1,0,0,1,0,0,1
1,1,1,38.0,71,1,1,0,0,0,1,0
2,1,3,26.0,7,0,0,0,1,0,1,0
3,1,1,35.0,53,1,0,0,1,0,1,0
4,0,3,35.0,8,0,0,0,1,0,0,1


In [23]:
test_encoded.head()

Unnamed: 0,PassengerId,Pclass,Age,Fare,Family,Embarked_C,Embarked_Q,Embarked_S,Person_child,Person_female,Person_male
0,892,3,34.5,7,0,0,1,0,0,0,1
1,893,3,47.0,7,1,0,0,1,0,1,0
2,894,2,62.0,9,0,0,1,0,0,0,1
3,895,3,27.0,8,0,0,0,1,0,0,1
4,896,3,22.0,12,1,0,0,1,0,1,0


In [24]:
pclass_dummies_train  = pd.get_dummies(train_encoded['Pclass'])
pclass_dummies_train.columns = ['Class_1','Class_2','Class_3']

pclass_dummies_test  = pd.get_dummies(test_encoded['Pclass'])
pclass_dummies_test.columns = ['Class_1','Class_2','Class_3']

train_encoded.drop(['Pclass'],axis=1,inplace=True)
test_encoded.drop(['Pclass'],axis=1,inplace=True)

train_encoded = train_encoded.join(pclass_dummies_train)
test_encoded = test_encoded.join(pclass_dummies_test)

In [25]:
train_encoded.head()

Unnamed: 0,Survived,Age,Fare,Family,Embarked_C,Embarked_Q,Embarked_S,Person_child,Person_female,Person_male,Class_1,Class_2,Class_3
0,0,22.0,7,1,0,0,1,0,0,1,0,0,1
1,1,38.0,71,1,1,0,0,0,1,0,1,0,0
2,1,26.0,7,0,0,0,1,0,1,0,0,0,1
3,1,35.0,53,1,0,0,1,0,1,0,1,0,0
4,0,35.0,8,0,0,0,1,0,0,1,0,0,1


In [26]:
test_encoded.head()

Unnamed: 0,PassengerId,Age,Fare,Family,Embarked_C,Embarked_Q,Embarked_S,Person_child,Person_female,Person_male,Class_1,Class_2,Class_3
0,892,34.5,7,0,0,1,0,0,0,1,0,0,1
1,893,47.0,7,1,0,0,1,0,1,0,0,0,1
2,894,62.0,9,0,0,1,0,0,0,1,0,1,0
3,895,27.0,8,0,0,0,1,0,0,1,0,0,1
4,896,22.0,12,1,0,0,1,0,1,0,0,0,1


In [27]:
#Separating features and labels in training set
survival = train_encoded['Survived']
features = train_encoded.drop('Survived', axis = 1)

from sklearn.cross_validation import train_test_split

X_train, X_test, y_train, y_test = train_test_split(features, survival, test_size = 0.2, random_state = 0)

print "Training set has {} samples.".format(X_train.shape[0])
print "Testing set has {} samples.".format(X_test.shape[0])

Training set has 712 samples.
Testing set has 179 samples.




In [28]:
from sklearn.metrics import fbeta_score, accuracy_score
from time import time

def train_predict(learner, X_train, y_train, X_test, y_test): 

    results = {}
    
    start = time()
    learner.fit(X_train, y_train)
    end = time()
    
    results['train_time'] = end - start
        
    start = time()
    predictions_test = learner.predict(X_test)
    end = time()

    results['pred_time'] = end - start
        
    #Compute accuracy on test set
    results['acc_test'] = accuracy_score(y_test, predictions_test)
    
    #Compute F-score on the test set
    results['f_test'] = fbeta_score(y_test, predictions_test, beta=0.5)
        
    # Return the results
    return results

In [29]:
from sklearn.svm import LinearSVC, SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import GaussianNB

#Initialize the three models
clf_A = RandomForestClassifier(random_state=42)
clf_B = GaussianNB()
clf_C = LinearSVC(random_state=42)

results = {}
for clf in [clf_A, clf_B, clf_C]:
    clf_name = clf.__class__.__name__
    results[clf_name] = \
    train_predict(clf, X_train, y_train, X_test, y_test)

In [30]:
results

{'GaussianNB': {'acc_test': 0.78770949720670391,
  'f_test': 0.71072319201995016,
  'pred_time': 0.00030303001403808594,
  'train_time': 0.0010318756103515625},
 'LinearSVC': {'acc_test': 0.78770949720670391,
  'f_test': 0.71618037135278523,
  'pred_time': 0.02811717987060547,
  'train_time': 0.06043601036071777},
 'RandomForestClassifier': {'acc_test': 0.82681564245810057,
  'f_test': 0.78864353312302848,
  'pred_time': 0.008910894393920898,
  'train_time': 0.043797969818115234}}

In [31]:
#Import 'GridSearchCV', 'make_scorer'
from sklearn.grid_search import GridSearchCV
from sklearn.metrics import make_scorer

#Initialize the classifier
clf = RandomForestClassifier()

#Create the parameters list we wish to tune
parameters = {'n_estimators': [10,50,100],
             'min_samples_split': [2,10,20,50],
             'max_features': ['sqrt', 'log2', 'auto']}

#Make an fbeta_score scoring object
scorer = make_scorer(fbeta_score, beta=0.5)

#Perform grid search on the classifier using 'scorer' as the scoring method
grid_obj = GridSearchCV(clf, parameters, scoring=scorer)

#Fit the grid search object to the training data and find the optimal parameters
grid_fit = grid_obj.fit(X_train, y_train)

# Get the estimator
best_clf = grid_fit.best_estimator_

# Make predictions using the best model
predictions = (clf.fit(X_train, y_train)).predict(X_test)
best_predictions = best_clf.predict(X_test)

# Report the before-and-afterscores
print "Unoptimized model\n------"
print "Accuracy score on testing data: {:.4f}".format(accuracy_score(y_test, predictions))
print "F-score on testing data: {:.4f}".format(fbeta_score(y_test, predictions, beta = 0.5))
print "\nOptimized Model\n------"
print "Final accuracy score on the testing data: {:.4f}".format(accuracy_score(y_test, best_predictions))
print "Final F-score on the testing data: {:.4f}".format(fbeta_score(y_test, best_predictions, beta = 0.5))



Unoptimized model
------
Accuracy score on testing data: 0.8156
F-score on testing data: 0.7807

Optimized Model
------
Final accuracy score on the testing data: 0.8324
Final F-score on the testing data: 0.8185


In [32]:
best_clf

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='sqrt', max_leaf_nodes=None,
            min_impurity_split=1e-07, min_samples_leaf=1,
            min_samples_split=10, min_weight_fraction_leaf=0.0,
            n_estimators=50, n_jobs=1, oob_score=False, random_state=None,
            verbose=0, warm_start=False)

In [33]:
test_encoded.head()

Unnamed: 0,PassengerId,Age,Fare,Family,Embarked_C,Embarked_Q,Embarked_S,Person_child,Person_female,Person_male,Class_1,Class_2,Class_3
0,892,34.5,7,0,0,1,0,0,0,1,0,0,1
1,893,47.0,7,1,0,0,1,0,1,0,0,0,1
2,894,62.0,9,0,0,1,0,0,0,1,0,1,0
3,895,27.0,8,0,0,0,1,0,0,1,0,0,1
4,896,22.0,12,1,0,0,1,0,1,0,0,0,1


In [34]:
passenger_id = test_encoded['PassengerId']
test_encoded.drop('PassengerId', axis = 1, inplace = True)

In [35]:
pred = best_clf.predict(test_encoded)

In [36]:
pred

array([0, 0, 0, 0, 1, 1, 1, 0, 1, 0, 0, 0, 1, 0, 1, 1, 0, 0, 1, 1, 0, 1, 1,
       0, 1, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0,
       0, 0, 1, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1, 1, 1, 0, 0,
       1, 1, 0, 0, 0, 1, 0, 0, 1, 0, 1, 1, 0, 1, 0, 0, 0, 1, 0, 1, 1, 1, 0,
       1, 0, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1,
       0, 0, 1, 0, 1, 1, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 1, 1, 0, 1,
       1, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 1, 0, 1, 1, 0, 0, 1, 0,
       1, 0, 1, 0, 0, 0, 0, 1, 1, 0, 1, 0, 1, 1, 0, 0, 0, 1, 0, 1, 0, 0, 1,
       0, 1, 0, 0, 0, 0, 1, 0, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 1, 0, 1, 0, 0,
       1, 1, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0, 0, 0, 0, 1, 0, 1, 0, 1, 0, 0,
       0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 1,
       0, 0, 0, 0, 0, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0,
       0, 0,

In [37]:
submission = pd.DataFrame({
        "PassengerId": passenger_id,
        "Survived": pred
    })
submission.to_csv('titanic.csv', index=False)