# Titanic Survival Predictions

Import pandas, then turn the CSV file into pandas DataFrame.

In [1153]:
import numpy as np
import pandas as pd
%matplotlib inline

# Read the training set; the first CSV column becomes the DataFrame index.
train_csv = "train.csv"
tt = pd.read_csv(train_csv, index_col=0)
tt.head()

Unnamed: 0_level_0,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [1154]:
# Dimensions first, then per-column dtypes and non-null counts.
print(tt.shape)
tt.info()

(891, 11)
<class 'pandas.core.frame.DataFrame'>
Int64Index: 891 entries, 1 to 891
Data columns (total 11 columns):
Survived    891 non-null int64
Pclass      891 non-null int64
Name        891 non-null object
Sex         891 non-null object
Age         714 non-null float64
SibSp       891 non-null int64
Parch       891 non-null int64
Ticket      891 non-null object
Fare        891 non-null float64
Cabin       204 non-null object
Embarked    889 non-null object
dtypes: float64(2), int64(4), object(5)
memory usage: 83.5+ KB


So we have 891 instances **(rows)** and 11 **columns**: the `Survived` label plus 10 features.

We also notice that Age, Cabin, and Embarked columns contain missing values.

We are going to look at each feature (column) **separately**.

First, we will see how many people survived the sinking. 0 = No & 1 = Yes. This is also the label, so we store it in a
new variable, train_labels; the Survived column itself is dropped from tt later, during feature selection.

In [1155]:
# Pull the target column out as an integer Series, then tally each class.
train_labels = tt['Survived'].astype('int')
train_labels.value_counts()

0    549
1    342
Name: Survived, dtype: int64

Let's see how many males and females there are, and how many of each gender survived.

In [1156]:
# Survival counts within each sex.
tt.groupby(['Sex'])['Survived'].value_counts()

Sex     Survived
female  1           233
        0            81
male    0           468
        1           109
Name: Survived, dtype: int64

Lets see how many survived from each Pclass. 0 = Did Not Survive, 1 = Survived

In [1157]:
# Survival counts within each passenger class.
tt.groupby(['Pclass'])['Survived'].value_counts()

Pclass  Survived
1       1           136
        0            80
2       0            97
        1            87
3       0           372
        1           119
Name: Survived, dtype: int64

We will create a new column named **familySize** that will be the sum of SibSp and Parch.

Then we will create another column, **isAlone**, whose value will be 1 if the passenger is alone (familySize is 0) and 0 otherwise.

In [1158]:
# Relatives aboard = siblings/spouses + parents/children.
tt['familySize'] = tt.SibSp + tt.Parch
# Evaluated but not displayed, because it is not the cell's last expression.
tt.groupby('familySize')['Survived'].value_counts()
# Start isAlone as a copy of familySize; it is converted to a 0/1 flag below.
tt['isAlone'] = tt.familySize

def checkAlone(x):
    """Return 0 when ``x`` is greater than 0, otherwise 1.

    The notebook applies this to a family-size value, so the result is 1
    for a passenger travelling alone and 0 for one with relatives aboard.
    """
    return 0 if x > 0 else 1
        
# Convert the copied family sizes into the 0/1 alone flag, then preview both columns.
tt['isAlone'] = tt['isAlone'].apply(checkAlone)
tt.loc[:, ['familySize', 'isAlone']].head(4)

Unnamed: 0_level_0,familySize,isAlone
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1
1,1,0
2,1,0
3,0,1
4,1,0


Let's look at the Embarked column; it has 2 missing entries. We are going to replace the missing values with the most frequently occurring class, 'S'.

In [1159]:
# Fill the missing ports with 'S', then count survival per port.
tt['Embarked'] = tt['Embarked'].fillna('S')
tt.groupby('Embarked')['Survived'].value_counts()

Embarked  Survived
C         1            93
          0            75
Q         0            47
          1            30
S         0           427
          1           219
Name: Survived, dtype: int64

Lets check the Fare column. And use binning to categorize this column.

In [1160]:
# Evaluated but not displayed, because it is not the cell's last expression.
tt['Fare'].describe()
# Split Fare into four equal-frequency (quartile) bins and count survival per bin.
tt['categoricalFare'] = pd.qcut(tt['Fare'], 4)
tt.groupby('categoricalFare')['Survived'].value_counts()

categoricalFare  Survived
(-0.001, 7.91]   0           179
                 1            44
(7.91, 14.454]   0           156
                 1            68
(14.454, 31.0]   0           121
                 1           101
(31.0, 512.329]  1           129
                 0            93
Name: Survived, dtype: int64

Age has 177 missing values. We are going to use binning in age after we fill in the missing values.

In [1161]:
# Mask of rows with no recorded age. It also sizes the random draw below, so
# the number of missing rows is never hard-coded and cannot drift from the data.
missing_age = tt.Age.isnull()

# Impute with random integers from the band mean +/- 1.5 standard deviations.
# np.random.randint draws from the half-open range [lower, upper).
age_mean, age_std = tt.Age.mean(), tt.Age.std()
upper = age_mean + 1.5 * age_std
lower = age_mean - 1.5 * age_std
np.random.seed(1)  # fixed seed so the imputed ages are reproducible
random_age = np.random.randint(lower, upper, missing_age.sum())
tt.loc[missing_age, 'Age'] = random_age

# Five equal-width age bands, then survival counts per band.
tt['categoricalAge'] = pd.cut(tt.Age, 5)
tt.groupby('categoricalAge').Survived.value_counts()

categoricalAge    Survived
(0.34, 16.336]    0            77
                  1            69
(16.336, 32.252]  0           277
                  1           149
(32.252, 48.168]  0           142
                  1            90
(48.168, 64.084]  0            43
                  1            33
(64.084, 80.0]    0            10
                  1             1
Name: Survived, dtype: int64

# Data Cleaning

Now we will turn the non-numerical data into numerical values to feed into our ML model.

tt.isAlone is already done.

In [1162]:
# Encode the text columns as small integers.
tt['Sex'] = tt.Sex.map({'female': 0, 'male': 1})  # mapping Sex column
tt['Embarked'] = tt.Embarked.map({'S': 0, 'C': 1, 'Q': 2}).astype(int)  # mapping Embarked column

# Bin Fare into codes 0-3 on the quartile edges reported by pd.qcut above.
# The top bin is left open-ended: qcut displays its edges rounded to 3
# decimals, so a fare slightly above the displayed 512.329 would fail a
# `<= 512.329` check, stay unbinned, and be truncated to a large integer.
# pd.cut uses right-closed intervals (a, b], matching the displayed bins, and
# yields NaN for anything outside the edges, so astype fails loudly instead.
fare_edges = [-0.001, 7.91, 14.454, 31, np.inf]
tt['Fare'] = pd.cut(tt.Fare, bins=fare_edges, labels=False).astype('int')

# Bin Age into codes 0-4: <=16, (16, 32], (32, 48], (48, 64], >64.
age_edges = [-np.inf, 16, 32, 48, 64, np.inf]
tt['Age'] = pd.cut(tt.Age, bins=age_edges, labels=False).astype('int')

# Feature selection: drop the label, free-text columns and helper columns.
drop_features = ['Survived','Name','SibSp','Parch','Ticket','Cabin','familySize','categoricalFare','categoricalAge']
tt = tt.drop(drop_features, axis=1)
print (tt.head(10))
train = tt.values

             Pclass  Sex  Age  Fare  Embarked  isAlone
PassengerId                                           
1                 3    1    1     0         0        0
2                 1    0    2     3         1        0
3                 3    0    1     1         0        1
4                 1    0    2     3         0        0
5                 3    1    2     1         0        1
6                 3    1    2     1         2        1
7                 1    1    3     3         0        1
8                 3    1    0     2         0        0
9                 3    0    1     1         0        0
10                2    0    0     2         1        0


Let us now feed this data into our machine learning model & **train the decision tree**.

In [1163]:
from sklearn.tree import DecisionTreeClassifier

# Configure an entropy-criterion tree, then fit it on the encoded training matrix.
tree_model = DecisionTreeClassifier(criterion='entropy')
dtclass = tree_model.fit(train, train_labels)

We have trained our decision tree. Now it's time to **clean** our test data so we can make predictions.

In [1164]:
# Read the test set with the first CSV column as the index, then inspect it.
test_data = pd.read_csv("test.csv", index_col=0)
print(test_data.head())
test_data.shape  # 418 instances
test_data.info()

             Pclass                                          Name     Sex  \
PassengerId                                                                 
892               3                              Kelly, Mr. James    male   
893               3              Wilkes, Mrs. James (Ellen Needs)  female   
894               2                     Myles, Mr. Thomas Francis    male   
895               3                              Wirz, Mr. Albert    male   
896               3  Hirvonen, Mrs. Alexander (Helga E Lindqvist)  female   

              Age  SibSp  Parch   Ticket     Fare Cabin Embarked  
PassengerId                                                       
892          34.5      0      0   330911   7.8292   NaN        Q  
893          47.0      1      0   363272   7.0000   NaN        S  
894          62.0      0      0   240276   9.6875   NaN        Q  
895          27.0      0      0   315154   8.6625   NaN        S  
896          22.0      1      1  3101298  12.2875   NaN   

In [1165]:
# Encode the test set with the same integer codes used for the training frame.
test_data['Sex'] = test_data.Sex.map({'female': 0, 'male': 1}).astype(int)  # mapping Sex column
test_data['Embarked'] = test_data.Embarked.map({'S': 0, 'C': 1, 'Q': 2})  # mapping the Embarked column

# isAlone: 1 when the passenger has no siblings/spouse/parents/children aboard.
test_data['familySize'] = test_data['SibSp'] + test_data['Parch']
test_data['isAlone'] = test_data.familySize.apply(checkAlone)

# Fare: fill missing values with the test-set median (computed, not hard-coded).
test_data['Fare'] = test_data.Fare.fillna(test_data.Fare.median())
# Bin on the TRAINING quartile edges so a given Fare code means the same fare
# range in both sets; the model was fitted on those edges. Keep these values
# in sync with the training cell. The top bin is open-ended so fares above the
# rounded 512.329 edge are not left unbinned.
train_fare_edges = [-0.001, 7.91, 14.454, 31, np.inf]
test_data['Fare'] = pd.cut(test_data.Fare, bins=train_fare_edges, labels=False).astype(int)

# Age: impute missing values with random integers from mean +/- 1.5 std.
# The draw is sized from the null mask rather than a hard-coded count, and it
# continues the global NumPy random stream seeded in the earlier age cell.
missing_aget = test_data.Age.isnull()
uppert = test_data.Age.mean() + 1.5*test_data.Age.std()
lowert = test_data.Age.mean() - 1.5*test_data.Age.std()
random_aget = np.random.randint(lowert, uppert, missing_aget.sum())
test_data.loc[missing_aget, 'Age'] = random_aget

# Same age bands as training: <=16, (16, 32], (32, 48], (48, 64], >64.
age_edges = [-np.inf, 16, 32, 48, 64, np.inf]
test_data['Age'] = pd.cut(test_data.Age, bins=age_edges, labels=False).astype(int)

# Feature selection. errors='ignore' skips names in drop_features that the
# test set does not have (e.g. 'Survived'), so no dummy columns are needed.
test_data = test_data.drop(drop_features, axis=1, errors='ignore')

# The model expects exactly the training columns, in the same order.
assert list(test_data.columns) == list(tt.columns), "test/train feature columns differ"

test_val = test_data.values
test_data.head(7)

Unnamed: 0_level_0,Pclass,Sex,Age,Fare,Embarked,isAlone
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
892,3,1,2,0,2,1
893,3,0,2,0,0,0
894,2,1,3,1,2,1
895,3,1,1,1,0,1
896,3,0,1,1,0,0
897,3,1,0,1,0,1
898,3,0,1,0,2,1


# Prediction using Decision Trees

In [1166]:
# Predict with the fitted decision tree and index the results like the test set.
predictions = dtclass.predict(test_val)
predictionsFrame = pd.DataFrame(
    data=predictions,
    index=test_data.index,
    columns=['Survived'],
)

# Save the predictions as a CSV file.
predictionsFrame.to_csv('Desktop/Dtree.csv')

# Optional accuracy check, disabled because it needs a file of true labels:
#true = pd.read_csv("Desktop/truelabels.csv", index_col = 0)
#acc = dtclass.score(test_val,true.values)

# Training & Prediction using Random Forest

In [1168]:
from sklearn.ensemble import RandomForestClassifier

# Training: a 200-tree random forest on the same encoded training matrix.
rfc = RandomForestClassifier(n_estimators = 200).fit(train,train_labels)

# Predictions, indexed like the test set.
predictions_rfc = rfc.predict(test_val)
predictions_rfcFrame = pd.DataFrame(predictions_rfc, columns = ['Survived'], index=test_data.index)

# Write the random-forest predictions to CSV. The frame written here must be
# predictions_rfcFrame; writing predictionsFrame would save the decision-tree
# output under the RF file name.
predictions_rfcFrame.to_csv('Desktop/RF.csv')

# Accuracy = 76.555%