In [2]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Print the full path of every file found under the Kaggle input directory.
for folder, _subdirs, files in os.walk('/kaggle/input'):
    for path in (os.path.join(folder, name) for name in files):
        print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

## First Look of the data

In [3]:

# Load the training set, using the PassengerId column as the row index.
titanic_data = pd.read_csv(
    '../input/titanic/train.csv',
    index_col='PassengerId',
)
# Preview the first rows.
titanic_data.head()

Let's check how many records are complete, i.e. have no missing values.

In [6]:
# Show only the rows that have no missing value in any column.
titanic_data.dropna(axis=0, how='any')

Only 183 records have completely valid fields.<br>
But we can get rid of some fields.

In [5]:
# Columns kept for the analysis.
# (An earlier attempt used: Pclass, Age, SibSp, Parch, Ticket, Fare, Cabin, Embarked.)
features = [
    'Survived',
    'Pclass',
    'Sex',
    'Age',
    'SibSp',
    'Parch',
    'Fare',
    'Embarked',
]

# Keep only the rows that are complete across the selected columns.
titanic_data_filtered = titanic_data.loc[:, features].dropna()
titanic_data_filtered

After some experimenting, we found that dropping fields such as Name, Ticket and Cabin leaves many more complete records available to train our ML model.

### Exploratory Data Analysis

In [7]:
# Mean of the Survived column for each value of Sex.
titanic_data['Survived'].groupby(titanic_data['Sex']).mean()


Almost 75% of the women survived, whereas only under 20% of the men survived.

In [8]:
# Mean of the Survived column for each passenger class.
titanic_data['Survived'].groupby(titanic_data['Pclass']).mean()


The data also shows that the survival rate depended on the passenger class:<br>
Upper: 62.9%,
Middle: 47.3%,
Lower: 24.2%

In [9]:
# Mean of Survived per (Sex, Pclass) pair, laid out with one column per class.
survival_by_sex_class = titanic_data.groupby(['Sex', 'Pclass'])['Survived'].mean()
survival_by_sex_class.unstack(level=-1)


The data confirms our intuitions;<br> Nearly all of the first-class women survived,<br>
whereas only a little over 10% of the third-class men survived.

In [10]:
# Mean of Survived for passengers aged 18 or younger versus older than 18.
# A missing Age satisfies neither comparison, so those rows are in neither group.
is_18_or_under = titanic_data['Age'] <= 18
is_over_18 = titanic_data['Age'] > 18
u18 = titanic_data.loc[is_18_or_under, 'Survived'].mean()
o18 = titanic_data.loc[is_over_18, 'Survived'].mean()

# Collect both rates in a small labelled Series for display.
age_surival_rate = pd.Series(
    [u18, o18],
    index=pd.Index(['under_18', 'over_18'], name='Age_bracket'),
    name='Age_bracket_survival',
)
age_surival_rate


In [11]:
# Bucket ages into the intervals (0, 18] and (18, 80].
age = pd.cut(titanic_data['Age'], bins=[0, 18, 80])

# Mean of Survived per (Sex, age bucket, Pclass), with one column per class.
survival_by_sex_age_class = titanic_data.groupby(['Sex', age, 'Pclass'])['Survived'].mean()
survival_by_sex_age_class.unstack(level=-1)

In [12]:
# Flatten the Sex x age-bucket x Pclass survival rates into one labelled Series
# so that every combination can be drawn as a single bar.
survival_by_group = (
    titanic_data.groupby(['Sex', age, 'Pclass'])['Survived'].mean().reset_index()
)

# Build one label per group, e.g. "female_1_(0, 18]".
# - Every part is cast to str so the concatenation also works when Sex holds
#   integer codes (a later cell encodes it in place).
# - A separator is placed between the class and the age bucket; without it the
#   two run together ("female_1(0, 18]") and are hard to read on the chart axis.
group_labels = (
    survival_by_group['Sex'].astype(str)
    + '_' + survival_by_group['Pclass'].astype(str)
    + '_' + survival_by_group['Age'].astype(str)
)

# Sorted ascending so the bar chart runs from lowest to highest survival rate.
sex_class_age_survival = pd.Series(
    list(survival_by_group['Survived']), index=group_labels
).sort_values()

In [13]:
import seaborn as sns
import matplotlib.pyplot as plt

In [14]:
# Bar chart of the values held in sex_class_age_survival, one bar per index label.
plt.figure(figsize = (10, 6))
# Request a 60-degree rotation for the x tick labels (the category names are long).
plt.xticks(rotation = 60)
plt.xlabel('Category')
plt.ylabel('Survival Rate')
plt.title('Survival Rate on Titanic based on Sex, Class and Age')
# x receives the category labels, y the survival rate of each category.
sns.barplot(x = sex_class_age_survival.index, y = sex_class_age_survival.values)

### Checking out Survival rate with some other factors

In [16]:
# Summary statistics of the Fare column.
titanic_data.loc[:, 'Fare'].describe()

In [22]:
# Fare brackets used to compare survival across ticket prices.
# pd.cut builds right-closed, left-open intervals by default, so with a first
# edge of 0 any passenger whose fare is exactly 0 would fall outside every
# bracket and silently vanish from the table. include_lowest=True closes the
# first bracket on the left so those passengers are counted.
price = pd.cut(
    titanic_data['Fare'],
    bins=[0, 8, 15, 32, 100, 200, 300, 513],
    include_lowest=True,
)

# For each bracket: mean of Survived and the number of rows (len).
titanic_data.groupby(price)['Survived'].aggregate(['mean', len])

Let's see the fare cost based on sex and class

In [23]:
# Mean Fare per (Sex, Pclass) pair, laid out with one column per class.
mean_fare_by_sex_class = titanic_data.groupby(['Sex', 'Pclass'])['Fare'].mean()
mean_fare_by_sex_class.unstack(level=-1)

So we discovered that, on average, the fare paid by females was higher than the fare paid by males.

In [34]:
# Coarser fare brackets, crossed with the port of embarkation.
# include_lowest=True closes the first bracket on the left: pd.cut intervals
# are left-open by default, which would otherwise leave any fare of exactly 0
# outside every bracket and drop those passengers from the table.
fare_bracket = pd.cut(
    titanic_data['Fare'],
    bins=[0, 8, 15, 32, 513],
    include_lowest=True,
)

# For each (Embarked, bracket): number of rows (len) and mean of Survived,
# with the fare brackets spread across the columns.
titanic_data.groupby(['Embarked', fare_bracket])['Survived'].aggregate([len, 'mean']).unstack()

# Creating, Fitting and Predicting ML model

In [35]:
from sklearn.ensemble import RandomForestRegressor
# from sklearn.metrics import mean_absolute_error

In [86]:
# Feature columns fed to the main model.
modelling_features = ['Pclass', 'Sex', 'Age', 'SibSp', 'Parch', 'Fare', 'Embarked']

# The text columns are replaced with integer codes before modelling.
# The result is assigned back to the column instead of calling
# `.replace(..., inplace=True)` on `titanic_data[col]`: that chained in-place
# call operates on an intermediate Series, raises a FutureWarning on recent
# pandas, and does not update the DataFrame when Copy-on-Write is enabled.
# Re-running this cell is harmless because already-encoded values match none of
# the keys and are left unchanged.
# female -> 0, male -> 1
titanic_data['Sex'] = titanic_data['Sex'].replace({'female': 0, 'male': 1})
# S -> 1, Q -> 2, C -> 3
titanic_data['Embarked'] = titanic_data['Embarked'].replace({'S': 1, 'Q': 2, 'C': 3})

# Train only on rows where every modelling feature is present, and pick the
# Survived labels of exactly those rows.
train_x = titanic_data[modelling_features].dropna()
train_y = titanic_data.loc[train_x.index, 'Survived']

# Create and fit the model (fixed random_state for reproducible results).
titanic_model = RandomForestRegressor(random_state=1)
titanic_model.fit(train_x, train_y)

#### Model for those columns that have missing values

The test data has missing values in some columns: Cabin, Age and Fare.

In [95]:
# Feature set without Age, for rows whose Age is missing.
features_sin_age = ['Pclass', 'Sex', 'SibSp', 'Parch', 'Fare', 'Embarked']

# Rows complete in the reduced feature set, together with their Survived labels.
train_x = titanic_data.loc[:, features_sin_age].dropna()
train_y = titanic_data.loc[train_x.index, 'Survived']

# Note: despite its name, age_model is fitted on Survived; it simply does not
# take Age as an input.
age_model = RandomForestRegressor(random_state=1)
age_model.fit(X=train_x, y=train_y)

In [96]:
# Feature set without Fare, for rows whose Fare is missing.
features_sin_fare = ['Pclass', 'Sex', 'SibSp', 'Parch', 'Age', 'Embarked']

# Rows complete in the reduced feature set, together with their Survived labels.
train_x = titanic_data.loc[:, features_sin_fare].dropna()
train_y = titanic_data.loc[train_x.index, 'Survived']

# Note: despite its name, fare_model is fitted on Survived; it simply does not
# take Fare as an input.
fare_model = RandomForestRegressor(random_state=1)
fare_model.fit(X=train_x, y=train_y)

In [97]:
# Feature set without Age and Fare, for rows where both are missing.
features_sin_age_fare = ['Pclass', 'Sex', 'SibSp', 'Parch', 'Embarked']

# Rows complete in the reduced feature set, together with their Survived labels.
train_x = titanic_data.loc[:, features_sin_age_fare].dropna()
train_y = titanic_data.loc[train_x.index, 'Survived']

# Fitted on Survived, using neither Age nor Fare as an input.
age_fare_model = RandomForestRegressor(random_state=1)
age_fare_model.fit(X=train_x, y=train_y)

## Performing predictions on the test data

In [100]:
# Load the test set, using the PassengerId column as the row index.
test_data = pd.read_csv('../input/titanic/test.csv', index_col='PassengerId')

# Apply the same integer encoding that was used on the training data.
# The result is assigned back to the column instead of calling
# `.replace(..., inplace=True)` on `test_data[col]`: that chained in-place call
# operates on an intermediate Series, raises a FutureWarning on recent pandas,
# and does not update the DataFrame when Copy-on-Write is enabled.
# female -> 0, male -> 1
test_data['Sex'] = test_data['Sex'].replace({'female': 0, 'male': 1})
# S -> 1, Q -> 2, C -> 3
test_data['Embarked'] = test_data['Embarked'].replace({'S': 1, 'Q': 2, 'C': 3})

# Rows with every modelling feature present are scored by the main model;
# rows with a missing feature are excluded from val_x.
val_x = test_data[modelling_features].dropna()
predictions = titanic_model.predict(val_x)


In [155]:
# Score the test rows whose Age is missing with the model that does not use Age.
# Rows that also lack another feature in features_sin_age are dropped.
age_is_missing = test_data['Age'].isna()
val_age_x = test_data.loc[age_is_missing, features_sin_age].dropna()
age_predictions = age_model.predict(val_age_x)

In [173]:
# Score the test rows whose Fare is missing with the model that does not use Fare.
# Rows that also lack another feature in features_sin_fare are dropped.
fare_is_missing = test_data['Fare'].isna()
val_fare_x = test_data.loc[fare_is_missing, features_sin_fare].dropna()
fare_predictions = fare_model.predict(val_fare_x)

In [159]:
# Select the test rows in which both Age and Fare are missing.
age_and_fare_missing = test_data[['Age', 'Fare']].isna().all(axis=1)
val_age_fare_x = test_data.loc[age_and_fare_missing]
val_age_fare_x
# Author's observation: there are no rows with both Age and Fare missing.

In [168]:
# PassengerIds of the three scored subsets, concatenated in this order:
# complete rows, rows missing Age, rows missing Fare.
indices = [*val_x.index, *val_age_x.index, *val_fare_x.index]
len(indices)

In [174]:
# Model outputs of the three subsets, concatenated in this order:
# complete rows, rows missing Age, rows missing Fare.
predictions_val = [*predictions, *age_predictions, *fare_predictions]
len(predictions_val)

In [178]:
# The predictions are floats, so a cut-off decides above which value a
# passenger is labelled as having survived.
# Score reported for each survival_threshold that was tried:
#   0.60 -> 0.76
#   0.70 -> 0.77
#   0.75 -> 0.77511
#   0.80 -> 0.7799
#   0.85 -> 0.767
survival_threshold = 0.8

# 1 when the prediction is strictly above the threshold, otherwise 0.
test_predictions = [
    1 if score > survival_threshold else 0
    for score in predictions_val
]

In [179]:
# Fraction of test passengers labelled 1 (test_predictions holds only 0s and 1s).
test_predictions.count(1) / len(test_predictions)

In [187]:
# Predicted labels as a Series named "Survived", indexed by PassengerId.
res = pd.Series(
    data=test_predictions,
    index=pd.Index(indices, name='PassengerId'),
    name='Survived',
)

In [188]:
# Display the Survived labels indexed by PassengerId.
res

In [192]:
# Assemble the two-column submission table and write it without the row index.
output = pd.DataFrame(
    {'PassengerId': indices, 'Survived': test_predictions},
    columns=['PassengerId', 'Survived'],
)
submission_path = 'submission.csv'
output.to_csv(submission_path, index=False)
print("Your submission was successfully saved!")

In [193]:
# Display the submission table (columns PassengerId and Survived).
output