# Titanic Survival Predictions

In [1]:
# Importing Libraries

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt
import seaborn as sns

import random

from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, GradientBoostingClassifier
from sklearn.model_selection import GridSearchCV
from mlxtend.classifier import StackingCVClassifier

First, read the training and test data from the dataset.

In [1]:
# Load the training split and preview its first five rows.
train_data = pd.read_csv(filepath_or_buffer='/kaggle/input/titanic/train.csv')
train_data.head(n=5)

In [1]:
# Load the test split and preview its first five rows.
test_data = pd.read_csv(filepath_or_buffer='/kaggle/input/titanic/test.csv')
test_data.head(n=5)

# Survived People

In [1]:
# Number of training rows for each value of the Survived column.
plt.figure(figsize=(10, 6))
ax = sns.countplot(data=train_data, x='Survived')

## Survived People By Class And Gender

In [1]:
# Mean of Survived per passenger class, with one line per sex.
plt.figure(figsize=(10, 6))
sex_line_style = dict(
    palette={"male":"g","female":"m"},
    markers=["^","o"],
    linestyles=["-","--"],
)
ax = sns.pointplot(data=train_data, x="Pclass", y="Survived", hue="Sex", **sex_line_style)

## Survived People By Gender

In [1]:
# Passenger counts per sex, split by the Survived column.
plt.figure(figsize=(10, 6))
ax = sns.countplot(data=train_data, x='Sex', hue='Survived')

## Survived People By Age 

In [1]:
# Distribution of Age, with Survived passed as the hue variable.
# NOTE(review): only `x` is given (no categorical axis); how `hue` is applied
# in that case may depend on the installed seaborn version — confirm the
# rendered plot actually splits by Survived.
plt.figure(figsize=(10, 6))
ax = sns.boxplot(data=train_data, x='Age', hue='Survived')

## Survived People By PClass

In [1]:
# Passenger counts per class, split by the Survived column.
plt.figure(figsize=(10, 6))
ax = sns.countplot(data=train_data, x='Pclass', hue='Survived')

# Correlation

In [1]:
# Heatmap of the pairwise correlations between the numeric columns.
# The frame still contains text columns at this point, and pandas >= 2.0 no
# longer drops them silently inside DataFrame.corr() (it raises instead), so
# restrict the frame to numeric dtypes explicitly.
f, ax = plt.subplots(figsize=(12, 10))
sns.heatmap(train_data.select_dtypes(include='number').corr(), annot=True, cmap='coolwarm', ax=ax)

In [1]:
# Correlation of every numeric column with Survived, strongest positive first.
# Non-numeric columns are excluded explicitly because DataFrame.corr() raises
# on them in pandas >= 2.0.
corr_matrix = train_data.select_dtypes(include='number').corr()
corr_matrix['Survived'].sort_values(ascending=False)

Pclass has the strongest correlation with Survived (in absolute value).

Let's check whether Pclass really is correlated with survival.

In [1]:
# Count the survivors (Survived == 1) in each passenger class.
# The Series is passed as `x=` by keyword: seaborn's first positional
# parameter is `data` in current releases, so a positional Series is no longer
# interpreted as the variable to count.
survivor_classes = train_data.loc[train_data['Survived'] == 1, 'Pclass']
sns.countplot(x=survivor_classes).set_title('Count Survived people for each class')

Count how many passengers are in each class.

In [1]:
# Number of training rows in classes 1, 2 and 3, in that order.
tuple(len(train_data[train_data['Pclass'] == pclass]) for pclass in (1, 2, 3))

Count how many passengers in each class survived.

In [1]:
# Sum of the Survived column within classes 1, 2 and 3, in that order.
tuple(train_data.loc[train_data['Pclass'] == pclass, 'Survived'].sum() for pclass in (1, 2, 3))

Compute the survival rate for each class.

In [1]:
# Survival rate (mean of Survived) for classes 1, 2 and 3, in that order.
# Derived from train_data rather than typed in by hand from the output of the
# previous cells, so the values cannot drift if the data changes.
precentages = [
    float(train_data.loc[train_data['Pclass'] == pclass, 'Survived'].mean())
    for pclass in (1, 2, 3)
]

# Keep the per-class names available, as before.
first, seconds, third = precentages

In [1]:
# One row per class rate, with the index starting at 1 instead of 0.
percents = pd.DataFrame(precentages, index=range(1, len(precentages) + 1))

In [1]:
# Attach the class labels, name both columns, and plot the rate per class.
cols = ['Percent','Pclass']
percents['Pclass'] = ['1','2','3']
percents.columns = list(cols)
class_rate_plot = sns.barplot(data=percents, x='Pclass', y='Percent')
class_rate_plot.set_title('Percentage of survived passenger class')

From the bar plot above, we can see that first-class passengers had a higher chance of surviving than third-class passengers, who had the lowest chance of survival.

# Cleaning Data

We will look for missing data and clean it up.

In [1]:
# Missing-value count per column of the training data.
train_data.isnull().sum()

In [1]:
# Missing-value count per column of the test data.
test_data.isnull().sum()

First, we fill the missing Age values with the median.

In [1]:
# Both frames are kept in one list so each cleaning step below is applied to
# the training and the test data alike.
df = [train_data,test_data]

for d in df:
    # Each frame is filled with its own Age median.
    # The result is assigned back to the column: calling fillna(inplace=True)
    # on a column selection is chained assignment, which pandas deprecates and
    # which no longer updates the parent frame under Copy-on-Write.
    d['Age'] = d['Age'].fillna(d['Age'].median())

Next, look at the missing values in Cabin. We will fill the empty Cabin values with the most common value.


In [1]:
# Frequency of each Cabin value in the training data.
train_data.Cabin.value_counts()

In [1]:
for d in df:
    # Replace missing Cabin entries with 'C' in both frames.
    # Assigned back to the column because fillna(inplace=True) on a column
    # selection is chained assignment and does not update the frame under
    # pandas Copy-on-Write.
    d['Cabin'] = d['Cabin'].fillna('C')

In [1]:
# Remaining missing Cabin entries in the training data.
train_data['Cabin'].isnull().sum()

Let's simplify Cabin to just its first letter.

In [1]:
# Every training Cabin value converted to a string.
cabins = [str(cabin) for cabin in train_data['Cabin']]

In [1]:
# First character of every cabin string.
words = [cabin[0] for cabin in cabins]

In [1]:
# Overwrite the Cabin column of the training data with the first-character
# values collected in `words`.
train_data['Cabin'] = words

In [1]:
# Preview the first five simplified Cabin values.
train_data['Cabin'].head(n=5)

In [1]:
# Frequency of each simplified Cabin value in the training data.
train_data.Cabin.value_counts()

Let's do the same for the test data.

In [1]:
# Every test Cabin value converted to a string.
cabins = [str(cabin) for cabin in test_data['Cabin']]

In [1]:
# First character of every cabin string.
words = [cabin[0] for cabin in cabins]

In [1]:
# Overwrite the Cabin column of the test data with the first-character values
# collected in `words`.
test_data['Cabin'] = words

In [1]:
# Frequency of each simplified Cabin value in the test data.
test_data.Cabin.value_counts()

#### Now check Embarked

In [1]:
# Number of missing Embarked entries in the training data.
train_data['Embarked'].isnull().sum()

We will fill the missing values with the most frequent value.

In [1]:
# Frequency of each Embarked value in the training data.
train_data.Embarked.value_counts()

OK, the most frequent value is S, so we will fill the missing values with it.

In [1]:
for d in df:
    # Replace missing Embarked entries with 'S' in both frames.
    # Assigned back to the column because fillna(inplace=True) on a column
    # selection is chained assignment and does not update the frame under
    # pandas Copy-on-Write.
    d['Embarked'] = d['Embarked'].fillna('S')

In [1]:
# Re-check the missing-value count per column of the training data.
train_data.isnull().sum()

OK, we are almost done. Now we fill the missing Fare values in test_data.

In [1]:
for d in df:
    # Replace missing Fare entries with the mean Fare of the same frame.
    # Assigned back to the column because fillna(inplace=True) on a column
    # selection is chained assignment and does not update the frame under
    # pandas Copy-on-Write.
    d['Fare'] = d['Fare'].fillna(d['Fare'].mean())

In [1]:
# Re-check the missing-value count per column of the test data.
test_data.isnull().sum()

OK, data cleaning is done.

# Prepare the data

In [1]:
# Family = SibSp + Parch for every passenger.
# Plain column addition gives the same values as the row-wise apply(), without
# invoking a Python lambda once per row.
train_data['Family'] = train_data['SibSp'] + train_data['Parch']
test_data['Family'] = test_data['SibSp'] + test_data['Parch']

In [1]:
# Remove the columns that are not used as features from both frames, in place.
for frame in (train_data, test_data):
    frame.drop(columns=['SibSp','Name','Ticket','Parch'], inplace=True)

In [1]:
# One-hot encode each frame separately.
train_df, test_df = (pd.get_dummies(frame) for frame in (train_data, test_data))

In [1]:
# Remove the PassengerId column from the training features, in place.
train_df.drop(columns=['PassengerId'], inplace=True)

In [1]:
# Target vector.
y = train_df['Survived']

# Training features: everything except the target and the Cabin_T dummy.
# Built as a new frame instead of dropping in place, so this cell can be
# re-run; errors='ignore' keeps it working when a column is already absent.
X = train_df.drop(columns=['Survived', 'Cabin_T'], errors='ignore')

# Test features: drop the identifier, then align to the training columns.
# The two frames were one-hot encoded separately, so reindexing guarantees the
# same columns in the same order (a dummy missing from the test data becomes
# all zeros, and one unseen in training is discarded).
X_test = (
    test_df
    .drop(columns=['PassengerId'], errors='ignore')
    .reindex(columns=X.columns, fill_value=0)
)

# Train Data Using Stacking CV Classifier

Initialize `rfc`, a random forest classifier.

In [1]:
# Base estimator handed to the grid search below.
# A fixed random_state makes the search reproducible from run to run.
rfc = RandomForestClassifier(random_state=42)

Initializing param_grid

In [1]:
# Hyperparameter grid searched for the random forest.
# 'sqrt' replaces 'auto': for classifiers 'auto' meant sqrt(n_features), and
# scikit-learn deprecated the alias in 1.1 and removed it in 1.3, where it is
# rejected as an invalid parameter value.
param_grid = {
    'n_estimators':[200,500,1000],
    'max_features':['sqrt'],
    'max_depth': [6, 7, 8],
    'criterion': ['entropy']
    }

Use GridSearchCV to find the best estimator for the random forest classifier.

In [1]:
# 5-fold grid search over param_grid; the last line displays the winning estimator.
CV = GridSearchCV(rfc, param_grid, cv=5)
CV.fit(X, y)
CV.best_estimator_

In [1]:
# Base learners for the stacked model.
# The random forest takes its hyperparameters from the grid search result
# instead of values copied by hand from an earlier run, so it always matches
# what the search actually selected. All three models get a fixed seed so the
# results are reproducible.
rfc = RandomForestClassifier(**{'random_state': 42, **CV.best_params_})
ada = AdaBoostClassifier(random_state=42)
gbc = GradientBoostingClassifier(random_state=42)

Fitting each model

In [1]:
# Fit every base learner on the full training set.
for estimator in (rfc, ada):
    estimator.fit(X, y)
gbc.fit(X, y)

OK, let's build the model. To get higher accuracy, you can tune the hyperparameters of each model before fitting it.

In [1]:
# Stacked ensemble: the three base learners feed a random-forest meta-classifier,
# which also sees the original features (use_features_in_secondary=True).
# random_state fixes the internal cross-validation splits so the stacked model
# is reproducible between runs.
# NOTE(review): `rfc` is used both as a base learner and as the meta-classifier;
# this relies on the stacker cloning its estimators — confirm.
model = StackingCVClassifier(classifiers = (rfc,ada,gbc),
                                 meta_classifier = rfc,
                                 use_features_in_secondary = True,
                                 random_state = 42)

Train the model

In [1]:
# Train the stacked model on the feature matrix as a NumPy array.
model.fit(X.to_numpy(), y)

Print the model's accuracy score on the training data

In [1]:
# Accuracy on the data the model was trained on (not a held-out estimate).
# Scored on X.values to match how the model was fitted: passing the DataFrame
# to estimators fitted on a plain array triggers scikit-learn's feature-name
# mismatch warnings.
print(model.score(X.values, y))

In [1]:
# Predict the Survived label for every row of the test feature matrix.
prediction = model.predict(X_test.to_numpy())

In [1]:
# Pair each test PassengerId with its prediction and write the submission file.
submission_columns = {
    'PassengerId': test_data['PassengerId'],
    'Survived': prediction,
}
output = pd.DataFrame(submission_columns)
output.to_csv('my_submissions.csv', index=False)