In [None]:
import numpy as np # linear algebra
import pandas as pd # data processing
import os
# Print the full path of every file found under the Kaggle input directory.
input_file_paths = (
    os.path.join(folder, name)
    for folder, _, names in os.walk('/kaggle/input')
    for name in names
)
for input_path in input_file_paths:
    print(input_path)

In [None]:
# Load the training split and preview its first rows.
TRAIN_CSV_PATH = "/kaggle/input/titanic/train.csv"
train_data = pd.read_csv(TRAIN_CSV_PATH)
train_data.head()

In [None]:
# Load the test split and preview its first rows.
TEST_CSV_PATH = "/kaggle/input/titanic/test.csv"
test_data = pd.read_csv(TEST_CSV_PATH)
test_data.head()

In [None]:
# Summary statistics for every column; include='all' also covers the non-numeric columns.
train_data.describe(include='all')

> **Missing values in the train and test sets**

In [None]:
# Number of missing values in each column of the training set.
train_data.isna().sum()

In [None]:
# Number of missing values in each column of the test set.
test_data.isna().sum()

# Visual Representation of Features with respect to Survival

In [None]:
import seaborn as sns

# Mean of Survived for each value of Sex.
sns.barplot(data=train_data, x="Sex", y="Survived")

The graph above clearly shows that the survival rate of females is higher than that of males.

In [None]:
# Mean of Survived for each value of SibSp.
sns.barplot(data=train_data, x="SibSp", y="Survived")

The graph above compares the survival rate of passengers who travelled with siblings or spouses aboard (SibSp > 0) against those who travelled without any (SibSp = 0).

In [None]:
# Mean of Survived for each passenger class.
sns.barplot(data=train_data, x="Pclass", y="Survived")

In [None]:
# Mean survival per passenger class, listed from highest to lowest.
pclass_survival = train_data[['Pclass', 'Survived']].groupby(['Pclass'], as_index=False).mean()
pclass_survival.sort_values(by='Survived', ascending=False)

pclass: A proxy for socio-economic status (SES)
[1st = Upper ,
2nd = Middle ,
3rd = Lower]

So passengers of higher socio-economic status had a higher chance of survival than the others.

In [None]:
# Mean of Survived for each value of Parch.
sns.barplot(data=train_data, x="Parch", y="Survived")

parch: The dataset defines family relations in this way...
Parent = mother, father
Child = daughter, son, stepdaughter, stepson
Some children travelled only with a nanny, therefore parch=0 for them.

In [None]:
import matplotlib.pyplot as plt
%matplotlib inline

# One Age histogram (20 bins) per value of Survived, laid out as columns.
age_graph = sns.FacetGrid(data=train_data, col='Survived')
age_graph.map(plt.hist, 'Age', bins=20)

Raw ages are hard to interpret in the histograms above, so we group them into categories.

In [None]:
# Sort the ages into logical categories.
# A missing age is replaced by the sentinel -0.5, which falls into the first
# bin (-1, 0] and is therefore labelled 'Unknown'.
AGE_SENTINEL = -0.5
bins = [-1, 0, 5, 12, 18, 24, 35, 60, np.inf]
labels = ['Unknown', 'Baby', 'Child', 'Teenager', 'Student', 'Young Adult', 'Adult', 'Senior']
for frame in (train_data, test_data):
    frame["Age"] = frame["Age"].fillna(AGE_SENTINEL)
    frame['AgeGroup'] = pd.cut(frame["Age"], bins, labels=labels)

# Draw a bar plot of age group vs. survival
sns.barplot(data=train_data, x="AgeGroup", y="Survived")

# **Handling "Embarked" Feature**

In [None]:
# Training rows whose Embarked value is missing
train_data[train_data["Embarked"].isna()]

In [None]:
# Replace missing Embarked values with "S"; all other columns are left untouched.
embarked_filled = train_data["Embarked"].fillna("S")
train_data = train_data.assign(Embarked=embarked_filled)

# **Dropping unnecessary columns from the dataset**

In [None]:
# Remove the same unused columns from both frames.
unused_columns = ['Ticket', 'Cabin']
train_data = train_data.drop(columns=unused_columns)
test_data = test_data.drop(columns=unused_columns)

# Title feature
*Name feature can be engineered to extract titles and test correlation between titles and survival, before dropping Name and PassengerId features.*

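In the following code we extract the Title feature using a regular expression. The pattern ` ([A-Za-z]+)\.` captures the first run of letters that is preceded by a space and followed by a dot within the Name feature. Because the pattern has a single capture group, `expand=False` makes `str.extract` return a Series rather than a DataFrame.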

In [None]:
# Keep both frames in one list so the same feature engineering is applied to each.
combine = [train_data, test_data]

# Extract a title for each Name in the train and test datasets.
# The title is the run of letters that follows a space and ends with a literal dot.
# The pattern is a raw string so that "\." reaches the regex engine as written instead of
# being an invalid Python string escape (a SyntaxWarning on recent Python versions).
for dataset in combine:
    dataset['Title'] = dataset.Name.str.extract(r' ([A-Za-z]+)\.', expand=False)

# Cross-tabulate the extracted titles against Sex as a sanity check.
pd.crosstab(train_data['Title'], train_data['Sex'])

In [None]:
# Replace various titles with more common names.
# All replacements are collected in one mapping and applied in a single pass.
# 'Lady' used to be listed under both 'Rare' and 'Royal'; the 'Rare' replacement ran first
# and always won, so the unreachable 'Royal' entry is dropped and 'Lady' stays 'Rare'.
title_groups = {
    **dict.fromkeys(
        ['Lady', 'Capt', 'Col', 'Don', 'Dr', 'Major', 'Rev', 'Jonkheer', 'Dona'],
        'Rare',
    ),
    **dict.fromkeys(['Countess', 'Sir'], 'Royal'),
    'Mlle': 'Miss',
    'Ms': 'Miss',
    'Mme': 'Mrs',
}
for dataset in combine:
    dataset['Title'] = dataset['Title'].replace(title_groups)

# Mean survival per grouped title.
train_data[['Title', 'Survived']].groupby(['Title'], as_index=False).mean()

In [None]:
# Replace title groups with numerical values; any title outside these groups becomes 0.
# "Royal" has to be listed: the title-grouping step produces it, and the age-imputation
# step looks up code 5 for Royal and code 6 for Rare. Without it, Royal passengers were
# silently coded 0 and Rare took the code reserved for Royal.
title_mapping = {"Mr": 1, "Miss": 2, "Mrs": 3, "Master": 4, "Royal": 5, "Rare": 6}
for dataset in combine:
    dataset['Title'] = dataset['Title'].map(title_mapping).fillna(0)

train_data.head()

# Handling Age Feature

In [None]:
# Fill 'Unknown' age groups with an age group chosen per title code.
# The mapping is hard-coded by the notebook author (intended as the most common age group
# for each title); the keys are the numeric title codes assigned when Title was encoded.
age_title_mapping = {1: "Young Adult", 2: "Student", 3: "Adult", 4: "Baby", 5: "Adult", 6: "Adult"}


def _fill_unknown_age_groups(frame, title_to_age_group):
    """Overwrite 'Unknown' entries of frame['AgeGroup'] in place, based on frame['Title'].

    Uses one boolean-mask ``.loc`` assignment instead of chained indexing
    (``frame["AgeGroup"][i] = ...``), which triggers SettingWithCopyWarning and does not
    write through under pandas Copy-on-Write. It also no longer assumes a 0..n-1 index.

    Raises:
        KeyError: if a row with an unknown age has a title code missing from the mapping
            (the same exception type the per-row dictionary lookup used to raise).
    """
    unknown_age = frame["AgeGroup"] == "Unknown"
    replacement = frame.loc[unknown_age, "Title"].map(title_to_age_group)
    unmapped = replacement.isna()
    if unmapped.any():
        bad_titles = sorted(frame.loc[unmapped[unmapped].index, "Title"].unique())
        raise KeyError(f"no age group defined for title code(s) {bad_titles}")
    frame.loc[unknown_age, "AgeGroup"] = replacement


for frame in (train_data, test_data):
    _fill_unknown_age_groups(frame, age_title_mapping)

In [None]:
# Assign each age group a numerical value.
age_mapping = {'Baby': 1, 'Child': 2, 'Teenager': 3, 'Student': 4, 'Young Adult': 5, 'Adult': 6, 'Senior': 7}
train_data['AgeGroup'] = train_data['AgeGroup'].map(age_mapping)
test_data['AgeGroup'] = test_data['AgeGroup'].map(age_mapping)

# Drop the raw Age column now that AgeGroup has been derived from it.
train_data = train_data.drop(['Age'], axis = 1)
test_data = test_data.drop(['Age'], axis = 1)

# The preview goes last: only the final expression of a cell is rendered, so the
# head() call that used to sit in the middle of this cell never displayed anything.
train_data.head()

In [None]:
# Name is no longer needed once the titles have been extracted from it.
name_column = ['Name']
train_data = train_data.drop(columns=name_column)
test_data = test_data.drop(columns=name_column)

# Sex Feature

In [None]:
# Encode Sex numerically in both frames (male -> 0, female -> 1).
sex_mapping = {"male": 0, "female": 1}
for frame in (train_data, test_data):
    frame['Sex'] = frame['Sex'].map(sex_mapping)

train_data.head()

Embarked Mapping is required to convert string to numeric values of "Embarked" column

In [None]:
# Encode the Embarked codes numerically in both frames.
embarked_mapping = {"S": 1, "C": 2, "Q": 3}
for frame in (train_data, test_data):
    frame['Embarked'] = frame['Embarked'].map(embarked_mapping)

train_data.head()

# Fare Feature

In [None]:
# Fare is not used as a feature; remove it from both frames.
fare_column = ['Fare']
train_data = train_data.drop(columns=fare_column)
test_data = test_data.drop(columns=fare_column)

In [None]:
# Preview the training frame after all of the encoding and column drops above.
train_data.head()

In [None]:
# Preview the test frame after all of the encoding and column drops above.
test_data.head()

# Building Machine Learning Model

In [None]:
# Hold out part of the labelled data for validation.

from sklearn.model_selection import train_test_split

# Survived is the target and PassengerId is excluded from the features.
X = train_data.drop(columns=['Survived', 'PassengerId'])
Y = train_data["Survived"]
x_train, x_val, y_train, y_val = train_test_split(
    X,
    Y,
    test_size=0.22,
    random_state=0,
)

# Different ML Models

In [None]:
# Fit a range of classifiers on the training split and report each one's accuracy
# on the validation split.
from sklearn.ensemble import GradientBoostingClassifier, RandomForestClassifier
from sklearn.linear_model import LogisticRegression, SGDClassifier
from sklearn.metrics import accuracy_score
from sklearn.naive_bayes import GaussianNB, MultinomialNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn import svm
from sklearn.svm import LinearSVC
from sklearn.tree import DecisionTreeClassifier

# Fixed seed for every estimator that uses randomness, so the reported accuracies and the
# submission built from GBC are reproducible. Same value as the train/validation split.
SEED = 0


def _validation_accuracy(model):
    """Fit `model` on (x_train, y_train) and return its validation accuracy.

    The accuracy is expressed in percent and rounded to two decimals. The model is
    fitted in place, so it can be reused for prediction afterwards.
    """
    model.fit(x_train, y_train)
    # accuracy_score's documented argument order is (y_true, y_pred).
    return round(accuracy_score(y_val, model.predict(x_val)) * 100, 2)


# Gaussian Naive Bayes
G = GaussianNB()
Accuracy_for_gaussian = _validation_accuracy(G)
print("Accuracy_for_gaussian: ", Accuracy_for_gaussian)

# Logistic Regression
L = LogisticRegression()
Accuracy_for_logistic_regression = _validation_accuracy(L)
print("Accuracy_for_logistic_regression: ", Accuracy_for_logistic_regression)

# Random Forest
R = RandomForestClassifier(random_state=SEED)
Accuracy_for_Random_forest = _validation_accuracy(R)
print("Accuracy_for_Random_forest: ", Accuracy_for_Random_forest)

# Linear Support Vector Classification
SVC = LinearSVC(random_state=SEED)
Accuracy_for_LinearSVC = _validation_accuracy(SVC)
print("Accuracy_for_LinearSVC: ", Accuracy_for_LinearSVC)

# Support Vector Machine
clf = svm.SVC()
Accuracy_for_SVM = _validation_accuracy(clf)
print("Accuracy_for_SVM: ", Accuracy_for_SVM)

# Multinomial Naive Bayes
MNB = MultinomialNB()
Accuracy_for_MultinomialNB = _validation_accuracy(MNB)
print("Accuracy_for_MultinomialNB: ", Accuracy_for_MultinomialNB)

# Decision Tree Classifier
DTC = DecisionTreeClassifier(random_state=SEED)
Accuracy_for_DecisionTreeClassifier = _validation_accuracy(DTC)
print("Accuracy_for_DecisionTreeClassifier: ", Accuracy_for_DecisionTreeClassifier)

# Gradient Boosting Classifier (also used later to build the submission file)
GBC = GradientBoostingClassifier(random_state=SEED)
Accuracy_for_GradientBoostingClassifier = _validation_accuracy(GBC)
print("Accuracy_for_GradientBoostingClassifier: ", Accuracy_for_GradientBoostingClassifier)

# Stochastic Gradient Descent Classifier
# loss="log" was deprecated in scikit-learn 1.1 and removed in 1.3; "log_loss" is the
# current name for the same logistic-regression loss.
SGD = SGDClassifier(loss="log_loss", penalty="l2", max_iter=5, random_state=SEED)
Accuracy_for_SGDClassifier = _validation_accuracy(SGD)
print("Accuracy_for_SGDClassifier: ", Accuracy_for_SGDClassifier)

# K-Neighbors Classifier
KN = KNeighborsClassifier()
Accuracy_for_KNeighborsClassifier = _validation_accuracy(KN)
print("Accuracy_for_KNeighborsClassifier: ", Accuracy_for_KNeighborsClassifier)

# Submission File

In [None]:
# Predict with the fitted gradient-boosting model and write the submission file.
ID = test_data['PassengerId']
test_features = test_data.drop(columns='PassengerId')
Predictions = GBC.predict(test_features)

submission_columns = {'PassengerId': ID, 'Survived': Predictions}
output = pd.DataFrame(submission_columns)
output.to_csv('submission.csv', index=False)

# **If you find my notebook helpful then kindly upvote this. Thank You.**