***Hello there!
I hope you are doing well. This is my first end-to-end data science project, so I genuinely encourage you to share your feedback or any suggestion you might have on my work. 
I hope going through this kernel would be worth your time!***

# Importing Necessary Libraries

In [None]:
# Data Analysis
import pandas as pd
import numpy as np

# Data Visualization
from matplotlib import pyplot as plt
import seaborn as sns

# Machine Learning
from sklearn.model_selection import train_test_split, KFold, cross_val_score
from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import GaussianNB

import warnings
warnings.filterwarnings('ignore')

# Data Loading and Cleaning

In [None]:
# Read the training split, using the passenger identifier as the row index
train_path = '../input/titanic/train.csv'
train_df = pd.read_csv(train_path, index_col='PassengerId')
train_df.head()

In [None]:
# Read the test split, using the passenger identifier as the row index
test_path = '../input/titanic/test.csv'
test_df = pd.read_csv(test_path, index_col='PassengerId')
test_df.head()

In [None]:
# Column dtypes and non-null counts for both splits.
# DataFrame.info() writes its report itself and returns None, so wrapping it
# in print() would add a stray "None" line after each summary.
train_df.info()
print('-'*40)
test_df.info()

###  **Insights:**
* The data type and format of each feature are correct.
* There are missing values in Age, Cabin, and Embarked columns of train dataset.
* There are missing values in Age, Cabin, and Fare columns of test dataset.

In [None]:
# Summary of the categorical columns: level counts for Sex and Embarked,
# and the number of distinct Cabin values
divider = '-'*50
cabin_levels = train_df['Cabin'].nunique()

print(train_df['Sex'].value_counts())
print(divider)
print(f'Number of unique values in "Cabin" column: {cabin_levels}')
print(divider)
print(train_df['Embarked'].value_counts())

In [None]:
# Summary statistics of the training columns that describe() reports by default
numeric_summary = train_df.describe()
numeric_summary

In [None]:
# Ten randomly drawn entries of the Name column, to inspect its format
train_df['Name'].sample(n=10)

###  **Insights:**
* All features contain reasonable values, so there is no need to correct any of them.

In [None]:
# Median age per (Pclass, Sex, Embarked) group, used to fill missing ages
group_keys = ['Pclass', 'Sex', 'Embarked']
age_table = train_df.groupby(group_keys, as_index=False)['Age'].median()
age_table

* We will use the age table above, which holds the median age for each combination of Pclass, Sex, and Embarked, to better estimate the missing values in the Age column.
* We will fill 'S' in the Embarked column for missing values because it is the most frequent embarkation port.
* We will use median fare for filling the missing values in Fare column. 
* We will not consider Cabin column as it contains more than 75% missing values.

In [None]:
# Filling the missing values
def find_age(row, table=None):
    """Return the age to use for one passenger row.

    Parameters
    ----------
    row : pandas.Series
        A row carrying the labels 'Pclass', 'Sex', 'Embarked' and 'Age'.
    table : pandas.DataFrame, optional
        Lookup table with the columns 'Pclass', 'Sex', 'Embarked' and 'Age'.
        When omitted, the module-level ``age_table`` is used.

    Returns
    -------
    The row's own age when it is present; otherwise the 'Age' value of the
    first lookup entry matching the row's Pclass, Sex and Embarked. NaN is
    returned when no entry matches, so the value simply stays missing.
    """
    age = row['Age']
    if not pd.isnull(age):
        # Known ages are passed through unchanged.
        return age

    lookup = age_table if table is None else table
    matches = lookup.loc[
        (lookup['Pclass'] == row['Pclass'])
        & (lookup['Sex'] == row['Sex'])
        & (lookup['Embarked'] == row['Embarked']),
        'Age',
    ]
    # A missing Embarked value (or an unseen combination) matches nothing;
    # report NaN instead of failing on an empty selection.
    return matches.iloc[0] if not matches.empty else np.nan

# Columns handed to find_age, in the order it expects them
age_inputs = ['Pclass', 'Sex', 'Embarked', 'Age']

# Fill Embarked before looking up ages: the lookup matches on Embarked, so a
# row missing both Age and Embarked would otherwise match no age_table entry.
train_df['Embarked'] = train_df['Embarked'].fillna('S')
train_df['Age'] = train_df['Age'].fillna(train_df[age_inputs].apply(find_age, axis=1))

# Test split: median fare for the missing Fare, same age lookup for Age
test_df['Fare'] = test_df['Fare'].fillna(test_df['Fare'].median())
test_df['Age'] = test_df['Age'].fillna(test_df[age_inputs].apply(find_age, axis=1))
train_df.sample(5)

In [None]:
# Class Imbalance Check
# Sort the counts by class value (0, 1) and derive each wedge label from that
# value, so a label can never be attached to the wrong class if the
# frequency order of value_counts() changes.
survival_counts = train_df['Survived'].value_counts().sort_index()
survival_labels = survival_counts.index.map({0: 'Not Survived', 1: 'Survived'})
plt.pie(survival_counts, labels=survival_labels, autopct='%0.1f%%')
plt.axis('equal')
plt.show()

###  **Insights:**
* This is not a class imbalance problem.

# Data Visualization

In [None]:
# Share of each passenger class (left) and mean survival per class (right)
fig, (share_ax, rate_ax) = plt.subplots(1, 2, figsize=(12, 4))

# Left panel: class proportions, ordered by class number
class_counts = train_df['Pclass'].value_counts().sort_index()
share_ax.pie(class_counts, labels=['1', '2', '3'], autopct='%0.1f%%')
share_ax.axis('equal')
share_ax.set_title('Pclass')

# Right panel: mean of Survived per class on a fixed 0-1 scale
sns.pointplot(data=train_df, x='Pclass', y='Survived', estimator=np.mean, capsize=0.05, ax=rate_ax)
rate_ax.set_ylim([0, 1])
rate_ax.set_title('Survival Rate for Passenger Classes')

fig.subplots_adjust()
fig.tight_layout()
plt.show()

In [None]:
# Sex distribution and its relation with the survival rate
plt.figure(figsize=(12, 4))

# Subplot 1
plt.subplot(1, 2, 1)
# Take the wedge labels from the counts themselves: value_counts() orders by
# frequency, so a hard-coded label list would mislabel the wedges whenever
# that order differs.
sex_counts = train_df['Sex'].value_counts()
plt.pie(sex_counts, labels=sex_counts.index.str.capitalize(), autopct='%0.1f%%')
plt.axis('equal')
plt.title('Sex')

# Subplot 2: mean of Survived per sex on a fixed 0-1 scale
plt.subplot(1, 2, 2)
sns.pointplot(data=train_df, x='Sex', y='Survived', estimator=np.mean, capsize=0.05)
plt.ylim([0, 1])
plt.title('Survival Rate for Genders')

plt.subplots_adjust()
plt.tight_layout()
plt.show()

In [None]:
# Embarkation ports distribution and its relation with the survival rate
plt.figure(figsize=(12, 4))

# Subplot 1
plt.subplot(1, 2, 1)
# Label each wedge with the port code it was counted for: value_counts()
# orders by frequency, so a hard-coded label list could be misaligned.
port_counts = train_df['Embarked'].value_counts()
plt.pie(port_counts, labels=port_counts.index, autopct='%0.1f%%')
plt.axis('equal')
plt.title('Embarked')

# Subplot 2: mean of Survived per port on a fixed 0-1 scale
plt.subplot(1, 2, 2)
sns.pointplot(data=train_df, x='Embarked', y='Survived', estimator=np.mean, capsize=0.05)
plt.ylim([0, 1])
plt.title('Survival Rate for Embarkation Ports')

plt.subplots_adjust()
plt.tight_layout()
plt.show()

In [None]:
# Passenger counts per SibSp value (left) and mean survival per value (right)
plt.figure(figsize=(12, 4))

# Left panel: one labelled bar per SibSp value, in ascending order
plt.subplot(1, 2, 1)
sibsp_counts = train_df['SibSp'].value_counts().sort_index()
positions = np.arange(len(sibsp_counts))
bars = plt.bar(positions, sibsp_counts)
plt.xticks(positions, sibsp_counts.index)
plt.bar_label(bars)
plt.title('SibSp')

# Right panel: mean of Survived per SibSp value on a fixed 0-1 scale
plt.subplot(1, 2, 2)
sns.pointplot(data=train_df, x='SibSp', y='Survived', estimator=np.mean, capsize=0.05)
plt.ylim([0, 1])
plt.title('Survival Rate for SibSp')

plt.subplots_adjust()
plt.tight_layout()
plt.show()

In [None]:
# Passenger counts per Parch value (left) and mean survival per value (right)
plt.figure(figsize=(12, 4))

# Left panel: one labelled bar per Parch value, in ascending order
plt.subplot(1, 2, 1)
parch_counts = train_df['Parch'].value_counts().sort_index()
positions = np.arange(len(parch_counts))
bars = plt.bar(positions, parch_counts)
plt.xticks(positions, parch_counts.index)
plt.bar_label(bars)
plt.title('Parch')

# Right panel: mean of Survived per Parch value on a fixed 0-1 scale
plt.subplot(1, 2, 2)
sns.pointplot(data=train_df, x='Parch', y='Survived', estimator=np.mean, capsize=0.05)
plt.ylim([0, 1])
plt.title('Survival Rate for Parch')

plt.subplots_adjust()
plt.tight_layout()
plt.show()

In [None]:
# Density of Age with its quartiles (left) and density split by Survived (right)
plt.figure(figsize=(12, 4))

# Left panel: overall density plus one vertical line per quartile
plt.subplot(1, 2, 1)
sns.kdeplot(train_df['Age'])
age_quartiles = np.quantile(train_df['Age'], [0.25, 0.50, 0.75])
for value, colour, tag in zip(age_quartiles, 'rgb', ('Q1', 'Q2', 'Q3')):
    plt.axvline(x=value, c=colour, label=f'{tag}={value}')
plt.legend()

# Right panel: one density curve per Survived value
plt.subplot(1, 2, 2)
sns.kdeplot(data=train_df, x='Age', hue='Survived')

plt.show()

In [None]:
# Density of Fare with its quartiles (left) and density split by Survived (right)
plt.figure(figsize=(12, 4))

# Left panel: overall density plus one vertical line per quartile
plt.subplot(1, 2, 1)
sns.kdeplot(train_df['Fare'])
fare_quartiles = np.quantile(train_df['Fare'], [0.25, 0.50, 0.75])
for value, colour, tag in zip(fare_quartiles, 'rgb', ('Q1', 'Q2', 'Q3')):
    plt.axvline(x=value, c=colour, label=f'{tag}={value}')
plt.legend()

# Right panel: one density curve per Survived value
plt.subplot(1, 2, 2)
sns.kdeplot(data=train_df, x='Fare', hue='Survived')

plt.show()

###  **Insights:**
* Pclass, Sex, and Embarked columns can be seen to have an impact on survival.
* The SibSp and Parch columns are unevenly distributed across their categories. Moreover, in their current form they do not show a clear relationship with survival.
* Age and Fare columns show a considerable relationship with the survival.

# Feature Engineering

In [None]:
# Build a Family feature (SibSp + Parch) on copies of both splits; any total
# other than 0, 1 or 2 is collapsed into the single bucket 3
train_df_fe, test_df_fe = train_df.copy(), test_df.copy()
combined = [train_df_fe, test_df_fe]

for frame in combined:
    relatives = frame['SibSp'] + frame['Parch']
    frame['Family'] = relatives.where(relatives.isin([0, 1, 2]), 3)
train_df_fe.sample(5)

In [None]:
# Passenger counts per Family bucket (left) and mean survival per bucket (right)
plt.figure(figsize=(12, 4))

# Left panel: one labelled bar per Family value, in ascending order
plt.subplot(1, 2, 1)
family_counts = train_df_fe['Family'].value_counts().sort_index()
positions = np.arange(len(family_counts))
bars = plt.bar(positions, family_counts)
plt.xticks(positions, family_counts.index)
plt.bar_label(bars)
plt.title('Family')

# Right panel: mean of Survived per Family value on a fixed 0-1 scale
plt.subplot(1, 2, 2)
sns.pointplot(data=train_df_fe, x='Family', y='Survived', estimator=np.mean, capsize=0.05)
plt.ylim([0, 1])
plt.title('Survival Rate for Family')

plt.subplots_adjust()
plt.tight_layout()
plt.show()

###  **Insights:**
* Family column can be seen to have better relationship with the survival as compared to SibSp and Parch columns.

In [None]:
# Extracting the titles from the Name column
# The pattern captures the run of letters that follows a space and precedes a
# period. It is a raw string so that "\." reaches the regex engine as written
# instead of being an invalid string escape.
common_titles = ['Mr', 'Mrs', 'Miss', 'Master']
for df in combined:
    titles = df['Name'].str.extract(r' ([A-Za-z]+)\.', expand=False)
    # Anything outside the four common titles (including names where the
    # pattern found nothing) is grouped under 'Other'.
    df['Title'] = titles.where(titles.isin(common_titles), 'Other')
train_df_fe.sample(5)

In [None]:
# Passenger counts per Title (left) and mean survival per Title (right)
plt.figure(figsize=(12, 4))

# Left panel: one labelled bar per title, most frequent first
plt.subplot(1, 2, 1)
title_counts = train_df_fe['Title'].value_counts()
positions = np.arange(len(title_counts))
bars = plt.bar(positions, title_counts)
plt.xticks(positions, title_counts.index)
plt.bar_label(bars)
plt.title('Title')

# Right panel: mean of Survived per title on a fixed 0-1 scale
plt.subplot(1, 2, 2)
sns.pointplot(data=train_df_fe, x='Title', y='Survived', estimator=np.mean, capsize=0.05)
plt.ylim([0, 1])
plt.title('Survival Rate for Title')

plt.subplots_adjust()
plt.tight_layout()
plt.show()

###  **Insights:**
* Title column also exhibits considerable relationship with the survival.

# Data Preprocessing

In [None]:
# Multicollinearity Check
plt.figure(figsize=(8, 6))
# Correlate the numeric columns only. The frame also holds string columns,
# which corr() cannot convert to numbers; recent pandas versions no longer
# drop them silently.
corr = train_df_fe.select_dtypes(include='number').corr()
# Hide the upper triangle (diagonal included) so each pair is shown once.
mask = np.triu(np.ones_like(corr, dtype=bool))
cmap = sns.diverging_palette(20, 20, as_cmap=True)
sns.heatmap(corr, mask=mask, cmap=cmap, vmin=-1, vmax=1, center=0, annot=True, linewidths=0.05, square=True)
plt.tight_layout()
plt.show()

###  **Insights:**
* Because the Family column is derived from the SibSp and Parch columns, it is highly correlated with them. Therefore, we will use only the Family column.

In [None]:
# Keep the same predictor columns for both splits; Survived is the target
feature_cols = ['Pclass', 'Sex', 'Age', 'Fare', 'Embarked', 'Family', 'Title']
x_train_features = train_df_fe[feature_cols]
x_test_features = test_df_fe[feature_cols]
y_train = train_df_fe['Survived']
x_train_features.head()

In [None]:
# Encode Sex as 0 (male) / 1 (female) with one mapping shared by both splits
sex_codes = {'Sex': {'male':0, 'female':1}}
x_train_encoded1 = x_train_features.replace(sex_codes)
x_test_encoded1 = x_test_features.replace(sex_codes)
x_train_encoded1.head()

In [None]:
# One-Hot Encoding
dummy_cols = ['Embarked', 'Title']
x_train_encoded2 = pd.get_dummies(x_train_encoded1, columns=dummy_cols, drop_first=True)

# Encode every level of the test split, then keep exactly the training
# columns in the training order. Encoding the two splits independently with
# drop_first=True would give different columns whenever a level is absent
# from one split; levels missing from the test split become all-zero columns.
x_test_encoded2 = pd.get_dummies(x_test_encoded1, columns=dummy_cols)
x_test_encoded2 = x_test_encoded2.reindex(columns=x_train_encoded2.columns, fill_value=0)
x_train_encoded2.head()

In [None]:
# Pull the underlying NumPy arrays out of the encoded frames and the target
x_train_arr = x_train_encoded2.to_numpy()
x_test_arr = x_test_encoded2.to_numpy()
y_train_arr = y_train.to_numpy()
x_train_arr[0]

In [None]:
# Fit the scaler on the training features only, then apply it to both splits
scaler = StandardScaler()
scaler.fit(x_train_arr)
x_train_norm = scaler.transform(x_train_arr)
x_test_norm = scaler.transform(x_test_arr)

x_train_norm[0]

# Prediction using ML Models

In [None]:
# K-Nearest Neighbors: mean 10-fold CV accuracy for every odd k from 1 to 19
cv = KFold(n_splits=10, random_state=1, shuffle=True)

k_values = list(range(1, 20, 2))
knc_accuracy = [
    cross_val_score(KNeighborsClassifier(n_neighbors=k), x_train_norm, y_train_arr, cv=cv).mean()
    for k in k_values
]

# Position of the first (i.e. smallest-k) best score
best = int(np.argmax(knc_accuracy))
print(f'KNN: Maximum accuracy is {np.round(knc_accuracy[best], 3)} at k={k_values[best]}')

plt.plot(k_values, knc_accuracy)
plt.xticks(k_values)
plt.show()

In [None]:
def _report_cv_accuracy(label, model):
    """Cross-validate `model` on the training data and print its accuracy.

    Uses the module-level `x_train_norm`, `y_train_arr` and `cv` splitter.
    Prints the mean and standard deviation of the fold scores, rounded to
    three decimals, and returns the array of fold scores.
    """
    scores = cross_val_score(model, x_train_norm, y_train_arr, cv=cv)
    print(f'{label}: Accuracy={np.round(scores.mean(), 3)} and Standard Deviation={np.round(scores.std(), 3)}')
    return scores


# Logistic Regression
lr = LogisticRegression()
scores_lr = _report_cv_accuracy('Logistic Regression', lr)

# Support Vector Classifier
svc = SVC()
scores_svc = _report_cv_accuracy('Support Vector Classifier', svc)

# Decision Tree Classifier
# Seeded like the CV splitter so the reported score is the same on every run.
dtc = DecisionTreeClassifier(random_state=1)
scores_dtc = _report_cv_accuracy('Decision Tree Classifier', dtc)

# Random Forest Classifier
# Seeded for the same reason as the decision tree.
rfc = RandomForestClassifier(random_state=1)
scores_rfc = _report_cv_accuracy('Random Forest Classifier', rfc)

# Gaussian Naive Bayes Classifier
gnb = GaussianNB()
scores_gnb = _report_cv_accuracy('Gaussian Naive Bayes Classifier', gnb)

In [None]:
# Fit the support vector classifier on the full training set and label the test set
y_predicted = svc.fit(x_train_norm, y_train_arr).predict(x_test_norm)

# Write one row per test passenger: its identifier and the predicted label
submission = pd.DataFrame({'PassengerId': test_df.index, 'Survived': y_predicted})
submission.to_csv('Titanic Submission.csv', index=False)
print('Submission is successful.')

***Thank you so much for your time!
If you liked my work, kindly consider upvoting. It means a lot!
Also, do not forget to share your thoughts in the comment section.***