![](https://wallpaperboat.com/wp-content/uploads/2020/04/titanic-wallpaper-for-pc.jpg)

## Importing Required Libraries

In [None]:
import numpy as np
import pandas as pd

import matplotlib.pyplot as plt
import seaborn as sns
sns.set_style('darkgrid')
import plotly.express as px

import warnings
warnings.filterwarnings('ignore')

from sklearn import metrics

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler

## Exploratory Data Analysis

In [None]:
# Read the training CSV (path is relative to the notebook's working directory)
# into `train`, then display its first two rows as the cell output.
train = pd.read_csv('../input/titanic/train.csv')
train.head(2)

In [None]:
# Pairwise correlation heatmap of the numeric columns only.
# `train` still holds text columns at this point (e.g. 'Name', 'Sex'), and
# DataFrame.corr() raises on them in pandas >= 2.0, where non-numeric columns
# are no longer dropped silently. Selecting numeric dtypes first works on
# every pandas version.
numeric_corr = train.select_dtypes(include='number').corr()

plt.figure(figsize=(8,5))
sns.heatmap(numeric_corr, cmap='cool')
plt.title('Correlation between features')
plt.show()

In [None]:
# Summary statistics of `train`, shown as the cell output.
train.describe()

In [None]:
# Per-column count of missing values (isna is the alias of isnull).
train.isna().sum()

In [None]:
# Show where the missing entries sit: the boolean mask has one cell per value
# in `train`, coloured by whether that value is null.
null_mask = train.isna()

plt.figure(figsize=(12, 5))
sns.heatmap(null_mask, cmap='cool')
plt.title('Null Values in Training Data')
plt.show()

## Data Visualization and Data Cleaning

### Working with 'Age' feature

In [None]:
# Distribution (left panel) and box plot (right panel) of the 'Age' column.
# - sns.distplot is deprecated; histplot(kde=True, stat='density') draws the
#   same density-normalised histogram with a KDE curve on top.
# - The Series is passed as x= because newer seaborn releases treat the first
#   positional argument as `data`, not `x`.
# - plt.show() replaces fig.show(), which only emits a warning on the
#   non-interactive inline notebook backend.
fig, ax = plt.subplots(1, 2, figsize=(15,5))
sns.histplot(train['Age'], kde=True, stat='density', ax=ax[0])
sns.boxplot(x=train['Age'], ax=ax[1])
ax[0].set_title("Distibution plot of 'Age' Feature")
ax[1].set_title("Box plot of 'Age' Feature")
plt.show()

In [None]:
# Impute missing ages with the column mean.
# The result is assigned back to the column instead of calling
# fillna(inplace=True) on the train['Age'] selection: the chained in-place
# form acts on an intermediate object and does not update `train` under
# pandas Copy-on-Write (a FutureWarning is emitted for it on pandas 2.x).
train['Age'] = train['Age'].fillna(train['Age'].mean())

### Working with 'Embarked' feature

In [None]:
# Distinct values present in the 'Embarked' column (missing values included).
train['Embarked'].unique()

In [None]:
# Bar chart of how often each 'Embarked' value occurs.
# The column is passed by keyword (x=..., data=...): newer seaborn releases
# interpret the first positional argument as `data`, which changes or breaks
# the plot. With only x given the bars are vertical, so the explicit
# orient='v' is no longer needed.
plt.figure(figsize=(8,5))
sns.countplot(x='Embarked', data=train, palette='cool')
plt.title('Counting of all unique values of "Embarked" feature')
plt.show()

In [None]:
# Fill missing 'Embarked' entries with 'S'.
# NOTE(review): 'S' is presumably the most frequent value in the count plot
# of this column — confirm against the data.
# Assigned back to the column rather than using fillna(inplace=True) on the
# train['Embarked'] selection, which does not update `train` under pandas
# Copy-on-Write (a FutureWarning is emitted for it on pandas 2.x).
train['Embarked'] = train['Embarked'].fillna('S')

### Working with 'Cabin' feature

In [None]:
plt.figure(figsize=(8,4))
sns.countplot(train['Cabin'].isnull(), palette='cool_r', orient='v')
plt.title('Counting of null values of "Cabin" feature')
plt.show()

#### The 'Cabin' column contains mostly NaN values, so we drop it.

In [None]:
# Drop the 'Cabin' column.
# Rebinding `train` with errors='ignore' keeps the cell idempotent: the
# in-place drop raised KeyError when the cell was run a second time, because
# the column was already gone.
train = train.drop(columns='Cabin', errors='ignore')

#### Now, we're left with fresh data, containing no null values

In [None]:
# Display the first two rows of `train` after the cleaning steps above.
train.head(2)

## Removing Outliers

In [None]:
# Distribution (left panel) and box plot (right panel) of the 'Fare' column.
# - sns.distplot is deprecated; histplot(kde=True, stat='density') draws the
#   same density-normalised histogram with a KDE curve on top.
# - The Series is passed as x= because newer seaborn releases treat the first
#   positional argument as `data`, not `x`.
# - plt.show() replaces fig.show(), which only emits a warning on the
#   non-interactive inline notebook backend.
fig, ax = plt.subplots(1, 2, figsize=(15,5))
sns.histplot(train['Fare'], kde=True, stat='density', ax=ax[0])
sns.boxplot(x=train['Fare'], ax=ax[1])
ax[0].set_title("Distibution plot of 'Fare' Feature")
ax[1].set_title("Box plot of 'Fare' Feature")
plt.show()

In [None]:
# Replace extreme fares with the median fare.
# Values strictly above the chosen upper quantile are treated as outliers.
# Note that they are overwritten with the median, not removed, so the row
# count is unchanged.
# NOTE(review): re-running this cell recomputes the quantile on the already
# modified column and replaces further values — run it once per fresh `train`.
FARE_OUTLIER_QUANTILE = 0.945  # upper cut-off; fares above it are replaced

med = train['Fare'].median()
# Series.quantile skips NaN, whereas np.quantile would return NaN for the
# whole column if a single fare were missing.
Q = train['Fare'].quantile(FARE_OUTLIER_QUANTILE)
# Vectorised equivalent of the row-wise apply: NaN > Q is False, so missing
# values are left untouched exactly as before.
train['Fare'] = train['Fare'].mask(train['Fare'] > Q, med)
print('Outliers Removed...')

## One Hot Encoding categorical data

In [None]:
# Side-by-side count plots of the three categorical columns that are about to
# be one-hot encoded.
# Columns are passed by keyword (x=..., data=...) because newer seaborn
# releases treat the first positional argument as `data`; the loop replaces
# three copy-pasted calls that differed only in the column name.
fig, ax = plt.subplots(1,3, figsize=(18,5))
for axis, column in zip(ax, ['Pclass', 'Sex', 'Embarked']):
    sns.countplot(x=column, data=train, palette='cool', ax=axis)
plt.show()

In [None]:
# One-hot encode the three categorical columns.
# - prefix= derives the 'Pclass_*' / 'Embarked_*' names from the actual
#   category values, and the 'Sex' columns are renamed by label. A name can
#   therefore never be attached to the wrong category; the previous positional
#   `.columns = [...]` assignment relied on the categories appearing in
#   exactly the expected order and number.
# - dtype=int keeps 0/1 integer indicators (newer pandas otherwise returns
#   booleans).
# NOTE(review): the rename assumes the raw 'Sex' values are 'female' and
# 'male' — confirm against the data.
pclass   = pd.get_dummies(train['Pclass'], prefix='Pclass', dtype=int)
sex      = pd.get_dummies(train['Sex'], dtype=int).rename(
    columns={'female': 'Female', 'male': 'Male'}
)
embarked = pd.get_dummies(train['Embarked'], prefix='Embarked', dtype=int)

In [None]:
# Build the modelling frame: append the indicator columns to `train`, then
# discard the original categorical columns (now encoded) together with the
# free-text 'Name' and 'Ticket' columns.
# NOTE(review): any identifier-like column still present in `train` is kept
# and will be fed to the model as a feature — confirm this is intended.
encoded_parts = [train, pclass, sex, embarked]
unused_columns = ['Pclass', 'Sex', 'Embarked', 'Name', 'Ticket']

data = pd.concat(encoded_parts, axis=1).drop(columns=unused_columns)
data.head(1)

## Splitting dependent & independent variables

In [None]:
# Separate the target column from the feature matrix.
target_column = 'Survived'

y = data[target_column]
X = data.drop(columns=[target_column])

## Scaling the data

In [None]:
# Standardise every feature column of X with a StandardScaler.
# NOTE(review): the scaler is fitted on all rows of X, i.e. before the
# train/test split performed later in the notebook, so test-set statistics
# influence the scaling. Consider fitting on the training portion only.
scale = StandardScaler()
X_scaled = scale.fit(X).transform(X)

In [None]:
# Wrap the scaled array in a DataFrame that carries the original feature
# names, then preview its first two rows.
scaled = pd.DataFrame(X_scaled, columns=X.columns)
scaled.head(2)

## Splitting the dataset

In [None]:
# Hold out 25% of the rows for testing.
# - random_state pins the shuffle so a Restart & Run All reproduces the same
#   split, and therefore the same accuracy figures.
# - stratify=y keeps the class ratio of the target equal in both partitions.
X_train, X_test, y_train, y_test = train_test_split(
    scaled, y, test_size=0.25, random_state=42, stratify=y
)
print('Data Splitting Successful...')

## Model Training

In [None]:
# Fit a logistic-regression classifier on the training partition and report
# its accuracy on that same partition.
model = LogisticRegression()
model.fit(X_train, y_train)
print('Model Trained...')

# Built-in round() works whether score() returns a NumPy scalar or a plain
# Python float. Depending on the scikit-learn version it can be the latter,
# which has no .round() method.
train_accuracy = model.score(X_train, y_train)
print(f'Training Accuracy - {round(train_accuracy * 100, 2)}%')

## Model Testing | Prediction

In [None]:
# Predict on the held-out partition and report the accuracy.
y_pred = model.predict(X_test)
score = metrics.accuracy_score(y_test, y_pred)

# Built-in round() works whether accuracy_score returns a NumPy scalar or a
# plain Python float. Depending on the scikit-learn version it can be the
# latter, which has no .round() method.
print(f'Testing Accuracy - {round(score * 100, 2)}%')

In [None]:
# Annotated heatmap of the confusion matrix for the test-set predictions.
# confusion_matrix(y_test, y_pred) puts the true labels on the rows and the
# predicted labels on the columns.
conf_matrix = metrics.confusion_matrix(y_test, y_pred)

plt.figure(figsize=(8, 5))
sns.heatmap(conf_matrix, annot=True, fmt='g', linewidths=5, cmap='cool')
plt.title('Confusion Matrix')
plt.show()