# Import Necessary Libraries

In [None]:
# let`s first import all the necessary libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# Load and Read the dataset

In [None]:
# Read the Titanic training set from its CSV file into a DataFrame.
train_csv_path = "../input/titanicdataset-traincsv/train.csv"
data = pd.read_csv(train_csv_path)

In [None]:
# Preview the first five rows of the dataset.
data.head(n=5)

In [None]:
# Number of rows and columns in the dataset, shown as (rows, columns).
n_rows, n_cols = data.shape
(n_rows, n_cols)

In [None]:
# Statistical summary of the dataset (count, mean, std, min, quartiles, max).
summary_stats = data.describe()
summary_stats

In [None]:
# Print a concise summary of the DataFrame: column names, non-null counts and dtypes.
data.info()

# Data Preprocessing

1. Fill the missing values
2. Convert the data type of age (float64) to int64

In [None]:
# Count the missing values in every column of the dataset.
data.isna().sum()

In [None]:
# The "Age", "Cabin" and "Embarked" columns have missing values, so let's handle them.

In [None]:
# Count the missing entries in the Cabin column.
data["Cabin"].isna().sum()

In [None]:
# Cabin has 687 missing values, so drop the column altogether.
# Re-assigning with errors="ignore" (instead of inplace=True) makes a re-run
# of this cell a no-op rather than a KeyError on the already-removed column.
data = data.drop(columns=["Cabin"], errors="ignore")

In [None]:
# Preview the first rows after dropping Cabin instead of rendering the whole frame.
data.head()

In [None]:
# Count the missing entries in the Age column.
data["Age"].isna().sum()

In [None]:
# Age has 177 missing values; they could be filled with the mean or the median.
# The mean is used here.
Age_mean = data["Age"].mean()
data = data.assign(Age=data["Age"].fillna(Age_mean))

In [None]:
# Re-check the Age column for missing values after the fill.
data["Age"].isna().sum()

In [None]:
# Embarked has only 2 missing values, so drop just those rows.
# subset= restricts the drop to Embarked; a bare dropna() would also discard
# rows that have a NaN in any other column.
data = data.dropna(subset=["Embarked"])

In [None]:
# Re-check the per-column count of missing values after the clean-up.
data.isna().sum()

In [None]:
# Rows and columns remaining at this point, shown as (rows, columns).
rows_remaining, cols_remaining = data.shape
(rows_remaining, cols_remaining)

# Exploratory Data Analysis

In [None]:
# Use passenger class, sex, age and fare as predictors. The remaining columns
# (PassengerId, Name, SibSp, Parch, Ticket, Embarked) are left out of the model.
# .copy() makes `features` an independent frame, so adding or dropping columns
# on it later does not trigger pandas' SettingWithCopyWarning.
features = data[["Pclass", "Sex", "Age", "Fare"]].copy()
target = data["Survived"]

In [None]:
# "Sex" is categorical data, so let's use LabelEncoder to convert it into numerical data
from sklearn.preprocessing import LabelEncoder

In [None]:
le = LabelEncoder()  # encoder that maps each category label to an integer code
# assign() returns a new frame with the encoded column instead of writing into
# a frame that may be a slice of `data`, which avoids SettingWithCopyWarning.
features = features.assign(Sex_n=le.fit_transform(features["Sex"]))

In [None]:
# Preview the first rows: the Sex column now has a numerical counterpart, Sex_n.
features.head()

In [None]:
# Drop the original text column "Sex"; the numeric "Sex_n" is used instead.
# Re-assigning with errors="ignore" (instead of inplace=True) makes a re-run
# of this cell a no-op rather than a KeyError.
features = features.drop(columns=["Sex"], errors="ignore")

In [None]:
# Preview the first rows of the final feature frame instead of rendering all of it.
features.head()

# MODEL BUILDING

In [None]:
# Before building the model, hold out 20% of the rows as a test set.
from sklearn.model_selection import train_test_split

# random_state pins the shuffle so the same rows land in train and test on
# every run; without it the reported accuracies change from run to run.
x_train, x_test, y_train, y_test = train_test_split(
    features, target, test_size=0.2, random_state=42
)

In [None]:
# Number of rows in the training split.
x_train.shape[0]

In [None]:
# Number of rows in the test split.
x_test.shape[0]

## Logistic Regression

In [None]:
from sklearn.linear_model import LogisticRegression

# max_iter is raised above the default of 100 so the solver has room to
# converge on these unscaled features instead of stopping early with a
# ConvergenceWarning.
model = LogisticRegression(max_iter=1000)
model.fit(x_train, y_train)  # fit() trains the model on the training split
# score() predicts a label for every row of x_test and returns the fraction
# that matches y_test, i.e. the mean accuracy on the test split.
model.score(x_test, y_test)

## Decision Tree

In [None]:
# Now use the decision tree algorithm for prediction.
from sklearn import tree

# random_state fixes the estimator's internal randomness so the score is
# repeatable for a given train/test split.
model = tree.DecisionTreeClassifier(random_state=42)
model.fit(x_train, y_train)
model.score(x_test, y_test)  # mean accuracy on the test split

## Random Forest

In [None]:
from sklearn.ensemble import RandomForestClassifier

# Random forest with default hyper-parameters. random_state fixes the
# bootstrap sampling and feature selection so the score is repeatable for a
# given train/test split.
model = RandomForestClassifier(random_state=42)
model.fit(x_train, y_train)
model.score(x_test, y_test)  # mean accuracy on the test split

# Fine Tuning

From the above models we can observe that, without fine-tuning any of them, the Random Forest gives us the best accuracy, so let's fine-tune the hyperparameters of the Random Forest.

In [None]:
from sklearn.ensemble import RandomForestClassifier

# Random forest with hand-picked hyper-parameters: 50 trees, depth capped at 10,
# and a fixed seed.
model = RandomForestClassifier(random_state=42, max_depth=10, n_estimators=50)
model.fit(x_train, y_train)

# Mean accuracy on the test split, reported as a percentage.
test_accuracy = model.score(x_test, y_test)
print("The accuracy of model is: ", test_accuracy * 100)

So we can see that after fine-tuning, our accuracy has increased.
Now our accuracy is about 86% (the exact figure depends on the train/test split), which is pretty good. yayyyy :D

Let's now predict the survival of the passengers using our Random Forest model.

In [None]:
# Predict a label for every row of the test split with the fitted model,
# then display the resulting array.
y_predicted = model.predict(X=x_test)
y_predicted

# Confusion Matrix

In [None]:
from sklearn.metrics import confusion_matrix

# Rows of the matrix are the true labels, columns are the predicted labels.
cm = confusion_matrix(y_test, y_predicted)

fig, ax = plt.subplots()
# fmt="d" prints the cell counts as plain integers; the default ".2g" format
# would render counts of 100 or more in scientific notation (e.g. 1e+02).
sns.heatmap(cm, annot=True, fmt="d", ax=ax)
# Label the plot so it can be read on its own; the trailing ";" hides the repr.
ax.set(title="Confusion matrix", xlabel="Predicted label", ylabel="True label");