# Exploratory Data Analysis :

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
# Load the Titanic training set and chart the share of each Survived value.
plt.style.use('ggplot')
df = pd.read_csv('/kaggle/input/titanic/train.csv')
fig = plt.figure(figsize=(8, 5))
survival_share = df['Survived'].value_counts(normalize=True)
survival_share.plot.bar(color=['red', 'blue'], alpha=0.5, rot=1)
plt.title('Survived and Deceased')
plt.show()

In [None]:
# Scatter every passenger's age (y) against the Survived flag (x); the very low
# alpha makes overlapping points build up into darker bands.
plt.scatter(x=df['Survived'], y=df['Age'], color='maroon', alpha=0.09)
plt.title('Relation between Survival and Age')
plt.show()

The above plot suggests that older passengers died somewhat more often than younger ones. More specifically, survivors are more densely concentrated in the 10–30 age range, while deaths are more densely concentrated in the 30–50 range.

In [None]:
# Bar chart of the fraction of passengers in each ticket class.
class_share = df['Pclass'].value_counts(normalize=True)
class_share.plot.bar(color=['orange', 'red', 'green'], alpha=0.6, rot=1)
plt.title('Class Distribution')
plt.show()

The above plot shows that almost 55% of the passengers were in 3rd class, about 25% in 1st class and the remaining 20% in 2nd class.

In [None]:
# Overlay one kernel density estimate of Age per passenger class.
for pclass in (1, 2, 3):
    ages_in_class = df.loc[df['Pclass'] == pclass, 'Age']
    ages_in_class.plot.kde(alpha=1.0)
plt.title("Class vs Age")
plt.legend(("1st", "2nd", "3rd"))
plt.show()

The above plot shows the kernel density estimate (simply put, a smoothed estimate of the probability density function) of age for each of the three passenger classes. It shows that 3rd class passengers skew younger and 1st class passengers skew older, while 2nd class passengers are mostly middle-aged.

In [None]:
# Bar chart of the fraction of passengers per port of embarkation.
embarked_share = df['Embarked'].value_counts(normalize=True)
embarked_share.plot.bar(color=['orange', 'red', 'green'], alpha=0.6, rot=1)
plt.title('Places where embarked')
plt.show()

The above plot shows what share of the passengers embarked at Southampton (where the Titanic actually started its journey), at Cherbourg (its first stop, in France) and at Queenstown (its last stop, in Ireland). The figures are around 70%, 20% and 10% respectively.

In [None]:
# Bar colours for the male / female charts.
m_color = '#F8BA00'
f_color = '#FA0000'

# Share of each Survived value among male passengers.
male_outcomes = df.loc[df['Sex'] == 'male', 'Survived'].value_counts(normalize=True)
male_outcomes.plot.bar(alpha=0.6, color=m_color, rot=1)
plt.title('Male Survived')
plt.show()

# Share of each Survived value among female passengers.
female_outcomes = df.loc[df['Sex'] == 'female', 'Survived'].value_counts(normalize=True)
female_outcomes.plot.bar(alpha=0.6, color=f_color, rot=1)
plt.title('Female Survived')
plt.show()

# Sex split among the passengers whose Survived flag is 1.
survivor_sexes = df.loc[df['Survived'] == 1, 'Sex'].value_counts(normalize=True)
survivor_sexes.plot.bar(alpha=0.6, color=[f_color, m_color], rot=1)
plt.title('Gender of Survived')
plt.show()

The above plots visualize the correlation of survival with gender. Among male passengers, 80% died (20% survived); among female passengers, only 30% died (70% survived). Among all survivors, 70% were female and 30% were male. Perhaps the reason is that women and children were rescued first.

In [None]:
# Overlay one kernel density estimate of the Survived flag per passenger class.
for pclass in (1, 2, 3):
    outcomes_in_class = df.loc[df['Pclass'] == pclass, 'Survived']
    outcomes_in_class.plot.kde(alpha=0.9, rot=1)
plt.title("Class vs Survived")
plt.legend(("1st", "2nd", "3rd"))
plt.show()

The above graph shows that 3rd class passengers have very high mortality rate compared to 1st class passengers (see *violet hump* has a *great height* but *red hump* has *lower height* above ***x-axis’ point 0.00***). And 1st class passengers have higher survival rate compared to 3rd class passengers (see *red hump* is much *higher* compared to *violet hump* above ***x-axis’ point 1.00***).

In [None]:
# Draw the four sex/class survival charts as a single 2x2 figure.
# Each panel spans 2x2 cells of a 4x4 grid, so the anchors have to be two cells
# apart or the panels overlap. plt.show() is called once at the end; calling
# it after every panel would emit four separate figures instead of one grid.
panels = [
    ((0, 0), 'male', 1, m_color, 'Rich Male Survived'),
    ((0, 2), 'male', 3, m_color, 'Poor Male Survived'),
    ((2, 0), 'female', 1, f_color, 'Rich Female Survived'),
    ((2, 2), 'female', 3, f_color, 'Poor Female Survived'),
]

plt.figure(figsize=(10, 8))
for anchor, sex, pclass, bar_color, panel_title in panels:
    plt.subplot2grid((4, 4), anchor, rowspan=2, colspan=2)
    outcomes = df.loc[(df['Sex'] == sex) & (df['Pclass'] == pclass), 'Survived']
    # sort_index() keeps the bars in the same 0, 1 order in every panel;
    # value_counts() alone orders them by frequency, which differs per panel.
    shares = outcomes.value_counts(normalize=True).sort_index()
    shares.plot(kind='bar', alpha=0.5, color=bar_color, rot=1)
    plt.title(panel_title)
plt.tight_layout()
plt.show()

40% of rich men (1st class) survived, 60% of them died; 10% of poor men survived, 90% of them died; almost 100% of rich women survived, only <1% of them died; 50% of poor women survived, rest 50% of them died.

# Data Pre-Processing :

We will use the function below for data pre-processing. It finds the cells in the data set where “Fare” and “Age” have no value (blank cells) and fills them with the median value of the respective column. For convenience, it encodes “male” passengers as “0” and “female” passengers as “1”. It also finds the blank cells in the “Embarked” column and fills them with “S” (Southampton, used as the default). For convenience, it encodes “S” (embarked at Southampton) as 0, “C” as 1 and “Q” as 2.

In [None]:
def clean_data(data):
    """Fill missing values and integer-encode ``Sex`` and ``Embarked`` in place.

    * ``Fare`` and ``Age``: missing entries are replaced by the column median.
    * ``Embarked``: missing entries are replaced by ``"S"``.
    * ``Sex``: ``"male"`` -> 0, ``"female"`` -> 1.
    * ``Embarked``: ``"S"`` -> 0, ``"C"`` -> 1, ``"Q"`` -> 2.

    Values that are not in a mapping (including codes written by an earlier
    call) are left untouched, so calling the function twice is harmless.

    Args:
        data: DataFrame with ``Fare``, ``Age``, ``Sex`` and ``Embarked``
            columns. It is modified in place.

    Returns:
        None.
    """
    # Series.median() skips missing values on its own.
    for column in ("Fare", "Age"):
        data[column] = data[column].fillna(data[column].median())

    data["Embarked"] = data["Embarked"].fillna("S")

    encodings = {
        "Sex": {"male": 0, "female": 1},
        "Embarked": {"S": 0, "C": 1, "Q": 2},
    }
    # Rebuild each column through map() instead of writing ints into a string
    # column with masked .loc assignments: the result is a proper integer
    # column rather than a mixed object column.
    for column, codes in encodings.items():
        data[column] = data[column].map(
            lambda value, codes=codes: codes.get(value, value)
        )

Now we will use the above function to prepare the data before scoring the survival predictions. The code below takes “Survived” as the dependent variable and the rest of the selected features as independent variables, fits a model on them and reports the accuracy of the model.

In [None]:
import warnings
# Silence all warnings from here on, so none are shown in the cells that follow.
warnings.filterwarnings("ignore")

# Modelling :

I kept the modelling part pretty simple. I used only Logistic Regression, Decision Tree and Random Forest models.

**(1A) Logistic Regression :**

In [None]:
# Fit a plain logistic regression and report its accuracy on the training rows.
import pandas as pd
from sklearn import linear_model

train = pd.read_csv("/kaggle/input/titanic/train.csv")
clean_data(train)

feature_columns = ['Pclass', 'Age', 'Fare', 'Embarked', 'Sex', 'SibSp', 'Parch']
features = train[feature_columns]
target = train['Survived']

classifier = linear_model.LogisticRegression(solver='liblinear')
classifier_ = classifier.fit(features, target)
training_accuracy = classifier_.score(features, target)
print(training_accuracy)

**(1B) Logistic Regression with Polynomial Degree 2 :**

Now we will check whether fitting a logistic regression model on polynomial features of degree 2 gives better prediction accuracy. As the result below shows, it does not give a meaningfully better accuracy, so, to avoid overfitting, we will not use the degree-2 polynomial features any further.

In [None]:
# Expand the features to polynomial terms of degree 2, refit the same
# logistic-regression estimator on them and report the training accuracy.
from sklearn import linear_model, preprocessing

poly = preprocessing.PolynomialFeatures(degree=2)
poly_features = poly.fit_transform(features)
classifier_ = classifier.fit(poly_features, target)
poly_training_accuracy = classifier_.score(poly_features, target)
print(poly_training_accuracy)

We observe only a very small improvement using polynomial degree 2.

**(2A) Decision Tree Model :**

Now, we will try fitting the problem with a decision tree model.

In [None]:
# Fit an unconstrained decision tree and report its accuracy on the training rows.
import pandas as pd
from sklearn import tree

train = pd.read_csv("/kaggle/input/titanic/train.csv")
clean_data(train)

feature_columns = ["Pclass", "Age", "Fare", "Embarked", "Sex", "SibSp", "Parch"]
features = train[feature_columns]
target = train["Survived"]

decision_tree = tree.DecisionTreeClassifier(random_state=42)
decision_tree_ = decision_tree.fit(features, target)
tree_training_accuracy = decision_tree_.score(features, target)
print(tree_training_accuracy)

So, the decision tree produces 97.98% accuracy on the training data, which is a better fit than the previous logistic regression models (simple and polynomial degree = 2). However, such a high training score can be a sign of overfitting. We can make the decision tree more general by constraining it and evaluating it with 50-fold cross validation. The aim of this change is to reduce overfitting, not just to raise the accuracy figure.

**(2B) Decision Tree Model with 50-Fold Cross Validation :**

In [None]:
# Constrain the tree depth to curb overfitting, then estimate its accuracy
# with 50-fold cross validation.
from sklearn import model_selection

generalized_tree = tree.DecisionTreeClassifier(random_state=1, max_depth=7, min_samples_split=2)
generalized_tree_ = generalized_tree.fit(features, target)

scores = model_selection.cross_val_score(
    generalized_tree,
    features,
    target,
    scoring='accuracy',
    cv=50,
)
print(scores)
mean_cv_accuracy = scores.mean()
print(mean_cv_accuracy)

We observe that the overall score has decreased but this generalized tree avoids overfitting. We can visualize the tree picture using **graphviz**.

In [None]:
import graphviz
from sklearn.tree import DecisionTreeClassifier, export_graphviz
from sklearn.preprocessing import StandardScaler as scaler

# Fit a deliberately shallow tree (max_depth=3) so that the whole diagram
# stays small enough to display, then render it through graphviz.
shallow_tree = DecisionTreeClassifier(max_depth=3).fit(features, target)
data = export_graphviz(
    shallow_tree,
    out_file=None,
    feature_names=['Pclass', 'Age', 'Fare', 'Embarked', 'Sex', 'SibSp', 'Parch'],
    class_names=['Survived (0)', 'Survived (1)'],
    filled=True,
    rounded=True,
    special_characters=True,
)
graph = graphviz.Source(data)
graph

**(3) Random Forest :**

Next, we will try fitting the problem with a Random Forest model to improve the accuracy of the result. Since Random Forest is a bagging ensemble model consisting of multiple decision trees, it will improve the performance of the model. We will first tune the hyperparameters and then will use the model with best hyperparameters.

In [None]:
# Perform Grid Search to tune hyperparameters of the Random Forest model
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestClassifier

forest = RandomForestClassifier(random_state = 1)

# Candidate values for each hyperparameter; every combination is evaluated.
n_estimators = [1740, 1742, 1745, 1750]
max_depth = [6, 7, 8]
min_samples_split = [4, 5, 6]
min_samples_leaf = [4, 5, 6]
# oob_score has to be the boolean True: the string 'True' is not a valid value
# for this parameter and is rejected by scikit-learn's parameter validation.
oob_score = [True]

hyperF = dict(
    n_estimators = n_estimators,
    max_depth = max_depth,
    min_samples_split = min_samples_split,
    min_samples_leaf = min_samples_leaf,
    oob_score = oob_score,
)

gridF = GridSearchCV(forest, hyperF, verbose = 1, n_jobs = 4)
bestF = gridF.fit(features, target)

In [None]:
#print(bestF)

In [None]:
# Check score with Random Forest Model having the best hyperparameters
from sklearn.ensemble import RandomForestClassifier

train = pd.read_csv("/kaggle/input/titanic/train.csv")
clean_data(train)
target = train["Survived"]
features = train[["Pclass", "Age", "Fare", "Embarked", "Sex", "SibSp", "Parch"]]

r_forest = RandomForestClassifier(criterion='gini',
                                  bootstrap=True,
                                  n_estimators=1745,
                                  max_depth=7,
                                  min_samples_split=6,
                                  min_samples_leaf=6,
                                  # 'sqrt' is what 'auto' meant for classifiers;
                                  # 'auto' itself is no longer accepted by
                                  # current scikit-learn releases.
                                  max_features='sqrt',
                                  oob_score=True,
                                  random_state=123,
                                  n_jobs=-1,
                                  verbose=0)
rf_clf = r_forest.fit(features, target)
# Accuracy measured on the same rows the forest was trained on.
print(rf_clf.score(features, target))

In [None]:
# Out-of-bag score stored on the fitted forest (it was built with oob_score=True).
rf_clf.oob_score_

We observe that the Random Forest model is also giving a high accuracy. Since Random Forest is an ensemble model, it works better than the usual Decision Tree model. We can also make several trials with different parameter ranges when tuning the hyperparameters of the Random Forest model in order to get the maximum accuracy. However, since we have already got quite good accuracy with our Random Forest model, we will use it for the final submission.

# Plotting Decision Regions using Mlxtend :

Before final submission, we will just check plotting decision regions applying different models on the training data set. I have used [**mlxtend library**](http://rasbt.github.io/mlxtend/) written by ***Sebastian Raschka***. 

In [None]:
import numpy as np
import matplotlib.pyplot as plt
import matplotlib.gridspec as gridspec
import itertools
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from mlxtend.classifier import EnsembleVoteClassifier
from mlxtend.plotting import plot_decision_regions

# Filler settings for the features that are not drawn on the two plot axes.
value = 1.50
width = 0.75

# Three base classifiers plus an equally weighted soft-voting ensemble of them.
clf1 = LogisticRegression(solver='liblinear', random_state=0)
clf2 = RandomForestClassifier(random_state=0)
clf3 = DecisionTreeClassifier(random_state=0)
eclf = EnsembleVoteClassifier(clfs=[clf1, clf2, clf3], weights=[1, 1, 1], voting='soft')

X_list = train[["Pclass", "Age", "Fare", "Embarked", "Sex", "SibSp", "Parch"]]
y_list = train["Survived"]
X = np.asarray(X_list, dtype=np.float32)
y = np.asarray(y_list, dtype=np.int32)

# Plotting Decision Regions
gs = gridspec.GridSpec(2, 2)
fig = plt.figure(figsize=(10, 8))

labels = ['Logistic Regression', 'Random Forest', 'Decision Tree', 'Ensemble']
models = [clf1, clf2, clf3, eclf]
grid_cells = itertools.product([0, 1], repeat=2)

# Feature columns 2..6 are held at a fixed value/range; columns 0 and 1 are plotted.
filler_indices = range(2, 7)
filler_values = {index: value for index in filler_indices}
filler_ranges = {index: width for index in filler_indices}

for clf, lab, (row, col) in zip(models, labels, grid_cells):
    clf.fit(X, y)
    ax = plt.subplot(gs[row, col])
    fig = plot_decision_regions(X=X, y=y, clf=clf,
                                filler_feature_values=filler_values,
                                filler_feature_ranges=filler_ranges,
                                legend=2)
    plt.title(lab)

plt.show()

We can see the decision boundary made with Decision Tree model and Ensemble Model ***(Ensemble of three models with equal weights)*** are almost similar. They are combination of squares and rectangles formed by split partitions. Decision regions plotted with Logistic Regression are very distinct for two classes. For Random Forest, the regions are quite distinctly divided as well. Moreover, Random Forest has divided both the classes into almost equal proportion (balanced). This is a reconfirmation for choosing Random Forest as the best model for submission.

# Submission using Random Forest :

In [None]:
# Clean the test set the same way as the training set, predict with the fitted
# random forest and write the submission file.
import pandas as pd
from sklearn import tree

test = pd.read_csv("/kaggle/input/titanic/test.csv")
clean_data(test)

submission_columns = ["Pclass", "Age", "Fare", "Embarked", "Sex", "SibSp", "Parch"]
test_features = test[submission_columns]
prediction = rf_clf.predict(test_features)

output = pd.DataFrame({'PassengerId': test.PassengerId, 'Survived': prediction})
output.to_csv('titanic_submission.csv', index=False)
print("Submission successful")