# Importing the Libraries:

In [None]:
# Pandas for reading the data:
import pandas as pd

# Libraries for plotting of the data:
import seaborn as sns
import matplotlib.pyplot as plt

# Libraries for Preprocessing of the data:
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split 
import category_encoders as ce

# Libraries for Predicting on the data:
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from xgboost import XGBClassifier

# Metrics to measure how good we do with predicting on the data:
from sklearn.metrics import *

# Visualizing the Data:

Reading the data:

In [None]:
# Read the passenger list CSV from the relative `../input` directory.
csv_path = "../input/passenger-list-for-the-estonia-ferry-disaster/estonia-passenger-list.csv"
data = pd.read_csv(csv_path)

# Preview the first seven rows.
data.head(n=7)

Let's define the labels, i.e. the column we will be predicting from the rest of the data:

In [None]:
# Keep the target column as its own Series before it is removed from the features.
survived = data['Survived']

# Remove the label along with the PassengerId, Firstname and Lastname columns,
# which are not used as features. Rebinding `data` to the frame returned by
# `drop` gives the same result as dropping in place.
dropped_columns = ['PassengerId', 'Firstname', 'Lastname', 'Survived']
data = data.drop(columns=dropped_columns)

Let's make some count plots to check the proportions of various features:

In [None]:
# Show the first five rows of the remaining feature columns.
data.head(n=5)

What proportion of people survived?

In [None]:
# Count plot of the `survived` labels on a 7x7 inch figure.
plt.figure(figsize=(7,7))
sns.set_context("poster", font_scale=0.7)
sns.set_palette("Reds")
# The Series is passed through the `x` keyword: in current seaborn the first
# positional parameter of `countplot` is `data`, so a bare positional Series
# is no longer interpreted as the variable to count.
sns.countplot(x=survived)

How many men and women were aboard?

In [None]:
# Count plot of the `Sex` column, drawn with a two-colour palette.
sns.set_palette(['skyblue', 'pink'])
plt.figure(figsize=(7,7))
# The column is passed through the `x` keyword: in current seaborn the first
# positional parameter of `countplot` is `data`, so a bare positional Series
# is no longer interpreted as the variable to count.
sns.countplot(x=data['Sex'])

What is the distribution of the countries the passengers were from?

In [None]:
# Count plot of the `Country` column on a wide (20x7 inch) figure.
plt.figure(figsize=(20,7))
sns.set_context("poster", font_scale=0.6)
# The column is passed through the `x` keyword: in current seaborn the first
# positional parameter of `countplot` is `data`, so a bare positional Series
# is no longer interpreted as the variable to count.
sns.countplot(x=data['Country'])

What was the distribution of men and women of different ages?

In [None]:
# Violin plot of the `Age` distribution, one violin per value of `Sex`.
plt.figure(figsize=(7, 7))
sns.violinplot(x='Sex', y='Age', data=data)

# Preprocessing the Data:

Let's count the missing values in each column of the data:

In [None]:
# Number of missing (NaN) entries in each column.
data.isna().sum()

So, no missing or NaN values!

Let's encode the categorical features. First, we identify the categorical columns:

In [None]:
# Boolean mask over the columns: True where the column dtype is `object`.
object_mask = data.dtypes == 'object'

# Names of the columns selected by the mask, as a plain list.
categorical = object_mask[object_mask].index.tolist()

It's time to use the CatBoost encoder from `category_encoders` to encode the categorical columns:

In [None]:
# Build the CatBoost encoder and fit it on the categorical columns against the labels.
# NOTE(review): the encoder is fitted on every row together with the target,
# before the train/test split, so the encoded features carry target
# information from rows that later land in the test set. Consider fitting on
# the training rows only.
cat = ce.CatBoostEncoder().fit(data[categorical], survived)

# Overwrite the categorical columns with their encoded values.
encoded_columns = cat.transform(data[categorical])
data[categorical] = encoded_columns

So, the final step of preprocessing is the standardisation of the values in the data:

In [None]:
# Standardise every feature column with a StandardScaler fitted on `data`.
scale = StandardScaler()
scaled_values = scale.fit_transform(data)

# Wrap the scaled array in a DataFrame that keeps the original column names.
scaleddata = pd.DataFrame(scaled_values, columns=data.columns)

Splitting the data:

In [None]:
# 70/30 train/test split of the scaled features against the `survived` labels.
# The labels live in `survived`; there is no `y` variable in this notebook,
# so passing `y` raised a NameError.
# `random_state` pins the shuffle so the split is identical on every run.
train, test, ytrain, ytest = train_test_split(
    scaleddata, survived, train_size=0.7, test_size=0.3, random_state=42
)

# Training and Predicting:

Using Random Forests:

In [None]:
# Random forest with 500 trees. A fixed `random_state` makes the fitted
# forest repeatable for a given training set.
ran = RandomForestClassifier(n_estimators=500, random_state=42)

ran.fit(train, ytrain)

# Predicted labels for the test features.
ranpred = ran.predict(test)


In [None]:
# Accuracy of the random forest on the test labels, expressed as a percentage.
ran_accuracy = accuracy_score(ytest, ranpred) * 100
print("The Accuracy of this model is :", ran_accuracy)

Using XGBoost Classifier:

In [None]:
# Gradient-boosted classifier with 300 estimators, fitted on the training split.
xgb = XGBClassifier(n_estimators=300).fit(train, ytrain)

# Predicted labels for the test features.
xpred = xgb.predict(test)

In [None]:
# Accuracy of the XGBoost classifier on the test labels, expressed as a percentage.
xgb_accuracy = accuracy_score(ytest, xpred) * 100
print("The Accuracy of this model is :", xgb_accuracy)

Using Decision Tree Classifier:

In [None]:
# Single decision tree with default hyper-parameters. A fixed `random_state`
# makes the fitted tree repeatable for a given training set.
tree = DecisionTreeClassifier(random_state=42)

tree.fit(train, ytrain)

# Predicted labels for the test features.
treepred = tree.predict(test)

In [None]:
# Accuracy of the decision tree on the test labels, expressed as a percentage.
tree_accuracy = accuracy_score(ytest, treepred) * 100
print("The Accuracy of this model is :", tree_accuracy)

Thank you for going through this notebook, if you liked it an upvote would be appreciated! :D