In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Recursively list every file below the Kaggle input directory, one full path per line.
for folder, _subfolders, files_here in os.walk('/kaggle/input'):
    for entry in files_here:
        print(os.path.join(folder, entry))

# You can write up to 5GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# Path of the passenger-list CSV that is loaded into `df` below
FILEPATH = '/kaggle/input/passenger-list-for-the-estonia-ferry-disaster/estonia-passenger-list.csv'

In [None]:
# Load the passenger list, using the PassengerId column as the row index
df = pd.read_csv(FILEPATH, index_col = 'PassengerId')

In [None]:
df.head()

In [None]:
df.info()

In [None]:
# Get Age type (https://www.statcan.gc.ca/eng/concepts/definitions/age2)

def get_age_type(x):
    """Map an age in years to a life-cycle age group.

    The thresholds follow the Statistics Canada age categories:
    Child (0-14), Youth (15-24), Adult (25-64), Senior (65+).

    Parameters
    ----------
    x : number or missing value
        Age in years. None, NaN and pd.NA are treated as missing.

    Returns
    -------
    str
        One of 'NA', 'Senior', 'Adult', 'Youth', 'Child'.
    """
    # Test explicitly for a missing value. A plain truthiness check would
    # classify age 0 as missing and would let NaN (which is truthy) fall
    # through every comparison and come out as 'Child'.
    if pd.isna(x):
        return 'NA'

    if x > 64:
        return 'Senior'

    if x > 24:
        return 'Adult'

    if x > 14:
        return 'Youth'

    return 'Child'

In [None]:
# Derive a categorical Age_type column by applying get_age_type to every Age value
df['Age_type'] = df['Age'].apply(get_age_type)

In [None]:
df.head()

In [None]:
# Count missing (null) values per column, sorted from the most to the fewest
df.isnull().sum().sort_values(ascending = False)

In [None]:
# First and last names are not used anywhere in the analysis below, so drop them.
# errors='ignore' keeps this cell idempotent: running it a second time, after the
# columns are already gone, is a no-op instead of a KeyError.

df = df.drop(columns = ['Firstname', 'Lastname'], errors = 'ignore')

In [None]:
df.head()

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt

# Show passenger counts by country.
# These are counts over every row of the passenger list (no filter on Survived),
# so they describe who was on board, not only who died.
country_counts = df['Country'].value_counts()
print(country_counts)

# Reuse the counts computed above instead of recomputing them for each axis
ax = sns.barplot(
    x = country_counts.index, 
    y = country_counts.values
)
ax.set(title = 'Passengers by Country', xlabel = 'Country', ylabel = 'Count')
# Vertical tick labels keep the country names from overlapping
ax.set_xticklabels(ax.get_xticklabels(), rotation = 90)
plt.show()

In [None]:
# Show passenger counts by age type.
# These are counts over every row of the passenger list (no filter on Survived),
# so they describe who was on board, not only who died.
age_type_counts = df['Age_type'].value_counts()
print(age_type_counts)

# Reuse the counts computed above instead of recomputing them for each axis
ax1 = sns.barplot(
    x = age_type_counts.index, 
    y = age_type_counts.values
)
ax1.set(title = 'Passengers by Age Type', xlabel = 'Age type', ylabel = 'Count')
ax1.set_xticklabels(ax1.get_xticklabels(), rotation = 90)
plt.show()

From the age type, it seems most adults died in the disaster. 

In [None]:
# Show passenger counts by gender.
# These are counts over every row of the passenger list (no filter on Survived),
# so they describe who was on board, not only who died.
sex_counts = df['Sex'].value_counts()
print(sex_counts)

# Reuse the counts computed above instead of recomputing them for each axis
ax2 = sns.barplot(
    x = sex_counts.index, 
    y = sex_counts.values
)
ax2.set(title = 'Passengers by Gender', xlabel = 'Sex', ylabel = 'Count')
ax2.set_xticklabels(ax2.get_xticklabels(), rotation = 90)
plt.show()

In [None]:
import plotly.graph_objects as go


# Categorize by age type.
# rename_axis + reset_index(name=...) yields the columns ['Age_type', 'Count'] on both
# pandas 1.x and 2.x. The to_frame().reset_index().rename(...) idiom relies on pandas 1.x
# column naming and fails with KeyError: 'Age_type' on pandas >= 2.0.
pclass = df['Age_type'].value_counts().rename_axis('Age_type').reset_index(name = 'Count')

fig = go.Figure(data=[go.Scatter(
    x = pclass['Age_type'], 
    y = pclass['Count'],
    mode = 'lines+markers',
    marker = dict(
        color = pclass['Count'],
        # Marker size scales with the count; 0.2 keeps large groups on the canvas
        size = pclass['Count'] * 0.2,
        showscale = True
    ))])

# Available themes: [plotly_dark, ggplot2, seaborn, plotly, plotly_white, presentation, xgridoff]
fig.layout.template = 'seaborn'

# The x axis carries the age-type categories, so label it accordingly
fig.update_layout(title = 'Passenger Age Type', xaxis_title = "Age Type", yaxis_title = "Count", title_x = 0.5)
fig.show()

In [None]:
# Age-type counts as a two-column frame ['Age_type', 'Count'].
# rename_axis + reset_index(name=...) gives the same columns on pandas 1.x and 2.x; the
# to_frame().reset_index().rename(...) idiom fails with KeyError: 'Age_type' on pandas >= 2.0.
pclass = df['Age_type'].value_counts().rename_axis('Age_type').reset_index(name = 'Count')

# more colors: https://www.htmlcsscolor.com/hex/D9D9AC

fig = go.Figure([go.Pie(labels = pclass['Age_type'], values = pclass['Count'])])

# Show percentage and absolute value on each slice, separated by a black outline
fig.update_traces(hoverinfo = 'label+percent', textinfo = 'percent+value', textfont_size = 15,
                 marker = dict(line = dict(color = '#000000', width = 2)))

fig.update_layout(title = "Passengers by Age Type",title_x = 0.5)
fig.show()

In [None]:
# Categorize by Country.
# rename_axis + reset_index(name=...) yields the columns ['Country', 'Count'] on both
# pandas 1.x and 2.x. The to_frame().reset_index().rename(...) idiom relies on pandas 1.x
# column naming and fails with KeyError: 'Country' on pandas >= 2.0.
pclass = df['Country'].value_counts().rename_axis('Country').reset_index(name = 'Count')

fig = go.Figure(data=[go.Scatter(
    x = pclass['Country'], 
    y = pclass['Count'],
    mode = 'lines+markers',
    marker = dict(
        color = pclass['Count'],
        # Marker size scales with the count; 0.2 keeps large groups on the canvas
        size = pclass['Count'] * 0.2,
        showscale = True
    ))])

# Available themes: [plotly_dark, ggplot2, seaborn, plotly, plotly_white, presentation, xgridoff]
fig.layout.template = 'plotly'

# The x axis carries the country names, so label it accordingly
fig.update_layout(title = 'Passenger Count by Country', xaxis_title = "Country", yaxis_title = "Count",title_x = 0.5)
fig.show()

In [None]:
# Country counts as a two-column frame ['Country', 'Count'].
# rename_axis + reset_index(name=...) gives the same columns on pandas 1.x and 2.x; the
# to_frame().reset_index().rename(...) idiom fails with KeyError: 'Country' on pandas >= 2.0.
pclass = df['Country'].value_counts().rename_axis('Country').reset_index(name = 'Count')

# more colors: https://www.htmlcsscolor.com/hex/D9D9AC

fig = go.Figure([go.Pie(labels = pclass['Country'], values = pclass['Count'])])

# Show percentage and absolute value on each slice, separated by a black outline
fig.update_traces(hoverinfo = 'label+percent', textinfo = 'percent+value', textfont_size = 15,
                 marker = dict(line = dict(color = '#000000', width = 2)))

# Complete the title so the figure names the grouping it shows
fig.update_layout(title = "Passengers by Country",title_x = 0.5)
fig.show()

Most of the people who died in the Estonia disaster were from Sweden and Estonia.

In [None]:
# Survived counts as a two-column frame ['Survived', 'Count'].
# rename_axis + reset_index(name=...) gives the same columns on pandas 1.x and 2.x; the
# to_frame().reset_index().rename(...) idiom fails with KeyError: 'Survived' on pandas >= 2.0.
pclass = df['Survived'].value_counts().rename_axis('Survived').reset_index(name = 'Count')

# more colors: https://www.htmlcsscolor.com/hex/D9D9AC

fig = go.Figure([go.Pie(labels = pclass['Survived'], values = pclass['Count'])])

# Show percentage and absolute value on each slice, separated by a black outline
fig.update_traces(hoverinfo = 'label+percent', textinfo = 'percent+value', textfont_size = 15,
                 marker = dict(line = dict(color = '#000000', width = 2)))

fig.update_layout(title = "Passengers by Survived",title_x = 0.5)
fig.show()

In [None]:
# Data cleaning for the prediction step: Country is left out of the modelling frame.
# DataFrame.drop returns a new object, so the original `df` stays untouched.

df1 = df.drop(columns = ['Country'])

df1.head()

In [None]:
# One-hot encoding with dummies
# The models below need numeric inputs, so pd.get_dummies replaces each categorical
# (string) column with one indicator column per category value.

df1 = pd.get_dummies(df1)

For more info on One Hot encoding, check here:

https://stackabuse.com/one-hot-encoding-in-python-with-pandas-and-scikit-learn/

https://riptutorial.com/pandas/example/20990/one-hot-encoding-with--get-dummies---

https://towardsdatascience.com/what-is-one-hot-encoding-and-how-to-use-pandas-get-dummies-function-922eb9bd4970

In [None]:
df1.head()

In [None]:
# Target vector: the Survived column
y = df1['Survived']

In [None]:
# Feature matrix: every column of df1 except the Survived target
X = df1.drop('Survived', axis = 1)
X.head()

In [None]:
# Let's divide the dataset for training and testing.
# 28% of the rows are held out as the test set; random_state fixes the shuffle so the
# same split is produced on every run.
from sklearn import model_selection as ms

X_train, X_test, y_train, y_test = ms.train_test_split(X, y, test_size = 0.28, random_state = 34)

## Model Time

Let's try different ML models to get the best accuracy. 

In [None]:
# Let's try Random Forest first

from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import confusion_matrix, accuracy_score, classification_report

# A random forest is stochastic (bootstrap samples, random feature subsets). Fixing the
# seed, with the same value the train/test split uses, makes the reported accuracy
# reproducible from run to run.
rfc = RandomForestClassifier(random_state = 34).fit(X_train, y_train)

In [None]:
# Predict on the held-out test set and show the fraction of correct predictions
rfc_ypred = rfc.predict(X_test)
accuracy_score(y_test, rfc_ypred)

In [None]:
# Let's try Gaussian Naive Bayes Classifier (default settings), fitted on the training split

from sklearn.naive_bayes import GaussianNB

nb_object = GaussianNB()

# fit() returns the estimator itself, so `nbc` and `nb_object` refer to the same fitted model
nbc = nb_object.fit(X_train, y_train)

In [None]:
# NOTE(review): this repeats the previous cell: a second default GaussianNB fitted on the
# same training split (the first one is stored as `nbc`).
from sklearn.naive_bayes import GaussianNB

ganb = GaussianNB()
ganb_model = ganb.fit(X_train, y_train)

ganb_model

In [None]:
# Decision tree classifier fitted on the training split
from sklearn.tree import DecisionTreeClassifier

# Fix the seed so the fitted tree, and its accuracy, is the same on every run:
# without it, ties between equally good splits are broken randomly.
dt_model = DecisionTreeClassifier(random_state = 34)
dt_model = dt_model.fit(X_train, y_train)

dt_model

In [None]:
# Multi-layer perceptron classifier fitted on the training split
from sklearn.neural_network import MLPClassifier

# Weight initialisation and batch shuffling are random; fixing the seed makes the
# trained network and its accuracy reproducible from run to run.
mlpc_model = MLPClassifier(random_state = 34).fit(X_train, y_train)

In [None]:
# k-nearest-neighbours classifier with default settings, fitted on the training split
from sklearn.neighbors import KNeighborsClassifier

knn = KNeighborsClassifier()

# fit() returns the estimator itself, so `knn_model` and `knn` are the same fitted object
knn_model = knn.fit(X_train, y_train)

knn_model

In [None]:
# Logistic regression (liblinear solver) fitted on the training split
from sklearn.linear_model import LogisticRegression

lor = LogisticRegression(solver = "liblinear")
# Fit on the training split only, like every other model here. Fitting on the full
# X, y would let the model see the test rows and make its test accuracy incomparable
# (and optimistically biased).
lor_model = lor.fit(X_train, y_train)

lor_model

In [None]:
# Support vector classifier with a linear kernel, fitted on the training split
from sklearn.svm import SVC

svm_model = SVC(kernel = "linear").fit(X_train, y_train)

svm_model

In [None]:
# Let's compare models: score every fitted classifier on the held-out test set

best_model_accuracy = 0
best_model = None

# `nbc` is left out on purpose: it is a second default GaussianNB fitted on the same
# training data as `ganb_model`, so it would only repeat that model's row and printout.
models = [
    rfc, 
    ganb_model,
    dt_model,
    mlpc_model,
    knn_model,
    lor_model,
    svm_model
]

# Collected as {model class name: accuracy in percent}; the results table is built once
# after the loop instead of being grown row by row.
accuracy_by_model = {}

for model in models:
    
    model_name = model.__class__.__name__
    
    predY = model.predict(X_test)
    accuracy = accuracy_score(y_test, predY)
    
    # Store a numeric percentage (e.g. 87.05) so the values match the 'Accuracy %'
    # header and can be sorted, instead of a fraction rendered as a 2-digit string.
    accuracy_by_model[model_name] = round(accuracy * 100, 2)
    
    print("-" * 43)
    print(model_name + ": " )
    
    # Strict '>' keeps the first model encountered when accuracies tie
    if(accuracy > best_model_accuracy):
        best_model_accuracy = accuracy
        best_model = model_name
    
    print("Accuracy: {:.2%}".format(accuracy))

results = pd.DataFrame({'Accuracy %': pd.Series(accuracy_by_model, dtype = float)})

In [None]:
# Report the winner of the comparison: its class name and its test accuracy as a percentage
best_model_line = "Best Model : {}".format(best_model)
best_accuracy_line = "Best Model Accuracy : {:.2%}".format(best_model_accuracy)
print(best_model_line, best_accuracy_line, sep = "\n")

In [None]:
results

LogisticRegression achieves the best accuracy, which is 87%. 

**Note:**

I will have to add more models to pick the best. 
    
However, so far I have found the RandomForestClassifier model to predict the best!

I will clean up and add more comments soon. **Please upvote** if you like this notebook. Also, please share some feedback so I can improve things~

<font color="blue" size=+1.5><b>Check out my other kernels</b></font>

<table style="font-family: 'Trebuchet MS', Arial, Helvetica, sans-serif;border-collapse: collapse;width: 100%;">
  <tr>
    <th style="border: 1px solid #ddd;padding: 8px; padding-top: 12px;padding-bottom: 12px;text-align: left;background-color: #2987E7;color: white;">Notebook</th>
    <th style="border: 1px solid #ddd;padding: 8px; padding-top: 12px;padding-bottom: 12px;text-align: left;background-color: #2987E7;color: white;">Tags</th>
  </tr>
  <tr>
    <td style="text-align: left"><a href="https://www.kaggle.com/rajacsp/sof-questions-eda-and-visual">SOF Questions - EDA and Visual</a> </td>
    <td style="text-align: left">Data Visual, Plotly</td>
  </tr>
  <tr>
    <td style="background-color: #f2f2f2;text-align: left"><a href="https://www.kaggle.com/rajacsp/netflix-visualization-plotly-plots-treemap">Netflix - Visualization, Plotly, Plots, and Treemap</a> </td>
    <td style="background-color: #f2f2f2;text-align: left">Data Visual, Data Cleaning, Plotly</td>
  </tr>
  <tr>
    <td style="text-align: left"><a href="https://www.kaggle.com/rajacsp/prediction-with-various-algorithms">Prediction with various Algorithms</a> </td>
    <td style="text-align: left">Random Forest, Logistic Regression</td>
  </tr>
  <tr>
    <td style="background-color: #f2f2f2;text-align: left"><a href="https://www.kaggle.com/rajacsp/eda-and-visualization">EDA and Visualization</a> </td>
    <td style="background-color: #f2f2f2;text-align: left">Data Cleaning, Data Visual</td>
  </tr>
  <tr>
    <td style="text-align: left"><a href="https://www.kaggle.com/rajacsp/job-analysis-eda-visual">Job Analysis - EDA and Visual</a> </td>
    <td style="text-align: left">Data Visual, EDA, Plotly</td>
  </tr>   
  <tr>
    <td style="background-color: #f2f2f2;text-align: left"><a href="https://www.kaggle.com/rajacsp/estonia-disaster-visualization">Estonia Disaster - Visualization</a> </td>
    <td style="background-color: #f2f2f2;text-align: left">Data Visual, EDA, Data Cleaning</td>
  </tr>
    
  <tr>
    <td style="text-align: left"><a href="https://www.kaggle.com/rajacsp/pandas-cheatsheet-100-exercices" >Pandas Cheatsheet: 100+ exercises collection</a></td>
    <td style="text-align: left">Pandas, Data Manipulation</td>
  </tr>   
  <tr>
    <td style="background-color: #f2f2f2;text-align: left"><a href="https://www.kaggle.com/rajacsp/prediction-with-various-algorithms">Credit Card Fraud - Prediction with various algorithms</a></td>
    <td style="background-color: #f2f2f2;text-align: left">Various ML Algorithms</td>
  </tr>  
  <tr>
    <td style="text-align: left"><a href="https://www.kaggle.com/rajacsp/linear-equations-real-time">Linear Equations - Real Time</a> </td>
    <td style="text-align: left">Linear Equation</td>
  </tr>  
</table>