In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file found
for root, _, names in os.walk('/kaggle/input'):
    for name in names:
        print(os.path.join(root, name))

# You can write up to 5GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

## Introduction:

On September 27, 1994, the ferry Estonia set sail on a night voyage across the Baltic Sea carrying 989 passengers and crew, but sadly the Estonia never arrived.

The weather was typically stormy for the time of year, but at roughly 01:00 a worrying sound of screeching metal was heard. The ship suddenly listed 15 minutes later, and soon alarms were sounding. Shortly afterwards the Estonia rolled drastically to starboard. A Mayday signal was sent, but a power failure meant the ship’s position was given imprecisely. The Estonia disappeared from the responding ships’ radar screens at about 01:50.

The final death toll was shockingly high – more than 850 people.

The sinking of the Estonia was Europe’s worst postwar maritime disaster.

## Problem Definition

This is a binary classification problem: given data about a passenger, can we predict whether they lived or died?

## Data 
The dataset contains the name, age, sex, category and fate of the 989 passengers aboard the Estonia on the night of the sinking.


## Data Dictionary

* Country - Country of origin
* Firstname - Firstname of passenger
* Lastname - Lastname of passenger
* Sex - Gender of passenger - M = Male, F = Female
* Age - Age of passenger
* Category - The type of passenger - C = Crew, P = Passenger
* Survived - 0 = No, 1 = Yes

## Evaluation

The death toll of the Estonia disaster is well above 80%. If we were to make a naive baseline classifier that simply assumed all passengers aboard the Estonia died, it would be right about 86% of the time. Can you make a more sophisticated classifier that beats the baseline? (>86%)

In [None]:
# Prepare required tools

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

%matplotlib inline

from sklearn.linear_model import RidgeClassifier
from sklearn.neighbors import KNeighborsClassifier 
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression

from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix

### Let's take a look at our dataset

In [None]:
# Load the passenger list from the Kaggle input directory and preview the first rows
DATA_PATH = "../input/passenger-list-for-the-estonia-ferry-disaster/estonia-passenger-list.csv"
estonia = pd.read_csv(DATA_PATH)
estonia.head()

In [None]:
# Preview the last rows of the table
estonia.tail()

In [None]:
# Number of people on board per country of origin
estonia["Country"].value_counts()

In [None]:
# Number of people per category (per the data dictionary: C = Crew, P = Passenger)
estonia["Category"].value_counts()

In [None]:
# Number of people per sex (per the data dictionary: M = Male, F = Female)
estonia["Sex"].value_counts()

In [None]:
# EDA (Exploratory Data Analysis): column names, dtypes and non-null counts

estonia.info()

In [None]:
# Count missing values in each column
estonia.isna().sum()

In [None]:
# Summary statistics of the dataset
estonia.describe()


In [None]:
# Class balance of the target: how many survived (1) and how many did not (0)
estonia["Survived"].value_counts()

* The dataset consists of 8 columns (3 numerical, 5 categorical)
* There are no null/empty values
* 90% of the passengers were from Sweden or Estonia
* There were 193 crew members and 796 passengers
* 503 males and 486 females were on board
* 852 did not survive and 137 survived

### Let's dive deeper into our dataset and learn more about the survivors

In [None]:
# Survival counts broken down by sex
pd.crosstab(estonia["Sex"], estonia["Survived"])

In [None]:
# Grouped bar chart of survival counts by sex
sex_vs_survival = pd.crosstab(estonia["Sex"], estonia["Survived"])
ax = sex_vs_survival.plot.bar()
ax.set_title("SEX VS Survival")
ax.set_ylabel("Number of passengers")
ax.tick_params(axis="x", labelrotation=0)  # keep the category labels horizontal
ax.legend(["Didn't Survive", "Survive"]);

In [None]:
# Survival counts broken down by category (crew vs. passenger)
pd.crosstab(estonia["Category"], estonia["Survived"])

In [None]:
# Grouped bar chart of survival counts by category (crew vs. passenger)
pd.crosstab(estonia.Category, estonia.Survived).plot.bar()
plt.title("Category VS Survival")
plt.ylabel("Number of passengers")  # axis label previously read "Number os passengers"
plt.xticks(rotation=0)
plt.legend(["Didn't survive", "Survive"]);

In [None]:
# Histogram of the Age column
estonia["Age"].plot.hist();

### We notice that most ages fall between 20 and 70 years old; passengers younger or older than that are few enough to be neglected

In [None]:
# Scatter plot of age against country of origin for the passengers who survived
survived_mask = estonia["Survived"] == 1
fig, ax = plt.subplots(figsize=(10, 6))
ax.scatter(estonia.loc[survived_mask, "Age"],
           estonia.loc[survived_mask, "Country"],
           c="orange")
ax.set_title("Age and Country VS Survival")
ax.legend(["Survive"])
ax.set_xlabel("Age")
ax.set_ylabel("Country");  # label the y axis so the figure stands on its own

In [None]:
# Scatter plot of age against country of origin for the passengers who did not survive
not_survived_mask = estonia["Survived"] == 0
fig, ax = plt.subplots(figsize=(10, 6))
ax.scatter(estonia.loc[not_survived_mask, "Age"],
           estonia.loc[not_survived_mask, "Country"],
           c="blue")
ax.set_title("Age and Country VS No Survival")
ax.legend(["Didn't Survive"])
ax.set_xlabel("Age")
ax.set_ylabel("Country");  # label the y axis so the figure stands on its own

In [None]:
# Drop the less important columns before building the correlation matrix.
# Reassigning the result (rather than inplace=True) together with errors="ignore"
# keeps this cell safe to re-run: an in-place drop raises KeyError the second time
# because the columns are already gone.
estonia = estonia.drop(columns=["PassengerId", "Firstname", "Lastname", "Country"],
                       errors="ignore")
estonia.head()

### Sex and Category need to be transformed from categorical to numerical
### (Sex - Male = 1, Female = 0)
### (Category - Passenger = 1 , Crew = 0)

In [None]:
# Encode Sex and Category as integers (Male = 1 / Female = 0, Passenger = 1 / Crew = 0)
change_dict = {"Sex": {"M": 1,
                       "F": 0},
               "Category": {"P": 1,
                            "C": 0}}
# Cast explicitly instead of relying on `replace` to downcast the object columns
# to integers on its own (that implicit downcast is deprecated in recent pandas).
# The cast also fails loudly if a column still holds a value missing from the mapping.
estonia = (
    estonia
    .replace(change_dict)
    .astype({"Sex": "int64", "Category": "int64"})
)
estonia.head()

In [None]:
# With every remaining column numeric, compute the pairwise correlation matrix

corr_matrix = estonia.corr()
corr_matrix

In [None]:
# Visualise the correlation matrix as an annotated heatmap
fig, ax = plt.subplots(figsize=(8, 6))
# Pass ax explicitly so the heatmap is drawn on the axes created above
# (assigning the result to `plt.ax` only sets a stray attribute on the pyplot module).
sns.heatmap(corr_matrix,
            cmap="YlGnBu",
            cbar=False,
            annot=True,
            fmt=".2f",
            ax=ax);

### The highest positive correlation with survival is sex, and the highest negative correlation with survival is age

In [None]:
# Start modeling
# Keep all features as X and the target as y
X = estonia.drop("Survived", axis = 1)
y = estonia["Survived"]

In [None]:
# Preview the feature matrix instead of displaying the whole frame
X.head()

In [None]:
# Split the data into train and test sets (20% held out for testing)
np.random.seed(99)  # keeps the global NumPy RNG seeded for code that does not take its own seed
# random_state pins the shuffle itself, so the split no longer depends on
# whatever state the global RNG happens to be in when this line runs.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=99)

In [None]:
# Candidate classifiers to compare, keyed by a display name
models = {
    "Ridge classifier": RidgeClassifier(),
    "Random Forest": RandomForestClassifier(),
    "KNN": KNeighborsClassifier(),
}
def fit_and_score(models, X_train, X_test, y_train, y_test):
    """Fit each model on the training data and score it on the test data.

    Parameters
    ----------
    models : dict
        Maps a display name to an estimator exposing ``fit(X, y)`` and
        ``score(X, y)``. The estimators are fitted in place.
    X_train, y_train
        Training features and labels passed to ``fit``.
    X_test, y_test
        Held-out features and labels passed to ``score``.

    Returns
    -------
    dict
        Maps each model name to the value returned by its ``score`` method.
    """
    def _evaluate(estimator):
        # Fit first, then score: the score must come from the fitted estimator.
        estimator.fit(X_train, y_train)
        return estimator.score(X_test, y_test)

    return {label: _evaluate(estimator) for label, estimator in models.items()}

In [None]:
# Fit every candidate model and collect its score on the test split
models_score = fit_and_score(models, X_train, X_test, y_train, y_test)
models_score