In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler

In [None]:
# Read the passenger list CSV into the `est` DataFrame.
csv_path = '../input/passenger-list-for-the-estonia-ferry-disaster/estonia-passenger-list.csv'
est = pd.read_csv(csv_path)

> Preparation

In [None]:
# Preview the first rows to check that the file loaded as expected.
est.head()

In [None]:
# (number of rows, number of columns) of the loaded table.
est.shape

In [None]:
# Column dtypes and non-null counts, useful for spotting missing values.
est.info()

Exploratory Data Analysis

In [None]:
# Number of rows for each value of the Survived column.
est['Survived'].value_counts()

In [None]:
# Bar chart of how many people fall into each Survived class.
fig, ax = plt.subplots(figsize=(5, 4))
sns.countplot(x='Survived', data=est, palette='summer', ax=ax)
# The second title line spells out the 0/1 coding; the closing parenthesis
# was missing from the rendered title.
ax.set_title('Distribution of people deceased/survived\n(0=deceased/1=survived)')

Of the 989 people on board (passengers and crew), 852 died and 137 survived.
In percent:
- (852/989) * 100 ≈ 86%
- (137/989) * 100 ≈ 14%

So about 86% died and 14% survived.

In [None]:
# Number of people per country.
est['Country'].value_counts()

In [None]:
# Horizontal bar chart of the number of people per country.
country_counts = est['Country'].value_counts()
country_counts.plot.barh(title='Countries represented', cmap='summer')

People from Sweden and Estonia were by far the largest groups on board.
Together they account for 894 people, roughly 90% of everyone on the passenger list.

In [None]:
# Number of people per sex.
est['Sex'].value_counts()

In [None]:
# Bar chart of the number of people per sex.
sex_counts = est['Sex'].value_counts()
sex_counts.plot.bar(title='Gender Distribution', cmap='summer', legend=True)

In [None]:
# Average age across all rows.
est['Age'].mean()

In [None]:
# Age histogram with a KDE overlay, drawn explicitly on the axes created here.
fig, ax = plt.subplots(figsize=(5, 4))
# histplot replaces the deprecated distplot. Its y-axis shows counts per bin,
# which is what the 'Frequency' label below describes (distplot plotted a density).
sns.histplot(est['Age'], kde=True, color='darkgreen', ax=ax)
ax.set_title('Distribution of Age')
ax.set_xlabel('Age')
ax.set_ylabel('Frequency')

In [None]:
# One pair of bars per distinct age: deceased vs. survived counts.
fig, ax = plt.subplots(figsize=(10, 6))
sns.countplot(
    data=est,
    x='Age',
    hue='Survived',
    palette='summer',
    ax=ax,
)
ax.set_title('Survival Distribution by Age')

Younger people had a greater chance of surviving than older people.

In [None]:
# Deceased vs. survived counts, side by side for each sex.
fig, ax = plt.subplots(figsize=(5, 4))
sns.countplot(
    data=est,
    x='Sex',
    hue='Survived',
    palette='summer',
    ax=ax,
)
ax.set_title('Survival Distribution by Gender')

Does age or gender affect the chances of survival? More women than men died, and at the same time fewer women than men survived. Since, as shown in the gender distribution,
men and women are fairly evenly represented, this means that women had a higher risk of dying and a lower chance of survival than men.

Comparison of passengers and crew members

In [None]:
# Number of rows per Category value.
est['Category'].value_counts()

In [None]:
# Counts of each Survived value within each Category group.
est.groupby('Category')['Survived'].value_counts()

In [None]:
# Survival/death percentages per category, derived from the data instead of
# being typed in by hand, so they cannot drift out of sync with the input file.
# Survived is coded 0/1, so its mean per group is the fraction that survived.
survival_pct = est.groupby('Category')['Survived'].mean() * 100
crew_surv = survival_pct['C']  # 'C' marks crew members
pass_surv = survival_pct['P']  # 'P' marks passengers
crew_died = 100 - crew_surv
pass_died = 100 - pass_surv

In [None]:
# Display the four percentages side by side: crew died/survived, passengers died/survived.
crew_died, crew_surv, pass_died, pass_surv

In [None]:
# Pie chart of crew outcomes; the wedge sizes are the crew percentages.
labels = ['deceased', 'survived']
crew = [crew_died, crew_surv]
explode = (0, 0.1)  # offset the second ('survived') wedge from the centre
cmap = plt.get_cmap("tab20c")
colors = cmap(np.array([9, 10]))  # two entries picked from the tab20c colormap
plt.pie(
    crew,
    explode=explode,
    labels=labels,
    colors=colors,
    autopct='%1.1f%%',
    shadow=True,
    startangle=90,
)
plt.title('Crew')

In [None]:
# Pie chart of passenger outcomes; the wedge sizes are the passenger percentages.
# `explode` is not defined in this cell: it is reused from the crew pie chart cell.
labels = ['deceased', 'survived']
pas = [pass_died, pass_surv]
cmap = plt.get_cmap("tab20c")
colors = cmap(np.array([9, 10]))  # two entries picked from the tab20c colormap
plt.pie(
    pas,
    explode=explode,
    labels=labels,
    colors=colors,
    autopct='%1.1f%%',
    shadow=True,
    startangle=90,
)
plt.title('Passengers')

A comparison of passengers and crew members shows that the chances of survival were higher
for crew members (20.2%) than for passengers (12.3%).
One possible explanation is that crew members more often worked in service areas near the deck and knew the ship and its corridors better.

Correlation between Age and Survival

In [None]:
# Pairwise correlations between the numeric columns only.
# At this point the frame still holds text columns (e.g. Sex as 'M'/'F',
# Category as 'C'/'P'); pandas >= 2.0 raises when asked to correlate those,
# so they are filtered out explicitly. select_dtypes works on old and new pandas alike.
correlation = est.select_dtypes(include='number').corr()
sns.heatmap(correlation, annot=True, cmap='summer')

There is a negative correlation between Age and Survival. This means that
younger people had a higher chance of survival.

Predict Survival with a logistic regression model

In [None]:
# Encode Sex as an integer: 'M' -> 0, 'F' -> 1.
# The identity entries (0 -> 0, 1 -> 1) make the cell idempotent: re-running it
# on an already encoded column keeps the values instead of mapping them to NaN.
est['Sex'] = est['Sex'].map({'M': 0, 'F': 1, 0: 0, 1: 1})

In [None]:
# Turn Category into two 0/1 indicator columns, then pick the model inputs and target.
# NOTE(review): if Category only ever holds 'C' and 'P', Passenger is just 1 - Crew
# and one of the two columns is redundant — confirm against the data.
est['Crew'] = (est['Category'] == 'C').astype('int64')
est['Passenger'] = (est['Category'] == 'P').astype('int64')
y = est['Survived']
X = est[['Sex', 'Age', 'Crew', 'Passenger']]

In [None]:
# Perform the train/test split: 60% of the rows for training, 40% held out.
# random_state makes the split (and every number downstream) reproducible.
# stratify=y keeps the survived/deceased ratio equal in both parts, which
# matters because only a small minority of the rows are survivors.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.4, random_state=42, stratify=y
)

In [None]:
# Scaling: learn mean and standard deviation from the training data only,
# then apply those same statistics to the test data.
# Calling fit_transform on X_test would refit the scaler on the test set,
# leaking test information and changing the statistics that later
# scaler.transform(...) calls rely on.
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

In [None]:
# Build a logistic regression classifier with default settings and fit it
# on the training features and labels.
model = LogisticRegression()
model.fit(X=X_train, y=y_train)

In [None]:
# Accuracy on the training data and on the held-out test data, shown as a
# (train, test) pair. The test score is the one that says how well the model
# generalises; the training score alone is optimistic.
# Keep the class imbalance in mind when reading either number: always
# predicting "deceased" already scores about 0.86 on this data.
model.score(X_train, y_train), model.score(X_test, y_test)

In [None]:
# Fitted coefficients, one per input column in the order Sex, Age, Crew, Passenger.
model.coef_

In [None]:
# Hypothetical people to predict survival for. Each row follows the feature
# order [Sex, Age, Crew, Passenger], with Sex encoded as 0 = male, 1 = female.
a, b, c, d, e, f = (
    np.array(row)
    for row in (
        [0, 70, 0, 1],
        [1, 23, 1, 0],
        [0, 53, 0, 1],
        [1, 21, 0, 1],
        [0, 20, 0, 1],
        [0, 20, 1, 0],
    )
)

In [None]:
# Stack the six feature vectors into one 2-D array, one row per person.
# Despite its name, this holds model inputs, not predictions.
new_predictions = np.array((a, b, c, d, e, f))

In [None]:
# Apply the scaler's stored statistics to the new rows before predicting.
# NOTE(review): this overwrites new_predictions in place, so re-running the
# cell scales the already scaled values a second time.
new_predictions = scaler.transform(new_predictions)

In [None]:
# Predicted class for each new row, together with the class probabilities
# (left column: p for decease, right column: p for survival).
# Both results are returned as one tuple because a notebook cell only displays
# its last expression; a bare model.predict(...) line placed before
# predict_proba would be computed and silently discarded.
model.predict(new_predictions), model.predict_proba(new_predictions)
   

Probabilities for all passengers, and those with a predicted chance of survival > 50%

In [None]:
# Feature matrix for every row of the data set, as a NumPy array.
feature_columns = ['Sex', 'Age', 'Crew', 'Passenger']
X = np.array(est[feature_columns])

In [None]:
# Scale the full feature matrix with the already fitted scaler.
# NOTE(review): X is overwritten in place; re-running this cell would scale it twice.
X = scaler.transform(X)

In [None]:
# Predicted class for every row of the data set.
# NOTE: X is built from the whole table, so it includes the rows the model was trained on.
model.predict(X)

In [None]:
# Class probabilities for every row; per the notebook's convention the right-hand column is p(survival).
prob = model.predict_proba(X)

In [None]:
# Print the probability array as plain text.
print(prob)

In [None]:
# Survival probabilities (second column of `prob`) of everyone whose predicted
# chance of survival is above 50%.
prob_surv = [row[1] for row in prob if row[1] > 0.5]

In [None]:
# Print the list of survival probabilities that exceed 50%.
print(prob_surv)