<a href="https://colab.research.google.com/github/sebaspector/Titanic-Voyage-Predicting-Passenger-Survival-in-Python/blob/main/Titanic.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [98]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.linear_model import LinearRegression

In [3]:
df = sns.load_dataset('titanic')

In [100]:
df.head()

Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,who,adult_male,deck,embark_town,alive,alone
0,0,3,male,22.0,1,0,7.25,S,Third,man,True,,Southampton,no,False
1,1,1,female,38.0,1,0,71.2833,C,First,woman,False,C,Cherbourg,yes,False
2,1,3,female,26.0,0,0,7.925,S,Third,woman,False,,Southampton,yes,True
3,1,1,female,35.0,1,0,53.1,S,First,woman,False,C,Southampton,yes,False
4,0,3,male,35.0,0,0,8.05,S,Third,man,True,,Southampton,no,True


In [9]:
#Exploratory Data Analysis (EDA)

from tabulate import tabulate

# Overall survival count
survivors = df[df['survived'] == 1].shape[0]
not_survivors = df[df['survived'] == 0].shape[0]
survivors_rate = 100.0 * (survivors / (survivors + not_survivors))

# Survival count by sex
male_survivors = df[(df['survived'] == 1) & (df['sex'] == 'male')].shape[0]
female_survivors = df[(df['survived'] == 1) & (df['sex'] == 'female')].shape[0]
male_not_survivors = df[(df['survived'] == 0) & (df['sex'] == 'male')].shape[0]
female_not_survivors = df[(df['survived'] == 0) & (df['sex'] == 'female')].shape[0]

# Survival count by class
class_1_survivors = df[(df['survived'] == 1) & (df['class'] == 'First')].shape[0]
class_2_survivors = df[(df['survived'] == 1) & (df['class'] == 'Second')].shape[0]
class_3_survivors = df[(df['survived'] == 1) & (df['class'] == 'Third')].shape[0]
class_1_not_survivors = df[(df['survived'] == 0) & (df['class'] == 'First')].shape[0]
class_2_not_survivors = df[(df['survived'] == 0) & (df['class'] == 'Second')].shape[0]
class_3_not_survivors = df[(df['survived'] == 0) & (df['class'] == 'Third')].shape[0]

# Calculate percentages based on survivors
male_survivors_percentage = 100.0 * (male_survivors / survivors)
female_survivors_percentage = 100.0 * (female_survivors / survivors)
class_1_survivors_percentage = 100.0 * (class_1_survivors / survivors)
class_2_survivors_percentage = 100.0 * (class_2_survivors / survivors)
class_3_survivors_percentage = 100.0 * (class_3_survivors / survivors)

# Calculate percentages based on not survivors
male_not_survivors_percentage = 100.0 * (male_not_survivors / not_survivors)
female_not_survivors_percentage = 100.0 * (female_not_survivors / not_survivors)
class_1_not_survivors_percentage = 100.0 * (class_1_not_survivors / not_survivors)
class_2_not_survivors_percentage = 100.0 * (class_2_not_survivors / not_survivors)
class_3_not_survivors_percentage = 100.0 * (class_3_not_survivors / not_survivors)

# Create data for the table
table_data = [
    ["Overall", f"{survivors} ({survivors_rate:.2f}%)", f"{not_survivors} ({100 - survivors_rate:.2f}%)"],
    ["By Sex", "", ""],
    ["Male", f"{male_survivors} ({male_survivors_percentage:.2f}%)", f"{male_not_survivors} ({male_not_survivors_percentage:.2f}%)"],
    ["Female", f"{female_survivors} ({female_survivors_percentage:.2f}%)", f"{female_not_survivors} ({female_not_survivors_percentage:.2f}%)"],
    ["By Class", "", ""],
    ["1st Class", f"{class_1_survivors} ({class_1_survivors_percentage:.2f}%)", f"{class_1_not_survivors} ({class_1_not_survivors_percentage:.2f}%)"],
    ["2nd Class", f"{class_2_survivors} ({class_2_survivors_percentage:.2f}%)", f"{class_2_not_survivors} ({class_2_not_survivors_percentage:.2f}%)"],
    ["3rd Class", f"{class_3_survivors} ({class_3_survivors_percentage:.2f}%)", f"{class_3_not_survivors} ({class_3_not_survivors_percentage:.2f}%)"]
]

table_headers = ["", "Survivors", "Not Survivors"]
table = tabulate(table_data, headers=table_headers, tablefmt="fancy_grid")
table_lines = table.split('\n')
table_lines[3] = table_lines[3].replace("─", "")
table_lines[5] = table_lines[5].replace("-", "")
table_lines[7] = table_lines[7].replace("-", "")
print('\n'.join(table_lines))


╒═══════════╤══════════════╤═════════════════╕
│           │ Survivors    │ Not Survivors   │
╞═══════════╪══════════════╪═════════════════╡
│ Overall   │ 123 (67.58%) │ 59 (32.42%)     │
├───────────┼──────────────┼─────────────────┤
│ By Sex    │              │                 │
├───────────┼──────────────┼─────────────────┤
│ Male      │ 41 (33.33%)  │ 53 (89.83%)     │
├───────────┼──────────────┼─────────────────┤
│ Female    │ 82 (66.67%)  │ 6 (10.17%)      │
├───────────┼──────────────┼─────────────────┤
│ By Class  │              │                 │
├───────────┼──────────────┼─────────────────┤
│ 1st Class │ 106 (86.18%) │ 51 (86.44%)     │
├───────────┼──────────────┼─────────────────┤
│ 2nd Class │ 12 (9.76%)   │ 3 (5.08%)       │
├───────────┼──────────────┼─────────────────┤
│ 3rd Class │ 5 (4.07%)    │ 5 (8.47%)       │
╘═══════════╧══════════════╧═════════════════╛


In [8]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report

#Determine which factors influenced the passengers' survival. The goal is to understand which features or variables can better predict the probability of survival.

#Removing NaN values for regression
df = df.dropna()

#Select predictor variables and the target variable
X = df[["age", "sex", "pclass"]]
y = df["survived"]

#Convert the categorical variable "sex" into numerical variables (one-hot encoding)
X_encoded = pd.get_dummies(X, drop_first=True)

#Split the data into training and test sets
X_train, X_test, y_train, y_test = train_test_split(X_encoded, y, test_size=0.2, random_state=42)

#Create the logistic regression model
model = LogisticRegression()

#Fit the model to the training data
model.fit(X_train, y_train)

#Make predictions on the test set
y_pred = model.predict(X_test)

#Calculate the model accuracy
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)

classification_rep = classification_report(y_test, y_pred)
print("Classification Report:")
print(classification_rep)

Accuracy: 0.7297297297297297
Classification Report:
              precision    recall  f1-score   support

           0       0.60      0.69      0.64        13
           1       0.82      0.75      0.78        24

    accuracy                           0.73        37
   macro avg       0.71      0.72      0.71        37
weighted avg       0.74      0.73      0.73        37



In [None]:
#Analysis of 1st Model

#Accuracy: The overall accuracy of the model on the test set is 72.9%. Overall accuracy is a measure of the proportion of correct predictions across the entire test set.
#Precision: Proportion of positive cases that were correctly identified. 60% of cases for class 0 (non-survivors) and 82% for class 1 (survivors) were correctly identified. This means that the model performed better in predicting class 1 compared to class 0.
#Recall (Sensitivity): Proportion of positive cases that were correctly identified relative to the total actual positive cases. Recall for class 0 is 69% and for class 1 is 75%. This means that the model performed better in correctly identifying survivors (class 1) compared to non-survivors (class 0).
#F1-score: F1-score is a measure that combines precision and recall into a single metric. It provides a balanced measure of the model's performance. In this report, the F1-score for class 0 is 0.64 and for class 1 is 0.78.
#Support: There were 13 instances of class 0 (non-survivors) and 24 instances of class 1 (survivors).

In [7]:
#Increasing model accuracy by including age buckets.

#Data preprocessing
df['age_group'] = pd.cut(df['age'], bins=[0, 18, 30, 40, 50, float('inf')], labels=['<18', '18-30', '31-40', '41-50', '50+'])
df_encoded = pd.get_dummies(df[['sex', 'class', 'age_group']], drop_first=True)

#Select predictor variables and target variable
X = df_encoded
y = df['survived']

#Split the data into training and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

#Create the logistic regression model
model = LogisticRegression()

#Fit the model to the training data
model.fit(X_train, y_train)

#Make predictions on the test set
y_pred = model.predict(X_test)

#Calculate the model accuracy
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)

classification_rep = classification_report(y_test, y_pred)
print("Classification Report:")
print(classification_rep)

Accuracy: 0.7988826815642458
Classification Report:
              precision    recall  f1-score   support

           0       0.81      0.87      0.83       105
           1       0.79      0.70      0.74        74

    accuracy                           0.80       179
   macro avg       0.80      0.78      0.79       179
weighted avg       0.80      0.80      0.80       179



In [None]:
#Analysis of 2nd Model

#Accuracy: The overall accuracy of the model is 79.89%, which means the model correctly classifies 79.89% of the cases in the test set and improved accuracy compared to the previous 72.97% of 1st Model.
#Precision: For class 0 (non-survivors), the precision is 81%, indicating that 81% of the predictions for non-survivors are correct. For class 1 (survivors), the precision is 79%, indicating that 79% of the predictions for survivors are correct.
#Recall (Sensitivity): For class 0, the recall is 87%, meaning the model correctly identifies 87% of the actual non-survivor cases. For class 1, the recall is 70%, meaning the model correctly identifies 70% of the actual survivor cases.
#F1-score: The F1-score is a metric that combines precision and recall into a single measure. For class 0, the F1-score is 83%, and for class 1, the F1-score is 74%.
#Support: Indicates the number of cases for each class in the test set. There are 105 cases of non-survivors (class 0) and 74 cases of survivors (class 1).

In [5]:
#Modelo coefficients
coefficients = model.coef_[0]
feature_names = X.columns

#DataFrame for coefficients
coef_df = pd.DataFrame({'Variable': feature_names, 'Coeficcients': coefficients})
coef_df = coef_df.sort_values(by='Coeficcients', ascending=False)

print(coef_df)

          Variable  Coeficcients
4  age_group_31-40      0.170013
3  age_group_18-30     -0.115515
6    age_group_50+     -0.427423
5  age_group_41-50     -0.468966
1     class_Second     -0.556242
2      class_Third     -1.746494
0         sex_male     -2.497852


In [6]:
#Interpretation of regression results

#1) The reference group for gender is Female. Being male, the odds of survival were more than 12 times lower than being female (8.2%).
#2) The reference group for class is 1st class. The odds of survival for 2nd class were 57.3% and for 3rd class were 17.4%, both compared to 1st class.
#3) Regarding the age groups, compared to the reference group (<18), the age group 31-40 is associated with a +17.0% increase in the probability of survival. On the other hand, the age group 18-30
#is associated with a -11.5% decrease in the probability of survival. The age group 50+ is associated with a larger decrease of -42.7%, and the age group 41-50 is associated with the largest decrease
#of -46.9% in the probability of survival. These percentages are all in comparison to the <18 age group.