In [25]:
import numpy as np
import pandas as pd
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix

In [2]:
# Loading datasets
train = pd.read_csv('./data/train.csv')
test = pd.read_csv('./data/test.csv')

# Reasoning

After abandoning this project for a long time, I wanted to finish it. However, despite my notes, I was not really sure what my thought process was when I last worked on it. Therefore, I wanted to start the project again, incorporating my old code wherever I could, but with a clearer understanding of what I wanted to do and how I would achieve it. I started again on 5/15/24 and aim to finish by 5/20/24. The steps I intend to complete are:

- Beginning model using rule-based constraints
- decision tree
- random forest
- cleaning and more thorough EDA
- rerun dt and rf models
- feature engineering
- rerun dt and rf models
- hyperparamter tuning
- final dt and rf models

I also intend to submit my models to kaggle with an aim of getting to at least 97.5% accuracy. I am unsure how many times I will submit, but hopefully at least once per data improvement stage.

In [3]:
# Overall

overall_survival_rate = sum(train['Survived'])/len(train['Survived'])
print(f'The overall survival rate for the test set: {overall_survival_rate:.2%}')

The overall survival rate for the test set: 38.38%


In [4]:
# men vs female

women = train.loc[train['Sex'] == 'female']['Survived']
women_survival_rate = sum(women)/len(women)
print(f'The percentage of women that survived: {women_survival_rate:.2%}')

men = train.loc[train['Sex'] == 'male']['Survived']
men_survival_rate = sum(men)/len(men)
print(f'The percentage of men that survived: {men_survival_rate:.2%}')

The percentage of women that survived: 74.20%
The percentage of men that survived: 18.89%


In [5]:
# young vs old

young = train.loc[train['Age'] < 16]['Survived']
young_survival_rate = sum(young)/len(young)
print(f'The percentage of young passengers that survived: {young_survival_rate:.2%}')

old = train.loc[train['Age'] >= 16]['Survived']
old_survival_rate = sum(old)/len(old)
print(f'The percentage of old passengers that survived: {old_survival_rate:.2%}')

The percentage of young passengers that survived: 59.04%
The percentage of old passengers that survived: 38.19%


In [6]:
# Women and Children

women_and_children = train.loc[(train['Age'] < 16) | (train['Sex'] == 'female')]['Survived']
women_and_children_survival_rate = sum(women_and_children)/len(women_and_children)
print(f'The percentage of women and children that survived: {women_and_children_survival_rate:.2%}')


not_women_and_children = train.loc[~((train['Age'] < 16) | (train['Sex'] == 'female'))]['Survived']
not_women_and_children_survival_rate = sum(not_women_and_children)/len(not_women_and_children)
print(f'The percentage of older men that survived: {not_women_and_children_survival_rate:.2%}')

The percentage of women and children that survived: 71.75%
The percentage of older men that survived: 16.39%


In [7]:
# Confusion Matrices
 
pred_women_list = [1 if row['Sex'] == 'female' else 0 for index, row in train.iterrows()]
pred_young_list = [1 if row['Age'] < 16 else 0 for index, row in train.iterrows()]
pred_women_and_children_list = [1 if (row['Sex'] == 'female' or row['Age'] < 16) else 0 for index, row in train.iterrows()]

# add columns into df
train['pred_women'], train['pred_young'], train['pred_women_and_children'] = pred_women_list, pred_young_list, pred_women_and_children_list

In [15]:
def benchmark_metrics(y_true, y_pred, label):
    conf_matrix = confusion_matrix(y_true, y_pred)
    accuracy = accuracy_score(y_true, y_pred)
    precision = precision_score(y_true, y_pred)
    recall = recall_score(y_true, y_pred)
    f1 = f1_score(y_true, y_pred)
    
    print(f'{label} Confusion Matrix:\n', conf_matrix)
    print(f'{label} Accuracy: {accuracy:.4f}')
    print(f'{label} Precision: {precision:.4f}')
    print(f'{label} Recall: {recall:.4f}')
    print(f'{label} F1 Score: {f1:.4f}')

In [16]:
# Benchmark Metrics for Women Rule
benchmark_metrics(train['Survived'], train['pred_women'], 'Women')

Women Confusion Matrix:
 [[468  81]
 [109 233]]
Women Accuracy: 0.7868
Women Precision: 0.7420
Women Recall: 0.6813
Women F1 Score: 0.7104


In [17]:
# Benchmark Metrics for Young Rule
benchmark_metrics(train['Survived'], train['pred_young'], 'Young')

Young Confusion Matrix:
 [[515  34]
 [293  49]]
Young Accuracy: 0.6330
Young Precision: 0.5904
Young Recall: 0.1433
Young F1 Score: 0.2306


In [18]:
# Benchmark Metrics for Women and Children Rule
benchmark_metrics(train['Survived'], train['pred_women_and_children'], 'Women and Children')

Women and Children Confusion Matrix:
 [[449 100]
 [ 88 254]]
Women and Children Accuracy: 0.7890
Women and Children Precision: 0.7175
Women and Children Recall: 0.7427
Women and Children F1 Score: 0.7299


Not a great improvement over just women, but does increase accuracy slightly. Precision is lower, indicating more false positives, so maybe age isn't the best indicator for survival

In [11]:
# creating predictions for test data
pred_test_list = [1 if (row['Sex'] == 'female' or row['Age'] < 16) else 0 for index, row in test.iterrows()]

# putting the predictions in the correct format for Kaggle
submission = pd.DataFrame({'PassengerId': test['PassengerId'], 'Survived': pred_test_list})
submission.to_csv('./submissions/women_and_children_submission.csv', index=False)

The rule based predictions for the adage "women and children first" scored 0.75837. This score is not great, but it is a baseline and will hopefully improve as I continue my work.

In [35]:
# train test split

from sklearn.model_selection import train_test_split

train_columns = train.drop(columns=['Survived', 'Name', 'Ticket', 'Cabin'], axis=1)
train_data = pd.get_dummies(train_columns)
test_data = train['Survived']

X_train, X_test, y_train, y_test = train_test_split(train_data, test_data, test_size=0.2)

In [39]:
# decision tree

from sklearn.tree import DecisionTreeClassifier

dt_classifier = DecisionTreeClassifier()

dt_classifier.fit(X_train, y_train)

y_pred_dt = dt_classifier.predict(X_test)

benchmark_metrics(y_test, y_pred_dt, 'Decision Tree')

Decision Tree Confusion Matrix:
 [[90 25]
 [19 45]]
Decision Tree Accuracy: 0.7542
Decision Tree Precision: 0.6429
Decision Tree Recall: 0.7031
Decision Tree F1 Score: 0.6716


In [42]:
# random forest

from sklearn.ensemble import RandomForestClassifier

rf_classifier = RandomForestClassifier()

rf_classifier.fit(X_train, y_train)

y_pred_rf = rf_classifier.predict(X_test)

benchmark_metrics(y_test, y_pred_rf, 'Random Forest')

Random Forest Confusion Matrix:
 [[107   8]
 [ 19  45]]
Random Forest Accuracy: 0.8492
Random Forest Precision: 0.8491
Random Forest Recall: 0.7031
Random Forest F1 Score: 0.7692


In [43]:
# Xgboost

from xgboost import XGBClassifier

xgb_classifier = XGBClassifier()

xgb_classifier.fit(X_train, y_train)

y_pred_xgb = xgb_classifier.predict(X_test)

benchmark_metrics(y_test, y_pred_xgb, 'Gradient Boosting')

Gradient Boosting Confusion Matrix:
 [[104  11]
 [ 18  46]]
Gradient Boosting Accuracy: 0.8380
Gradient Boosting Precision: 0.8070
Gradient Boosting Recall: 0.7188
Gradient Boosting F1 Score: 0.7603
