In [13]:
import pandas as pd
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix

In [3]:
# Read the train and test CSV files from ./data into DataFrames
train, test = map(pd.read_csv, ('./data/train.csv', './data/test.csv'))

# Reasoning

After abandoning this project for a long time, I wanted to finish it. However, despite my notes, I was not really sure what my thought process was when I last worked on it. Therefore, I wanted to start the project again, incorporating my old code wherever I could, but with a clearer understanding of what I wanted to do and how I would achieve it. I started again on 5/15/24 and aim to finish by 5/20/24. The steps I intend to complete are:

- Beginning model using rule-based constraints
- decision tree
- random forest
- cleaning and more thorough EDA
- rerun dt and rf models
- feature engineering
- rerun dt and rf models
- hyperparameter tuning
- final dt and rf models

I also intend to submit my models to Kaggle, with the aim of reaching at least 97.5% accuracy. I am unsure how many times I will submit, but hopefully at least once per data improvement stage.

In [4]:
# Overall survival rate

# Averaging the Survived column gives survivors / passengers
# (assumes the column is coded 0/1, as the original sum/len also did).
overall_survival_rate = train['Survived'].mean()
# The rate is computed on `train`, so the message names the training set.
print(f'The overall survival rate for the training set: {overall_survival_rate:.2%}')

The overall survival rate for the test set: 38.38%


In [5]:
# Survival rate by sex

# Rows and column are selected in a single .loc call (no chained indexing);
# the mean of the Survived flags is the share of the group that survived.
women = train.loc[train['Sex'] == 'female', 'Survived']
women_survival_rate = women.mean()
print(f'The percentage of women that survived: {women_survival_rate:.2%}')

men = train.loc[train['Sex'] == 'male', 'Survived']
men_survival_rate = men.mean()
print(f'The percentage of men that survived: {men_survival_rate:.2%}')

The percentage of women that survived: 74.20%
The percentage of men that survived: 18.89%


In [6]:
# Survival rate by age group: young (< 16) vs old (>= 16)

# NOTE: both comparisons evaluate to False for a missing Age, so rows with
# no recorded age (if any) belong to neither group and the two groups need
# not add up to the whole training set.
young = train.loc[train['Age'] < 16, 'Survived']
young_survival_rate = young.mean()
print(f'The percentage of young passengers that survived: {young_survival_rate:.2%}')

old = train.loc[train['Age'] >= 16, 'Survived']
old_survival_rate = old.mean()
print(f'The percentage of old passengers that survived: {old_survival_rate:.2%}')

The percentage of young passengers that survived: 59.04%
The percentage of old passengers that survived: 38.19%


In [7]:
# Survival rate for "women and children" vs everyone else

# Build the group mask once and reuse its negation for the complement.
# NOTE: a missing Age compares False against 16, so male passengers with no
# recorded age (if any) fall into the complement ("older men") group.
is_woman_or_child = (train['Age'] < 16) | (train['Sex'] == 'female')

women_and_children = train.loc[is_woman_or_child, 'Survived']
women_and_children_survival_rate = women_and_children.mean()
print(f'The percentage of women and children that survived: {women_and_children_survival_rate:.2%}')


not_women_and_children = train.loc[~is_woman_or_child, 'Survived']
not_women_and_children_survival_rate = not_women_and_children.mean()
print(f'The percentage of older men that survived: {not_women_and_children_survival_rate:.2%}')

The percentage of women and children that survived: 71.75%
The percentage of older men that survived: 16.39%


In [8]:
# Rule-based predictions used by the confusion-matrix cells
# (1 = rule predicts survival, 0 = rule predicts death).

# Vectorized boolean masks replace three row-by-row iterrows() passes.
# A missing Age compares False against 16, exactly as the per-row check did.
is_female = train['Sex'] == 'female'
is_young = train['Age'] < 16

# Keep the plain-list names in case other cells refer to them.
pred_women_list = is_female.astype(int).tolist()
pred_young_list = is_young.astype(int).tolist()
pred_women_and_children_list = (is_female | is_young).astype(int).tolist()

# add columns into df
train['pred_women'] = pred_women_list
train['pred_young'] = pred_young_list
train['pred_women_and_children'] = pred_women_and_children_list

In [25]:
# Score the "women survive" rule against the recorded outcomes
women_actual = train['Survived']
women_predicted = train['pred_women']

# scikit-learn lays the matrix out with actual classes as rows and
# predicted classes as columns.
women_confusion_matrix = confusion_matrix(women_actual, women_predicted)
women_accuracy = accuracy_score(women_actual, women_predicted)
women_precision = precision_score(women_actual, women_predicted)
women_recall = recall_score(women_actual, women_predicted)
women_f1 = f1_score(women_actual, women_predicted)

print('Women Confusion Matrix:\n', women_confusion_matrix)
# One print call, one metric per line
print(
    f'Women Accuracy: {women_accuracy:.4f}',
    f'Women Precision: {women_precision:.4f}',
    f'Women Recall: {women_recall:.4f}',
    f'Women F1 Score: {women_f1:.4f}',
    sep='\n',
)

Women Confusion Matrix:
 [[468  81]
 [109 233]]
Women Accuracy: 0.7868
Women Precision: 0.7420
Women Recall: 0.6813
Women F1 Score: 0.7104


In [26]:
# Score the "young passengers survive" rule against the recorded outcomes
young_actual = train['Survived']
young_predicted = train['pred_young']

# scikit-learn lays the matrix out with actual classes as rows and
# predicted classes as columns.
young_confusion_matrix = confusion_matrix(young_actual, young_predicted)
young_accuracy = accuracy_score(young_actual, young_predicted)
young_precision = precision_score(young_actual, young_predicted)
young_recall = recall_score(young_actual, young_predicted)
young_f1 = f1_score(young_actual, young_predicted)

print('Young Confusion Matrix:\n', young_confusion_matrix)
# One print call, one metric per line
print(
    f'Young Accuracy: {young_accuracy:.4f}',
    f'Young Precision: {young_precision:.4f}',
    f'Young Recall: {young_recall:.4f}',
    f'Young F1 Score: {young_f1:.4f}',
    sep='\n',
)

Young Confusion Matrix:
 [[515  34]
 [293  49]]
Young Accuracy: 0.6330
Young Precision: 0.5904
Young Recall: 0.1433
Young F1 Score: 0.2306


In [27]:
# Score the "women and children survive" rule against the recorded outcomes
women_and_children_actual = train['Survived']
women_and_children_predicted = train['pred_women_and_children']

# scikit-learn lays the matrix out with actual classes as rows and
# predicted classes as columns.
women_and_children_confusion_matrix = confusion_matrix(women_and_children_actual, women_and_children_predicted)
women_and_children_accuracy = accuracy_score(women_and_children_actual, women_and_children_predicted)
women_and_children_precision = precision_score(women_and_children_actual, women_and_children_predicted)
women_and_children_recall = recall_score(women_and_children_actual, women_and_children_predicted)
women_and_children_f1 = f1_score(women_and_children_actual, women_and_children_predicted)

print('Women and Children Confusion Matrix:\n', women_and_children_confusion_matrix)
# One print call, one metric per line
print(
    f'Women and Children Accuracy: {women_and_children_accuracy:.4f}',
    f'Women and Children Precision: {women_and_children_precision:.4f}',
    f'Women and Children Recall: {women_and_children_recall:.4f}',
    f'Women and Children F1 Score: {women_and_children_f1:.4f}',
    sep='\n',
)

Women and Children Confusion Matrix:
 [[449 100]
 [ 88 254]]
Women and Children Accuracy: 0.7890
Women and Children Precision: 0.7175
Women and Children Recall: 0.7427
Women and Children F1 Score: 0.7299


In [None]:
# TODO: apply the rule-based filter to the test data and upload the predictions to Kaggle as a baseline