In [36]:
#Import applicable libraries
import pandas as pd
import sklearn as sk
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error

# 1. Dataset Selection (source: https://www.kaggle.com/datasets/yasserh/titanic-dataset)

# Import Dataset
df = pd.read_csv('Titanic-Dataset.csv')

# 2. Data Preprocessing

#delete rows from the dataset that contain missing values
df.dropna(inplace=True)

# delete columns with unnecessary data, repetative data, and those with mostly null values
df.drop(['Ticket', 'Cabin'], axis=1, inplace=True)

#round values in fare column to 2 decimal places

df['Fare'] = df['Fare'].round(2)

# Recode Embarked Data

df["Embarked"] = df["Embarked"].map({"C": 1, "Q": 2, "S": 3})

# Recode Sex Data

df["Sex"] = df["Sex"].map({"male": 1, "female": 2})

#select features and target variables

x = df[['Sex', 'Age', 'Fare', 'Pclass', 'Embarked']]
y = df['Survived']

#split data intp training and testing sets

x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=42)

#create and train the linear regression model

model = LinearRegression()
model.fit(x_train, y_train)

In [37]:
# Model Implementation: Implement a Random Forest classifier using scikit-learn or any other Python machine learning library of your choice.

from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import confusion_matrix, classification_report, accuracy_score

# Model implementation and trainng

forest = RandomForestClassifier(n_estimators=500, random_state=42)
forest.fit(x_train, y_train)

# Make Predictions

forestPredictions = forest.predict(x_test)
print(confusion_matrix(y_test, forestPredictions))
print(classification_report(y_test, forestPredictions))

#Feature Performance

feature_importances = pd.DataFrame(forest.feature_importances_, index = x_train.columns, columns=['importance']).sort_values('importance', ascending=False)
print(feature_importances)

[[ 6  8]
 [ 3 20]]
              precision    recall  f1-score   support

           0       0.67      0.43      0.52        14
           1       0.71      0.87      0.78        23

    accuracy                           0.70        37
   macro avg       0.69      0.65      0.65        37
weighted avg       0.70      0.70      0.68        37

          importance
Age         0.353597
Fare        0.341957
Sex         0.240757
Embarked    0.041972
Pclass      0.021718


In [38]:
# Model Evaluation: Evaluate the performance of the trained model using appropriate evaluation metrics such as accuracy, precision, recall, and F1-score. Use the testing data for evaluation.

# Precision: The ratio of correctly predicted positive observations to the total predicted positives for class 0 is 0.67; the ratio for class 1 is 0.71.
# Recall: The ratio of correctly predicted positive observations to all actual positives in the dataset for class 0 is 0.43; the ration for class 1 is 0.87.
# F1 Score: The mean of precision and recall for class 0 is 0.52; the mean of precision and recall for class 1 is 0.78.
# Support: The actual number of occurrences of the class in the specified dataset: class 0 = 14; class 1 = 23.
# Accuracy: The ratio of correctly predicted instances to the total instances in the dataset is 0.70.
# Macro Avg: Macro average precision: 0.69
#Macro average recall = 0.65
#Macro average F1-score = 0.65
#Weighted Ave:
#Weighted average precision = 0.70
#Weighted average recall = 0.70
#Weighted average F1-score = 0.68


In [39]:
# Feature Importance: Build and interpret a feature importance for the variables in your model and dataset to determine the variables with the largest impact on your outcome.

feature_importances = pd.DataFrame(forest.feature_importances_, index = x_train.columns, columns=['importance']).sort_values('importance', ascending=False)

print(feature_importances)

          importance
Age         0.353597
Fare        0.341957
Sex         0.240757
Embarked    0.041972
Pclass      0.021718


In [40]:
# Results Analysis: Analyze the results of the model evaluation and discuss the impact of different hyperparameters on the model performance.

# From the Titanic dataset, age, fare, and sex seem to have the biggest impact on survivability. While age and sex might be attributed to strength and ability, fare is a different story. Potentially the wealthy were able to secure spots higher on the ship and were thus able to have quicker access to the lifeboats.