In [None]:
# Get the data: download the Titanic CSV and store it next to the notebook.

import requests

url = "https://raw.githubusercontent.com/datasciencedojo/datasets/master/titanic.csv"
# A timeout prevents the cell from hanging forever on a stalled connection.
response = requests.get(url, timeout=30)
# Raise on 4xx/5xx so an HTTP error page is never written to disk as if it
# were the dataset.
response.raise_for_status()
with open("titanic.csv", "wb") as file:
    file.write(response.content)


# Shell alternative:
#wget https://raw.githubusercontent.com/datasciencedojo/datasets/master/titanic.csv

In [None]:
# Import necessary libraries
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from imblearn.over_sampling import RandomOverSampler, SMOTE
from imblearn.under_sampling import RandomUnderSampler
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
# Read the downloaded Titanic CSV into a DataFrame
csv_path = 'titanic.csv'
df = pd.read_csv(csv_path)

In [None]:
# Data Exploration -- a quick first look; df.describe() and df.info() are
# natural follow-ups if more detail is needed.
print(df.head())
# Scatter plot to check visually for a relationship between Age and Fare
df.plot.scatter(x='Age', y='Fare')

In [None]:
# Data Cleaning
# Remove the columns that are not used as model features.
unused_columns = ['Name', 'Cabin', 'Ticket', 'PassengerId']
df = df.drop(columns=unused_columns)

# Replace missing ages with the median age.
median_age = df['Age'].median()
df['Age'] = df['Age'].fillna(median_age)

In [None]:
# Feature Engineering
# FamilySize = SibSp + Parch + 1
df['FamilySize'] = 1 + df['SibSp'] + df['Parch']
# IsAlone is 1 when FamilySize equals 1, otherwise 0
df['IsAlone'] = (df['FamilySize'] == 1).astype('int64')

In [None]:
# Imputation: fill missing values in the categorical columns with each
# column's most frequent value.
categorical_columns = ['Sex', 'Embarked']
imputer = SimpleImputer(strategy='most_frequent')
imputed_values = imputer.fit_transform(df[categorical_columns])
df[categorical_columns] = imputed_values

In [None]:
# One-hot encoding for Embarked
# pd.get_dummies returns a NEW DataFrame and does not modify df in place, so
# the result has to be assigned back; otherwise the encoded columns are lost.
# With `columns=[...]` the source 'Embarked' column is already replaced by the
# dummy columns, so no separate drop is required.
# dtype=int keeps the indicators numeric (0/1) for the correlation matrix,
# resampling and scaling steps that follow.
df = pd.get_dummies(
    df,
    columns=['Embarked'],
    prefix='Embarked',
    drop_first=True,
    dtype=int,
)

In [None]:
# Encode Sex as a binary indicator: female -> 1, male -> 0
sex_codes = {'female': 1, 'male': 0}
df['Sex'] = df['Sex'].map(sex_codes)

In [None]:
# Examine correlation matrix: pairwise correlations drawn as an annotated heatmap
corr_matrix = df.corr()
fig, ax = plt.subplots(figsize=(12, 10))
sns.heatmap(data=corr_matrix, cmap=plt.cm.Reds, annot=True, ax=ax)
plt.show()

In [None]:
# Feature Selection -- SibSp and Parch are highly correlated with the
# engineered FamilySize variable, so they are removed.
redundant_columns = ['SibSp', 'Parch']
df = df.drop(columns=redundant_columns)

In [None]:
# Display the prepared DataFrame to inspect the columns that remain before modelling
df

In [None]:
# Dealing with Imbalanced Classes
# Separate the features from the target column.
X = df.drop(columns=['Survived'])
y = df['Survived']

# Seed every sampler so the resampled sets are reproducible between runs.
ros = RandomOverSampler(random_state=42)
rus = RandomUnderSampler(random_state=42)
smote = SMOTE(random_state=42)

# Build one resampled copy of the data per strategy.
# NOTE(review): these are resampled from the full dataset; if one of them is
# later split into train/test, resampled rows can end up on both sides of the
# split. Prefer resampling only the training portion.
X_ros, y_ros = ros.fit_resample(X, y)
X_rus, y_rus = rus.fit_resample(X, y)
X_smote, y_smote = smote.fit_resample(X, y)

In [None]:
# Split the data into training and testing sets
# Split the ORIGINAL data first and oversample only the training portion.
# Splitting data that was already oversampled lets synthetic points derived
# from test-side rows leak into training, and evaluates the model on synthetic
# rows, which inflates the reported metrics.
# stratify=y keeps the class ratio the same in both partitions.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)
# Balance the classes in the training set only; the test set stays untouched.
X_train, y_train = SMOTE(random_state=42).fit_resample(X_train, y_train)

In [None]:
# Data Scaling -- the scaler learns its statistics from the training set only,
# then the same transformation is applied to both partitions, so nothing from
# the testing set leaks into the fit.
scaler = StandardScaler()
scaler.fit(X_train)
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)

In [None]:
# Hyperparameter Tuning
# Run a 5-fold cross-validated grid search on the training set for each of
# the three candidate models.
from sklearn.model_selection import GridSearchCV

# The tree-based estimators are stochastic; a fixed random_state makes the
# selected parameters and the fitted models reproducible between runs.
lr = LogisticRegression()
dt = DecisionTreeClassifier(random_state=42)
rf = RandomForestClassifier(random_state=42)

# Logistic regression: search over the regularisation parameter C.
param_grid = {'C': [0.01, 0.1, 1, 10, 100]}
grid_lr = GridSearchCV(lr, param_grid, cv=5)
grid_lr.fit(X_train, y_train)

print('Best Parameters for Logistic Regression:', grid_lr.best_params_)

# Decision tree: search over tree depth and minimum leaf size.
param_grid = {'max_depth': [3, 5, 7, 10], 'min_samples_leaf': [2, 5, 10]}
grid_dt = GridSearchCV(dt, param_grid, cv=5)
grid_dt.fit(X_train, y_train)

print('Best Parameters for Decision Tree:', grid_dt.best_params_)

# Random forest: search over the number of trees and tree depth.
param_grid = {'n_estimators': [50, 100, 150], 'max_depth': [3, 5, 7, 10]}
grid_rf = GridSearchCV(rf, param_grid, cv=5)
grid_rf.fit(X_train, y_train)

print('Best Parameters for Random Forest:', grid_rf.best_params_)

In [None]:
# Final Model
# GridSearchCV was created with its default refit=True, so best_estimator_ is
# already refit on the whole training set with the best parameters found.
# Fitting it again would only repeat that work.
final_model = grid_rf.best_estimator_
final_model

In [None]:
# Model Evaluation on Test Set
# Predict on the test set and report each classification metric in turn.
y_pred = final_model.predict(X_test)
print('Final Model Evaluation on Test Set')
metrics = (
    ('Accuracy:', accuracy_score),
    ('Precision:', precision_score),
    ('Recall:', recall_score),
    ('F1 Score:', f1_score),
)
for label, metric in metrics:
    print(label, metric(y_test, y_pred))

In [None]:
# Feature Importance
# Plot the fitted model's feature importances as a horizontal bar chart,
# sorted ascending so the most important feature ends up at the top.
feature_importance = final_model.feature_importances_
feature_names = X.columns
sorted_idx = feature_importance.argsort()

# matplotlib.pyplot is already imported as plt in the notebook's import cell,
# so it is not imported again here.
positions = range(len(sorted_idx))
plt.barh(positions, feature_importance[sorted_idx])
plt.yticks(positions, [feature_names[i] for i in sorted_idx])
plt.xlabel('Feature Importance')
plt.title('Feature Importance of the Final Model')
plt.show()