In [None]:
# STEP 1: Load the libraries we'll use
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.linear_model import LogisticRegression, LinearRegression
from sklearn.metrics import (
    accuracy_score, precision_score, recall_score,
    mean_squared_error, r2_score,
    confusion_matrix, ConfusionMatrixDisplay
)

In [None]:
# STEP 2: Load the Titanic dataset
# `data` is kept as the untouched raw frame; every later step works on the
# independent copy `df`, so the raw values remain available for reference.
data = sns.load_dataset(name="titanic")
df = data.copy(deep=True)

In [None]:
# STEP 3: Clean the data
# The cleaned frame is rebuilt from the untouched `data` frame in one chain,
# so re-running this cell always gives the same result (an in-place drop
# would raise KeyError on a second run because the columns are already gone).
df = (
    data
    # Remove the columns this analysis does not use.
    .drop(columns=["deck", "embark_town", "alive"])
    # Fill missing ages with the median value. The result is assigned back as
    # a column rather than calling fillna(inplace=True) on df["age"]: that form
    # acts on an intermediate selection, which recent pandas versions warn
    # about and which does not update the frame under Copy-on-Write.
    .assign(age=lambda frame: frame["age"].fillna(frame["age"].median()))
    # Drop any other rows with missing values.
    .dropna()
)

In [None]:
# STEP 4: Convert text columns to numbers
# Each column gets its own LabelEncoder, stored in `encoders` by column name,
# so the fitted mapping of every column stays available afterwards (e.g.
# encoders["sex"].classes_ or encoders["sex"].inverse_transform(...)).
# Re-fitting a single shared encoder would keep only the last column's mapping.
# After the loop `encoder` refers to the "embarked" encoder.
# NOTE: re-running this cell without re-running STEP 3 re-fits the encoders on
# the already-encoded integers, which discards the original text labels.
encoders = {}
for column in ["sex", "embarked"]:
    encoder = LabelEncoder()
    df[column] = encoder.fit_transform(df[column])
    encoders[column] = encoder

In [None]:
# STEP 5 + 6: Prepare inputs (X) and target (y), then standardize numeric features
#
# The split happens BEFORE scaling. Fitting the scaler on the full table would
# let the test rows influence the mean/variance used to transform the training
# rows (data leakage), which makes the evaluation below slightly optimistic.

# Inputs and target
X = df[["age", "fare", "pclass", "sex", "embarked"]]
y = df["survived"]

# Hold out 20% of the rows for testing; stratify=y keeps the class ratio of
# `survived` the same in both parts, random_state makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# Standardize some numeric features.
# The scaler learns its statistics from the training rows only and the same
# fitted transformation is then applied to the test rows.
columns_to_scale = ["age", "fare", "pclass"]
scaler = StandardScaler()

# Work on explicit copies so the assignments below modify only the
# train/test frames; `df` itself keeps its original, unscaled values.
X_train = X_train.copy()
X_test = X_test.copy()
X_train[columns_to_scale] = scaler.fit_transform(X_train[columns_to_scale])
X_test[columns_to_scale] = scaler.transform(X_test[columns_to_scale])

In [None]:
# STEP 7: Try Linear Regression (just for fun)
# The target is fitted as if it were a continuous quantity, so the model is
# scored with regression metrics (MSE and R²) rather than classification ones.
linear_model = LinearRegression().fit(X_train, y_train)
linear_preds = linear_model.predict(X_test)

# Compute the metrics first, then report them.
linear_mse = mean_squared_error(y_test, linear_preds)
linear_r2 = r2_score(y_test, linear_preds)

print("🔹 Linear Regression Results")
print(f"Mean Squared Error: {linear_mse:.4f}")
print(f"R² Score          : {linear_r2:.4f}\n")

In [None]:
# STEP 8: Try Logistic Regression (better for classification)
# max_iter=500 raises the solver's iteration cap above the library default.
log_model = LogisticRegression(max_iter=500).fit(X_train, y_train)
log_preds = log_model.predict(X_test)

# Score the predicted class labels against the held-out labels.
log_accuracy = accuracy_score(y_test, log_preds)
log_precision = precision_score(y_test, log_preds)
log_recall = recall_score(y_test, log_preds)

print("🔹 Logistic Regression Results")
print(f"Accuracy : {log_accuracy:.4f}")
print(f"Precision: {log_precision:.4f}")
print(f"Recall   : {log_recall:.4f}")

In [None]:
# STEP 9: Show a confusion matrix to see how well it did
# Rows of the matrix are the true labels, columns the predicted labels.
cm = confusion_matrix(y_test, log_preds)

# plot() draws the matrix and returns the display object, whose `ax_`
# attribute is the axes it drew on; the title is set on that axes directly.
disp = ConfusionMatrixDisplay(confusion_matrix=cm).plot()
disp.ax_.set_title("Confusion Matrix - Logistic Regression")
plt.show()

In [None]:
# STEP 10: Some fun visualizations!
# These charts describe the passengers themselves, so they are drawn from the
# raw `data` frame. By this point `df` has been encoded for modelling (and may
# have been standardized), so plotting it would show integer codes instead of
# "male"/"female" and transformed values instead of real ages.
fig, axs = plt.subplots(1, 2, figsize=(12, 5))

# Age distribution (missing ages are simply left out of the histogram, which
# avoids the artificial spike that median-filled values would create)
sns.histplot(data["age"], kde=True, ax=axs[0])
axs[0].set_title("Age Distribution of Passengers")
axs[0].set_xlabel("Age")
axs[0].set_ylabel("Number of passengers")

# Survival by gender
sns.countplot(data=data, x="sex", hue="survived", ax=axs[1])
axs[1].set_title("Survival by Gender")
axs[1].set_xlabel("Sex")
axs[1].set_ylabel("Number of passengers")

plt.tight_layout()
plt.show()