In [None]:
'''
Titanic Survival Prediction
------------------------------------------------
Goal:
  - Predict passenger survival using a simple Logistic Regression model.
  - Demonstrate a standard ML workflow: load → inspect → clean → EDA → encode → split → train → evaluate.

Notes:
  - Keep random_state fixed for reproducibility.
  - Use simple, sensible imputations (median for Age, mode for Embarked).
  - Encode categoricals with one-hot (drop_first=True to avoid the dummy variable trap).
'''

In [None]:
# =============== 1) Imports ===============

import numpy as np               # numerical helpers
import pandas as pd              # data wrangling
import matplotlib.pyplot as plt  # plotting
import seaborn as sns            # better looking plots on top of matplotlib

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

# =============== 2) Load & Inspect ===============
# Location of the Titanic training CSV, kept in one named constant so the
# notebook can be pointed at another copy of the file by editing a single line.
DATA_PATH = '/users/pranaypakki/Tech/CSVfiles/titanic/train.csv'
df = pd.read_csv(DATA_PATH)

# Quick, standard checks to understand the data shape and types
print(df.head())     # a peek at the first 5 rows
print("-" * 50)
# info() writes its report (column types + non-null counts) straight to stdout
# and returns None, so it is called bare; wrapping it in print() emitted an
# extra "None" line after the report.
df.info()
print(df.describe()) # summary stats for numeric columns

In [None]:
# =============== 3) Clean Missing Data ===============
# Count the missing entries in every column; these counts drive the cleaning strategy
missing_per_column = df.isna().sum()
print(missing_per_column)

In [None]:
# Handle missing values: drop Cabin, fill Age with its median and Embarked with its mode

# Cabin is removed because too many of its values are missing.
# The membership check keeps this step a no-op when the column is already gone.
if 'Cabin' in df.columns:
    del df['Cabin']

# Replacement per column: median for numeric Age, most frequent value for Embarked
fill_values = {
    'Age': df['Age'].median(),
    'Embarked': df['Embarked'].mode()[0],
}
for column, replacement in fill_values.items():
    df[column] = df[column].fillna(replacement)

# Re-count nulls per column to confirm the result of the cleaning
print(df.isnull().sum())

In [None]:
# =============== 4) Exploratory Data Analysis (EDA) ===============
# Class balance, how many survived vs not

# Put the readable outcome names into the hue values so seaborn builds the
# legend itself. Relabelling afterwards with plt.legend(labels=[...]) pairs
# labels with plot artists purely by their order, a call form matplotlib
# discourages because the pairing is easily mixed up.
survival_labels = df['Survived'].map({0: 'No', 1: 'Yes'})

ax = sns.countplot(data=df, x='Survived', hue=survival_labels, hue_order=['No', 'Yes'])
ax.set_title('Survived');

In [None]:
# Survival by passenger class, higher class tends to have higher survival

# Encode the outcome names in the hue values so the legend entries are tied to
# the data rather than to artist order. plt.legend(labels=[...]) assigns labels
# to whichever artists come first on the axes, which for grouped bars need not
# be one bar per outcome.
survival_labels = df['Survived'].map({0: 'No', 1: 'Yes'})

ax = sns.countplot(data=df, x='Pclass', hue=survival_labels, hue_order=['No', 'Yes'])
ax.set_title('Survival by Class');

In [None]:
# Survival by gender: only x and hue are supplied, so each bar summarises the
# 'Survived' column for one sex (seaborn's default bar estimator is the mean)
ax = sns.barplot(x='Survived', hue='Sex', data=df)
ax.set_xlabel('Rate of Survival')
ax.set_title('Survival Rate by Gender')

In [None]:
# Age distribution split by survival, younger passengers survived a bit better

# Carry the outcome names in the hue values and let seaborn draw the legend.
# A histogram with KDE adds many bar patches and lines to the axes, so
# relabelling by order with plt.legend(labels=[...]) can attach 'No'/'Yes'
# to the wrong artists.
survival_labels = df['Survived'].map({0: 'No', 1: 'Yes'})

ax = sns.histplot(
    data=df,
    x='Age',
    hue=survival_labels,
    hue_order=['No', 'Yes'],
    kde=True,
    multiple='dodge',
)
ax.set_title('Age Distribution by Survival');

In [None]:
# =============== 5) Feature Engineering ===============
# One-hot encode categoricals. drop_first=True removes one column per category.
# Only columns that are still present are encoded, so re-running this cell after
# it has already executed is a no-op instead of raising a KeyError.
categorical_cols = [col for col in ['Sex', 'Embarked'] if col in df.columns]
if categorical_cols:
    df = pd.get_dummies(df, columns=categorical_cols, drop_first=True)

# Drop columns that are identifiers or not useful for this model.
# errors='ignore' tolerates columns that an earlier run already removed.
df = df.drop(columns=['PassengerId', 'Name', 'Ticket'], errors='ignore')

In [None]:
# Separate the target from the predictors
y = df['Survived']                  # label the model learns to predict
X = df.drop(columns=['Survived'])   # every remaining column is a feature

In [None]:
# =============== 6) Train / Test Split ===============
# Keep 20% of the rows aside for evaluation; the fixed random_state makes the split repeatable
split_parts = train_test_split(X, y, test_size=0.2, random_state=42)
X_train, X_test, y_train, y_test = split_parts

# Report (rows, columns) of each feature matrix
print(
    "X_train:", X_train.shape,
    "X_test:", X_test.shape,
)

In [None]:
# =============== 7) Model: Logistic Regression ===============
# max_iter is raised to 1000 to give the solver room to converge on this small dataset
model = LogisticRegression(max_iter=1000)
model.fit(X_train, y_train)

# Predict labels for the held-out rows
y_pred = model.predict(X_test)

# =============== 8) Evaluation ===============
# Compute each metric once, then reuse it for the text output and the plot
test_accuracy = accuracy_score(y_test, y_pred)
test_confusion = confusion_matrix(y_test, y_pred)
test_report = classification_report(y_test, y_pred)

print("Accuracy:", test_accuracy)
print("Confusion Matrix:\n", test_confusion)
print("Classification Report:\n", test_report)

# Annotated heatmap of the same confusion matrix (integer counts, no colour bar)
ax = sns.heatmap(test_confusion, annot=True, fmt='d', cbar=False)
ax.set_title('Confusion Matrix (Test Set)')
ax.set_xlabel('Predicted')
ax.set_ylabel('Actual')
plt.show()


In [None]:
# =============== 9) Conclusion ===============
'''
- Logistic Regression achieved an **accuracy of 81%** on the test set.  
- The confusion matrix shows the model balanced both survival and non-survival predictions.  
- Key insights from EDA and model:  
  - Gender: Women had significantly higher survival rates than men.  
  - Class: Passengers in **1st class** had much better survival chances compared to 3rd class.  
  - Age: Younger passengers had slightly better survival odds.
'''