In [1]:
# Notebook 7: Key Insights

# Example insights
# 1. Women survived at a much higher rate than men.
# 2. Passengers in 1st class had higher survival rates.
# 3. Children had a higher chance of survival than adults.
# 4. Fare and class show strong correlation with survival.

# Next steps:
# - Optional predictive modeling
# - Visualizations for report
# - Export figures for Reports/figures/


In [2]:
# Notebook 7: Key Insights

import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# --------------------------
# Step 1: Load Cleaned Data
# --------------------------
import os  # local import: only needed to read the optional path override

# Location of the cleaned training CSV.
# The absolute path below is machine-specific; it is kept as the default so the
# notebook behaves exactly as before on the original machine. On any other
# machine, set the TITANIC_TRAIN_CLEAN_PATH environment variable to the CSV's
# location instead of editing this cell.
train_clean_path = os.environ.get(
    "TITANIC_TRAIN_CLEAN_PATH",
    r"C:\Users\HP\Documents\GitHub\DataMining-Portfolio\01_Titanic_EDA\Data\processed\train_clean.csv",
)
train = pd.read_csv(train_clean_path)

# --------------------------
# Step 2: Generate Key Insights
# --------------------------

# Gender: mean of 'Survived' over the rows belonging to each sex
female_survival = train.loc[train['Sex'].eq('female'), 'Survived'].mean()
male_survival = train.loc[train['Sex'].eq('male'), 'Survived'].mean()
print(f"Female survival rate: {female_survival:.2f}")
print(f"Male survival rate: {male_survival:.2f}")

# Passenger class: one mean per distinct 'Pclass' value
pclass_survival = train['Survived'].groupby(train['Pclass']).mean()
print("\nSurvival rate by passenger class:\n", pclass_survival)

# Age split at 18: rows with Age < 18 vs. rows with Age >= 18
# (rows with a missing Age match neither comparison and are left out of both)
children_survival = train.loc[train['Age'].lt(18), 'Survived'].mean()
adult_survival = train.loc[train['Age'].ge(18), 'Survived'].mean()
print(f"\nChildren survival rate: {children_survival:.2f}")
print(f"Adult survival rate: {adult_survival:.2f}")

# --------------------------
# Step 3: Save Plots for Reports
# --------------------------
import os  # local import: path handling for the export step

# Output folder for the report figures.
# The machine-specific absolute path is kept as the default; set the
# TITANIC_FIGURES_DIR environment variable to export somewhere else.
# os.path.join(..., "") guarantees a trailing separator, so code that still
# builds paths as `figures_path + "name.png"` keeps working.
figures_path = os.path.join(
    os.environ.get(
        "TITANIC_FIGURES_DIR",
        r"C:\Users\HP\Documents\GitHub\DataMining-Portfolio\01_Titanic_EDA\Reports\figures\\",
    ),
    "",
)

# savefig does not create missing folders, so make sure the target exists.
os.makedirs(figures_path, exist_ok=True)


def _export_figure(fig, filename):
    """Write `fig` into `figures_path` as `filename`, then close the figure.

    Closing releases the figure so it is neither rendered inline nor kept
    in memory after it has been written to disk.
    """
    fig.savefig(os.path.join(figures_path, filename))
    plt.close(fig)


# Count plots: passengers per category, split by the 'Survived' value
for column, title, filename in [
    ('Sex', "Survival by Gender", "sex_vs_survival.png"),
    ('Pclass', "Survival by Class", "class_vs_survival.png"),
]:
    fig, ax = plt.subplots(figsize=(6, 4))
    sns.countplot(x=column, hue='Survived', data=train, ax=ax)
    ax.set_title(title)
    _export_figure(fig, filename)

# Age histogram (30 bins) with a KDE curve overlaid
fig, ax = plt.subplots(figsize=(8, 4))
sns.histplot(train['Age'], bins=30, kde=True, ax=ax)
ax.set_title("Age Distribution")
_export_figure(fig, "age_distribution.png")


Female survival rate: 0.74
Male survival rate: 0.19

Survival rate by passenger class:
 Pclass
1    0.629630
2    0.472826
3    0.242363
Name: Survived, dtype: float64

Children survival rate: 0.54
Adult survival rate: 0.36


In [3]:
# Notebook 7: Key Insights
# Fully corrected and ready-to-run

import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# --------------------------
# Step 1: Load Cleaned Data
# --------------------------
import os  # local import: only needed to read the optional path override

# Location of the cleaned training CSV.
# The absolute path below is machine-specific; it is kept as the default so the
# notebook behaves exactly as before on the original machine. On any other
# machine, set the TITANIC_TRAIN_CLEAN_PATH environment variable to the CSV's
# location instead of editing this cell.
train_clean_path = os.environ.get(
    "TITANIC_TRAIN_CLEAN_PATH",
    r"C:\Users\HP\Documents\GitHub\DataMining-Portfolio\01_Titanic_EDA\Data\processed\train_clean.csv",
)
train = pd.read_csv(train_clean_path)

# --------------------------
# Step 2: Key Insights Calculations
# --------------------------

# Gender: mean of 'Survived' over the rows belonging to each sex
female_survival = train.loc[train['Sex'].eq('female'), 'Survived'].mean()
male_survival = train.loc[train['Sex'].eq('male'), 'Survived'].mean()
print(f"Female survival rate: {female_survival:.2f}")
print(f"Male survival rate: {male_survival:.2f}")

# Passenger class: one mean per distinct 'Pclass' value
pclass_survival = train['Survived'].groupby(train['Pclass']).mean()
print("\nSurvival rate by passenger class:\n", pclass_survival)

# Age split at 18: rows with Age < 18 vs. rows with Age >= 18
# (rows with a missing Age match neither comparison and are left out of both)
children_survival = train.loc[train['Age'].lt(18), 'Survived'].mean()
adult_survival = train.loc[train['Age'].ge(18), 'Survived'].mean()
print(f"\nChildren survival rate: {children_survival:.2f}")
print(f"Adult survival rate: {adult_survival:.2f}")

# --------------------------
# Step 3: Save Plots for Reports
# --------------------------
import os  # local import: path handling for the export step

# Output folder for the report figures.
# The machine-specific absolute path is kept as the default; set the
# TITANIC_FIGURES_DIR environment variable to export somewhere else.
# os.path.join(..., "") guarantees a trailing separator, so code that still
# builds paths as `figures_path + "name.png"` keeps working.
figures_path = os.path.join(
    os.environ.get(
        "TITANIC_FIGURES_DIR",
        r"C:\Users\HP\Documents\GitHub\DataMining-Portfolio\01_Titanic_EDA\Reports\figures\\",
    ),
    "",
)

# savefig does not create missing folders, so make sure the target exists.
os.makedirs(figures_path, exist_ok=True)


def _export_figure(fig, filename):
    """Write `fig` into `figures_path` as `filename`, then close the figure.

    Closing releases the figure so it is neither rendered inline nor kept
    in memory after it has been written to disk.
    """
    fig.savefig(os.path.join(figures_path, filename))
    plt.close(fig)


# Count plots: passengers per category, split by the 'Survived' value
for column, title, filename in [
    ('Sex', "Survival by Gender", "sex_vs_survival.png"),
    ('Pclass', "Survival by Class", "class_vs_survival.png"),
]:
    fig, ax = plt.subplots(figsize=(6, 4))
    sns.countplot(x=column, hue='Survived', data=train, ax=ax)
    ax.set_title(title)
    _export_figure(fig, filename)

# Age histogram (30 bins) with a KDE curve overlaid
fig, ax = plt.subplots(figsize=(8, 4))
sns.histplot(train['Age'], bins=30, kde=True, ax=ax)
ax.set_title("Age Distribution")
_export_figure(fig, "age_distribution.png")

print("\nAll plots saved to Reports/figures/")


Female survival rate: 0.74
Male survival rate: 0.19

Survival rate by passenger class:
 Pclass
1    0.629630
2    0.472826
3    0.242363
Name: Survived, dtype: float64

Children survival rate: 0.54
Adult survival rate: 0.36

All plots saved to Reports/figures/
