In [2]:
# eda_titanic.py (or run as notebook cells)

import os
import warnings
warnings.filterwarnings("ignore")

import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import plotly.express as px

# 1) Data loading
# Option A: load Seaborn's built-in Titanic (categorical-coded)
titanic = sns.load_dataset("titanic")

# Option B: if using Kaggle CSV, uncomment and set path
# titanic = pd.read_csv("titanic.csv")

# Every figure and CSV report written by this script goes under ./figures.
os.makedirs("figures", exist_ok=True)

# 2) Quick overview
print("Shape:", titanic.shape)
print("\nHead:\n", titanic.head())
print("\nInfo:")
# DataFrame.info() writes its report to stdout itself and returns None, so
# wrapping it in print() would append a stray "None" line after the report.
titanic.info()
print("\nMissing values per column:\n", titanic.isna().sum())

# 3) Basic cleaning / type handling (minimal for EDA)
# No values are modified here; the columns are only partitioned by dtype.
# NumPy-numeric columns feed the statistics and numeric plots, and every
# remaining column is handled as categorical.
numeric_cols = list(titanic.select_dtypes(include=np.number).columns)
cat_cols = list(titanic.select_dtypes(exclude=np.number).columns)

# 4) Summary statistics
# Transposed so that each numeric column becomes one row of the report.
desc = titanic.loc[:, numeric_cols].describe().transpose()
print("\nSummary statistics:\n", desc)

# Persist the report; the row index (column names) is written by default.
desc.to_csv("figures/summary_statistics.csv")

# 5) Univariate distributions: histograms and boxplots
# Two PNGs are written per numeric column: figures/hist_<col>.png and
# figures/box_<col>.png.
for col in numeric_cols:
    # Histogram with a KDE overlay; missing values are dropped first.
    hist_fig, hist_ax = plt.subplots(figsize=(6, 4))
    sns.histplot(titanic[col].dropna(), kde=True, ax=hist_ax)
    hist_ax.set_title(f"Histogram: {col}")
    hist_fig.tight_layout()
    hist_fig.savefig(f"figures/hist_{col}.png", dpi=200)
    plt.close(hist_fig)

    # Horizontal boxplot of the same column.
    box_fig, box_ax = plt.subplots(figsize=(6, 4))
    sns.boxplot(x=titanic[col], ax=box_ax)
    box_ax.set_title(f"Boxplot: {col}")
    box_fig.tight_layout()
    box_fig.savefig(f"figures/box_{col}.png", dpi=200)
    plt.close(box_fig)

# 6) Categorical distributions
# For each non-numeric column: print its value counts and save a countplot
# to figures/count_<col>.png.
for col in cat_cols:
    # dropna=False so the printed table also reports the number of missing values.
    vc = titanic[col].value_counts(dropna=False)
    print(f"\nValue counts: {col}\n", vc)

    fig, ax = plt.subplots(figsize=(6, 4))
    # Draw on the Axes created above explicitly rather than relying on
    # pyplot's implicit "current axes", consistent with the other plots.
    sns.countplot(data=titanic, x=col, ax=ax)
    ax.set_title(f"Countplot: {col}")
    # Tilt the category labels so longer names do not overlap.
    ax.tick_params(axis="x", labelrotation=30)
    fig.tight_layout()
    fig.savefig(f"figures/count_{col}.png", dpi=200)
    # Close this specific figure so figures do not accumulate across the loop.
    plt.close(fig)

# 7) Bivariate relationships
# One entry per saved figure:
#   (plot kind, x column, y column, figure size, title, output path)
# "bar" plots show the mean of y per x group (survival rate by sex / class);
# "box" plots show the distribution of y per x group (fare / age by survival).
bivariate_specs = [
    ("bar", "sex", "survived", (6, 4), "Survival rate by sex", "figures/survival_by_sex.png"),
    ("bar", "class", "survived", (7, 4), "Survival rate by class", "figures/survival_by_class.png"),
    ("box", "survived", "fare", (6, 4), "Fare by survival", "figures/fare_by_survival.png"),
    ("box", "survived", "age", (6, 4), "Age by survival", "figures/age_by_survival.png"),
]

for plot_kind, x_col, y_col, fig_size, plot_title, out_path in bivariate_specs:
    fig, ax = plt.subplots(figsize=fig_size)
    if plot_kind == "bar":
        # Mean of the y column per group, drawn without error bars.
        sns.barplot(data=titanic, x=x_col, y=y_col, estimator=np.mean, errorbar=None, ax=ax)
    else:
        sns.boxplot(data=titanic, x=x_col, y=y_col, ax=ax)
    ax.set_title(plot_title)
    plt.tight_layout()
    plt.savefig(out_path, dpi=200)
    plt.close()

# 8) Correlation and pairwise relationships
# Pairwise correlation of the numeric columns, rendered as an annotated heatmap.
corr = titanic[numeric_cols].corr(numeric_only=True)
fig, ax = plt.subplots(figsize=(7, 6))
sns.heatmap(corr, annot=True, cmap="coolwarm", fmt=".2f", square=True, ax=ax)
ax.set_title("Correlation matrix (numeric)")
fig.tight_layout()
fig.savefig("figures/correlation_matrix.png", dpi=200)
plt.close(fig)

# Pairplot on selected columns to keep it light
pp_cols = [c for c in ["age", "fare", "pclass", "survived"] if c in titanic.columns]
if len(pp_cols) >= 2:
    g = sns.pairplot(
        titanic[pp_cols].dropna(),
        # Colour by survival only when that column made it into the selection.
        hue="survived" if "survived" in pp_cols else None,
        diag_kind="hist",
    )
    # Use the grid's `figure` attribute; `fig` is the deprecated spelling.
    # y > 1 lifts the title above the top row of subplots.
    g.figure.suptitle("Pairplot (selected)", y=1.02)
    g.savefig("figures/pairplot_selected.png", dpi=200)
    # Close the pairplot's own figure explicitly instead of whichever figure
    # pyplot currently considers active.
    plt.close(g.figure)

# 9) Skewness, outliers, anomalies
# Skewness per numeric column, most positive first.
skews = titanic[numeric_cols].skew(numeric_only=True)
skews = skews.sort_values(ascending=False)
print("\nSkewness:\n", skews)
skews.to_csv("figures/skewness.csv", index=True)

# Simple IQR-based outlier count per numeric col:
# a value is an outlier when it lies strictly outside
# [Q1 - 1.5*IQR, Q3 + 1.5*IQR]; missing values are ignored.
outlier_report = []
for col in numeric_cols:
    values = titanic[col].dropna()
    if not values.empty:
        q1 = values.quantile(0.25)
        q3 = values.quantile(0.75)
        whisker = 1.5 * (q3 - q1)
        # `between` is inclusive on both ends, so its negation is exactly
        # "strictly below the lower fence or strictly above the upper fence".
        inside_fences = values.between(q1 - whisker, q3 + whisker)
        outlier_report.append((col, int((~inside_fences).sum())))

outlier_df = pd.DataFrame(outlier_report, columns=["column", "iqr_outlier_count"])
outlier_df = outlier_df.sort_values("iqr_outlier_count", ascending=False)
print("\nOutlier counts (IQR):\n", outlier_df)
outlier_df.to_csv("figures/outlier_counts.csv", index=False)

# 10) Basic feature-level inferences (printed)
def inference_notes(df):
    """Return the canned EDA notes that apply to the columns present in *df*.

    Each note is a fixed sentence that is included when all of the column
    names it depends on pass an ``in df`` membership test. Nothing is
    computed from the data itself, so the notes are reminders of what to
    look for rather than verified findings.

    Args:
        df: Any container supporting ``name in df`` for column names, e.g. a
            DataFrame, its ``.columns`` index, or a plain list/set of names.

    Returns:
        A list of note strings, in a fixed order.
    """
    # (required column names, note emitted when all of them are present)
    rules = (
        (("sex", "survived"),
         "Females have a higher mean survival rate than males in this sample."),
        (("class", "survived"),
         "Higher passenger class associates with higher survival probability."),
        (("fare", "survived"),
         "Higher fare correlates modestly with survival; fare distribution is right-skewed with outliers."),
        (("age",),
         "Age is mildly right-skewed and has missing values; consider imputation strategies."),
        (("pclass",),
         "pclass is ordinal; check multicollinearity with fare and embarked/class features."),
    )
    return [
        note
        for required, note in rules
        if all(name in df for name in required)
    ]

# Notes are selected by column presence only, so passing the column index is enough.
notes = inference_notes(titanic.columns)
print("\nInferences:")
for n in notes:
    print("-", n)

# 11) Optional Plotly interactive example
# The scatter reads "age" (x axis) as well as "fare", "survived" and "class",
# so all four columns must be present before attempting the plot.
if {"age", "fare", "survived", "class"} <= set(titanic.columns):
    fig = px.scatter(
        titanic,
        x="age",
        y="fare",
        color="survived",
        facet_col="class",
        title="Age vs Fare by Survival and Class",
        template="plotly_white",
    )
    fig.write_html("figures/age_fare_survival_plotly.html")

print("\nAll figures saved in ./figures/")


Shape: (891, 15)

Head:
    survived  pclass     sex   age  sibsp  parch     fare embarked  class  \
0         0       3    male  22.0      1      0   7.2500        S  Third   
1         1       1  female  38.0      1      0  71.2833        C  First   
2         1       3  female  26.0      0      0   7.9250        S  Third   
3         1       1  female  35.0      1      0  53.1000        S  First   
4         0       3    male  35.0      0      0   8.0500        S  Third   

     who  adult_male deck  embark_town alive  alone  
0    man        True  NaN  Southampton    no  False  
1  woman       False    C    Cherbourg   yes  False  
2  woman       False  NaN  Southampton   yes   True  
3  woman       False    C  Southampton   yes  False  
4    man        True  NaN  Southampton    no   True  

Info:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 15 columns):
 #   Column       Non-Null Count  Dtype   
---  ------       --------------  -----