# Exploratory Data Analysis (EDA)
##### On the Titanic Dataset (Source: [Kaggle](https://www.kaggle.com/competitions/titanic/data))

Notes:
- After running this notebook you should find 'titanic_with_features.csv' in the `data/` directory (next to the input 'titanic.csv').

### Step 1: Setup and Data Loading

##### Import all necessary libraries and set global plot styles

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
# Silence every warning category so library notices do not clutter cell output.
warnings.filterwarnings('ignore')

# Reset matplotlib to its stock style first; the figure size and colour
# palette below are then layered on top of that baseline.
plt.style.use("default")
plt.rcParams.update({'figure.figsize': (10, 6)})
sns.set_palette("husl")

##### Load the Titanic Dataset

In [None]:
# Read the Titanic CSV into the working DataFrame used by every later cell.
# The path is relative to the directory the notebook is run from.
df = pd.read_csv('data/titanic.csv')
print("Data loaded successfully", f"Dataset shape: {df.shape}", sep="\n")

### Step 2: Initial Data Exploration

##### Basic information about the dataset

In [None]:
# DataFrame.info() writes its report straight to stdout and returns None,
# so it is called on its own; wrapping it in print() would append a stray
# "None" line after the report.
print("Dataset Information:")
df.info()

##### Display first few rows

In [None]:
# Show the top of the table as a quick sanity check of columns and values.
print("First 5 rows:", df.head(n=5), sep="\n")

##### Basic statistical summary

In [None]:
# Summary statistics from DataFrame.describe() with its default arguments.
print(f"Statistical Summary:\n{df.describe()}")

##### Check for Missing Values

In [None]:
# Tabulate, per column, how many values are null and what share of all rows
# that represents; only columns with at least one null are printed.
print("Missing Values:")
missing_data = df.isna().sum()
missing_percentage = missing_data.div(len(df)).mul(100)
missing_df = (
    missing_data
    .to_frame('Missing Count')
    .assign(Percentage=missing_percentage)
)
has_missing = missing_df['Missing Count'].gt(0)
print(missing_df.loc[has_missing])

### Step 3: Categorical Variables Analysis

##### Survival rate analysis

In [None]:
# Overall survival: counts per outcome, the overall rate, and a bar + pie view.
print("=== SURVIVAL ANALYSIS ===")
survival_counts = df['Survived'].value_counts()
survival_rate = df['Survived'].mean()
print(f"Survival Counts:\n{survival_counts}")
print(f"Overall Survival Rate: {survival_rate:.2%}")

# value_counts() orders rows by frequency, not by label. Sorting by label
# before plotting ties the colours to the outcome (red -> 0, green -> 1)
# rather than to whichever outcome happens to be more common.
counts_by_label = survival_counts.sort_index()
outcome_colors = ['red', 'green']

# Visualize survival
fig, (ax_bar, ax_pie) = plt.subplots(1, 2, figsize=(8, 5))

counts_by_label.plot(kind='bar', ax=ax_bar, color=outcome_colors, rot=0)
ax_bar.set_title('Survival Count')
ax_bar.set_xlabel('Survived (0=No, 1=Yes)')
ax_bar.set_ylabel('Count')

counts_by_label.plot(kind='pie', ax=ax_pie, autopct='%1.1f%%', colors=outcome_colors)
ax_pie.set_title('Survival Percentage')
ax_pie.set_ylabel('')  # drop the series name pandas would put on the y-axis

fig.tight_layout()
plt.show()

##### Gender analysis

In [None]:
# Gender: distribution, cross-tabulation against survival, per-gender
# survival rate, and a two-panel figure (counts on the left, rate on the right).
print("=== GENDER ANALYSIS ===")
gender_counts = df['Sex'].value_counts()
print(f"Gender Distribution:\n{gender_counts}")

# Gender vs Survival (margins=True adds the 'All' row and column)
gender_survival = pd.crosstab(df['Sex'], df['Survived'], margins=True)
print(f"\nGender vs Survival:\n{gender_survival}")

# Survival rate by gender: the mean of the 0/1 Survived flag is the rate
survival_by_gender = (
    df.groupby('Sex')['Survived']
    .agg(['count', 'sum', 'mean'])
    .rename(columns={'count': 'Total', 'sum': 'Survived', 'mean': 'Survival_Rate'})
)
print(f"\nSurvival Rate by Gender:\n{survival_by_gender}")

# Visualize
fig, (ax_counts, ax_rate) = plt.subplots(1, 2, figsize=(12, 5))

sns.countplot(data=df, x='Sex', hue='Survived', ax=ax_counts)
ax_counts.set_title('Survival by Gender')

sns.barplot(data=df, x='Sex', y='Survived', ci=None, ax=ax_rate)
ax_rate.set_title('Survival Rate by Gender')
ax_rate.set_ylabel('Survival Rate')

fig.tight_layout()
plt.show()

##### Passenger Class analysis

In [None]:
# Passenger class: distribution (in class order), cross-tabulation against
# survival, per-class survival rate, and a two-panel figure.
print("=== PASSENGER CLASS ANALYSIS ===")
class_counts = df['Pclass'].value_counts().sort_index()
print(f"Class Distribution:\n{class_counts}")

# Class vs Survival (margins=True adds the 'All' row and column)
class_survival = pd.crosstab(df['Pclass'], df['Survived'], margins=True)
print(f"\nClass vs Survival:\n{class_survival}")

# Survival rate by class: the mean of the 0/1 Survived flag is the rate
survival_by_class = (
    df.groupby('Pclass')['Survived']
    .agg(['count', 'sum', 'mean'])
    .rename(columns={'count': 'Total', 'sum': 'Survived', 'mean': 'Survival_Rate'})
)
print(f"\nSurvival Rate by Class:\n{survival_by_class}")

# Visualize
fig, (ax_counts, ax_rate) = plt.subplots(1, 2, figsize=(12, 5))

sns.countplot(data=df, x='Pclass', hue='Survived', ax=ax_counts)
ax_counts.set_title('Survival by Passenger Class')

sns.barplot(data=df, x='Pclass', y='Survived', ci=None, ax=ax_rate)
ax_rate.set_title('Survival Rate by Passenger Class')
ax_rate.set_ylabel('Survival Rate')

fig.tight_layout()
plt.show()

##### Embarkation Port analysis

In [None]:
# Embarkation port: distribution, cross-tabulation against survival,
# per-port survival rate, and a two-panel figure.
print("=== EMBARKATION ANALYSIS ===")
embark_counts = df['Embarked'].value_counts()
print(f"Embarkation Distribution:\n{embark_counts}")

# Embarked vs Survival (margins=True adds the 'All' row and column)
embark_survival = pd.crosstab(df['Embarked'], df['Survived'], margins=True)
print(f"\nEmbarkation vs Survival:\n{embark_survival}")

# Survival rate by embarkation: the mean of the 0/1 Survived flag is the rate
survival_by_embark = (
    df.groupby('Embarked')['Survived']
    .agg(['count', 'sum', 'mean'])
    .rename(columns={'count': 'Total', 'sum': 'Survived', 'mean': 'Survival_Rate'})
)
print(f"\nSurvival Rate by Embarkation:\n{survival_by_embark}")

# Visualize
fig, (ax_counts, ax_rate) = plt.subplots(1, 2, figsize=(12, 5))

sns.countplot(data=df, x='Embarked', hue='Survived', ax=ax_counts)
ax_counts.set_title('Survival by Embarkation Port')

sns.barplot(data=df, x='Embarked', y='Survived', ci=None, ax=ax_rate)
ax_rate.set_title('Survival Rate by Embarkation Port')
ax_rate.set_ylabel('Survival Rate')

fig.tight_layout()
plt.show()

### Step 4: Numerical Variables Analysis

##### Age analysis

In [None]:
# Age: descriptive statistics, missing-value count, three distribution views,
# and a per-outcome summary table.
print("=== AGE ANALYSIS ===")
age_stats = df['Age'].describe()
print(f"Age Statistics:\n{age_stats}")
print(f"Missing Age values: {df['Age'].isnull().sum()}")

# Age distribution
fig, (ax_hist, ax_box, ax_by_outcome) = plt.subplots(1, 3, figsize=(15, 5))

# Nulls are dropped explicitly before the values are handed to the histogram.
ax_hist.hist(df['Age'].dropna(), bins=30, edgecolor='black', alpha=0.7)
ax_hist.set_title('Age Distribution')
ax_hist.set_xlabel('Age')
ax_hist.set_ylabel('Frequency')

sns.boxplot(y=df['Age'], ax=ax_box)
ax_box.set_title('Age Boxplot')

sns.boxplot(data=df, x='Survived', y='Age', ax=ax_by_outcome)
ax_by_outcome.set_title('Age vs Survival')

fig.tight_layout()
plt.show()

# Age vs Survival analysis
age_survival = df.groupby('Survived')['Age'].agg(['count', 'mean', 'median', 'std'])
print(f"\nAge vs Survival:\n{age_survival}")

##### Fare analysis

In [None]:
# Fare: descriptive statistics, three distribution views, and a per-outcome
# summary table.
print("=== FARE ANALYSIS ===")
fare_stats = df['Fare'].describe()
print(f"Fare Statistics:\n{fare_stats}")

# Fare distribution
fig, (ax_hist, ax_box, ax_by_outcome) = plt.subplots(1, 3, figsize=(15, 5))

ax_hist.hist(df['Fare'], bins=30, edgecolor='black', alpha=0.7)
ax_hist.set_title('Fare Distribution')
ax_hist.set_xlabel('Fare')
ax_hist.set_ylabel('Frequency')

sns.boxplot(y=df['Fare'], ax=ax_box)
ax_box.set_title('Fare Boxplot')

sns.boxplot(data=df, x='Survived', y='Fare', ax=ax_by_outcome)
ax_by_outcome.set_title('Fare vs Survival')

fig.tight_layout()
plt.show()

# Fare vs Survival analysis
fare_survival = df.groupby('Survived')['Fare'].agg(['count', 'mean', 'median', 'std'])
print(f"\nFare vs Survival:\n{fare_survival}")

##### Family Size analysis

In [None]:
# Family size analysis. Adds two columns to df that later cells rely on:
#   FamilySize = SibSp + Parch + 1 (the +1 counts the passenger themselves)
#   IsAlone    = 1 when FamilySize is exactly 1, otherwise 0
print("=== FAMILY SIZE ANALYSIS ===")
df['FamilySize'] = df['SibSp'] + df['Parch'] + 1
df['IsAlone'] = df['FamilySize'].eq(1).astype(int)

family_stats = df['FamilySize'].describe()
print(f"Family Size Statistics:\n{family_stats}")

# Family size distribution
fig, (ax_dist, ax_rate, ax_alone) = plt.subplots(1, 3, figsize=(15, 5))

sns.countplot(data=df, x='FamilySize', ax=ax_dist)
ax_dist.set_title('Family Size Distribution')

sns.barplot(data=df, x='FamilySize', y='Survived', ci=None, ax=ax_rate)
ax_rate.set_title('Survival Rate by Family Size')

sns.barplot(data=df, x='IsAlone', y='Survived', ci=None, ax=ax_alone)
ax_alone.set_title('Survival Rate: Alone vs With Family')
# Replace the raw 0/1 tick labels with readable names.
ax_alone.set_xticks([0, 1])
ax_alone.set_xticklabels(['With Family', 'Alone'])

fig.tight_layout()
plt.show()

# Family size vs survival
family_survival = (
    df.groupby('FamilySize')['Survived']
    .agg(['count', 'sum', 'mean'])
    .rename(columns={'count': 'Total', 'sum': 'Survived', 'mean': 'Survival_Rate'})
)
print(f"\nSurvival Rate by Family Size:\n{family_survival}")

### Step 5: Advanced Visualizations

##### Correlation Analysis (Correlation heatmap)

In [None]:
# Pairwise correlations between the numeric columns, drawn as a heatmap,
# followed by the features whose absolute correlation with Survived exceeds 0.1.
print("=== CORRELATION ANALYSIS ===")
# Select only numeric columns for correlation.
# FamilySize and IsAlone are the columns added by the family-size cell above.
numeric_cols = ['Survived', 'Pclass', 'Age', 'SibSp', 'Parch', 'Fare', 'FamilySize', 'IsAlone']
correlation_matrix = df[numeric_cols].corr()

fig, ax = plt.subplots(figsize=(10, 8))
sns.heatmap(correlation_matrix, annot=True, cmap='coolwarm', center=0,
            square=True, fmt='.2f', ax=ax)
ax.set_title('Correlation Matrix')
fig.tight_layout()
plt.show()

print("Strong correlations with Survival:")
# Drop Survived's correlation with itself (always 1.0) so the list contains
# only actual predictors; the ranking uses the absolute value, so the sign
# of each relationship has to be read off the heatmap.
survival_corr = (
    correlation_matrix['Survived']
    .drop('Survived')
    .abs()
    .sort_values(ascending=False)
)
print(survival_corr[survival_corr > 0.1])

##### Pairplot for key numeric variables

In [None]:
# Pairwise scatter/histogram grid of the key variables, coloured by outcome.
print("=== PAIRPLOT ANALYSIS ===")
key_vars = ['Survived', 'Pclass', 'Age', 'Fare', 'FamilySize']
# pairplot is a figure-level function that creates its own figure, so no
# plt.figure() call precedes it: one would only leave an empty, unused
# figure behind. The grid's figure is the current figure afterwards, which
# is what plt.suptitle() targets.
sns.pairplot(df[key_vars], hue='Survived', diag_kind='hist')
plt.suptitle('Pairplot of Key Variables', y=1.02)  # y > 1 lifts the title above the grid
plt.show()

##### Multi-dimensional analysis

In [None]:
# Survival rate broken down by pairs of factors, shown as a 2x3 panel grid.
# This cell also adds three grouping columns to df: AgeGroup, FareGroup and
# FamilySizeGroup.
print("=== MULTI-DIMENSIONAL ANALYSIS ===")

# --- Derived grouping columns -------------------------------------------
# Age groups analysis
df['AgeGroup'] = pd.cut(df['Age'], bins=[0, 12, 18, 35, 60, 100],
                        labels=['Child', 'Teen', 'Adult', 'Middle-aged', 'Senior'])

# Fare groups analysis
# pd.cut bins are open on the left by default, so a fare of exactly 0 would
# fall outside (0, 10] and become NaN. include_lowest=True closes the first
# bin so zero-fare passengers are counted as 'Very Low'.
df['FareGroup'] = pd.cut(df['Fare'], bins=[0, 10, 25, 50, 100, 600],
                         labels=['Very Low', 'Low', 'Medium', 'High', 'Very High'],
                         include_lowest=True)

# Family size buckets: 1-2 -> Small, 3-4 -> Medium, 5+ -> Large
df['FamilySizeGroup'] = df['FamilySize'].apply(
    lambda x: 'Small' if x <= 2 else ('Medium' if x <= 4 else 'Large')
)

# --- Figure -------------------------------------------------------------
fig, axes = plt.subplots(2, 3, figsize=(15, 10))
heatmap_style = dict(annot=True, fmt='.2f', cmap='RdYlGn')

# Gender + Class vs Survival
survival_by_gender_class = df.groupby(['Sex', 'Pclass'])['Survived'].mean().unstack()
sns.heatmap(survival_by_gender_class, ax=axes[0, 0], **heatmap_style)
axes[0, 0].set_title('Survival Rate by Gender and Class')

# Age group vs Survival
sns.barplot(data=df, x='AgeGroup', y='Survived', ci=None, ax=axes[0, 1])
axes[0, 1].set_title('Survival Rate by Age Group')
axes[0, 1].tick_params(axis='x', labelrotation=45)

# Fare group vs Survival
sns.barplot(data=df, x='FareGroup', y='Survived', ci=None, ax=axes[0, 2])
axes[0, 2].set_title('Survival Rate by Fare Group')
axes[0, 2].tick_params(axis='x', labelrotation=45)

# Gender + Age Group vs Survival
# AgeGroup is categorical; observed=False is stated explicitly so every age
# group keeps a column regardless of the pandas version's default.
gender_age_survival = (
    df.groupby(['Sex', 'AgeGroup'], observed=False)['Survived'].mean().unstack()
)
sns.heatmap(gender_age_survival, ax=axes[1, 0], **heatmap_style)
axes[1, 0].set_title('Survival Rate by Gender and Age Group')

# Class + Embarkation vs Survival
class_embark_survival = df.groupby(['Pclass', 'Embarked'])['Survived'].mean().unstack()
sns.heatmap(class_embark_survival, ax=axes[1, 1], **heatmap_style)
axes[1, 1].set_title('Survival Rate by Class and Embarkation')

# Family Size + Class vs Survival
family_class_survival = df.groupby(['FamilySizeGroup', 'Pclass'])['Survived'].mean().unstack()
sns.heatmap(family_class_survival, ax=axes[1, 2], **heatmap_style)
axes[1, 2].set_title('Survival Rate by Family Size and Class')

fig.tight_layout()
plt.show()

### Step 6: Summary Statistics and Key Findings

In [None]:
# Text summary of the whole analysis. Every figure is recomputed from df, so
# the cell depends only on df and the FamilySize / IsAlone columns added in
# the family-size cell. Scalar results use their own names so that the
# `class_survival` and `family_survival` tables built in earlier cells are
# not overwritten.
print("="*60)
print("COMPREHENSIVE EDA SUMMARY - TITANIC DATASET")
print("="*60)

print("\n1. DATASET OVERVIEW:")
print(f"   - Total passengers: {len(df)}")
# The column count includes any columns added by earlier cells.
print(f"   - Features: {len(df.columns)}")
print(f"   - Overall survival rate: {df['Survived'].mean():.2%}")

print("\n2. MISSING DATA:")
null_counts = df.isnull().sum()  # computed once, then filtered
missing_summary = null_counts[null_counts > 0]
for col, missing in missing_summary.items():
    print(f"   - {col}: {missing} ({missing/len(df)*100:.1f}%)")

print("\n3. KEY SURVIVAL FACTORS:")
print("   Gender Impact:")
female_survival = df.loc[df['Sex'] == 'female', 'Survived'].mean()
male_survival = df.loc[df['Sex'] == 'male', 'Survived'].mean()
print(f"   - Female survival rate: {female_survival:.2%}")
print(f"   - Male survival rate: {male_survival:.2%}")
print(f"   - Gender difference: {female_survival - male_survival:.2%}")

print("\n   Class Impact:")
for pclass in [1, 2, 3]:
    pclass_survival_rate = df.loc[df['Pclass'] == pclass, 'Survived'].mean()
    print(f"   - Class {pclass} survival rate: {pclass_survival_rate:.2%}")

print("\n   Age Impact:")
# Rows with a missing Age satisfy neither comparison, so they are excluded
# from both rates.
child_survival = df.loc[df['Age'] <= 12, 'Survived'].mean()
adult_survival = df.loc[df['Age'] > 12, 'Survived'].mean()
print(f"   - Children (≤12) survival rate: {child_survival:.2%}")
print(f"   - Adults (>12) survival rate: {adult_survival:.2%}")

print("\n   Family Size Impact:")
alone_survival = df.loc[df['IsAlone'] == 1, 'Survived'].mean()
with_family_survival = df.loc[df['IsAlone'] == 0, 'Survived'].mean()
print(f"   - Traveling alone survival rate: {alone_survival:.2%}")
print(f"   - Traveling with family survival rate: {with_family_survival:.2%}")

print("\n4. STATISTICAL INSIGHTS:")
print(f"   - Average age: {df['Age'].mean():.1f} years")
print(f"   - Average fare: ${df['Fare'].mean():.2f}")
print(f"   - Most common embarkation port: {df['Embarked'].mode().iloc[0]}")
print(f"   - Most common class: {df['Pclass'].mode().iloc[0]}")

print("\n5. CORRELATIONS WITH SURVIVAL:")
correlation_features = ['Survived', 'Pclass', 'Age', 'SibSp', 'Parch', 'Fare', 'FamilySize']
# Ranked by absolute correlation; the sign of each relationship is not shown.
correlations = (
    df[correlation_features]
    .corr()['Survived']
    .abs()
    .sort_values(ascending=False)
)
for feature, corr in correlations.items():
    # Skip the self-correlation and anything at or below the 0.1 threshold.
    if feature != 'Survived' and corr > 0.1:
        print(f"   - {feature}: {corr:.3f}")

### Step 7: Save our Work

In [None]:
# Save the enhanced dataset with new features
df.to_csv('data/titanic_with_features.csv', index=False)
print("Enhanced dataset with new features saved as 'titanic_with_features.csv'")