# Titanic Dataset Exploration

This notebook explores the famous Titanic dataset from Kaggle to understand patterns in passenger survival.

## 1. Import Required Libraries

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# Notebook-wide plotting defaults: seaborn's white grid theme and a
# 10x6 inch default figure size
sns.set_style('whitegrid')
plt.rcParams.update({'figure.figsize': (10, 6)})

## 2. Load the Dataset

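Note: Download the Titanic dataset from Kaggle (https://www.kaggle.com/c/titanic/data) and save it as `titanic.csv` in the same directory as this notebook, since that is the path the next cell reads.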

In [None]:
# Load the Titanic dataset into `df`, the frame every later cell reads.
# Replace 'titanic.csv' with your actual file path
try:
    df = pd.read_csv('titanic.csv')
except FileNotFoundError as err:
    # Stop here with the download instructions. Swallowing the error would
    # leave `df` undefined, so the following cells would fail with a
    # NameError that hides the real cause (the missing file).
    raise FileNotFoundError(
        "Please download the Titanic dataset from Kaggle and save it as 'titanic.csv'. "
        "You can get it from: https://www.kaggle.com/c/titanic/data"
    ) from err
else:
    print("Dataset loaded successfully!")

## 3. Basic Data Exploration

In [None]:
# Display first few rows (head() returns five by default) as a quick check
# that the columns and values were parsed as expected
df.head()

In [None]:
# Dataset dimensions: each row is a passenger, each column a feature
n_passengers, n_features = df.shape
print(f"Dataset shape: {df.shape}")
print(f"Number of passengers: {n_passengers}")
print(f"Number of features: {n_features}")

In [None]:
# Column information: info() prints each column's name, non-null count and
# dtype, which gives a first look at where values are missing
df.info()

In [None]:
# Statistical summary: called with no arguments, describe() reports count,
# mean, std, min, quartiles and max for the numeric columns
df.describe()

In [None]:
# Missing values per column, as an absolute count and as a share of all rows
missing_data = df.isna().sum()
missing_percent = missing_data.div(len(df)).mul(100)
missing_df = pd.concat(
    [missing_data.rename('Missing Count'), missing_percent.rename('Percentage')],
    axis=1,
)

# Only list the columns that actually have gaps
has_missing = missing_df['Missing Count'] > 0
print("Missing Data Summary:")
print(missing_df.loc[has_missing])

## 4. Survival Analysis

In [None]:
# Overall survival rate: 'Survived' is 0/1, so its mean is the share of survivors
if 'Survived' in df.columns:
    survival_rate = df['Survived'].mean()
    print(f"Overall survival rate: {survival_rate:.2%}")

    # Bar chart of the number of passengers per outcome
    fig, ax = plt.subplots(figsize=(6, 4))
    df['Survived'].value_counts().plot.bar(ax=ax, rot=0)
    ax.set_title('Survival Distribution')
    ax.set_xlabel('Survived (0 = No, 1 = Yes)')
    ax.set_ylabel('Count')
    plt.show()

## 5. Feature Analysis

In [None]:
# Survival by Gender: grouped counts first, then the survival rate per sex
if all(col in df.columns for col in ('Sex', 'Survived')):
    fig, ax = plt.subplots(figsize=(8, 5))
    sns.countplot(data=df, x='Sex', hue='Survived', ax=ax)
    ax.set_title('Survival by Gender')
    plt.show()

    # Mean of the 0/1 outcome within each sex
    gender_rates = df.groupby('Sex')['Survived'].mean()
    print("\nSurvival Rate by Gender:")
    print(gender_rates)

In [None]:
# Survival by Passenger Class: grouped counts first, then the rate per class
if all(col in df.columns for col in ('Pclass', 'Survived')):
    fig, ax = plt.subplots(figsize=(8, 5))
    sns.countplot(data=df, x='Pclass', hue='Survived', ax=ax)
    ax.set(title='Survival by Passenger Class', xlabel='Passenger Class')
    plt.show()

    # Mean of the 0/1 outcome within each class
    class_rates = df.groupby('Pclass')['Survived'].mean()
    print("\nSurvival Rate by Class:")
    print(class_rates)

In [None]:
# Age distribution: histogram of all ages (left) and, when the outcome
# column is present, a box plot of age split by survival (right)
if 'Age' in df.columns:
    fig = plt.figure(figsize=(10, 5))

    ax_hist = fig.add_subplot(1, 2, 1)
    df['Age'].hist(bins=30, edgecolor='black', ax=ax_hist)
    ax_hist.set_title('Age Distribution')
    ax_hist.set_xlabel('Age')
    ax_hist.set_ylabel('Count')

    if 'Survived' in df.columns:
        ax_box = fig.add_subplot(1, 2, 2)
        # Pass ax explicitly: DataFrame.boxplot(by=...) otherwise opens its own
        # figure, leaving this panel empty and the title on the wrong plot.
        df.boxplot(column='Age', by='Survived', ax=ax_box)
        ax_box.set_title('Age Distribution by Survival')
        ax_box.set_ylabel('Age')
        # Clear the automatic "Boxplot grouped by ..." figure title
        fig.suptitle('')

    fig.tight_layout()
    plt.show()

In [None]:
# Fare distribution: histogram of all fares (left) and, when the outcome
# column is present, a box plot of fare split by survival (right)
if 'Fare' in df.columns:
    fig = plt.figure(figsize=(10, 5))

    ax_hist = fig.add_subplot(1, 2, 1)
    df['Fare'].hist(bins=50, edgecolor='black', ax=ax_hist)
    ax_hist.set_title('Fare Distribution')
    ax_hist.set_xlabel('Fare')
    ax_hist.set_ylabel('Count')

    if 'Survived' in df.columns:
        ax_box = fig.add_subplot(1, 2, 2)
        # Pass ax explicitly: DataFrame.boxplot(by=...) otherwise opens its own
        # figure, leaving this panel empty and the title on the wrong plot.
        df.boxplot(column='Fare', by='Survived', ax=ax_box)
        ax_box.set_title('Fare Distribution by Survival')
        ax_box.set_ylabel('Fare')
        # Clear the automatic "Boxplot grouped by ..." figure title
        fig.suptitle('')

    fig.tight_layout()
    plt.show()

## 6. Correlation Analysis

In [None]:
# Pairwise correlations between the numeric columns, drawn as an annotated
# heatmap; a single numeric column has nothing to correlate with, so skip it
numerical_cols = df.select_dtypes(include=[np.number]).columns.tolist()
if len(numerical_cols) > 1:
    fig, ax = plt.subplots(figsize=(10, 8))
    correlation_matrix = df[numerical_cols].corr()
    # center=0 puts the neutral colour of the diverging map at zero correlation
    sns.heatmap(correlation_matrix, annot=True, cmap='coolwarm', center=0, ax=ax)
    ax.set_title('Correlation Heatmap of Numerical Features')
    plt.show()

## 7. Family Size Analysis

In [None]:
# Family size = siblings/spouses + parents/children + the passenger themselves.
# The new column is written onto `df`.
if all(col in df.columns for col in ('SibSp', 'Parch')):
    df['FamilySize'] = df['SibSp'].add(df['Parch']).add(1)

    if 'Survived' in df.columns:
        # Share of survivors for each family size
        family_survival = df.groupby('FamilySize')['Survived'].mean()

        fig, ax = plt.subplots(figsize=(10, 5))
        family_survival.plot.bar(ax=ax, rot=0)
        ax.set_title('Survival Rate by Family Size')
        ax.set_xlabel('Family Size')
        ax.set_ylabel('Survival Rate')
        plt.show()

## 8. Port of Embarkation Analysis

In [None]:
# Port of embarkation: passenger counts per port, then survival per port
if 'Embarked' in df.columns:
    print(
        "Port of Embarkation:",
        "C = Cherbourg, Q = Queenstown, S = Southampton",
        "\nDistribution:",
        sep="\n",
    )
    print(df['Embarked'].value_counts())

    if 'Survived' in df.columns:
        fig, ax = plt.subplots(figsize=(8, 5))
        sns.countplot(data=df, x='Embarked', hue='Survived', ax=ax)
        ax.set_title('Survival by Port of Embarkation')
        plt.show()

        # Mean of the 0/1 outcome within each port
        port_rates = df.groupby('Embarked')['Survived'].mean()
        print("\nSurvival Rate by Port:")
        print(port_rates)

## 9. Cabin Analysis

In [None]:
# Cabin information: flag whether a cabin value is recorded at all, then
# compare survival between the two groups
if 'Cabin' in df.columns:
    # 1 when a cabin is recorded, 0 when the value is missing
    df['HasCabin'] = df['Cabin'].notnull().astype(int)

    cabin_flag = df['HasCabin']
    print(f"Passengers with cabin info: {cabin_flag.sum()} ({cabin_flag.mean():.2%})")

    if 'Survived' in df.columns:
        cabin_rates = df.groupby('HasCabin')['Survived'].mean()
        print("\nSurvival rate by cabin availability:")
        print(cabin_rates)

## 10. Summary Statistics

In [None]:
# Headline survival rates: overall, then by sex, passenger class and age group.
# Each breakdown is printed only when its column exists in the frame.
if 'Survived' in df.columns:
    print("=== KEY INSIGHTS ===")
    overall_rate = df['Survived'].mean()
    print(f"\n1. Overall survival rate: {overall_rate:.2%}")

    if 'Sex' in df.columns:
        female_survival = df.loc[df['Sex'] == 'female', 'Survived'].mean()
        male_survival = df.loc[df['Sex'] == 'male', 'Survived'].mean()
        print("\n2. Gender impact:")
        print(f"   - Female survival rate: {female_survival:.2%}")
        print(f"   - Male survival rate: {male_survival:.2%}")

    if 'Pclass' in df.columns:
        print("\n3. Class impact:")
        for pclass in sorted(df['Pclass'].unique()):
            in_class = df['Pclass'] == pclass
            class_survival = df.loc[in_class, 'Survived'].mean()
            print(f"   - Class {pclass} survival rate: {class_survival:.2%}")

    if 'Age' in df.columns:
        # Rows with a missing age fail both comparisons, so they are in neither group
        is_child = df['Age'] < 18
        is_adult = df['Age'] >= 18
        children = df.loc[is_child, 'Survived'].mean()
        adults = df.loc[is_adult, 'Survived'].mean()
        print("\n4. Age impact:")
        print(f"   - Children (<18) survival rate: {children:.2%}")
        print(f"   - Adults (>=18) survival rate: {adults:.2%}")